Esempio n. 1
0
    def _concatunits(self, utt, args):
        """ Concatenates units and produces waveform via residual
            excited LPC synthesis filter...

            Walks the "Unit" relation of utt, joining the "lpc-coefs"
            track of every selected candidate into one track (the times
            of each appended unit are shifted by the last time already
            in the joined track) and collecting the windowed residuals
            returned by window_residual. The windowed residuals are
            overlap-added into a single excitation signal, which is
            passed through synth_filter. The result is stored as a
            single-channel, int16 Waveform under utt["waveform"].

            utt  -- utterance with a "Unit" relation whose items carry
                    ["selected_unit"]["candidate"] entries holding
                    "lpc-coefs" and "residuals".
            args -- not used by this method.

            Returns the same utt object.
        """

        unit_rel = utt.get_relation("Unit")
        #concat:
        unit_item = unit_rel.head_item
        candidate = unit_item["selected_unit"]["candidate"]
        #deep copy so that the stored candidate track is not modified
        #by the in-place appends below:
        lpctrack = copy.deepcopy(candidate["lpc-coefs"])
        residuals = window_residual(candidate["lpc-coefs"],
                                    candidate["residuals"])
        unit_item = unit_item.next_item
        while unit_item is not None:
            candidate = unit_item["selected_unit"]["candidate"]
            temptrack = candidate["lpc-coefs"]
            #append lpccoefs to lpctrack (times continue from the end
            #of what has been joined so far):
            lpctrack.times = np.concatenate(
                (lpctrack.times, (temptrack.times + lpctrack.times[-1])))
            lpctrack.values = np.concatenate(
                (lpctrack.values, temptrack.values))
            #append windowed residuals:
            residuals.extend(
                window_residual(temptrack, candidate["residuals"]))
            unit_item = unit_item.next_item

        #overlap add residual:
        #the buffer has to hold the last window, which extends half a
        #window length past the last time point.
        lastsample = int(round(lpctrack.times[-1] * SAMPLERATE)) + int(
            round(len(residuals[-1]) / 2))
        residual = np.zeros(lastsample + 1)

        for i, time in enumerate(lpctrack.times):
            #each windowed residual is centred on its own time point:
            centersample = int(round(time * SAMPLERATE))
            firstsample = centersample - int(len(residuals[i]) / 2)
            residual[firstsample:firstsample + len(residuals[i])] += np.array(
                residuals[i])

        #synth filter:
        #builtin float is what the removed alias np.float referred to.
        samples = synth_filter(lpctrack.times, lpctrack.values,
                               residual.astype(float), SAMPLERATE)

        #save in utterance:
        w = Waveform()
        w.samplerate = SAMPLERATE
        w.samples = samples.astype("int16")  #16bit samples
        w.channels = 1
        utt["waveform"] = w
        return utt
Esempio n. 2
0
def test(testfn="samples/data_001d.wav"):
    """ Run the detection chain on one audio file and plot the stages.

        Loads testfn as a Waveform, passes its samples through
        zero_startends, filters them with the filter from make_hpf,
        runs detectsimple on the filtered signal, plots the three
        signals, prints the result of find_segs and shows the plot.

        testfn -- path of the audio file to load.
    """
    waveform = Waveform(testfn)
    cleaned = zero_startends(waveform.samples)
    highpass = make_hpf()
    # mfreqz(h)
    # show()
    filtered = filt(highpass, cleaned)
    detected = detectsimple(filtered)
    # plot order is kept: input, rectified filter output, scaled detection
    for curve in (cleaned, np.abs(filtered), detected / 1000.0):
        plot(curve)
    segments = find_segs(detected)
    print(segments)
    show()
Esempio n. 3
0
 def get_wav(self):
     """ Return the engine's generated speech as a new Waveform.

         Asserts that self.donesynth is set, then allocates an int16
         sample buffer of HTS_Engine_get_nsamples entries, takes the
         sample rate from HTS_Engine_get_sampling_frequency, marks the
         waveform as single channel and copies the samples over one at
         a time with HTS_Engine_get_generated_speech.
     """
     assert self.donesynth
     engine = self.engine
     wav = Waveform()
     nsamples = LIBHTS.HTS_Engine_get_nsamples(engine)
     wav.samples = np.zeros(nsamples, np.int16)  #16-bit samples
     samplingfreq = LIBHTS.HTS_Engine_get_sampling_frequency(engine)
     wav.samplerate = int(samplingfreq)
     wav.channels = 1
     for index in range(len(wav.samples)):
         #copy sample by sample out of the engine
         sample = LIBHTS.HTS_Engine_get_generated_speech(engine, index)
         wav.samples[index] = sample
     return wav
Esempio n. 4
0
def from_textgrid(voice):
    """ Create aligned Utterances by synthesising to Segment level
        from the orthography and simply copying label end times into
        segment items as "end" feature.

        All directories are resolved relative to the current working
        directory. For every utterance the text is synthesised with
        'text-to-segments', timing is transplanted from the matching
        TextGrid utterance via transplant_segtime_info, the matching
        wave file is attached as utt["waveform"] and the utterance is
        written to the aligned utterance directory.

        voice -- voice used to synthesise each transcription.
    """
    #Setup and create necessary dirs...
    cwd = os.getcwd()
    wav_dir = os.path.join(cwd, WAV_DIR)  #not used further down
    uttwav_dir = os.path.join(cwd, UTTWAV_DIR)
    transcr_location = os.path.join(cwd, ETC_DIR, TRANSCR_FILE)
    textgrid_dir = os.path.join(cwd, TEXTGRID_DIR)
    aligned_utts_dir = os.path.join(cwd, ALIGNED_UTT_DIR)

    os.makedirs(aligned_utts_dir)

    #update utts from textgrids...
    transcriptions = load_transcriptions_schemefile(transcr_location)

    alignments = sl.Corpus(textgrid_dir)

    #the three sequences are walked in parallel and are expected to
    #line up by (sorted) utterance name:
    sc_utts = alignments.utterances
    uttnames = sorted(transcriptions)
    wavfilenames = sorted(glob(os.path.join(uttwav_dir, "*")))

    for sc_utt, uttname, wavfilename in zip(sc_utts, uttnames, wavfilenames):
        assert sc_utt.name == uttname, "Utterance missmatch..."
        wavbasename = os.path.basename(wavfilename)
        assert wavbasename.startswith(uttname), "Utterance missmatch..."

        print("Synthesizing:", uttname)
        utt = voice.synthesize(transcriptions[uttname], 'text-to-segments')
        utt["file_id"] = uttname

        utt = transplant_segtime_info(voice, sc_utt, utt)

        #add waveform to utt:
        utt["waveform"] = Waveform(wavfilename)

        #save utt...
        uttfilename = ".".join([uttname, UTT_EXT])
        ttslab.tofile(utt, os.path.join(aligned_utts_dir, uttfilename))
Esempio n. 5
0
def make_aligned_utts(voice, transcriptions, sc_corpus, wav_dir, output_dir):
    """ Make Word level utts and complete from 3-tier TextGrids...

        For every utterance the transcription is synthesised with
        'text-to-words' and completed with complete_utt_from_textgrid.
        Item features of a second, 'text-to-segments' synthesis are
        then copied in where they are missing; if the item names of the
        two utterances differ a warning is printed instead. The wave
        file is attached as utt["waveform"] and the utterance is
        written to output_dir.

        voice          -- voice used for synthesis.
        transcriptions -- mapping of utterance name to text.
        sc_corpus      -- corpus object providing .utterances.
        wav_dir        -- directory holding one wave file per utterance.
        output_dir     -- directory the utterance files are written to.
    """
    def copyuttfeats(u, u2):
        #fill in features on Word and Syllable items of u that only
        #exist on the corresponding items of u2; never overwrites.
        for relname in ("Word", "Syllable"):
            targets = u.gr(relname).as_list()
            sources = u2.gr(relname).as_list()
            targetnames = [item["name"] for item in targets]
            sourcenames = [item["name"] for item in sources]
            assert targetnames == sourcenames
            for target, source in zip(targets, sources):
                for key in source:
                    if key not in target:
                        target[key] = source[key]
        return u

    #the three sequences are walked in parallel and are expected to
    #line up by (sorted) utterance name:
    sc_utts = sc_corpus.utterances
    uttnames = sorted(transcriptions)
    wavfilenames = sorted(glob(os.path.join(wav_dir, "*")))

    for sc_utt, uttname, wavfilename in zip(sc_utts, uttnames, wavfilenames):
        assert sc_utt.name == uttname, "Utterance missmatch..."
        wavbasename = os.path.basename(wavfilename)
        assert wavbasename.startswith(uttname), "Utterance missmatch..."

        print("Synthesizing:", uttname)
        text = transcriptions[uttname]
        utt = voice.synthesize(text, 'text-to-words')
        utt["file_id"] = uttname

        utt = complete_utt_from_textgrid(voice, sc_utt, utt)
        utt2 = voice.synthesize(transcriptions[uttname], 'text-to-segments')
        try:
            utt = copyuttfeats(utt, utt2)
        except AssertionError:
            print("WARNING: could not copy item feats for %s" % utt["file_id"])

        #add waveform to utt:
        utt["waveform"] = Waveform(wavfilename)

        #save utt...
        uttfilename = ".".join([uttname, UTT_EXT])
        ttslab.tofile(utt, os.path.join(output_dir, uttfilename))
Esempio n. 6
0
def make_aligned_utts(voice, transcriptions, sc_corpus, wav_dir, output_dir):
    """ Make Word level utts and complete from 3-tier TextGrids...

        For every utterance the transcription is synthesised with
        'text-to-words' and completed with complete_utt_from_textgrid;
        the wave file is attached as utt["waveform"] and the utterance
        is written to output_dir.

        voice          -- voice used for synthesis.
        transcriptions -- mapping of utterance name to text.
        sc_corpus      -- corpus object providing .utterances.
        wav_dir        -- directory holding one wave file per utterance.
        output_dir     -- directory the utterance files are written to.
    """

    #the three sequences are walked in parallel and are expected to
    #line up by (sorted) utterance name:
    sc_utts = sc_corpus.utterances
    uttnames = sorted(transcriptions)
    wavfilenames = sorted(glob(os.path.join(wav_dir, "*")))

    for sc_utt, uttname, wavfilename in zip(sc_utts, uttnames, wavfilenames):
        assert sc_utt.name == uttname, "Utterance missmatch..."
        wavbasename = os.path.basename(wavfilename)
        assert wavbasename.startswith(uttname), "Utterance missmatch..."

        print("Synthesizing:", uttname)
        text = transcriptions[uttname]
        utt = voice.synthesize(text, 'text-to-words')
        utt["file_id"] = uttname

        utt = complete_utt_from_textgrid(voice, sc_utt, utt)

        #add waveform to utt:
        utt["waveform"] = Waveform(wavfilename)

        #save utt...
        uttfilename = ".".join([uttname, UTT_EXT])
        ttslab.tofile(utt, os.path.join(output_dir, uttfilename))
Esempio n. 7
0
    def update_wordview(self):
        """ Refresh the word panel for the currently selected word.

            Shows the names and stored pronunciations of the previous,
            current and next word, cuts the stretch from the start of
            the previous word to the end of the next word out of both
            the original and the synthesised waveform, draws a
            spectrogram for each cut and updates the status bar.
            At either edge of the utterance the missing neighbour is
            shown as "NONE" and the cut runs from the start / to the
            end of the waveform.
        """
        u = self.corpusview.current_utt
        origwav = u["waveform"]
        synthwav = u["lindists"]["utt"]["waveform"]
        wordindex = self.corpusview.current_wordindex
        uttkey = self.corpusview.worklist[self.corpusview.current_index][0]
        pronuns = self.corpusview.pronuns[uttkey]
        words = u.get_relation("SylStructure").as_list()
        word = words[wordindex]
        try:
            prevword = word.prev_item
            #a missing neighbour is None, which raises TypeError here:
            prevwordname = prevword["name"]
            #slice bounds have to be integers (samplerate * seconds is
            #a float):
            origstartsample = int(origwav.samplerate * prevword["start"])
            synthstartsample = int(synthwav.samplerate * prevword["start"])
            prevwordpronun = pronuns[wordindex - 1]
        except TypeError:
            prevwordname = "NONE"
            origstartsample = 0
            synthstartsample = 0
            prevwordpronun = ""
        wordname = word["name"]
        wordpronun = pronuns[wordindex]
        try:
            nextword = word.next_item
            nextwordname = nextword["name"]
            origendsample = int(origwav.samplerate * nextword["end"])
            synthendsample = int(synthwav.samplerate * nextword["end"])
            nextwordpronun = pronuns[wordindex + 1]
        except TypeError:
            nextwordname = "NONE"
            origendsample = len(origwav.samples)
            #each waveform ends at its own length:
            synthendsample = len(synthwav.samples)
            nextwordpronun = ""

        self.label_word1.set_label(prevwordname)
        self.label_word2.set_label(wordname)
        self.label_word3.set_label(nextwordname)

        self.entry_word1.set_text(prevwordpronun)
        self.entry_word2.set_text(wordpronun)
        self.entry_word3.set_text(nextwordpronun)

        #original recording:
        self.origwordcontextwav = Waveform()
        self.origwordcontextwav.samplerate = origwav.samplerate
        self.origwordcontextwav.samples = origwav.samples[origstartsample:origendsample]
        origwordcontext_specfig = Figure(dpi=72)
        origwordcontext_specplot = origwordcontext_specfig.add_subplot(111)
        # NOTE(review): xextent is samplerate * number of samples; a
        # time axis in seconds would be samples / samplerate -- confirm
        # which was intended.
        origwordcontext_specplot.specgram(self.origwordcontextwav.samples,
                                          Fs=self.origwordcontextwav.samplerate,
                                          NFFT=128, noverlap=64,
                                          xextent=(0.0, self.origwordcontextwav.samplerate*len(self.origwordcontextwav.samples)))
        origwordcontext_speccanvas = FigureCanvasGTK(origwordcontext_specfig)
        #swap out the previously shown canvas, if any:
        framecontents = self.frame_wordspecorig.get_children()
        if framecontents:
            self.frame_wordspecorig.remove(framecontents[0])
        self.frame_wordspecorig.add(origwordcontext_speccanvas)

        #synthesised version:
        self.synthwordcontextwav = Waveform()
        self.synthwordcontextwav.samplerate = synthwav.samplerate
        self.synthwordcontextwav.samples = synthwav.samples[synthstartsample:synthendsample]
        synthwordcontext_specfig = Figure(dpi=72)
        synthwordcontext_specplot = synthwordcontext_specfig.add_subplot(111)
        synthwordcontext_specplot.specgram(self.synthwordcontextwav.samples,
                                           Fs=self.synthwordcontextwav.samplerate,
                                           NFFT=128, noverlap=64,
                                           xextent=(0.0, self.synthwordcontextwav.samplerate*len(self.synthwordcontextwav.samples)))
        synthwordcontext_speccanvas = FigureCanvasGTK(synthwordcontext_specfig)
        framecontents = self.frame_wordspecsynth.get_children()
        if framecontents:
            self.frame_wordspecsynth.remove(framecontents[0])
        self.frame_wordspecsynth.add(synthwordcontext_speccanvas)

        self.statusbar.push(0, "Item: %s/%s (Word index: %s)" % (self.corpusview.current_index + 1, len(self.corpusview.worklist), wordindex))
        self.table_words.show_all()
Esempio n. 8
0
class SpeechbrowserApp(object):
    """ GTK window for stepping through the utterances of a worklist.

        For the current utterance it shows a spectrogram of the
        original waveform, the transcription and a comment entry, and
        one button per word. For the selected word it shows the
        previous/current/next word with editable pronunciations and
        spectrograms of that stretch in the original and in the
        synthesised waveform, both of which can be played.

        self.corpusview only exists once a worklist has been opened
        via the toolbar.
    """

    #widgets fetched from the glade file; each is stored on the
    #instance under the same name.
    _WIDGET_NAMES = ("window1",
                     "frame_specutt",
                     "button_playutt",
                     "frame_words",
                     "entry_transcription",
                     "table_utt",
                     "table_words",
                     "frame_wordspecorig",
                     "frame_wordspecsynth",
                     "button_playwordorig",
                     "button_playwordsynth",
                     "label_word1",
                     "label_word2",
                     "label_word3",
                     "entry_word1",
                     "entry_word2",
                     "entry_word3",
                     "statusbar",
                     "entry_comment")

    def __init__(self, voice):
        """ Build the window from "speechbrowser.glade" (looked up next
            to this module), connect the signal handlers and show it.

            voice -- handed to CorpusView when a worklist is opened.
        """
        builder = gtk.Builder()
        builder.add_from_file(os.path.join(os.path.dirname(__file__), "speechbrowser.glade"))
        builder.connect_signals({"on_window1_destroy": gtk.main_quit,
                                 "on_toolbutton_open_clicked": self.on_toolbutton_open_clicked,
                                 "on_button_playutt_clicked": self.on_button_playutt_clicked,
                                 "on_button_playwordorig_clicked": self.on_button_playwordorig_clicked,
                                 "on_button_playwordsynth_clicked": self.on_button_playwordsynth_clicked,
                                 "on_button_next_clicked": self.on_button_next_clicked,
                                 "on_button_prev_clicked": self.on_button_prev_clicked})
        for widgetname in self._WIDGET_NAMES:
            setattr(self, widgetname, builder.get_object(widgetname))
        # self.combobox_comment = builder.get_object("combobox_comment")
        # liststore = gtk.ListStore(gobject.TYPE_STRING)
        # self.combobox_comment.set_model(liststore)
        # self.combobox_comment.set_entry_text_column(0)
        # self.combobox_comment.append_text("transcription error")
        # self.combobox_comment.append_text("pronunciation error")
        # self.combobox_comment.append_text("noise present")
        # self.combobox_comment.append_text("no problem")
        # cell = gtk.CellRendererText()
        # self.combobox_comment.pack_start(cell, True)
        # self.combobox_comment.add_attribute(cell, 'text', 1)

        self.window1.show()

        self.voice = voice

    @staticmethod
    def _sample_index(samplerate, seconds):
        """ Convert a time in seconds to an integer sample index.

            The product is truncated to int because it is used as a
            slice bound. A None time raises TypeError, like the plain
            multiplication does.
        """
        return int(samplerate * seconds)

    def _current_key(self):
        """ Return the first field of the current worklist entry, which
            keys the transcriptions, comments and pronuns mappings.
        """
        return self.corpusview.worklist[self.corpusview.current_index][0]

    @staticmethod
    def _replace_frame_child(frame, widget):
        """ Put widget into frame, removing the first child already
            in it, if any.
        """
        framecontents = frame.get_children()
        if framecontents:
            frame.remove(framecontents[0])
        frame.add(widget)

    @staticmethod
    def _cut_waveform(sourcewav, startsample, endsample):
        """ Return a new Waveform holding samples
            [startsample:endsample] of sourcewav at the same rate.
        """
        contextwav = Waveform()
        contextwav.samplerate = sourcewav.samplerate
        contextwav.samples = sourcewav.samples[startsample:endsample]
        return contextwav

    def _show_wordcontext_specgram(self, contextwav, frame):
        """ Draw the spectrogram of contextwav into frame. """
        specfig = Figure(dpi=72)
        specplot = specfig.add_subplot(111)
        # NOTE(review): xextent is samplerate * number of samples; a
        # time axis in seconds would be samples / samplerate -- confirm
        # which was intended.
        specplot.specgram(contextwav.samples,
                          Fs=contextwav.samplerate,
                          NFFT=128, noverlap=64,
                          xextent=(0.0, contextwav.samplerate*len(contextwav.samples)))
        self._replace_frame_child(frame, FigureCanvasGTK(specfig))

    def _store_current_edits(self):
        """ Copy the transcription, comment and pronunciation entries
            of the current utterance back into the corpus view.
        """
        key = self._current_key()
        self.corpusview.transcriptions[key] = unicode(self.entry_transcription.get_text(), "utf-8")
        self.corpusview.comments[key] = unicode(self.entry_comment.get_text(), "utf-8")
        self.savepronuns(self.corpusview.current_wordindex)

    def update_wordview(self):
        """ Refresh the word panel for the currently selected word.

            Shows names and stored pronunciations of the previous,
            current and next word and cuts the stretch from the start
            of the previous word to the end of the next word out of
            the original and the synthesised waveform, drawing a
            spectrogram for each. At either edge of the utterance the
            missing neighbour is shown as "NONE" and the cut runs from
            the start / to the end of the waveform.
        """
        u = self.corpusview.current_utt
        origwav = u["waveform"]
        synthwav = u["lindists"]["utt"]["waveform"]
        wordindex = self.corpusview.current_wordindex
        pronuns = self.corpusview.pronuns[self._current_key()]
        words = u.get_relation("SylStructure").as_list()
        word = words[wordindex]
        try:
            prevword = word.prev_item
            #a missing neighbour is None, which raises TypeError here:
            prevwordname = prevword["name"]
            origstartsample = self._sample_index(origwav.samplerate, prevword["start"])
            synthstartsample = self._sample_index(synthwav.samplerate, prevword["start"])
            prevwordpronun = pronuns[wordindex - 1]
        except TypeError:
            prevwordname = "NONE"
            origstartsample = 0
            synthstartsample = 0
            prevwordpronun = ""
        wordname = word["name"]
        wordpronun = pronuns[wordindex]
        try:
            nextword = word.next_item
            nextwordname = nextword["name"]
            origendsample = self._sample_index(origwav.samplerate, nextword["end"])
            synthendsample = self._sample_index(synthwav.samplerate, nextword["end"])
            nextwordpronun = pronuns[wordindex + 1]
        except TypeError:
            nextwordname = "NONE"
            origendsample = len(origwav.samples)
            #each waveform ends at its own length:
            synthendsample = len(synthwav.samples)
            nextwordpronun = ""

        self.label_word1.set_label(prevwordname)
        self.label_word2.set_label(wordname)
        self.label_word3.set_label(nextwordname)

        self.entry_word1.set_text(prevwordpronun)
        self.entry_word2.set_text(wordpronun)
        self.entry_word3.set_text(nextwordpronun)

        self.origwordcontextwav = self._cut_waveform(origwav, origstartsample, origendsample)
        self._show_wordcontext_specgram(self.origwordcontextwav, self.frame_wordspecorig)

        self.synthwordcontextwav = self._cut_waveform(synthwav, synthstartsample, synthendsample)
        self._show_wordcontext_specgram(self.synthwordcontextwav, self.frame_wordspecsynth)

        self.statusbar.push(0, "Item: %s/%s (Word index: %s)" % (self.corpusview.current_index + 1, len(self.corpusview.worklist), wordindex))
        self.table_words.show_all()

    def savepronuns(self, wordindex):
        """ Store the three pronunciation entries for the words around
            wordindex in the corpus view.

            wordindex -- index of the word shown in the middle entry.
        """
        pronuns = self.corpusview.pronuns[self._current_key()]
        if wordindex != 0:
            pronuns[wordindex-1] = unicode(self.entry_word1.get_text(), "utf-8")
        pronuns[wordindex] = unicode(self.entry_word2.get_text(), "utf-8")
        try:
            pronuns[wordindex+1] = unicode(self.entry_word3.get_text(), "utf-8")
        except IndexError:
            #no following word to store a pronunciation for
            pass

    def change_wordview(self, button):
        """ Handler of the per-word buttons: save the pronunciations of
            the word shown so far, then show the word of the button.
        """
        self.savepronuns(self.corpusview.current_wordindex)
        self.corpusview.current_wordindex = button.wordindex
        self.update_wordview()

    def update_uttview(self):
        """ Refresh the utterance panel (spectrogram, transcription,
            comment, word buttons) and then the word panel.
        """
        utt = self.corpusview.current_utt
        origspeech_specfig = Figure(dpi=72)
        origspeech_specplot = origspeech_specfig.add_subplot(111)
        origspeech_specplot.specgram(utt["waveform"].samples, Fs=utt["waveform"].samplerate, NFFT=128, noverlap=64)
        self._replace_frame_child(self.frame_specutt, FigureCanvasGTK(origspeech_specfig))
        key = self._current_key()
        self.entry_transcription.set_text(self.corpusview.transcriptions[key])
        self.entry_comment.set_text(self.corpusview.comments[key])
        #one button per word; its index is kept on the button for
        #change_wordview:
        self.buttonbox_words = gtk.HButtonBox()
        words = utt.get_relation("Word").as_list()
        for i, word in enumerate(words):
            button = gtk.Button()
            button.wordindex = i
            button.connect("clicked", self.change_wordview)
            button.set_label(word["name"])
            self.buttonbox_words.pack_end(button)
        self._replace_frame_child(self.frame_words, self.buttonbox_words)
        self.table_utt.show_all()
        self.update_wordview()

    def on_button_next_clicked(self, obj):
        """ Save the current edits and move to the next utterance. """
        self._store_current_edits()
        self.corpusview.next()
        self.update_uttview()

    def on_button_prev_clicked(self, obj):
        """ Save the current edits and move to the previous utterance. """
        self._store_current_edits()
        self.corpusview.prev()
        self.update_uttview()

    def on_button_playutt_clicked(self, obj):
        """ Play the original waveform of the current utterance. """
        self.corpusview.current_utt["waveform"].play()

    def on_button_playwordorig_clicked(self, obj):
        """ Play the word context cut from the original waveform. """
        self.origwordcontextwav.play()

    def on_button_playwordsynth_clicked(self, obj):
        """ Play the word context cut from the synthesised waveform. """
        self.synthwordcontextwav.play()

    def on_toolbutton_open_clicked(self, obj):
        """ Let the user pick a worklist file and load it.

            The views are refreshed only when a worklist was actually
            loaded, so cancelling the dialog leaves the window (and any
            unsaved entry text) untouched. The dialog is destroyed even
            if loading fails.
        """
        chooser = gtk.FileChooserDialog(title=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL,
                                                 gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN,
                                                 gtk.RESPONSE_OK))
        chooser.set_current_folder(os.getcwd())
        loaded = False
        try:
            response = chooser.run()
            if response == gtk.RESPONSE_OK:
                filename = chooser.get_filename()
                worklist = loadworklist(filename)
                self.corpusview = CorpusView(worklist, self.voice)
                loaded = True
            elif response == gtk.RESPONSE_CANCEL:
                print('Closed, no files selected')
        finally:
            chooser.destroy()
        if loaded:
            #update_uttview refreshes the word panel as well
            self.update_uttview()
Esempio n. 9
0
def test2(testfn="samples/data_001d.wav"):
    """ Print the result of inhaletimeinfo for the samples of one
        audio file.

        testfn -- path of the audio file to load.
    """
    waveform = Waveform(testfn)
    info = inhaletimeinfo(waveform.samples)
    print(info)
Esempio n. 10
0
class SpeechbrowserApp(object):       
    def __init__(self, phmap):
        builder = gtk.Builder()
        builder.add_from_file(os.path.join(os.getenv("TTSLABDEV_ROOT"), "voicetools/speechbrowser", "speechbrowser.glade"))
        builder.connect_signals({"on_window1_destroy": gtk.main_quit,
                                 "on_toolbutton_open_clicked": self.on_toolbutton_open_clicked,
                                 "on_button_playutt_clicked": self.on_button_playutt_clicked,
                                 "on_button_playwordorig_clicked": self.on_button_playwordorig_clicked,
                                 "on_button_playwordsynth_clicked": self.on_button_playwordsynth_clicked,
                                 "on_button_next_clicked": self.on_button_next_clicked,
                                 "on_button_prev_clicked": self.on_button_prev_clicked})
        self.window1 = builder.get_object("window1")
        self.frame_specutt = builder.get_object("frame_specutt")
        self.button_playutt = builder.get_object("button_playutt")
        self.frame_words = builder.get_object("frame_words")
        self.entry_transcription = builder.get_object("entry_transcription")
        self.table_utt = builder.get_object("table_utt")
        self.table_words = builder.get_object("table_words")
        self.frame_wordspecorig = builder.get_object("frame_wordspecorig")
        self.frame_wordspecsynth = builder.get_object("frame_wordspecsynth")
        self.button_playwordorig = builder.get_object("button_playwordorig")
        self.button_playwordsynth = builder.get_object("button_playwordsynth")
        self.label_word1 = builder.get_object("label_word1")
        self.label_word2 = builder.get_object("label_word2")
        self.label_word3 = builder.get_object("label_word3")
        self.entry_word1 = builder.get_object("entry_word1")
        self.entry_word2 = builder.get_object("entry_word2")
        self.entry_word3 = builder.get_object("entry_word3")
        self.statusbar = builder.get_object("statusbar")
        self.entry_comment = builder.get_object("entry_comment")
        # self.combobox_comment = builder.get_object("combobox_comment")
        # liststore = gtk.ListStore(gobject.TYPE_STRING)
        # self.combobox_comment.set_model(liststore)
        # self.combobox_comment.set_entry_text_column(0)
        # self.combobox_comment.append_text("transcription error")
        # self.combobox_comment.append_text("pronunciation error")
        # self.combobox_comment.append_text("noise present")
        # self.combobox_comment.append_text("no problem")
        # cell = gtk.CellRendererText()
        # self.combobox_comment.pack_start(cell, True)
        # self.combobox_comment.add_attribute(cell, 'text', 1)

        self.window1.show()

        self.phmap = phmap


    def update_wordview(self):
        u = self.corpusview.current_utt
        words = u.get_relation("SylStructure").as_list()
        word = words[self.corpusview.current_wordindex]
        try:
            prevword = word.prev_item
            prevwordname = prevword["name"]
            origstartsample = u["waveform"].samplerate * prevword["start"]
            synthstartsample = u["lindists"]["utt"]["waveform"].samplerate * prevword["start"]
            prevwordpronun = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][self.corpusview.current_wordindex-1]
        except TypeError:
            prevwordname = "NONE"
            origstartsample = 0
            synthstartsample = 0
            prevwordpronun = ""
        wordname = word["name"]
        wordpronun = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][self.corpusview.current_wordindex]
        try:
            nextword = word.next_item
            nextwordname = nextword["name"]
            origendsample = u["waveform"].samplerate * nextword["end"]
            synthendsample = u["lindists"]["utt"]["waveform"].samplerate * nextword["end"]
            nextwordpronun = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][self.corpusview.current_wordindex+1]
        except TypeError:
            nextwordname = "NONE"
            origendsample = len(u["waveform"].samples)
            synthendsample = len(u["waveform"].samples)
            nextwordpronun = ""
            
        self.label_word1.set_label(prevwordname)
        self.label_word2.set_label(wordname)
        self.label_word3.set_label(nextwordname)

        self.entry_word1.set_text(prevwordpronun)
        self.entry_word2.set_text(wordpronun)
        self.entry_word3.set_text(nextwordpronun)

        self.origwordcontextwav = Waveform()
        self.origwordcontextwav.samplerate = u["waveform"].samplerate
        self.origwordcontextwav.samples = u["waveform"].samples[origstartsample:origendsample]
        origwordcontext_specfig = Figure(dpi=72)
        origwordcontext_specplot = origwordcontext_specfig.add_subplot(111)
        origwordcontext_specplot.specgram(self.origwordcontextwav.samples,
                                          Fs=self.origwordcontextwav.samplerate,
                                          NFFT=128, noverlap=64,
                                          xextent=(0.0, self.origwordcontextwav.samplerate*len(self.origwordcontextwav.samples)))
        origwordcontext_speccanvas = FigureCanvasGTK(origwordcontext_specfig)
        framecontents = self.frame_wordspecorig.get_children()
        if framecontents:
            self.frame_wordspecorig.remove(framecontents[0])
        self.frame_wordspecorig.add(origwordcontext_speccanvas)

        self.synthwordcontextwav = Waveform()
        self.synthwordcontextwav.samplerate = u["lindists"]["utt"]["waveform"].samplerate
        self.synthwordcontextwav.samples = u["lindists"]["utt"]["waveform"].samples[synthstartsample:synthendsample] 
        synthwordcontext_specfig = Figure(dpi=72)
        synthwordcontext_specplot = synthwordcontext_specfig.add_subplot(111)
        synthwordcontext_specplot.specgram(self.synthwordcontextwav.samples,
                                           Fs=self.synthwordcontextwav.samplerate,
                                           NFFT=128, noverlap=64,
                                           xextent=(0.0, self.synthwordcontextwav.samplerate*len(self.synthwordcontextwav.samples)))
        synthwordcontext_speccanvas = FigureCanvasGTK(synthwordcontext_specfig)
        framecontents = self.frame_wordspecsynth.get_children()
        if framecontents:
            self.frame_wordspecsynth.remove(framecontents[0])
        self.frame_wordspecsynth.add(synthwordcontext_speccanvas)
       
        self.statusbar.push(0, "Item: %s/%s (Word index: %s)" % (self.corpusview.current_index + 1, len(self.corpusview.worklist), self.corpusview.current_wordindex))
        self.table_words.show_all()


    def savepronuns(self, wordindex):
        if wordindex != 0:
            self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][wordindex-1] = unicode(self.entry_word1.get_text(), "utf-8")
        self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][wordindex] = unicode(self.entry_word2.get_text(), "utf-8")
        try:
            self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][wordindex+1] = unicode(self.entry_word3.get_text(), "utf-8")
        except IndexError:
            pass


    def change_wordview(self, button):
        self.savepronuns(self.corpusview.current_wordindex)
        self.corpusview.current_wordindex = button.wordindex
        self.update_wordview()

    def update_uttview(self):
        utt = self.corpusview.current_utt
        origspeech_specfig = Figure(dpi=72)
        origspeech_specplot = origspeech_specfig.add_subplot(111)
        origspeech_specplot.specgram(utt["waveform"].samples, Fs=utt["waveform"].samplerate, NFFT=128, noverlap=64)
        origspeech_speccanvas = FigureCanvasGTK(origspeech_specfig)
        framecontents = self.frame_specutt.get_children()
        if framecontents:
            self.frame_specutt.remove(framecontents[0])
        self.frame_specutt.add(origspeech_speccanvas)
        self.entry_transcription.set_text(self.corpusview.transcriptions[self.corpusview.worklist[self.corpusview.current_index][0]])
        self.entry_comment.set_text(self.corpusview.comments[self.corpusview.worklist[self.corpusview.current_index][0]])
        self.buttonbox_words = gtk.HButtonBox()
        words = utt.get_relation("Word").as_list()
        for i, word in enumerate(words):
            button = gtk.Button()
            button.wordindex = i
            button.connect("clicked", self.change_wordview)
            button.set_label(word["name"])
            self.buttonbox_words.pack_end(button)
        framecontents = self.frame_words.get_children()
        if framecontents:
            self.frame_words.remove(framecontents[0])
        self.frame_words.add(self.buttonbox_words)
        self.table_utt.show_all()
        self.update_wordview()

    def on_button_next_clicked(self, obj):
        self.corpusview.transcriptions[self.corpusview.worklist[self.corpusview.current_index][0]] = unicode(self.entry_transcription.get_text(), "utf-8")
        self.corpusview.comments[self.corpusview.worklist[self.corpusview.current_index][0]] = unicode(self.entry_comment.get_text(), "utf-8")
        self.savepronuns(self.corpusview.current_wordindex)
        self.corpusview.next()
        self.update_uttview()

    def on_button_prev_clicked(self, obj):
        """Store the edited fields of the current item, then show the previous one.

        The transcription and comment entry texts are decoded from UTF-8 and
        stored under the first element of the current worklist entry; the
        pronunciations of the current word are saved via ``savepronuns``
        before the corpus view steps back and the utterance view is redrawn.

        ``obj`` is accepted but not used.
        """
        corpus = self.corpusview
        #(destination mapping, source entry widget) pairs, in save order:
        fields = ((corpus.transcriptions, self.entry_transcription),
                  (corpus.comments, self.entry_comment))
        for store, entry in fields:
            store[corpus.worklist[corpus.current_index][0]] = unicode(entry.get_text(), "utf-8")
        self.savepronuns(corpus.current_wordindex)
        #step back and redraw:
        self.corpusview.prev()
        self.update_uttview()

    def on_button_playutt_clicked(self, obj):
        """Play the waveform stored on the current utterance.

        ``obj`` is accepted but not used.
        """
        utt = self.corpusview.current_utt
        utt["waveform"].play()

    def on_button_playwordorig_clicked(self, obj):
        """Play the waveform held in ``self.origwordcontextwav``.

        ``obj`` is accepted but not used.
        """
        wav = self.origwordcontextwav
        wav.play()

    def on_button_playwordsynth_clicked(self, obj):
        """Play the waveform held in ``self.synthwordcontextwav``.

        ``obj`` is accepted but not used.
        """
        wav = self.synthwordcontextwav
        wav.play()

    def on_toolbutton_open_clicked(self, obj):
        """Ask the user for a worklist file and load it into the views.

        A file chooser is opened in the current working directory. When the
        user confirms a selection, the file is read with ``loadworklist``, a
        new ``CorpusView`` (built with ``self.phmap``) replaces
        ``self.corpusview`` and the utterance and word views are refreshed.

        When the dialog is cancelled or otherwise closed, nothing was loaded,
        so the views are left untouched. The dialog is destroyed in all
        cases, including when loading the worklist raises.

        ``obj`` is accepted but not used.
        """
        chooser = gtk.FileChooserDialog(title=None,
                                        action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                        buttons=(gtk.STOCK_CANCEL,
                                                 gtk.RESPONSE_CANCEL,
                                                 gtk.STOCK_OPEN,
                                                 gtk.RESPONSE_OK))
        loaded = False
        try:
            chooser.set_current_folder(os.getcwd())
            response = chooser.run()
            if response == gtk.RESPONSE_OK:
                filename = chooser.get_filename()
                worklist = loadworklist(filename)
                self.corpusview = CorpusView(worklist, self.phmap)
                loaded = True
            elif response == gtk.RESPONSE_CANCEL:
                print('Closed, no files selected')
        finally:
            #always dismiss the dialog, even if loading the worklist failed:
            chooser.destroy()
        #only redraw when there is actually a new corpus to show:
        if loaded:
            self.update_uttview()
            self.update_wordview()
Esempio n. 11
0
    def update_wordview(self):
        """Refresh the word panel for the currently selected word.

        Shows the names and pronunciations of the previous, current and next
        word, cuts the matching stretch (start of the previous word to end of
        the next word) out of both the original and the synthesized waveform,
        stores the two excerpts in ``self.origwordcontextwav`` and
        ``self.synthwordcontextwav`` and displays a spectrogram of each.
        Finally the status bar is updated and the word table is shown.

        When a neighbouring word is unavailable the label reads "NONE", the
        pronunciation is empty and the excerpt extends to the beginning or
        end of the respective waveform.
        """
        u = self.corpusview.current_utt
        origwav = u["waveform"]
        synthwav = u["lindists"]["utt"]["waveform"]
        wordindex = self.corpusview.current_wordindex
        pronuns = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]]
        words = u.get_relation("SylStructure").as_list()
        word = words[wordindex]

        #previous word; presumably prev_item is None for the first word, in
        #which case the lookups below raise TypeError and we fall back:
        try:
            prevword = word.prev_item
            prevwordname = prevword["name"]
            #sample positions are used as slice bounds, so they must be ints:
            origstartsample = int(origwav.samplerate * prevword["start"])
            synthstartsample = int(synthwav.samplerate * prevword["start"])
            prevwordpronun = pronuns[wordindex - 1]
        except TypeError:
            prevwordname = "NONE"
            origstartsample = 0
            synthstartsample = 0
            prevwordpronun = ""

        wordname = word["name"]
        wordpronun = pronuns[wordindex]

        #next word; same fallback strategy as for the previous word:
        try:
            nextword = word.next_item
            nextwordname = nextword["name"]
            origendsample = int(origwav.samplerate * nextword["end"])
            synthendsample = int(synthwav.samplerate * nextword["end"])
            nextwordpronun = pronuns[wordindex + 1]
        except TypeError:
            nextwordname = "NONE"
            origendsample = len(origwav.samples)
            #each excerpt runs to the end of its *own* waveform:
            synthendsample = len(synthwav.samples)
            nextwordpronun = ""

        self.label_word1.set_label(prevwordname)
        self.label_word2.set_label(wordname)
        self.label_word3.set_label(nextwordname)

        self.entry_word1.set_text(prevwordpronun)
        self.entry_word2.set_text(wordpronun)
        self.entry_word3.set_text(nextwordpronun)

        #original speech excerpt:
        self.origwordcontextwav = Waveform()
        self.origwordcontextwav.samplerate = origwav.samplerate
        self.origwordcontextwav.samples = origwav.samples[origstartsample:origendsample]
        self._show_wordcontext_spec(self.frame_wordspecorig, self.origwordcontextwav)

        #synthesized speech excerpt:
        self.synthwordcontextwav = Waveform()
        self.synthwordcontextwav.samplerate = synthwav.samplerate
        self.synthwordcontextwav.samples = synthwav.samples[synthstartsample:synthendsample]
        self._show_wordcontext_spec(self.frame_wordspecsynth, self.synthwordcontextwav)

        self.statusbar.push(0, "Item: %s/%s (Word index: %s)" % (self.corpusview.current_index + 1, len(self.corpusview.worklist), self.corpusview.current_wordindex))
        self.table_words.show_all()

    def _show_wordcontext_spec(self, frame, wav):
        """Draw the spectrogram of ``wav`` on a new canvas and put it in ``frame``.

        The frame's current first child, if any, is removed before the new
        canvas is added.
        """
        specfig = Figure(dpi=72)
        specplot = specfig.add_subplot(111)
        #x axis spans the excerpt duration: number of samples / sample rate
        duration = len(wav.samples) / float(wav.samplerate)
        specplot.specgram(wav.samples,
                          Fs=wav.samplerate,
                          NFFT=128, noverlap=64,
                          xextent=(0.0, duration))
        speccanvas = FigureCanvasGTK(specfig)
        framecontents = frame.get_children()
        if framecontents:
            frame.remove(framecontents[0])
        frame.add(speccanvas)
Esempio n. 12
0
    def hts_synth(self, utt, processname):
        """Run the external HTS engine on ``utt`` and load its outputs.

        The command line is built from ``self.hts_bin`` and a copy of
        ``self.engine_parms`` (updated with ``utt["htsparms"]`` when present);
        parameters with a false value are skipped, parameters that are
        ``True`` are passed as bare flags and all others as ``key value``.
        ``%(...)s`` placeholders in the command are filled with
        ``self.models_dir`` and the temporary file names.

        The labels in ``utt["hts_label"]`` are written to the input label
        file, the command is run, and afterwards:

        * each item of the "Segment" relation gets its ``"end"`` from the
          second field of the corresponding output label line,
        * ``utt["waveform"]`` is loaded from the generated wave file,
        * ``utt["f0"]`` is set to a ``Track`` built from the generated lf0
          file.

        All temporary files are removed before returning, also when one of
        the steps raises.

        ``processname`` is accepted but not used. Returns ``utt``.
        """
        htsparms = self.engine_parms.copy()
        htsparms["-of"] = "%(tempolf0_file)s"
        if "htsparms" in utt:
            htsparms.update(utt["htsparms"])   #parm overrides for this utt...

        #build command string:
        cmds = self.hts_bin
        for k, v in htsparms.items():
            if v:
                if v is True:
                    cmds += " " + k          #bare flag
                else:
                    cmds += " " + k + " " + str(v)
        cmds += " %(tempilab_file)s"

        tempfiles = []
        try:
            #order: wav, input labels, output labels, output lf0.
            #Only the paths are needed (everything below opens the files by
            #name), so the descriptors are closed right away.
            for suffix in (".wav", "", "", ""):
                fd, path = mkstemp(prefix="ttslab_", suffix=suffix)
                tempfiles.append(path)
                os.close(fd)
            tempwav_file, tempilab_file, tempolab_file, tempolf0_file = tempfiles

            cmds = cmds % {'models_dir': self.models_dir,
                           'tempwav_file': tempwav_file,
                           'tempilab_file': tempilab_file,
                           'tempolab_file': tempolab_file,
                           'tempolf0_file': tempolf0_file}
            #print(cmds)
            with codecs.open(tempilab_file, "w", encoding="utf-8") as outfh:
                outfh.write("\n".join(utt["hts_label"]))

            #execute:
            os.system(cmds)

            #load seg endtimes into utt:
            with open(tempolab_file) as infh:
                lines = infh.readlines()
            segs = utt.get_relation("Segment").as_list()
            assert len(segs) == len(lines)
            for line, seg in zip(lines, segs):
                seg["end"] = hts_labels_tone.htk_int_to_float(line.split()[1])

            #load audio:
            utt["waveform"] = Waveform(tempwav_file)

            #load lf0 (float32 values) and convert log-F0 to hertz:
            f0 = np.exp(np.fromfile(tempolf0_file, "float32"))
            #non-zero values to semitones relative to 1Hz:
            #12 * log2 (F0 / F0reference) where F0reference = 1
            voiced = f0.nonzero()
            f0[voiced] = 12.0 * np.log2(f0[voiced])
            f0t = Track()
            f0t.values = f0
            #NOTE(review): 0.005 looks like a 5 ms frame shift -- confirm
            #that it matches the engine parameters.
            f0t.times = np.arange(len(f0), dtype=np.float64) * 0.005
            utt["f0"] = f0t
        finally:
            #cleanup tempfiles, without masking an error raised above:
            for path in tempfiles:
                try:
                    os.remove(path)
                except OSError:
                    pass

        return utt