def get_wav(self):
    """Return the synthesized audio as a mono, 16-bit Waveform.

    Must only be called after synthesis has completed
    (self.donesynth is asserted).
    """
    assert self.donesynth
    nsamples = LIBHTS.HTS_Engine_get_nsamples(self.engine)
    wav = Waveform()
    wav.samplerate = int(
        LIBHTS.HTS_Engine_get_sampling_frequency(self.engine))
    wav.channels = 1
    # Copy the generated speech out of the engine, sample by sample,
    # into a 16-bit buffer:
    wav.samples = np.zeros(nsamples, np.int16)
    for idx in range(nsamples):
        wav.samples[idx] = LIBHTS.HTS_Engine_get_generated_speech(
            self.engine, idx)
    return wav
def test(testfn="samples/data_001d.wav"):
    """Demo: high-pass filter a sample file, run simple detection and plot.

    Prints the detected segments and shows the plots.
    """
    audio = Waveform(testfn)
    cleaned = zero_startends(audio.samples)
    hpf = make_hpf()
    filtered = filt(hpf, cleaned)
    detected = detectsimple(filtered)
    # Overlay input, filter magnitude and (scaled) detection signal:
    plot(cleaned)
    plot(np.abs(filtered))
    plot(detected / 1000.0)
    print(find_segs(detected))
    show()
def _concatunits(self, utt, args):
    """ Concatenates units and produces waveform via residual excited
        LPC synthesis filter: joins the per-unit LPC coefficient
        tracks, overlap-adds the windowed residual frames centred on
        each pitchmark time, runs the synthesis filter and attaches the
        result to the utterance as a mono 16-bit Waveform.

        Returns the utterance with utt["waveform"] set.
    """
    unit_rel = utt.get_relation("Unit")
    #concat:
    unit_item = unit_rel.head_item
    lpctrack = copy.deepcopy(
        unit_item["selected_unit"]["candidate"]["lpc-coefs"])
    residuals = window_residual(
        unit_item["selected_unit"]["candidate"]["lpc-coefs"],
        unit_item["selected_unit"]["candidate"]["residuals"])
    unit_item = unit_item.next_item
    while unit_item is not None:
        temptrack = unit_item["selected_unit"]["candidate"]["lpc-coefs"]
        #append lpccoefs to lpctrack (times shifted so units follow on):
        lpctrack.times = np.concatenate(
            (lpctrack.times, (temptrack.times + lpctrack.times[-1])))
        lpctrack.values = np.concatenate(
            (lpctrack.values, temptrack.values))
        #append windowed residuals:
        residuals.extend(
            window_residual(
                temptrack,
                unit_item["selected_unit"]["candidate"]["residuals"]))
        unit_item = unit_item.next_item
    #overlap add residual:
    lastsample = int(round(lpctrack.times[-1] * SAMPLERATE)) + int(
        round(len(residuals[-1]) / 2))
    residual = np.zeros(lastsample + 1)
    for i, time in enumerate(lpctrack.times):
        centersample = int(round(time * SAMPLERATE))
        firstsample = centersample - int(len(residuals[i]) / 2)
        residual[firstsample:firstsample + len(residuals[i])] += np.array(
            residuals[i])
    #synth filter (np.float was removed in NumPy 1.24; np.float64 is the
    #equivalent concrete type):
    samples = synth_filter(lpctrack.times, lpctrack.values,
                           residual.astype(np.float64), SAMPLERATE)
    #save in utterance:
    w = Waveform()
    w.samplerate = SAMPLERATE
    w.samples = samples.astype("int16")  #16bit samples
    w.channels = 1
    utt["waveform"] = w
    return utt
def from_textgrid(voice):
    """ Create aligned Utterances by synthesising to Segment level from
        the orthography and simply copying label end times into segment
        items as "end" feature.

        Reads transcriptions and TextGrid alignments from the
        conventional corpus layout under the current working directory
        and writes one utterance file per item into ALIGNED_UTT_DIR.
    """
    #Setup and create necessary dirs...
    CWD = os.getcwd()
    uttwav_dir = os.path.join(CWD, UTTWAV_DIR)
    transcr_location = os.path.join(CWD, ETC_DIR, TRANSCR_FILE)
    textgrid_dir = os.path.join(CWD, TEXTGRID_DIR)
    aligned_utts_dir = os.path.join(CWD, ALIGNED_UTT_DIR)
    os.makedirs(aligned_utts_dir)
    #update utts from textgrids...
    transcriptions = load_transcriptions_schemefile(transcr_location)
    alignments = sl.Corpus(textgrid_dir)
    #################
    for sc_utt, uttname, wavfilename in zip(
            alignments.utterances, sorted(transcriptions),
            sorted(glob(os.path.join(uttwav_dir, "*")))):
        #the three inputs are matched by sorted name order; verify:
        assert sc_utt.name == uttname, "Utterance mismatch..."
        assert os.path.basename(wavfilename).startswith(
            uttname), "Utterance mismatch..."
        print("Synthesizing:", uttname)
        utt = voice.synthesize(transcriptions[uttname], 'text-to-segments')
        utt["file_id"] = uttname
        utt = transplant_segtime_info(voice, sc_utt, utt)
        #add waveform to utt:
        utt["waveform"] = Waveform(wavfilename)
        #save utt...
        ttslab.tofile(
            utt,
            os.path.join(aligned_utts_dir, ".".join([uttname, UTT_EXT])))
def make_aligned_utts(voice, transcriptions, sc_corpus, wav_dir, output_dir):
    """ Make Word level utts and complete from 3-tier TextGrids...

        Synthesizes each transcription to word level, completes timing
        from the corresponding TextGrid, then copies extra Word and
        Syllable item features over from a segment-level synthesis of
        the same text. Saves one utterance file per item in output_dir.
    """
    def copyuttfeats(u, u2):
        #copy Word/Syllable item features from u2 into u without
        #overwriting features u already has; item names must match
        for relname in ["Word", "Syllable"]:
            items = u.gr(relname).as_list()
            items2 = u2.gr(relname).as_list()
            assert [i["name"] for i in items] == [i2["name"] for i2 in items2]
            for i, i2 in zip(items, items2):
                for k in i2:
                    if k not in i:  #idiomatic "not in" (was "not k in i")
                        i[k] = i2[k]
        return u
    for sc_utt, uttname, wavfilename in zip(
            sc_corpus.utterances, sorted(transcriptions),
            sorted(glob(os.path.join(wav_dir, "*")))):
        #the three inputs are matched by sorted name order; verify:
        assert sc_utt.name == uttname, "Utterance mismatch..."
        assert os.path.basename(wavfilename).startswith(
            uttname), "Utterance mismatch..."
        print("Synthesizing:", uttname)
        utt = voice.synthesize(transcriptions[uttname], 'text-to-words')
        utt["file_id"] = uttname
        utt = complete_utt_from_textgrid(voice, sc_utt, utt)
        utt2 = voice.synthesize(transcriptions[uttname], 'text-to-segments')
        try:
            utt = copyuttfeats(utt, utt2)
        except AssertionError:
            #item names differed between the two syntheses; best-effort
            print("WARNING: could not copy item feats for %s"
                  % utt["file_id"])
        #add waveform to utt:
        utt["waveform"] = Waveform(wavfilename)
        #save utt...
        ttslab.tofile(utt,
                      os.path.join(output_dir, ".".join([uttname, UTT_EXT])))
def make_aligned_utts(voice, transcriptions, sc_corpus, wav_dir, output_dir):
    """ Make Word level utts and complete from 3-tier TextGrids...
    """
    wavfiles = sorted(glob(os.path.join(wav_dir, "*")))
    triples = zip(sc_corpus.utterances, sorted(transcriptions), wavfiles)
    for scutt, name, wavfn in triples:
        # items are matched by sorted name order; check correspondence
        assert scutt.name == name, "Utterance missmatch..."
        assert os.path.basename(wavfn).startswith(name), \
            "Utterance missmatch..."
        print("Synthesizing:", name)
        utt = voice.synthesize(transcriptions[name], 'text-to-words')
        utt["file_id"] = name
        utt = complete_utt_from_textgrid(voice, scutt, utt)
        # attach the recorded audio:
        utt["waveform"] = Waveform(wavfn)
        # write the utterance to disk:
        outpath = os.path.join(output_dir, ".".join([name, UTT_EXT]))
        ttslab.tofile(utt, outpath)
def update_wordview(self):
    """Refresh the word view for the currently selected word.

    Updates the previous/current/next word labels and pronunciation
    entries, and redraws the original and synthesized audio-context
    spectrograms in their frames. Reads all state from
    self.corpusview and the current utterance's "waveform" and
    "lindists" features.
    """
    u = self.corpusview.current_utt
    words = u.get_relation("SylStructure").as_list()
    word = words[self.corpusview.current_wordindex]
    try:
        prevword = word.prev_item
        prevwordname = prevword["name"]
        # sample indices must be ints for array slicing below
        # (was a float: samplerate * time)
        origstartsample = int(u["waveform"].samplerate * prevword["start"])
        synthstartsample = int(u["lindists"]["utt"]["waveform"].samplerate * prevword["start"])
        prevwordpronun = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][self.corpusview.current_wordindex - 1]
    except TypeError:
        # no previous word: prev_item is None and subscription raises
        prevwordname = "NONE"
        origstartsample = 0
        synthstartsample = 0
        prevwordpronun = ""
    wordname = word["name"]
    wordpronun = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][self.corpusview.current_wordindex]
    try:
        nextword = word.next_item
        nextwordname = nextword["name"]
        origendsample = int(u["waveform"].samplerate * nextword["end"])
        synthendsample = int(u["lindists"]["utt"]["waveform"].samplerate * nextword["end"])
        nextwordpronun = self.corpusview.pronuns[self.corpusview.worklist[self.corpusview.current_index][0]][self.corpusview.current_wordindex + 1]
    except TypeError:
        # no next word: slice to the end of each waveform
        # (fixed: the synth endpoint previously used the ORIGINAL
        # waveform's length)
        nextwordname = "NONE"
        origendsample = len(u["waveform"].samples)
        synthendsample = len(u["lindists"]["utt"]["waveform"].samples)
        nextwordpronun = ""
    self.label_word1.set_label(prevwordname)
    self.label_word2.set_label(wordname)
    self.label_word3.set_label(nextwordname)
    self.entry_word1.set_text(prevwordpronun)
    self.entry_word2.set_text(wordpronun)
    self.entry_word3.set_text(nextwordpronun)
    # original audio context spectrogram:
    self.origwordcontextwav = Waveform()
    self.origwordcontextwav.samplerate = u["waveform"].samplerate
    self.origwordcontextwav.samples = u["waveform"].samples[origstartsample:origendsample]
    origwordcontext_specfig = Figure(dpi=72)
    origwordcontext_specplot = origwordcontext_specfig.add_subplot(111)
    # xextent is a time extent in seconds: nsamples / Fs
    # (was Fs * nsamples, which is dimensionally wrong)
    origwordcontext_specplot.specgram(self.origwordcontextwav.samples,
                                      Fs=self.origwordcontextwav.samplerate,
                                      NFFT=128, noverlap=64,
                                      xextent=(0.0, len(self.origwordcontextwav.samples) / float(self.origwordcontextwav.samplerate)))
    origwordcontext_speccanvas = FigureCanvasGTK(origwordcontext_specfig)
    framecontents = self.frame_wordspecorig.get_children()
    if framecontents:
        self.frame_wordspecorig.remove(framecontents[0])
    self.frame_wordspecorig.add(origwordcontext_speccanvas)
    # synthesized audio context spectrogram:
    self.synthwordcontextwav = Waveform()
    self.synthwordcontextwav.samplerate = u["lindists"]["utt"]["waveform"].samplerate
    self.synthwordcontextwav.samples = u["lindists"]["utt"]["waveform"].samples[synthstartsample:synthendsample]
    synthwordcontext_specfig = Figure(dpi=72)
    synthwordcontext_specplot = synthwordcontext_specfig.add_subplot(111)
    synthwordcontext_specplot.specgram(self.synthwordcontextwav.samples,
                                       Fs=self.synthwordcontextwav.samplerate,
                                       NFFT=128, noverlap=64,
                                       xextent=(0.0, len(self.synthwordcontextwav.samples) / float(self.synthwordcontextwav.samplerate)))
    synthwordcontext_speccanvas = FigureCanvasGTK(synthwordcontext_specfig)
    framecontents = self.frame_wordspecsynth.get_children()
    if framecontents:
        self.frame_wordspecsynth.remove(framecontents[0])
    self.frame_wordspecsynth.add(synthwordcontext_speccanvas)
    self.statusbar.push(0, "Item: %s/%s (Word index: %s)" % (self.corpusview.current_index + 1,
                                                             len(self.corpusview.worklist),
                                                             self.corpusview.current_wordindex))
    self.table_words.show_all()
def test2(testfn="samples/data_001d.wav"):
    """Demo: print inhalation timing info for the given audio file."""
    waveform = Waveform(testfn)
    info = inhaletimeinfo(waveform.samples)
    print(info)
def hts_synth(self, utt, processname):
    """Synthesize the utterance by running the external HTS engine binary.

    Writes utt["hts_label"] to a temp label file, builds a command line
    from self.engine_parms (optionally overridden per-utterance via
    utt["htsparms"]) and executes self.hts_bin on it. Afterwards it:
    copies segment end times from the engine's output label file into
    the Segment relation, loads the generated audio into
    utt["waveform"], and loads the generated lf0 as an F0 Track in
    semitones (relative to 1 Hz) into utt["f0"].

    Returns the updated utterance. `processname` is not used in this
    body. All four temp files are created via mkstemp and removed at
    the end (no try/finally: an exception mid-way leaks them).
    """
    htsparms = self.engine_parms.copy()
    htsparms["-of"] = "%(tempolf0_file)s"
    if "htsparms" in utt:
        htsparms.update(utt["htsparms"])  #parm overrides for this utt...
    #build command string and execute:
    #NOTE(review): flags with a True value are emitted bare; flags with
    #falsy values (None/False/""/0) are skipped entirely.
    cmds = self.hts_bin
    for k in htsparms:
        if htsparms[k]:
            if htsparms[k] is True:
                cmds += " " + k
            else:
                cmds += " " + k + " " + str(htsparms[k])
    cmds += " %(tempilab_file)s"
    fd1, tempwav_file = mkstemp(prefix="ttslab_", suffix=".wav")
    fd2, tempilab_file = mkstemp(prefix="ttslab_")
    fd3, tempolab_file = mkstemp(prefix="ttslab_")
    fd4, tempolf0_file = mkstemp(prefix="ttslab_")
    #substitute the temp-file paths into the %(...)s placeholders that
    #self.engine_parms / the "-of" flag above carry:
    cmds = cmds % {'models_dir': self.models_dir,
                   'tempwav_file': tempwav_file,
                   'tempilab_file': tempilab_file,
                   'tempolab_file': tempolab_file,
                   'tempolf0_file': tempolf0_file}
    #print(cmds)
    with codecs.open(tempilab_file, "w", encoding="utf-8") as outfh:
        outfh.write("\n".join(utt["hts_label"]))
    #SECURITY NOTE(review): os.system runs a shell on a string built
    #from htsparms values -- safe only if those values are trusted;
    #subprocess.run with a list would avoid shell interpretation.
    os.system(cmds)
    #load seg endtimes into utt:
    with open(tempolab_file) as infh:
        lines = infh.readlines()
    segs = utt.get_relation("Segment").as_list()
    assert len(segs) == len(lines)
    for line, seg in zip(lines, segs):
        #second field of each HTK-format label line is the end time
        seg["end"] = hts_labels_tone.htk_int_to_float(line.split()[1])
    #load audio:
    utt["waveform"] = Waveform(tempwav_file)
    #load lf0:
    f0 = np.exp(np.fromfile(tempolf0_file, "float32"))  #load and lf0 to hertz
    #to semitones relative to 1Hz (zeros, i.e. unvoiced frames, are left as-is):
    f0[f0.nonzero()] = 12.0 * np.log2(f0[f0.nonzero()])  # 12 * log2 (F0 / F0reference) where F0reference = 1
    f0t = Track()
    f0t.values = f0
    #frame shift: 5 ms per lf0 frame
    f0t.times = np.arange(len(f0), dtype=np.float64) * 0.005
    utt["f0"] = f0t
    #cleanup tempfiles (close the mkstemp fds before unlinking):
    os.close(fd1)
    os.close(fd2)
    os.close(fd3)
    os.close(fd4)
    os.remove(tempwav_file)
    os.remove(tempolab_file)
    os.remove(tempilab_file)
    os.remove(tempolf0_file)
    return utt