def get_f0(args):
    """Extract an F0 track from one audio file and save it to disk.

    ``args`` is a single 7-item sequence
    ``(fn, f0_path, f0min, f0max, tstep, semitones, outf0dir)`` so the
    function can be called with one argument (presumably from a
    ``map``-style worker pool -- confirm against the caller).

    The output file is written into ``outf0dir`` and is named after the
    part of the input basename before its first ".", with ``TRACK_EXT``
    as extension. ``f0_path`` is unpacked but not used here.
    """
    wavfn, _f0_path, f0min, f0max, tstep, semitones, outf0dir = args

    # Everything before the first "." of the file name identifies the item.
    basename = os.path.basename(wavfn).partition(".")[0]
    print("PROCESSING: " + basename)

    track = Track()
    track.name = basename
    track.get_f0(wavfn, f0min, f0max, timestep=tstep, semitones=semitones)

    outfn = os.path.join(outf0dir, basename + "." + TRACK_EXT)
    ttslab.tofile(track, outfn)
def draw_sylstruct_graph_pitch_waveform(u):
    """Plot an utterance's syllable structure, pitch contour and waveform.

    Three separate figures are created:

    1. a graph of the "SylStructure" relation (words, syllables and
       segments on three horizontal levels),
    2. the F0 contour in semitones, with syllable end times as x ticks,
    3. the decimated waveform, with the same x ticks.

    ``u`` is the utterance; it must provide the "SylStructure" and
    "Syllable" relations and a ``u["waveform"]`` entry.

    Returns the tuple ``(fig1, fig2, fig3)``.
    """
    # Use segment end times to calculate start and end times for all items.
    u.fill_startendtimes()

    g = nx.Graph()
    posdict = {}
    nodelist = []
    nodesizelist = []

    def add_node(item, level, size):
        """Register *item* for drawing and link it to its neighbours."""
        nodelist.append(item)
        nodesizelist.append(size)
        # Centre the node on the item's time span. The parentheses matter:
        # "end + start / 2" would place the node beyond the item's end.
        posdict[item] = [(item["start"] + item["end"]) / 2.0, level]
        if item.prev_item:
            g.add_edge(item.prev_item, item)
        if item.next_item:
            g.add_edge(item.next_item, item)

    # Levels on the y axis: 3 = word, 2 = syllable, 1 = segment.
    for word in u.get_relation("SylStructure"):
        add_node(word, 3, 300 * len(str(word)))
        g.add_edge(word.first_daughter, word)
        g.add_edge(word.last_daughter, word)
        for syl in word.get_daughters():
            add_node(syl, 2, 400)
            g.add_edge(syl.first_daughter, syl)
            g.add_edge(syl.last_daughter, syl)
            for seg in syl.get_daughters():
                add_node(seg, 1, 350)

    # Get the pitch: the extractor reads from a file, so the waveform is
    # written to a temporary directory that is removed even on failure.
    d = mkdtemp()
    try:
        wavfn = os.path.join(d, "utt.wav")
        u["waveform"].write(wavfn)
        f0t = Track()
        f0t.get_f0(wavfn, semitones=True)
    finally:
        shutil.rmtree(d)

    # Syllable boundaries and labels are shared by the pitch and waveform
    # figures.
    syltimes = [syl["end"] for syl in u.gr("Syllable")]
    syllabels = [getsylsegstr(syl) for syl in u.gr("Syllable")]

    # Figure 1: syllable structure graph.
    fig1 = plt.figure()
    ax = fig1.add_subplot(111)
    ax.set_title("Utterance")
    nx.draw(g, pos=posdict, ax=ax, nodelist=nodelist, node_size=nodesizelist)
    ax.set_xticks([])
    ax.set_xticklabels([])
    ax.set_yticks([1.0, 2.0, 3.0])
    ax.set_yticklabels(["segment", "syllable", "word"])

    # Figure 2: pitch contour.
    fig2 = plt.figure()
    ax1 = fig2.add_subplot(111)
    ax1.set_title("Pitch")
    ax1.set_ylabel("Semitones (relative to 1 Hz)")
    ax1.set_xlabel("Syllables")
    ax1.plot(f0t.times, f0t.values, color='green')
    ax1.set_ylim(bottom=75.0)
    ax1.set_xticks(syltimes)
    ax1.set_xticklabels(syllabels)
    ax1.grid()

    # Figure 3: waveform, decimated to keep the plot light.
    fig3 = plt.figure()
    ax2 = fig3.add_subplot(111)
    decimate_factor = 10
    ax2.set_title("Waveform (decimation factor: %s)" % decimate_factor)
    ax2.set_ylabel("Amplitude")
    ax2.set_xlabel("Syllables")
    waveform = ss.decimate(u["waveform"].samples, decimate_factor)
    # Time in seconds of each decimated sample.
    sampletimes = np.arange(len(waveform)) * (1.0 / u["waveform"].samplerate * decimate_factor)
    ax2.plot(sampletimes, waveform, color='b')
    ax2.set_xticks(syltimes)
    ax2.set_xticklabels(syllabels)
    ax2.grid()

    return fig1, fig2, fig3
def draw_sylstruct_graph_pitch_waveform(u):
    """Plot syllable structure, pitch and waveform of an utterance.

    A single figure with three stacked subplots is created:

    1. a graph of the "SylStructure" relation (words, syllables and
       segments on three horizontal levels),
    2. the F0 contour in Hertz, with syllable end times as x ticks
       labelled with each syllable's "tone" feature,
    3. the raw waveform samples, with unlabelled x ticks at word ends.

    ``u`` is the utterance; it must provide the "SylStructure",
    "Segment", "Word" and "Syllable" relations and a ``u["waveform"]``
    entry.

    Returns the figure.
    """
    # Use segment end times to calculate start and end times for all items.
    u.fill_startendtimes()

    g = nx.Graph()
    posdict = {}
    nodelist = []
    nodesizelist = []

    def add_node(item, level, size):
        """Register *item* for drawing and link it to its neighbours."""
        nodelist.append(item)
        nodesizelist.append(size)
        # Centre the node on the item's time span. The parentheses matter:
        # "end + start / 2" would place the node beyond the item's end.
        posdict[item] = [(item["start"] + item["end"]) / 2.0, level]
        if item.prev_item:
            g.add_edge(item.prev_item, item)
        if item.next_item:
            g.add_edge(item.next_item, item)

    # Levels on the y axis: 3 = word, 2 = syllable, 1 = segment.
    for word in u.get_relation("SylStructure"):
        add_node(word, 3, 300 * len(str(word)))
        g.add_edge(word.first_daughter, word)
        g.add_edge(word.last_daughter, word)
        for syl in word.get_daughters():
            add_node(syl, 2, 400)
            g.add_edge(syl.first_daughter, syl)
            g.add_edge(syl.last_daughter, syl)
            for seg in syl.get_daughters():
                add_node(seg, 1, 350)

    uttendtime = u.get_relation("Segment").tail_item["end"]
    bounds = np.array([word["end"] for word in u.get_relation("Word")])

    # Get the pitch: the extractor reads from a file, so the waveform is
    # written to a temporary directory that is removed even on failure.
    d = mkdtemp()
    try:
        wavfn = os.path.join(d, "utt.wav")
        u["waveform"].write(wavfn)
        f0t = Track()
        f0t.get_f0(wavfn)
    finally:
        shutil.rmtree(d)

    fig = pl.figure()

    # Top: syllable structure graph.
    ax = fig.add_subplot(311)
    ax.set_title("Utterance")
    nx.draw(g, pos=posdict, ax=ax, nodelist=nodelist, node_size=nodesizelist)

    # Middle: pitch contour.
    ax1 = fig.add_subplot(312)
    ax1.set_title("Pitch")
    ax1.set_ylim(20.0, 300.0)
    ax1.set_ylabel("Hertz")
    ax1.plot(f0t.times, f0t.values, color='green')
    ax1.set_xticks([syl["end"] for syl in u.gr("Syllable")])
    ax1.set_xticklabels([syl["tone"] for syl in u.gr("Syllable")])
    ax1.grid()

    # Bottom: waveform; the x axis is in samples, so times are scaled by
    # the sample rate.
    samplerate = u["waveform"].samplerate
    ax2 = fig.add_subplot(313)
    ax2.set_title("Waveform")
    ax2.set_xlim(0, uttendtime * samplerate)
    ax2.plot(u["waveform"].samples, color='b')
    # Ticks and labels are set separately: on older matplotlib releases the
    # second positional argument of set_xticks is ``minor``, not the labels,
    # so passing the label list there would silently set minor ticks.
    ax2.set_xticks(bounds * samplerate)
    ax2.set_xticklabels([''] * len(bounds))

    fig.set_facecolor((0.921568627451, 0.921568627451, 0.921568627451))
    return fig
import sys
import array
import math

import numpy as np

import ttslab
from ttslab.trackfile import Track
ttslab.extend(Track, "ttslab.trackfile.funcs.tfuncs_praat")


def friendly_log(f):
    """Return the natural log of *f*, or -1e10 where it is undefined.

    ``math.log`` raises ``ValueError`` for zero and negative input (zero
    is what the padding below contains); those values are mapped to the
    large negative constant instead.
    """
    try:
        return math.log(f)
    except ValueError:
        return float('-1e+10')


if __name__ == "__main__":
    if len(sys.argv) < 5:
        sys.exit("usage: %s WAVFILE OUTFILE MINF0 MAXF0" % sys.argv[0])
    fn = sys.argv[1]
    outfn = sys.argv[2]
    minf0 = float(sys.argv[3])
    maxf0 = float(sys.argv[4])

    t = Track()
    # Timestep is hardcoded here because of the padding hack below.
    t.get_f0(fn, minpitch=minf0, maxpitch=maxf0, timestep=0.005, fixocterrs=True)

    # HACK: pad two zero frames on each side to align the samples with the
    # equivalent output of the HTS script.
    pad = np.array([0.0, 0.0]).reshape(-1, 1)
    f0hzvalues = np.concatenate([pad, t.values, pad])

    # Flatten the single-column array so math.log receives scalars rather
    # than 1-element arrays (implicit array-to-scalar conversion is
    # deprecated in recent NumPy). str("f") is a native string on both
    # Python 2 and 3; a bytes typecode is rejected by Python 3.
    lf0 = array.array(str("f"), map(friendly_log, f0hzvalues.ravel()))
    with open(outfn, "wb") as outfh:
        lf0.tofile(outfh)