def encode(a, pcm):
    """
    Encode a speech waveform.

    The encoding framers (frames and pitch) pad the frames so that the
    first frame is centered on sample zero.  This is consistent with
    STRAIGHT and SPTK (I hope!).  At least, it means the pitch can have
    longer frame lengths and still align with the OLA'd frames.

    Besides its arguments, this reads the module-level names opt,
    framePeriod, lpOrder and r, and rebinds the module-level f.

    a:   the waveform samples; framed directly for pitch tracking and,
         when the 'Pre' parameter is set, pre-emphasised before the
         AR analysis.
    pcm: object supplying seconds_to_period(); also handed to
         ssp.ACPitch.

    Returns the tuple (ar, g, pitch, hnr): ar and g come from the
    selected ssp.AR* estimator, pitch and hnr from ssp.ACPitch.

    Raises ValueError when the 'AR' parameter names none of the
    supported estimators.
    """
    # Keep f around after the function so the decoder can do a
    # reference decoding on the real excitation.
    global f

    if opt.ola:
        frameSize = pcm.seconds_to_period(0.025, 'atleast')  # 25ms frame size
    else:
        frameSize = framePeriod
    pitchSize = pcm.seconds_to_period(0.1, 'atmost')
    # Single pre-formatted string: prints the same text under Python 2
    # and remains valid syntax under Python 3.
    print("Encoding with period %s size %s and pitch window %s"
          % (framePeriod, frameSize, pitchSize))

    # First the pitch as it's on the unaltered waveform.  The frame
    # should be long with no window.
    pf = ssp.Frame(a, size=pitchSize, period=framePeriod)
    pitch, hnr = ssp.ACPitch(pf, pcm)

    # Pre-emphasis (only when the 'Pre' parameter is set)
    pre = ssp.parameter("Pre", None)
    if pre is not None:
        a = ssp.PoleFilter(a, pre) / 5

    f = ssp.Frame(a, size=frameSize, period=framePeriod)

    # Build a frameSize+1 point Nuttall window and drop its last sample.
    aw = ssp.nuttall(frameSize+1)
    aw = np.delete(aw, -1)
    w = ssp.Window(f, aw)
    ac = ssp.Autocorrelation(w)

    # Levinson, ridge and lasso work from the autocorrelation; the
    # sparse and student estimators are given the windowed frames.
    lp = ssp.parameter('AR', 'levinson')
    if lp == 'levinson':
        ar, g = ssp.ARLevinson(ac, lpOrder[r])
    elif lp == 'ridge':
        ar, g = ssp.ARRidge(ac, lpOrder[r], 0.03)
    elif lp == 'lasso':
        ar, g = ssp.ARLasso(ac, lpOrder[r], 5)
    elif lp == 'sparse':
        ar, g = ssp.ARSparse(w, lpOrder[r], ssp.parameter('Gamma', 1.414))
    elif lp == 'student':
        ar, g = ssp.ARStudent(w, lpOrder[r], ssp.parameter('DoF', 50.0))
    else:
        # Previously an unknown method left ar and g unbound and the
        # function failed at the return with an UnboundLocalError.
        raise ValueError("Unknown AR method: %r" % (lp,))

    return (ar, g, pitch, hnr)
def setUp(self):
    """
    Generate one (short) frame of a 1 kHz sinusoid at a sampling rate
    of 16 kHz.  It doesn't matter too much what it is, just that it is
    representative of some natural signal.

    Sets self.pcm and self.seq; self.seq ends up as the sinusoid
    passed through ssp.Window with a Nuttall window.
    """
    self.pcm = ssp.PulseCodeModulation(16000)

    # Frame length in samples.
    nSamples = 64

    # Period of a 1 kHz tone, as returned by seconds_to_period().
    p = self.pcm.seconds_to_period(1.0/1000)

    # One vectorised call instead of filling the frame sample by sample;
    # the arithmetic per sample is unchanged: sin((2*pi * s) / p).
    self.seq = np.sin(2*np.pi * np.arange(nSamples) / p)

    # Build an nSamples+1 point Nuttall window and drop its last sample.
    w = ssp.nuttall(nSamples+1)
    w = np.delete(w, -1)
    self.seq = ssp.Window(self.seq, w)
def get_pitch(gen_path, basefilename):
    """
    Estimate the per-frame pitch of a 16 kHz wav file and store its
    natural log as float32 in a .lf0 file.

    The input is read from gen_path + basefilename + '.wav' and the
    output written to gen_path + basefilename + '.lf0'; the two
    arguments are joined by plain concatenation, so gen_path has to
    carry its own trailing separator.

    Reads the module-level names framePeriod, loPitch and hiPitch.

    Returns the pitch array, padded with its last value when the
    windowed autocorrelation has more frames than the pitch track.
    """
    stem = gen_path + basefilename
    Fs, samples = io_wav.read(stem + '.wav')
    # NOTE(review): assert is stripped under `python -O`; kept as-is so
    # callers still see an AssertionError for other sampling rates.
    assert Fs == 16000

    pcm = ssp.PulseCodeModulation(Fs)
    frameSize = pcm.seconds_to_period(0.025, 'atleast')  # 25 ms frames
    pitchSize = pcm.seconds_to_period(0.1, 'atmost')     # 100 ms pitch window

    # Pitch is estimated first, on the waveform before any pre-emphasis.
    # The second value returned by ACPitch is not used.
    pitchFrames = ssp.Frame(samples, size=pitchSize, period=framePeriod)
    pitch, _ = ssp.ACPitch(pitchFrames, pcm, loPitch, hiPitch)

    # Optional pre-emphasis, enabled by the 'Pre' parameter.
    preEmphasis = ssp.parameter("Pre", None)
    if preEmphasis is not None:
        samples = ssp.PoleFilter(samples, preEmphasis) / 5

    # Frame, window and autocorrelate.  Only the number of resulting
    # frames is used below, to decide how much padding pitch needs.
    frames = ssp.Frame(samples, size=frameSize, period=framePeriod)
    window = np.delete(ssp.nuttall(frameSize + 1), -1)
    frameAc = ssp.Autocorrelation(ssp.Window(frames, window))

    # Extend the pitch track by repeating its final value.
    shortfall = len(frameAc) - len(pitch)
    if shortfall > 0:
        pitch = np.hstack((pitch, np.ones(shortfall) * pitch[-1]))

    # Save log pitch as raw float32 binary.
    np.log(pitch).astype('float32').tofile(stem + '.lf0')
    return pitch
print "Need one arg" exit(1) file = arg[0] import ssp import numpy as np # Load and process pcm = ssp.PulseCodeModulation() a = pcm.WavSource(file) if (ssp.parameter('Pre', None)): a = ssp.ZeroFilter(a) framePeriod = pcm.seconds_to_period(0.01) frameSize = pcm.seconds_to_period(0.02, 'atleast') f = ssp.Frame(a, size=frameSize, period=framePeriod) w = ssp.nuttall(frameSize+1) w = np.delete(w, -1) wf = ssp.Window(f, w) type = ssp.parameter('Type', 'psd') if type == 'psd': p = ssp.Periodogram(wf) p = p[:,:p.shape[1]/2+1] elif type == 'ar': a = ssp.Autocorrelation(wf) a, g = ssp.ARLevinson(a, pcm.speech_ar_order()) p = ssp.ARSpectrum(a, g, nSpec=128) elif type == 'snr': p = ssp.Periodogram(wf) n = ssp.Noise(p) p = ssp.SNRSpectrum(p, n) p = p[:,:p.shape[1]/2+1]
# Analysis settings: 16 kHz input has its own values, every other rate
# falls back to the 8 kHz defaults.
if pcm.rate == 16000:
    frameSize, framePeriod, lpOrder = 400, 160, 12
else:
    frameSize, framePeriod, lpOrder = 256, 80, 10

# Basic preprocessing: zero filter, unpadded framing, Nuttall window.
# g starts as an empty array and is only replaced by the "ar" front end.
g = np.empty(0)
a = ssp.ZeroFilter(a)
f = ssp.Window(
    ssp.Frame(a, size=frameSize, period=framePeriod, pad=False),
    ssp.nuttall(frameSize),
)

# Next part depends on user: the "FrontEnd" parameter, default "ar".
frontend = ssp.parameter("FrontEnd", "ar")
if frontend == "ar":
    # Autocorrelation -> all-pass warp (alpha looked up by sampling
    # rate in ssp.mel) -> Levinson recursion.
    # Commented-out alternatives in earlier revisions: ARRidge / ARLasso
    # driven by a 'Ridge' parameter (default 0.1).
    a, g = ssp.ARLevinson(
        ssp.AutocorrelationAllPassWarp(
            ssp.Autocorrelation(f),
            alpha=ssp.mel[pcm.rate],
            size=lpOrder + 1
        ),
        lpOrder
    )
elif frontend == "snr":
    # Periodogram and the noise value ssp.Noise derives from it.
    a = ssp.Periodogram(f)
    n = ssp.Noise(a)
# Analysis settings: 16 kHz input has its own values, every other rate
# falls back to the 8 kHz defaults.
if pcm.rate == 16000:
    frameSize, framePeriod, lpOrder = 400, 160, 12
else:
    frameSize, framePeriod, lpOrder = 256, 80, 10

# Basic preprocessing: zero filter, unpadded framing, Nuttall window.
# g starts as an empty array and is only replaced by the "ar" front end.
g = np.empty(0)
a = ssp.ZeroFilter(a)
f = ssp.Window(
    ssp.Frame(a, size=frameSize, period=framePeriod, pad=False),
    ssp.nuttall(frameSize),
)

# Next part depends on user: the "FrontEnd" parameter, default "ar".
frontend = ssp.parameter("FrontEnd", "ar")
if frontend == "ar":
    # Autocorrelation -> all-pass warp (alpha looked up by sampling
    # rate in ssp.mel) -> Levinson recursion.
    # Commented-out alternatives in earlier revisions: ARRidge / ARLasso
    # driven by a 'Ridge' parameter (default 0.1).
    a, g = ssp.ARLevinson(
        ssp.AutocorrelationAllPassWarp(
            ssp.Autocorrelation(f),
            alpha=ssp.mel[pcm.rate],
            size=lpOrder + 1
        ),
        lpOrder
    )
elif frontend == "snr":
    # Periodogram, then an SNR spectrum against the noise value from
    # ssp.Noise scaled by 0.1.
    a = ssp.Periodogram(f)
    n = ssp.Noise(a)
    a = ssp.SNRSpectrum(a, n * 0.1)