now = time.clock() elapsed = now-ti ti = now print(func, elapsed) import ssp import numpy as np import matplotlib.pyplot as plt lap("Import") # Load and do basic AR to reconstruct the spectrum pcm = ssp.PulseCodeModulation() wav = pcm.WavSource(file) print("File:", file, "rate:", pcm.rate, "size:", wav.size) if ssp.parameter("ZF", 0) == 1: wav = ssp.ZeroFilter(wav) f = ssp.Frame(wav, size=256, period=128) f = ssp.Window(f, np.hanning(256)) print("frame:", f.shape[0], "x", f.shape[1]) lap("Frame") e = ssp.Energy(f) p = ssp.Periodogram(f) lap("Periodogram") order = pcm.speech_ar_order() a = ssp.Autocorrelation(f) a, g = ssp.ARLevinson(a, order) lap("Levinson") ls = ssp.ARSpectrum(a, g, nSpec=128) lap("Spectrum") # Now do some esoteric AR
from optparse import OptionParser op = OptionParser() (option, arg) = op.parse_args() if (len(arg) < 1): print "Need one arg" exit(1) file = arg[0] import ssp import numpy as np # Load and process pcm = ssp.PulseCodeModulation() a = pcm.WavSource(file) if (ssp.parameter('Pre', None)): a = ssp.ZeroFilter(a) framePeriod = pcm.seconds_to_period(0.01) frameSize = pcm.seconds_to_period(0.02, 'atleast') f = ssp.Frame(a, size=frameSize, period=framePeriod) w = ssp.nuttall(frameSize+1) w = np.delete(w, -1) wf = ssp.Window(f, w) type = ssp.parameter('Type', 'psd') if type == 'psd': p = ssp.Periodogram(wf) p = p[:,:p.shape[1]/2+1] elif type == 'ar': a = ssp.Autocorrelation(wf) a, g = ssp.ARLevinson(a, pcm.speech_ar_order()) p = ssp.ARSpectrum(a, g, nSpec=128) elif type == 'snr':
def _harmonic_weight(ratio):
    """Return sqrt(ratio / (ratio + 1)), the weight given to harmonics."""
    return np.sqrt(ratio / (ratio + 1))


def _noise_weight(ratio):
    """Return sqrt(1 / (ratio + 1)), the weight given to noise."""
    return np.sqrt(1.0 / (ratio + 1))


def _pitch_pulse_train(pitch, nSamples, make_pulse):
    """
    Lay pulses end to end in a zeroed buffer of nSamples samples.

    The period of each pulse is int(1.0 / pitch[frame] * r), where frame is
    the write position divided by the module-level framePeriod.
    make_pulse(period, frame) must return the `period` samples to store.
    Filling stops at the first pulse that would overrun the buffer.

    Raises ValueError if a period is shorter than one sample, because the
    write position would then never advance.
    """
    h = np.zeros(nSamples)
    pos = 0
    frame = 0
    while pos < nSamples and frame < len(pitch):
        period = int(1.0 / pitch[frame] * r)
        if period < 1:
            raise ValueError(
                "Pitch %r in frame %d gives a period of %d samples"
                % (pitch[frame], frame, period)
            )
        if pos + period > nSamples:
            break
        h[pos:pos + period] = make_pulse(period, frame)
        pos += period
        frame = pos // framePeriod
    return h


def _weight_noise_frames(fn, hnr):
    """Scale frame i of fn, in place, by the noise weight for hnr[i]."""
    for i in range(len(fn)):
        fn[i] *= _noise_weight(hnr[i])
    return fn


def _whiten_frames(e, frameSize):
    """
    Replace each frame of e, in place, by its own AR excitation.

    The AR model of order lpOrder[r] is estimated from the Hanning-windowed
    frame, but the excitation is computed from the unwindowed frame.
    """
    hnw = np.hanning(frameSize)
    for i in range(len(e)):
        windowed = ssp.Window(e[i], hnw)
        eac = ssp.Autocorrelation(windowed)
        ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
        e[i] = ssp.ARExcitation(e[i], ea, eg)
    return e


def decode(tuple):
    """
    Decode a speech waveform.

    tuple is (ark, g, pitch, hnr); the four members must have the same
    length (one entry per frame).  When opt.glottal is 'cepgm' and any of
    opt.encode, opt.decode or opt.pitch is set, the last two columns of ark
    are read as theta and log-magnitude and the leading columns as the AR
    coefficients; otherwise ark is used as the AR coefficients directly.

    The excitation is built by the method named in opt.glottal, then either
    returned directly (opt.excitation) or passed through ssp.ARResynthesis.
    The result is multiplied by the "Gain" parameter.

    Besides opt, this reads the module-level names framePeriod, r, pcm,
    lpOrder and f.
    NOTE(review): r is presumably the sample rate in Hz -- confirm.

    Raises ValueError if opt.glottal names no known method, or if a pitch
    value gives a period shorter than one sample.
    """
    (ark, g, pitch, hnr) = tuple
    print("Frame padding:", opt.padding)
    nFrames = len(ark)
    assert (len(g) == nFrames)
    assert (len(pitch) == nFrames)
    assert (len(hnr) == nFrames)

    # The original framer padded the ends so the number of samples to
    # synthesise is a bit less than you might think
    if opt.ola:
        frameSize = framePeriod * 2
        nSamples = framePeriod * (nFrames - 1)
    else:
        frameSize = framePeriod
        nSamples = frameSize * (nFrames - 1)

    ex = opt.glottal
    if ex == 'cepgm' and (opt.encode or opt.decode or opt.pitch):
        order = ark.shape[-1] - 2
        ar = ark[:, 0:order]
        theta = ark[:, -2]
        magni = np.exp(ark[:, -1])
    else:
        ar = ark

    # Use the original AR residual; it should be a very good reconstruction.
    if ex == 'ar':
        e = ssp.ARExcitation(f, ar, g)

    # Just noise.  This is effectively a whisper synthesis.
    elif ex == 'noise':
        e = np.random.normal(size=(nFrames, frameSize))

    # Just harmonics, and with a fixed F0.  This is the classic robot
    # synthesis.
    elif ex == 'robot':
        ew = np.zeros(nSamples)
        period = int(1.0 / 200 * r)
        for i in range(0, len(ew), period):
            ew[i] = period
        e = ssp.Frame(ew, size=frameSize, period=framePeriod)

    # Synthesise harmonics plus noise in the ratio suggested by the HNR.
    elif ex == 'synth':
        # Harmonic part
        mperiod = int(1.0 / np.mean(pitch) * r)
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        pr, _ = ssp.pulse_response(gm, pcm, period=mperiod,
                                   order=lpOrder[r])
        h = _pitch_pulse_train(
            pitch, nSamples,
            lambda period, frame: (gm.pulse(period, pcm)
                                   * _harmonic_weight(hnr[frame]))
        )
        h = ssp.ARExcitation(h, pr, 1.0)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod,
                       pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        n = ssp.ZeroFilter(n, 1.0)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        _weight_noise_frames(fn, hnr)
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain

    # Like harmonics plus noise, but with explicit sinusoids instead of time
    # domain impulses.
    elif ex == 'sine':
        order = 20
        sine = ssp.Harmonics(r, order)
        h = np.zeros(nSamples)
        for start in range(0, len(h) - framePeriod, framePeriod):
            frame = start // framePeriod
            h[start:start + framePeriod] = (
                sine.sample(pitch[frame], framePeriod)
                * _harmonic_weight(hnr[frame])
            )
        fh = ssp.Frame(h, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        _weight_noise_frames(fn, hnr)
        e = fn + fh * 10

    # High order linear prediction.  Synthesise the harmonics using noise to
    # excite a high order polynomial with roots resembling harmonics.
    elif ex == 'holp':
        # Some noise
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)

        # Use the noise to excite a high order AR model
        fh = np.ndarray(fn.shape)
        for i in range(len(fn)):
            hoar = ssp.ARHarmonicPoly(pitch[i], r, 0.7)
            fh[i] = ssp.ARResynthesis(fn[i], hoar,
                                      1.0 / linalg.norm(hoar)**2)
            print(i, pitch[i], linalg.norm(hoar),
                  np.min(fh[i]), np.max(fh[i]))
            print(' ', np.min(hoar), np.max(hoar))

        # Weight the noise as for the other methods.  The weighted noise is
        # not mixed into the result: only the harmonic part is used.
        _weight_noise_frames(fn, hnr)
        e = fh

    # Shaped excitation.  The pulses are shaped by a filter to have a
    # rolloff, then added to the noise.  The resulting signal is
    # flattened using AR.
    elif ex == 'shaped':
        # Harmonic part
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        gm.angle = pcm.hertz_to_radians(np.mean(pitch) * 0.5)
        h = _pitch_pulse_train(
            pitch, nSamples,
            lambda period, frame: (gm.pulse(period, pcm)
                                   * _harmonic_weight(hnr[frame]))
        )

        # Filter to mimic the glottal pulse
        hfilt = ssp.parameter("HFilt", None)
        hpole1 = ssp.parameter("HPole1", 0.98)
        # NOTE(review): HPole2 is read but no filter in this branch uses
        # it; the read is kept in case ssp.parameter has side effects.
        hpole2 = ssp.parameter("HPole2", 0.8)
        angle = (pcm.hertz_to_radians(np.mean(pitch))
                 * ssp.parameter("Angle", 1.0))
        if hfilt == 'pp':
            h = ssp.ZeroFilter(h, 1.0)
            h = ssp.PolePairFilter(h, hpole1, angle)
        # Frame with the same padding as the noise below; the two frame
        # counts are asserted equal before they are mixed.
        fh = ssp.Frame(h, size=frameSize, period=framePeriod,
                       pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        npole = ssp.parameter("NPole", None)
        nf = ssp.parameter("NoiseFreq", 4000)
        if npole is not None:
            n = ssp.PolePairFilter(n, npole, pcm.hertz_to_radians(nf))
        fn = ssp.Frame(n, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        _weight_noise_frames(fn, hnr)

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = _whiten_frames(fn + fh * hgain, frameSize)

    elif ex == 'ceplf':
        omega, alpha = ssp.glottal_pole_lf(
            f, pcm, pitch, hnr, visual=(opt.graphic == "ceplf"))
        epsilon = ssp.parameter("Epsilon", 5000.0)

        def lf_pulse(period, frame):
            # The pulse stays all zero when lf_te returns a false value.
            weight = _harmonic_weight(hnr[frame])
            pu = np.zeros((period))
            T0 = pcm.period_to_seconds(period)
            print(T0)
            Te = ssp.lf_te(T0, alpha[frame], omega[frame], epsilon)
            if Te:
                pu = ssp.pulse_lf(pu, T0, Te, alpha[frame], omega[frame],
                                  epsilon)
            return pu * weight

        h = _pitch_pulse_train(pitch, nSamples, lf_pulse)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod,
                       pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        _weight_noise_frames(fn, hnr)

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = _whiten_frames(fn + fh * hgain, frameSize)

    elif ex == 'cepgm':
        # Infer the unstable poles via complex cepstrum, then build an
        # explicit glottal model.
        if not (opt.encode or opt.decode or opt.pitch):
            theta, magni = ssp.glottal_pole_gm(
                f, pcm, pitch, hnr, visual=(opt.graphic == "cepgm"))

        def unit_impulse(period, frame):
            pulse = np.zeros(period)
            pulse[0] = 1
            return pulse

        h = _pitch_pulse_train(pitch, nSamples, unit_impulse)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        gl = ssp.MinPhaseGlottis()
        for i in range(len(fh)):
            # This is minimum phase; the glotter will invert if required.
            # Index the pole pair by the frame being filtered (i), like
            # hnr below, not by a counter left over from the pulse loop.
            gl.setpolepair(np.abs(magni[i]), theta[i])
            fh[i] = gl.glotter(fh[i])
            if linalg.norm(fh[i]) > 1e-6:
                fh[i] *= np.sqrt(len(fh[i])) / linalg.norm(fh[i])
            fh[i] *= _harmonic_weight(hnr[i])

        if (opt.graphic == "h"):
            fig = ssp.Figure(1, 1)
            hPlot = fig.subplot()
            hPlot.plot(h, 'r')
            fig.show()

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod,
                       pad=opt.padding)
        _weight_noise_frames(fn, hnr)

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = _whiten_frames(fn + fh * hgain, frameSize)

    else:
        # Fail explicitly: without an excitation there is nothing to
        # resynthesise below.
        raise ValueError("Unknown synthesis method: %r" % (ex,))

    if opt.excitation:
        s = e.flatten('C') / frameSize
    else:
        s = ssp.ARResynthesis(e, ar, g)
        if opt.ola:
            # Asymmetric window for OLA
            sw = np.hanning(frameSize + 1)
            sw = np.delete(sw, -1)
            s = ssp.Window(s, sw)
            s = ssp.OverlapAdd(s)
        else:
            s = s.flatten('C')

    gain = ssp.parameter("Gain", 1.0)
    return s * gain
def _harmonic_weight(ratio):
    """Return sqrt(ratio / (ratio + 1)), the weight given to harmonics."""
    return np.sqrt(ratio / (ratio + 1))


def _noise_weight(ratio):
    """Return sqrt(1 / (ratio + 1)), the weight given to noise."""
    return np.sqrt(1.0 / (ratio + 1))


def _pitch_pulse_train(pitch, nSamples, make_pulse):
    """
    Lay pulses end to end in a zeroed buffer of nSamples samples.

    The period of each pulse is int(1.0 / pitch[frame] * r), where frame is
    the write position divided by the module-level framePeriod.
    make_pulse(period, frame) must return the `period` samples to store.
    Filling stops at the first pulse that would overrun the buffer.

    Raises ValueError if a period is shorter than one sample, because the
    write position would then never advance.
    """
    h = np.zeros(nSamples)
    pos = 0
    frame = 0
    while pos < nSamples and frame < len(pitch):
        period = int(1.0 / pitch[frame] * r)
        if period < 1:
            raise ValueError(
                "Pitch %r in frame %d gives a period of %d samples"
                % (pitch[frame], frame, period)
            )
        if pos + period > nSamples:
            break
        h[pos:pos + period] = make_pulse(period, frame)
        pos += period
        frame = pos // framePeriod
    return h


def _weight_noise_frames(fn, hnr):
    """Scale frame i of fn, in place, by the noise weight for hnr[i]."""
    for i in range(len(fn)):
        fn[i] *= _noise_weight(hnr[i])
    return fn


def _whiten_frames(e, frameSize):
    """
    Replace each frame of e, in place, by its own AR excitation.

    The AR model of order lpOrder[r] is estimated from the Hanning-windowed
    frame, but the excitation is computed from the unwindowed frame.
    """
    hnw = np.hanning(frameSize)
    for i in range(len(e)):
        windowed = ssp.Window(e[i], hnw)
        eac = ssp.Autocorrelation(windowed)
        ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
        e[i] = ssp.ARExcitation(e[i], ea, eg)
    return e


def decode(tuple):
    """
    Decode a speech waveform.

    tuple is (ar, g, pitch, hnr); the four members must have the same
    length (one entry per frame).  It is taken as a single positional
    argument and unpacked here, which is valid on both Python 2 and 3.

    The excitation is built by the method named in the "Excitation"
    parameter (default 'synth'), then either returned directly
    (opt.excitation) or passed through ssp.ARResynthesis.  The result is
    multiplied by the "Gain" parameter.

    Besides opt, this reads the module-level names framePeriod, r, pcm,
    lpOrder and f.
    NOTE(review): r is presumably the sample rate in Hz -- confirm.

    Raises ValueError if the excitation method is unknown, or if a pitch
    value gives a period shorter than one sample.
    """
    (ar, g, pitch, hnr) = tuple
    nFrames = len(ar)
    assert (len(g) == nFrames)
    assert (len(pitch) == nFrames)
    assert (len(hnr) == nFrames)

    # The original framer padded the ends so the number of samples to
    # synthesise is a bit less than you might think
    if opt.ola:
        frameSize = framePeriod * 2
        nSamples = framePeriod * (nFrames - 1)
    else:
        frameSize = framePeriod
        nSamples = frameSize * (nFrames - 1)

    ex = ssp.parameter('Excitation', 'synth')

    # Use the original AR residual; it should be a very good
    # reconstruction.
    if ex == 'ar':
        e = ssp.ARExcitation(f, ar, g)

    # Just noise.  This is effectively a whisper synthesis.
    elif ex == 'noise':
        e = np.random.normal(size=f.shape)

    # Just harmonics, and with a fixed F0.  This is the classic robot
    # synthesis.
    elif ex == 'robot':
        ew = np.zeros(nSamples)
        period = int(1.0 / 200 * r)
        for i in range(0, len(ew), period):
            ew[i] = period
        e = ssp.Frame(ew, size=frameSize, period=framePeriod)

    # Synthesise harmonics plus noise in the ratio suggested by the
    # HNR.
    elif ex == 'synth':
        # Harmonic part
        mperiod = int(1.0 / np.mean(pitch) * r)
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        pr, _ = ssp.pulse_response(gm, pcm, period=mperiod,
                                   order=lpOrder[r])
        h = _pitch_pulse_train(
            pitch, nSamples,
            lambda period, frame: (gm.pulse(period, pcm)
                                   * _harmonic_weight(hnr[frame]))
        )
        h = ssp.ARExcitation(h, pr, 1.0)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        n = ssp.ZeroFilter(n, 1.0)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        _weight_noise_frames(fn, hnr)
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain

    # Like harmonics plus noise, but with explicit sinusoids instead
    # of time domain impulses.
    elif ex == 'sine':
        order = 20
        sine = ssp.Harmonics(r, order)
        h = np.zeros(nSamples)
        for start in range(0, len(h) - framePeriod, framePeriod):
            frame = start // framePeriod
            h[start:start + framePeriod] = (
                sine.sample(pitch[frame], framePeriod)
                * _harmonic_weight(hnr[frame])
            )
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        _weight_noise_frames(fn, hnr)
        e = fn + fh * 10

    # High order linear prediction.  Synthesise the harmonics using
    # noise to excite a high order polynomial with roots resembling
    # harmonics.
    elif ex == 'holp':
        # Some noise
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)

        # Use the noise to excite a high order AR model
        fh = np.ndarray(fn.shape)
        for i in range(len(fn)):
            hoar = ssp.ARHarmonicPoly(pitch[i], r, 0.7)
            fh[i] = ssp.ARResynthesis(fn[i], hoar,
                                      1.0 / linalg.norm(hoar)**2)
            # %-formatting gives the same space-separated output whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s %s %s %s %s" % (i, pitch[i], linalg.norm(hoar),
                                      np.min(fh[i]), np.max(fh[i])))
            print("%s %s %s" % (' ', np.min(hoar), np.max(hoar)))

        # Weight the noise as for the other methods.  The weighted noise is
        # not mixed into the result: only the harmonic part is used.
        _weight_noise_frames(fn, hnr)
        e = fh

    # Shaped excitation.  The pulses are shaped by a filter to have a
    # rolloff, then added to the noise.  The resulting signal is
    # flattened using AR.
    elif ex == 'shaped':
        # Harmonic part
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        gm.angle = pcm.hertz_to_radians(np.mean(pitch) * 0.5)
        h = _pitch_pulse_train(
            pitch, nSamples,
            lambda period, frame: (gm.pulse(period, pcm)
                                   * _harmonic_weight(hnr[frame]))
        )

        # Filter to mimic the glottal pulse
        hfilt = ssp.parameter("HFilt", None)
        hpole1 = ssp.parameter("HPole1", 0.98)
        hpole2 = ssp.parameter("HPole2", 0.8)
        angle = (pcm.hertz_to_radians(np.mean(pitch))
                 * ssp.parameter("Angle", 1.0))
        if hfilt == 'pp':
            h = ssp.ZeroFilter(h, 1.0)
            h = ssp.PolePairFilter(h, hpole1, angle)
        elif hfilt == 'g':
            h = ssp.GFilter(h, hpole1, angle, hpole2)
        elif hfilt == 'p':
            h = ssp.PFilter(h, hpole1, angle, hpole2)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        npole = ssp.parameter("NPole", None)
        nf = ssp.parameter("NoiseFreq", 4000)
        if npole is not None:
            n = ssp.PolePairFilter(n, npole, pcm.hertz_to_radians(nf))
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        _weight_noise_frames(fn, hnr)

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = _whiten_frames(fn + fh * hgain, frameSize)

    else:
        # Fail explicitly: without an excitation there is nothing to
        # resynthesise below.
        raise ValueError("Unknown synthesis method: %r" % (ex,))

    if opt.excitation:
        s = e.flatten('C') / frameSize
    else:
        s = ssp.ARResynthesis(e, ar, g)
        if opt.ola:
            # Asymmetric window for OLA
            sw = np.hanning(frameSize + 1)
            sw = np.delete(sw, -1)
            s = ssp.Window(s, sw)
            s = ssp.OverlapAdd(s)
        else:
            s = s.flatten('C')

    gain = ssp.parameter("Gain", 1.0)
    return s * gain