def getf0_python(wav, fs, frame_len, frame_step):
    """Estimate per-frame fundamental frequency (F0, in Hz) of *wav*.

    Classic center-clipped autocorrelation pitch tracker: low-pass the
    signal at 900 Hz, clip each analysis frame, and pick the strongest
    autocorrelation peak in the lag range [50, 150) samples.

    Parameters:
        wav        -- 1-D sample array; presumably 16-bit-PCM-scaled
                      (the /30000 normalization suggests so).
        fs         -- sample rate in Hz.
        frame_len  -- analysis window length in seconds.
        frame_step -- hop between windows in seconds.

    Returns a 1-D float array with one F0 value per frame; 0.0 marks an
    unvoiced (or silent) frame.

    Fixes vs. the previous revision: the unvoiced test used to compare
    the peak against 0.2 AFTER normalizing the search window to [0, 1],
    so the comparison was always against exactly 1.0 and the unvoiced
    branch never fired; silent frames divided by zero and produced NaN
    garbage. The threshold is now applied to the lag-0-normalized
    autocorrelation at the peak, and an all-zero frame yields 0.0.
    Dead code (an unused framesig call, a duplicated dtype conversion,
    an unused energy/speech flag, an unused correlogram) was removed.
    """
    def lowpassfilter(x, cutoff, gain):
        # 31-tap FIR low-pass normalized to unit DC gain, applied
        # forward-backward (zero phase) with filtfilt.
        h = scipy.signal.firwin(31, cutoff)
        h /= h.sum()
        return gain * scipy.signal.filtfilt(h, np.array([1.0]), x)

    frame_len = int(frame_len * fs)
    frame_step = int(frame_step * fs)

    # Normalize to roughly [-1, 1], then low-pass at 900 Hz so formant
    # structure does not bias the autocorrelation peak.
    wav = wav.astype(np.float64) / 30000.0
    wav = lowpassfilter(wav, 2.0 * 900.0 / fs, 1.0)

    wsize = frame_len
    wrate = frame_step
    pit = np.zeros(0)
    st, en = 0, wsize
    while en <= wav.shape[0]:
        frm = wav[st:en].copy()

        # Center clipping: zero out small samples, hard-limit the rest
        # to +/-1. This whitens the spectrum and sharpens pitch peaks.
        C = frm.max() * 0.4
        frm[np.abs(frm) < C] = 0.0
        frm[frm > C] = 1.0
        frm[frm < -C] = -1.0

        # Autocorrelation, non-negative lags only.
        r = np.correlate(frm, frm, 'same')[wsize // 2:]

        cur_pit = 0.0
        if r[0] > 0:  # all-zero (silent) frame stays unvoiced
            r = r / r[0]
            seg = r[50:150]  # candidate lags 50..149 -> fs/150..fs/50 Hz
            if seg.size:
                peak = int(np.argmax(seg)) + 50
                # Voiced only when the normalized autocorrelation at the
                # peak is strong enough.
                if r[peak] >= 0.2:
                    cur_pit = fs / peak
        pit = np.r_[pit, cur_pit]
        st += wrate
        en += wrate
    return pit
def encode(self, wav, frame_len=0.025, frame_step=0.010):
    """Split *wav* into Hamming-windowed frames and encode each one.

    Stores on self: frame_len / frame_step (in samples), the per-frame
    energy of the pre-emphasized signal, the windowed frame matrix, and
    the per-frame parameter matrix produced by self._encode_frame.

    Parameters:
        wav        -- 1-D input sample array.
        frame_len  -- window length in seconds (default 25 ms).
        frame_step -- hop between windows in seconds (default 10 ms).
    """
    window = lambda n: np.hamming(n).reshape((1, n))
    self.frame_len = int(frame_len * self.fs)
    self.frame_step = int(frame_step * self.fs)
    # Pre-emphasis before analysis (coeff 0.97).
    emphasized = frame.preemphasis(wav, coeff=0.97)
    self.energy = frame.get_energy(emphasized, self.frame_len,
                                   self.frame_step, winfunc=window)
    self.frames = frame.framesig(emphasized, self.frame_len,
                                 self.frame_step, winfunc=window)
    n_frames = self.frames.shape[0]
    # Encode the first frame once just to learn the parameter width.
    width = len(self._encode_frame(self.frames[0]))
    self.params = np.zeros((n_frames, width))
    for idx in range(n_frames):
        self.params[idx, :] = self._encode_frame(self.frames[idx])
def decode(self, src_signal):
    """Resynthesize a waveform from the stored params using *src_signal*
    as the excitation.

    The excitation is first rescaled (sample-interpolated gain) so its
    per-frame energy matches the energy recorded at encode time, then
    decoded frame by frame via self._decode_frame and overlap-added.

    NOTE: *src_signal* is scaled in place — the caller's array is
    mutated as a side effect.
    """
    window = lambda n: np.hamming(n).reshape((1, n))
    # Per-frame gain matching the encode-time energy, interpolated up
    # to one gain value per sample.
    src_energy = frame.get_energy(src_signal, self.frame_len, self.frame_step)
    gain = self.energy / src_energy
    sample_axis = np.linspace(0, 1, src_signal.shape[0])
    frame_axis = np.linspace(0, 1, gain.shape[0])
    gain_interp = np.interp(sample_axis, frame_axis, gain)
    src_signal *= gain_interp  # in-place on the caller's array
    src_frames = frame.framesig(src_signal, self.frame_len, self.frame_step)
    for i in range(self.frames.shape[0]):
        # Decode from the previous + current excitation frames and keep
        # only the current-frame half of the output.
        prev = src_frames[max(0, i - 1)]
        excitation = np.r_[prev, src_frames[i]]
        self.frames[i, :] = self._decode_frame(self.params[i, :],
                                               excitation)[self.frame_len:]
    wav = frame.deframesig(self.frames, src_signal.shape[0], self.frame_len,
                           self.frame_step, winfunc=window)
    return frame.deemphasis(wav, coeff=0.97)
def encode(self, wav, frame_len=0.025, frame_step=0.010):
    """Analyze *wav* and cache the per-frame encoder parameters on self.

    Side effects: sets self.frame_len / self.frame_step (in samples),
    self.energy (per-frame energy of the pre-emphasized signal),
    self.frames (Hamming-windowed frame matrix), and self.params (one
    row of _encode_frame output per frame).
    """
    def hamming2d(n):
        # framesig/get_energy expect a (1, n)-shaped window.
        return np.hamming(n).reshape((1, n))

    self.frame_len = int(frame_len * self.fs)
    self.frame_step = int(frame_step * self.fs)
    wav = frame.preemphasis(wav, coeff=0.97)
    self.energy = frame.get_energy(wav, self.frame_len, self.frame_step,
                                   winfunc=hamming2d)
    self.frames = frame.framesig(wav, self.frame_len, self.frame_step,
                                 winfunc=hamming2d)
    # Probe the first frame to size the parameter matrix.
    param_size = len(self._encode_frame(self.frames[0]))
    self.params = np.zeros((self.frames.shape[0], param_size))
    for i, frm in enumerate(self.frames):
        self.params[i, :] = self._encode_frame(frm)
def decode(self, src_signal):
    """Synthesize a waveform from the stored parameters, driven by
    *src_signal* as excitation.

    Steps: energy-match the excitation to the analyzed signal (the gain
    is interpolated to per-sample resolution and applied IN PLACE, so
    the caller's array is modified), frame it, decode each frame from
    the previous+current excitation pair, then overlap-add and undo the
    pre-emphasis.
    """
    def window(n):
        return np.hamming(n).reshape((1, n))

    # Per-frame gain so the excitation carries the encode-time energy.
    src_energy = frame.get_energy(src_signal, self.frame_len, self.frame_step)
    gain = self.energy / src_energy
    gain_interp = np.interp(np.linspace(0, 1, src_signal.shape[0]),
                            np.linspace(0, 1, gain.shape[0]),
                            gain)
    src_signal *= gain_interp  # NOTE: mutates the caller's array
    src_frames = frame.framesig(src_signal, self.frame_len, self.frame_step)
    for i in range(self.frames.shape[0]):
        # Feed previous + current excitation frames; keep the tail
        # (current-frame portion) of the decoded output.
        two_frames = np.r_[src_frames[max(0, i - 1)], src_frames[i]]
        decoded = self._decode_frame(self.params[i, :], two_frames)
        self.frames[i, :] = decoded[self.frame_len:]
    wav = frame.deframesig(self.frames, src_signal.shape[0],
                           self.frame_len, self.frame_step, winfunc=window)
    wav = frame.deemphasis(wav, coeff=0.97)
    return wav
def getf0_python(wav, fs, frame_len, frame_step):
    """Track the per-frame fundamental frequency (F0, Hz) of *wav*.

    Pipeline: normalize, low-pass at 900 Hz, then for each window apply
    center clipping and pick the strongest autocorrelation peak among
    lags 50..149 samples. Unvoiced/silent frames yield 0.0.

    Parameters:
        wav        -- 1-D sample array (presumably 16-bit PCM scale —
                      the /30000 normalization suggests so).
        fs         -- sample rate in Hz.
        frame_len  -- window length in seconds.
        frame_step -- hop in seconds.

    Returns a 1-D array with one F0 estimate per frame.
    """
    def lowpassfilter(x, cutoff, gain):
        # 31-tap FIR low-pass with unit DC gain, run forward-backward
        # (zero-phase) via filtfilt.
        h = scipy.signal.firwin(31, cutoff)
        h /= h.sum()
        return gain * scipy.signal.filtfilt(h, np.array([1.0]), x)

    win_len = int(frame_len * fs)
    hop = int(frame_step * fs)

    # Scale to ~[-1, 1] and suppress energy above 900 Hz so formants do
    # not bias the autocorrelation.
    x = wav.astype(np.float64) / 30000.0
    x = lowpassfilter(x, 2.0 * 900.0 / fs, 1.0)

    f0_track = np.zeros(0)
    start = 0
    while start + win_len <= x.shape[0]:
        frm = x[start:start + win_len].copy()

        # Center clipping: small samples -> 0, the rest hard-limited to
        # +/-1; this sharpens the pitch peak in the autocorrelation.
        clip = frm.max() * 0.4
        frm[np.abs(frm) < clip] = 0.0
        frm[frm > clip] = 1.0
        frm[frm < -clip] = -1.0

        ac = np.correlate(frm, frm, 'same')[win_len // 2:]

        f0 = 0.0
        if ac[0] > 0:  # an all-zero frame is unvoiced by definition
            ac = ac / ac[0]
            search = ac[50:150]  # lag range 50..149 -> fs/150..fs/50 Hz
            if search.size:
                lag = int(np.argmax(search)) + 50
                # BUG FIX: the previous revision rescaled the search
                # window to [0, 1] and then compared its max against
                # 0.2 — always exactly 1.0, so the unvoiced branch was
                # unreachable (and silent frames divided by zero).
                # Threshold the lag-0-normalized autocorrelation at the
                # peak instead.
                if ac[lag] >= 0.2:
                    f0 = fs / lag
        f0_track = np.r_[f0_track, f0]
        start += hop
    return f0_track