Beispiel #1
0
def getf0_python(wav, fs, frame_len, frame_step,
                 min_lag=50, max_lag=150, voicing_threshold=0.2):
    """Estimate one pitch value per frame by center-clipped autocorrelation.

    The signal is low-pass filtered at 900 Hz, cut into overlapping frames,
    each frame is three-level center clipped, and the strongest
    autocorrelation peak inside the lag window ``[min_lag, max_lag)`` is
    converted to a frequency ``fs / lag``.

    Args:
        wav: 1-D array-like of samples. It is copied, never modified.
        fs: Sampling rate in Hz.
        frame_len: Frame length in seconds (``int(frame_len * fs)`` samples).
        frame_step: Hop between frames in seconds (``int(frame_step * fs)``
            samples).
        min_lag: First autocorrelation lag (in samples) searched for a peak.
        max_lag: One past the last lag (in samples) searched for a peak.
        voicing_threshold: Minimum peak height, relative to the zero-lag
            autocorrelation, for a frame to receive a non-zero pitch.

    Returns:
        float64 array with one entry per complete frame: ``fs / lag`` of the
        strongest peak, or 0 when the frame has no peak reaching
        ``voicing_threshold`` (including frames that clip to all zeros).

    Raises:
        ValueError: if the frame length or hop is shorter than one sample,
            if the lag window is invalid, or if the frames are too short to
            contain ``min_lag``.
    """
    def lowpassfilter(x, cutoff, gain):
        # Zero-phase 31-tap FIR low-pass, normalised to unity DC gain.
        h = scipy.signal.firwin(31, cutoff)
        h /= h.sum()
        return gain * scipy.signal.filtfilt(h, np.array([1.0]), x)

    wsize = int(frame_len * fs)
    wrate = int(frame_step * fs)
    if wsize <= 0 or wrate <= 0:
        # A zero hop would never advance through the signal.
        raise ValueError("frame_len and frame_step must each span at least "
                         "one sample at the given fs")
    if not 0 < min_lag < max_lag:
        raise ValueError("lag window must satisfy 0 < min_lag < max_lag")

    # Work on a scaled float64 copy so the caller's array is left untouched.
    # NOTE(review): 1/30000 presumably normalises 16-bit PCM; the clipping
    # level below is relative to each frame's peak, so the scale itself does
    # not influence the result.
    wav = np.asarray(wav, dtype=np.float64) / 30000.0
    # lowpass at 900Hz (cutoff expressed as a fraction of Nyquist)
    wav = lowpassfilter(wav, 2.0 * 900.0 / fs, 1.0)

    n_samples = wav.shape[0]
    n_frames = (n_samples - wsize) // wrate + 1 if n_samples >= wsize else 0
    # np.correlate(..., 'same')[wsize // 2:] holds lags 0 .. n_lags - 1.
    n_lags = wsize - wsize // 2
    if n_frames > 0 and n_lags <= min_lag:
        raise ValueError("frames are too short to search lag %d" % min_lag)

    pit = np.zeros(n_frames)
    for idx in range(n_frames):
        st = idx * wrate
        frm = wav[st:st + wsize].copy()

        # Three-level center clipping at 40% of the frame maximum.
        C = frm.max() * 0.4
        frm[np.abs(frm) < C] = 0.0
        frm[frm > C] = 1.0
        frm[frm < -C] = -1.0

        # Autocorrelation for non-negative lags; r[0] is the zero-lag energy.
        r = np.correlate(frm, frm, 'same')[wsize // 2:]
        if r[0] <= 0:
            # Frame clipped to all zeros: no periodicity to measure.
            continue

        # Threshold the peak relative to r[0] *before* any rescaling: a
        # min-max rescaled window always peaks at 1.0, which would turn the
        # voicing test into a no-op.
        r_limit = r[min_lag:max_lag] / r[0]
        peak = np.argmax(r_limit)
        if r_limit[peak] >= voicing_threshold:
            pit[idx] = fs / (peak + min_lag)
    return pit
Beispiel #2
0
 def encode(self, wav, frame_len=0.025, frame_step=0.010):
     """Frame ``wav`` and encode every frame into a row of ``self.params``.

     ``frame_len`` and ``frame_step`` are multiplied by ``self.fs`` and
     truncated to whole samples (presumably they are given in seconds).
     The pre-emphasised signal (coefficient 0.97) is used for both the
     per-frame energy and the Hamming-windowed frames.

     Sets ``self.frame_len``, ``self.frame_step``, ``self.energy``,
     ``self.frames`` and ``self.params``; returns nothing.
     """
     def hamming_row(n):
         # Hamming window shaped (1, n), as handed to the ``frame`` helpers.
         return np.hamming(n).reshape((1, n))

     # Frame geometry in samples.
     self.frame_len = int(frame_len * self.fs)
     self.frame_step = int(frame_step * self.fs)

     emphasized = frame.preemphasis(wav, coeff=0.97)
     self.energy = frame.get_energy(
         emphasized, self.frame_len, self.frame_step, winfunc=hamming_row)
     self.frames = frame.framesig(
         emphasized, self.frame_len, self.frame_step, winfunc=hamming_row)

     # Encode the first frame once just to learn the parameter vector size.
     n_frames = self.frames.shape[0]
     n_params = len(self._encode_frame(self.frames[0]))
     self.params = np.zeros((n_frames, n_params))
     for row, current in enumerate(self.frames):
         self.params[row, :] = self._encode_frame(current)
Beispiel #3
0
 def decode(self, src_signal):
     """Resynthesise a waveform from ``self.params`` driven by ``src_signal``.

     Reads ``self.energy``, ``self.frame_len``, ``self.frame_step``,
     ``self.frames`` and ``self.params``, which must already be set (e.g. by
     a prior ``encode`` call). ``self.frames`` is overwritten in place with
     the decoded frames.

     Args:
         src_signal: 1-D NumPy array used as the source signal. It is not
             modified; integer dtypes are accepted.

     Returns:
         The overlap-added, de-emphasised (coefficient 0.97) waveform
         produced by ``frame.deframesig`` for ``src_signal.shape[0]``
         samples.

     NOTE(review): assumes ``src_signal`` yields at least as many frames as
     ``self.frames`` and that its per-frame energy is non-zero - confirm
     against callers.
     """
     winfunc = lambda x: np.hamming(x).reshape((1, x))
     # Per-frame gain that matches the source energy to the encoded energy,
     # linearly interpolated to one gain value per sample.
     src_energy = frame.get_energy(src_signal, self.frame_len, self.frame_step)
     gain = self.energy / src_energy
     gain_interp = np.interp(np.linspace(0, 1, src_signal.shape[0]),
                             np.linspace(0, 1, gain.shape[0]), gain)
     # Scale into a new array: an in-place ``*=`` would silently modify the
     # caller's buffer and raises a casting error for integer input.
     src_signal = src_signal * gain_interp
     src_frames = frame.framesig(src_signal, self.frame_len, self.frame_step)

     for i in range(self.frames.shape[0]):
         # Feed the previous and current source frame back to back (the
         # first frame is paired with itself) and keep the decoder output
         # from index ``frame_len`` onwards.
         excitation = np.r_[src_frames[max(0, i - 1)], src_frames[i]]
         decoded = self._decode_frame(self.params[i, :], excitation)
         self.frames[i, :] = decoded[self.frame_len:]
     wav = frame.deframesig(self.frames, src_signal.shape[0],
                            self.frame_len, self.frame_step, winfunc=winfunc)
     wav = frame.deemphasis(wav, coeff=0.97)
     return wav
Beispiel #4
0
    def encode(self, wav, frame_len=0.025, frame_step=0.010):
        """Frame ``wav`` and encode every frame into a row of ``self.params``.

        ``frame_len`` and ``frame_step`` are multiplied by ``self.fs`` and
        truncated to whole samples (presumably they are given in seconds).
        The pre-emphasised signal (coefficient 0.97) is used for both the
        per-frame energy and the Hamming-windowed frames.

        Sets ``self.frame_len``, ``self.frame_step``, ``self.energy``,
        ``self.frames`` and ``self.params``; returns nothing.
        """
        def hamming_row(n):
            # Hamming window shaped (1, n), as handed to the ``frame`` helpers.
            return np.hamming(n).reshape((1, n))

        # Frame geometry in samples.
        self.frame_len = int(frame_len * self.fs)
        self.frame_step = int(frame_step * self.fs)

        emphasized = frame.preemphasis(wav, coeff=0.97)
        self.energy = frame.get_energy(
            emphasized, self.frame_len, self.frame_step, winfunc=hamming_row)
        self.frames = frame.framesig(
            emphasized, self.frame_len, self.frame_step, winfunc=hamming_row)

        # Encode the first frame once just to learn the parameter vector size.
        n_frames = self.frames.shape[0]
        n_params = len(self._encode_frame(self.frames[0]))
        self.params = np.zeros((n_frames, n_params))
        for row, current in enumerate(self.frames):
            self.params[row, :] = self._encode_frame(current)
Beispiel #5
0
    def decode(self, src_signal):
        """Resynthesise a waveform from ``self.params`` driven by ``src_signal``.

        Reads ``self.energy``, ``self.frame_len``, ``self.frame_step``,
        ``self.frames`` and ``self.params``, which must already be set (e.g.
        by a prior ``encode`` call). ``self.frames`` is overwritten in place
        with the decoded frames.

        Args:
            src_signal: 1-D NumPy array used as the source signal. It is not
                modified; integer dtypes are accepted.

        Returns:
            The overlap-added, de-emphasised (coefficient 0.97) waveform
            produced by ``frame.deframesig`` for ``src_signal.shape[0]``
            samples.

        NOTE(review): assumes ``src_signal`` yields at least as many frames
        as ``self.frames`` and that its per-frame energy is non-zero -
        confirm against callers.
        """
        winfunc = lambda x: np.hamming(x).reshape((1, x))
        # Per-frame gain that matches the source energy to the encoded
        # energy, linearly interpolated to one gain value per sample.
        src_energy = frame.get_energy(src_signal, self.frame_len,
                                      self.frame_step)
        gain = self.energy / src_energy
        gain_interp = np.interp(np.linspace(0, 1, src_signal.shape[0]),
                                np.linspace(0, 1, gain.shape[0]), gain)
        # Scale into a new array: an in-place ``*=`` would silently modify
        # the caller's buffer and raises a casting error for integer input.
        src_signal = src_signal * gain_interp
        src_frames = frame.framesig(src_signal, self.frame_len,
                                    self.frame_step)

        for i in range(self.frames.shape[0]):
            # Feed the previous and current source frame back to back (the
            # first frame is paired with itself) and keep the decoder output
            # from index ``frame_len`` onwards.
            excitation = np.r_[src_frames[max(0, i - 1)], src_frames[i]]
            decoded = self._decode_frame(self.params[i, :], excitation)
            self.frames[i, :] = decoded[self.frame_len:]
        wav = frame.deframesig(self.frames,
                               src_signal.shape[0],
                               self.frame_len,
                               self.frame_step,
                               winfunc=winfunc)
        wav = frame.deemphasis(wav, coeff=0.97)
        return wav
Beispiel #6
0
def getf0_python(wav, fs, frame_len, frame_step,
                 min_lag=50, max_lag=150, voicing_threshold=0.2):
    """Estimate one pitch value per frame by center-clipped autocorrelation.

    The signal is low-pass filtered at 900 Hz, cut into overlapping frames,
    each frame is three-level center clipped, and the strongest
    autocorrelation peak inside the lag window ``[min_lag, max_lag)`` is
    converted to a frequency ``fs / lag``.

    Args:
        wav: 1-D array-like of samples. It is copied, never modified.
        fs: Sampling rate in Hz.
        frame_len: Frame length in seconds (``int(frame_len * fs)`` samples).
        frame_step: Hop between frames in seconds (``int(frame_step * fs)``
            samples).
        min_lag: First autocorrelation lag (in samples) searched for a peak.
        max_lag: One past the last lag (in samples) searched for a peak.
        voicing_threshold: Minimum peak height, relative to the zero-lag
            autocorrelation, for a frame to receive a non-zero pitch.

    Returns:
        float64 array with one entry per complete frame: ``fs / lag`` of the
        strongest peak, or 0 when the frame has no peak reaching
        ``voicing_threshold`` (including frames that clip to all zeros).

    Raises:
        ValueError: if the frame length or hop is shorter than one sample,
            if the lag window is invalid, or if the frames are too short to
            contain ``min_lag``.
    """
    def lowpassfilter(x, cutoff, gain):
        # Zero-phase 31-tap FIR low-pass, normalised to unity DC gain.
        h = scipy.signal.firwin(31, cutoff)
        h /= h.sum()
        return gain * scipy.signal.filtfilt(h, np.array([1.0]), x)

    wsize = int(frame_len * fs)
    wrate = int(frame_step * fs)
    if wsize <= 0 or wrate <= 0:
        # A zero hop would never advance through the signal.
        raise ValueError("frame_len and frame_step must each span at least "
                         "one sample at the given fs")
    if not 0 < min_lag < max_lag:
        raise ValueError("lag window must satisfy 0 < min_lag < max_lag")

    # Work on a scaled float64 copy so the caller's array is left untouched.
    # NOTE(review): 1/30000 presumably normalises 16-bit PCM; the clipping
    # level below is relative to each frame's peak, so the scale itself does
    # not influence the result.
    wav = np.asarray(wav, dtype=np.float64) / 30000.0
    # lowpass at 900Hz (cutoff expressed as a fraction of Nyquist)
    wav = lowpassfilter(wav, 2.0 * 900.0 / fs, 1.0)

    n_samples = wav.shape[0]
    n_frames = (n_samples - wsize) // wrate + 1 if n_samples >= wsize else 0
    # np.correlate(..., 'same')[wsize // 2:] holds lags 0 .. n_lags - 1.
    n_lags = wsize - wsize // 2
    if n_frames > 0 and n_lags <= min_lag:
        raise ValueError("frames are too short to search lag %d" % min_lag)

    pit = np.zeros(n_frames)
    for idx in range(n_frames):
        st = idx * wrate
        frm = wav[st:st + wsize].copy()

        # Three-level center clipping at 40% of the frame maximum.
        C = frm.max() * 0.4
        frm[np.abs(frm) < C] = 0.0
        frm[frm > C] = 1.0
        frm[frm < -C] = -1.0

        # Autocorrelation for non-negative lags; r[0] is the zero-lag energy.
        r = np.correlate(frm, frm, 'same')[wsize // 2:]
        if r[0] <= 0:
            # Frame clipped to all zeros: no periodicity to measure.
            continue

        # Threshold the peak relative to r[0] *before* any rescaling: a
        # min-max rescaled window always peaks at 1.0, which would turn the
        # voicing test into a no-op.
        r_limit = r[min_lag:max_lag] / r[0]
        peak = np.argmax(r_limit)
        if r_limit[peak] >= voicing_threshold:
            pit[idx] = fs / (peak + min_lag)
    return pit