class FFNNVAD(object):
    """This is an implementation of an FFNN-based voice activity detector.

    It only decides whether the input frame is speech or non-speech. It
    returns the posterior probability of speech for the N last input frames.
    """

    def __init__(self, cfg):
        self.cfg = cfg
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(self.cfg['model'])

        self.log_probs_speech = deque(maxlen=self.cfg['filter_length'])
        self.log_probs_sil = deque(maxlen=self.cfg['filter_length'])

        self.last_decision = 0.0

        if self.cfg['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['sample_rate'], self.cfg['framesize'],
                self.cfg['usehamming'], self.cfg['preemcoef'],
                self.cfg['numchans'], self.cfg['ceplifter'],
                self.cfg['numceps'], self.cfg['enormalise'],
                self.cfg['zmeansource'], self.cfg['usepower'],
                self.cfg['usec0'], self.cfg['usecmn'],
                self.cfg['usedelta'], self.cfg['useacc'],
                self.cfg['n_last_frames'] + self.cfg['n_prev_frames'],
                self.cfg['lofreq'], self.cfg['hifreq'],
                self.cfg['mel_banks_only'])
        else:
            raise ASRException('Unsupported frontend: %s' % (self.cfg['frontend'], ))

    def reset(self):
        self.log_probs_speech.clear()
        self.log_probs_sil.clear()

    def decide(self, data):
        """Processes the input audio and decides whether the current segment
        is speech or non-speech.

        The returned value lies between 0.0 and 1.0: 1.0 for a 100% speech
        segment and 0.0 for a 100% non-speech segment.
        """
        # Unpack the raw 16-bit PCM bytes into integer samples.
        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.cfg['framesize']:
            frame = self.audio_recorded_in[:self.cfg['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[self.cfg['frameshift']:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1, len(mfcc)))[0]

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # Average the normalised log posteriors of speech over the filter
            # window and convert back to a probability.
            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            self.last_decision = prob_speech_avg

        # Return the smoothed speech / non-speech decision.
        return self.last_decision
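# A minimal usage sketch for the cfg-driven class above. The cfg keys mirror
# the constructor's lookups, but every concrete value below is an illustrative
# assumption (not the project's defaults), and 'vad.nn' / pcm_chunks are
# hypothetical placeholders for a trained model file and a stream of raw
# 16-bit little-endian PCM byte buffers.

cfg = {
    'model': 'vad.nn', 'filter_length': 2, 'frontend': 'MFCC',
    'sample_rate': 16000, 'framesize': 512, 'frameshift': 160,
    'usehamming': True, 'preemcoef': 0.97, 'numchans': 26, 'ceplifter': 22,
    'numceps': 12, 'enormalise': True, 'zmeansource': True, 'usepower': True,
    'usec0': False, 'usecmn': False, 'usedelta': False, 'useacc': False,
    'n_last_frames': 0, 'n_prev_frames': 15, 'lofreq': 125, 'hifreq': 3800,
    'mel_banks_only': True,
}
vad = FFNNVAD(cfg)
for chunk in pcm_chunks:
    p_speech = vad.decide(chunk)   # smoothed speech posterior in [0.0, 1.0]
    is_speech = p_speech > 0.5     # e.g. threshold the posterior at 0.5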
class FFNNVADGeneral(object):
    """This is an implementation of an FFNN-based voice activity detector.

    It only decides whether the input frame is speech or non-speech. It
    returns the posterior probability of speech for the N last input frames.
    """

    def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
                 usehamming, preemcoef, numchans, ceplifter, numceps, enormalise,
                 zmeansource, usepower, usec0, usecmn, usedelta, useacc,
                 n_last_frames, n_prev_frames, lofreq, hifreq, mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(
            sample_rate, framesize, usehamming, preemcoef, numchans, ceplifter,
            numceps, enormalise, zmeansource, usepower, usec0, usecmn, usedelta,
            useacc, n_last_frames + n_prev_frames, lofreq, hifreq, mel_banks_only)

        self.samplerate = sample_rate
        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Processes the input audio and decides whether the current segment
        is speech or non-speech.

        The returned value lies between 0.0 and 1.0: 1.0 for a 100% speech
        segment and 0.0 for a 100% non-speech segment.
        """
        # Unpack the raw 16-bit PCM bytes into integer samples.
        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        num_frame = 0
        while len(self.audio_recorded_in) > self.framesize:
            num_frame += 1
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1, len(mfcc)))[0]
            print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # log_probs_speech and log_probs_sil are deques of maxlen
            # filter_length (e.g. 2): new values are appended at the end and
            # the oldest dropped from the front, so this zip re-averages every
            # value still in the window on each new frame.
            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)
            print 'frame: ', num_frame, 'time: ', num_frame * self.frameshift / float(self.samplerate), 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # Return the smoothed speech / non-speech decision.
        return self.last_decision
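# The averaging loop above takes the mean of the normalised *log* posteriors
# and then exponentiates, i.e. the smoothed score is the geometric mean of
# P(speech | frame) over the filter window. A self-contained sketch of the
# same computation follows; the per-frame probabilities are made-up numbers,
# and logsumexp is assumed to come from SciPy (the original import is not
# shown in this snippet).

from collections import deque
from math import log

import numpy as np
from scipy.misc import logsumexp   # scipy.special.logsumexp in newer SciPy

frame_probs = [(0.2, 0.8), (0.4, 0.6), (0.1, 0.9)]   # (P(sil), P(speech))
log_probs_speech = deque(maxlen=2)                   # filter_length = 2
log_probs_sil = deque(maxlen=2)

for prob_sil, prob_speech in frame_probs:
    log_probs_speech.append(log(prob_speech))
    log_probs_sil.append(log(prob_sil))
    # Mean of log(P(speech) / (P(speech) + P(sil))) over the window.
    avg = sum(ls - logsumexp([ls, lsil])
              for ls, lsil in zip(log_probs_speech, log_probs_sil))
    avg /= len(log_probs_speech)
    print np.exp(avg)   # geometric mean of the windowed speech posteriors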
class FFNNVAD(object):
    """This is an implementation of an FFNN-based voice activity detector.

    It only decides whether the input frame is speech or non-speech. It
    returns the posterior probability of speech for the N last input frames.
    """

    def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
                 usehamming, preemcoef, numchans, ceplifter, numceps, enormalise,
                 zmeansource, usepower, usec0, usecmn, usedelta, useacc,
                 n_last_frames, n_prev_frames, lofreq, hifreq, mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(
            sample_rate, framesize, usehamming, preemcoef, numchans, ceplifter,
            numceps, enormalise, zmeansource, usepower, usec0, usecmn, usedelta,
            useacc, n_last_frames + n_prev_frames, lofreq, hifreq, mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Processes the input audio and decides whether the current segment
        is speech or non-speech.

        The returned value lies between 0.0 and 1.0: 1.0 for a 100% speech
        segment and 0.0 for a 100% non-speech segment.
        """
        # Unpack the raw 16-bit PCM bytes into integer samples.
        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1, len(mfcc)))[0]

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # Average the normalised log posteriors of speech over the filter
            # window and convert back to a probability.
            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            self.last_decision = prob_speech_avg

        # Return the smoothed speech / non-speech decision.
        return self.last_decision
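# A minimal driver sketch for the explicit-parameter variant above, streaming
# a mono 16-bit WAV file through the detector with the standard-library wave
# module. 'vad.nn' and 'audio.wav' are hypothetical paths, and all parameter
# values are illustrative assumptions rather than recommended settings.

import wave

vad = FFNNVAD('vad.nn', filter_length=2, sample_rate=16000, framesize=512,
              frameshift=160, usehamming=True, preemcoef=0.97, numchans=26,
              ceplifter=22, numceps=12, enormalise=True, zmeansource=True,
              usepower=True, usec0=False, usecmn=False, usedelta=False,
              useacc=False, n_last_frames=0, n_prev_frames=15, lofreq=125,
              hifreq=3800, mel_banks_only=True)

wav = wave.open('audio.wav', 'rb')
while True:
    chunk = wav.readframes(512)        # 512 samples -> 1024 bytes of PCM
    if not chunk:
        break
    p_speech = vad.decide(chunk)       # smoothed speech posterior
wav.close()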