def get_frame(self, file_name, frame_id):
    """Return the MFCC feature vector for one frame of a param (wave) file.

    When ``file_name`` differs from the previously processed file, the
    matching wave file is located, opened and validated (mono, 16-bit),
    the frame size/shift are derived from its sample rate, and a new MFCC
    front-end is built; the open handle is cached for subsequent calls.

    file_name -- logical name of the aligned input file
    frame_id  -- index of the frame; the window is centred on
                 frame_id * frame_shift samples
    """
    if self.last_file_name != file_name:
        self.last_file_name = file_name

        # Find the matching param (wave) file.
        param_file_name = self.get_param_file_name(file_name)
        if param_file_name is None:
            raise Exception("MLFMFCCOnlineAlignedArray: param_file_name cannot be None, file_name: " + file_name)

        # Close the previously cached wave file so OS handles do not leak.
        previous = getattr(self, 'last_param_file_features', None)
        if previous is not None:
            previous.close()

        # wave.open raises IOError / wave.Error on a bad file (never
        # AttributeError as the old code assumed); log and re-raise so the
        # failure is not silently swallowed with a stale handle left behind.
        try:
            self.last_param_file_features = wave.open(param_file_name, 'r')
        except (IOError, EOFError, wave.Error):
            print("Error opening file: %s" % param_file_name)
            raise

        if self.last_param_file_features.getnchannels() != 1:
            raise Exception('Input wave is not in mono')
        if self.last_param_file_features.getsampwidth() != 2:
            raise Exception('Input wave is not in 16bit')

        sample_rate = self.last_param_file_features.getframerate()

        # windowsize / targetrate are in HTK's 100ns units -> samples.
        self.frame_size = int(sample_rate * self.windowsize / 10000000)
        # Round the frame size up to the next power of two.
        if self.frame_size > 1024:
            self.frame_size = 2048
        elif self.frame_size > 512:
            self.frame_size = 1024
        elif self.frame_size > 256:
            self.frame_size = 512
        elif self.frame_size > 128:
            self.frame_size = 256
        elif self.frame_size > 64:
            self.frame_size = 128

        self.frame_shift = int(sample_rate * self.targetrate / 10000000)

        self.mfcc_front_end = MFCCFrontEnd(
            sample_rate, self.frame_size, usec0=self.usec0,
            usedelta=self.usedelta, useacc=self.useacc,
            n_last_frames=self.n_last_frames,
            mel_banks_only=self.mel_banks_only)

    # Centre the window on the frame position (clamped at the file start).
    self.last_param_file_features.setpos(
        max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
    frame = self.last_param_file_features.readframes(self.frame_size)
    frame = numpy.frombuffer(frame, dtype=numpy.int16)

    try:
        mfcc_params = self.mfcc_front_end.param(frame)
    except ValueError:
        # Diagnostic context before propagating the front-end failure.
        print("%s %s %s" % (file_name, frame_id, len(frame)))
        raise

    return mfcc_params
def get_frame(self, file_name, frame_id):
    """Return the MFCC feature vector for one frame of a param (wave) file.

    When ``file_name`` differs from the previously processed file, the
    matching wave file is located, opened and validated (mono, 16-bit),
    the frame size/shift are derived from its sample rate, and a new MFCC
    front-end is built; the open handle is cached for subsequent calls.

    file_name -- logical name of the aligned input file
    frame_id  -- index of the frame; the window is centred on
                 frame_id * frame_shift samples
    """
    if self.last_file_name != file_name:
        self.last_file_name = file_name

        # Find the matching param (wave) file.
        param_file_name = self.get_param_file_name(file_name)
        if param_file_name is None:
            raise Exception("get_frame: no param file found for file_name: " + file_name)

        # Close the previously cached wave file so OS handles do not leak.
        previous = getattr(self, 'last_param_file_features', None)
        if previous is not None:
            previous.close()

        self.last_param_file_features = wave.open(param_file_name, 'r')

        if self.last_param_file_features.getnchannels() != 1:
            raise Exception('Input wave is not in mono')
        if self.last_param_file_features.getsampwidth() != 2:
            raise Exception('Input wave is not in 16bit')

        sample_rate = self.last_param_file_features.getframerate()

        # windowsize / targetrate are in HTK's 100ns units -> samples.
        self.frame_size = int(sample_rate * self.windowsize / 10000000)
        # Round the frame size up to the next power of two.
        if self.frame_size > 1024:
            self.frame_size = 2048
        elif self.frame_size > 512:
            self.frame_size = 1024
        elif self.frame_size > 256:
            self.frame_size = 512
        elif self.frame_size > 128:
            self.frame_size = 256
        elif self.frame_size > 64:
            self.frame_size = 128

        self.frame_shift = int(sample_rate * self.targetrate / 10000000)

        self.mfcc_front_end = MFCCFrontEnd(
            sample_rate, self.frame_size, usec0=self.usec0,
            usedelta=self.usedelta, useacc=self.useacc,
            n_last_frames=self.n_last_frames)

    # Centre the window on the frame position (clamped at the file start).
    self.last_param_file_features.setpos(
        max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
    frame = self.last_param_file_features.readframes(self.frame_size)
    frame = numpy.frombuffer(frame, dtype=numpy.int16)

    return self.mfcc_front_end.param(frame)
def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
             usehamming, preemcoef, numchans, ceplifter, numceps, enormalise,
             zmeansource, usepower, usec0, usecmn, usedelta, useacc,
             n_last_frames, n_prev_frames, lofreq, hifreq, mel_banks_only):
    """Set up the FFNN VAD: load the network and build the MFCC front-end.

    model         -- path of the trained TheanoFFNN model to load
    filter_length -- size of the smoothing window over per-frame posteriors
    The remaining parameters configure the MFCC front-end.
    """
    # Frame geometry used when slicing the incoming audio buffer.
    self.framesize = framesize
    self.frameshift = frameshift

    # Raw samples accumulated between decide() calls.
    self.audio_recorded_in = []

    # Pre-trained feed-forward network producing sil/speech posteriors.
    self.ffnn = TheanoFFNN()
    self.ffnn.load(model)

    # Bounded windows of per-frame log posteriors used for smoothing.
    self.log_probs_speech = deque(maxlen=filter_length)
    self.log_probs_sil = deque(maxlen=filter_length)
    self.last_decision = 0.0

    # The network's input covers n_last_frames + n_prev_frames of context.
    self.front_end = MFCCFrontEnd(
        sample_rate, framesize, usehamming, preemcoef, numchans, ceplifter,
        numceps, enormalise, zmeansource, usepower, usec0, usecmn, usedelta,
        useacc, n_last_frames + n_prev_frames, lofreq, hifreq, mel_banks_only)
def __init__(self, cfg):
    """Initialise the VAD from the configuration dictionary.

    Loads the FFNN model and, for the 'MFCC' frontend, builds the MFCC
    front-end from the cfg['VAD']['ffnn'] section. Raises ASRException for
    any other frontend value.
    """
    self.cfg = cfg
    ffnn_cfg = self.cfg['VAD']['ffnn']

    # Raw samples accumulated between decide() calls.
    self.audio_recorded_in = []

    self.ffnn = TheanoFFNN()
    self.ffnn.load(ffnn_cfg['model'])

    # Bounded windows of per-frame log posteriors used for smoothing.
    self.log_probs_speech = deque(maxlen=ffnn_cfg['filter_length'])
    self.log_probs_sil = deque(maxlen=ffnn_cfg['filter_length'])
    self.last_decision = 0.0

    if ffnn_cfg['frontend'] != 'MFCC':
        raise ASRException('Unsupported frontend: %s' % (ffnn_cfg['frontend'], ))

    # The network's input covers n_last_frames + n_prev_frames of context.
    self.front_end = MFCCFrontEnd(
        self.cfg['Audio']['sample_rate'],
        ffnn_cfg['framesize'],
        ffnn_cfg['usehamming'],
        ffnn_cfg['preemcoef'],
        ffnn_cfg['numchans'],
        ffnn_cfg['ceplifter'],
        ffnn_cfg['numceps'],
        ffnn_cfg['enormalise'],
        ffnn_cfg['zmeansource'],
        ffnn_cfg['usepower'],
        ffnn_cfg['usec0'],
        ffnn_cfg['usecmn'],
        ffnn_cfg['usedelta'],
        ffnn_cfg['useacc'],
        ffnn_cfg['n_last_frames'] + ffnn_cfg['n_prev_frames'],
        ffnn_cfg['lofreq'],
        ffnn_cfg['hifreq'],
        ffnn_cfg['mel_banks_only'])
def __init__(self, cfg):
    """Build the VAD from configuration.

    Loads the FFNN model; when cfg['VAD']['ffnn']['frontend'] is 'MFCC',
    constructs the matching MFCC front-end, otherwise raises ASRException.
    """
    self.cfg = cfg
    vad_cfg = self.cfg['VAD']['ffnn']

    # Buffer of raw samples carried over between decide() calls.
    self.audio_recorded_in = []

    self.ffnn = TheanoFFNN()
    self.ffnn.load(vad_cfg['model'])

    # Smoothing windows over the per-frame log posteriors.
    self.log_probs_speech = deque(maxlen=vad_cfg['filter_length'])
    self.log_probs_sil = deque(maxlen=vad_cfg['filter_length'])
    self.last_decision = 0.0

    if vad_cfg['frontend'] != 'MFCC':
        raise ASRException('Unsupported frontend: %s' % (vad_cfg['frontend'], ))

    # Context size for the network is n_last_frames + n_prev_frames.
    self.front_end = MFCCFrontEnd(
        self.cfg['Audio']['sample_rate'],
        vad_cfg['framesize'],
        vad_cfg['usehamming'],
        vad_cfg['preemcoef'],
        vad_cfg['numchans'],
        vad_cfg['ceplifter'],
        vad_cfg['numceps'],
        vad_cfg['enormalise'],
        vad_cfg['zmeansource'],
        vad_cfg['usepower'],
        vad_cfg['usec0'],
        vad_cfg['usecmn'],
        vad_cfg['usedelta'],
        vad_cfg['useacc'],
        vad_cfg['n_last_frames'] + vad_cfg['n_prev_frames'],
        vad_cfg['lofreq'],
        vad_cfg['hifreq'],
        vad_cfg['mel_banks_only'])
def get_frame(self, file_name, frame_id):
    """Return the MFCC feature vector for one frame of a param (wave) file.

    On a change of ``file_name`` the matching wave file is opened and
    validated (mono, 16-bit), frame geometry is derived from its sample
    rate, and a fresh MFCC front-end is created; the handle is cached so
    later frames of the same file reuse it.

    file_name -- logical name of the aligned input file
    frame_id  -- index of the frame; the window is centred on
                 frame_id * frame_shift samples
    """
    if self.last_file_name != file_name:
        self.last_file_name = file_name

        # Locate the matching param (wave) file.
        param_file_name = self.get_param_file_name(file_name)
        if param_file_name is None:
            raise Exception("get_frame: no param file found for file_name: " + file_name)

        # Close the previously opened wave file to avoid leaking handles.
        previous = getattr(self, 'last_param_file_features', None)
        if previous is not None:
            previous.close()

        self.last_param_file_features = wave.open(param_file_name, 'r')

        if self.last_param_file_features.getnchannels() != 1:
            raise Exception('Input wave is not in mono')
        if self.last_param_file_features.getsampwidth() != 2:
            raise Exception('Input wave is not in 16bit')

        sample_rate = self.last_param_file_features.getframerate()

        # windowsize / targetrate are expressed in HTK's 100ns units.
        self.frame_size = int(sample_rate * self.windowsize / 10000000)
        # Round the frame size up to the next power of two.
        if self.frame_size > 1024:
            self.frame_size = 2048
        elif self.frame_size > 512:
            self.frame_size = 1024
        elif self.frame_size > 256:
            self.frame_size = 512
        elif self.frame_size > 128:
            self.frame_size = 256
        elif self.frame_size > 64:
            self.frame_size = 128

        self.frame_shift = int(sample_rate * self.targetrate / 10000000)

        self.mfcc_front_end = MFCCFrontEnd(
            sample_rate, self.frame_size, usec0=self.usec0,
            usedelta=self.usedelta, useacc=self.useacc,
            n_last_frames=self.n_last_frames)

    # Seek so the window is centred on the frame (clamped at file start).
    self.last_param_file_features.setpos(
        max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
    frame = self.last_param_file_features.readframes(self.frame_size)
    frame = numpy.frombuffer(frame, dtype=numpy.int16)

    return self.mfcc_front_end.param(frame)
def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
             usehamming, preemcoef, numchans, ceplifter, numceps, enormalise,
             zmeansource, usepower, usec0, usecmn, usedelta, useacc,
             n_last_frames, n_prev_frames, lofreq, hifreq, mel_banks_only):
    """Initialise the detector: load the FFNN model, build the front-end.

    model         -- path of the trained TheanoFFNN model
    filter_length -- smoothing-window size over per-frame posteriors
    All remaining parameters are forwarded to MFCCFrontEnd.
    """
    # Sample buffer shared across successive decide() calls.
    self.audio_recorded_in = []

    # Speech/non-speech posterior network.
    self.ffnn = TheanoFFNN()
    self.ffnn.load(model)

    # Per-frame log posteriors kept for smoothing the decision.
    self.log_probs_speech = deque(maxlen=filter_length)
    self.log_probs_sil = deque(maxlen=filter_length)
    self.last_decision = 0.0

    # Total temporal context = n_last_frames + n_prev_frames.
    self.front_end = MFCCFrontEnd(
        sample_rate, framesize, usehamming, preemcoef, numchans,
        ceplifter, numceps, enormalise, zmeansource, usepower, usec0,
        usecmn, usedelta, useacc, n_last_frames + n_prev_frames,
        lofreq, hifreq, mel_banks_only)

    # Frame geometry for buffer slicing in decide().
    self.framesize = framesize
    self.frameshift = frameshift
class MLFMFCCOnlineAlignedArray(MLFFeaturesAlignedArray):
    """Extension of MLFFeaturesAlignedArray computing features on the fly.

    Computes the MFCC features directly from the input wav files using our
    own implementation of the MFCC computation. As a result it does not
    give the same results as the HTK HCopy. The experience suggests that
    our MFCC features are worse than the features generated by HCopy.
    """

    def __init__(self, windowsize=250000, targetrate=100000, filter=None,
                 usec0=False, usedelta=True, useacc=True, n_last_frames=0,
                 mel_banks_only=False):
        """Initialise the MFCC front-end configuration.

        windowsize - length of the window (frame) in the HTK's 100ns units
        targetrate - period with which new coefficients should be generated
                     (again in 100ns units)
        """
        MLFFeaturesAlignedArray.__init__(self, filter)

        self.windowsize = windowsize
        self.targetrate = targetrate
        self.usec0 = usec0
        self.usedelta = usedelta
        self.useacc = useacc
        self.n_last_frames = n_last_frames
        self.mel_banks_only = mel_banks_only

        # Created lazily in get_frame() once the sample rate is known.
        self.mfcc_front_end = None

    def get_frame(self, file_name, frame_id):
        """Return the MFCC feature vector for one frame of a param file.

        On a change of ``file_name`` the matching wave file is opened and
        validated (mono, 16-bit), frame geometry is derived from its sample
        rate, and a fresh MFCC front-end is created; the handle is cached
        so later frames of the same file reuse it.
        """
        if self.last_file_name != file_name:
            self.last_file_name = file_name

            # Find the matching param (wave) file.
            param_file_name = self.get_param_file_name(file_name)
            if param_file_name is None:
                raise Exception("MLFMFCCOnlineAlignedArray: param_file_name cannot be None, file_name: " + file_name)

            # Close the previously cached wave file so handles do not leak.
            previous = getattr(self, 'last_param_file_features', None)
            if previous is not None:
                previous.close()

            # wave.open raises IOError / wave.Error on a bad file (never
            # AttributeError as the old code assumed); log and re-raise so
            # the failure is not silently swallowed.
            try:
                self.last_param_file_features = wave.open(param_file_name, 'r')
            except (IOError, EOFError, wave.Error):
                print("Error opening file: %s" % param_file_name)
                raise

            if self.last_param_file_features.getnchannels() != 1:
                raise Exception('Input wave is not in mono')
            if self.last_param_file_features.getsampwidth() != 2:
                raise Exception('Input wave is not in 16bit')

            sample_rate = self.last_param_file_features.getframerate()

            # windowsize / targetrate are in HTK's 100ns units -> samples.
            self.frame_size = int(sample_rate * self.windowsize / 10000000)
            # Round the frame size up to the next power of two.
            if self.frame_size > 1024:
                self.frame_size = 2048
            elif self.frame_size > 512:
                self.frame_size = 1024
            elif self.frame_size > 256:
                self.frame_size = 512
            elif self.frame_size > 128:
                self.frame_size = 256
            elif self.frame_size > 64:
                self.frame_size = 128

            self.frame_shift = int(sample_rate * self.targetrate / 10000000)

            self.mfcc_front_end = MFCCFrontEnd(
                sample_rate, self.frame_size, usec0=self.usec0,
                usedelta=self.usedelta, useacc=self.useacc,
                n_last_frames=self.n_last_frames,
                mel_banks_only=self.mel_banks_only)

        # Centre the window on the frame position (clamped at file start).
        self.last_param_file_features.setpos(
            max(frame_id * self.frame_shift - int(self.frame_size / 2), 0))
        frame = self.last_param_file_features.readframes(self.frame_size)
        frame = numpy.frombuffer(frame, dtype=numpy.int16)

        try:
            mfcc_params = self.mfcc_front_end.param(frame)
        except ValueError:
            # Diagnostic context before propagating the front-end failure.
            print("%s %s %s" % (file_name, frame_id, len(frame)))
            raise

        return mfcc_params
class FFNNVADGeneral(object):
    """FFNN-based voice activity detector.

    It only implements decisions whether an input frame is speech or
    non-speech and returns the posterior probability of speech for the N
    last input frames.
    """

    def __init__(self, model, filter_length, sample_rate, framesize,
                 frameshift, usehamming, preemcoef, numchans, ceplifter,
                 numceps, enormalise, zmeansource, usepower, usec0, usecmn,
                 usedelta, useacc, n_last_frames, n_prev_frames, lofreq,
                 hifreq, mel_banks_only):
        """Load the FFNN model and construct the MFCC front-end."""
        # Sample buffer carried over between decide() calls.
        self.audio_recorded_in = []

        # Speech/non-speech posterior network.
        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        # Smoothing windows over per-frame log posteriors.
        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)
        self.last_decision = 0.0

        # Total temporal context = n_last_frames + n_prev_frames.
        self.front_end = MFCCFrontEnd(
            sample_rate, framesize, usehamming, preemcoef, numchans,
            ceplifter, numceps, enormalise, zmeansource, usepower, usec0,
            usecmn, usedelta, useacc, n_last_frames + n_prev_frames,
            lofreq, hifreq, mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Process incoming audio and return the smoothed P(speech).

        The returned value is in [0.0, 1.0]: 1.0 for a 100% speech segment
        and 0.0 for a 100% non-speech segment.
        """
        # data holds 16-bit little-endian samples; unpack to ints.
        samples = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(samples)

        # Consume complete frames, advancing the buffer by frameshift.
        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict_normalise(
                mfcc.reshape(1, len(mfcc)))[0]

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # Average the normalised log P(speech) over the smoothing window.
            avg_log_speech = 0.0
            for lp_speech, lp_sil in zip(self.log_probs_speech,
                                         self.log_probs_sil):
                avg_log_speech += lp_speech - logsumexp([lp_speech, lp_sil])
            avg_log_speech /= len(self.log_probs_speech)

            self.last_decision = np.exp(avg_log_speech)

        # Smoothed speech / non-speech decision.
        return self.last_decision
class FFNNVAD():
    """FFNN-based voice activity detector.

    It only implements decisions whether an input frame is speech or
    non-speech and returns the posterior probability of speech for the N
    last input frames.
    """

    def __init__(self, cfg):
        """Initialise from configuration: load the model, build the front-end."""
        self.cfg = cfg
        vad_cfg = self.cfg['VAD']['ffnn']

        # Sample buffer carried over between decide() calls.
        self.audio_recorded_in = []

        self.ffnn = FFNN()
        self.ffnn.load(vad_cfg['model'])

        # Smoothing windows over per-frame log posteriors.
        self.log_probs_speech = deque(maxlen=vad_cfg['filter_length'])
        self.log_probs_sil = deque(maxlen=vad_cfg['filter_length'])
        self.last_decision = 0.0

        if vad_cfg['frontend'] != 'MFCC':
            raise ASRException('Unsupported frontend: %s' % (vad_cfg['frontend'], ))

        self.front_end = MFCCFrontEnd(
            self.cfg['Audio']['sample_rate'],
            vad_cfg['framesize'],
            vad_cfg['usehamming'],
            vad_cfg['preemcoef'],
            vad_cfg['numchans'],
            vad_cfg['ceplifter'],
            vad_cfg['numceps'],
            vad_cfg['enormalise'],
            vad_cfg['zmeansource'],
            vad_cfg['usepower'],
            vad_cfg['usec0'],
            vad_cfg['usecmn'],
            vad_cfg['usedelta'],
            vad_cfg['useacc'],
            vad_cfg['n_last_frames'],
            vad_cfg['lofreq'],
            vad_cfg['hifreq'])

    def decide(self, data):
        """Process incoming audio and return the smoothed P(speech).

        The returned value is in [0.0, 1.0]: 1.0 for a 100% speech segment
        and 0.0 for a 100% non-speech segment.
        """
        vad_cfg = self.cfg['VAD']['ffnn']

        # data holds 16-bit samples; unpack to ints and buffer them.
        samples = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(samples)

        # Consume complete frames, advancing the buffer by frameshift.
        while len(self.audio_recorded_in) > vad_cfg['framesize']:
            frame = self.audio_recorded_in[:vad_cfg['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[vad_cfg['frameshift']:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict(mfcc)

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # Average the normalised log P(speech) over the smoothing window.
            avg_log_speech = 0.0
            for lp_speech, lp_sil in zip(self.log_probs_speech,
                                         self.log_probs_sil):
                avg_log_speech += lp_speech - logsumexp([lp_speech, lp_sil])
            avg_log_speech /= len(self.log_probs_speech)

            self.last_decision = np.exp(avg_log_speech)

        # Smoothed speech / non-speech decision.
        return self.last_decision
class FFNNVADGeneral(object):
    """FFNN-based voice activity detector.

    It only implements decisions whether an input frame is speech or
    non-speech and returns the posterior probability of speech for the N
    last input frames.
    """

    def __init__(self, model, filter_length, sample_rate, framesize,
                 frameshift, usehamming, preemcoef, numchans, ceplifter,
                 numceps, enormalise, zmeansource, usepower, usec0, usecmn,
                 usedelta, useacc, n_last_frames, n_prev_frames, lofreq,
                 hifreq, mel_banks_only):
        """Load the FFNN model and construct the MFCC front-end."""
        # Frame geometry for the buffer slicing in decide().
        self.framesize = framesize
        self.frameshift = frameshift

        # Sample buffer carried over between decide() calls.
        self.audio_recorded_in = []

        # Speech/non-speech posterior network.
        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        # Smoothing windows over per-frame log posteriors.
        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)
        self.last_decision = 0.0

        # Total temporal context = n_last_frames + n_prev_frames.
        self.front_end = MFCCFrontEnd(
            sample_rate, framesize, usehamming, preemcoef, numchans,
            ceplifter, numceps, enormalise, zmeansource, usepower, usec0,
            usecmn, usedelta, useacc, n_last_frames + n_prev_frames,
            lofreq, hifreq, mel_banks_only)

    def decide(self, data):
        """Process incoming audio and return the smoothed P(speech).

        The returned value is in [0.0, 1.0]: 1.0 for a 100% speech segment
        and 0.0 for a 100% non-speech segment.
        """
        # data holds 16-bit samples; unpack to ints and buffer them.
        unpacked = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(unpacked)

        # Consume complete frames, advancing the buffer by frameshift.
        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict_normalise(
                mfcc.reshape(1, len(mfcc)))[0]

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # Mean normalised log P(speech) over the smoothing window.
            total = sum(lps - logsumexp([lps, lpn])
                        for lps, lpn in zip(self.log_probs_speech,
                                            self.log_probs_sil))
            self.last_decision = np.exp(total / len(self.log_probs_speech))

        # Smoothed speech / non-speech decision.
        return self.last_decision
class FFNNVAD():
    """FFNN-based voice activity detector.

    It only implements decisions whether an input frame is speech or
    non-speech and returns the posterior probability of speech for the N
    last input frames.
    """

    def __init__(self, cfg):
        """Initialise from configuration: load the model, build the front-end."""
        self.cfg = cfg
        ffnn_cfg = self.cfg['VAD']['ffnn']

        # Sample buffer carried over between decide() calls.
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(ffnn_cfg['model'])

        # Smoothing windows over per-frame log posteriors.
        self.log_probs_speech = deque(maxlen=ffnn_cfg['filter_length'])
        self.log_probs_sil = deque(maxlen=ffnn_cfg['filter_length'])
        self.last_decision = 0.0

        if ffnn_cfg['frontend'] != 'MFCC':
            raise ASRException('Unsupported frontend: %s' % (ffnn_cfg['frontend'], ))

        # The network sees n_last_frames + n_prev_frames frames of context.
        self.front_end = MFCCFrontEnd(
            self.cfg['Audio']['sample_rate'],
            ffnn_cfg['framesize'],
            ffnn_cfg['usehamming'],
            ffnn_cfg['preemcoef'],
            ffnn_cfg['numchans'],
            ffnn_cfg['ceplifter'],
            ffnn_cfg['numceps'],
            ffnn_cfg['enormalise'],
            ffnn_cfg['zmeansource'],
            ffnn_cfg['usepower'],
            ffnn_cfg['usec0'],
            ffnn_cfg['usecmn'],
            ffnn_cfg['usedelta'],
            ffnn_cfg['useacc'],
            ffnn_cfg['n_last_frames'] + ffnn_cfg['n_prev_frames'],
            ffnn_cfg['lofreq'],
            ffnn_cfg['hifreq'],
            ffnn_cfg['mel_banks_only'])

    def decide(self, data):
        """Process incoming audio and return the smoothed P(speech).

        The returned value is in [0.0, 1.0]: 1.0 for a 100% speech segment
        and 0.0 for a 100% non-speech segment.
        """
        ffnn_cfg = self.cfg['VAD']['ffnn']

        # data holds 16-bit samples; unpack to ints and buffer them.
        samples = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(samples)

        # Consume complete frames, advancing the buffer by frameshift.
        while len(self.audio_recorded_in) > ffnn_cfg['framesize']:
            frame = self.audio_recorded_in[:ffnn_cfg['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[ffnn_cfg['frameshift']:]

            mfcc = self.front_end.param(frame)
            prob_sil, prob_speech = self.ffnn.predict_normalise(
                mfcc.reshape(1, len(mfcc)))[0]

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            # Average the normalised log P(speech) over the smoothing window.
            avg_log_speech = 0.0
            for lp_speech, lp_sil in zip(self.log_probs_speech,
                                         self.log_probs_sil):
                avg_log_speech += lp_speech - logsumexp([lp_speech, lp_sil])
            avg_log_speech /= len(self.log_probs_speech)

            self.last_decision = np.exp(avg_log_speech)

        # Smoothed speech / non-speech decision.
        return self.last_decision