def mfcc_to_ivector(self, fea): n_data, d_data = fea.shape l = 0 lc = 0 n = np.zeros((self.numG), dtype=np.float32) f = np.zeros((self.numG, self.dimF), dtype=np.float32) print ' Computing stats ...', # Note that we compute the stats in in sub-chunks due to memory optimization # seq_data = self.split_seq(range(n_data), 1000) for i in range(len(seq_data)): dd = fea[seq_data[i], :] l1, n1, f1 = gmm.gmm_eval(dd, self.GMM, return_accums=1) l = l + l1.sum() lc = lc + l1.shape[0] n = n + n1 f = f + f1 print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']' n, f = self.normalize_stats(n, f, self.ubm_means, self.ubm_norm) f = self.row(f.astype(self.v.dtype)) n = self.row(n.astype(self.v.dtype)) print ' Computing i-vector' w = iv.estimate_i(n, f, self.v, self.MVVT).T print "IVECTOR", w return w
def compute_vad(s, win_length=160, win_overlap=80, n_realignment=5, threshold=0.3):
    """Energy-based voice activity detection with a 3-component GMM.

    The signal is squared, framed and summed into per-frame energies, which
    are mean/variance normalised.  A one-dimensional 3-component GMM
    (initial means -1, 0, 1) is re-estimated ``n_realignment`` times on the
    energies; a frame is flagged when the posterior of component 0 (the one
    initialised with the lowest mean) is below ``threshold``.

    Args:
        s: 1-D signal.  Integer input (e.g. int16 PCM) is converted to
            float64 first, because squaring it in its own dtype overflows.
        win_length: Frame length passed to ``features.framing``.
        win_overlap: Frame overlap; ``win_length - win_overlap`` is passed
            to ``features.framing`` as its third argument.
        n_realignment: Number of GMM re-estimation passes.
        threshold: Upper bound on the component-0 posterior for a frame to
            be flagged.

    Returns:
        1-D boolean array with one entry per frame, ``True`` for flagged
        frames.  All ``False`` when the frame energy has no variance
        (constant signal / digital silence) or there are no frames.
    """
    # Work in floating point: integer PCM wraps around when squared, and the
    # in-place normalisation below cannot be applied to an integer array.
    s = np.asarray(s)
    if not np.issubdtype(s.dtype, np.floating):
        s = s.astype(np.float64)

    # power signal for energy computation
    s = s ** 2

    # frame signal with overlap
    frames = features.framing(s, win_length, win_length - win_overlap)

    # sum frames to get energy
    E = frames.sum(axis=1)

    # normalize the energy
    E -= E.mean()
    std = E.std()
    if not std > 0:
        # Zero (or undefined) variance: dividing would only yield NaNs, for
        # which no frame can ever pass the threshold test below.
        return np.zeros(E.shape[0], dtype=bool)
    E /= std

    # initialization
    mm = np.array((-1.00, 0.00, 1.00))[:, np.newaxis]
    ee = np.array((1.00, 1.00, 1.00))[:, np.newaxis]
    ww = np.array((0.33, 0.33, 0.33))

    GMM = gmm.gmm_eval_prep(ww, mm, ee)

    E = E[:, np.newaxis]

    for _ in range(n_realignment):
        # collect GMM statistics
        _llh, N, F, S = gmm.gmm_eval(E, GMM, return_accums=2)

        # update model
        ww, mm, ee = gmm.gmm_update(N, F, S)

        # wrap model
        GMM = gmm.gmm_eval_prep(ww, mm, ee)

    # evaluate the gmm llhs and turn them into per-component posteriors
    llhs = gmm.gmm_llhs(E, GMM)
    llh = gmm.logsumexp(llhs, axis=1)[:, np.newaxis]
    llhs = np.exp(llhs - llh)

    # Builtin ``bool`` instead of the ``np.bool`` alias removed from NumPy.
    out = np.zeros(llhs.shape[0], dtype=bool)
    out[llhs[:, 0] < threshold] = True

    return out
def compute_vad(s, win_length=200, win_overlap=120, n_realignment=5, threshold=0.3):
    """Energy-based voice activity detection with a 3-component GMM.

    The signal is squared, framed and summed into per-frame energies, which
    are mean/variance normalised.  A one-dimensional 3-component GMM
    (initial means -1, 0, 1) is re-estimated ``n_realignment`` times on the
    energies; a frame is flagged when the posterior of component 0 (the one
    initialised with the lowest mean) is below ``threshold``.

    Args:
        s: 1-D signal.  Integer input (e.g. int16 PCM) is converted to
            float64 first, because squaring it in its own dtype overflows.
        win_length: Frame length passed to ``framing``.
        win_overlap: Frame overlap; ``win_length - win_overlap`` is passed
            to ``framing`` as its third argument.
        n_realignment: Number of GMM re-estimation passes.
        threshold: Upper bound on the component-0 posterior for a frame to
            be flagged.

    Returns:
        1-D boolean array with one entry per frame, ``True`` for flagged
        frames.  All ``False`` (and "File contains only silence" is logged)
        when the frame energy has no variance.
    """
    import gmm

    # Work in floating point: integer PCM wraps around when squared.
    s = np.asarray(s)
    if not np.issubdtype(s.dtype, np.floating):
        s = s.astype(np.float64)

    # power signal for energy computation
    s = s ** 2

    # frame signal with overlap
    frames = framing(s, win_length, win_length - win_overlap)

    # sum frames to get energy
    E = frames.sum(axis=1).astype(np.float64)

    # normalize the energy
    E -= E.mean()
    n_frames = E.shape[0]

    try:
        std = E.std()
        if not std > 0:
            # Detect silence explicitly instead of relying on a division
            # warning, which is only raised as an exception when the
            # warnings filter has been set to "error".
            logging.info("File contains only silence")
            return np.zeros(n_frames, dtype=bool)
        E /= std

        # initialization
        mm = np.array((-1.00, 0.00, 1.00))[:, np.newaxis]
        ee = np.array((1.00, 1.00, 1.00))[:, np.newaxis]
        ww = np.array((0.33, 0.33, 0.33))

        GMM = gmm.gmm_eval_prep(ww, mm, ee)

        E = E[:, np.newaxis]

        for _ in range(n_realignment):
            # collect GMM statistics
            _llh, N, F, S = gmm.gmm_eval(E, GMM, return_accums=2)

            # update model
            ww, mm, ee = gmm.gmm_update(N, F, S)

            # wrap model
            GMM = gmm.gmm_eval_prep(ww, mm, ee)

        # evaluate the gmm llhs and turn them into per-component posteriors
        llhs = gmm.gmm_llhs(E, GMM)
        llh = gmm.logsumexp(llhs, axis=1)[:, np.newaxis]
        llhs = np.exp(llhs - llh)

        # Builtin ``bool`` instead of the ``np.bool`` alias removed from NumPy.
        out = np.zeros(llhs.shape[0], dtype=bool)
        out[llhs[:, 0] < threshold] = True
    except RuntimeWarning:
        # Kept for callers that escalate warnings to errors: any numerical
        # warning raised above is treated like an all-silence file.
        logging.info("File contains only silence")
        out = np.zeros(n_frames, dtype=bool)

    return out
def main(argv): fbank_mx = features.mel_fbank_mx(winlen_nfft=WINDOWSIZE / SOURCERATE, fs=fs, NUMCHANS=NUMCHANS, LOFREQ=LOFREQ, HIFREQ=HIFREQ) scp_list = sys.argv[1] vad_dir = sys.argv[2] wav_dir = sys.argv[3] ubm_file = sys.argv[4] v_file = sys.argv[5] out_dir = sys.argv[6] print 'Loading UBM from', ubm_file ubm_weights, ubm_means, ubm_covs = load_ubm(ubm_file) GMM = gmm.gmm_eval_prep(ubm_weights, ubm_means, ubm_covs) numG = ubm_means.shape[0] dimF = ubm_means.shape[1] # normalization of statistics - precomputing matrices if ubm_covs.shape[1] == dimF: ubm_norm = 1 / np.sqrt(ubm_covs); print 'Loading T matrix from ', v_file, '...' v = np.loadtxt(v_file, dtype=np.float32) print 'Computing MVVT ...' MVVT = iv.compute_VtV(v, numG) print 'Loading list of files to process from ' + scp_list seg_list = np.atleast_1d(np.loadtxt(scp_list, dtype=object)) # extract all sub-dir names for dir in set(map(os.path.dirname, seg_list)): mkdir_p(out_dir + '/' + dir) # go over the scp and process the audio files for ii, fn in enumerate(seg_list, 1): try: print 'Processing ', ii, '/', len(seg_list), fn np.random.seed(777) wav_file = wav_dir + '/' + fn + '.wav' raw_file = wav_dir + '/' + fn + '.raw' lab_file = vad_dir + '/' + fn + '.lab.gz' ivec_out_file = out_dir + '/' + fn + '.ivec' if os.path.isfile(wav_file): print ' Reading wave file from ' + wav_file, rate, sig = spiowav.read(wav_file) if rate != 8000: raise Exception( 'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr( rate) + ' Hz detected') else: print ' Reading raw 8000Hz, 16bit-s, 1c, file from ' + raw_file, sig = np.fromfile(raw_file, dtype='int16') print '[t=' + repr(len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(len(sig)) + ' samples]' if ADDDITHER > 0.0: print ' Adding dither' sig = features.add_dither(sig, ADDDITHER) print ' Extracting features', fea = features.mfcc_htk(sig, window=WINDOWSIZE / SOURCERATE, noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE, 
fbank_mx=fbank_mx, _0='first', NUMCEPS=NUMCEPS, RAWENERGY=RAWENERGY, PREEMCOEF=PREEMCOEF, CEPLIFTER=CEPLIFTER, ZMEANSOURCE=ZMEANSOURCE, ENORMALISE=ENORMALISE, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True) print '[n=' + repr(len(fea)) + ' frames]' print ' Adding derivatives' # [add_deriv] step fea = features.add_deriv(fea, (deltawindow, accwindow)) print ' Reshaping to SFeaCat convention' # [reshape] step fea = fea.reshape(fea.shape[0], 3, -1).transpose((0, 2, 1)).reshape(fea.shape[0], -1) # re-order coeffs like SFeaCut if vad_dir == "auto": print ' Computing VAD ' vad, n_regions, n_frames = compute_vad(sig, win_length=WINDOWSIZE / SOURCERATE, win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)[:len(fea)] else: print ' Loading VAD definition from ' + lab_file vad, n_regions, n_frames = load_vad_lab_as_bool_vec(lab_file)[:len(fea)] print ' Applying VAD [#frames=' + repr(n_frames) + ', #regions=' + repr(n_regions) + ']' fea = fea[vad, ...] if len(fea) < 3: raise NoVadException('Too few frames left: ' + str(len(fea))) print ' Applying floating CMVN' fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True) n_data, d_data = fea.shape l = 0; lc = 0 n = np.zeros((numG), dtype=np.float32) f = np.zeros((numG, dimF), dtype=np.float32) print ' Computing stats ...', # Note that we compute the stats in in sub-chunks due to memory optimization # seq_data = split_seq(range(n_data), 1000) for i in range(len(seq_data)): dd = fea[seq_data[i], :] l1, n1, f1 = gmm.gmm_eval(dd, GMM, return_accums=1) l = l + l1.sum() lc = lc + l1.shape[0] n = n + n1; f = f + f1; print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']' n, f = normalize_stats(n, f, ubm_means, ubm_norm) f = row(f.astype(v.dtype)) n = row(n.astype(v.dtype)) print ' Computing i-vector' w = iv.estimate_i(n, f, v, MVVT).T # write it to the disk print ' Saving ivec to:', ivec_out_file # np.savetxt(ivec_out_file, w.ravel(), newline=' ', fmt='%f') ivio.write_binary_ivector(ivec_out_file, w.ravel(), n_data / 
100.0) except NoVadException as e: print e print "Warning: No features generated for segment: " + fn except: raise
# Normalise the features with floating CMVN, accumulate GMM statistics over
# all frames and estimate the i-vector `w` from them.
fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True)
n_data, d_data = fea.shape
# Running totals: `l` sums the values returned first by gmm_eval, `lc`
# counts them; `n` (numG,) and `f` (numG, dimF) accumulate the second and
# third return values (presumably zero- and first-order statistics).
l = 0
lc = 0
n = np.zeros((numG), dtype=np.float32)
f = np.zeros((numG, dimF), dtype=np.float32)
# The trailing comma keeps the cursor on this line so that the summary
# printed after the loop continues it.
print ' Computing stats ...',
# Note that the stats are computed in sub-chunks to limit memory usage.
seq_data = split_seq(range(n_data), 1000)
for i in range(len(seq_data)):
    dd = fea[seq_data[i], :]
    l1, n1, f1 = gmm.gmm_eval(dd, GMM, return_accums=1)
    l = l + l1.sum()
    lc = lc + l1.shape[0]
    n = n + n1
    f = f + f1
print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']'
n, f = normalize_stats(n, f, ubm_means, ubm_norm)
# Cast the statistics to the dtype of `v` before the i-vector estimation.
f = row(f.astype(v.dtype))
n = row(n.astype(v.dtype))
print ' Computing i-vector'
w = iv.estimate_i(n, f, v, MVVT).T
def process_wav(self, wav_file, mode="ivector", vad_dir="auto"): if mode not in ["ivector", "statistics", "mfcc"]: return False else: # all constans are initialized in __init__() method # READ WAVE AND COMPUTE IVECTOR sig, rate = librosa.load(wav_file) #print(librosa.get_duration(sig, rate)) # wav conversion sig, rate = self.wav_conversion(sig, rate) #import sounddevice as sd #sd.play(sig, rate) if rate != 8000: raise Exception( 'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr(rate) + ' Hz detected') # info about singnal printed print '[t=' + repr( len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr( len(sig)) + ' samples]' if ADDDITHER > 0.0: print ' Adding dither' sig = features.add_dither(sig, ADDDITHER) print ' Extracting features', fea = features.mfcc_htk(sig, window=WINDOWSIZE / SOURCERATE, noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE, fbank_mx=fbank_mx, _0='first', NUMCEPS=NUMCEPS, RAWENERGY=RAWENERGY, PREEMCOEF=PREEMCOEF, CEPLIFTER=CEPLIFTER, ZMEANSOURCE=ZMEANSOURCE, ENORMALISE=ENORMALISE, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True) print '[n=' + repr(len(fea)) + ' frames]' print ' Adding derivatives' # [add_deriv] step fea = features.add_deriv(fea, (deltawindow, accwindow)) print ' Reshaping to SFeaCat convention' # [reshape] step fea = fea.reshape(fea.shape[0], 3, -1).transpose( (0, 2, 1)).reshape(fea.shape[0], -1) # re-order coeffs like SFeaCut if vad_dir == "auto": print ' Computing VAD ' vad, n_regions, n_frames = self.compute_vad( sig, win_length=WINDOWSIZE / SOURCERATE, win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)[:len(fea)] print ' Applying VAD [#frames=' + repr( n_frames) + ', #regions=' + repr(n_regions) + ']' fea = fea[0:len(vad), ...] fea = fea[vad, ...] 
if len(fea) < 3: raise NoVadException('Too few frames left: ' + str(len(fea))) print ' Applying floating CMVN' fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True) if mode == "mfcc": return fea n_data, d_data = fea.shape l = 0 lc = 0 n = np.zeros((self.numG), dtype=np.float32) f = np.zeros((self.numG, self.dimF), dtype=np.float32) print ' Computing stats ...', # Note that we compute the stats in in sub-chunks due to memory optimization # seq_data = self.split_seq(range(n_data), 1000) for i in range(len(seq_data)): dd = fea[seq_data[i], :] l1, n1, f1 = gmm.gmm_eval(dd, self.GMM, return_accums=1) l = l + l1.sum() lc = lc + l1.shape[0] n = n + n1 f = f + f1 print '[avg llh=' + repr( l / lc) + ', #frames=' + repr(n_data) + ']' n, f = self.normalize_stats(n, f, self.ubm_means, self.ubm_norm) f = self.row(f.astype(self.v.dtype)) n = self.row(n.astype(self.v.dtype)) if mode == "statistics": return f, n print ' Computing i-vector' w = iv.estimate_i(n, f, self.v, self.MVVT).T print "IVECTOR", w if mode == "ivector": return w