def main(argv): fbank_mx = features.mel_fbank_mx(winlen_nfft=WINDOWSIZE / SOURCERATE, fs=fs, NUMCHANS=NUMCHANS, LOFREQ=LOFREQ, HIFREQ=HIFREQ) scp_list = sys.argv[1] vad_dir = sys.argv[2] wav_dir = sys.argv[3] ubm_file = sys.argv[4] v_file = sys.argv[5] out_dir = sys.argv[6] print 'Loading UBM from', ubm_file ubm_weights, ubm_means, ubm_covs = load_ubm(ubm_file) GMM = gmm.gmm_eval_prep(ubm_weights, ubm_means, ubm_covs) numG = ubm_means.shape[0] dimF = ubm_means.shape[1] # normalization of statistics - precomputing matrices if ubm_covs.shape[1] == dimF: ubm_norm = 1 / np.sqrt(ubm_covs); print 'Loading T matrix from ', v_file, '...' v = np.loadtxt(v_file, dtype=np.float32) print 'Computing MVVT ...' MVVT = iv.compute_VtV(v, numG) print 'Loading list of files to process from ' + scp_list seg_list = np.atleast_1d(np.loadtxt(scp_list, dtype=object)) # extract all sub-dir names for dir in set(map(os.path.dirname, seg_list)): mkdir_p(out_dir + '/' + dir) # go over the scp and process the audio files for ii, fn in enumerate(seg_list, 1): try: print 'Processing ', ii, '/', len(seg_list), fn np.random.seed(777) wav_file = wav_dir + '/' + fn + '.wav' raw_file = wav_dir + '/' + fn + '.raw' lab_file = vad_dir + '/' + fn + '.lab.gz' ivec_out_file = out_dir + '/' + fn + '.ivec' if os.path.isfile(wav_file): print ' Reading wave file from ' + wav_file, rate, sig = spiowav.read(wav_file) if rate != 8000: raise Exception( 'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr( rate) + ' Hz detected') else: print ' Reading raw 8000Hz, 16bit-s, 1c, file from ' + raw_file, sig = np.fromfile(raw_file, dtype='int16') print '[t=' + repr(len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(len(sig)) + ' samples]' if ADDDITHER > 0.0: print ' Adding dither' sig = features.add_dither(sig, ADDDITHER) print ' Extracting features', fea = features.mfcc_htk(sig, window=WINDOWSIZE / SOURCERATE, noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE, fbank_mx=fbank_mx, _0='first', NUMCEPS=NUMCEPS, RAWENERGY=RAWENERGY, PREEMCOEF=PREEMCOEF, CEPLIFTER=CEPLIFTER, ZMEANSOURCE=ZMEANSOURCE, ENORMALISE=ENORMALISE, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True) print '[n=' + repr(len(fea)) + ' frames]' print ' Adding derivatives' # [add_deriv] step fea = features.add_deriv(fea, (deltawindow, accwindow)) print ' Reshaping to SFeaCat convention' # [reshape] step fea = fea.reshape(fea.shape[0], 3, -1).transpose((0, 2, 1)).reshape(fea.shape[0], -1) # re-order coeffs like SFeaCut if vad_dir == "auto": print ' Computing VAD ' vad, n_regions, n_frames = compute_vad(sig, win_length=WINDOWSIZE / SOURCERATE, win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)[:len(fea)] else: print ' Loading VAD definition from ' + lab_file vad, n_regions, n_frames = load_vad_lab_as_bool_vec(lab_file)[:len(fea)] print ' Applying VAD [#frames=' + repr(n_frames) + ', #regions=' + repr(n_regions) + ']' fea = fea[vad, ...] if len(fea) < 3: raise NoVadException('Too few frames left: ' + str(len(fea))) print ' Applying floating CMVN' fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True) n_data, d_data = fea.shape l = 0; lc = 0 n = np.zeros((numG), dtype=np.float32) f = np.zeros((numG, dimF), dtype=np.float32) print ' Computing stats ...', # Note that we compute the stats in in sub-chunks due to memory optimization # seq_data = split_seq(range(n_data), 1000) for i in range(len(seq_data)): dd = fea[seq_data[i], :] l1, n1, f1 = gmm.gmm_eval(dd, GMM, return_accums=1) l = l + l1.sum() lc = lc + l1.shape[0] n = n + n1; f = f + f1; print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']' n, f = normalize_stats(n, f, ubm_means, ubm_norm) f = row(f.astype(v.dtype)) n = row(n.astype(v.dtype)) print ' Computing i-vector' w = iv.estimate_i(n, f, v, MVVT).T # write it to the disk print ' Saving ivec to:', ivec_out_file # np.savetxt(ivec_out_file, w.ravel(), newline=' ', fmt='%f') ivio.write_binary_ivector(ivec_out_file, w.ravel(), n_data / 100.0) except NoVadException as e: print e print "Warning: No features generated for segment: " + fn except: raise
raise Exception( 'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr(rate) + ' Hz detected') else: print ' Reading raw 8000Hz, 16bit-s, 1c, file from ' + raw_file, sig = np.fromfile(raw_file, dtype='int16') print '[t=' + repr( len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr( len(sig)) + ' samples]' if ADDDITHER > 0.0: print ' Adding dither' sig = features.add_dither(sig, ADDDITHER) print ' Extracting features', fea = features.mfcc_htk(sig, window=WINDOWSIZE / SOURCERATE, noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE, fbank_mx=fbank_mx, _0='first', NUMCEPS=NUMCEPS, RAWENERGY=RAWENERGY, PREEMCOEF=PREEMCOEF, CEPLIFTER=CEPLIFTER, ZMEANSOURCE=ZMEANSOURCE, ENORMALISE=ENORMALISE, ESCALE=0.1,
fbank_mx = features.mel_fbank_mx(winlen, samplerate, NUMCHANS=64, LOFREQ=20.0, HIFREQ=7600, htk_bug=False) else: raise ValueError( f'Only 8kHz and 16kHz are supported. Got {samplerate} instead.' ) LC = 150 RC = 149 np.random.seed(3) # for reproducibility signal = features.add_dither( (signal * 2**15).astype(int)) for segnum in range(len(labs)): seg = signal[labs[segnum, 0]:labs[segnum, 1]] if seg.shape[ 0] > 0.01 * samplerate: # process segment only if longer than 0.01s # Mirror noverlap//2 initial and final samples seg = np.r_[seg[noverlap // 2 - 1::-1], seg, seg[-1:-winlen // 2 - 1:-1]] fea = features.fbank_htk(seg, window, noverlap, fbank_mx, USEPOWER=True, ZMEANSOURCE=True) fea = features.cmvn_floating_kaldi(
def process_wav(self, wav_file, mode="ivector", vad_dir="auto"): if mode not in ["ivector", "statistics", "mfcc"]: return False else: # all constans are initialized in __init__() method # READ WAVE AND COMPUTE IVECTOR sig, rate = librosa.load(wav_file) #print(librosa.get_duration(sig, rate)) # wav conversion sig, rate = self.wav_conversion(sig, rate) #import sounddevice as sd #sd.play(sig, rate) if rate != 8000: raise Exception( 'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr(rate) + ' Hz detected') # info about singnal printed print '[t=' + repr( len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr( len(sig)) + ' samples]' if ADDDITHER > 0.0: print ' Adding dither' sig = features.add_dither(sig, ADDDITHER) print ' Extracting features', fea = features.mfcc_htk(sig, window=WINDOWSIZE / SOURCERATE, noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE, fbank_mx=fbank_mx, _0='first', NUMCEPS=NUMCEPS, RAWENERGY=RAWENERGY, PREEMCOEF=PREEMCOEF, CEPLIFTER=CEPLIFTER, ZMEANSOURCE=ZMEANSOURCE, ENORMALISE=ENORMALISE, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True) print '[n=' + repr(len(fea)) + ' frames]' print ' Adding derivatives' # [add_deriv] step fea = features.add_deriv(fea, (deltawindow, accwindow)) print ' Reshaping to SFeaCat convention' # [reshape] step fea = fea.reshape(fea.shape[0], 3, -1).transpose( (0, 2, 1)).reshape(fea.shape[0], -1) # re-order coeffs like SFeaCut if vad_dir == "auto": print ' Computing VAD ' vad, n_regions, n_frames = self.compute_vad( sig, win_length=WINDOWSIZE / SOURCERATE, win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)[:len(fea)] print ' Applying VAD [#frames=' + repr( n_frames) + ', #regions=' + repr(n_regions) + ']' fea = fea[0:len(vad), ...] fea = fea[vad, ...] if len(fea) < 3: raise NoVadException('Too few frames left: ' + str(len(fea))) print ' Applying floating CMVN' fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True) if mode == "mfcc": return fea n_data, d_data = fea.shape l = 0 lc = 0 n = np.zeros((self.numG), dtype=np.float32) f = np.zeros((self.numG, self.dimF), dtype=np.float32) print ' Computing stats ...', # Note that we compute the stats in in sub-chunks due to memory optimization # seq_data = self.split_seq(range(n_data), 1000) for i in range(len(seq_data)): dd = fea[seq_data[i], :] l1, n1, f1 = gmm.gmm_eval(dd, self.GMM, return_accums=1) l = l + l1.sum() lc = lc + l1.shape[0] n = n + n1 f = f + f1 print '[avg llh=' + repr( l / lc) + ', #frames=' + repr(n_data) + ']' n, f = self.normalize_stats(n, f, self.ubm_means, self.ubm_norm) f = self.row(f.astype(self.v.dtype)) n = self.row(n.astype(self.v.dtype)) if mode == "statistics": return f, n print ' Computing i-vector' w = iv.estimate_i(n, f, self.v, self.MVVT).T print "IVECTOR", w if mode == "ivector": return w
fs, NUMCHANS=40, LOFREQ=20.0, HIFREQ=7600, htk_bug=False) LC = 150 RC = 149 with open(out_seg_fn, "w") as seg_file: with open(out_ark_fn, "wb") as ark_file: for fn in file_names: labs = ( np.loadtxt(in_lab_dir + "/" + fn + ".lab", usecols=(0, 1)) * 16000).astype(int) signal, samplerate = sf.read(in_flac_dir + "/" + fn + ".flac") signal = features.add_dither( (signal * 2**(samplerate / 1000 - 1)).astype(int)) for segnum in range(len(labs)): seg = signal[labs[segnum, 0]:labs[segnum, 1]] seg = np.r_[ seg[noverlap // 2 - 1::-1], seg, seg[-1:-winlen // 2 - 1:-1]] # Mirror noverlap//2 initial and final samples fea = features.fbank_htk(seg, window, noverlap, fbank_mx, USEPOWER=True, ZMEANSOURCE=True) fea = features.cmvn_floating_kaldi(fea, LC, RC,