def split_dict_and_save_ark(in_dict, split_num, data_folder, name, kaldi_cmd):
    """Split ``in_dict`` into ``split_num`` parts and write each one as an ark.

    The keys are grouped with ``chunks`` using a group size of
    ``ceil(len(in_dict) / split_num)``.  Part ``i`` (0-based) is written with
    ``dict2Ark`` to ``<data_folder>/<name>/normalized.<i + 1>``.

    Args:
        in_dict: Mapping whose entries are distributed over the output files.
        split_num: Number of output files to write.
        data_folder: Parent directory of the output directory.
        name: Name of the output directory; created when it does not exist.
        kaldi_cmd: Passed through unchanged to ``dict2Ark``.
    """
    all_keys = list(in_dict.keys())
    # Never ask for zero-sized chunks (empty input dict).
    dict_size = max(1, int(np.ceil(len(all_keys) / split_num)))
    split_keys = list(chunks(all_keys, dict_size))

    out_dir = join(data_folder, name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    for i in range(split_num):
        # Rounding the group size up can leave fewer than split_num groups
        # (e.g. 5 keys in 4 splits -> groups of 2 -> 3 groups), assuming
        # chunks() yields consecutive groups of dict_size keys.  The trailing
        # splits then receive no keys instead of raising IndexError.
        # NOTE(review): confirm dict2Ark accepts an empty dict.
        keys = split_keys[i] if i < len(split_keys) else []
        ark_file_name = join(out_dir, 'normalized.' + str(i + 1))
        dict2Ark(sub_dict(in_dict, keys), ark_file_name, kaldi_cmd)
for utt_id, mat in kaldi_io.read_mat_ark(cmd) } all_feats.append(feat_dict) post_dict = {} # I assume 3 streams from here for utt_id in all_feats[0]: batch_x1 = all_feats[0][utt_id][None, :, :] batch_x2 = all_feats[1][utt_id][None, :, :] batch_x3 = all_feats[2][utt_id][None, :, :] batch_l = Variable(torch.IntTensor([batch_x1.shape[1]])) out = model([batch_x1, batch_x2, batch_x3], batch_l) if config.prior: post_dict[utt_id] = lsm( out[0, :, :]).data.numpy() - config.prior_weight * prior else: if config.add_softmax: post_dict[utt_id] = sm(out[0, :, :]).data.numpy() else: post_dict[utt_id] = out[0, :, :].data.numpy() return post_dict if __name__ == '__main__': config = get_args() post_dict = get_output(config) dict2Ark(post_dict, os.path.abspath(config.save_file), kaldi_cmd='copy-feats')
def getFeats(args, srate=16000, window=np.hanning):
    """Compute per-frame modulation-spectrum features for every scp entry.

    Each utterance is cut into frames with ``getFrames`` and transformed
    (``freqAnalysis.ifft`` when ``args.complex_modulation`` is set, a scaled
    ``freqAnalysis.dct`` otherwise).  Every frame is weighted by each filter
    of the filter bank and passed through ``computeLpcFast`` and
    ``computeModSpecFromLpc``.  The selected coefficients of all filters are
    concatenated into one row per frame and the resulting
    ``{uttid: matrix}`` dict is written with ``dict2Ark``.

    Entries whose audio cannot be read are reported and skipped.

    Args:
        args: Parsed options.  Reads ``scp``, ``scp_type`` ('wav' or
            'segment'), ``outfile``, ``add_reverb``, ``coeff_0``,
            ``coeff_n``, ``order``, ``fduration``, ``frate``, ``nfilters``,
            ``kaldi_cmd``, ``fbank_type``, ``complex_modulation``,
            ``keep_even``, ``absolute_value``, ``compensate_noise`` and
            ``no_window``.
        srate: Sampling rate the input audio is expected to have.
        window: Window function handed to ``getFrames``; replaced by
            ``sq_wind`` when ``args.no_window`` is set.

    Raises:
        ValueError: On an invalid filter bank configuration, reverberation
            type or scp type.
        AssertionError: If a 'wav' entry was read successfully but its
            sampling rate differs from ``srate``.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    add_reverb = args.add_reverb
    coeff_0 = args.coeff_0
    coeff_n = args.coeff_n
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    # Set up the filter bank; its length depends on the transform in use.
    fbank_type = args.fbank_type.strip().split(',')
    if args.complex_modulation:
        dur = int(fduration * srate)
    else:
        dur = int(2 * fduration * srate)

    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters, dur, srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError(
                'Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print(
                '%s: Alpha is fixed and will not change as a function of the center frequency...'
                % sys.argv[0])
        fbank = createFbankCochlear(nfilters, dur, srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError(
            'Invalid type of filter bank, use mel or cochlear with proper configuration'
        )

    # Number of values kept per filter.
    coeff_num = coeff_n - coeff_0 + 1
    if args.keep_even:
        temp = np.arange(0, coeff_num)
        if coeff_0 % 2 == 0:
            # It starts from odd coefficients
            feat_len = temp[1::2].shape[0]
        else:
            feat_len = temp[0::2].shape[0]
    elif args.complex_modulation:
        if args.absolute_value:
            feat_len = coeff_num
        else:
            # Real and imaginary parts are stored one after the other.
            feat_len = 2 * coeff_num
    else:
        feat_len = coeff_num

    if args.compensate_noise:
        if args.complex_modulation:
            fmax = coeff_num / (fduration)
        else:
            fmax = coeff_num / (2 * fduration)
        faxis = np.linspace(0, fmax, coeff_n)

    if args.no_window:
        print('%s: Using square windows' % sys.argv[0])
        window = sq_wind

    if add_reverb:
        if add_reverb == 'small_room':
            _, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            _, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    all_feats = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                if inwav[-1] == '|':
                    # Entry is a shell pipeline that emits the wav on stdout.
                    try:
                        proc = subprocess.run(inwav[:-1],
                                              shell=True,
                                              stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                        skip_rest = False
                    except Exception:
                        skip_rest = True
                else:
                    try:
                        sr, signal = read(inwav)
                        skip_rest = False
                    except Exception:
                        skip_rest = True
                # Check the rate only after a successful read; otherwise
                # ``sr`` is unbound (first entry) or left over from the
                # previous utterance.
                if not skip_rest:
                    assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd,
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                    skip_rest = False
                except Exception:
                    skip_rest = True
            else:
                raise ValueError(
                    'Invalid type of scp type, it should be either wav or segment'
                )

            if skip_rest:
                # Best effort: report the unreadable entry and move on.
                print('%s: Could not read audio for %s, skipping' %
                      (sys.argv[0], uttid))
                sys.stdout.flush()
                continue

            # The signal is deliberately left unscaled here (the division by
            # 2**15 used elsewhere is disabled in this variant).
            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate, fduration,
                                             window)
            ])

            if args.complex_modulation:
                cos_trans = freqAnalysis.ifft(time_frames)
                cos_trans = cos_trans[:, :int(fduration * srate / 2)]
            else:
                cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                    2 * int(srate * fduration))

            # Two-value unpacking also rejects a non 2-D result.
            frame_num, _ = np.shape(cos_trans)

            feats = np.zeros((frame_num, nfilters * feat_len))

            print('%s: Computing Features for file: %s, also %d' %
                  (sys.argv[0], uttid, time_frames.shape[0]))
            sys.stdout.flush()

            for i in range(frame_num):
                each_feat = np.zeros([nfilters, feat_len])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]

                    if args.complex_modulation:
                        xlpc, gg = computeLpcFast(
                            band_dct, order,
                            keepreal=False)  # Compute LPC coefficients
                        mod_spec = computeModSpecFromLpc(gg, xlpc, coeff_n)
                        if args.compensate_noise:
                            mod_spec = mod_spec * faxis
                        if args.absolute_value:
                            temp2 = np.abs(mod_spec[coeff_0 - 1:coeff_n])
                        else:
                            temp2 = np.append(
                                np.real(mod_spec[coeff_0 - 1:coeff_n]),
                                np.imag(mod_spec[coeff_0 - 1:coeff_n]))
                    else:
                        xlpc, gg = computeLpcFast(band_dct, order)
                        mod_spec = np.real(
                            computeModSpecFromLpc(gg, xlpc, coeff_n))
                        if args.compensate_noise:
                            mod_spec = mod_spec * faxis
                        if args.absolute_value:
                            temp2 = np.abs(mod_spec[coeff_0 - 1:coeff_n])
                        else:
                            temp2 = mod_spec[coeff_0 - 1:coeff_n]

                    if args.keep_even:
                        if coeff_0 % 2 == 0:
                            each_feat[j, :] = temp2[1::2]
                        else:
                            each_feat[j, :] = temp2[0::2]
                    else:
                        each_feat[j, :] = temp2

                # One row per frame: all filters laid out back to back.
                each_feat = np.reshape(each_feat, (1, nfilters * feat_len))
                feats[i, :] = each_feat

            all_feats[uttid] = feats

    dict2Ark(all_feats, outfile, kaldi_cmd)
help='Set LPC gain to 1 (True)') parser.add_argument('--kaldi_cmd', help='Kaldi command to use to get ark files') args = parser.parse_args() start_time = time.time() print('%s: Computing MFCC features' % sys.argv[0]) sys.stdout.flush() all_mfcc = get_mfcc(args) print('%s: Computing Modulation Spectral features' % sys.argv[0]) sys.stdout.flush() all_modspec = get_modspec(args) print('%s: Combining Modulation Spectral features' % sys.argv[0]) sys.stdout.flush() all_feats = {} for uttid in list(all_mfcc.keys()): mfcc = all_mfcc[uttid] modspec = all_modspec[uttid] all_feats[uttid] = np.concatenate((modspec, mfcc), axis=1) dict2Ark(all_feats, args.outfile, args.kaldi_cmd) time_note = 'Execution Time: {t:.3f} seconds'.format(t=time.time() - start_time) print(time_note) sys.stdout.flush()
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every segment of a recording.

    ``args.scp`` maps wav ids to wav locations (a path, or a shell pipeline
    ending in ``|``).  ``args.segment`` lists one segment per line as
    ``<seg_id> <wav_id> <t_beg> <t_end>``, with the times multiplied by the
    sampling rate to obtain sample indices.  Each segment is cut out of its
    recording, divided by 2**15, optionally reverberated, framed with
    ``getFrames``, DCT-transformed, weighted by every filter of the filter
    bank and passed through ``computeLpcFast`` / ``computeModSpecFromLpc``.
    The resulting ``{seg_id: matrix}`` dict is written with ``dict2Ark``.

    A recording is only re-read when the wav id changes between consecutive
    segment lines.

    Args:
        args: Parsed options.  Reads ``scp``, ``segment``, ``outfile``,
            ``add_reverb``, ``set_unity_gain``, ``nmodulations``, ``order``,
            ``fduration``, ``frate``, ``nfilters`` and ``kaldi_cmd``.
        srate: Sampling rate the recordings are expected to have.
        window: Window function handed to ``getFrames``.

    Raises:
        ValueError: On an invalid reverberation type, or when a segment
            refers to a wav id that is missing from ``args.scp``.
        AssertionError: If a recording's sampling rate differs from ``srate``.
    """
    wavs = args.scp
    segment = args.segment
    outfile = args.outfile
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    if add_reverb:
        if add_reverb == 'small_room':
            _, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            _, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Map every wav id to its location.  A dict gives constant-time lookup;
    # setdefault keeps the first entry for a repeated id, matching the
    # first-match behaviour of the list search it replaces.
    wav_table = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            wav_table.setdefault(uttid, inwav)

    wav_in_buffer = ''  # Id of the wav file that is currently in RAM

    # Compute features for all the segments
    all_feats = {}
    with open(segment, 'r') as fid_s:
        for line_s in fid_s:
            token_s = line_s.strip().split()
            seg_id = token_s[0]
            wav_id = token_s[1]

            # Load the recording unless it is the one already in RAM.
            if wav_in_buffer != wav_id:
                wav_in_buffer = wav_id
                if wav_id not in wav_table:
                    # Same exception type as the former list.index lookup.
                    raise ValueError(
                        'wav id %r of segment %r not found in %s' %
                        (wav_id, seg_id, wavs))
                inwav = wav_table[wav_id]
                if inwav[-1] == '|':
                    # Entry is a shell pipeline that emits the wav on stdout.
                    proc = subprocess.run(inwav[:-1],
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal_big = read(io.BytesIO(proc.stdout))
                else:
                    sr, signal_big = read(inwav)
                assert sr == srate, 'Input file has different sampling rate.'

            # Segment boundaries converted to sample indices.
            t_beg = int(float(token_s[2]) * sr)
            t_end = int(float(token_s[3]) * sr)
            signal = signal_big[t_beg:t_end]
            signal = signal / np.power(2, 15)

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate, fduration,
                                             window)
            ])

            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))

            # Two-value unpacking also rejects a non 2-D result.
            frame_num, _ = np.shape(cos_trans)

            feats = np.zeros((frame_num, nfilters * nmodulations))

            print('%s: Computing Features for file: %s and segment: %s' %
                  (sys.argv[0], wav_id, seg_id))
            sys.stdout.flush()

            for i in range(frame_num):
                each_feat = np.zeros([nfilters, nmodulations])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    xlpc, gg = computeLpcFast(
                        band_dct, order)  # Compute LPC coefficients
                    if set_unity_gain:
                        gg = 1
                    mod_spec = computeModSpecFromLpc(gg, xlpc, nmodulations)
                    each_feat[j, :] = mod_spec

                # One row per frame: all filters laid out back to back.
                each_feat = np.reshape(each_feat,
                                       (1, nfilters * nmodulations))
                feats[i, :] = each_feat

            all_feats[seg_id] = feats

    dict2Ark(all_feats, outfile, kaldi_cmd)
def getFeats(args, srate=16000, window=np.hamming):
    """Compute log band-envelope features by overlap-adding per-frame pieces.

    Each utterance is optionally corrupted (noise / reverberation), framed
    with ``getFrames`` and DCT-transformed.  For every frame and filter the
    output of ``computeLpcFast`` / ``computeModSpecFromLpc`` is masked,
    optionally liftered / gamma-weighted / stripped of odd terms, passed
    through ``fft`` and ``exp``, re-windowed and added into a
    ``(nfilters, n_out_frames)`` buffer at a running position.  The log of
    the clipped, transposed buffer is stored per utterance and everything is
    written with ``dict2Ark``.  With ``args.write_utt2num_frames`` the frame
    count of every utterance is also written to ``<outfile>.len``.

    Entries whose audio cannot be read are reported and skipped.  ``args``
    is only read, never modified.

    NOTE: the write position advances by a hop plus ``randrange(2)`` after
    every frame but the first, so the output is not deterministic.

    Args:
        args: Parsed options.  Reads ``scp``, ``scp_type`` ('wav' or
            'segment'), ``outfile``, ``coeff_num``, ``coeff_range``
            ("low,high"), ``order``, ``fduration``, ``frate``, ``nfilters``,
            ``kaldi_cmd``, ``add_noise``, ``add_reverb``, ``lifter_config``,
            ``fbank_type``, ``odd_mod_zero``, ``overlap_fraction``,
            ``gamma_weight`` and ``write_utt2num_frames``.
        srate: Sampling rate the input audio is expected to have.
        window: Analysis window function; handed to ``getFrames`` and divided
            out again before the overlap-add.

    Raises:
        ValueError: On an invalid filter bank configuration, reverberation
            type or scp type.
        AssertionError: If a 'wav' entry was read successfully but its
            sampling rate differs from ``srate``.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    coeff_num = args.coeff_num
    coeff_range = args.coeff_range
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd
    add_noise = args.add_noise
    add_reverb = args.add_reverb

    if args.lifter_config:
        # First line of the file: comma-separated lifter weights.  The
        # context manager closes the handle, which used to be leaked.
        with open(args.lifter_config, 'r') as lifter_fid:
            lifter_config = lifter_fid.readline().strip().split(',')
        lifter_config = np.asarray([float(x) for x in lifter_config])

    # Set up the filter bank
    fbank_type = args.fbank_type.strip().split(',')
    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters, int(2 * fduration * srate), srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError('Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print('%s: Alpha is fixed and will not change as a function of the center frequency...' % sys.argv[0])
        fbank = createFbankCochlear(nfilters, int(2 * fduration * srate), srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError('Invalid type of filter bank, use mel or cochlear with proper configuration')

    # Ignore odd modulations
    if args.odd_mod_zero:
        print('%s: Ignoring odd modulations... ' % sys.argv[0])

    if add_noise:
        if add_noise == "clean" or add_noise == "diff":
            print('%s: No noise added!' % sys.argv[0])
        else:
            # "<noise source>,<level>": the level is used when mixing below.
            noise_info = add_noise.strip().split(',')
            noise = load_noise(noise_info[0])

    if add_reverb:
        if add_reverb == 'small_room':
            _, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'medium_room':
            _, rir = read('./RIR/RIR_MediumRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            _, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Set up mask: 1 for coefficient indices in [lowpass, highpass], else 0.
    coeff_range = coeff_range.split(',')
    lowpass = int(coeff_range[0])
    highpass = int(coeff_range[1])
    mask = np.asarray(
        [1 if lowpass <= i <= highpass else 0 for i in range(coeff_num)])

    # Local copies instead of overwriting the attributes on ``args``:
    # rewriting them in place made a second call on the same namespace flip
    # the overlap back and crash on ``.strip()`` of the already-split list.
    overlap_fraction = 1 - args.overlap_fraction
    gamma_weight = args.gamma_weight.strip().split(',')
    use_gamma = not gamma_weight[0] == "None"

    # Setup modulation weights
    if use_gamma:
        print('%s: Adding gamma filter on modulation frequencies...' % sys.argv[0])
        x = np.linspace(0, order - 1, order)
        scale = float(gamma_weight[0])
        shape = float(gamma_weight[1])
        pk_required = float(gamma_weight[2])
        res = 2 * fduration
        pk_required = pk_required * res
        pk = (shape - 1) * scale
        # Shift the pdf so that its peak lands on pk_required.
        loc = -pk + pk_required
        mod_wts = stats.gamma.pdf(x, a=shape, loc=loc, scale=scale) * 3 * scale

    # Quantities that do not depend on the utterance, frame or filter.
    lfr = 1 / (overlap_fraction * fduration)
    n_fft = 2 * int(fduration * frate)
    kk = int(np.round(fduration * frate))  # output frames per piece
    kkb2 = int(np.round(fduration * frate / 2))  # half a piece
    hop = int(np.round(fduration * frate * overlap_fraction))
    hann_win = np.hanning(kk)
    analysis_win = window(kk)

    all_feats = {}
    if args.write_utt2num_frames:
        all_lens = {}

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                if inwav[-1] == '|':
                    # Entry is a shell pipeline that emits the wav on stdout.
                    try:
                        proc = subprocess.run(inwav[:-1], shell=True,
                                              stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                        skip_rest = False
                    except Exception:
                        skip_rest = True
                else:
                    try:
                        sr, signal = read(inwav)
                        skip_rest = False
                    except Exception:
                        skip_rest = True
                # Check the rate only after a successful read; otherwise
                # ``sr`` is unbound (first entry) or left over from the
                # previous utterance.
                if not skip_rest:
                    assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd, shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                    skip_rest = False
                except Exception:
                    skip_rest = True
            else:
                raise ValueError('Invalid type of scp type, it should be either wav or segment')

            if skip_rest:
                # Best effort: report the unreadable entry and move on.
                print('%s: Could not read audio for %s, skipping' %
                      (sys.argv[0], uttid))
                sys.stdout.flush()
                continue

            # The signal is deliberately left unscaled here (the division by
            # 2**15 used elsewhere is disabled in this variant).
            if add_noise:
                if not add_noise == "clean":
                    if add_noise == "diff":
                        a = [1, 2, 3, 2, 0, -2, -5, -2, 0, 2, 3, 2, 1]
                        signal = convolve(signal, a, mode='same')
                    else:
                        signal = add_noise_to_wav(signal, noise,
                                                  float(noise_info[1]))

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            tframes = signal.shape[0]  # Number of samples in the signal
            time_frames = np.array([
                frame for frame in getFrames(signal, srate, lfr, fduration,
                                             window)
            ])
            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))

            # Two-value unpacking also rejects a non 2-D result.
            frame_num, _ = np.shape(cos_trans)

            feats = np.zeros((nfilters, int(np.ceil(tframes * frate / srate))))
            ptr = int(0)  # write position inside ``feats``

            print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
            sys.stdout.flush()

            for i in range(0, frame_num):
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    xlpc, gg = computeLpcFast(band_dct, order)  # Compute LPC coefficients
                    ms = computeModSpecFromLpc(gg, xlpc, coeff_num)
                    ms = ms * mask
                    if args.lifter_config:
                        ms = ms * lifter_config
                    if use_gamma:
                        ms = ms * mod_wts
                    if args.odd_mod_zero:
                        ms[1::2] = 0
                    ms = fft(ms, n_fft)
                    ms = np.abs(np.exp(ms))
                    # Swap the analysis window for a Hanning window
                    # (same left-to-right arithmetic as before).
                    ms = ms[0:kk] * hann_win / analysis_win

                    if i == 0:
                        # Only the second half of the first piece is used.
                        if feats.shape[1] < kkb2:
                            feats[j, :] += ms[kkb2:kkb2 + feats.shape[1]]
                        else:
                            feats[j, ptr:ptr + kkb2] += ms[kkb2:]
                    elif i == frame_num - 1 or i == frame_num - 2:
                        # Near the end, cut the piece to what still fits.
                        if ms.shape[0] >= feats.shape[1] - ptr:
                            feats[j, ptr:] += ms[:feats.shape[1] - ptr]
                        else:
                            feats[j, ptr:ptr + kk] += ms
                    else:
                        feats[j, ptr:ptr + kk] += ms

                # Advance once per frame, after all filters were written.
                if i == 0:
                    ptr = int(ptr + hop - kkb2)
                else:
                    ptr = int(ptr + hop + randrange(2))

            all_feats[uttid] = np.log(
                np.clip(feats.T, a_max=None, a_min=0.00000000000001))
            if args.write_utt2num_frames:
                all_lens[uttid] = feats.shape[1]

    dict2Ark(all_feats, outfile, kaldi_cmd)

    if args.write_utt2num_frames:
        with open(outfile + '.len', 'w+') as file:
            for key, lens in all_lens.items():
                p = "{:s} {:d}".format(key, lens)
                file.write(p)
                file.write("\n")
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every entry of ``args.scp``.

    Each utterance is read (from a path, or from the stdout of a shell
    pipeline when the entry ends in ``|``), divided by 2**15, optionally
    reverberated, framed with ``getFrames`` and DCT-transformed.  Every frame
    is weighted by each filter of the filter bank and passed through
    ``computeLpcFast`` / ``computeModSpecFromLpc``.  With
    ``args.set_unity_gain`` the gain is replaced by 1 and the first value of
    each result is dropped.  The ``{uttid: matrix}`` dict is written with
    ``dict2Ark``.

    Args:
        args: Parsed options.  Reads ``scp``, ``outfile``, ``add_reverb``,
            ``set_unity_gain``, ``nmodulations``, ``order``, ``fduration``,
            ``frate``, ``nfilters`` and ``kaldi_cmd``.
        srate: Sampling rate the input audio is expected to have.
        window: Window function handed to ``getFrames``.

    Raises:
        ValueError: On an invalid reverberation type.
        AssertionError: If a file's sampling rate differs from ``srate``.
    """
    scp_path = args.scp
    ark_path = args.outfile
    add_reverb = args.add_reverb
    unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    lpc_order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    # Room name -> impulse response file.
    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if add_reverb:
        if add_reverb in rir_files:
            sr_r, rir = read(rir_files[add_reverb])
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Values kept per filter: the first one is dropped with unity gain.
    n_keep = nmodulations - 1 if unity_gain else nmodulations
    row_len = nfilters * n_keep
    dct_scale = np.sqrt(2 * int(srate * fduration))

    with open(scp_path, 'r') as scp:
        all_feats = {}
        for entry in scp:
            fields = entry.strip().split()
            uttid = fields[0]
            inwav = ' '.join(fields[1:])

            if inwav[-1] == '|':
                # Entry is a shell pipeline that emits the wav on stdout.
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                source = io.BytesIO(proc.stdout)
            else:
                source = inwav
            sr, signal = read(source)
            assert sr == srate, 'Input file has different sampling rate.'

            # Scale the samples by 2**15.
            signal = signal / np.power(2, 15)

            if add_reverb and add_reverb != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frate, fduration, window)))
            cos_trans = freqAnalysis.dct(time_frames) / dct_scale

            [frame_num, ndct] = np.shape(cos_trans)
            feats = np.zeros((frame_num, row_len))

            print('%s: Computing Features for file: %s' %
                  (sys.argv[0], uttid))
            sys.stdout.flush()

            for t in range(frame_num):
                per_band = np.zeros([nfilters, n_keep])
                for b in range(nfilters):
                    band_dct = fbank[b, 0:-1] * cos_trans[t, :]
                    # Compute LPC coefficients
                    xlpc, gg = computeLpcFast(band_dct, lpc_order)
                    if unity_gain:
                        mod_spec = computeModSpecFromLpc(
                            1, xlpc, nmodulations)[1:]
                    else:
                        mod_spec = computeModSpecFromLpc(
                            gg, xlpc, nmodulations)
                    per_band[b, :] = mod_spec
                # One row per frame: all filters laid out back to back.
                feats[t, :] = np.reshape(per_band, (1, row_len))

            all_feats[uttid] = feats

        dict2Ark(all_feats, ark_path, kaldi_cmd)