def context_gen(i):
    # get feature filename
    filename = f[i][len(feature_addr):-4]
    print(filename)
    # read using htk: nframes X channels
    data_read = htk.open(feature_addr + filename + '.htk')
    x = data_read.getall()
    # apply cmvn
    varnorm = 1
    # get mean across time along each channel
    mu = np.mean(x, 0)
    mu = mu.reshape(1, mu.shape[0])
    # get standard deviation across time along each channel
    eps = np.spacing(np.float32(1.0))
    if varnorm == 1:
        stddev = np.std(x, 0)
        stddev = stddev.reshape(1, stddev.shape[0])
    else:
        stddev = 1
    y = (x - mu) / (stddev + eps)  # uses broadcasting for element-wise division
    # store feature
    writer = htk.open(store_addr + filename + '.htk', mode='w', veclen=y.shape[1])
    writer.writeall(y)
    writer.close()
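# Added sketch (not from the original sources): a minimal, self-contained
# illustration of the CMVN step in context_gen above, assuming only numpy;
# the htk reader/writer and the globals f, feature_addr and store_addr are
# deliberately left out.
import numpy as np

x = np.random.randn(100, 64).astype(np.float32)  # nframes x channels
mu = np.mean(x, axis=0, keepdims=True)           # per-channel mean over time
stddev = np.std(x, axis=0, keepdims=True)        # per-channel std over time
eps = np.spacing(np.float32(1.0))                # guards against division by zero
y = (x - mu) / (stddev + eps)                    # broadcasting normalizes each channel
assert y.shape == x.shape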
def Data_Getter(trainfile, valfile):
    print('Getting and Prepping Data')
    train = htk.open(trainfile)
    train_data = train.getall()
    np.random.shuffle(train_data)
    print('train data loaded')
    print(train_data.shape)
    val = htk.open(valfile)
    val_data = val.getall()
    np.random.shuffle(val_data)
    print('val data loaded')
    print(val_data.shape)
    Y_train = train_data[:, -1]
    X_train = train_data[:, :-1]
    del train_data
    time.sleep(5)
    Y_train = Y_train.reshape(Y_train.shape[0], 1)
    Y_train = Y_train.astype(np.int8)
    Y_train = np_utils.to_categorical(Y_train, 3)
    Y_val = val_data[:, -1]
    X_val = val_data[:, :-1]
    del val_data
    time.sleep(5)
    Y_val = Y_val.reshape(Y_val.shape[0], 1)
    Y_val = Y_val.astype(np.int8)
    Y_val = np_utils.to_categorical(Y_val, 3)
    print('Shapes of train and val data')
    print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)
    return (X_train, X_val, Y_train, Y_val)
def data_creator(num, addr, file_reader, filename):
    corrupt_files = 0
    noscdlab = 0
    scdlab = 0
    matrix = np.empty((0, num))
    changedir()
    # num is the final feature vector size to be written (including the label;
    # ensure that by looking at the bottom entry)
    writer = htk.open(filename + '.htk', mode='w', veclen=num)
    for i in range(len(file_reader)):
        print "Starting with file: ", i
        data_read = htk.open(addr + file_reader[i] + '.htk')  # opening the Gamma-Label HTK file
        print file_reader[i]
        # kurt_matrix=sio.loadmat(kurt_addr+file_reader[i]+'.mat')['kurt']  # opening the kurtosis matrix for a file
        # sfm_matrix=sio.loadmat(sfm_addr+file_reader[i]+'.mat')['sfm']  # opening the sfm_matrix file
        # labels_this_file=sio.loadmat(label_addr+file_reader[i]+'.mat')['labels']
        ### Kurtosis and sfm are row vectors, that is (1, Number of frames)
        ### GAMMATONE -- LABEL <--- Structure of the final matrix
        try:
            read_data = data_read.getall()
            id1 = (1, 2)[file_reader[i][0] == 'M']
            temp_index = file_reader[i].index("-")
            id2 = (1, 2)[file_reader[i][temp_index + 1] == 'M']
            print "ID1: ", id1, " ID2: ", id2
            gender_label = return_vec(read_data[:, -1], id1, id2)
            read_data = np.hstack((read_data, gender_label))
            # print "Raw shape: ", read_data.shape
            read_data = filter_data(read_data)  # we lose the structure of the file because of shuffling
            # print "Filtered data shape: ", read_data.shape
            scdlab += len(np.where(read_data[:, -1] == 1)[0])
            noscdlab += read_data.shape[0] - len(np.where(read_data[:, -1] == 1)[0])
            # id1 and id2 are integers: 2 if male, 1 if female
            # kurt_vector=np.transpose(kurt_matrix)
            # sfm_vector=np.transpose(sfm_matrix)
            # label_vector=np.transpose(labels_this_file)
            # final_vector=np.hstack((read_data,kurt_vector,sfm_vector,label_vector))
            final_vector = read_data
            # matrix=np.vstack((matrix,final_vector))
            del read_data
        except:
            print "In the corrupt file section"
            corrupt_files += 1
            continue
        # ind=ind+read_data.shape[0]
        # HTK supports concatenation, so we don't have to deal with a numpy matrix again and again
        writer.writeall(final_vector)
    print('corrupt_files', corrupt_files)
    f = open(save_extra, 'w')
    write_string = str(scdlab) + "," + str(noscdlab) + ", Corrupt: " + str(corrupt_files)
    f.write(write_string)
    f.close()
def data_creator(num, addr, file_reader, filename):
    corrupt_files = 0
    noscdlab = 0
    scdlab = 0
    changedir()
    # num is the final feature vector size to be written (including the label;
    # ensure that by looking at the bottom entry)
    writer = htk.open(filename + '.htk', mode='w', veclen=num)
    # for i in range(1):
    for i in range(len(file_reader)):
        print "Starting with file: ", i
        data_read = htk.open(addr + file_reader[i] + '.htk')    # opening the Gamma-Label HTK file
        pitch_read = htk.open(paddr + file_reader[i] + '.htk')  # opening the pitch variance file
        # kurt_matrix=sio.loadmat(kurt_addr+file_reader[i]+'.mat')['kurt']  # opening the kurtosis matrix for a file
        # sfm_matrix=sio.loadmat(sfm_addr+file_reader[i]+'.mat')['sfm']  # opening the sfm_matrix file
        # labels_this_file=sio.loadmat(label_addr+file_reader[i]+'.mat')['labels']
        ### Kurtosis and sfm are row vectors, that is (1, Number of frames)
        ### GAMMATONE -- LABEL -- GenderLabel <--- Structure of the final matrix
        try:
            read_data = data_read.getall()
            read_pitch = pitch_read.getall()
            variance_vector = read_pitch[:, -2]  # getting the variance vector
            variance_vector = variance_vector[0:read_data.shape[0]]
            # print "Variance obtained"
            # print "read_data shape: ", read_data.shape
            # print "variance shape: ", variance_vector.shape
            read_data = np.insert(read_data, -1, variance_vector, axis=1)
            print "Variance inserted"
            id1 = (1, 2)[file_reader[i][0] == 'M']
            temp_index = file_reader[i].index("-")
            id2 = (1, 2)[file_reader[i][temp_index + 1] == 'M']
            read_data[:, -1] = read_data[:, -1] - 1
            gender_label = return_vec(read_data[:, -1], id1, id2)
            read_data = np.hstack((read_data, gender_label))
            read_data = filter_data(read_data)
            scdlab += len(np.where(read_data[:, -2] == 1)[0])
            noscdlab += len(np.where(read_data[:, -2] == 0)[0])
            # id1 and id2 are integers: 2 if male, 1 if female (see the (1, 2)[...] selection above)
            # kurt_vector=np.transpose(kurt_matrix)
            # sfm_vector=np.transpose(sfm_matrix)
            # label_vector=np.transpose(labels_this_file)
            # final_vector=np.hstack((read_data,kurt_vector,sfm_vector,label_vector))
            final_vector = read_data
            # matrix=np.vstack((matrix,final_vector))
            del read_data
        except:
            corrupt_files += 1
            print "In the corrupt file section", corrupt_files
            continue
        # ind=ind+read_data.shape[0]
        # HTK supports concatenation, so we don't have to deal with a numpy matrix again and again
        writer.writeall(final_vector)
    print('Corrupt_files', corrupt_files)
    f = open(save_extra, 'w')
    write_string = str(scdlab) + "," + str(noscdlab) + ", Corrupt: " + str(corrupt_files)
    f.write(write_string)
    f.close()
def Data_Getter(filename):
    gamma = htk.open('/home/siddharthm/scd/context/600/gamma/train/' + filename + '.htk')  # gammatone context feats
    pitch = htk.open('/home/siddharthm/scd/context/600/pitch/train/' + filename + '.htk')  # pitch context feats
    temp_gamma = gamma.getall()
    temp_pitch = pitch.getall()
    only_pitch = temp_pitch[:, 0]   # extracting only the pitch value
    x_val = temp_gamma[:, :-1]      # only gammatone values, here 64*61
    y_val = temp_gamma[:, -1]       # the real labels, i.e. from the ground truth
    y_val = y_val.reshape(y_val.shape[0], 1)
    y_val = y_val.astype(np.int8)
    print(x_val.shape, y_val.shape)
    return (x_val, only_pitch, y_val)
def loadData(inputData):
    featsReader = htk.open(inputData)
    trainData = featsReader.getall()
    yTrain = trainData[:, -1]
    xTrain = np.delete(trainData, -1, 1)
    del trainData
    return (xTrain, yTrain)
def generate_file_statistics(filename, mixture_number):
    """Load GMM at current iteration/mixture and calculate the statistics for
    filename. At the end, store the statistics in the 'stats' directory."""
    # Here we use a scikit-format GMM, so as to be able to use its methods
    model_file_name = 'models/gmm' + mixture_number
    gmm = pickle.load(open(model_file_name, 'rb'))
    # Load features:
    features = htkmfc.open(filename)
    data = features.getall()
    # Calculate Prob(X | gmm)
    prob_data_given_model = gmm.scikitGmm.predict_proba(data)
    # Calculate 0th, 1st, and 2nd order statistics
    zeroth_order_stats = np.sum(prob_data_given_model, axis=0)
    first_order_stats = np.dot(data.T, prob_data_given_model)
    second_order_stats = np.dot(np.power(data.T, 2), prob_data_given_model)
    print zeroth_order_stats.shape, first_order_stats.shape, second_order_stats.shape
    file_stats = [zeroth_order_stats, first_order_stats, second_order_stats]
    # Store statistics:
    basename = filename.split('/')[-1]
    output_name = '/erasable/nxs113020/stats/' + basename + '.stats'
    with open(output_name, 'wb') as pickle_file:
        pickle.dump(file_stats, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
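# Added sketch (assumption: scikit-learn's GaussianMixture stands in for
# gmm.scikitGmm). It reproduces the 0th/1st/2nd order sufficient statistics
# computed in generate_file_statistics above, on random data instead of HTK features.
import numpy as np
from sklearn.mixture import GaussianMixture

data = np.random.randn(500, 13)             # nframes x feature_dim
gmm = GaussianMixture(n_components=4).fit(data)
post = gmm.predict_proba(data)              # P(component | frame), nframes x n_components
zeroth = np.sum(post, axis=0)               # shape (n_components,)
first = np.dot(data.T, post)                # shape (feature_dim, n_components)
second = np.dot(np.power(data.T, 2), post)  # shape (feature_dim, n_components)
print(zeroth.shape, first.shape, second.shape)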
def loadMFCCs(self, URI_recording_noExt, extractedPitchList, sectionLink):
    '''
    For now extracted with HTK, read in matlab and serialized to a txt file.
    '''
    URI_recording = URI_recording_noExt + '.wav'
    URIRecordingChunkResynthesized = sectionLink.URIRecordingChunk + '.wav'
    logging.info("working on sectionLink: {}".format(URIRecordingChunkResynthesized))

    # resynthesize audio chunk:
    if ParametersAlgo.POLYPHONIC:
        if not os.path.isfile(URIRecordingChunkResynthesized):  # only if resynth file does not exist
            logging.info("doing harmonic models and resynthesis for segment: {} ...".format(URIRecordingChunkResynthesized))
            if extractedPitchList == None:
                extractedPitchList = extractPredominantPitch(URI_recording_noExt, 2048, 128, jointAnalysis=True)
            hfreq, hmag, hphase, fs, hopSizeMelodia, inputAudioFromTsToTs = extractHarmSpec(
                URI_recording, extractedPitchList, sectionLink.beginTs, sectionLink.endTs,
                ParametersAlgo.THRESHOLD_PEAKS)
            resynthesize(hfreq, hmag, hphase, fs, hopSizeMelodia, URIRecordingChunkResynthesized)
    else:
        sampleRate = 44100
        loader = essentia.standard.MonoLoader(filename=URI_recording, sampleRate=sampleRate)
        audio = loader()
        audioChunk = audio[sectionLink.beginTs * sampleRate:sectionLink.endTs * sampleRate]
        monoWriter = essentia.standard.MonoWriter(filename=URIRecordingChunkResynthesized)
        monoWriter(audioChunk)

    # call htk to extract features
    URImfcFile = self._extractMFCCs(URIRecordingChunkResynthesized)

    # read features from binary htk file
    logging.debug("reading MFCCs from {} ...".format(URImfcFile))
    HTKFeat_reader = htkmfc.open(URImfcFile, 'rb')
    mfccsFeatrues = HTKFeat_reader.getall()

    if ParametersAlgo.FOR_MAKAM and ParametersAlgo.OBS_MODEL == 'GMM':
        # makam models are trained with 25-dim features (no energy, no delta-deltas)
        mfccs_no_energy = mfccsFeatrues[:, 0:12]
        mfccDeltas = mfccsFeatrues[:, 13:26]
        mfccsFeatrues = np.hstack((mfccs_no_energy, mfccDeltas))
    return mfccsFeatrues
def data_creator(num, addr, file_reader, filename):
    corrupt_files = 0
    ind = 0
    writer = htk.open(filename + '.htk', mode='w', veclen=num)
    for i in range(int(len(file_reader))):
        print(i)
        data_read = htk.open(addr + file_reader[i] + '.htk')
        try:
            read_data = data_read.getall()
        except:
            corrupt_files += 1
            continue
        ind = ind + read_data.shape[0]
        print(read_data.shape)
        writer.writeall(read_data)
    print('corrupt_files', corrupt_files)
def load_data_val(valfile):
    a = htk.open(valfile)
    data = a.getall()
    print "Done loading the validation data: ", data.shape
    data = filter_data(data)
    x_val = data[:, :-1]
    Y_val = data[:, -1]
    Y_val = np.reshape(Y_val, (Y_val.shape[0], 1))
    y_val = np_utils.to_categorical(Y_val, 2)
    del data
    return x_val, y_val
def load_data_test(testfile):
    a = htk.open(testfile)
    data = a.getall()
    print "Done loading the testing data: ", data.shape
    data = filter_data(data)
    x_test = data[:, :-1]
    Y_test = data[:, -1]
    print np.where(Y_test == 2)
    # Y_test=np.reshape(Y_test,(Y_test.shape[0],1))
    # y_test=np_utils.to_categorical(Y_test,2)
    del data
    return x_test, Y_test
def __call__(self, line):
    cline = clean(line)
    if VERBOSE:
        print(cline)
    likelihoods = self.comp_likelihoods(htkmfc.open(cline).getall())
    s = '"' + cline[:-3] + 'rec"\n' + \
        string_mlf(self.map_states_to_phones,
                   viterbi(likelihoods, self.transitions,
                           self.map_states_to_phones,
                           using_bigram=self.using_bigram)[0],
                   phones_only=True) + '.\n'
    return s
def load_data_test(testfile):
    a = htk.open(testfile)
    data = a.getall()
    print "Done loading the testing data: ", data.shape
    x_test = cnn_reshaper(data[:, :-2])
    Y_test = data[:, -2]
    print np.where(Y_test == 2)
    # Y_test=np.reshape(Y_test,(Y_test.shape[0],1))
    # y_test=np_utils.to_categorical(Y_test,2)
    gender_labels = data[:, -1]
    del data
    return x_test, Y_test, gender_labels
def loadData(inputData):
    featsReader = htk.open(inputData)
    trainData = featsReader.getall()
    np.random.shuffle(trainData)
    yUtt = trainData[:, -1]
    trainData = np.delete(trainData, -1, 1)
    ySpkTrain = trainData[:, -1]
    trainData = np.delete(trainData, -1, 1)
    yKwTrain = trainData[:, -1]
    xTrain = np.delete(trainData, -1, 1)
    del trainData
    return (xTrain, ySpkTrain.astype(int), yKwTrain.astype(int), yUtt.astype(int))
def data_getter(testfile):
    print 'getting and prepping data'
    val = htk.open(testfile)
    val_data = val.getall()
    Y_test = val_data[:, -1]
    X_test = val_data[:, :-1]
    del val_data
    time.sleep(5)
    Y_test = Y_test.reshape(Y_test.shape[0], 1)
    Y_test = Y_test.astype(np.int8)
    return X_test, Y_test
def extract_from_mlf(mlf):
    x = np.ndarray((0, N_MFCC_COEFFS + N_EMA_COEFFS), dtype='float32')
    y = []
    with open(mlf) as f:
        tmp_len_x = 0  # verify sizes
        for line in f:
            line = line.rstrip('\n')
            if len(line) < 1:
                continue
            if line[0] == '"':
                if tmp_len_x != 0:
                    print("the file above this one was mismatching x and y lengths", line)
                t = htkmfc.open(line.strip('"')[:-3] + 'mfc')  # .lab -> .mfc
                mfc_file = t.getall()
                with open(line.strip('"')[:-4] + '_ema.npy') as ema_f:  # .lab -> _ema.npy
                    ema_file = np.load(ema_f)[:, 2:]
                x_file = np.concatenate(from_mfcc_ema_to_mfcc_arti_tuple(mfc_file, ema_file), axis=1)
                x = np.append(x, x_file, axis=0)
                tmp_len_x = mfc_file.shape[0]
            elif line[0].isdigit():
                start, end, state = line.split()[:3]
                start = (int(start) + 1) // (MFCC_TIMESTEP * 10000)  # htk times are in 100 ns units
                end = (int(end) + 1) // (MFCC_TIMESTEP * 10000)      # htk times are in 100 ns units
                for i in range(start, end):
                    tmp_len_x -= 1
                    y.append(state)
    assert (len(y) == x.shape[0])
    rootname = mlf[:-4]
    np.save(rootname + '_xdata.npy', x)
    yy = np.array(y)
    np.save(rootname + '_ylabels.npy', yy)
    print("length x:", len(x), " length y:", len(y))
    print("shape x:", x.shape, "shape yy:", yy.shape)
    if TEST:
        tx = np.load(rootname + '_xdata.npy')
        ty = np.load(rootname + '_ylabels.npy')
        if np.all(tx == x) and np.all(ty == yy):
            print("SUCCESS: serialized and current in-memory arrays are equal")
            sys.exit(0)
        else:
            print("ERROR: serialized and current in-memory arrays differ!")
            sys.exit(-1)
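# Added sketch (not from the original sources). HTK label files give start/end
# times in 100 ns units; with MFCC_TIMESTEP in milliseconds, MFCC_TIMESTEP * 10000
# is one frame in the same units, so integer division recovers frame indices as in
# extract_from_mlf above. The timestep value here is an assumption for illustration.
MFCC_TIMESTEP = 10                                        # ms per frame (assumed)
start_htk, end_htk = 1200000, 1800000                     # example HTK times (100 ns units)
start_frame = (start_htk + 1) // (MFCC_TIMESTEP * 10000)  # -> 12
end_frame = (end_htk + 1) // (MFCC_TIMESTEP * 10000)      # -> 18
print(start_frame, end_frame)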
def load_features(self, file_list):
    """Read text file containing list of file-names and load feature files as numpy arrays."""
    fin = open(file_list, 'r')
    for i in fin:
        filename = i.strip()
        features = htkmfc.open(filename)
        data = features.getall()
        if self.training_data is None:
            self.training_data = data
        else:
            self.training_data = np.vstack((self.training_data, data))
    self.number_of_frames, self.feature_dimension = self.training_data.shape
    fin.close()
def load_data_val(valfile):
    a = htk.open(valfile)
    data = a.getall()
    print "Done loading the validation data: ", data.shape
    data = filter_data_val(data)
    x_val = data[:, :-2]
    Y_val = data[:, -2]
    # print np.where(Y_val==1)
    Y_val = np.reshape(Y_val, (Y_val.shape[0], 1))
    y_val = np_utils.to_categorical(Y_val, 2)
    # print np.where(y_val[:,1]==1)
    gender_val = data[:, -1]
    del data
    return x_val, y_val, gender_val
def GetHTKfea(wav_fd, fe_fd, n_delete):
    print('Entered GetHTKfea')
    print(wav_fd)
    names = [na for na in os.listdir(wav_fd) if na.endswith('.16kHz.fb40')]
    print(names)
    extlen = len('16kHz.fb40') + 1
    names = sorted(names)
    for na in names:
        print na
        path = wav_fd + '/' + na
        print path
        mfc_reader = htkmfc.open(path, mode='rb')
        X = mfc_reader.getall()
        X = X[:, n_delete:]
def load_data_train(trainfile): print "Getting the training data" a=htk.open(trainfile) train_data=a.getall() print "Done with Loading the training data: ",train_data.shape data=filter_data(train_data) x_train=data[:,:-1] #Set to different column based on different model Y_train=data[:,-1] print Y_train.shape print np.where(Y_train==2) Y_train=Y_train.reshape(Y_train.shape[0],1) Y_train=Y_train.astype(np.int8) y_train=np_utils.to_categorical(Y_train,2) del data return x_train,y_train
def get_frames(bname, warpfreq): """Return concatenated vad frames from bname """ key = (bname, warpfreq) if key not in _cache: r = [] for start, end in intervals[bname]: mfcfile = path.join(english_vtln_dir, 'warp_freq_{:.2f}'.format(warpfreq), bname + '.mfc') mfc = htkmfc.open(mfcfile).getall() start_fr = start * FRATE end_fr = end * FRATE r.append(mfc[start_fr:end_fr]) _cache[key] = np.vstack(r) return _cache[key]
def load_data_val(valfile, scaler):
    a = htk.open(valfile)
    data = a.getall()
    print "Done loading the validation data: ", data.shape
    data = filter_data_val(data)
    x_val = data[:, :-2]
    # x_val=scaler.transform(x_val)
    Y_val = data[:, -2]
    # print np.where(Y_val==1)
    Y_val = np.reshape(Y_val, (Y_val.shape[0], 1))
    y_val = np_utils.to_categorical(Y_val, 2)
    # print np.where(y_val[:,1]==1)
    gender_val = data[:, -1]
    del data
    # x_val has the pitch variances and also the gammatone values
    return x_val, y_val, gender_val
def GetHTKfea(wav_fd, fe_fd, n_delete):
    names = [na for na in os.listdir(wav_fd) if na.endswith('.16kHz.fb40')]
    extlen = len('16kHz.fb40') + 1
    names = sorted(names)
    for na in names:
        print na
        path = wav_fd + '/' + na
        print path
        mfc_reader = htkmfc.open(path, mode='rb')
        X = mfc_reader.getall()
        X = X[:, n_delete:]
        print X.shape  # (1291, 40)
        out_path = fe_fd + '/' + na[0:-extlen] + '.f'  # change na[0:-4]
        cPickle.dump(X, open(out_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
def get_frames(bname, warpfreq): """Return concatenated vad frames from bname """ key = (bname, warpfreq) if key not in _cache: r = [] for start, end in intervals[bname]: mfcfile = path.join( english_vtln_dir, 'warp_freq_{:.2f}'.format(warpfreq), bname + '.mfc') mfc = htkmfc.open(mfcfile).getall() start_fr = start * FRATE end_fr = end * FRATE r.append(mfc[start_fr:end_fr]) _cache[key] = np.vstack(r) return _cache[key]
def load_data_train(trainfile): print "Getting the training data" a = htk.open(trainfile) train_data = a.getall() print "Done with Loading the training data: ", train_data.shape data = filter_data_train(train_data) # x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model x_train = data[:, :-2] #Set to different column based on different model Y_train = data[:, -2] print Y_train.shape # print np.where(Y_train==2) Y_train = Y_train.reshape(Y_train.shape[0], 1) y_train = np_utils.to_categorical(Y_train, 2) print y_train[0:5, :] gender_train = data[:, -1] del data return x_train, y_train, gender_train
def concat_all(folder):
    l = []
    for d, ds, fs in os.walk(folder):
        for fname in fs:
            if fname[-4:] != '.mfc':
                continue
            fullfname = d + '/' + fname
            print fullfname
            t = htkmfc.open(fullfname)
            l.append(t.getall())
    stats = np.concatenate(l)
    mean = np.mean(stats, 0)
    stddev = np.std(stats, 0)
    for i, e in enumerate(l):
        l[i] = padding(NFRAMES_DBN, (e - mean) / stddev)
    a = np.concatenate(l)
    np.save(folder + '/' + 'x_all_mfcc.npy', a)
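# Added sketch (assumption). The 'padding' helper used above is not shown in these
# sources; a common reading is that it stacks each frame with its neighbours to
# build fixed-width context windows for the DBN input. The implementation below
# illustrates that idea and is not the original helper.
import numpy as np

def stack_context(n_frames, x):
    """Concatenate each frame with its neighbours (edge-padded), giving
    (nframes, n_frames * dim) rows."""
    half = n_frames // 2
    padded = np.pad(x, ((half, half), (0, 0)), mode='edge')
    return np.hstack([padded[i:i + x.shape[0]] for i in range(n_frames)])

x = np.random.randn(20, 39).astype(np.float32)
print(stack_context(5, x).shape)  # (20, 195)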
def extract_MFCC(audio_URI, output_URI):
    fe = FeatureExtractor('/usr/local/bin/HCopy', None)  # TODO: replace htk-mfcc extraction with essentia
    # call htk to extract features
    URImfcFile = fe._extractMFCCs(audio_URI)
    # read features from binary htk file
    logging.debug("reading MFCCs from {} ...".format(URImfcFile))
    HTKFeat_reader = htkmfc.open(URImfcFile, 'rb')
    mfccsFeatrues = HTKFeat_reader.getall()
    labels = numpy.zeros(len(mfccsFeatrues), dtype='float32')
    with open(output_URI, 'w') as f:
        pickle.dump((mfccsFeatrues, labels), f)
    return output_URI
def load_data_train(trainfile): print "Getting the training data" a=htk.open(trainfile) train_data=a.getall() print "Done with Loading the training data: ",train_data.shape data=filter_data_train(train_data) # x_train=cnn_reshaper(data[:,:-2]) #Set to different column based on different model x_train=data[:,:-2] #Set to different column based on different model scaler=StandardScaler().fit(x_train) # x_train=scaler.transform(x_train) Y_train=data[:,-2] print Y_train.shape # print np.where(Y_train==2) Y_train=Y_train.reshape(Y_train.shape[0],1) y_train=np_utils.to_categorical(Y_train,2) print y_train[0:5,:] gender_train=data[:,-1] del data #x_train has complete data, that is gammatone and also the pitch variance values. return x_train,y_train,gender_train,scaler
def normalize(folder):
    corpus = {}
    full = np.ndarray((0, 39))
    for d, ds, fs in os.walk(folder):
        for fname in fs:
            if fname[-11:] != '.mfc_unnorm':
                continue
            fullfname = d + '/' + fname
            t = htkmfc.open(fullfname)
            corpus[fullfname[:-11] + '_mfc.npy'] = copy.deepcopy(t.getall())
            full = np.append(full, t.getall(), axis=0)
    mean = np.mean(full)
    stddev = sss.tstd(full)
    if stddev == 0:
        print >> sys.stderr, "*** null stddev, no *.mfc_unnorm file ??? ***"
        sys.exit(-1)
    for key, val in corpus.iteritems():
        corpus[key] = (val - mean) / stddev
    # verification:
    ### full = np.ndarray((0, 39))
    ### for key, val in corpus.iteritems():
    ###     full = np.append(full, val, axis=0)
    ### print "verification of 0-mean 1-stddev"
    ### print "mean (big numeric errors, beware)"
    ### print np.mean(full)
    ### print "stddev"
    ### print sss.tvar(full)
    # /verification
    for key, val in corpus.iteritems():
        print "Dealt with:", key
        np.save(key, val)
def process(ofname, iscpfname, ihmmfname, ilmfname=None, iwdnetfname=None, unibifname=None, idbnfname=None, idbndictstuple=None):
    with open(ihmmfname) as ihmmf:
        n_states, transitions, gmms = parse_hmm(ihmmf)
    gmms_ = precompute_det_inv(gmms)
    map_states_to_phones = phones_mapping(gmms)
    likelihoods_computer = functools.partial(compute_likelihoods, gmms_)
    gmm_likelihoods_computer = functools.partial(compute_likelihoods, gmms_)  # TODO REMOVE
    dbn = None
    dbn_to_int_to_state_tuple = None
    if idbnfname != None:
        with open(idbnfname) as idbnf:
            dbn = cPickle.load(idbnf)
        with open(idbndictstuple) as idbndtf:
            dbn_to_int_to_state_tuple = cPickle.load(idbndtf)
        dbn_phones_to_states = dbn_to_int_to_state_tuple[0]
        likelihoods_computer = functools.partial(compute_likelihoods_dbn, dbn)
        # like that = for GRBM first layer (normalize=True, unit=False)
        # TODO correct the normalize/unit to work on full test dataset

    if iwdnetfname != None:
        with open(iwdnetfname) as iwdnf:
            transitions = parse_wdnet(transitions, iwdnf)  # parse wordnet
    elif ilmfname != None:
        with open(ilmfname) as ilmf:
            if MATRIX_BIGRAM:
                transitions = parse_lm_matrix(transitions, ilmf)  # parse bigram LM in matrix format in ilmf
            else:
                transitions = parse_lm(transitions, ilmf)  # parse bigram LM in ARPA-MIT format in ilmf
    elif unibifname != None:
        # our own unigram and bigram counts, c.f. src/produce_LM.py
        with open(unibifname) as ubf:
            transitions = initialize_transitions(transitions, ubf, unigrams_only=UNIGRAMS_ONLY)
    else:
        # uniform transitions between phones
        transitions = initialize_transitions(transitions)
    transitions = penalty_scale(transitions, insertion_penalty=INSERTION_PENALTY, scale_factor=SCALE_FACTOR)

    dummy = np.ndarray((2, 2))  # to force only 1 compile of Viterbi's C
    viterbi(dummy, [None, dummy], {})  # also for this compile's debug purposes

    if dbn != None:
        input_n_frames_mfcc = dbn.rbm_layers[0].n_visible / 39  # TODO generalize
        print "this is a DBN with", input_n_frames_mfcc, "MFCC frames"
        input_n_frames_arti = dbn.rbm_layers[1].n_visible / 59  # 60  # TODO generalize
        print "this is a DBN with", input_n_frames_arti, "articulatory frames"
        input_file_name = 'tmp_input_mocha.npy'
        map_input_file_name = 'tmp_map_file_to_start_end_mocha.pickle'
        try:  # TODO remove?
            print "loading concat MFCC from pickled file"
            with open(input_file_name) as concat:
                all_input = np.load(concat)
            with open(map_input_file_name) as map_input:
                map_file_to_start_end = cPickle.load(map_input)
        except:
            print "concatenating MFCC and articulatory files"  # TODO parallelize + use np.concatenate
            all_input = np.ndarray((0, dbn.rbm_layers[0].n_visible + dbn.rbm_layers[1].n_visible), dtype='float32')
            map_file_to_start_end = {}
            with open(iscpfname) as iscpf:
                for line in iscpf:
                    cline = clean(line)
                    start = all_input.shape[0]
                    # get the 1-framed signals
                    x_mfcc = htkmfc.open(cline).getall()
                    with open(cline[:-4] + '_ema.npy') as ema:
                        x_arti = np.load(ema)[:, 2:]
                    # compute deltas and delta-deltas for articulatory features
                    _, x_arti = from_mfcc_ema_to_mfcc_arti_tuple(x_mfcc, x_arti)
                    # add the adjacent frames
                    if input_n_frames_mfcc > 1:
                        x_mfcc = padding(input_n_frames_mfcc, x_mfcc)
                    if input_n_frames_arti > 1:
                        x_arti = padding(input_n_frames_arti, x_arti)
                    # do feature transformations if any
                    # TODO with mocha_timit_params.json params
                    # concatenate
                    x_mfcc_arti = np.concatenate((x_mfcc, x_arti), axis=1)
                    all_input = np.append(all_input, x_mfcc_arti, axis=0)
                    map_file_to_start_end[cline] = (start, all_input.shape[0])
            with open(input_file_name, 'w') as concat:
                np.save(concat, all_input)
            with open(map_input_file_name, 'w') as map_input:
                cPickle.dump(map_file_to_start_end, map_input)
    else:  # GMM
        all_mfcc = np.ndarray((0, 39), dtype='float32')  # TODO generalize

    print "computing likelihoods"
    if dbn != None:  # TODO clean
        tmp_likelihoods = likelihoods_computer(all_input)
        # mean_dbns = np.mean(tmp_likelihoods, 0)
        # tmp_likelihoods *= (mean_gmms / mean_dbns)
        print tmp_likelihoods
        print tmp_likelihoods.shape
        columns_remapping = [dbn_phones_to_states[map_states_to_phones[i]]
                             for i in xrange(tmp_likelihoods.shape[1])]
        print columns_remapping
        likelihoods = (tmp_likelihoods[:, columns_remapping], map_file_to_start_end)
        print likelihoods[0]
        print likelihoods[0].shape
    else:
        likelihoods = (likelihoods_computer(all_mfcc), map_file_to_start_end)

    print "computing viterbi paths"
    list_mlf_string = []
    with open(iscpfname) as iscpf:
        il = InnerLoop(likelihoods, map_states_to_phones, transitions,
                       using_bigram=(ilmfname != None or iwdnetfname != None or unibifname != None))
        p = Pool(cpu_count())
        list_mlf_string = p.map(il, iscpf)
    with open(ofname, 'w') as of:
        of.write('#!MLF!#\n')
        for line in list_mlf_string:
            of.write(line)
def extract_from_mlf(mlf, do_gammatones):
    x = np.ndarray((0, N_MFCC_COEFFS), dtype='float32')
    x_fbank = np.ndarray((0, N_FILTERBANK_COEFFS), dtype='float32')
    x_gamma = np.ndarray((0, N_GAMMATONES * 3), dtype='float32')
    y = []
    y_spkr = []
    with open(mlf) as f:
        tmp_len_x = 0  # verify sizes
        len_x = 0
        end = 0
        speaker_label = ''
        for line in f:
            line = line.rstrip('\n')
            if len(line) < 1:
                continue
            if line[0] == '"':
                assert tmp_len_x == 0, "the file above this one %s was mismatching x (%d frames) and y (%d frames) lengths by %d" % (line, len_x, end, tmp_len_x)
                speaker_label = line.split('/')[-2]
                # load HTK's MFCC
                t = htkmfc.open(line.strip('"')[:-3] + 'mfc')  # .lab -> .mfc
                x = np.append(x, t.getall(), axis=0)
                len_x = t.getall().shape[0]
                tmp_len_x = len_x
                if TALKBOX_FBANKS:  # do our own filterbanks TODO
                    fr, snd = wavfile.read(line.strip('"')[:-3] + 'wav')  # .lab -> .wav
                    assert fr == SAMPLING_RATE, "SAMPLING_RATE is not what is found in the wav file"
                    _, fbank, _ = tbmfcc(snd, nwin=HAMMING_SIZE / 1000. * SAMPLING_RATE, nfft=2048, fs=SAMPLING_RATE, nceps=13)
                    x_fbank = np.append(x_fbank, fbank, axis=0)
                    assert t.getall().shape[0] == fbank.shape[0], "MFCC and filterbank not of the same length (not on the same sampling rate)"
                else:
                    fbank = None
                    with open(line.strip('"')[:-4] + '_fbanks.npy') as fbanksf:
                        fbank = np.load(fbanksf)
                    if fbank != None:
                        # it seems filterbanks obtained with spectral are a little longer at the end
                        if DEBUG:
                            print "cutting the last", fbank.shape[0] - t.getall().shape[0], "frames from the filterbank"
                        fbank = fbank[:t.getall().shape[0]]
                        x_fbank = np.append(x_fbank, fbank, axis=0)
                    assert t.getall().shape[0] == fbank.shape[0], "MFCC and filterbank not of the same length (not on the same sampling rate)"
                if do_gammatones:
                    # load the wav sound (with Brian)
                    sound = loadsound(line.strip('"')[:-3] + 'wav')  # .lab -> .wav
                    # compute the gammatones (see Brian's doc)
                    bw = 10 ** (0.037 + 0.785 * log10(center_frequencies))
                    gammatone = ApproximateGammatone(sound, center_frequencies, bw, order=3)
                    g = gammatone.process()
                    # subsample the gammatones at the same rate as the MFCCs
                    # (just for practicality so that they are aligned...)
                    n_samples = g.shape[0] * 1. / (t.getall().shape[0] + 1)  # TODO check "+1"
                    ### # do the harmonic mean (nth root of the product of the terms)
                    ### g_sub = subsample_apply_f(g, n_samples, lambda z: np.power(np.prod(z), 1./n_samples))
                    g_sub = subsample_apply_f(g, n_samples, lambda z: np.sqrt(np.sum(np.square(z))))
                    # compute the delta and delta-delta of the subsampled gammatones
                    gamma_speed_accel = compute_speed_and_accel(g_sub)
                    # append
                    tmp = gamma_speed_accel[:t.getall().shape[0]]  # TODO check
                    if tmp.shape[0] != t.getall().shape[0]:  # TODO remove
                        print line
                        print tmp.shape
                        print t.getall().shape
                        print n_samples
                        print g.shape
                        print "exiting because of the mismatch"
                        sys.exit(-1)
                    x_gamma = np.append(x_gamma, tmp, axis=0)
            elif line[0].isdigit():
                start, end, state = line.split()[:3]
                start = (int(start) + 9999) / (MFCC_TIMESTEP * 10000)  # htk
                end = (int(end) + 9999) / (MFCC_TIMESTEP * 10000)  # htk
                for i in xrange(start, end):
                    tmp_len_x -= 1
                    y.append(state)
                    y_spkr.append(speaker_label)
    assert (len(y) == x.shape[0])
    assert (len(y_spkr) == x.shape[0])
    rootname = mlf[:-4]
    np.save(rootname + '_xdata.npy', x)
    np.save(rootname + '_xfbank.npy', x_fbank)
    if do_gammatones:
        np.save(rootname + '_xgamma.npy', x_gamma)
    yy = np.array(y)
    yy_spkr = np.array(y_spkr)
    np.save(rootname + '_ylabels.npy', yy)
    np.save(rootname + '_yspeakers.npy', yy_spkr)
    print "length x:", len(x), "length y:", len(y), "length y_spkr:", len(y_spkr)
    print "shape x:", x.shape, "shape yy:", yy.shape, "shape yy_spkr:", yy_spkr.shape
    if TEST:
        tx = np.load(rootname + '_xdata.npy')
        tx_fbank = np.load(rootname + '_xfbank.npy')
        if do_gammatones:
            tx_gamma = np.load(rootname + '_xgamma.npy')
        ty = np.load(rootname + '_ylabels.npy')
        ty_spkr = np.load(rootname + '_yspeakers.npy')
        if np.all(tx == x) and np.all(ty == yy) and np.all(ty_spkr == yy_spkr):
            assert_allclose(tx_fbank, x_fbank, err_msg="x_fbank and its serialized version are not allclose")
            if do_gammatones:
                assert_allclose(tx_gamma, x_gamma, err_msg="x_gamma and its serialized version are not allclose")
            print "SUCCESS: serialized and current in-memory arrays are equal"
            sys.exit(0)
        else:
            print "ERROR: serialized and current X (MFCC) or Y in-memory arrays differ!"
            print "x (MFCC):", np.all(tx == x)
            print "y (labels):", np.all(ty == yy)
            print "y (speakers):", np.all(ty_spkr == yy_spkr)
            sys.exit(-1)
all_mfcc = np.ndarray((0, dbn.rbm_layers[0].n_visible), dtype='float32')
map_file_to_start_end = {}
mfcc_file_name = 'tmp_allen_mfcc_' + str(int(input_n_frames)) + '.npy'
map_mfcc_file_name = 'tmp_allen_map_file_to_start_end_' + str(int(input_n_frames)) + '.pickle'
try:
    print("loading concat MFCC from pickled file")
    with open(mfcc_file_name) as concat_mfcc:
        all_mfcc = np.load(concat_mfcc)
    with open(map_mfcc_file_name) as map_mfcc:
        map_file_to_start_end = pickle.load(map_mfcc)
except:
    for ind, mfcc_file in enumerate(list_of_mfcc_files):
        start = all_mfcc.shape[0]
        x = htkmfc.open(mfcc_file).getall()
        if input_n_frames > 1:
            x = padding(input_n_frames, x)
        all_mfcc = np.append(all_mfcc, x, axis=0)
        map_file_to_start_end[mfcc_file] = (start, all_mfcc.shape[0])
        print("did", mfcc_file, "ind", ind)
    with open(mfcc_file_name, 'w') as concat_mfcc:
        np.save(concat_mfcc, all_mfcc)
    with open(map_mfcc_file_name, 'w') as map_mfcc:
        pickle.dump(map_file_to_start_end, map_mfcc)

tmp_likelihoods = likelihoods_computer(all_mfcc)
depth_1_likelihoods = depth_1_computer(all_mfcc)
depth_2_likelihoods = depth_2_computer(all_mfcc)
# depth_3_likelihoods = depth_1_computer(all_mfcc)  TODO
print(map_states_to_phones)
def load_htkfile_full(input_file):
    feat_reader = htk.open(input_file)  # extracting features from the htk file
    feat1 = feat_reader.getall()
    feat = np.reshape(feat1, (1, -1, 60))
    return feat
print "this is a DBN with", input_n_frames, "frames on the input layer" print "concatenating MFCC files" all_mfcc = np.ndarray((0, dbn.rbm_layers[0].n_visible), dtype='float32') map_file_to_start_end = {} mfcc_file_name = 'tmp_allen_mfcc_' + str(int(input_n_frames)) + '.npy' map_mfcc_file_name = 'tmp_allen_map_file_to_start_end_' + str(int(input_n_frames)) + '.pickle' try: print "loading concat MFCC from pickled file" with open(mfcc_file_name) as concat_mfcc: all_mfcc = np.load(concat_mfcc) with open(map_mfcc_file_name) as map_mfcc: map_file_to_start_end = cPickle.load(map_mfcc) except: for ind, mfcc_file in enumerate(list_of_mfcc_files): start = all_mfcc.shape[0] x = htkmfc.open(mfcc_file).getall() if input_n_frames > 1: x = padding(input_n_frames, x) all_mfcc = np.append(all_mfcc, x, axis=0) map_file_to_start_end[mfcc_file] = (start, all_mfcc.shape[0]) print "did", mfcc_file, "ind", ind with open(mfcc_file_name, 'w') as concat_mfcc: np.save(concat_mfcc, all_mfcc) with open(map_mfcc_file_name, 'w') as map_mfcc: cPickle.dump(map_file_to_start_end, map_mfcc) tmp_likelihoods = likelihoods_computer(all_mfcc) columns_remapping = [dbn_phones_to_states[map_states_to_phones[i]] for i in xrange(tmp_likelihoods.shape[1])] likelihoods = (tmp_likelihoods[:, columns_remapping], map_file_to_start_end) else:
def process(ofname, iscpfname, ihmmfname, ilmfname=None, iwdnetfname=None, unibifname=None, idbnfname=None, idbndictstuple=None):
    with open(ihmmfname) as ihmmf:
        n_states, transitions, gmms = parse_hmm(ihmmf)
    gmms_ = precompute_det_inv(gmms)
    map_states_to_phones = phones_mapping(gmms)
    likelihoods_computer = functools.partial(compute_likelihoods, gmms_)
    gmm_likelihoods_computer = functools.partial(compute_likelihoods, gmms_)  # TODO REMOVE
    dbn = None
    dbn_to_int_to_state_tuple = None
    if idbnfname != None:
        with open(idbnfname) as idbnf:
            dbn = cPickle.load(idbnf)
        with open(idbndictstuple) as idbndtf:
            dbn_to_int_to_state_tuple = cPickle.load(idbndtf)
        dbn_phones_to_states = dbn_to_int_to_state_tuple[0]
        likelihoods_computer = functools.partial(compute_likelihoods_dbn, dbn)
        # like that = for GRBM first layer (normalize=True, unit=False)
        # TODO correct the normalize/unit to work on full test dataset

    if iwdnetfname != None:
        with open(iwdnetfname) as iwdnf:
            transitions = parse_wdnet(transitions, iwdnf)  # parse wordnet
    elif ilmfname != None:
        with open(ilmfname) as ilmf:
            if MATRIX_BIGRAM:
                transitions = parse_lm_matrix(transitions, ilmf)  # parse bigram LM in matrix format in ilmf
            else:
                transitions = parse_lm(transitions, ilmf)  # parse bigram LM in ARPA-MIT format in ilmf
    elif unibifname != None:
        # our own unigram and bigram counts, c.f. src/produce_LM.py
        with open(unibifname) as ubf:
            transitions = initialize_transitions(transitions, ubf, unigrams_only=UNIGRAMS_ONLY)
    else:
        # uniform transitions between phones
        transitions = initialize_transitions(transitions)
    transitions = penalty_scale(transitions, insertion_penalty=INSERTION_PENALTY, scale_factor=SCALE_FACTOR)

    dummy = np.ndarray((2, 2))  # to force only 1 compile of Viterbi's C
    viterbi(dummy, [None, dummy], {})  # also for this compile's debug purposes

    if dbn != None:
        input_n_frames = dbn.rbm_layers[0].n_visible / 39  # TODO generalize
        print "this is a DBN with", input_n_frames, "frames on the input layer"
        mfcc_file_name = 'tmp_mfcc_' + str(int(input_n_frames)) + '.npy'
        map_mfcc_file_name = 'tmp_map_file_to_start_end_' + str(int(input_n_frames)) + '.pickle'
        try:  # TODO remove?
            print "loading concat MFCC from pickled file", mfcc_file_name
            with open(mfcc_file_name) as concat_mfcc:
                all_mfcc = np.load(concat_mfcc)
            with open(map_mfcc_file_name) as map_mfcc:
                map_file_to_start_end = cPickle.load(map_mfcc)
        except:
            print "concatenating MFCC files"  # TODO parallelize + use np.concatenate
            all_mfcc = np.ndarray((0, dbn.rbm_layers[0].n_visible), dtype='float32')
            map_file_to_start_end = {}
            with open(iscpfname) as iscpf:
                for line in iscpf:
                    cline = clean(line)
                    start = all_mfcc.shape[0]
                    x = htkmfc.open(cline).getall()
                    if input_n_frames > 1:
                        x = padding(input_n_frames, x)
                    print all_mfcc.shape
                    print x.shape
                    all_mfcc = np.append(all_mfcc, x, axis=0)
                    map_file_to_start_end[cline] = (start, all_mfcc.shape[0])
            with open(mfcc_file_name, 'w') as concat_mfcc:
                np.save(concat_mfcc, all_mfcc)
            with open(map_mfcc_file_name, 'w') as map_mfcc:
                cPickle.dump(map_file_to_start_end, map_mfcc)
    else:  # GMM
        all_mfcc = np.ndarray((0, 39), dtype='float32')  # TODO generalize

    print "computing likelihoods"
    if dbn != None:  # TODO clean
        # TODO REMOVE
        # gmm_likelihoods = gmm_likelihoods_computer(all_mfcc[:, xrange(195, 234)])
        # mean_gmms = np.mean(gmm_likelihoods, 0)
        # print gmm_likelihoods
        # print gmm_likelihoods.shape
        tmp_likelihoods = likelihoods_computer(all_mfcc)
        # mean_dbns = np.mean(tmp_likelihoods, 0)
        # tmp_likelihoods *= (mean_gmms / mean_dbns)
        if VERBOSE:
            print tmp_likelihoods
            print tmp_likelihoods.shape
            print map_states_to_phones
            print dbn_phones_to_states
        assert set(map_states_to_phones.values()) == set(dbn_phones_to_states.keys()), \
            "Phones differ between the HMM and the DBN"
        columns_remapping = [dbn_phones_to_states[map_states_to_phones[i]]
                             for i in xrange(tmp_likelihoods.shape[1])]
        if VERBOSE:
            print columns_remapping
        likelihoods = (tmp_likelihoods[:, columns_remapping], map_file_to_start_end)
        # if VERBOSE:
        #     print map_file_to_start_end
        #     print len(map_file_to_start_end)
        #     print likelihoods[0]
        #     print likelihoods[0].shape
    else:
        likelihoods = (likelihoods_computer(all_mfcc), map_file_to_start_end)

    print "computing viterbi paths"
    list_mlf_string = []
    with open(iscpfname) as iscpf:
        il = InnerLoop(likelihoods, map_states_to_phones, transitions,
                       using_bigram=(ilmfname != None or iwdnetfname != None or unibifname != None))
        # p = Pool(1)
        p = Pool(cpu_count())
        list_mlf_string = p.map(il, iscpf)
    with open(ofname, 'w') as of:
        of.write('#!MLF!#\n')
        for line in list_mlf_string:
            of.write(line)
speakers = sorted(set([bname2speaker(bname) for bname in bnames]))
bnames_per_speaker = {
    speaker: [bname for bname in bnames if bname2speaker(bname) == speaker]
    for speaker in speakers
}

if __name__ == '__main__':
    ideal_warpfreq_file = 'ideal_warpfreq_5.txt'
    ideal_warps = pd.read_csv(ideal_warpfreq_file)
    # outrawdir = path.join(datadir, 'raw')
    outwarpeddir = path.join(english_vtln_dir, 'vtln2')
    try:
        os.makedirs(outwarpeddir)
    except OSError:
        pass
    for ix, (_, filename, warpfreq) in ideal_warps.iterrows():
        # print '{} ({}/{})'.format(speaker, ix+1, len(ideal_warps))
        # for filename in bnames_per_speaker[speaker]:
        print '{} ({}/{})'.format(filename, ix + 1, len(ideal_warps))
        infile_warp = path.join(english_vtln_dir,
                                'warp_freq_{:.2f}'.format(warpfreq),
                                filename + '.mfc')
        mfc_warp = htkmfc.open(infile_warp).getall()
        np.save(path.join(outwarpeddir, filename + '.npy'), mfc_warp)