def merge_label(self, binary_label_file_list, new_feat_file_list, out_feat_file_list):
    """Append an utterance-level feature to every frame of its label file.

    For each index ``i`` the label matrix is loaded from
    ``binary_label_file_list[i]`` (``self.lab_dim`` columns), the extra
    feature from ``new_feat_file_list[i]`` (``self.feat_dim`` columns), and
    the column-wise concatenation is written to ``out_feat_file_list[i]``.

    Prints a message and calls ``sys.exit(1)`` when the label list and the
    new-feature list differ in length.
    """
    if len(new_feat_file_list) != len(binary_label_file_list):
        print(
            "the number of new feature input files and label files should be the same!\n"
        )
        sys.exit(1)

    binary_io = BinaryIOCollection()

    for idx, (label_path, feat_path) in enumerate(
            zip(binary_label_file_list, new_feat_file_list)):
        out_path = out_feat_file_list[idx]

        label_matrix, n_frames = binary_io.load_binary_file_frame(
            label_path, self.lab_dim)

        # the stored feature is expected to have shape (1, dim) ...
        utt_feature = binary_io.load_binary_file(feat_path, self.feat_dim)
        # ... and is repeated once per label frame to give (T, dim)
        tiled_feature = numpy.tile(utt_feature, (n_frames, 1))

        total_dim = self.lab_dim + self.feat_dim
        merged = numpy.zeros((n_frames, total_dim))
        merged[:n_frames, :self.lab_dim] = label_matrix
        merged[:n_frames, self.lab_dim:total_dim] = tiled_feature[:n_frames, ]

        binary_io.array_to_binary_file(merged, out_path)
def compute_global_variance(self, file_list, feat_dim, save_dir):
    """Compute global-variance statistics over a set of feature files.

    The per-dimension variance of every file in ``file_list`` is computed
    (``numpy.var`` along axis 0); the mean and the variance of those per-file
    variances are then written with ``ndarray.tofile`` to ``gv_mean`` and
    ``gv_var`` inside ``save_dir``. The intermediate shape and both result
    vectors are also printed.

    Args:
        file_list: paths of the binary feature files.
        feat_dim: number of columns passed to ``load_binary_file``.
        save_dir: directory receiving ``gv_mean`` and ``gv_var``.
    """
    logger = logging.getLogger("compute gv")
    # the original format string had a %d placeholder but no argument
    logger.info('computed global variance of length %d', feat_dim)

    per_file_variance = numpy.zeros((len(file_list), feat_dim))
    io_funcs = BinaryIOCollection()
    for file_index, file_name in enumerate(file_list):
        features = io_funcs.load_binary_file(file_name, feat_dim)
        per_file_variance[file_index, :] = numpy.var(features, axis=0)

    # single-argument print(...) produces the same output on Python 2 and 3
    print(per_file_variance.shape)

    # mean and variance of the per-file variance vectors
    global_mean = numpy.mean(per_file_variance, axis=0)
    global_var = numpy.var(per_file_variance, axis=0)

    # context managers close the files even if tofile() raises
    with open(os.path.join(save_dir, 'gv_mean'), 'wb') as fid:
        global_mean.tofile(fid)
    with open(os.path.join(save_dir, 'gv_var'), 'wb') as fid:
        global_var.tofile(fid)

    print(global_mean)
    print(global_var)
def extract_durational_features(self, dur_file_name=None, dur_data=None):
    """Build a frame-level duration feature matrix from phone durations.

    Durations come from ``dur_file_name`` when it is given (loaded with one
    column), otherwise from ``dur_data``. The result has one row per frame
    and ``self.frame_feature_size`` columns. Rows are only filled when
    ``self.subphone_feats == "coarse_coding"``: columns 0-2 take the values
    returned by ``self.extract_coarse_coding_features_relative`` and column 3
    the phone length in frames. Otherwise the matrix stays all zeros.
    """
    if dur_file_name:
        dur_dim = 1  ## hard coded for now
        dur_data = BinaryIOCollection().load_binary_file(dur_file_name, dur_dim)

    n_phones = len(dur_data)
    n_frames_total = int(sum(dur_data))
    feature_rows = numpy.zeros((n_frames_total, self.frame_feature_size))

    row = 0
    for phone_idx in range(n_phones):
        phone_len = int(dur_data[phone_idx])
        if self.subphone_feats != "coarse_coding":
            continue

        cc_values = self.extract_coarse_coding_features_relative(phone_len)
        for offset in range(phone_len):
            for col in range(3):
                feature_rows[row, col] = cc_values[offset, col]
            feature_rows[row, 3] = float(phone_len)
            row += 1

    return feature_rows
def compute_mean(self, file_list):
    """Return the per-dimension mean over all frames of all given files.

    Every file is loaded with ``self.feature_dimension`` columns; the column
    sums are accumulated and divided by the total number of frames. The
    result has shape ``(1, self.feature_dimension)`` and is also logged on
    the "acoustic_norm" logger.
    """
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    mean_vector = numpy.zeros((1, dim))
    total_frames = 0

    reader = BinaryIOCollection()
    for path in file_list:
        frames = reader.load_binary_file(path, dim)
        mean_vector += numpy.sum(frames, axis=0).reshape((1, dim))
        total_frames += frames.size // dim

    mean_vector /= float(total_frames)

    logger.info('computed mean vector of length %d :' % mean_vector.shape[1])
    logger.info(' mean: %s' % mean_vector)

    return mean_vector
def find_min_max_values(self, in_file_list, start_index, end_index):
    """Store the global min and max of a column range across all files.

    Columns ``start_index`` (inclusive) to ``end_index`` (exclusive) of each
    file are scanned. The results are stored as ``self.min_vector`` and
    ``self.max_vector``, each of shape ``(1, end_index - start_index)``, and
    logged through ``self.logger``.
    """
    width = end_index - start_index
    n_files = len(in_file_list)

    per_file_min = numpy.zeros((n_files, width))
    per_file_max = numpy.zeros((n_files, width))

    reader = BinaryIOCollection()
    for row, path in enumerate(in_file_list):
        frames = reader.load_binary_file(path, self.feature_dimension)
        window = frames[:, start_index:end_index]
        per_file_min[row, ] = numpy.amin(window, axis=0)
        per_file_max[row, ] = numpy.amax(window, axis=0)

    # reduce over files, then store as row vectors
    self.min_vector = numpy.amin(per_file_min, axis=0).reshape((1, width))
    self.max_vector = numpy.amax(per_file_max, axis=0).reshape((1, width))

    self.logger.info('found min/max values of length %d:' % width)
    self.logger.info(' min: %s' % self.min_vector)
    self.logger.info(' max: %s' % self.max_vector)
def find_min_max_values(self, in_file_list):
    """Store the global per-dimension min and max across all files.

    Each file is loaded with ``self.feature_dimension`` columns. The results
    are stored as ``self.min_vector`` and ``self.max_vector``, each of shape
    ``(1, self.feature_dimension)``, and logged on the "acoustic_norm"
    logger.
    """
    logger = logging.getLogger("acoustic_norm")

    n_files = len(in_file_list)
    dim = self.feature_dimension
    per_file_min = numpy.zeros((n_files, dim))
    per_file_max = numpy.zeros((n_files, dim))

    reader = BinaryIOCollection()
    for row, path in enumerate(in_file_list):
        frames = reader.load_binary_file(path, dim)
        per_file_min[row, ] = numpy.amin(frames, axis=0)
        per_file_max[row, ] = numpy.amax(frames, axis=0)

    # reduce over files, then store as row vectors
    self.min_vector = numpy.amin(per_file_min, axis=0).reshape((1, dim))
    self.max_vector = numpy.amax(per_file_max, axis=0).reshape((1, dim))

    logger.info('across %d files found min/max values of length %d:' % (n_files, dim))
    logger.info(' min: %s' % self.min_vector)
    logger.info(' max: %s' % self.max_vector)
def compute_std(self, file_list, mean_vector):
    """Return the per-dimension standard deviation over all frames.

    Squared deviations from ``mean_vector`` are summed over every frame of
    every file, divided by the total frame count, and square-rooted. The
    result has shape ``(1, self.feature_dimension)`` and is also logged on
    the "acoustic_norm" logger.
    """
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    squared_dev_sum = numpy.zeros((1, dim))
    total_frames = 0

    reader = BinaryIOCollection()
    for path in file_list:
        frames = reader.load_binary_file(path, dim)
        n_frames = frames.size // dim

        centred = frames - numpy.tile(mean_vector, (n_frames, 1))
        squared_dev_sum += numpy.reshape(numpy.sum(centred ** 2, axis=0), (1, dim))
        total_frames += n_frames

    squared_dev_sum /= float(total_frames)
    std_vector = squared_dev_sum ** 0.5

    logger.info('computed std vector of length %d' % std_vector.shape[1])
    logger.info(' std: %s' % std_vector)

    return std_vector
def find_min_max_values(self, in_file_list):
    """Scan all files and remember the per-dimension extremes.

    Sets ``self.min_vector`` and ``self.max_vector`` (both shaped
    ``(1, self.feature_dimension)``) to the smallest and largest value seen
    in each column across ``in_file_list``, and logs them on the
    "acoustic_norm" logger.
    """
    logger = logging.getLogger("acoustic_norm")

    n_files = len(in_file_list)
    dim = self.feature_dimension
    minima = numpy.zeros((n_files, dim))
    maxima = numpy.zeros((n_files, dim))

    binary_io = BinaryIOCollection()
    for file_idx, file_path in enumerate(in_file_list):
        data = binary_io.load_binary_file(file_path, dim)
        minima[file_idx, ] = numpy.amin(data, axis=0)
        maxima[file_idx, ] = numpy.amax(data, axis=0)

    overall_min = numpy.amin(minima, axis=0)
    overall_max = numpy.amax(maxima, axis=0)
    self.min_vector = numpy.reshape(overall_min, (1, dim))
    self.max_vector = numpy.reshape(overall_max, (1, dim))

    logger.info('across %d files found min/max values of length %d:' % (n_files, dim))
    logger.info(' min: %s' % self.min_vector)
    logger.info(' max: %s' % self.max_vector)
def denormalise_data(self, in_file_list, out_file_list):
    """Map min-max normalised files back to the original feature range.

    Every input file is rescaled from the target range
    (``self.target_min_value`` .. ``self.target_max_value``) to the range
    given by ``self.min_vector`` / ``self.max_vector`` and written to the
    output path at the same index.
    """
    logger = logging.getLogger("acoustic_norm")

    n_files = len(in_file_list)
    logger.info('MinMaxNormalisation.denormalise_data for %d files' % n_files)

    dim = self.feature_dimension
    feature_span = numpy.reshape(self.max_vector - self.min_vector, (1, dim))

    target_span = numpy.zeros((1, dim))
    target_span.fill(self.target_max_value - self.target_min_value)

    # columns with no spread get a unit span on both sides, so the scale
    # factor is 1 and no division by zero can occur
    no_spread = feature_span <= 0.0
    target_span[no_spread] = 1.0
    feature_span[no_spread] = 1.0

    binary_io = BinaryIOCollection()
    for idx in range(n_files):
        normalised = binary_io.load_binary_file(in_file_list[idx], dim)
        n_frames = normalised.size // dim
        per_frame = (n_frames, 1)

        min_rows = numpy.tile(self.min_vector, per_frame)
        target_min_rows = numpy.tile(self.target_min_value, (n_frames, dim))
        scale = numpy.tile(feature_span, per_frame) / numpy.tile(target_span, per_frame)

        restored = scale * (normalised - target_min_rows) + min_rows
        binary_io.array_to_binary_file(restored, out_file_list[idx])
def compute_std(self, file_list, mean_vector):
    """Compute the per-dimension standard deviation around ``mean_vector``.

    Accumulates squared deviations over every frame in ``file_list``,
    normalises by the total frame count and takes the square root. Returns
    (and logs on "acoustic_norm") an array of shape
    ``(1, self.feature_dimension)``.
    """
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    accumulator = numpy.zeros((1, dim))
    frames_seen = 0

    binary_io = BinaryIOCollection()
    for file_path in file_list:
        data = binary_io.load_binary_file(file_path, dim)
        rows = data.size // dim

        deviation = data - numpy.tile(mean_vector, (rows, 1))
        column_sq_sum = numpy.sum(deviation ** 2, axis=0)
        accumulator += numpy.reshape(column_sq_sum, (1, dim))
        frames_seen += rows

    accumulator /= float(frames_seen)
    std_vector = accumulator ** 0.5

    logger.info('computed std vector of length %d' % std_vector.shape[1])
    logger.info(' std: %s' % std_vector)

    return std_vector
def normalise_data(self, in_file_list, out_file_list):
    """Min-max normalise every input file into the target range.

    Features are mapped from the range given by ``self.min_vector`` /
    ``self.max_vector`` to ``self.target_min_value`` ..
    ``self.target_max_value`` and written to the output path at the same
    index. Columns listed in ``self.exclude_columns`` are copied through
    unnormalised.
    """
    n_files = len(in_file_list)
    dim = self.feature_dimension

    feature_span = numpy.reshape(self.max_vector - self.min_vector, (1, dim))
    target_span = numpy.zeros((1, dim))
    target_span.fill(self.target_max_value - self.target_min_value)

    # columns with no spread get a unit span on both sides (scale factor 1)
    no_spread = feature_span <= 0.0
    target_span[no_spread] = 1.0
    feature_span[no_spread] = 1.0

    binary_io = BinaryIOCollection()
    for idx in range(n_files):
        features = binary_io.load_binary_file(in_file_list[idx], dim)
        n_frames = features.size // dim
        per_frame = (n_frames, 1)

        min_rows = numpy.tile(self.min_vector, per_frame)
        target_min_rows = numpy.tile(self.target_min_value, (n_frames, dim))
        span_rows = numpy.tile(feature_span, per_frame)
        scale = numpy.tile(target_span, per_frame) / span_rows

        norm_features = scale * (features - min_rows) + target_min_rows

        ## Columns that must stay unnormalised are restored from the input
        ## with advanced indexing over every row:
        n_rows, _ = numpy.shape(features)
        row_index = list(range(n_rows))
        for col in self.exclude_columns:
            col_index = [col] * n_rows
            norm_features[row_index, col_index] = features[row_index, col_index]

        binary_io.array_to_binary_file(norm_features, out_file_list[idx])
def find_min_max_values(self, in_file_list, start_index, end_index):
    """Store the global min and max of a column range across all files.

    Columns ``start_index`` (inclusive) to ``end_index`` (exclusive) of each
    file in ``in_file_list`` are scanned. The results are stored as
    ``self.min_vector`` and ``self.max_vector``, each of shape
    ``(1, end_index - start_index)``, and logged through ``self.logger``.
    """
    local_feature_dimension = end_index - start_index
    file_number = len(in_file_list)

    min_value_matrix = numpy.zeros((file_number, local_feature_dimension))
    max_value_matrix = numpy.zeros((file_number, local_feature_dimension))

    io_funcs = BinaryIOCollection()
    # range() instead of the Python 2-only xrange() so this also runs on
    # Python 3
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)
        selected = features[:, start_index:end_index]
        min_value_matrix[i, ] = numpy.amin(selected, axis=0)
        max_value_matrix[i, ] = numpy.amax(selected, axis=0)

    # reduce over files, then store as row vectors
    self.min_vector = numpy.reshape(numpy.amin(min_value_matrix, axis=0),
                                    (1, local_feature_dimension))
    self.max_vector = numpy.reshape(numpy.amax(max_value_matrix, axis=0),
                                    (1, local_feature_dimension))

    self.logger.info('found min/max values of length %d:' % local_feature_dimension)
    self.logger.info(' min: %s' % self.min_vector)
    self.logger.info(' max: %s' % self.max_vector)
def extract_durational_features(self, dur_file_name=None, dur_data=None):
    """Build a frame-level duration feature matrix from phone durations.

    Durations come from ``dur_file_name`` when it is given (loaded with one
    column), otherwise from ``dur_data``. The result has one row per frame
    and ``self.frame_feature_size`` columns, filled according to
    ``self.subphone_feats``:

    * ``"coarse_coding"``: one duration per phone; columns 0-2 come from
      ``self.extract_coarse_coding_features_relative`` and column 3 is the
      phone length in frames.
    * ``'full'``: one row of five state durations per phone; columns 0-8
      hold state/phone position features (see inline comments).
    * anything else: the matrix is returned as all zeros.
    """
    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        # NOTE(review): with dur_dim fixed at 1 a loaded file presumably holds
        # a single duration per phone, which cannot feed the five-state
        # 'full' mode -- confirm how 'full' durations are meant to be loaded.
        dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    # asarray + numpy.sum accept both flat per-phone durations and 2-D
    # per-state durations; the built-in sum()/int() pair failed on the latter
    dur_data = numpy.asarray(dur_data)
    ph_count = len(dur_data)
    total_num_of_frames = int(numpy.sum(dur_data))

    duration_feature_array = numpy.zeros((total_num_of_frames, self.frame_feature_size))

    frame_index = 0
    for i in range(ph_count):
        if self.subphone_feats == "coarse_coding":
            # converted only here: in 'full' mode dur_data[i] is a whole row
            frame_number = int(dur_data[i])
            cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

            for j in range(frame_number):
                duration_feature_array[frame_index, 0] = cc_feat_matrix[j, 0]
                duration_feature_array[frame_index, 1] = cc_feat_matrix[j, 1]
                duration_feature_array[frame_index, 2] = cc_feat_matrix[j, 2]
                duration_feature_array[frame_index, 3] = float(frame_number)
                frame_index += 1

        elif self.subphone_feats == 'full':
            state_number = 5  # hard coded here

            phone_duration = float(numpy.sum(dur_data[i, :]))
            state_duration_base = 0
            for state_index in range(1, state_number + 1):
                state_index_backward = (state_number - state_index) + 1

                frame_number = int(dur_data[i][state_index - 1])
                for j in range(frame_number):
                    row = duration_feature_array[frame_index]
                    row[0] = float(j + 1) / float(frame_number)  ## fraction through state (forwards)
                    row[1] = float(frame_number - j) / float(frame_number)  ## fraction through state (backwards)
                    row[2] = float(frame_number)  ## length of state in frames
                    row[3] = float(state_index)  ## state index (counting forwards)
                    row[4] = float(state_index_backward)  ## state index (counting backwards)

                    row[5] = float(phone_duration)  ## length of phone in frames
                    row[6] = float(frame_number) / float(phone_duration)  ## fraction of the phone made up by current state
                    # decreases as frames advance, i.e. the share of the phone
                    # still remaining (backwards)
                    row[7] = float(phone_duration - j - state_duration_base) / float(phone_duration)
                    # increases as frames advance, i.e. the share of the phone
                    # already covered (forwards)
                    row[8] = float(state_duration_base + j + 1) / float(phone_duration)
                    frame_index += 1

                state_duration_base += frame_number

    return duration_feature_array
def extract_durational_features(self, dur_file_name=None, dur_data=None):
    """Build a frame-level duration feature matrix from phone durations.

    Durations come from ``dur_file_name`` when it is given (loaded with one
    column), otherwise from ``dur_data``. The result has one row per frame
    and ``self.frame_feature_size`` columns, filled according to
    ``self.subphone_feats``:

    * ``"coarse_coding"``: one duration per phone; columns 0-2 come from
      ``self.extract_coarse_coding_features_relative`` and column 3 is the
      phone length in frames.
    * ``'full'``: one row of five state durations per phone; columns 0-8
      hold state/phone position features (see inline comments).
    * anything else: the matrix is returned as all zeros.
    """
    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        # NOTE(review): with dur_dim fixed at 1 a loaded file presumably holds
        # a single duration per phone, which cannot feed the five-state
        # 'full' mode -- confirm how 'full' durations are meant to be loaded.
        dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    # asarray + numpy.sum accept both flat per-phone durations and 2-D
    # per-state durations; the built-in sum()/int() pair failed on the latter
    dur_data = numpy.asarray(dur_data)
    ph_count = len(dur_data)
    total_num_of_frames = int(numpy.sum(dur_data))

    duration_feature_array = numpy.zeros((total_num_of_frames, self.frame_feature_size))

    frame_index = 0
    for i in range(ph_count):
        if self.subphone_feats == "coarse_coding":
            # converted only here: in 'full' mode dur_data[i] is a whole row
            frame_number = int(dur_data[i])
            cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

            for j in range(frame_number):
                duration_feature_array[frame_index, 0] = cc_feat_matrix[j, 0]
                duration_feature_array[frame_index, 1] = cc_feat_matrix[j, 1]
                duration_feature_array[frame_index, 2] = cc_feat_matrix[j, 2]
                duration_feature_array[frame_index, 3] = float(frame_number)
                frame_index += 1

        elif self.subphone_feats == 'full':
            state_number = 5  # hard coded here

            phone_duration = float(numpy.sum(dur_data[i, :]))
            state_duration_base = 0
            for state_index in range(1, state_number + 1):
                state_index_backward = (state_number - state_index) + 1

                frame_number = int(dur_data[i][state_index - 1])
                for j in range(frame_number):
                    row = duration_feature_array[frame_index]
                    row[0] = float(j + 1) / float(frame_number)  ## fraction through state (forwards)
                    row[1] = float(frame_number - j) / float(frame_number)  ## fraction through state (backwards)
                    row[2] = float(frame_number)  ## length of state in frames
                    row[3] = float(state_index)  ## state index (counting forwards)
                    row[4] = float(state_index_backward)  ## state index (counting backwards)

                    row[5] = float(phone_duration)  ## length of phone in frames
                    row[6] = float(frame_number) / float(phone_duration)  ## fraction of the phone made up by current state
                    # decreases as frames advance, i.e. the share of the phone
                    # still remaining (backwards)
                    row[7] = float(phone_duration - j - state_duration_base) / float(phone_duration)
                    # increases as frames advance, i.e. the share of the phone
                    # already covered (forwards)
                    row[8] = float(state_duration_base + j + 1) / float(phone_duration)
                    frame_index += 1

                state_duration_base += frame_number

    return duration_feature_array
def load_phone_alignment(self, alignment_file_name, dur_file_name=None):
    """Return the indices that a phone alignment marks as non-silent.

    Each non-empty line of ``alignment_file_name`` is either a bare label or
    ``start end label`` separated by whitespace. The number of frames per
    label comes from ``dur_file_name`` when given, otherwise from
    ``(end - start) / 50000``. A label counts as non-silent when
    ``self.check_silence_pattern`` returns 0 for it.

    When ``self.remove_frame_features`` is true one index per frame is
    produced; otherwise, when ``self.subphone_feats == 'none'``, one index
    per label.

    Returns:
        list of int indices, in file order.
    """
    manual_dur_data = None
    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    ph_count = 0
    base_frame_index = 0
    nonsilence_frame_index_list = []

    # the context manager closes the file even if parsing raises
    with open(alignment_file_name) as fid:
        for line in fid:
            line = line.strip()
            if len(line) < 1:
                continue

            temp_list = re.split(r'\s+', line)
            if len(temp_list) == 1:
                # NOTE(review): label-only lines set no start/end times, so
                # without a duration file the frame count below depends on a
                # previous line (or is undefined on the first) -- confirm
                # such files are always paired with dur_file_name.
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

            # to do - support different frame shift - currently hardwired to 5msec
            # currently under beta testing: supports different frame shift
            if dur_file_name:
                # loaded durations are NumPy values; range() and the index
                # arithmetic below need a plain int
                frame_number = int(manual_dur_data[ph_count])
                ph_count = ph_count + 1
            else:
                frame_number = int((end_time - start_time) / 50000)

            label_binary_flag = self.check_silence_pattern(full_label)

            if self.remove_frame_features:
                if label_binary_flag == 0:
                    nonsilence_frame_index_list.extend(
                        range(base_frame_index, base_frame_index + frame_number))
                base_frame_index = base_frame_index + frame_number
            elif self.subphone_feats == 'none':
                if label_binary_flag == 0:
                    nonsilence_frame_index_list.append(base_frame_index)
                base_frame_index = base_frame_index + 1

    return nonsilence_frame_index_list
def normalise_data(self, in_file_list, out_file_list):
    """Clip and min-max normalise every input file into the target range.

    Each feature value is first clipped to the range given by
    ``self.min_vector`` / ``self.max_vector``, then mapped to
    ``self.target_min_value`` .. ``self.target_max_value`` and written to
    the output path at the same index. Columns listed in
    ``self.exclude_columns`` are copied through (after clipping) without
    rescaling.
    """
    file_number = len(in_file_list)

    fea_max_min_diff = self.max_vector - self.min_vector
    diff_value = self.target_max_value - self.target_min_value
    fea_max_min_diff = numpy.reshape(fea_max_min_diff, (1, self.feature_dimension))

    target_max_min_diff = numpy.zeros((1, self.feature_dimension))
    target_max_min_diff.fill(diff_value)

    # columns with no spread get a unit span on both sides (scale factor 1)
    constant_columns = fea_max_min_diff <= 0.0
    target_max_min_diff[constant_columns] = 1.0
    fea_max_min_diff[constant_columns] = 1.0

    io_funcs = BinaryIOCollection()
    # range() and floor division keep this working on Python 3, where
    # xrange is undefined and "/" would give a float that numpy.tile rejects
    for i in range(file_number):
        features = io_funcs.load_binary_file(in_file_list[i], self.feature_dimension)

        frame_number = features.size // self.feature_dimension
        fea_min_matrix = numpy.tile(self.min_vector, (frame_number, 1))
        fea_max_matrix = numpy.tile(self.max_vector, (frame_number, 1))

        # Vectorised form of the former per-element loop. Both masks are
        # taken before any write so a value raised to the minimum is not
        # re-tested against the maximum (the original if/elif).
        below_min = features < fea_min_matrix
        above_max = numpy.logical_and(features > fea_max_matrix,
                                      numpy.logical_not(below_min))
        features[below_min] = fea_min_matrix[below_min]
        features[above_max] = fea_max_matrix[above_max]

        target_min_matrix = numpy.tile(
            self.target_min_value, (frame_number, self.feature_dimension))

        fea_diff_matrix = numpy.tile(fea_max_min_diff, (frame_number, 1))
        diff_norm_matrix = numpy.tile(target_max_min_diff,
                                      (frame_number, 1)) / fea_diff_matrix

        norm_features = diff_norm_matrix * (
            features - fea_min_matrix) + target_min_matrix

        ## If we are to keep some columns unnormalised, use advanced indexing to
        ## reinstate original values:
        m, n = numpy.shape(features)
        row_index = list(range(m))
        for col in self.exclude_columns:
            col_index = [col] * m
            norm_features[row_index, col_index] = features[row_index, col_index]

        io_funcs.array_to_binary_file(norm_features, out_file_list[i])
def normal_standardization(self, in_file_list, out_file_list):
    """Standardise every input file to zero mean and unit variance.

    The mean and standard deviation come from ``self.compute_mean`` and
    ``self.compute_std`` over all of ``in_file_list``. Each file is then
    transformed as ``(x - mean) / std`` and written to the output path at
    the same index.
    """
    mean_vector = self.compute_mean(in_file_list)
    std_vector = self.compute_std(in_file_list, mean_vector)

    binary_io = BinaryIOCollection()
    for idx in range(len(in_file_list)):
        frames = binary_io.load_binary_file(in_file_list[idx], self.feature_dimension)
        per_frame = (frames.size // self.feature_dimension, 1)

        centred = frames - numpy.tile(mean_vector, per_frame)
        standardised = centred / numpy.tile(std_vector, per_frame)

        binary_io.array_to_binary_file(standardised, out_file_list[idx])
def load_phone_alignment(self, alignment_file_name, dur_file_name=None):
    """Return the indices that a phone alignment marks as non-silent.

    Each non-empty line of ``alignment_file_name`` is either a bare label or
    ``start end label`` separated by whitespace. The number of frames per
    label comes from ``dur_file_name`` when given, otherwise from
    ``(end - start) / 50000``. A label counts as non-silent when
    ``self.check_silence_pattern`` returns 0 for it.

    When ``self.remove_frame_features`` is true one index per frame is
    produced; otherwise, when ``self.subphone_feats == 'none'``, one index
    per label.

    Returns:
        list of int indices, in file order.
    """
    manual_dur_data = None
    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    ph_count = 0
    base_frame_index = 0
    nonsilence_frame_index_list = []

    # the context manager closes the file even if parsing raises
    with open(alignment_file_name) as fid:
        for line in fid:
            line = line.strip()
            if len(line) < 1:
                continue

            temp_list = re.split(r'\s+', line)
            if len(temp_list) == 1:
                # NOTE(review): label-only lines set no start/end times, so
                # without a duration file the frame count below depends on a
                # previous line (or is undefined on the first) -- confirm
                # such files are always paired with dur_file_name.
                full_label = temp_list[0]
            else:
                start_time = int(temp_list[0])
                end_time = int(temp_list[1])
                full_label = temp_list[2]

            # to do - support different frame shift - currently hardwired to 5msec
            # currently under beta testing: supports different frame shift
            if dur_file_name:
                # loaded durations are NumPy values; range() and the index
                # arithmetic below need a plain int
                frame_number = int(manual_dur_data[ph_count])
                ph_count = ph_count + 1
            else:
                frame_number = int((end_time - start_time) / 50000)

            label_binary_flag = self.check_silence_pattern(full_label)

            if self.remove_frame_features:
                if label_binary_flag == 0:
                    nonsilence_frame_index_list.extend(
                        range(base_frame_index, base_frame_index + frame_number))
                base_frame_index = base_frame_index + frame_number
            elif self.subphone_feats == 'none':
                if label_binary_flag == 0:
                    nonsilence_frame_index_list.append(base_frame_index)
                base_frame_index = base_frame_index + 1

    return nonsilence_frame_index_list
def normal_standardization(self, in_file_list, out_file_list):
    """Standardise every input file to zero mean and unit variance.

    The mean and standard deviation come from ``self.compute_mean`` and
    ``self.compute_std`` over all of ``in_file_list``. Each file is then
    transformed as ``old_div(x - mean, std)`` and written to the output
    path at the same index.
    """
    mean_vector = self.compute_mean(in_file_list)
    std_vector = self.compute_std(in_file_list, mean_vector)

    binary_io = BinaryIOCollection()
    for idx in range(len(in_file_list)):
        frames = binary_io.load_binary_file(in_file_list[idx], self.feature_dimension)
        per_frame = (frames.size // self.feature_dimension, 1)

        centred = frames - numpy.tile(mean_vector, per_frame)
        std_rows = numpy.tile(std_vector, per_frame)
        standardised = old_div(centred, std_rows)

        binary_io.array_to_binary_file(standardised, out_file_list[idx])
def remove_silence(self, in_data_list, in_align_list, out_data_list, dur_file_list=None):
    """Write copies of the input feature files with silent frames removed.

    For each index ``i`` the non-silent frame indices are taken from
    ``self.load_phone_alignment`` (when ``self.label_type == "phone_align"``,
    optionally with the duration file ``dur_file_list[i]``) or from
    ``self.load_alignment``. The selected rows of ``in_data_list[i]``
    (``self.n_cmp`` columns) are written to ``out_data_list[i]``.

    Prints a message and calls ``sys.exit(1)`` when the three file lists do
    not have the same length.
    """
    file_number = len(in_data_list)

    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3
    if file_number != len(in_align_list):
        # this check compares data against alignment files, so the message
        # names them (it previously said "output")
        print("The number of input and alignment files does not equal!\n")
        sys.exit(1)
    if file_number != len(out_data_list):
        print("The number of input and output files does not equal!\n")
        sys.exit(1)

    io_funcs = BinaryIOCollection()
    for i in range(file_number):
        if self.label_type == "phone_align":
            dur_file_name = dur_file_list[i] if dur_file_list else None
            nonsilence_indices = self.load_phone_alignment(
                in_align_list[i], dur_file_name)
        else:
            nonsilence_indices = self.load_alignment(in_align_list[i])

        ori_cmp_data = io_funcs.load_binary_file(in_data_list[i], self.n_cmp)

        # floor division keeps the frame count an int on Python 3
        frame_number = ori_cmp_data.size // self.n_cmp

        if len(nonsilence_indices) == frame_number:
            print('WARNING: no silence found!')
            # previously: continue -- in fact we should keep non-silent data!

        ## if labels have a few extra frames than audio, this can break the indexing, remove them:
        nonsilence_indices = [
            ix for ix in nonsilence_indices if ix < frame_number
        ]

        new_cmp_data = ori_cmp_data[nonsilence_indices, ]

        io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
def denormalise_data(self, in_file_list, out_file_list):
    """Map min-max normalised files back to the original feature range.

    Every input file is rescaled from the target range
    (``self.target_min_value`` .. ``self.target_max_value``) to the range
    given by ``self.min_vector`` / ``self.max_vector`` and written to the
    output path at the same index.
    """
    logger = logging.getLogger("acoustic_norm")

    n_files = len(in_file_list)
    logger.info('MinMaxNormalisation.denormalise_data for %d files' % n_files)

    dim = self.feature_dimension
    feature_span = numpy.reshape(self.max_vector - self.min_vector, (1, dim))

    target_span = numpy.zeros((1, dim))
    target_span.fill(self.target_max_value - self.target_min_value)

    # columns with no spread get a unit span on both sides, so the scale
    # factor is 1 and no division by zero can occur
    no_spread = feature_span <= 0.0
    target_span[no_spread] = 1.0
    feature_span[no_spread] = 1.0

    binary_io = BinaryIOCollection()
    for idx in range(n_files):
        normalised = binary_io.load_binary_file(in_file_list[idx], dim)
        n_frames = normalised.size // dim
        per_frame = (n_frames, 1)

        min_rows = numpy.tile(self.min_vector, per_frame)
        target_min_rows = numpy.tile(self.target_min_value, (n_frames, dim))
        span_rows = numpy.tile(feature_span, per_frame)
        scale = old_div(span_rows, numpy.tile(target_span, per_frame))

        restored = scale * (normalised - target_min_rows) + min_rows
        binary_io.array_to_binary_file(restored, out_file_list[idx])
def remove_silence(self, in_data_list, in_align_list, out_data_list, dur_file_list=None):
    """Write copies of the input feature files with silent frames removed.

    For each index ``i`` the non-silent frame indices are taken from
    ``self.load_phone_alignment`` (when ``self.label_type == "phone_align"``,
    optionally with the duration file ``dur_file_list[i]``) or from
    ``self.load_alignment``. The selected rows of ``in_data_list[i]``
    (``self.n_cmp`` columns) are written to ``out_data_list[i]``.

    Prints a message and calls ``sys.exit(1)`` when the three file lists do
    not have the same length.
    """
    file_number = len(in_data_list)

    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3
    if file_number != len(in_align_list):
        # this check compares data against alignment files, so the message
        # names them (it previously said "output")
        print("The number of input and alignment files does not equal!\n")
        sys.exit(1)
    if file_number != len(out_data_list):
        print("The number of input and output files does not equal!\n")
        sys.exit(1)

    io_funcs = BinaryIOCollection()
    for i in range(file_number):
        if self.label_type == "phone_align":
            dur_file_name = dur_file_list[i] if dur_file_list else None
            nonsilence_indices = self.load_phone_alignment(in_align_list[i], dur_file_name)
        else:
            nonsilence_indices = self.load_alignment(in_align_list[i])

        ori_cmp_data = io_funcs.load_binary_file(in_data_list[i], self.n_cmp)

        # floor division keeps the frame count an int on Python 3
        frame_number = ori_cmp_data.size // self.n_cmp

        if len(nonsilence_indices) == frame_number:
            print('WARNING: no silence found!')
            # previously: continue -- in fact we should keep non-silent data!

        ## if labels have a few extra frames than audio, this can break the indexing, remove them:
        nonsilence_indices = [ix for ix in nonsilence_indices if ix < frame_number]

        new_cmp_data = ori_cmp_data[nonsilence_indices, ]

        io_funcs.array_to_binary_file(new_cmp_data, out_data_list[i])
def compute_mean(self, file_list):
    """Compute the per-dimension mean over every frame in ``file_list``.

    Column sums of each file (loaded with ``self.feature_dimension``
    columns) are accumulated and divided by the total number of frames.
    Returns (and logs on "acoustic_norm") an array of shape
    ``(1, self.feature_dimension)``.
    """
    logger = logging.getLogger("acoustic_norm")

    dim = self.feature_dimension
    accumulator = numpy.zeros((1, dim))
    frames_seen = 0

    binary_io = BinaryIOCollection()
    for file_path in file_list:
        data = binary_io.load_binary_file(file_path, dim)
        column_sum = numpy.sum(data, axis=0)
        accumulator += numpy.reshape(column_sum, (1, dim))
        frames_seen += data.size // dim

    accumulator /= float(frames_seen)
    mean_vector = accumulator

    logger.info('computed mean vector of length %d :' % mean_vector.shape[1])
    logger.info(' mean: %s' % mean_vector)

    return mean_vector
def extract_base_features(self, feat_dir_path, feat_switch, list_of_files, decomposition_unit):
    """Write per-segment text feature lines for each file in ``list_of_files``.

    For every file name this reads a binary label file, an alignment label
    file and a text file (directories taken from ``feat_dir_path``), walks
    the flat binary label stream with a counter that wraps every 601 values,
    and writes to ``<name>.lab`` in the output directory: syllable identity
    vectors, selected binary features (both rescaled to 0.99 / 0.01) and
    word vectors looked up in ``self.wrd_embeds``.

    ``feat_switch`` and ``decomposition_unit`` are not used in this body.

    NOTE(review): the block nesting below was reconstructed from a source in
    which indentation had been lost -- confirm it against the upstream file,
    in particular the extent of the ``count == 592`` branch and of the
    syllable-reset statements.
    """
    ### load Binary module ###
    io_funcs = BinaryIOCollection()
    htsclass = readHTSlabelFile()

    ### read file by file ###
    for i in range(len(list_of_files)):
        filename = list_of_files[i]
        print filename

        binary_label_dir = feat_dir_path['input_binary']
        label_align_dir = feat_dir_path['input_labfile']
        txt_dir = feat_dir_path['input_txt']
        out_feat_dir = feat_dir_path['output_feat']

        in_filename = os.path.join(binary_label_dir, filename + '.lab')
        in_lab_file = os.path.join(label_align_dir, filename + '.lab')
        in_txt_file = os.path.join(txt_dir, filename + '.txt')
        out_filename = os.path.join(out_feat_dir, filename + '.lab')

        # only the first line of the text file is used, split on whitespace
        ip1 = open(in_txt_file, 'r')
        text_Data = ip1.readlines()
        ip1.close()
        list_of_words = text_Data[0].split()

        [phone, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
        # loaded with dimension 1, i.e. as one flat stream of values
        features = io_funcs.load_binary_file(in_filename, 1)

        file_len = len(phone)
        op1 = open(out_filename, 'w')

        count = 0  # position inside the current 601-value run
        frame_count = 0
        phone_count = 0
        wc = 0  # number of words consumed from list_of_words
        seg_count = 0
        feat_arr = []
        prev_feat_arr = []
        syl_identity = self.zeros(300, 1)
        syl = ''
        phinsyl = 0  # phones accumulated in the current syllable

        for j in range(len(features)):
            count = count + 1
            if (count == 601):
                # end of a run: reset the per-run state
                count = 0
                feat_arr = []
                sil_flag = 0
                continue
            # NOTE(review): positions 59 and 148 presumably hold
            # silence-related questions -- confirm. sil_flag is first
            # assigned here or at position 601, but it is read at position
            # 592 below, so it can be unbound during the first run.
            if (count == 59 and int(features[j]) == 1):
                sil_flag = 1
            if (count == 148 and int(features[j]) == 1):
                sil_flag = 0
            # only positions 349-405 and 422-592 are collected into feat_arr
            if (count <= 348 or (count >= 406 and count <= 421) or count > 592):
                continue
            feat_arr.append(int(features[j]))
            if (count == 592):
                # compares the running frame count with a value derived from
                # ph_arr[1][phone_count] (presumably the phone end time
                # converted to frames -- confirm)
                if np.abs(frame_count - int(ph_arr[1][phone_count] * (10**-4) / 5)) <= 1:
                    ph_identity = features[j - 492:j - 443]
                    ph_identity = np.reshape(ph_identity, len(ph_identity), -1)
                    # each phone takes a 50-wide slot; the slot's last entry
                    # is set when the phone is '#'
                    syl_identity[phinsyl * 50:(phinsyl + 1) * 50 - 1] = ph_identity
                    syl = syl + phone[phone_count]
                    if phone[phone_count] == '#':
                        syl_identity[(phinsyl + 1) * 50 - 1] = 1
                    phinsyl += 1
                    phone_count += 1
                frame_count += 1
                # nothing is written while the collected features are
                # unchanged from the previous run
                if (len(prev_feat_arr) != 0 and prev_feat_arr == feat_arr):
                    continue
                else:
                    prev_feat_arr = feat_arr
                    if (syl != '#' and syl != ''):
                        syl_vec = ''
                        new_syl_identity = [
                            0.99 if x == 1 else 0.01 for x in syl_identity
                        ]
                        for x in range(len(new_syl_identity)):
                            syl_vec = syl_vec + str(
                                new_syl_identity[x]) + ' '
                        op1.write(syl_vec + '\n')
                        ### reset syllable information ###
                        phinsyl = 0
                        syl = ''
                        syl_identity = self.zeros(300, 1)
                    if (sil_flag == 1):
                        continue
                    seg_count += 1
                    new_arr = [
                        0.99 if x == 1 else 0.01 for x in prev_feat_arr
                    ]
                    for item in new_arr:
                        op1.write("%s " % item)
                    ### word ending information ###
                    if (mean_f0_arr[phone_count][5] -
                            mean_f0_arr[phone_count - 1][5] != 0
                            and phone[phone_count] != 'pau'):
                        wc += 1
                    word = list_of_words[wc - 1]
                    # unknown words and pauses use the '*UNKNOWN*' entry
                    if word in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word]
                    else:
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    if (phone[phone_count] == 'pau'):
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    op1.write(word_vec + ' ')
                    continue
        op1.close()
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension,
                 silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of data to trim
        label_list: list of binary labels which contain trimming criterion
        label_dimension: dimension of the binary label files
        silence_feature_index: index of feature in labels which is silence: 1 means silence (trim), 0 means leave.
        percent_to_keep: when non-zero (a positive int), that percentage of
            the silent frames is kept and appended after the non-silent ones.

    Raises AssertionError when the lists differ in length, when a data file
    and its label file differ by 3 or more frames, or when the silence
    feature holds values other than 0 and 1.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(
            audio_label_difference
        ) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:  ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]

        # Every flag must be 0 or 1 (all-0 or all-1 is fine). The earlier
        # check had misplaced parentheses (numpy.array([0]).all()) and
        # raised ValueError rather than this assertion for non-binary flags.
        is_binary = numpy.logical_or(silence_flag == 0, silence_flag == 1)
        assert is_binary.all(), \
            'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)

        print('Remove %d%% of frames (%s frames) as silence... ' %
              (100 * numpy.sum(silence_flag / float(len(silence_flag))),
               int(numpy.sum(silence_flag))))

        ## nonzero returns a tuple of arrays, one for each dimension of input array
        non_silence_indices = numpy.nonzero(silence_flag == 0)

        if percent_to_keep != 0:
            assert type(percent_to_keep) == int and percent_to_keep > 0
            silence_indices = numpy.nonzero(silence_flag == 1)[0]

            # A slice step must be an int ("/" gives a float on Python 3)
            # and at least 1 (values above 100 used to give a zero step).
            every_nth = max(1, 100 // percent_to_keep)
            silence_indices_to_keep = silence_indices[::every_nth]

            ## -1 due to weird error with STRAIGHT features at line 144:
            ## IndexError: index 445 is out of bounds for axis 0 with size 445
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print(
                ' Restore %s%% (every %sth frame: %s frames) of silent frames'
                % (percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = numpy.hstack(
                [non_silence_indices[0], silence_indices_to_keep])

        # in the default case the index is still the tuple from nonzero(),
        # exactly as before
        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
def extract_base_features(self, feat_dir_path, feat_switch, list_of_files, decomposition_unit):
    """Write one text feature file per utterance from binary labels, alignments and text.

    For every base name in ``list_of_files`` this reads ``<name>.txt``, the
    aligned ``<name>.lab`` and the binary ``<name>.lab`` from the directories
    named in ``feat_dir_path`` (keys 'input_txt', 'input_labfile',
    'input_binary') and writes space-separated values to
    ``<output_feat>/<name>.lab``.

    ``feat_switch`` and ``decomposition_unit`` are accepted but never read in
    this version of the function.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-mangled copy of this function -- verify the indentation of the
    ``count == 592`` branch against the original file before relying on it.
    """
    ### load Binary module ###
    io_funcs = BinaryIOCollection()
    htsclass = readHTSlabelFile()

    ### read file by file ###
    for i in range(len(list_of_files)):
        filename = list_of_files[i]
        print filename

        binary_label_dir = feat_dir_path['input_binary']
        label_align_dir = feat_dir_path['input_labfile']
        txt_dir = feat_dir_path['input_txt']
        out_feat_dir = feat_dir_path['output_feat']

        in_filename = os.path.join(binary_label_dir, filename + '.lab');
        in_lab_file = os.path.join(label_align_dir, filename + '.lab')
        in_txt_file = os.path.join(txt_dir, filename + '.txt')
        out_filename = os.path.join(out_feat_dir, filename + '.lab');

        # Only the first line of the transcript is used.
        ip1 = open(in_txt_file, 'r')
        text_Data = ip1.readlines()
        ip1.close()

        list_of_words = text_Data[0].split()

        [phone, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
        # Loaded with dimension 1, i.e. as one flat stream of values.
        features = io_funcs.load_binary_file(in_filename, 1)
        file_len = len(phone)

        op1 = open(out_filename, 'w')
        count = 0; frame_count = 0; phone_count = 0; wc = 0; seg_count = 0;
        feat_arr = []
        prev_feat_arr = []
        syl_identity = self.zeros(300,1)
        syl = ''
        phinsyl = 0
        # `count` is the 1-based position inside the current record of the flat
        # stream; it wraps after 601 values.
        # NOTE(review): presumably 601 is the per-frame label dimension -- confirm.
        for j in range(len(features)):
            count = count + 1
            if(count == 601):
                count = 0;
                feat_arr = []
                sil_flag = 0
                continue;
            # Positions 59 and 148 switch the silence flag on and off.
            # NOTE(review): presumably these are the silence / last-phone
            # questions of the question set -- confirm.
            if (count == 59 and int(features[j]) == 1):
                sil_flag = 1
            if (count == 148 and int(features[j]) == 1):
                sil_flag = 0
            # Only positions 349-405 and 422-592 are collected into feat_arr.
            if(count <= 348 or (count >= 406 and count <= 421) or count > 592):
                continue;
            feat_arr.append(int(features[j]))
            if(count == 592):
                # Advance to the next phone once the frame counter reaches the
                # phone's end time (ph_arr[1]) converted to frames.
                if np.abs(frame_count - int(ph_arr[1][phone_count] * (10 ** -4) / 5)) <= 1:
                    # 49 values of the current record (positions 100-148).
                    ph_identity = features[j-492:j-443]
                    ph_identity = np.reshape(ph_identity, len(ph_identity), -1)
                    # Each phone of the syllable occupies a 50-wide slot.
                    syl_identity[phinsyl*50:(phinsyl+1)*50-1] = ph_identity
                    syl = syl+phone[phone_count]
                    # The last value of the slot marks the '#' phone.
                    if phone[phone_count] == '#':
                        syl_identity[(phinsyl+1)*50-1] = 1
                    phinsyl += 1
                    phone_count += 1
                frame_count += 1
                # Consecutive records with identical collected features are
                # written only once.
                if(len(prev_feat_arr) != 0 and prev_feat_arr == feat_arr):
                    continue;
                else:
                    prev_feat_arr = feat_arr
                    # Flush the identity vector accumulated so far; this is the
                    # only place a newline is written.
                    if(syl!='#' and syl!=''):
                        syl_vec = ''
                        new_syl_identity = [0.99 if x==1 else 0.01 for x in syl_identity]
                        for x in range(len(new_syl_identity)):
                            syl_vec = syl_vec+str(new_syl_identity[x])+' '
                        op1.write(syl_vec+'\n')
                        ### reset syllable information ###
                        phinsyl = 0; syl=''
                        syl_identity = self.zeros(300, 1)
                    if (sil_flag == 1):
                        continue;
                    seg_count += 1
                    # Map 1 -> 0.99 and everything else -> 0.01.
                    new_arr = [0.99 if x==1 else 0.01 for x in prev_feat_arr]
                    for item in new_arr:
                        op1.write("%s " % item)
                    ### word ending information ###
                    if(mean_f0_arr[phone_count][5] - mean_f0_arr[phone_count - 1][5] != 0 and phone[phone_count] != 'pau'):
                        wc += 1
                    word = list_of_words[wc - 1]
                    # Out-of-vocabulary words and pauses use the '*UNKNOWN*' entry.
                    if word in self.wrd_embeds:
                        word_vec = self.wrd_embeds[word]
                    else:
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    if(phone[phone_count] == 'pau'):
                        word_vec = self.wrd_embeds['*UNKNOWN*']
                    op1.write(word_vec + ' ')
                    continue;
        op1.close()
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
# THIS SOFTWARE.
################################################################################

# quick and dirty utility to print out binary files, for debugging

import sys
# import numpy
from io_funcs.binary_io import BinaryIOCollection

if __name__ == '__main__':

    ## shall we read the logging config file from command line?
    if len(sys.argv) < 3:
        # Single-argument print(...) behaves the same as a statement and as a
        # function call, so the script runs under Python 2 and Python 3.
        print('usage: python view.py dimension filename(s)')
        sys.exit(1)

    # argv[1] is the feature dimension; every further argument is a file.
    dimension = int(sys.argv[1])
    fnames = sys.argv[2:]

    print(fnames)

    io_funcs = BinaryIOCollection()
    for f in fnames:
        features = io_funcs.load_binary_file(f, dimension)

        # Only the shape is shown; uncomment below to dump the values too.
        print(features.shape)
        # print(features)
def extract_base_features(self, feat_dir_path, feat_switch, list_of_files, decomposition_unit, unit_dim):
    """Write frame-, phone- or syllable-level text features for each utterance.

    For every base name in ``list_of_files`` the binary label file, the
    state-aligned label file and (when word embeddings are enabled) the
    transcript are read from the directories in ``feat_dir_path``. The enabled
    feature streams are written, one vector per line, to
    ``<output_feat>/<name>.lab``.

    Args:
        feat_dir_path: dict with keys 'input_binary', 'input_labfile',
            'input_txt' and 'output_feat'.
        feat_switch: dict with keys 'binary', 'wordEmbed' and 'identity'.
            A falsy value disables the stream. 'wordEmbed' >= 3 adds one
            neighbouring vector on each side; 'identity' >= 3 / >= 5 adds one /
            two neighbouring vectors on each side.
        list_of_files: file base names, without extension.
        decomposition_unit: 'frame', 'phone' or 'syllable'.
        unit_dim: dict; ``unit_dim['frame']`` is the number of values per
            frame in the binary label file.

    NOTE(review): the padding identity vector is only defined for the
    'syllable' unit, so 'identity' >= 3 with another unit raises NameError
    (unchanged from the previous implementation) -- confirm the intended
    padding for phone-level identity vectors.
    """

    def _pick(vec_list, pos, default):
        # Context vector at `pos`, or the padding vector when `pos` lies
        # outside *this* list (each stream is bounded by its own length).
        if pos < 0 or pos >= len(vec_list):
            return default
        return vec_list[pos]

    ### load Binary module ###
    io_funcs = BinaryIOCollection()
    htsclass = readHTSlabelFile()

    # Alignment times are divided by this value to obtain frame indices.
    time_per_frame = np.power(10, 4) * 5

    ### read file by file ###
    for filename in list_of_files:
        print(filename)

        binary_label_dir = feat_dir_path['input_binary']
        label_align_dir = feat_dir_path['input_labfile']
        txt_dir = feat_dir_path['input_txt']
        out_feat_dir = feat_dir_path['output_feat']

        in_filename = os.path.join(binary_label_dir, filename + '.lab')
        in_lab_file = os.path.join(label_align_dir, filename + '.lab')
        in_txt_file = os.path.join(txt_dir, filename + '.txt')
        out_filename = os.path.join(out_feat_dir, filename + '.lab')

        word_embed_list = []
        binary_feat_list = []
        identity_vec_list = []

        ### read text file ###
        if feat_switch['wordEmbed']:
            with open(in_txt_file, 'r') as ip1:
                text_Data = ip1.readlines()

            # Only the first line of the transcript is used.
            norm_text = self.format_text(text_Data[0].strip())
            norm_text = norm_text.replace('OUF', 'O U F')
            norm_text = norm_text.replace('Mmm', 'M m m')
            norm_text = norm_text.replace('USA', 'U S A')
            list_of_words = norm_text.split()

        ### read label file ###
        [phone, st_arr, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
        file_len = len(phone)

        ### read binary label file ###
        features = io_funcs.load_binary_file(in_filename, 1)

        ### take non-silence region ###
        # From the start of the second phone to the end of the second-to-last.
        ph_start = int(ph_arr[0][1] / time_per_frame)
        ph_end = int(ph_arr[1][file_len - 2] / time_per_frame)

        # Floor division keeps the row count an integer on Python 2 and 3.
        frame_feat_list = features.reshape(len(features) // unit_dim['frame'], unit_dim['frame'])
        frame_feat_list = frame_feat_list[ph_start:ph_end, :]

        ### initialise syllable variables ###
        wc = 0
        phinsyl = 0
        syl_identity = self.zeros(300, 1)

        j = 0
        while j < file_len:
            #### ignore silence ####
            if phone[j] == '#' or phone[j] == 'pau':
                j = j + 1
                continue

            ### extract boundaries of phone ###
            ph_start = int(ph_arr[0][j] / time_per_frame)
            ph_end = int(ph_arr[1][j] / time_per_frame)
            num_of_frames = sum(st_arr[j][:] / time_per_frame)
            mid_frame = (ph_start + ph_end) // 2

            ### syllable ending information ###
            syl_end = 0
            if mean_f0_arr[j + 1][3] - mean_f0_arr[j][3] != 0:
                syl_end = 1

            ### word ending information ###
            word_end = 0
            if mean_f0_arr[j + 1][5] - mean_f0_arr[j][5] != 0:
                word_end = 1

            ### extract binary phone-level features ###
            # The first 592 values of the phone's middle frame.
            st_indx = unit_dim['frame'] * mid_frame
            mid_frame_feat = features[st_indx:st_indx + 592]
            mid_frame_feat = np.reshape(mid_frame_feat, len(mid_frame_feat))

            ### word embedding features ###
            if feat_switch['wordEmbed']:
                word = list_of_words[wc]
                if word_end and phone[j] != 'pau':
                    wc += 1

                # Exact match first, then lower-case, then the unknown entry.
                if phone[j] == 'pau':
                    word_vec = self.wrd_embeds['*UNKNOWN*']
                elif word in self.wrd_embeds:
                    word_vec = self.wrd_embeds[word]
                elif word.lower() in self.wrd_embeds:
                    word_vec = self.wrd_embeds[word.lower()]
                else:
                    word_vec = self.wrd_embeds['*UNKNOWN*']

            ### identity features ###
            if feat_switch['identity']:
                ### phone identity features ###
                ph_identity = mid_frame_feat[99:148]

                if decomposition_unit == 'syllable':
                    ### syllable identity features ###
                    # Each phone of the syllable fills 49 values of a 50-wide slot.
                    st_indx = phinsyl * 50
                    syl_identity[st_indx:st_indx + 49] = ph_identity
                    phinsyl += 1

            ### frame-level features ###
            if decomposition_unit == 'frame':
                ### frame level binary features ###
                # All frames are normalised in one go, when the second-to-last
                # label entry is reached.
                if feat_switch['binary'] and j + 2 == file_len:
                    ### load normalisation statistics ###
                    label_norm_float_file = os.path.join(binary_label_dir, '../label_norm_float_HTS.dat')
                    with open(label_norm_float_file, 'r') as fid:
                        arr12 = [float(x.strip()) for x in fid.readlines()]

                    # First half: minima; second half: maxima.
                    half = len(arr12) // 2
                    min_vector = np.array(arr12[0:half])
                    max_vector = np.array(arr12[half:len(arr12)])
                    max_range_vector = max_vector - min_vector
                    max_range_vector[max_range_vector == 0] = 1  # avoid division by zero

                    ### normalise features into [0.01, 0.99] ###
                    for frame_feat in frame_feat_list:
                        norm_frame_feat = (frame_feat - min_vector) / max_range_vector * 0.98 + 0.01
                        binary_feat_list.append(' '.join(map(str, norm_frame_feat[:])))

                ### embedding features: one copy per frame of the phone ###
                if feat_switch['wordEmbed']:
                    word_embed_list.extend([word_vec] * int(num_of_frames))

            ### phone-level features ###
            if decomposition_unit == 'phone':
                ### phone level binary features ###
                if feat_switch['binary']:
                    norm_ph_feat = [0.99 if x == 1 else 0.01 for x in mid_frame_feat]
                    binary_feat_list.append(' '.join(map(str, norm_ph_feat)))

                ### embedding features ###
                if feat_switch['wordEmbed']:
                    word_embed_list.append(word_vec)

                ### phone-identity features ###
                if feat_switch['identity']:
                    # One extra value flags the 'o~' phone.
                    extra_ph = 1 if phone[j] == 'o~' else 0
                    ph_identity = np.append(ph_identity, extra_ph)
                    norm_ph_identity = [int(x) for x in ph_identity]
                    identity_vec_list.append(' '.join(map(str, norm_ph_identity)))

            ### syllable level features ###
            if decomposition_unit == 'syllable' and syl_end:
                ### syllable and above level binary features ###
                if feat_switch['binary']:
                    syl_feat = [
                        value for x, value in enumerate(mid_frame_feat)
                        if not (x < 348 or (x >= 405 and x < 421))
                    ]
                    norm_syl_feat = [0.99 if x == 1 else 0.01 for x in syl_feat]
                    binary_feat_list.append(' '.join(map(str, norm_syl_feat)))

                if feat_switch['wordEmbed']:
                    word_embed_list.append(word_vec)

                ### syllable-identity features ###
                if feat_switch['identity']:
                    norm_syl_identity = [0.99 if x == 1 else 0.01 for x in syl_identity]
                    identity_vec_list.append(' '.join(map(str, norm_syl_identity)))

                ### reset syllable information ###
                phinsyl = 0
                syl_identity = self.zeros(300, 1)

            j += 1

        ### default (padding) vectors for the context window ###
        if feat_switch['identity'] and decomposition_unit == 'syllable':
            syl_identity = self.zeros(300, 1)
            norm_syl_identity = [0.99 if x == 1 else 0.01 for x in syl_identity]
            norm_syl_identity_vec = ' '.join(map(str, norm_syl_identity))
        if feat_switch['wordEmbed']:
            word_vec = self.wrd_embeds['*UNKNOWN*']

        ### writing features to output file ###
        with open(out_filename, 'w') as op1:
            num_of_vectors = max(len(binary_feat_list), len(identity_vec_list), len(word_embed_list))
            for x in range(num_of_vectors):
                parts = []

                ### binary features ###
                if feat_switch['binary']:
                    parts.append(binary_feat_list[x])

                ### word embeddings ###
                if feat_switch['wordEmbed']:
                    if feat_switch['wordEmbed'] >= 3:
                        parts.append(_pick(word_embed_list, x - 1, word_vec))
                    parts.append(word_embed_list[x])
                    if feat_switch['wordEmbed'] >= 3:
                        parts.append(_pick(word_embed_list, x + 1, word_vec))

                ### identity features ###
                if feat_switch['identity']:
                    if feat_switch['identity'] >= 5:
                        parts.append(_pick(identity_vec_list, x - 2, norm_syl_identity_vec))
                    if feat_switch['identity'] >= 3:
                        parts.append(_pick(identity_vec_list, x - 1, norm_syl_identity_vec))
                    parts.append(identity_vec_list[x])
                    if feat_switch['identity'] >= 3:
                        parts.append(_pick(identity_vec_list, x + 1, norm_syl_identity_vec))
                    if feat_switch['identity'] >= 5:
                        parts.append(_pick(identity_vec_list, x + 2, norm_syl_identity_vec))

                # Every field is followed by a single space, then a newline.
                op1.write(''.join(part + ' ' for part in parts) + '\n')
def extract_base_features(self, feat_dir_path, feat_switch, list_of_files, decomposition_unit, unit_dim):
    """Write frame-, phone- or syllable-level text features for each utterance.

    For every base name in ``list_of_files`` the binary label file, the
    state-aligned label file and (when word embeddings are enabled) the
    transcript are read from the directories in ``feat_dir_path``. The enabled
    feature streams are written, one vector per line, to
    ``<output_feat>/<name>.lab``.

    Args:
        feat_dir_path: dict with keys 'input_binary', 'input_labfile',
            'input_txt' and 'output_feat'.
        feat_switch: dict with keys 'binary', 'wordEmbed' and 'identity'.
            A falsy value disables the stream. 'wordEmbed' >= 3 adds one
            neighbouring vector on each side; 'identity' >= 3 / >= 5 adds one /
            two neighbouring vectors on each side.
        list_of_files: file base names, without extension.
        decomposition_unit: 'frame', 'phone' or 'syllable'.
        unit_dim: dict; ``unit_dim['frame']`` is the number of values per
            frame in the binary label file.

    NOTE(review): the padding identity vector is only defined for the
    'syllable' unit, so 'identity' >= 3 with another unit raises NameError
    (unchanged from the previous implementation) -- confirm the intended
    padding for phone-level identity vectors.
    """

    def _pick(vec_list, pos, default):
        # Context vector at `pos`, or the padding vector when `pos` lies
        # outside *this* list (each stream is bounded by its own length).
        if pos < 0 or pos >= len(vec_list):
            return default
        return vec_list[pos]

    ### load Binary module ###
    io_funcs = BinaryIOCollection()
    htsclass = readHTSlabelFile()

    # Alignment times are divided by this value to obtain frame indices.
    time_per_frame = np.power(10, 4) * 5

    ### read file by file ###
    for filename in list_of_files:
        print(filename)

        binary_label_dir = feat_dir_path['input_binary']
        label_align_dir = feat_dir_path['input_labfile']
        txt_dir = feat_dir_path['input_txt']
        out_feat_dir = feat_dir_path['output_feat']

        in_filename = os.path.join(binary_label_dir, filename + '.lab')
        in_lab_file = os.path.join(label_align_dir, filename + '.lab')
        in_txt_file = os.path.join(txt_dir, filename + '.txt')
        out_filename = os.path.join(out_feat_dir, filename + '.lab')

        word_embed_list = []
        binary_feat_list = []
        identity_vec_list = []

        ### read text file ###
        if feat_switch['wordEmbed']:
            with open(in_txt_file, 'r') as ip1:
                text_Data = ip1.readlines()

            # Only the first line of the transcript is used.
            norm_text = self.format_text(text_Data[0].strip())
            norm_text = norm_text.replace('OUF', 'O U F')
            norm_text = norm_text.replace('Mmm', 'M m m')
            norm_text = norm_text.replace('USA', 'U S A')
            list_of_words = norm_text.split()

        ### read label file ###
        [phone, st_arr, ph_arr, mean_f0_arr] = htsclass.read_state_align_label_file(in_lab_file)
        file_len = len(phone)

        ### read binary label file ###
        features = io_funcs.load_binary_file(in_filename, 1)

        ### take non-silence region ###
        # From the start of the second phone to the end of the second-to-last.
        ph_start = int(ph_arr[0][1] / time_per_frame)
        ph_end = int(ph_arr[1][file_len - 2] / time_per_frame)

        # Floor division keeps the row count an integer on Python 2 and 3.
        frame_feat_list = features.reshape(len(features) // unit_dim['frame'], unit_dim['frame'])
        frame_feat_list = frame_feat_list[ph_start:ph_end, :]

        ### initialise syllable variables ###
        wc = 0
        phinsyl = 0
        syl_identity = self.zeros(300, 1)

        j = 0
        while j < file_len:
            #### ignore silence ####
            if phone[j] == '#' or phone[j] == 'pau':
                j = j + 1
                continue

            ### extract boundaries of phone ###
            ph_start = int(ph_arr[0][j] / time_per_frame)
            ph_end = int(ph_arr[1][j] / time_per_frame)
            num_of_frames = sum(st_arr[j][:] / time_per_frame)
            mid_frame = (ph_start + ph_end) // 2

            ### syllable ending information ###
            syl_end = 0
            if mean_f0_arr[j + 1][3] - mean_f0_arr[j][3] != 0:
                syl_end = 1

            ### word ending information ###
            word_end = 0
            if mean_f0_arr[j + 1][5] - mean_f0_arr[j][5] != 0:
                word_end = 1

            ### extract binary phone-level features ###
            # The first 592 values of the phone's middle frame.
            st_indx = unit_dim['frame'] * mid_frame
            mid_frame_feat = features[st_indx:st_indx + 592]
            mid_frame_feat = np.reshape(mid_frame_feat, len(mid_frame_feat))

            ### word embedding features ###
            if feat_switch['wordEmbed']:
                word = list_of_words[wc]
                if word_end and phone[j] != 'pau':
                    wc += 1

                # Exact match first, then lower-case, then the unknown entry.
                if phone[j] == 'pau':
                    word_vec = self.wrd_embeds['*UNKNOWN*']
                elif word in self.wrd_embeds:
                    word_vec = self.wrd_embeds[word]
                elif word.lower() in self.wrd_embeds:
                    word_vec = self.wrd_embeds[word.lower()]
                else:
                    word_vec = self.wrd_embeds['*UNKNOWN*']

            ### identity features ###
            if feat_switch['identity']:
                ### phone identity features ###
                ph_identity = mid_frame_feat[99:148]

                if decomposition_unit == 'syllable':
                    ### syllable identity features ###
                    # Each phone of the syllable fills 49 values of a 50-wide slot.
                    st_indx = phinsyl * 50
                    syl_identity[st_indx:st_indx + 49] = ph_identity
                    phinsyl += 1

            ### frame-level features ###
            if decomposition_unit == 'frame':
                ### frame level binary features ###
                # All frames are normalised in one go, when the second-to-last
                # label entry is reached.
                if feat_switch['binary'] and j + 2 == file_len:
                    ### load normalisation statistics ###
                    label_norm_float_file = os.path.join(binary_label_dir, '../label_norm_float_HTS.dat')
                    with open(label_norm_float_file, 'r') as fid:
                        arr12 = [float(x.strip()) for x in fid.readlines()]

                    # First half: minima; second half: maxima.
                    half = len(arr12) // 2
                    min_vector = np.array(arr12[0:half])
                    max_vector = np.array(arr12[half:len(arr12)])
                    max_range_vector = max_vector - min_vector
                    max_range_vector[max_range_vector == 0] = 1  # avoid division by zero

                    ### normalise features into [0.01, 0.99] ###
                    for frame_feat in frame_feat_list:
                        norm_frame_feat = (frame_feat - min_vector) / max_range_vector * 0.98 + 0.01
                        binary_feat_list.append(' '.join(map(str, norm_frame_feat[:])))

                ### embedding features: one copy per frame of the phone ###
                if feat_switch['wordEmbed']:
                    word_embed_list.extend([word_vec] * int(num_of_frames))

            ### phone-level features ###
            if decomposition_unit == 'phone':
                ### phone level binary features ###
                if feat_switch['binary']:
                    norm_ph_feat = [0.99 if x == 1 else 0.01 for x in mid_frame_feat]
                    binary_feat_list.append(' '.join(map(str, norm_ph_feat)))

                ### embedding features ###
                if feat_switch['wordEmbed']:
                    word_embed_list.append(word_vec)

                ### phone-identity features ###
                if feat_switch['identity']:
                    # One extra value flags the 'o~' phone.
                    extra_ph = 1 if phone[j] == 'o~' else 0
                    ph_identity = np.append(ph_identity, extra_ph)
                    norm_ph_identity = [int(x) for x in ph_identity]
                    identity_vec_list.append(' '.join(map(str, norm_ph_identity)))

            ### syllable level features ###
            if decomposition_unit == 'syllable' and syl_end:
                ### syllable and above level binary features ###
                if feat_switch['binary']:
                    syl_feat = [
                        value for x, value in enumerate(mid_frame_feat)
                        if not (x < 348 or (x >= 405 and x < 421))
                    ]
                    norm_syl_feat = [0.99 if x == 1 else 0.01 for x in syl_feat]
                    binary_feat_list.append(' '.join(map(str, norm_syl_feat)))

                if feat_switch['wordEmbed']:
                    word_embed_list.append(word_vec)

                ### syllable-identity features ###
                if feat_switch['identity']:
                    norm_syl_identity = [0.99 if x == 1 else 0.01 for x in syl_identity]
                    identity_vec_list.append(' '.join(map(str, norm_syl_identity)))

                ### reset syllable information ###
                phinsyl = 0
                syl_identity = self.zeros(300, 1)

            j += 1

        ### default (padding) vectors for the context window ###
        if feat_switch['identity'] and decomposition_unit == 'syllable':
            syl_identity = self.zeros(300, 1)
            norm_syl_identity = [0.99 if x == 1 else 0.01 for x in syl_identity]
            norm_syl_identity_vec = ' '.join(map(str, norm_syl_identity))
        if feat_switch['wordEmbed']:
            word_vec = self.wrd_embeds['*UNKNOWN*']

        ### writing features to output file ###
        with open(out_filename, 'w') as op1:
            num_of_vectors = max(len(binary_feat_list), len(identity_vec_list), len(word_embed_list))
            for x in range(num_of_vectors):
                parts = []

                ### binary features ###
                if feat_switch['binary']:
                    parts.append(binary_feat_list[x])

                ### word embeddings ###
                if feat_switch['wordEmbed']:
                    if feat_switch['wordEmbed'] >= 3:
                        parts.append(_pick(word_embed_list, x - 1, word_vec))
                    parts.append(word_embed_list[x])
                    if feat_switch['wordEmbed'] >= 3:
                        parts.append(_pick(word_embed_list, x + 1, word_vec))

                ### identity features ###
                if feat_switch['identity']:
                    if feat_switch['identity'] >= 5:
                        parts.append(_pick(identity_vec_list, x - 2, norm_syl_identity_vec))
                    if feat_switch['identity'] >= 3:
                        parts.append(_pick(identity_vec_list, x - 1, norm_syl_identity_vec))
                    parts.append(identity_vec_list[x])
                    if feat_switch['identity'] >= 3:
                        parts.append(_pick(identity_vec_list, x + 1, norm_syl_identity_vec))
                    if feat_switch['identity'] >= 5:
                        parts.append(_pick(identity_vec_list, x + 2, norm_syl_identity_vec))

                # Every field is followed by a single space, then a newline.
                op1.write(''.join(part + ' ' for part in parts) + '\n')
def load_labels_with_phone_alignment(self, file_name, dur_file_name):
    """Build a label feature matrix from a phone-aligned label file.

    Each non-empty line of ``file_name`` is either ``<label>`` alone or
    ``<start> <end> <label>``. The label is converted with
    ``pattern_matching_binary`` and ``pattern_matching_continous_position``.
    With ``self.add_frame_features`` one row is produced per frame of the
    phone, optionally extended with sub-phone position features. Otherwise,
    when ``self.subphone_feats == 'none'``, one row is produced per label.

    Args:
        file_name: path of the label file.
        dur_file_name: optional path of a binary duration file (one value per
            label). When truthy it overrides the frame count derived from the
            start and end times.

    Returns:
        Array with one row per frame (or per label) and ``self.dimension``
        columns. Rows are accumulated per label, so there is no fixed upper
        limit on the number of frames.
    """
    logger = logging.getLogger("labels")

    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    if self.add_frame_features:
        assert self.dimension == self.dict_size + self.frame_feature_size
    elif self.subphone_feats != 'none':
        assert self.dimension == self.dict_size + self.frame_feature_size
    else:
        assert self.dimension == self.dict_size

    # One block of rows per label line; joined once at the end.
    blocks = []
    ph_count = 0

    with open(file_name) as fid:
        all_data = fid.readlines()

    for line in all_data:
        line = line.strip()
        if len(line) < 1:
            continue
        temp_list = re.split(r'\s+', line)

        if len(temp_list) == 1:
            # Label without timing information.
            frame_number = 0
            full_label = temp_list[0]
        else:
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

            # to do - support different frame shift - currently hardwired to 5msec
            # currently under beta testing: support different frame shift
            if dur_file_name:
                # The duration entry may be a scalar or a one-element array;
                # array shapes and range() need a plain integer.
                frame_number = int(numpy.ravel(manual_dur_data[ph_count])[0])
            else:
                # Floor division gives the same frame index on Python 2 and 3.
                frame_number = end_time // 50000 - start_time // 50000

            if self.subphone_feats == "coarse_coding":
                cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

        ph_count = ph_count + 1
        label_binary_vector = self.pattern_matching_binary(full_label)

        # if there is no CQS question, the label_continuous_vector will be empty
        label_continuous_vector = self.pattern_matching_continous_position(full_label)
        label_vector = numpy.concatenate([label_binary_vector, label_continuous_vector], axis=1)

        if self.add_frame_features:
            current_block_binary_array = numpy.zeros((frame_number, self.dict_size + self.frame_feature_size))
            for i in range(frame_number):
                current_block_binary_array[i, 0:self.dict_size] = label_vector

                if self.subphone_feats == 'minimal_phoneme':
                    ## features which distinguish frame position in phoneme
                    current_block_binary_array[i, self.dict_size] = float(i + 1) / float(frame_number)  # fraction through phone forwards
                    current_block_binary_array[i, self.dict_size + 1] = float(frame_number - i) / float(frame_number)  # fraction through phone backwards
                    current_block_binary_array[i, self.dict_size + 2] = float(frame_number)  # phone duration

                elif self.subphone_feats == 'coarse_coding':
                    ## features which distinguish frame position in phoneme using three continuous numerical features
                    current_block_binary_array[i, self.dict_size + 0] = cc_feat_matrix[i, 0]
                    current_block_binary_array[i, self.dict_size + 1] = cc_feat_matrix[i, 1]
                    current_block_binary_array[i, self.dict_size + 2] = cc_feat_matrix[i, 2]
                    current_block_binary_array[i, self.dict_size + 3] = float(frame_number)

                elif self.subphone_feats == 'none':
                    pass

                else:
                    sys.exit('unknown subphone_feats type')

            blocks.append(current_block_binary_array)

        elif self.subphone_feats == 'none':
            # Assign into a row buffer so the label vector is broadcast and
            # cast exactly as when it was written into the output matrix.
            row = numpy.empty((1, self.dimension))
            row[0:1, ] = label_vector
            blocks.append(row)

    if blocks:
        label_feature_matrix = numpy.concatenate(blocks, axis=0)
    else:
        label_feature_matrix = numpy.empty((0, self.dimension))

    logger.info('loaded %s, %3d labels' % (file_name, ph_count))
    logger.debug('made label matrix of %d frames x %d labels' % label_feature_matrix.shape)
    return label_feature_matrix
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
# THIS SOFTWARE.
################################################################################

# quick and dirty utility to print out binary files, for debugging

import sys
# import numpy
from io_funcs.binary_io import BinaryIOCollection

if __name__ == '__main__':

    ## shall we read the logging config file from command line?
    if len(sys.argv) < 3:
        # Single-argument print(...) behaves the same as a statement and as a
        # function call, so the script runs under Python 2 and Python 3.
        print('usage: python view.py dimension filename(s)')
        sys.exit(1)

    # argv[1] is the feature dimension; every further argument is a file.
    dimension = int(sys.argv[1])
    fnames = sys.argv[2:]

    print(fnames)

    io_funcs = BinaryIOCollection()
    for f in fnames:
        features = io_funcs.load_binary_file(f, dimension)

        # Only the shape is shown; uncomment below to dump the values too.
        print(features.shape)
        # print(features)
def load_labels_with_phone_alignment(self, file_name, dur_file_name):
    """Build a label feature matrix from a phone-aligned label file.

    Each non-empty line of ``file_name`` is either ``<label>`` alone or
    ``<start> <end> <label>``. The label is converted with
    ``pattern_matching_binary`` and ``pattern_matching_continous_position``.
    With ``self.add_frame_features`` one row is produced per frame of the
    phone, optionally extended with sub-phone position features. Otherwise,
    when ``self.subphone_feats == 'none'``, one row is produced per label.

    Args:
        file_name: path of the label file.
        dur_file_name: optional path of a binary duration file (one value per
            label). When truthy it overrides the frame count derived from the
            start and end times.

    Returns:
        Array with one row per frame (or per label) and ``self.dimension``
        columns. Rows are accumulated per label, so there is no fixed upper
        limit on the number of frames.
    """
    logger = logging.getLogger("labels")

    if dur_file_name:
        io_funcs = BinaryIOCollection()
        dur_dim = 1  ## hard coded for now
        manual_dur_data = io_funcs.load_binary_file(dur_file_name, dur_dim)

    if self.add_frame_features:
        assert self.dimension == self.dict_size + self.frame_feature_size
    elif self.subphone_feats != 'none':
        assert self.dimension == self.dict_size + self.frame_feature_size
    else:
        assert self.dimension == self.dict_size

    # One block of rows per label line; joined once at the end.
    blocks = []
    ph_count = 0

    # The context manager closes the file even if a line fails to parse.
    with open(file_name) as fid:
        all_data = fid.readlines()

    for line in all_data:
        line = line.strip()
        if len(line) < 1:
            continue
        temp_list = re.split(r'\s+', line)

        if len(temp_list) == 1:
            # Label without timing information: the label is the only field.
            frame_number = 0
            full_label = temp_list[0]
        else:
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

            # to do - support different frame shift - currently hardwired to 5msec
            # currently under beta testing: support different frame shift
            if dur_file_name:
                # The duration entry may be a scalar or a one-element array;
                # array shapes and range() need a plain integer.
                frame_number = int(numpy.ravel(manual_dur_data[ph_count])[0])
            else:
                # Floor division gives the same frame count on Python 2 and 3.
                frame_number = (end_time - start_time) // 50000

            if self.subphone_feats == "coarse_coding":
                cc_feat_matrix = self.extract_coarse_coding_features_relative(frame_number)

        ph_count = ph_count + 1
        label_binary_vector = self.pattern_matching_binary(full_label)

        # if there is no CQS question, the label_continuous_vector will be empty
        label_continuous_vector = self.pattern_matching_continous_position(full_label)
        label_vector = numpy.concatenate([label_binary_vector, label_continuous_vector], axis=1)

        if self.add_frame_features:
            current_block_binary_array = numpy.zeros((frame_number, self.dict_size + self.frame_feature_size))
            for i in range(frame_number):
                current_block_binary_array[i, 0:self.dict_size] = label_vector

                if self.subphone_feats == 'minimal_phoneme':
                    ## features which distinguish frame position in phoneme
                    current_block_binary_array[i, self.dict_size] = float(i + 1) / float(frame_number)  # fraction through phone forwards
                    current_block_binary_array[i, self.dict_size + 1] = float(frame_number - i) / float(frame_number)  # fraction through phone backwards
                    current_block_binary_array[i, self.dict_size + 2] = float(frame_number)  # phone duration

                elif self.subphone_feats == 'coarse_coding':
                    ## features which distinguish frame position in phoneme using three continuous numerical features
                    current_block_binary_array[i, self.dict_size + 0] = cc_feat_matrix[i, 0]
                    current_block_binary_array[i, self.dict_size + 1] = cc_feat_matrix[i, 1]
                    current_block_binary_array[i, self.dict_size + 2] = cc_feat_matrix[i, 2]
                    current_block_binary_array[i, self.dict_size + 3] = float(frame_number)

                elif self.subphone_feats == 'none':
                    pass

                else:
                    sys.exit('unknown subphone_feats type')

            blocks.append(current_block_binary_array)

        elif self.subphone_feats == 'none':
            # Assign into a row buffer so the label vector is broadcast and
            # cast exactly as when it was written into the output matrix.
            row = numpy.empty((1, self.dimension))
            row[0:1, ] = label_vector
            blocks.append(row)

    if blocks:
        label_feature_matrix = numpy.concatenate(blocks, axis=0)
    else:
        label_feature_matrix = numpy.empty((0, self.dimension))

    logger.info('loaded %s, %3d labels' % (file_name, ph_count))
    logger.debug('made label matrix of %d frames x %d labels' % label_feature_matrix.shape)
    return label_feature_matrix
def trim_silence(in_list, out_list, in_dimension, label_list, label_dimension, \
                 silence_feature_index, percent_to_keep=0):
    '''
    Function to trim silence from binary label/speech files based on binary labels.
        in_list: list of binary label/speech files to trim
        out_list: trimmed files
        in_dimension: dimension of data to trim
        label_list: list of binary labels which contain trimming criterion
        label_dimension: dimension of the binary label files
        silence_feature_index: index of feature in labels which is silence: 1 means silence (trim), 0 means leave.
        percent_to_keep: 0 removes all silent frames; a positive integer keeps
            every (100 // percent_to_keep)-th silent frame, appended after the
            non-silent frames.

    Raises AssertionError when the list lengths differ, when a data file and its
    label file differ by 3 or more frames, or when the silence feature contains
    values other than 0 and 1.
    '''
    assert len(in_list) == len(out_list) == len(label_list)
    io_funcs = BinaryIOCollection()
    for (infile, outfile, label_file) in zip(in_list, out_list, label_list):

        data = io_funcs.load_binary_file(infile, in_dimension)
        label = io_funcs.load_binary_file(label_file, label_dimension)

        audio_label_difference = data.shape[0] - label.shape[0]
        assert math.fabs(audio_label_difference) < 3, '%s and %s contain different numbers of frames: %s %s' % (
            infile, label_file, data.shape[0], label.shape[0])

        ## In case they are different, resize -- keep label fixed as we assume this has
        ## already been processed. (This problem only arose with STRAIGHT features.)
        if audio_label_difference < 0:  ## label is longer -- pad audio to match by repeating last frame:
            print('audio too short -- pad')
            padding = numpy.vstack([data[-1, :]] * int(math.fabs(audio_label_difference)))
            data = numpy.vstack([data, padding])
        elif audio_label_difference > 0:  ## audio is longer -- cut it
            print('audio too long -- trim')
            new_length = label.shape[0]
            data = data[:new_length, :]
        # else: -- expected case -- lengths match, so do nothing

        silence_flag = label[:, silence_feature_index]

        ## All-0, all-1 and mixed 0/1 columns are fine; anything else is rejected
        ## with the message below.
        unexpected_values = set(numpy.unique(silence_flag).tolist()) - set([0, 1])
        assert not unexpected_values, \
            'dimension %s of %s contains values other than 0 and 1' % (silence_feature_index, infile)

        print('Remove %d%% of frames (%s frames) as silence... ' % (
            100 * numpy.sum(silence_flag / float(len(silence_flag))), int(numpy.sum(silence_flag))))

        ## nonzero returns a tuple of arrays, one for each dimension of input array;
        ## take the single 1-D index array so both branches below index the same way.
        non_silence_indices = numpy.nonzero(silence_flag == 0)[0]

        if percent_to_keep != 0:
            assert isinstance(percent_to_keep, int) and percent_to_keep > 0
            silence_indices = numpy.nonzero(silence_flag == 1)[0]
            ## Floor division: a slice step must be an integer.
            every_nth = 100 // percent_to_keep
            silence_indices_to_keep = silence_indices[::every_nth]

            ## NOTE(review): when nothing is kept, frame index 1 is appended (a
            ## long-standing workaround "in case there is no silence"); left
            ## unchanged so labels and audio stay trimmed consistently.
            if len(silence_indices_to_keep) == 0:
                silence_indices_to_keep = numpy.array([1])  ## avoid errors in case there is no silence
            print('   Restore %s%% (every %sth frame: %s frames) of silent frames' % (
                percent_to_keep, every_nth, len(silence_indices_to_keep)))

            ## Append to end of utt -- same function used for labels and audio
            ## means that violation of temporal order doesn't matter -- will be consistent.
            ## Later, frame shuffling will disperse silent frames evenly across minibatches:
            non_silence_indices = numpy.hstack([non_silence_indices, silence_indices_to_keep])

        trimmed_data = data[non_silence_indices, :]  ## advanced integer indexing
        io_funcs.array_to_binary_file(trimmed_data, outfile)
class provider(object):
    """Serve batches of paired label (.lab) and acoustic (.cmp) binary files.

    The utterance names are read from a list file (first whitespace-separated
    token of every non-blank line).  The first ``train_num`` utterances form
    the 'train' subset and the remaining ones the 'valid' subset; only the
    subset selected by ``mode`` is served.

    Attributes set by the constructor:
        list_labels / list_cmp: full path lists (``root + name + extension``).
        list_labels_using / list_cmp_using: the subset selected by ``mode``.
        len_list: number of files in the selected subset.
        list_index: position of the next file to be loaded.
        end_reading: True when fewer than ``batch_size`` files remain.
        index_array: float32 array of shape (batch_size, 3) whose rows are the
            one-hot encoding of ``index`` (1, 2 or 3).
            NOTE(review): what the three classes stand for is not visible
            here -- confirm against the caller.
    """

    def __init__(self, list_path, dim_lab, dim_cmp, root_lab, root_cmp,
                 train_num, batch_size, mode, index):
        """Read the file list and prepare the subset selected by ``mode``.

        Args:
            list_path: text file with one utterance name per line.
            dim_lab: dimension passed to the loader for label files.
            dim_cmp: dimension passed to the loader for cmp files.
            root_lab: prefix prepended verbatim to every label file name.
            root_cmp: prefix prepended verbatim to every cmp file name.
            train_num: number of leading utterances used for training.
            batch_size: number of utterances returned per batch.
            mode: 'train' or 'valid'.
            index: 1, 2 or 3; selects the hot column of ``index_array``.

        Raises:
            ValueError: if ``mode`` is neither 'train' nor 'valid'.
            Exception: if ``index`` is not 1, 2 or 3.
        """
        # `with` closes the list file even if reading it fails.
        with open(list_path, 'r') as f_list:
            # Blank lines carry no utterance name; skipping them avoids an
            # IndexError on split()[0] (e.g. trailing empty line).
            utt_names = [line.split()[0] for line in f_list if line.strip()]

        self.dim_lab = dim_lab
        self.dim_cmp = dim_cmp
        self.list_labels = [root_lab + name + '.lab' for name in utt_names]
        self.list_cmp = [root_cmp + name + '.cmp' for name in utt_names]
        for lab_path, cmp_path in zip(self.list_labels, self.list_cmp):
            assert self._base_name(lab_path) == self._base_name(cmp_path), \
                'label/cmp file name mismatch: %s vs %s' % (lab_path, cmp_path)

        self.list_index = 0
        self.io_tool = BinaryIOCollection()
        self.batch_size = batch_size

        if mode == 'train':
            self.list_labels_using = self.list_labels[:train_num]
            self.list_cmp_using = self.list_cmp[:train_num]
        elif mode == 'valid':
            self.list_labels_using = self.list_labels[train_num:]
            self.list_cmp_using = self.list_cmp[train_num:]
        else:
            # Previously an unknown mode left the object half-initialised and
            # failed later with an AttributeError; fail early instead.
            raise ValueError(
                "mode must be 'train' or 'valid', got %r" % (mode,))
        self.len_list = len(self.list_labels_using)

        # True from the start when the subset cannot fill a single batch.
        self.end_reading = self._no_full_batch_left()

        if index not in (1, 2, 3):
            raise Exception('out of index')
        one_hot = [0, 0, 0]
        one_hot[int(index) - 1] = 1
        self.index_array = np.tile(
            np.asarray(one_hot).reshape(1, -1),
            (self.batch_size, 1)).astype(np.float32)

    @staticmethod
    def _base_name(path):
        """Return the last '/'-component of the text before the first '.'."""
        return path.split('.')[0].split('/')[-1]

    def _no_full_batch_left(self):
        """Return True when fewer than ``batch_size`` files remain unread."""
        return self.list_index + self.batch_size - 1 >= self.len_list

    def reset(self):
        """Rewind to the first file and shuffle the (label, cmp) pairs.

        Label and cmp paths are shuffled together so that the two lists stay
        aligned.  An empty subset is left empty instead of raising.
        """
        self.list_index = 0
        pairs = list(zip(self.list_labels_using, self.list_cmp_using))
        random.shuffle(pairs)
        # Comprehensions (rather than zip(*pairs)) also work for an empty list.
        self.list_labels_using = [lab_path for lab_path, _ in pairs]
        self.list_cmp_using = [cmp_path for _, cmp_path in pairs]
        self.end_reading = self._no_full_batch_left()

    def load_one_batch(self):
        """Load the next ``batch_size`` utterances.

        Returns:
            (list_input, list_target, index_array): two lists holding one
            float32 array per utterance (labels and cmp features
            respectively) and the one-hot ``index_array``.

        Raises:
            AssertionError: if a label file and its cmp file differ in their
                first dimension (presumably the frame count -- TODO confirm).
            IndexError: if called when fewer than ``batch_size`` files remain;
                check ``end_reading`` before calling.
        """
        list_input = []
        list_target = []
        for _ in range(self.batch_size):
            lab_path = self.list_labels_using[self.list_index]
            cmp_path = self.list_cmp_using[self.list_index]
            labs = self.io_tool.load_binary_file(lab_path, self.dim_lab)
            cmps = self.io_tool.load_binary_file(cmp_path, self.dim_cmp)
            assert labs.shape[0] == cmps.shape[0], \
                'length mismatch between %s and %s' % (lab_path, cmp_path)
            list_input.append(labs.astype(np.float32))
            list_target.append(cmps.astype(np.float32))
            self.list_index += 1
        # Flag the end as soon as the next batch could not be filled.
        if self._no_full_batch_left():
            self.end_reading = True
        return list_input, list_target, self.index_array