def process(lab_dir, id_list, out_dir, state_level):
    """Writes the phone sequence and the phone count of every listed utterance to text files.

    For each file id, `<lab_dir>/<file_id>.lab` is loaded and two files are written:
    `<out_dir>/phones/<file_id>.txt` (one entry per line, via `file_io.save_lines`) and
    `<out_dir>/n_phones/<file_id>.txt` (via `file_io.save_txt`).

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process, resolved by `utils.get_file_ids`.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    n_phones_dir = os.path.join(out_dir, 'n_phones')
    for feat_dir in (phones_dir, n_phones_dir):
        utils.make_dirs(feat_dir, file_ids)

    for file_id in file_ids:
        # Label processing.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)
        phones = label.phones

        file_io.save_lines(phones, os.path.join(phones_dir, f'{file_id}.txt'))
        file_io.save_txt(len(phones), os.path.join(n_phones_dir, f'{file_id}.txt'))
def save_file(self, data, base_name, data_dir):
    r"""Saves data as a text file.

    The destination is resolved with `self.file_path(base_name, data_dir)` and the data is written using
    `file_io.save_txt`.

    Parameters
    ----------
    data : int or float or bool or `np.ndarray`, shape (seq_len, feat_dim)
        Data to be written to the file.
    base_name : str
        The name (without extensions) of the file to be saved.
    data_dir : str
        The directory containing all feature types for this dataset.
    """
    file_io.save_txt(data, self.file_path(base_name, data_dir))
def cluster(embeddings, n_clusters, names=None, out_dir=None):
    """Clusters embeddings with k-means, optionally saving the centres, assignments, and cluster sizes.

    When `out_dir` is given, the following are written:
        `<out_dir>/k_means/clusters/cluster_<i>.npy`: one file per cluster centre.
        `<out_dir>/k_means/cluster_assignments/<name>.txt`: the cluster assigned to each named item.
        `<out_dir>/k_means/cluster_assignments_counts.txt`: rows of (cluster index, number of items assigned).

    Args:
        embeddings: Data to cluster, passed directly to `KMeans.fit`.
        n_clusters (int): Number of clusters for k-means.
        names (list<str>): Names of the individual sentences, used as file names for the saved assignments.
            Required when `out_dir` is given.
        out_dir (str): Directory to save the output to. Nothing is saved if this is None.

    Returns:
        (cluster_centres, cluster_assignments): The `cluster_centers_` and `labels_` of the fitted k-means model.

    Raises:
        ValueError: If `out_dir` is given but `names` is not.
    """
    if out_dir is not None:
        if names is None:
            raise ValueError(
                'If `out_dir` is given, then `names` of individual sentences must also be given'
            )

        # Create the output directories before fitting, so that path problems surface early.
        centres_path = os.path.join(out_dir, 'k_means', 'clusters')
        make_dirs(centres_path, names)
        assignments_path = os.path.join(out_dir, 'k_means', 'cluster_assignments')
        make_dirs(assignments_path, names)

    # Cluster with k-means.
    kmeans = KMeans(n_clusters=n_clusters).fit(embeddings)
    cluster_centres = kmeans.cluster_centers_
    cluster_assignments = kmeans.labels_

    # Save the cluster assignments and clusters to files.
    if out_dir is not None:
        cluster_names = [f'cluster_{i}' for i in range(n_clusters)]
        file_io.save_dir(file_io.save_bin, centres_path, cluster_centres, cluster_names, feat_ext='npy')
        file_io.save_dir(file_io.save_txt, assignments_path, cluster_assignments, names, feat_ext='txt')

        # Count the members of every cluster in a single pass, instead of re-listing the assignments per cluster.
        # `minlength` keeps a row for clusters that received no members.
        cluster_sizes = np.bincount(cluster_assignments.reshape(-1), minlength=n_clusters)[:n_clusters]
        counts = np.column_stack((np.arange(n_clusters), cluster_sizes))
        file_io.save_txt(counts, f'{assignments_path}_counts.txt')

    return cluster_centres, cluster_assignments
def analysis_for_train_epoch(self, out_dir, **kwargs):
    """Saves the prior parameters, embedding scatter plots, and per-segment class assignments for an epoch.

    Outputs written below `out_dir`:
        `feats/prior`: the prior means, one file per pseudo input name.
        `scatter_PCA_mean_F0.pdf` and `scatter_tSNE_mean_F0.pdf`: scatter plots of the embeddings and prior means,
            coloured by the mean F0 of each segment.
        `feats/classes`: the prior component assigned to each segment.
        `feats/class_counts.txt`: rows of (component index, number of segments assigned).

    Args:
        out_dir (str): Directory to save the analysis to.
        **kwargs: Unused by this method.
    """
    pred_dir = os.path.join(out_dir, 'feats')
    os.makedirs(pred_dir, exist_ok=True)

    # Get pseudo inputs and calculate prior using the encoder.
    prior_mean, prior_log_variance = self.encoder_layer(
        self.pseudo_inputs, seq_len=self.pseudo_inputs_seq_lens)
    prior_mean = prior_mean.cpu().detach().numpy()
    prior_log_variance = prior_log_variance.cpu().detach().numpy()

    file_io.save_dir(file_io.save_bin,
                     path=os.path.join(pred_dir, 'prior'),
                     data=prior_mean,
                     file_ids=self.pseudo_input_names)

    metrics = self.metrics.metrics
    embeddings = metrics['embeddings'].result().detach().cpu().numpy()
    names = metrics['name'].result()

    # Names and classes are at a sentence level, change these to segment level for use in the scatter plot.
    n_segments = metrics['n_segments'].result().detach().cpu().numpy().squeeze(1)
    segment_names = []
    for sentence_idx, n_segment in enumerate(n_segments):
        sentence_name = names[sentence_idx]
        segment_names.extend(f'{sentence_name}_{segment_idx}' for segment_idx in range(n_segment))

    segment_mean_F0 = metrics['segment_mean_F0'].result().detach().cpu().numpy().squeeze(1)

    title = out_dir.split('experiments/')[-1]
    for projection in ['PCA', 'tSNE']:
        plot_path = os.path.join(out_dir, f'scatter_{projection}_mean_F0.pdf')
        viz.scatter_plot(embeddings, segment_names,
                         prior_mean, self.pseudo_input_names,
                         gradients=segment_mean_F0,
                         gradient_title='Mean phrase F0 (Hz)',
                         projection=projection,
                         title=title,
                         out_file=plot_path)

    # Score every embedding against every prior component (broadcast over a new middle axis), then pick the
    # highest scoring component for each embedding.
    z = embeddings[:, None, :]
    mean = prior_mean[None, :, :]
    log_variance = prior_log_variance[None, :, :]
    densities = np.sum(-0.5 * (log_variance + (z - mean)**2 / np.exp(log_variance)), axis=-1)
    posterior_classes = np.argmax(densities, axis=-1)

    file_io.save_dir(file_io.save_txt,
                     os.path.join(pred_dir, 'classes'),
                     posterior_classes,
                     segment_names,
                     feat_ext='txt')

    assigned_classes = posterior_classes.reshape(-1).tolist()
    counts = np.array([(component, assigned_classes.count(component))
                       for component in range(self.n_components)])
    file_io.save_txt(counts, os.path.join(pred_dir, 'class_counts.txt'))
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file, upsample, subphone_feat_type,
            trim_silences, calculate_normalisation, normalisation_of_deltas):
    """Extracts linguistic and acoustic features for all files in id_list and saves them under out_dir.

    For each file id the following are written to sub-directories of `out_dir`:
        `lab` (.npy): numerical labels, `counters` (.npy): counter features, `dur` (.txt): phone durations,
        `phones` (.txt): phone identities, `n_frames` (.txt), `n_phones` (.txt),
        `lf0` (.npy): log-F0, `vuv` (.npy), `mcep` (.npy), `bap` (.npy).

    Durations from the labels are reconciled with the number of vocoder frames, and all frame-level features are
    cropped to the same length before saving.

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to automatically calculate normalisation parameters (min-max for
            `lab` and `counters`, MVN for `dur`, `lf0`, `mcep`, and `bap`) after extracting the features.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.

    Raises:
        ValueError: If the labels contain more excess frames (relative to the vocoder features) than there are
            phones, as this cannot be fixed by removing one frame per phone.
    """
    file_ids = utils.get_file_ids(id_list=id_list)
    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames', 'n_phones', 'lf0', 'vuv', 'mcep', 'bap'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        # Without trimming, this slice still crops frame-level features to the (possibly reduced) frame count.
        trim_frame_slice = slice(0, n_frames)
        if trim_silences:
            start_phone_idx, end_phone_idx = 0, n_phones
            start_frame_idx, end_frame_idx = 0, n_frames

            # `durations` has shape (n_phones, 1), index down to a scalar so the frame indices stay plain ints
            # (converting a size-1 array with `int()` is deprecated in recent NumPy versions).
            if phones[0] in ['sil', '#']:
                start_phone_idx += 1
                start_frame_idx += int(durations[0, 0])

            if phones[-1] in ['sil', '#']:
                end_phone_idx -= 1
                end_frame_idx -= int(durations[-1, 0])

            trim_phone_slice = slice(int(start_phone_idx), int(end_phone_idx))
            trim_frame_slice = slice(int(start_frame_idx), int(end_frame_idx))

            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start

        # Upsampled labels are frame-level, so they must always be cropped alongside the other frame-level
        # features; otherwise they keep the excess frames removed from the durations above.
        if upsample:
            numerical_labels = numerical_labels[trim_frame_slice]
        elif trim_silences:
            numerical_labels = numerical_labels[trim_phone_slice]

        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        file_io.save_bin(numerical_labels.astype(np.float32), os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features.astype(np.float32), os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        file_io.save_txt(durations, os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

        file_io.save_bin(lf0.astype(np.float32), os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))
        file_io.save_bin(mcep.astype(np.float32), os.path.join(out_dir, 'mcep', f'{file_id}.npy'))
        file_io.save_bin(bap.astype(np.float32), os.path.join(out_dir, 'bap', f'{file_id}.npy'))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False, out_dir=out_dir)

        process_mvn(out_dir, 'lf0', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
        process_mvn(out_dir, 'mcep', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
        process_mvn(out_dir, 'bap', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample, subphone_feat_type,
            calculate_normalisation):
    """Processes label files in id_list, saves the numerical labels, counter features, and durations to file.

    For each file id the following are written to sub-directories of `out_dir`:
        `lab` (.npy), `counters` (.npy), `dur` (.dur), `phones` (.txt), `n_frames` (.txt), `n_phones` (.txt).

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames', 'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    def out_path(feat_name, file_name):
        # Location of one output file within the feature's sub-directory.
        return os.path.join(out_dir, feat_name, file_name)

    for file_id in file_ids:
        label = Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        file_io.save_bin(numerical_labels, out_path('lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features, out_path('counters', f'{file_id}.npy'))
        # NOTE(review): durations are written with a '.dur' extension, while every other text output of this
        # function uses '.txt' -- confirm `process_mvn(..., is_npy=False)` below looks for this extension.
        file_io.save_txt(durations, out_path('dur', f'{file_id}.dur'))
        file_io.save_lines(phones, out_path('phones', f'{file_id}.txt'))

        file_io.save_txt(np.sum(durations).item(), out_path('n_frames', f'{file_id}.txt'))
        file_io.save_txt(len(phones), out_path('n_phones', f'{file_id}.txt'))

    if not calculate_normalisation:
        return

    process_minmax(out_dir, 'lab', id_list)
    process_minmax(out_dir, 'counters', id_list)
    process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False)
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Splits each utterance into segments of words, saves the length of each segment in phones and in frames.

    For each file id the following are written to sub-directories of `out_dir`:
        `segment_n_phones` (.txt): number of phones in each segment.
        `segment_n_frames` (.txt): number of frames in each segment.
        `n_segments` (.txt): number of segments in the utterance.

    Utterances for which `segment_words` raises a ValueError or IndexError are reported with `print` and skipped.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing the label files from which the POS tags are extracted.
        wav_dir (str): Directory containing the wav files, used to get the number of vocoder frames.

    Raises:
        ValueError: If the labels contain more excess frames (relative to the vocoder features) than there are
            phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    for feat_name in ('segment_n_phones', 'segment_n_frames', 'n_segments'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        # POS tags are taken from the labels in `lab_dir_with_pos`.
        label_with_pos = file_io.load_lines(os.path.join(lab_dir_with_pos, f'{file_id}.lab'))
        pos_word_start_idxs, _ = get_word_idxs(
            label_with_pos, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(label_with_pos, pos_word_start_idxs)

        # Durations and word boundaries are taken from the labels in `lab_dir`.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)
        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))

        try:
            segment_start_idxs, segment_end_idxs = segment_words(word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as e:
            print(f'{e}\n{file_id}')
            continue

        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)
        n_vocoder_frames = f0.shape[0]

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - n_vocoder_frames
        if diff > n_phones:
            raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                             f'\tlabel frames {n_frames}\n'
                             f'\tvocoder frames {f0.shape[0]}\n'
                             f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = n_vocoder_frames
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        segment_bounds = list(zip(segment_start_idxs, segment_end_idxs))
        segment_phone_lens = [end_idx - start_idx for start_idx, end_idx in segment_bounds]
        segment_frame_lens = [sum(durations[start_idx:end_idx]) for start_idx, end_idx in segment_bounds]

        file_io.save_txt(segment_phone_lens, os.path.join(out_dir, 'segment_n_phones', f'{file_id}.txt'))
        file_io.save_txt(segment_frame_lens, os.path.join(out_dir, 'segment_n_frames', f'{file_id}.txt'))
        file_io.save_txt(len(segment_phone_lens), os.path.join(out_dir, 'n_segments', f'{file_id}.txt'))