def process(wav_dir, id_list, out_dir, calculate_normalisation, normalisation_of_deltas):
    """Extracts log-F0 and voicing for the wav files in `id_list`, and optionally MVN parameters for log-F0.

    For every file id, `<wav_dir>/<file_id>.wav` is analysed and the results are saved to
    `<out_dir>/lf0/<file_id>.npy` and `<out_dir>/vuv/<file_id>.npy`.

    Args:
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        calculate_normalisation (bool): Whether to automatically calculate MVN parameters after extracting F0.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.
    """
    file_ids = get_file_ids(wav_dir, id_list)

    make_dirs(os.path.join(out_dir, 'lf0'), file_ids)
    make_dirs(os.path.join(out_dir, 'vuv'), file_ids)

    for file_id in file_ids:
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv = analysis(wav, sample_rate)
        lf0 = np.log(f0)

        file_io.save_bin(lf0, os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))

    if calculate_normalisation:
        # The MVN routine only writes its json file when it is given an `out_dir`; without it the statistics
        # are computed and then discarded. NOTE(review): assumes `process_mvn` accepts `out_dir` -- confirm
        # against the imported mean-variance normalisation script.
        process_mvn(out_dir, 'lf0', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
def process_dir(festival_dir, txt_dir, id_list, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `id_list` and save them to `out_dir`.

    Args:
        festival_dir (str): Directory containing festival installation.
        txt_dir (str): Directory containing text transcriptions.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        custom_voice: Forwarded unchanged to `create_utterances` as `custom_voice`.
    """
    # Fix a single ordering up front. The sentences below are collected in this order, so the ids handed to
    # `create_utterances` must follow the same order or ids and sentences can end up paired incorrectly.
    file_ids = sorted(utils.get_file_ids(id_list=id_list))

    # For all file_ids load the sentence (first line of the transcription file) and escape double quotes.
    sentences = []
    for file_id in file_ids:
        sentence = file_io.load_lines(os.path.join(txt_dir, f'{file_id}.txt'))[0]
        sentences.append(sentence.replace('"', '\\"'))

    # If the file_ids are paths (e.g. for multi-speaker data), make sure the directory structure is already in place.
    utils.make_dirs(os.path.join(out_dir, 'utts'), file_ids)

    # Create and save the Utterance structures.
    create_utterances(festival_dir, file_ids, sentences, out_dir, custom_voice=custom_voice)
def process(data_dir, feat_name, id_list=None, is_npy=True, out_dir=None):
    """Calculates the min-max normalisation statistics from a directory of features.

    The statistics are written to `<out_dir>/<feat_name>_minmax.json`; nothing is saved when `out_dir` is None.

    Args:
        data_dir (str): Root directory containing folders of features.
        feat_name (str): Name of the feature to be normalised (also the name of its folder within `data_dir`).
        id_list (str): List of file names to process.
        is_npy (bool): If True uses `file_io.load_bin` on `.npy` files, otherwise uses `file_io.load_txt` on
            `.txt` files.
        out_dir (str): Location to save the normalisation parameters to.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    # Pick the loader and the matching file extension together.
    load_fn, feat_ext = (file_io.load_bin, 'npy') if is_npy else (file_io.load_txt, 'txt')
    features = file_io.load_dir(load_fn, os.path.join(data_dir, feat_name), file_ids, feat_ext=feat_ext)

    params = calculate_minmax_parameters(features)

    if out_dir is None:
        return

    file_io.save_json(params, os.path.join(out_dir, f'{feat_name}_minmax.json'))
def process(lab_dir, id_list, out_dir, state_level):
    """Saves the phone identities and the number of phones of each label file in `id_list` to text files.

    Output is written to `<out_dir>/phones/<file_id>.txt` and `<out_dir>/n_phones/<file_id>.txt`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    n_phones_dir = os.path.join(out_dir, 'n_phones')
    utils.make_dirs(phones_dir, file_ids)
    utils.make_dirs(n_phones_dir, file_ids)

    for file_id in file_ids:
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        out_name = f'{file_id}.txt'
        file_io.save_lines(label.phones, os.path.join(phones_dir, out_name))
        file_io.save_txt(len(label.phones), os.path.join(n_phones_dir, out_name))
def process(festival_dir, utt_dir, id_list, out_dir, feature_level='Segment', extra_feats_scm='extra_feats.scm',
            label_feats='label.feats', label_full_awk='label-full.awk', label_mono_awk='label-mono.awk',
            custom_voice=None):
    """Create flat HTS-style full-context labels.

    Intermediate dumps and aligned labels go to `<out_dir>/label_phone_align/{dump,full,mono}`; the sanitised
    labels go to `<out_dir>/label_no_align` and `<out_dir>/mono_no_align`.

    Args:
        festival_dir (str): Directory containing festival installation.
        utt_dir (str): Directory containing Utterance structures.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        feature_level (str): Forwarded to `utts_to_dumps`.
        extra_feats_scm (str): Forwarded to `utts_to_dumps`.
        label_feats (str): Forwarded to `utts_to_dumps`.
        label_full_awk (str): Forwarded to `dumps_to_labs` when producing the full-context labels.
        label_mono_awk (str): Forwarded to `dumps_to_labs` when producing the mono labels.
        custom_voice: Forwarded to `utts_to_dumps`.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    dumpfeats_exe = os.path.join(festival_dir, 'examples', 'dumpfeats')

    # Output layout.
    aligned_root = os.path.join(out_dir, 'label_phone_align')
    dump_dir = os.path.join(aligned_root, 'dump')
    full_dir = os.path.join(aligned_root, 'full')
    mono_dir = os.path.join(aligned_root, 'mono')
    full_no_align_dir = os.path.join(out_dir, 'label_no_align')
    mono_no_align_dir = os.path.join(out_dir, 'mono_no_align')

    # Create the flattened features and format them according to `label_full_awk` and `label_mono_awk`.
    utts_to_dumps(dumpfeats_exe, utt_dir, file_ids, dump_dir, feature_level, extra_feats_scm, label_feats,
                  custom_voice)
    dumps_to_labs(dump_dir, file_ids, full_dir, label_full_awk)
    dumps_to_labs(dump_dir, file_ids, mono_dir, label_mono_awk)

    # Clean up the full-context label features: replace initial pauses with 'sil' and remove timestamps.
    sanitise_labs(full_dir, file_ids, full_no_align_dir, include_times=False, state_level=False)
    sanitise_labs(mono_dir, file_ids, mono_no_align_dir, include_times=False, state_level=False, is_mono=True)
def __init__(self, htk_dir, lab_dir, wav_dir, id_list, out_dir):
    """Stores HTK tool paths and input locations, resolves the file ids, and prepares the layout under `out_dir`.

    Args:
        htk_dir (str): HTK installation directory; the tools are looked up in its `bin` sub-directory.
        lab_dir (str): Directory of label files, stored as `self.lab_dir`.
        wav_dir (str): Directory of wav files, stored as `self.wav_dir`.
        id_list (str): List of file basenames, resolved with `get_file_ids` and filtered by `check_file_ids`.
        out_dir (str): Root directory for the config, model, mfc and mono-label outputs.
    """
    # Each HTK executable is stored on an attribute named after the tool itself.
    htk_bin = os.path.join(htk_dir, 'bin')
    for tool in ('HCompV', 'HCopy', 'HERest', 'HHEd', 'HVite'):
        setattr(self, tool, os.path.join(htk_bin, tool))

    self.wav_dir = wav_dir
    self.lab_dir = lab_dir

    self.file_ids = get_file_ids(id_list=id_list)
    self.file_ids = self.check_file_ids(self.file_ids)

    print('---preparing environment')

    # Directories
    # -----------
    self.cfg_dir = os.path.join(out_dir, 'config')
    self.model_dir = os.path.join(out_dir, 'model')
    self.cur_dir = os.path.join(self.model_dir, 'hmm0')
    self.mfc_dir = os.path.join(out_dir, 'mfc')
    self.mono_lab_dir = os.path.join(out_dir, 'mono_no_align')

    # Creating `cur_dir` also creates its parent `model_dir`.
    for directory in (self.cfg_dir, self.cur_dir, self.mfc_dir, self.mono_lab_dir):
        os.makedirs(directory, exist_ok=True)

    # Paths
    # -----
    self.phonemes = os.path.join(out_dir, 'mono_phone.list')
    self.phoneme_map = os.path.join(out_dir, 'phoneme_map.dict')
    self.align_mlf = os.path.join(out_dir, 'mono_align.mlf')

    # HMMs
    self.proto = os.path.join(self.cfg_dir, 'proto')

    # SCP files
    self.copy_scp = os.path.join(self.cfg_dir, 'copy.scp')
    self.train_scp = os.path.join(self.cfg_dir, 'train.scp')
    self.phoneme_mlf = os.path.join(self.cfg_dir, 'mono_phone.mlf')

    # CFG
    self.cfg = os.path.join(self.cfg_dir, 'cfg')
def process(embeddings_dir, n_clusters, id_list, out_dir):
    """Loads the embedding of every file in `id_list` and passes them, stacked into one array, to `cluster`.

    Args:
        embeddings_dir (str): Directory containing the embedding files (`.npy`).
        n_clusters (int): Number of clusters for k-means.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
    """
    file_ids = get_file_ids(id_list=id_list)

    # Load the embeddings and stack them; the file ids double as the names of the clustered items.
    loaded = file_io.load_dir(file_io.load_bin, embeddings_dir, file_ids, feat_ext='npy')
    stacked = np.array(list(loaded))

    cluster(stacked, n_clusters, names=file_ids, out_dir=out_dir)
def process(data_dir, feat_name, id_list=None, is_npy=True, deltas=False, out_dir=None):
    """Calculates the mean-variance normalisation statistics from a directory of features.

    The statistics are written to `<out_dir>/<feat_name>_mvn.json` (and `<out_dir>/<feat_name>_deltas_mvn.json`
    when `deltas` is True); nothing is saved when `out_dir` is None.

    Args:
        data_dir (str): Root directory containing folders of features.
        feat_name (str): Name of the feature to be normalised (also the name of its folder within `data_dir`).
        id_list (str): List of file names to process.
        is_npy (bool): If True uses `file_io.load_bin` on `.npy` files, otherwise uses `file_io.load_txt` on
            `.txt` files.
        deltas (bool): Also calculate the MVN parameters for the delta and delta-delta features.
        out_dir (str): Location to save the normalisation parameters to.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    # Pick the loader and the matching file extension together.
    load_fn, feat_ext = (file_io.load_bin, 'npy') if is_npy else (file_io.load_txt, 'txt')
    features = file_io.load_dir(load_fn, os.path.join(data_dir, feat_name), file_ids, feat_ext=feat_ext)

    static_params, delta_params = calculate_mvn_parameters(features, deltas)

    # Possibly save the parameters to json files.
    if out_dir is None:
        return

    file_io.save_json(static_params, os.path.join(out_dir, f'{feat_name}_mvn.json'))
    if deltas:
        file_io.save_json(delta_params, os.path.join(out_dir, f'{feat_name}_deltas_mvn.json'))
def load_params(self, data_dir, data_root='.', device='cpu'):
    r"""Loads the parameters of every speaker from json and keeps both the raw and the `_to_torch` versions.

    If `self.speaker_ids` is not set yet, it is first read from `self.speaker_id_list` (relative to `data_root`).
    When `self.use_deltas` is set, a second file is loaded per speaker, using `self.name + '_deltas'` as the name.

    Parameters
    ----------
    data_dir : str
        Directory containing all data for this dataset split.
    data_root : str
        Directory root for this dataset.
    device : str or torch.device
        Name of the device to place the parameters on.
    """
    if self.speaker_ids is None:
        speaker_list_path = os.path.join(data_root, self.speaker_id_list)
        self.speaker_ids = get_file_ids(id_list=speaker_list_path)

    split_dir = os.path.join(data_root, data_dir)

    for speaker_id in self.speaker_ids:
        file_name = self.file_pattern.format(name=self.name, speaker_id=speaker_id)
        self.params[speaker_id] = self._from_json(os.path.join(split_dir, file_name))
        self.params_torch[speaker_id] = self._to_torch(self.params[speaker_id], device=device)

        if not self.use_deltas:
            continue

        delta_file_name = self.file_pattern.format(speaker_id=speaker_id, name=self.name + '_deltas')
        self.delta_params[speaker_id] = self._from_json(os.path.join(split_dir, delta_file_name))
        self.delta_params_torch[speaker_id] = self._to_torch(self.delta_params[speaker_id], device=device)
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file, upsample, subphone_feat_type,
            trim_silences, calculate_normalisation, normalisation_of_deltas):
    """Extracts label features and vocoder features for the files in `id_list` and saves them under `out_dir`.

    For each file id, `<lab_dir>/<file_id>.lab` and `<wav_dir>/<file_id>.wav` are loaded. Outputs are written to the
    `lab`, `counters`, `dur`, `phones`, `n_frames`, `n_phones`, `lf0`, `vuv`, `mcep` and `bap` sub-directories of
    `out_dir`. Label durations are shortened where needed so that they match the number of vocoder frames.

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to automatically calculate normalisation parameters (min-max for
            `lab` and `counters`, MVN for `dur`, `lf0`, `mcep` and `bap`) after feature extraction.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.

    Raises:
        ValueError: If the labels are longer than the vocoder features by more frames than there are phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)
    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames', 'n_phones', 'lf0', 'vuv', 'mcep', 'bap'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]

            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        trim_frame_slice = slice(0, n_frames)
        if trim_silences:
            start_phone_idx, end_phone_idx = 0, n_phones
            start_frame_idx, end_frame_idx = 0, n_frames

            # `durations` has shape (n_phones, 1), so pull out the scalar explicitly: adding a row would turn the
            # frame index into a 1-element array, and `int()` on such arrays is deprecated in recent NumPy.
            if phones[0] in ['sil', '#']:
                start_phone_idx += 1
                start_frame_idx += int(durations[0, 0])

            if phones[-1] in ['sil', '#']:
                end_phone_idx -= 1
                end_frame_idx -= int(durations[-1, 0])

            trim_phone_slice = slice(int(start_phone_idx), int(end_phone_idx))
            trim_frame_slice = slice(int(start_frame_idx), int(end_frame_idx))

            numerical_labels = numerical_labels[trim_frame_slice if upsample else trim_phone_slice]
            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start
        elif upsample:
            # Frame-level labels were extracted before any excess frames were cropped from the durations, so crop
            # them to the same frame range as the other frame-level features below. This is a no-op when the
            # label and vocoder frame counts already agree.
            numerical_labels = numerical_labels[trim_frame_slice]

        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        file_io.save_bin(numerical_labels.astype(np.float32), os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features.astype(np.float32), os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        file_io.save_txt(durations, os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

        file_io.save_bin(lf0.astype(np.float32), os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))
        file_io.save_bin(mcep.astype(np.float32), os.path.join(out_dir, 'mcep', f'{file_id}.npy'))
        file_io.save_bin(bap.astype(np.float32), os.path.join(out_dir, 'bap', f'{file_id}.npy'))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        # Durations are saved as text files, hence `is_npy=False`.
        process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False, out_dir=out_dir)

        process_mvn(out_dir, 'lf0', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
        process_mvn(out_dir, 'mcep', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
        process_mvn(out_dir, 'bap', id_list=id_list, deltas=normalisation_of_deltas, out_dir=out_dir)
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample, subphone_feat_type,
            calculate_normalisation):
    """Processes label files in id_list, saves the numerical labels and durations to file.

    Outputs are written to the `lab`, `counters`, `dur`, `phones`, `n_frames` and `n_phones` sub-directories of
    `out_dir`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames', 'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        file_io.save_bin(numerical_labels, os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features, os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        # Saved with a `.txt` extension: the MVN step below is run with `is_npy=False`, which reads `.txt` files,
        # so a `.dur` extension would leave it with nothing to load.
        file_io.save_txt(durations, os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

    if calculate_normalisation:
        # The normalisation routines only write their json files when given an `out_dir`.
        # NOTE(review): assumes `process_minmax`/`process_mvn` accept `out_dir` -- confirm against the imports.
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False, out_dir=out_dir)
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Splits each utterance in `id_list` into segments and saves the segment lengths in phones and in frames.

    Output is written to the `segment_n_phones`, `segment_n_frames` and `n_segments` sub-directories of `out_dir`.
    Utterances for which `segment_words` raises a ValueError or IndexError are reported and skipped.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing the label files from which the POS tags are read.
        wav_dir (str): Directory containing the wav files, used to get the number of vocoder frames.

    Raises:
        ValueError: If the labels are longer than the vocoder features by more frames than there are phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    for feat_name in ('segment_n_phones', 'segment_n_frames', 'n_segments'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        # POS tags come from the separate set of labels.
        pos_lines = file_io.load_lines(os.path.join(lab_dir_with_pos, f'{file_id}.lab'))
        pos_word_starts, _ = get_word_idxs(
            pos_lines, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(pos_lines, pos_word_starts)

        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))

        try:
            segment_start_idxs, segment_end_idxs = segment_words(word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as e:
            # Report the failure and move on to the next utterance; nothing is saved for this one.
            print(f'{e}\n{file_id}')
            continue

        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                             f'\tlabel frames {n_frames}\n'
                             f'\tvocoder frames {f0.shape[0]}\n'
                             f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]

            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        segment_bounds = list(zip(segment_start_idxs, segment_end_idxs))
        segment_phone_lens = [end_idx - start_idx for start_idx, end_idx in segment_bounds]
        segment_frame_lens = [sum(durations[start_idx:end_idx]) for start_idx, end_idx in segment_bounds]

        out_name = f'{file_id}.txt'
        file_io.save_txt(segment_phone_lens, os.path.join(out_dir, 'segment_n_phones', out_name))
        file_io.save_txt(segment_frame_lens, os.path.join(out_dir, 'segment_n_frames', out_name))
        file_io.save_txt(len(segment_phone_lens), os.path.join(out_dir, 'n_segments', out_name))