def build_phoneme_dicts(text_lang_pairs):
    """Create per-language dictionaries mapping words to their phonemized form.

    Each text is stripped of punctuation, split on whitespace, and every
    word not yet seen for its language is phonemized once and cached.

    Arguments:
        text_lang_pairs (list of (string, string)): Pairs of (text, language).
            Must support `len()`; may be empty, in which case an empty
            dictionary is returned.

    Returns:
        dict: Maps each language to a dict of {word: phonemized word}.
    """
    dictionaries = {}
    total = len(text_lang_pairs)
    # Report a literal zero instead of `0 / total`, which would raise
    # ZeroDivisionError for an empty input list.
    Logger.progress(0, prefix='Building phoneme dictionary:')
    for i, (utterance, lang) in enumerate(text_lang_pairs):
        lang_dict = dictionaries.setdefault(lang, {})
        for word in remove_punctuation(utterance).split():
            if word not in lang_dict:
                # The last character of the phonemizer output is dropped;
                # presumably a trailing separator -- confirm against _phonemize.
                lang_dict[word] = _phonemize(word, lang)[:-1]
        Logger.progress((i + 1) / total, prefix='Building phoneme dictionary:')
    return dictionaries

def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs,
                     spectrograms=True, phonemes=True):
    """Create the meta-file and, optionally, spectrograms (mel and linear) and phonemized utterances.

    Format details:
        Every line of the metadata file contains info about one dataset item.
        The line has the following format:
            'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
        And the following must hold:
            'audio_file_path' can be empty if loading just spectrograms
            'text' should be carefully normalized and should contain punctuation
            'phonemized_text' can be empty if loading just raw text

    Arguments:
        dataset_name (string): Name of the dataset, loaders.py should contain a function for loading
            with a corresponding name.
        dataset_root_dir (string): Root directory from which the dataset is built and to which the
            spectrograms and the meta-file are saved.
        output_metafile_name (string): Name of the output meta-file.
        audio_sample_rate (int): Sample rate of audios, used if spectrograms is set True.
        num_fft_freqs (int): Number of frequency bands used during spectrogram computation,
            used if spectrograms is set True.
    Keyword arguments:
        spectrograms (boolean, default True): If true, spectrograms (both mel and linear) are computed and saved.
        phonemes (boolean, default True): If true, phonemized variants of utterances are computed and saved.

    Side effects:
        Temporarily overrides `hp.sample_rate` and `hp.num_fft` while spectrograms are computed;
        the original values are restored even if processing fails.
    """

    # save current sample rate and fft freqs hyperparameters, as we may process dataset with different sample rate
    if spectrograms:
        old_sample_rate = hp.sample_rate
        hp.sample_rate = audio_sample_rate
        old_fft_freqs = hp.num_fft
        hp.num_fft = num_fft_freqs

    try:
        # load metafiles, an item is a list like: [text, audiopath, speaker_id, language_code]
        items = loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

        # build dictionaries for translation to IPA from source languages, see utils.text for details
        if phonemes:
            # items without a language code fall back to the first configured language
            text_lang_pairs = [(item[0], hp.languages[0] if item[3] == "" else item[3]) for item in items]
            phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

        # prepare directories which will store spectrograms
        if spectrograms:
            spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'),
                                os.path.join(dataset_root_dir, 'linear_spectrograms')]
            for directory in spectrogram_dirs:
                os.makedirs(directory, exist_ok=True)

        # iterate through items and build the meta-file
        metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
        total = len(items)
        with open(metafile_path, 'w', encoding='utf-8') as f:
            Logger.progress(0, prefix='Building metafile:')
            for i, (raw_text, audio_path, speaker, language) in enumerate(items):
                item_id = str(i).zfill(6)
                if language == "":
                    language = hp.languages[0]
                if phonemes:
                    phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language])
                else:
                    phonemized_text = ""
                # a lone separator keeps both spectrogram columns present but empty
                spectrogram_paths = "|"
                if spectrograms:
                    spec_name = f'{item_id}.npy'
                    audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                    np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                    np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                    # paths stored in the meta-file are relative to the dataset root
                    spectrogram_paths = (os.path.join('spectrograms', spec_name) + '|'
                                         + os.path.join('linear_spectrograms', spec_name))
                print(f'{item_id}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}',
                      file=f)
                Logger.progress((i + 1) / total, prefix='Building metafile:')
    finally:
        # restore the original sample rate and fft freq values, even when an error interrupted processing,
        # so the global hyperparameters are never left overridden
        if spectrograms:
            hp.sample_rate = old_sample_rate
            hp.num_fft = old_fft_freqs