def load_spectrogram(self, audio_path, spectrogram_path, normalize, is_mel):
    """Load a mel or linear spectrogram from file, or compute it from scratch if needed.

    Arguments:
        audio_path (string): Path to the audio from which the spectrogram is (possibly) computed.
        spectrogram_path (string): Path to the spectrogram file which is (possibly) loaded.
        normalize (boolean): If True, the spectrogram is normalized (per channel, subtract the mean and divide by the std).
        is_mel (boolean): If True, a mel spectrogram is loaded or computed, otherwise a linear spectrogram is returned.
    """

    # load the cached spectrogram or compute it from the audio file
    if hp.cache_spectrograms:
        full_spec_path = os.path.join(self.root_dir, spectrogram_path)
        spectrogram = np.load(full_spec_path)
    else:
        full_audio_path = os.path.join(self.root_dir, audio_path)
        audio_data = audio.load(full_audio_path)
        spectrogram = audio.spectrogram(audio_data, is_mel)

    # check spectrogram dimensions
    expected_dimension = hp.num_mels if is_mel else hp.num_fft // 2 + 1
    assert np.shape(spectrogram)[0] == expected_dimension, (
        f'Spectrogram dimensions mismatch: given {np.shape(spectrogram)[0]}, expected {expected_dimension}')

    # normalize if desired
    if normalize:
        spectrogram = audio.normalize_spectrogram(spectrogram, is_mel)

    return spectrogram
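# A minimal usage sketch for load_spectrogram (illustrative only): the file names below
# are hypothetical, and `dataset` stands for an instance of the dataset class this method
# belongs to, with `root_dir` already set and hp configured.
def _example_load_spectrogram(dataset):
    mel = dataset.load_spectrogram(audio_path='audios/000000.wav',
                                   spectrogram_path='spectrograms/000000.npy',
                                   normalize=True, is_mel=True)
    # the first dimension equals hp.num_mels (or hp.num_fft // 2 + 1 when is_mel=False)
    return mel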
# read the existing metafiles; each line holds the pipe-separated fields of one dataset item
metadata = []
for d, fs in files_to_solve:
    with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
        metadata.append((d, fs, [line.rstrip().split('|') for line in f]))

print('Please wait, this may take a very long time.')

# rewrite each metafile, computing and caching any missing mel and linear spectrograms
for d, fs, m in metadata:
    print(f'Creating spectrograms for: {fs}')
    with open(os.path.join(d, fs), 'w', encoding='utf-8') as f:
        for i in m:
            # fields: id, speaker, language, audio path, mel path, linear path, raw text, phonemes
            idx, s, l, a, _, _, raw_text, ph = i
            spec_name = idx + '.npy'
            audio_path = os.path.join(d, a)
            audio_data = audio.load(audio_path)
            mel_path = os.path.join(spectrogram_dirs[0], spec_name)
            lin_path = os.path.join(spectrogram_dirs[1], spec_name)
            if not os.path.exists(mel_path):
                np.save(mel_path, audio.spectrogram(audio_data, True))
            if not os.path.exists(lin_path):
                np.save(lin_path, audio.spectrogram(audio_data, False))
            print(f'{idx}|{s}|{l}|{a}|{mel_path}|{lin_path}|{raw_text}|{ph}', file=f)
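# The block above assumes `files_to_solve` (a list of (directory, metafile_name) pairs)
# and `spectrogram_dirs` (the mel and linear output directories) are prepared beforehand.
# A minimal sketch of that setup, with hypothetical directory and file names:
def _example_prepare_spectrogram_update(dataset_root_dir, metafile_name='all.txt'):
    files_to_solve = [(dataset_root_dir, metafile_name)]
    spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'),
                        os.path.join(dataset_root_dir, 'linear_spectrograms')]
    for d in spectrogram_dirs:
        os.makedirs(d, exist_ok=True)
    return files_to_solve, spectrogram_dirs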
def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs, spectrograms=True, phonemes=True):
    """Create the meta-file and, optionally, spectrograms (mel and linear) and phonemized utterances.

    Format details:
        Every line of the metadata file describes one dataset item and has the format
        'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
        and the following must hold:
        'audio_file_path' can be empty if loading just spectrograms,
        'text' should be carefully normalized and should contain punctuation,
        'phonemized_text' can be empty if loading just raw text.

    Arguments:
        dataset_name (string): Name of the dataset; loaders.py should contain a loading function with a corresponding name.
        dataset_root_dir (string): Root directory from which the dataset is built and to which spectrograms and the meta-file are saved.
        output_metafile_name (string): Name of the output meta-file.
        audio_sample_rate (int): Sample rate of the audio files, used if spectrograms is set to True.
        num_fft_freqs (int): Number of frequency bands used during spectrogram computation, used if spectrograms is set to True.

    Keyword arguments:
        spectrograms (boolean, default True): If True, spectrograms (both mel and linear) are computed and saved.
        phonemes (boolean, default True): If True, phonemized variants of the utterances are computed and saved.
    """

    # save the current sample rate and FFT size hyperparameters, as the dataset may use different values
    if spectrograms:
        old_sample_rate = hp.sample_rate
        hp.sample_rate = audio_sample_rate
        old_fft_freqs = hp.num_fft
        hp.num_fft = num_fft_freqs

    # load metafiles, an item is a list like: [text, audio_path, speaker_id, language_code]
    items = loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

    # build dictionaries for translation to IPA from source languages, see utils.text for details
    if phonemes:
        text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
        phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

    # prepare directories which will store the spectrograms
    if spectrograms:
        spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'),
                            os.path.join(dataset_root_dir, 'linear_spectrograms')]
        for x in spectrogram_dirs:
            if not os.path.exists(x):
                os.makedirs(x)

    # iterate through items and build the meta-file
    metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
    with open(metafile_path, 'w', encoding='utf-8') as f:
        Logger.progress(0, prefix='Building metafile:')
        for i in range(len(items)):
            raw_text, audio_path, speaker, language = items[i]
            if language == "":
                language = hp.languages[0]
            phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""
            spectrogram_paths = "|"
            if spectrograms:
                spec_name = f'{str(i).zfill(6)}.npy'
                audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)
            print(f'{str(i).zfill(6)}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
            Logger.progress((i + 1) / len(items), prefix='Building metafile:')

    # restore the original sample rate and FFT size values
    if spectrograms:
        hp.sample_rate = old_sample_rate
        hp.num_fft = old_fft_freqs
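# A usage sketch for create_meta_file (illustrative only): the dataset name, directory,
# sample rate, and FFT size below are assumptions; the loader name must match a function
# provided by loaders.py for your dataset.
def _example_create_meta_file():
    create_meta_file('ljspeech', 'data/ljspeech', 'all.txt',
                     audio_sample_rate=22050, num_fft_freqs=2048,
                     spectrograms=True, phonemes=True)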