def load_spectrogram(self, audio_path, spectrogram_path, normalize, is_mel):
        """Return a mel or linear spectrogram, read from a cached file or computed from audio.

        The source is selected by `hp.cache_spectrograms`: when it is truthy, the spectrogram
        is loaded with `np.load` from `spectrogram_path`; otherwise the audio at `audio_path`
        is loaded and the spectrogram is computed by `audio.spectrogram`. Both paths are
        resolved relative to `self.root_dir`.

        Arguments:
            audio_path (string): Path to the audio file, used only when spectrograms are not cached.
            spectrogram_path (string): Path to the stored spectrogram, used only when spectrograms are cached.
            normalize (boolean): If True, the result is passed through `audio.normalize_spectrogram`.
            is_mel (boolean): If True, a mel spectrogram is expected, otherwise a linear one.

        Returns:
            The spectrogram; its first dimension is `hp.num_mels` for mel spectrograms and
            `hp.num_fft // 2 + 1` for linear ones.

        Raises:
            AssertionError: If the first dimension of the spectrogram does not match the expected size.
        """

        # obtain the spectrogram, either from the waveform or from the cached file
        if not hp.cache_spectrograms:
            waveform = audio.load(os.path.join(self.root_dir, audio_path))
            spectrogram = audio.spectrogram(waveform, is_mel)
        else:
            spectrogram = np.load(os.path.join(self.root_dir, spectrogram_path))

        # the number of frequency bins depends on the kind of the spectrogram
        if is_mel:
            expected_dimension = hp.num_mels
        else:
            expected_dimension = hp.num_fft // 2 + 1
        assert np.shape(spectrogram)[0] == expected_dimension, (
                f'Spectrogram dimensions mismatch: given {np.shape(spectrogram)[0]}, expected {expected_dimension}')

        # hand back the raw spectrogram unless normalization was requested
        if not normalize:
            return spectrogram
        return audio.normalize_spectrogram(spectrogram, is_mel)
Example #2
0
    # Read every metadata file completely into memory before any of them is rewritten.
    # Each entry of `metadata` is a tuple (directory, file name, list of records), where
    # a record is one line of the file split on '|'.
    # NOTE(review): `files_to_solve` is defined outside of this fragment -- presumably an
    # iterable of (directory, metadata file name) pairs; confirm against the enclosing function.
    metadata = []
    for d, fs in files_to_solve:
        with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
            metadata.append((d, fs, [line.rstrip().split('|') for line in f]))
    # NOTE(review): this prints all loaded records at once; looks like a leftover debug output.
    print("metadata is:::", metadata)
    print(f'Please wait, this may take a very long time.')
    for d, fs, m in metadata:
        print(f'Creating spectrograms for: {fs}')

        # NOTE(review): opening with 'w' truncates the metadata file before its records are
        # written back, so an exception raised inside the loop leaves the file incomplete.
        with open(os.path.join(d, fs), 'w', encoding='utf-8') as f:
            for i in m:
                # a record must have exactly 8 fields; the 5th and 6th field (the previously
                # stored spectrogram paths) are ignored and replaced by the paths built below
                idx, s, l, a, _, _, raw_text, ph = i
                spec_name = idx + '.npy'
                # the audio path stored in the record is resolved relative to the directory of the metadata file
                audio_path = os.path.join(d, a)
                audio_data = audio.load(audio_path)

                # NOTE(review): `spectrogram_dirs` comes from outside of this fragment; index 0 is used
                # for mel spectrograms and index 1 for linear spectrograms
                mel_path = os.path.join(spectrogram_dirs[0], spec_name)
                lin_path = os.path.join(spectrogram_dirs[1], spec_name)

                # compute and save the mel spectrogram only if the file does not exist yet
                if not os.path.exists(mel_path):
                    np.save(mel_path, audio.spectrogram(audio_data, True))

                # compute and save the linear spectrogram only if the file does not exist yet
                if not os.path.exists(lin_path):
                    np.save(lin_path, audio.spectrogram(audio_data, False))

                # write the record back with the spectrogram paths filled in
                print(
                    f'{idx}|{s}|{l}|{a}|{mel_path}|{lin_path}|{raw_text}|{ph}',
                    file=f)
    def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs, spectrograms=True, phonemes=True):
        """Create the meta-file and spectrograms (mel and linear, optionally) or phonemized utterances (optionally).
        
        Format details:
            Every line of the metadata file contains info about one dataset item.
            The line has following format 
                'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
            And the following must hold
                'id' is the zero-based index of the item, zero-padded to six digits
                'audio_file_path' can be empty if loading just spectrograms
                'mel_spectrogram_path' and 'linear_spectrogram_path' are left empty if spectrograms is False
                'text' should be carefully normalized and should contain punctuation
                'phonemized_text' can be empty if loading just raw text  
        
        Arguments:
            dataset_name (string): Name of the dataset, loaders.py should contain a function for loading with a corresponding name.
            dataset_root_dir (string): Root directory from which the dataset is built and to which the spectrograms and the meta-file are saved.
            output_metafile_name (string): Name of the output meta-file.
            audio_sample_rate (int): Sample rate of audios, used if spectrograms is set True.
            num_fft_freqs (int): Number of frequency bands used during spectrogram computation, used if spectrograms is set True.
        Keyword arguments:
            spectrograms (boolean, default True): If true, spectrograms (both mel and linear) are computed and saved.
            phonemes (boolean, default True): If true, phonemized variants of utterances are computed and saved.

        Side effects:
            While spectrograms are being computed, `hp.sample_rate` and `hp.num_fft` are temporarily
            replaced by `audio_sample_rate` and `num_fft_freqs`; the original values are restored
            before returning, even if an exception is raised.
        """

        # save current sample rate and fft freqs hyperparameters, as we may process dataset with different sample rate
        if spectrograms:
            old_sample_rate = hp.sample_rate
            old_fft_freqs = hp.num_fft

        try:
            if spectrograms:
                hp.sample_rate = audio_sample_rate
                hp.num_fft = num_fft_freqs

            # load metafiles, an item is a list like: [text, audiopath, speaker_id, language_code]
            items = loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

            # build dictionaries for translation to IPA from source languages, see utils.text for details
            # (items without a language code fall back to the first configured language)
            if phonemes:
                text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
                phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

            # prepare directories which will store spectrograms
            if spectrograms:
                spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'), 
                                    os.path.join(dataset_root_dir, 'linear_spectrograms')]
                for x in spectrogram_dirs:
                    os.makedirs(x, exist_ok=True)

            # iterate through items and build the meta-file
            num_items = len(items)
            metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
            with open(metafile_path, 'w', encoding='utf-8') as f:
                Logger.progress(0, prefix='Building metafile:')
                for i, (raw_text, audio_path, speaker, language) in enumerate(items):
                    item_id = str(i).zfill(6)
                    if language == "": language = hp.languages[0]
                    phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""
                    # a lone separator keeps both spectrogram fields present but empty
                    spectrogram_paths = "|"
                    if spectrograms:
                        spec_name = f'{item_id}.npy'
                        audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                        np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                        np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                        # paths in the meta-file are stored relative to the dataset root directory
                        spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)
                    print(f'{item_id}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
                    Logger.progress((i + 1) / num_items, prefix='Building metafile:')
        finally:
            # restore the original sample rate and fft freq values, also when the processing failed,
            # so that the overridden hyperparameters never leak into the rest of the program
            if spectrograms:
                hp.sample_rate = old_sample_rate
                hp.num_fft = old_fft_freqs