Code Example #1
def _process_utterance(out_dir, index, wav_path, text):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- out-dir: the directory to write the spectograms into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file

	Returns:
		- A tuple: (mel_filename, n_frames, text)
	"""

    # Load the audio as numpy array
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav to calculate n_frames
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrogram to disk
    mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (mel_filename, n_frames, text)
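For context, a minimal caller for the function above might look like the sketch below. It is not part of the original project; it assumes an LJSpeech-style metadata.csv whose '|'-separated columns are id, raw text, and normalized text, with wav files under wavs/.

import os


def preprocess_ljspeech(in_dir, out_dir):
    """Hypothetical driver: runs _process_utterance over every metadata.csv entry."""
    os.makedirs(out_dir, exist_ok=True)
    metadata = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            wav_name, _, normalized_text = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
            metadata.append(_process_utterance(out_dir, index, wav_path, normalized_text))
    return metadata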
Code Example #2
    def load_spectrogram(self, audio_path, spectrogram_path, normalize, is_mel):
        """Load a mel or linear spectrogram from file or compute from scratch if needed.
        
        Arguments:
            audio_path (string): Path to the audio from which will (possibly) be the spectrogram computed.
            spectrogram_path (string): Path to the spectrogram file which will be loaded (possibly).
            normalize (boolean): If True, the spectrogram is normalized (per channel, extract mean and divide by std).
            is_mel (boolean): If True, the mel spectrogram is loaded or computed, otherwise returns a linear spectrogram.
        """

        # load or compute spectrogram
        if hp.cache_spectrograms:
            full_spec_path = os.path.join(self.root_dir, spectrogram_path)
            spectrogram = np.load(full_spec_path)
        else:
            full_audio_path = os.path.join(self.root_dir, audio_path)
            audio_data = audio.load(full_audio_path)
            spectrogram = audio.spectrogram(audio_data, is_mel)

        # check spectrogram dimensions
        expected_dimension = hp.num_mels if is_mel else hp.num_fft // 2 + 1
        assert np.shape(spectrogram)[0] == expected_dimension, (
                f'Spectrogram dimensions mismatch: given {np.shape(spectrogram)[0]}, expected {expected_dimension}')
        
        # normalize if desired
        if normalize:
            spectrogram = audio.normalize_spectrogram(spectrogram, is_mel)

        return spectrogram
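The method above relies on a shared hyperparameter object hp; a minimal stand-in, shown only to illustrate which settings load_spectrogram reads (the real project defines its own hp and dataset class), could be:

class hp:
    """Hypothetical hyperparameter stub, not the project's real configuration."""
    cache_spectrograms = False   # if True, load precomputed .npy files instead of audio
    num_mels = 80                # expected number of mel channels
    num_fft = 1024               # FFT size, so linear spectrograms have num_fft // 2 + 1 = 513 bins

# example call, assuming `dataset` is an instance of the class defining load_spectrogram:
# mel = dataset.load_spectrogram('wavs/0001.wav', 'spectrograms/0001.npy', normalize=True, is_mel=True)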
Code Example #3
File: ljspeech.py  Project: xcmyz/VAE-Tacotron
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
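The returned tuples are typically collected and written to train.txt, one '|'-separated line per utterance. A minimal sketch of that step follows; the write_metadata helper here is hypothetical and only mirrors the docstring's description.

import os


def write_metadata(metadata, out_dir):
    """Hypothetical helper: writes (spectrogram_filename, mel_filename, n_frames, text) tuples to train.txt."""
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(str(x) for x in m) + '\n')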
Code Example #4
def _process_utterance(out_dir, index, wav_path, text):
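    """Preprocesses a single utterance: writes the linear and mel spectrograms to disk
    and returns a (spectrogram_filename, mel_filename, n_frames, text) tuple."""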

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'meta_spec_%05d.npy' % index
    mel_filename = 'meta_mel_%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Code Example #5
    metadata = []
    for d, fs in files_to_solve:
        with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
            metadata.append((d, fs, [line.rstrip().split('|') for line in f]))
    print("metadata is:::", metadata)
    print(f'Please wait, this may take a very long time.')
    for d, fs, m in metadata:
        print(f'Creating spectrograms for: {fs}')

        with open(os.path.join(d, fs), 'w', encoding='utf-8') as f:
            for i in m:
                idx, s, l, a, _, _, raw_text, ph = i
                spec_name = idx + '.npy'
                audio_path = os.path.join(d, a)
                audio_data = audio.load(audio_path)

                mel_path = os.path.join(spectrogram_dirs[0], spec_name)
                lin_path = os.path.join(spectrogram_dirs[1], spec_name)

                if not os.path.exists(mel_path):
                    np.save(mel_path, audio.spectrogram(audio_data, True))

                if not os.path.exists(lin_path):
                    np.save(lin_path, audio.spectrogram(audio_data, False))

                print(
                    f'{idx}|{s}|{l}|{a}|{mel_path}|{lin_path}|{raw_text}|{ph}',
                    file=f)
Code Example #6
    def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs, spectrograms=True, phonemes=True):
        """Create the meta-file and spectrograms (mel and linear, optionally) or phonemized utterances (optionally).
        
        Format details:
            Every line of the metadata file contains info about one dataset item.
            The line has following format 
                'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
            And the following must hold
                'audio_file_path' can be empty if loading just spectrograms
                'text' should be carefully normalized and should contain interpunction
                'phonemized_text' can be empty if loading just raw text  
        
        Arguments:
            dataset_name (string): Name of the dataset, loaders.py should contain a function for loading with a corresponding name.
            dataset_root_dir (string): Root directory from which is the dataset build and to which are spectrograms and the meta-file saved..
            output_metafile_name (string): Name of the output meta-file.
            audio_sample_rate (int): Sample rate of audios, used if spectrograms is set True.
            num_fft_freqs (int): Number of frequency bands used during spectrogram computation, used if spectrograms is set True.
        Keyword arguments:
            spectrograms (boolean, default True): If true, spetrograms (both mel and linear) are computed and saved.
            phonemes (boolean, default True): If true, phonemized variants of utterances are computed and saved.
        """

        # save current sample rate and fft freqs hyperparameters, as we may process dataset with different sample rate
        if spectrograms:
            old_sample_rate = hp.sample_rate
            hp.sample_rate = audio_sample_rate
            old_fft_freqs = hp.num_fft
            hp.num_fft = num_fft_freqs

        # load metafiles, an item is a list like: [text, audiopath, speaker_id, language_code]
        items = loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

        # build dictionaries for translation to IPA from source languages, see utils.text for details
        if phonemes:
            text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
            phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

        # prepare directories which will store spectrograms
        if spectrograms:
            spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'), 
                                os.path.join(dataset_root_dir, 'linear_spectrograms')]
            for x in spectrogram_dirs:
                if not os.path.exists(x): os.makedirs(x)

        # iterate through items and build the meta-file
        metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
        with open(metafile_path, 'w', encoding='utf-8') as f:
            Logger.progress(0, prefix='Building metafile:')
            for i in range(len(items)):
                raw_text, audio_path, speaker, language = items[i]   
                if language == "": language = hp.languages[0]
                phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""     
                spectrogram_paths = "|"
                if spectrograms:
                    spec_name = f'{str(i).zfill(6)}.npy'                 
                    audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                    np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                    np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                    spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)
                print(f'{str(i).zfill(6)}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
                Logger.progress((i + 1) / len(items), prefix='Building metafile:')
        
        # restore the original sample rate and fft freq values
        if spectrograms:
            hp.sample_rate = old_sample_rate
            hp.num_fft = old_fft_freqs
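Reading the resulting meta-file back is then just a matter of splitting each line on '|'. The parser below is a hypothetical sketch; the field order follows the format documented in the docstring above.

def parse_metafile_line(line):
    """Hypothetical parser for one meta-file line (field order as documented above)."""
    idx, speaker, language, audio_path, mel_path, lin_path, text, phonemes = line.rstrip('\n').split('|')
    return {'id': idx, 'speaker': speaker, 'language': language, 'audio_path': audio_path,
            'mel_path': mel_path, 'lin_path': lin_path, 'text': text, 'phonemes': phonemes}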
Code Example #7
f = open(tdd_file, encoding='utf-8')
ctr = 0
for line in f:
    if len(line) > 2:
        ctr += 1
        line = line.split('\n')[0]

        fname = line.split()[0]
        phones = ' '.join(k for k in line.split()[1:])

        if generate_feats_flag:
            wav_fname = wav_dir + '/' + fname + '.wav'
            wav = audio.load_wav(wav_fname)
            max_samples = _max_out_length * 5 / 1000 * 16000
            spectrogram = audio.spectrogram(wav).astype(np.float32)
            n_frames = spectrogram.shape[1]
            mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
            lspec_fname = lspec_dir + '/' + fname + '_lspec.npy'
            mspec_fname = mspec_dir + '/' + fname + '_mspec.npy'
            np.save(lspec_fname, spectrogram.T, allow_pickle=False)
            np.save(mspec_fname, mel_spectrogram.T, allow_pickle=False)

            g = open(data_file, 'a')
            g.write(lspec_fname + '|' + mspec_fname + '|' + str(n_frames) + '| ' + phones + '\n')
            g.close()

            g = open(feats_dir + '/' + fname + '.feats', 'w')
            for phone in phones.split():
                g.write(phone + '\n')
            g.close()
Code Example #8
import os

import numpy as np

from griffin_lim import inv_spectrogram, tf
# `audio` and `hparams` are project-local modules (not shown in this snippet) that
# provide the waveform helpers and the sample-rate settings used below.

if __name__ == '__main__':
    data_foler = "data"
    wavs = [
        os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder)
        if file.endswith(".wav")
    ]
    outputs_py = [file + ".py.gen.wav" for file in wavs]
    outputs_tf = [file + ".tf.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]
    spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print("Linear spectrograms dim: ")
    print(spectrogram[0].shape)
    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(s) for s in spectrogram]

    for gen, output in zip(gens, outputs_py):
        audio.save_wav(gen, output)

    # --------------------------------- TensorFlow Version ---------------------------------

    samples = [inv_spectrogram(spec) for spec in spectrogram]

    with tf.Session() as sess:
        samples = [sess.run(sample) for sample in samples]
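    # Hypothetical follow-up, not in the original snippet: save the TensorFlow-generated
    # waveforms alongside the librosa ones, reusing audio.save_wav as in the loop above.
    for sample, output in zip(samples, outputs_tf):
        audio.save_wav(sample, output)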
Code Example #9
def main():
    argv0: str = sys.argv[0]
    if argv0:
        workdir: str = os.path.dirname(argv0)
        if workdir:
            os.chdir(workdir)
    os.chdir("data")

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, default="1a",  #
                        help="Params dataset for Training Data.")

    args = parser.parse_args()
    Params.load(f"../params/{args.dataset}.json")
    audio.hp = Params
    hop_frames: int = audio.ms_to_frames(audio.hp.stft_shift_ms)
    win_frames: int = audio.ms_to_frames(audio.hp.stft_window_ms)
    print(f"mel parameters: hop = {hop_frames:,}, win = {win_frames:,}")
    dataset_path: str = os.path.join("datasets", args.dataset)

    # as this code *alters* the train and val files, always regenerate them first!
    _: List[str] = ["python", os.path.join(dataset_path, "create_training_files.py")]
    subprocess.run(_, check=True, bufsize=0)

    files_to_solve = [(dataset_path, "train.txt"), (dataset_path, "val.txt"), ]

    mel_path: str = os.path.join(dataset_path, 'mel_spectrograms')
    os.makedirs(mel_path, exist_ok=True)

    mp3_path: str = os.path.join(dataset_path, "reference-audio")
    shutil.rmtree(mp3_path, ignore_errors=True)
    os.mkdir(mp3_path)

    mp3_bad_path: str = os.path.join(dataset_path, "reference-audio-bad")
    shutil.rmtree(mp3_bad_path, ignore_errors=True)
    os.mkdir(mp3_bad_path)

    mp3_fixed_path: str = os.path.join(dataset_path, "reference-audio-fixed")
    shutil.rmtree(mp3_fixed_path, ignore_errors=True)
    os.mkdir(mp3_fixed_path)

    metadata = []
    for d, fs in files_to_solve:
        with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
            metadata.append((d, fs, [line.rstrip().split('|') for line in f]))

    bad_silence_count: int = 0
    file_bad_entries: str = os.path.join(dataset_path, "entries-bad.txt")
    with open(file_bad_entries, "w"):
        pass

    fix_silence_count: int = 0
    file_fixed_entries: str = os.path.join(dataset_path, "entries-fixed.txt")
    with open(file_fixed_entries, "w"):
        pass

    skipped_too_short: List[str] = list()
    skipped_too_long: List[str] = list()
    spec_id: int = 0
    print(f'Please wait, this may take a very long time.')
    for d, fs, m in metadata:
        print(f'Creating spectrograms for: {fs}')
        bar: progressbar.ProgressBar = progressbar.ProgressBar(maxval=len(m))
        bar.start()
        with open(os.path.join(d, fs + "-tmp"), 'w', encoding='utf-8') as f:
            for i in m:
                idx, speaker, lang, wav, _, _, raw_text, phonemes = i

                if lang not in Params.languages:
                    continue

                raw_text = ud.normalize("NFC", raw_text)
                phonemes = ud.normalize("NFC", phonemes)

                spec_id += 1
                spec_name = f"{lang}_{speaker}-{spec_id:06d}.npy"

                mel_path_partial = os.path.join("mel_spectrograms", spec_name)
                mel_path = os.path.join(dataset_path, mel_path_partial)

                entry: str = f'{idx}|{speaker}|{lang}|{wav}|{mel_path_partial}||{raw_text}|{phonemes}'

                audio_path = os.path.join(d, wav)

                py_audio: AudioSegment = AudioSegment.from_file(audio_path)
                py_audio = py_audio.set_channels(1).set_frame_rate(Params.sample_rate)
                py_audio = effects.normalize(py_audio)
                py_audio = trim_silence(py_audio)

                # Output altered audio (compressed) for manual review
                mp3_name = f"{lang}_{speaker}-{spec_id:06d}.mp3"
                ref_audio_mp3: str = os.path.join(mp3_path, mp3_name)

                if Params.fix_silence:
                    fix_silence: int = Params.fix_silence_len
                    segments = silence.split_on_silence(py_audio,  #
                                                        min_silence_len=fix_silence,  #
                                                        silence_thresh=-50,  #
                                                        keep_silence=fix_silence / 2)
                    if len(segments) > 1:
                        new_py_audio = AudioSegment.empty()
                        for segment in segments:
                            new_py_audio = new_py_audio.append(segment, crossfade=0)
                        assert len(new_py_audio), "Empty fixed audio after recombining?"

                        py_audio = new_py_audio.set_channels(1).set_frame_rate(py_audio.frame_rate)
                        with open(file_fixed_entries, "a") as w:
                            print(entry, file=w)
                        fix_audio_mp3: str = os.path.join(mp3_fixed_path, f"fix-{mp3_name}")
                        py_audio.export(fix_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                        fix_silence_count += 1

                if Params.skip_silence:
                    max_silence: int = Params.max_silence_len
                    if silence.detect_silence(py_audio,  #
                                              min_silence_len=max_silence,  #
                                              silence_thresh=-50):
                        with open(file_bad_entries, "a") as w:
                            print(entry, file=w)
                        bad_audio_mp3: str = os.path.join(mp3_bad_path, f"bad-{mp3_name}")
                        py_audio.export(bad_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                        bad_silence_count += 1
                        continue

                if len(py_audio) < Params.audio_min_length:
                    skipped_too_short.append(entry)
                    bad_audio_mp3: str = os.path.join(mp3_bad_path, f"too-short-{mp3_name}")
                    py_audio.export(bad_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                    continue

                if len(py_audio) > Params.audio_max_length:
                    skipped_too_long.append(entry)
                    bad_audio_mp3: str = os.path.join(mp3_bad_path, f"too-long-{mp3_name}")
                    py_audio.export(bad_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                    continue

                if Params.lead_in_silence > 0:
                    # Add lead_in_silence ms of silence at the beginning
                    py_audio = AudioSegment.silent(Params.lead_in_silence) + py_audio

                if Params.lead_out_silence > 0:
                    # Add lead_out_silence ms of silence at the end
                    py_audio = py_audio + AudioSegment.silent(Params.lead_out_silence)

                if not os.path.exists(ref_audio_mp3):
                    py_audio.export(ref_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])

                py_audio_samples: np.ndarray = np.array(py_audio.get_array_of_samples()).astype(np.float32)
                # scale signed 16-bit samples into [-1.0, 1.0); 1 << (8 * 2 - 1) == 32768
                py_audio_samples = py_audio_samples / (1 << (8 * 2 - 1))
                if not os.path.exists(mel_path):
                    np.save(mel_path, audio.spectrogram(py_audio_samples, True))

                print(entry, file=f)
                bar.update(bar.currval + 1)

        print(f"Records skipped (>{Params.audio_max_length / 1000:.02f}): {len(skipped_too_long):,}")
        with open(os.path.join(d, "too-long-" + fs), "w") as w:
            for entry in skipped_too_long:
                print(entry, file=w)

        print(f"Records skipped (<{Params.audio_min_length / 1000:.02f}): {len(skipped_too_short):,}")
        with open(os.path.join(d, "too-short-" + fs), "w") as w:
            for entry in skipped_too_short:
                print(entry, file=w)

        bar.finish()

    if bad_silence_count:
        print(f"Records skipped because of excessive silence: {bad_silence_count:,}")
    if fix_silence_count:
        print(f"Records altered because of excessive silence: {fix_silence_count:,}")

    for d, fs in files_to_solve:
        tmp = os.path.join(d, fs + "-tmp")
        dst = os.path.join(d, fs)
        bkup = os.path.join(d, fs + "-bkup")

        if os.path.exists(bkup):
            os.remove(bkup)

        os.rename(dst, bkup)
        os.rename(tmp, dst)

    sys.exit()