Exemple #1
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       speaker_num, lan_num, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the preprocessed audio, the mel scale spectrogram and the
    linear spectrogram to disk and returns a tuple to write to the train.txt
    file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the output filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - speaker_num: passed through unchanged into the returned tuple
        - lan_num: passed through unchanged into the returned tuple
          (presumably a language identifier -- confirm against the caller)
        - hparams: hyper parameters; this function reads sample_rate, rescale,
          rescaling_max and trim_silence, and forwards hparams to the audio
          helpers

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps,
          mel_frames, text, speaker_num, lan_num)
        - None if the wav file is missing or audio.wav2spectrograms returns
          None
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        peak = np.abs(wav).max()
        # An all-zero (silent) signal has no peak to normalise against;
        # dividing by it would turn the whole waveform into NaNs, so such a
        # signal is left untouched.
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Get spectrogram from wav
    ret = audio.wav2spectrograms(wav, hparams)
    if ret is None:
        # The helper signalled that this utterance should be skipped
        return None
    out, mel_spectrogram, linear_spectrogram, time_steps, mel_frames = ret

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(np.float32),
            allow_pickle=False)
    # Spectrograms are stored transposed relative to what the helper returns
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, speaker_num, lan_num)
Exemple #2
0
def audio_process_utterance(mel_dir, linear_dir, wav_dir, duration_dir,
                            score_dir, index, wav, durations, scores, hparams):
    """
    Preprocesses a single utterance (audio plus its durations and scores)

    This writes the preprocessed audio, the mel scale spectrogram, the linear
    spectrogram, the durations and the scores to disk and returns a tuple to
    write to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - duration_dir: the directory to write the durations into
        - score_dir: the directory to write the scores into
        - index: the numeric index to use in the output filenames
        - wav: the already loaded audio samples of the speech input
        - durations: array saved as-is into duration_dir
        - scores: array saved as-is into score_dir
        - hparams: hyper parameters; this function reads rescale and
          rescaling_max, and forwards hparams to audio.wav2spectrograms

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename,
          duration_filename, score_filename, time_steps, mel_frames)
        - None if audio.wav2spectrograms returns None
    """
    #rescale wav
    if hparams.rescale:
        peak = np.abs(wav).max()
        # An all-zero (silent) signal has no peak to normalise against;
        # dividing by it would turn the whole waveform into NaNs, so such a
        # signal is left untouched.
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    #Get spectrogram from wav
    ret = audio.wav2spectrograms(wav, hparams)
    if ret is None:
        # The helper signalled that this utterance should be skipped
        return None
    out = ret[0]
    mel_spectrogram = ret[1]
    linear_spectrogram = ret[2]
    time_steps = ret[3]
    mel_frames = ret[4]

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    duration_filename = 'duration-{}.npy'.format(index)
    score_filename = 'score-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(np.float32),
            allow_pickle=False)
    # Spectrograms are stored transposed relative to what the helper returns
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(duration_dir, duration_filename),
            durations,
            allow_pickle=False)
    np.save(os.path.join(score_dir, score_filename),
            scores,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, duration_filename,
            score_filename, time_steps, mel_frames)