def _prepare_targets(self, targets, maxlen):
    # [batch_size, time_steps]
    if is_mulaw_quantize(self._hparams.input_type):
        y_batch = np.stack([_pad_targets(x, maxlen)
                            for x in targets]).astype(np.int32)
    else:
        y_batch = np.stack([_pad_targets(x, maxlen)
                            for x in targets]).astype(np.float32)
    assert len(y_batch.shape) == 2
    # Add a channel axis: [batch_size, time_steps, 1]
    y_batch = np.expand_dims(y_batch, axis=-1)
    return y_batch

def _prepare_inputs(self, inputs, maxlen):
    if is_mulaw_quantize(self._hparams.input_type):
        # [batch_size, time_steps, quantize_channels]
        x_batch = np.stack([
            _pad_inputs(
                np_utils.to_categorical(
                    x, num_classes=self._hparams.quantize_channels),
                maxlen) for x in inputs
        ]).astype(np.float32)
    else:
        # [batch_size, time_steps, 1]
        x_batch = np.stack([
            _pad_inputs(x.reshape(-1, 1), maxlen) for x in inputs
        ]).astype(np.float32)
    assert len(x_batch.shape) == 3
    # Convert to channels-first: [batch_size, quantize_channels (or 1), time_steps]
    x_batch = np.transpose(x_batch, (0, 2, 1))
    return x_batch
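
The `_pad_inputs` / `_pad_targets` helpers are not shown in this snippet; a minimal sketch of what they presumably do (constant-padding along the time axis up to `maxlen`; the `_pad` default value is an assumption):

import numpy as np

def _pad_inputs(x, maxlen, _pad=0):
    # Hypothetical helper: constant-pad the time axis (axis 0) out to maxlen.
    # x is [time_steps] or [time_steps, channels].
    pad_width = [(0, maxlen - len(x))] + [(0, 0)] * (x.ndim - 1)
    return np.pad(x, pad_width, mode='constant', constant_values=_pad)

def _pad_targets(x, maxlen, _pad=0):
    # Hypothetical helper: pad a 1-D target sequence out to maxlen.
    return np.pad(x, (0, maxlen - len(x)), mode='constant', constant_values=_pad)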
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """
	Preprocesses a single utterance wav/text pair
    convert audio data to:
	        - audio time serie data form (numpy array)
	        - mel + linear spectrogram matrix (numpy matrix)
	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as a numpy array
        wav = load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('File {} listed in the csv metadata was not found in the wav '
              'folder; skipping.'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Trim the extra silence specific to M-AILABS datasets
    if hparams.trim_silence:
        wav = trim_silence(wav, hparams)

    # check for Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)
        # Trim silences
        start, end = start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    # check for mu-law
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio_series_to_mel(hparams, wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    print('Debug: mel_frames={}, mel_spectrogram.shape={}, max_mel_frames={}'.format(
        mel_frames, mel_spectrogram.shape, hparams.max_mel_frames))

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio_series_to_linear(hparams,
                                                wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]
    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = pad_lr(wav, fft_size, get_hop_size(hparams))
    # Pad the signal with the value that encodes silence (constant_values)
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * get_hop_size(hparams)
    # Time resolution adjustment:
    # ensure the length of the raw audio is a multiple of the hop size so
    # that we can use transposed convolutions to upsample
    out = out[:mel_frames * get_hop_size(hparams)]
    assert len(out) % get_hop_size(hparams) == 0
    time_steps = len(out)
    # Write the spectrograms and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
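
The docstring above says the returned tuple is written to a train.txt file; a minimal sketch of a driver step that might do this (the pipe-separated format and the `write_metadata` name are assumptions, not part of this snippet):

import os

def write_metadata(metadata, out_dir):
    # Drop skipped utterances (None) and write one pipe-separated line per
    # training example, matching the tuple returned by _process_utterance.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            if m is not None:
                f.write('|'.join(str(x) for x in m) + '\n')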
def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('File {} listed in the csv metadata was not found in the wav '
              'folder; skipping.'.format(wav_path))
        return None

    # Trim the extra silence specific to M-AILABS datasets
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the pre-emphasized wav
    mel_spectrogram = audio.audio_series_to_mel(hparams,
                                                preem_wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

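    # Skip utterances whose mel spectrogram exceeds max_mel_frames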
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Pad the audio signal with the value that encodes silence
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams))

        # Constant-pad the audio signal, using librosa-style left/right pad
        # amounts to avoid frame inconsistency
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure the length of the raw audio is a multiple of the hop size so
    # that we can use transposed convolutions to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)

    # global condition features
    if hparams.gin_channels > 0:
        raise RuntimeError(
            'When activating global conditions, please set your speaker_id '
            'rules in datasets/wavenet_preprocessor.py to use them during '
            'training')
        # Placeholder, unreachable until the rule above is implemented:
        # determine how to assign speaker ids (using file names maybe? file
        # basenames are available in the "index" variable)
        speaker_id = '<no_g>'
    else:
        speaker_id = '<no_g>'

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps,
            mel_frames)
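
For reference, the `mulaw` / `mulaw_quantize` helpers called above implement standard mu-law companding. A minimal sketch under the usual convention mu = quantize_channels - 1 (the exact rounding and parameterization used by the project's helpers are assumptions):

import numpy as np

def mulaw(x, quantize_channels=256):
    # Compand a [-1, 1] signal: F(x) = sign(x) * log(1 + mu*|x|) / log(1 + mu)
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, quantize_channels=256):
    # Compand, then map [-1, 1] onto integer bins [0, quantize_channels)
    # (rounding conventions vary between implementations)
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu).astype(np.int64)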