Beispiel #1
0
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Trim silence from hts labels if available
    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert "sil" in labels[0][-1]
        assert "sil" in labels[-1][-1]
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution
    # between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'jsut-audio-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
Beispiel #2
0
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance' : So split the wav into chunks

    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * \
            n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it
            # to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out,
                                                     hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution
        # between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (
            index,
            chunk_idx,
        )
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (
            index,
            chunk_idx,
        )
        text_idx = '%s - %05d' % (
            text,
            chunk_idx,
        )
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32),
                allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
Beispiel #3
0
def _process_utterance(out_dir, index, wav_path, text, sample_rate, fft_size,
                       hop_size, n_mels, redis_connection):
    # Load the audio to a numpy array:
    wav = load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    # this really gets called if input_type in hparams
    # is changed from raw to mulaw
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    # mel_spectrogram =
    # audio.melspectrogram(wav, 22050, 1024, 40).astype(np.float32).T
    # change this line to adjust hyperparams
    mel_spectrogram = melspectrogram(wav, sample_rate, fft_size, hop_size,
                                     n_mels).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution
    # between audio and mel-spectrogram
    l, r = lws_pad_lr(wav, fft_size, hop_size)

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    # assert len(out) >= N * audio.get_hop_size()
    assert len(out) >= N * hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    # out = out[:N * audio.get_hop_size()]
    # assert len(out) % audio.get_hop_size() == 0
    out = out[:N * hop_size]
    assert len(out) % hop_size == 0

    timesteps = len(out)
    # compute example reconstruction
    # change this line to adjust hparams
    # signal = audio.inv_mel_spectrogram(mel_spectrogram,
    # sample_rate, fft_size, n_mels)

    # mel_spectrogram = mel_spectrogram.T

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # recon_audio_filename = 'ljspeech-audio-%05d.wav' % index
    data = out.tobytes()
    target = np.asarray(text).tobytes()
    redis_connection.set(index, data + target)
    # np.save(os.path.join(out_dir, audio_filename),
    # out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    # mel_spectrogram.astype(np.float32), allow_pickle=False)
    # audio.save_wav(signal, os.path.join(out_dir, recon_audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)