Exemple #1
0
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
Exemple #2
0
def adjust_wav_lws(labels,
                   wav_length,
                   fft_size=1024,
                   hop_size=256,
                   sample_rate=22050):
    l, r = audio.lws_pad_lr(wav_length, fft_size, hop_size)
    pad_l = int(l / sample_rate * 10000000)
    labels.start_times = [x + pad_l for x in labels.start_times]
    labels.end_times = [x + pad_l for x in labels.end_times]
    labels.start_times[0] = 0
    labels.end_times[-1] = int((wav_length + l + r) / sample_rate * 10000000)
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjast time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjastment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)

    return out, mel_spectrogram, timesteps, out_dtype
Exemple #4
0
def _process_utterance(out_dir, index, wav_path, text, silence_threshold,
                       fft_size):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjast time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r),
                       mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjastment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0

    timesteps = len(quantized)

    wav_id = wav_path.split('/')[-1].split('.')[0]
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def wavenet_data():
    out = P.mulaw_quantize(wav, hparams.quantize_channels)
    out8 = P.mulaw_quantize(wav, 256)
    # WAVENENT TRANFSORMATIONS
    # Mu-law quantize

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    import matplotlib.pyplot as plt

    plt.subplot(3, 1, 1)
    specshow(mel_spectrogram.T, sr=20000, hop_length=hparams.hop_size)
    plt.subplot(3, 1, 2)
    plt.plot(out)
    plt.xlim(0, len(out))
    plt.subplot(3, 1, 3)
    plt.plot(wav)
    plt.xlim(0, len(wav))
    plt.show()

    out /= out.max()
Exemple #6
0
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
Exemple #7
0
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance' : So split the wav into chunks

    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * \
            n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it
            # to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out,
                                                     hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution
        # between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (
            index,
            chunk_idx,
        )
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (
            index,
            chunk_idx,
        )
        text_idx = '%s - %05d' % (
            text,
            chunk_idx,
        )
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32),
                allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
Exemple #8
0
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a np array:
    wav = audio.load_wav(wav_path)

    fs = hparams.sample_rate

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    out = wav
    constant_values = 0.0
    out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    p_vt = 5
    p_gl = 5
    d = 0.99
    hpfilt = 2
    preflt = p_vt + 1

    # High-pass filter speech in order to remove possible low frequency
    # fluctuations (Linear-phase FIR, Fc = 70 Hz)
    Fstop = 40  # Stopband Frequency
    Fpass = 70  # Passband Frequency
    Nfir = np.round(300 / 16000 * fs)  # FIR numerator order

    if (Nfir % 2 == 0):
        Nfir = Nfir + 1
    '''
    it is very very expensive to calculate the firls filter! However, as 
    long as the fs does not change, the firls filter does not change.
    Therefore, the computed filter is returned and can be passed to this
    function later on to avoid the calculated of the (same) filter.
    '''
    B = dsp.firls(Nfir, [0, Fstop / (fs / 2), Fpass / (fs / 2), fs / 2],
                  [0, 0, 1, 1], [1, 1])
    '''
    % Estimate the combined effect of the glottal flow and the lip radiation
    % (Hg1) and cancel it out through inverse filtering. Note that before
    % filtering, a mean-normalized pre-frame ramp is appended in order to
    % diminish ripple in the beginning of the frame. The ramp is removed after
    % filtering.
    '''
    le = np.int(len(out) / hparams.hop_size)
    glot = np.zeros([le, 254])
    vtfilter = np.zeros([le, 5])

    for j in range(le):
        w = out[hparams.hop_size * (j):hparams.hop_size * (j + 1)]

        #print(wav[(hparams.hop_size)*(i):(hparams.hop_size)*(i+1)])
        for i in range(hpfilt):
            w = dsp.lfilter(
                B, 1, np.concatenate((w, np.zeros(np.int(len(B) / 2) - 1))))
            w = w[np.int(len(B) / 2):]

        win = np.hanning(len(w))
        a = np.array([1])
        b = np.array([1, -d])
        signal = np.concatenate((np.linspace(-w[0], w[0], preflt), w))

        windowed = w * win
        t = dsp.correlate(windowed, windowed)
        l = len(win)
        try:
            Hg1 = lpcsolve(t[l:l + 1], t[l + 1:l + 2])
        except np.linalg.linalg.LinAlgError:
            Hg1 = np.zeros(1)

        y = dsp.lfilter(Hg1, 1, signal)
        y = y[preflt:]
        '''
        % Estimate the effect of the vocal tract (Hvt1) and cancel it out through
        % inverse filtering. The effect of the lip radiation is canceled through
        % intergration. Signal g1 is the first estimate of the glottal flow.
        '''

        windowedy = y * win
        r = dsp.correlate(windowedy, windowedy)
        try:
            Hvt1 = lpcsolve(r[l:l + p_vt], r[l + 1:l + p_vt + 1])
        except np.linalg.linalg.LinAlgError:
            Hvt1 = np.zeros(p_vt)
        g1 = dsp.lfilter(Hvt1, 1, signal)
        g1 = dsp.lfilter(a, b, g1)
        g1 = g1[preflt:]
        '''
        % Re-estimate the effect of the glottal flow (Hg2). Cancel the contribution
        % of the glottis and the lip radiation through inverse filtering and
        % integration, respectively.
        '''
        windowedg1 = w * g1
        u = dsp.correlate(windowedg1, windowedg1)
        try:
            Hg2 = lpcsolve(u[l:l + p_gl], u[l + 1:l + p_gl + 1])
        except np.linalg.linalg.LinAlgError:
            Hg2 = np.zeros(p_gl)
        y = dsp.lfilter(Hg2, 1, signal)
        y = dsp.lfilter(a, b, y)
        y = y[preflt:]
        '''
        % Estimate the model for the vocal tract (Hvt2) and cancel it out through
        % inverse filtering. The final estimate of the glottal flow is obtained
        % through canceling the effect of the lip radiation.
        '''
        windowedynew = y * win
        t = dsp.correlate(windowedynew, windowedynew)
        try:
            Hvt2 = lpcsolve(t[l:l + p_vt], t[l + 1:l + p_vt + 1])
        except np.linalg.linalg.LinAlgError:
            Hvt2 = np.zeros(p_vt)
        dg = dsp.lfilter(Hvt2, 1, signal)
        g = dsp.lfilter(a, b, dg)
        g = g[preflt:]
        dg = dg[preflt:]

        # Set vocal tract model to 'a' and glottal source spectral model to 'ag'
        a = Hvt2
        ag = Hg2

        #print(j)
        glot[j - 1] = g.T
        vtfilter[j - 1] = a.T

    glot = np.reshape(glot, [254 * le])
    #print(glot)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    glot_filename = 'ljspeech-glot-%05d.npy' % index
    vt_filename = 'ljspeech-vt-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, glot_filename),
            glot.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, vt_filename),
            vtfilter.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, glot_filename, vt_filename,
            timesteps, text)
Exemple #9
0
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)
    
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate
    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0]+'.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version
    
    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    
    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
        return None
    
        # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())
    
    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()
    
    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    
    timesteps = len(out)
    
    # Write the spectrograms to disk: 
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    out_filename = 'audio-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, out_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (out_filename, mel_filename, timesteps, text)
Exemple #10
0
def _process_utterance(wav_path, out_dir):
    fname = wav_path.split(os.sep)[-1].split(".")[0]
    audio_filename = '{}_resolved.npy'.format(fname)
    mel_filename = '{}_mel.npy'.format(fname)
    apth = os.path.join(out_dir, audio_filename)
    mpth = os.path.join(out_dir, mel_filename)
    if os.path.exists(apth) and os.path.exists(mpth):
        print("File {} already processed".format(wav_path))
        return

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    np.save(apth,
            out.astype(out_dtype), allow_pickle=False)
    np.save(mpth,
            mel_spectrogram.astype(np.float32), allow_pickle=False)
Exemple #11
0
def _process_utterance(
    out_dir,
    index,
    speaker_id,
    wav_path,
    text,
    silence_threshold,
    fft_size,
):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjast time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r),
                       mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjastment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0

    timesteps = len(quantized)

    wav_id = wav_path.split('/')[-1].split('.')[0]
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(out_dir, index, wav_path, text, sample_rate, fft_size,
                       hop_size, n_mels, redis_connection):
    # Load the audio to a numpy array:
    wav = load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    # this really gets called if input_type in hparams
    # is changed from raw to mulaw
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    # mel_spectrogram =
    # audio.melspectrogram(wav, 22050, 1024, 40).astype(np.float32).T
    # change this line to adjust hyperparams
    mel_spectrogram = melspectrogram(wav, sample_rate, fft_size, hop_size,
                                     n_mels).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution
    # between audio and mel-spectrogram
    l, r = lws_pad_lr(wav, fft_size, hop_size)

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    # assert len(out) >= N * audio.get_hop_size()
    assert len(out) >= N * hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    # out = out[:N * audio.get_hop_size()]
    # assert len(out) % audio.get_hop_size() == 0
    out = out[:N * hop_size]
    assert len(out) % hop_size == 0

    timesteps = len(out)
    # compute example reconstruction
    # change this line to adjust hparams
    # signal = audio.inv_mel_spectrogram(mel_spectrogram,
    # sample_rate, fft_size, n_mels)

    # mel_spectrogram = mel_spectrogram.T

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # recon_audio_filename = 'ljspeech-audio-%05d.wav' % index
    data = out.tobytes()
    target = np.asarray(text).tobytes()
    redis_connection.set(index, data + target)
    # np.save(os.path.join(out_dir, audio_filename),
    # out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    # mel_spectrogram.astype(np.float32), allow_pickle=False)
    # audio.save_wav(signal, os.path.join(out_dir, recon_audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)