Example #1
def _process_utterance(out_dir, index, wav_path, text, silence_threshold,
                       fft_size):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.input_type != "raw":
        # Mu-law quantize
        out = P.mulaw_quantize(wav)

        # Trim silences
        start, end = audio.start_and_end_indices(out, silence_threshold)
        out = out[start:end]
        wav = wav[start:end]
        constant_value = P.mulaw_quantize(0, 256)
        out_dtype = np.int16
    else:
        # Keep the raw float waveform; no mu-law quantization or silence trimming.
        out = wav
        constant_value = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_value)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Use the file name without extension as the utterance id.
    wav_id = os.path.basename(wav_path).split('.')[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return audio_filename, mel_filename, timesteps, text
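The preprocessing above guarantees that the saved audio is exactly mel_frames * hop_size samples long, which is the invariant Example #3 below asserts. A minimal check of that relationship, assuming the two .npy files written above exist; out_dir, wav_id and the hop size of 256 are illustrative values, the real hop size comes from audio.get_hop_size():

import os
import numpy as np

out_dir = "training_data"   # illustrative output directory
wav_id = "arctic_a0001"     # illustrative utterance id
hop_size = 256              # illustrative; the real value comes from audio.get_hop_size()

out = np.load(os.path.join(out_dir, "{}-audio.npy".format(wav_id)))
mel = np.load(os.path.join(out_dir, "{}-mel.npy".format(wav_id)))

# One hop_size block of audio samples per mel frame, with no remainder.
assert len(out) == mel.shape[0] * hop_size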
Example #2
def _process_utterance(
    out_dir,
    index,
    speaker_id,
    wav_path,
    text,
    silence_threshold,
    fft_size,
):
    sr = hparams.sample_rate

    # Load the audio to a numpy array (resampled if needed):
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available.
    # TODO: the "and False" below intentionally disables the label-based trimming,
    # so the librosa energy-based trim in the else branch is always used for now.
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r),
                       mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0

    timesteps = len(quantized)

    wav_id = os.path.basename(wav_path).split('.')[0]
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
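The start_at and end_at helpers used in the disabled label-trimming branch above are not shown in this example. A plausible sketch, assuming labels iterates as (start, end, label) tuples with times in HTS 100-ns units (consistent with the 1e-7 * sr conversion above) and that a "pau" label marks silence; the tuple layout and the "pau" symbol are assumptions:

def start_at(labels):
    # Start time of the first non-silence label (or of the whole utterance).
    for start, _, label in labels:
        if label != "pau":
            return start
    return labels[0][0]


def end_at(labels):
    # End time of the last non-silence label (or of the whole utterance).
    for start, end, label in reversed(list(labels)):
        if label != "pau":
            return end
    return labels[-1][1]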
Example #3
def assert_ready_for_upsampling(x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
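In other words, the raw audio x must contain exactly audio.get_hop_size() samples per conditioning frame in c. A shape sketch of a pair that passes this check; the hop size of 256 and the 80-dimensional mel frames are illustrative assumptions:

import numpy as np

hop_size = 256      # illustrative; the real value comes from audio.get_hop_size()
n_frames = 100      # number of mel frames in the local conditioning

c = np.zeros((n_frames, 80), dtype=np.float32)       # (N, D) mel spectrogram
x = np.zeros(n_frames * hop_size, dtype=np.int16)    # hop_size audio samples per frame

# Same condition as assert_ready_for_upsampling, with the assumed hop size:
assert len(x) % len(c) == 0 and len(x) // len(c) == hop_size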
Example #4
    def thread_main(self, sess):
        stop = False
        while not stop:
            iterator = load_npy_data(self.metadata_filename, self.npy_dataroot,
                                     self.speaker_id)
            for wav, local_condition, global_condition in iterator:
                if self.coord.should_stop():
                    stop = True
                    break

                # The commented-out block below is an earlier approach that
                # force-aligned audio and local_condition; the active code
                # further down replaces it.
                # if audio.shape[0] > local_condition.shape[0]:
                #     audio = audio[:local_condition.shape[0], :]
                # else:
                #     local_condition = local_condition[:audio.shape[0], :]

                # audio = np.pad(audio, [[self.receptive_field, 0], [0, 0]], mode='constant')
                # local_condition = np.pad(local_condition, [[self.receptive_field, 0], [0, 0]], mode='constant')
                # if self.sample_size:
                #     while len(audio) > self.receptive_field:
                #         audio_piece = audio[:(self.receptive_field + self.sample_size), :]
                #         audio = audio[self.sample_size:, :]
                #
                #         local_condition_piece = local_condition[:(self.receptive_field + self.sample_size), :]
                #         local_condition = local_condition[self.sample_size:, :]
                #
                #         if self.gc_enable:
                #             sess.run(self.enqueue, feed_dict=
                #             dict(zip(self._placeholders, (audio_piece, local_condition_piece, global_condition))))
                #         else:
                #             sess.run(self.enqueue, feed_dict=
                #             dict(zip(self._placeholders, (audio_piece, local_condition_piece))))
                # else:
                #     if self.gc_enable:
                #         sess.run(self.enqueue, feed_dict=dict(zip(
                #             self._placeholders, (audio, local_condition, global_condition))))
                #     else:
                #         sess.run(self.enqueue, feed_dict=dict(zip(self._placeholders, (audio, local_condition))))

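                # Two feeding paths follow: when conditional features are
                # upsampled inside the network, audio stays at sample rate and
                # the mel frames stay at frame rate (Example #3's check applies);
                # otherwise both are brought to a common length with
                # audio.adjust_time_resolution before slicing.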
                if hparams.upsample_conditional_features:
                    wav = wav.reshape(-1, 1)
                    assert_ready_for_upsampling(wav, local_condition)
                    if self.sample_size is not None:
                        sample_size = ensure_divisible(self.sample_size,
                                                       audio.get_hop_size(),
                                                       True)
                        if wav.shape[0] > sample_size:
                            max_frames = sample_size // audio.get_hop_size()
                            s = np.random.randint(
                                0,
                                len(local_condition) - max_frames)
                            ts = s * audio.get_hop_size()
                            wav = wav[ts:ts +
                                      audio.get_hop_size() * max_frames, :]
                            local_condition = local_condition[s:s +
                                                              max_frames, :]
                            if self.gc_enable:
                                sess.run(self.enqueue,
                                         feed_dict=dict(
                                             zip(self._placeholders,
                                                 (wav, local_condition,
                                                  global_condition))))
                            else:
                                sess.run(self.enqueue,
                                         feed_dict=dict(
                                             zip(self._placeholders,
                                                 (wav, local_condition))))
                else:
                    wav, local_condition = audio.adjust_time_resolution(
                        wav, local_condition)
                    wav = wav.reshape(-1, 1)
                    if self.sample_size is not None:
                        while wav.shape[0] > self.sample_size:
                            wav_piece = wav[:(self.receptive_field +
                                              self.sample_size), :]
                            local_condition_piece = local_condition[:(
                                self.receptive_field + self.sample_size), :]
                            # Slide the window forward by sample_size so the
                            # whole utterance is eventually enqueued.
                            wav = wav[self.sample_size:, :]
                            local_condition = local_condition[self.sample_size:, :]
                            assert len(wav_piece) == len(local_condition_piece)

                            if self.gc_enable:
                                sess.run(
                                    self.enqueue,
                                    feed_dict=dict(
                                        zip(self._placeholders,
                                            (wav_piece, local_condition_piece,
                                             global_condition))))
                            else:
                                sess.run(self.enqueue,
                                         feed_dict=dict(
                                             zip(self._placeholders,
                                                 (wav_piece,
                                                  local_condition_piece))))
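The ensure_divisible helper called above is not shown in these examples. From its call site it must return a sample size that is a multiple of the hop size; a plausible sketch, with the default divisor and the rounding-up branch as assumptions:

def ensure_divisible(length, divisible_by=256, lower=True):
    # Round length to a multiple of divisible_by, downward when lower is True.
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)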