Example #1
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition.'''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(
                        max_time_steps, audio.get_hop_size(self._hparams),
                        True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // audio.get_hop_size(
                            self._hparams)
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * audio.get_hop_size(self._hparams)
                        x = x[time_start:time_start + max_time_frames *
                              audio.get_hop_size(self._hparams)]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch

        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = audio.trim_silence(x, self._hparams)
                if max_time_steps is not None and len(x) > max_time_steps:
                    start = np.random.randint(0, len(x) - max_time_steps)
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
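The helper _ensure_divisible used above is not shown in this example; a minimal sketch of what it is assumed to do (round a length down, or up, to the nearest multiple of the hop size, so that cropped audio stays aligned with whole condition frames) could look like this:

def _ensure_divisible(length, divisible_by=256, lower=True):
    # Round length down (lower=True) or up to the nearest multiple of divisible_by.
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)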
Example #2
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the LPC features to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the LPC feature files into
      index: The numeric index to use in the feature filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (feature_filename, n_frames, pinyin) tuple to write to train.txt
    '''

    # strip prosody boundary markers (#1-#4), then convert the text to pinyin
    for tag in ("#1", "#2", "#3", "#4"):
        text = text.replace(tag, "")
    pinyin = " ".join(get_pinyin(text))

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.max(np.abs(wav)) * 0.9  # norm

    # denoise using noise profiled from both ends of the clip
    min_len = hparams.sample_rate * (hparams.length_as_noise * 2 + 0.1)
    if hparams.mmse_denoise_by_bothEndOfAudio and len(wav) > min_len:
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * hparams.length_as_noise)],
            wav[-int(hparams.sample_rate * hparams.length_as_noise):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # trim silence
    wav = audio.trim_silence(
        wav, hparams.trim_top_db)  # top_db=30 for aishell, 60 for BZNSYP
    # audio.save_wav(wav, wav_path.replace(".wav", "_trimed.wav"))

    # convert wav to 16bit int
    wav *= 32768
    wav = wav.astype(np.int16)

    # extract LPC feature
    extractor = lpcnet.FeatureExtractor()
    feat = extractor.compute_feature(wav)
    n_frames = feat.shape[0]

    # write the lpc feature to disk
    feature_filename = 'biaobei-lpc-feat-%05d.npy' % index
    np.save(os.path.join(out_dir, feature_filename), feat, allow_pickle=False)

    # Return a tuple describing this training example:
    return (feature_filename, n_frames, pinyin)
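As the docstring notes, the returned tuple is meant to be written to train.txt. A minimal single-process driver is sketched below; the metadata filename, column layout, and the build_from_path name are assumptions for illustration, and real preprocessing scripts typically parallelize this with a ProcessPoolExecutor.

import os


def build_from_path(in_dir, out_dir):
    # Hypothetical driver: read 'wav_name|text' lines, preprocess each utterance,
    # and write the returned tuples to train.txt.
    metadata = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f):
            wav_name, text = line.strip().split('|')[:2]
            wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
            result = _process_utterance(out_dir, index, wav_path, text)
            if result is not None:
                metadata.append(result)
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(str(x) for x in m) + '\n')
    return metadata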
Example #3
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      pinyin: The pinyin of the Chinese spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # rescale wav for unified measure for all clips
    wav = wav / np.abs(wav).max() * 0.999

    # trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    mel_filename = 'thchs30-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)
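Note that both spectrograms are transposed before saving, so the .npy files are frame-major (time on the first axis), which is the layout Tacotron-style data feeders typically expect. A quick shape check when loading them back (variable names reused from the example above) would be:

import os
import numpy as np

spec = np.load(os.path.join(out_dir, spectrogram_filename))  # shape: (n_frames, n_freq_bins)
mel = np.load(os.path.join(out_dir, mel_filename))           # shape: (n_frames, num_mels)
assert spec.shape[0] == mel.shape[0] == n_frames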
Example #4
def _process_utterance(out_dir, name, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      name: The name to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      hparams: The hyperparameters used to load and analyze the audio

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, hparams)

    # trim silences here
    wav = audio.trim_silence(wav, hparams)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
    mel_filename = 'mailabs-mel-{}.npy'.format(name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #5
def _process_utterance(out_dir, index, wav_path, pinyin):
    # Load the audio to a numpy array
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999

    # Trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale and mel-scale spectrograms from the wav
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write both spectrograms to disk
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example #6
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # M-AILABS extra silence specific
    wav = audio.trim_silence(wav)

    #Pre-emphasize
    wav = audio.preemphasis(wav)

    #rescale wav
    #wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        #raise RuntimeError('wav has invalid value: {}'.format(wav))
        print('file {} has invalid value. skipping!'.format(wav_path))
        return None

    #[-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_frame_num:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.spectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad, hop_size = audio.librosa_pad_lr(wav)

    #Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * hop_size

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * hop_size]
    assert len(out) % hop_size == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
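The call to audio.librosa_pad_lr above is assumed to return left/right pad lengths plus the hop size, so that the padded signal covers a whole number of hop-size frames (the property the subsequent asserts rely on). A plausible sketch of such a helper, under those assumptions rather than the project's actual code, is:

def librosa_pad_lr(wav, hop_size=256):
    # Hypothetical helper: pad only on the right so the padded signal spans a
    # whole number of hop_size frames, and return (l_pad, r_pad, hop_size).
    l_pad = 0
    r_pad = (len(wav) // hop_size + 1) * hop_size - len(wav)
    return l_pad, r_pad, hop_size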
Example #7
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """
	Preprocesses a single utterance wav/text pair
	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file
	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters
	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size())

    #Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
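The mulaw and mulaw_quantize helpers used above implement mu-law companding (usually imported from wavenet_vocoder.util or an equivalent module). A minimal sketch of that transform, not necessarily the exact library code, is:

import numpy as np


def mulaw(x, mu=256):
    # Mu-law companding: compress a [-1, 1] signal nonlinearly into [-1, 1].
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)


def mulaw_quantize(x, mu=256):
    # Map the companded signal from [-1, 1] to integer bins in [0, mu - 1].
    y = mulaw(x, mu)
    return np.clip(((y + 1) / 2 * mu).astype(np.int64), 0, mu - 1).astype(np.int16)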