Example #1
import glob
import os

import librosa
import numpy as np

# trim_silence and melspectrogram are assumed to be project-local helpers
# defined in the surrounding preprocessing module.
def _process_utterance(out_dir, in_dir, label, speaker_name, hparams):
    wav_paths = glob.glob(os.path.join(in_dir, "*.wav"))
    if not wav_paths:
        return None

    num_samples = len(wav_paths)
    npz_dir = os.path.join(out_dir, speaker_name)
    os.makedirs(npz_dir, exist_ok=True)

    for idx, wav_path in enumerate(wav_paths):
        wav_name, ext = os.path.splitext(os.path.basename(wav_path))
        if ext == ".wav":
            wav, sr = librosa.load(wav_path, sr=hparams.sample_rate)

            # rescale wav
            if hparams.rescaling:  # hparams.rescaling = True
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:  # hparams.trim_silence = True
                wav = trim_silence(wav, hparams)  # Trim leading and trailing silence

            mel = melspectrogram(wav, hparams)
            seq_len = wav.shape[0]
            frame_len = mel.shape[1]

            file_name = wav_name
            # Save features into the per-speaker directory created above.
            np.savez(os.path.join(npz_dir, file_name), mel=mel.T, speaker=label, seq_len=seq_len, frame_len=frame_len)

    return num_samples
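
A minimal sketch of how such a helper might be called from a preprocessing script, assuming each speaker has its own sub-directory of .wav files; the directory names and the hparams object below are illustrative, not part of the snippet above.

for label, speaker_name in enumerate(sorted(os.listdir("corpus/wavs"))):
    speaker_dir = os.path.join("corpus/wavs", speaker_name)
    if os.path.isdir(speaker_dir):
        # hparams is assumed to be loaded elsewhere (e.g. from the project's hparams module).
        # Returns the number of processed utterances, or None if no wavs were found.
        n = _process_utterance("corpus/features", speaker_dir, label, speaker_name, hparams)
        print("%s: %s utterances processed" % (speaker_name, n))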
Example #2
    def say(self, txt, trim_silence=True, dyn_range_compress=True):
        """Synthesize `txt` and return the waveform as a numpy array."""

        time_start = time()

        logging.debug(u'%fs synthesizing %s' % (time() - time_start, txt))

        input_data = np.zeros((1, self.hp['max_inp_len']), dtype='int32')
        input_lengths = np.zeros((1, ), dtype='int32')

        logging.debug('input_data.shape=%s, input_lengths.shape=%s' %
                      (input_data.shape, input_lengths.shape))

        self._encode_input(txt, 0, input_data, input_lengths)

        logging.debug('input_data=%s input_lengths=%s' %
                      (input_data[0], input_lengths[0]))

        if self.write_debug_files:
            np.save('say_x', input_data[0])
            logging.debug('say_x.npy written.')
            np.save('say_xl', input_lengths[0])
            logging.debug('say_xl.npy written.')

        logging.debug(u'%fs self.session.run...' % (time() - time_start))
        spectrograms = self.sess.run(fetches=self.linear_outputs,
                                     feed_dict={
                                         self.inputs: input_data,
                                         self.input_lengths: input_lengths,
                                     })
        spectrogram = spectrograms[0]

        logging.debug('spectrogram.shape=%s' % repr(spectrogram.shape))

        if self.write_debug_files:
            np.save('say_spectrogram', spectrogram)
            logging.debug('say_spectrogram.npy written.')

        # np.set_printoptions(threshold=np.inf)

        logging.debug(u'%fs audio.inv_spectrogram...' % (time() - time_start))
        wav = audio.inv_spectrogram(spectrogram.T, self.hp, use_fgla=True)

        if dyn_range_compress:
            logging.debug(u'%fs dynamic range compression...' %
                          (time() - time_start))
            wav = audio.dyn_range_compress(wav, self.hp)

        if trim_silence:
            logging.debug(u'%fs trim silence...' % (time() - time_start))
            wav = audio.trim_silence(wav, self.hp)

        logging.debug(u'%fs wav.' % (time() - time_start))
        return wav
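
A hedged usage sketch: synthesize a sentence and write the returned float waveform as 16-bit PCM. The tts object and the 'sample_rate' hyper-parameter key are assumptions for illustration; only the say() signature comes from the snippet above.

import numpy as np
from scipy.io import wavfile

# tts is assumed to be an already-constructed synthesizer exposing say() as above.
wav = tts.say(u'Hello world.', trim_silence=True, dyn_range_compress=True)
pcm = (np.clip(wav, -1.0, 1.0) * 32767).astype(np.int16)
wavfile.write('hello.wav', tts.hp['sample_rate'], pcm)  # 'sample_rate' key is an assumption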
Example #3
def _process_utterance(out_dir, in_dir, label, speaker_name, hparams):
    wav_paths = glob.glob(os.path.join(in_dir, "*.wav"))
    if not wav_paths:
        return None

    num_samples = len(wav_paths)
    train_utter_num = (num_samples // 10) * 9  # 90/10 train/test split
    print("[%s] train : %d, test : %d" %
          (speaker_name, train_utter_num, num_samples - train_utter_num))

    npz_dir = os.path.join(out_dir, speaker_name)
    os.makedirs(npz_dir, exist_ok=True)

    # Set up train & test output paths
    train_path = os.path.join(npz_dir, "train")
    test_path = os.path.join(npz_dir, "test")
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)

    for idx, wav_path in enumerate(wav_paths):
        wav_name, ext = os.path.splitext(os.path.basename(wav_path))
        if ext == ".wav":
            wav, sr = librosa.load(wav_path, sr=hparams.sample_rate)

            # rescale wav
            if hparams.rescaling:  # hparams.rescaling = True
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:  # hparams.trim_silence = True
                wav = trim_silence(
                    wav, hparams)  # Trim leading and trailing silence

            mel = melspectrogram(wav, hparams)
            seq_len = wav.shape[0]
            frame_len = mel.shape[1]

            # data output dir
            if idx < train_utter_num:
                data_out_dir = train_path
            else:
                data_out_dir = test_path
            file_name = wav_name
            np.savez(os.path.join(data_out_dir, file_name),
                     mel=mel.T,
                     speaker=label,
                     seq_len=seq_len,
                     frame_len=frame_len)

    return num_samples
Example #4
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of Chinese spoken in the input audio file

    Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Rescale wav so all clips share the same peak level
    wav = wav / np.abs(wav).max() * 0.999

    # trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)
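
A possible caller, sketching how the returned tuples are typically collected into train.txt; the metadata list, delimiter, and output directory below are assumptions for illustration, not part of the snippet above.

with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
    # metadata is assumed to be a list of (wav_path, pinyin) pairs read from the corpus.
    for index, (wav_path, pinyin) in enumerate(metadata, start=1):
        row = _process_utterance(out_dir, index, wav_path, pinyin)
        if row is not None:  # None means the clip exceeded hp.max_frame_num
            f.write('|'.join(str(x) for x in row) + '\n')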
def _process_utterance(mel_dir,
                       linear_dir,
                       wav_dir,
                       index,
                       wav_path,
                       text,
                       hparams,
                       step_factor=1):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate * step_factor)
        if step_factor > 1:
            wav = wav[::step_factor]  # decimate back down to hparams.sample_rate
        audio_time = len(wav) / hparams.sample_rate
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # Trim leading/trailing silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        #Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        # Zero pad the audio signal (matching librosa's framing to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of the hop size so that a transposed convolution can be used
    # to upsample the mel spectrogram.
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, audio_filename, mel_filename, linear_filename,
            time_steps, mel_frames, audio_time, text, len(text))