Example 1
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition
		'''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(
                        max_time_steps, audio.get_hop_size(self._hparams),
                        True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // audio.get_hop_size(
                            self._hparams)
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * audio.get_hop_size(self._hparams)
                        x = x[time_start:time_start + max_time_frames *
                              audio.get_hop_size(self._hparams)]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch
        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = audio.trim_silence(x, self._hparams)
                if max_time_steps is not None and len(x) > max_time_steps:
                    start = np.random.randint(0, len(x) - max_time_steps)
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
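
The snippet calls _ensure_divisible and _assert_ready_for_upsample without showing them. A minimal sketch of what _ensure_divisible plausibly does, judging from how it is called here (rounding max_time_steps to a multiple of the hop size); the exact helper in the repository may differ:

def _ensure_divisible(length, divisible_by=256, lower=True):
    # Round `length` to a multiple of `divisible_by` (down if lower, else up).
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)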
Example 2
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):

    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    out = mulaw_quantize(wav, hparams.quantize_channels)

    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]

    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    assert linear_frames == mel_frames

    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    # Trim so the audio length is an exact multiple of the hop size
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
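
mulaw_quantize comes from the project's util module and is not shown above. The textbook mu-law companding it presumably implements looks roughly like this (taking mu as quantize_channels - 1 is an assumption about the convention used):

import numpy as np

def mulaw(x, mu=255):
    # Mu-law companding: maps a signal in [-1, 1] onto [-1, 1].
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, mu=255):
    # Compand, then rescale [-1, 1] onto integer bins [0, mu].
    y = mulaw(x, mu)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

def inv_mulaw(y, mu=255):
    # Inverse companding: maps [-1, 1] back to [-1, 1].
    return np.sign(y) * (1.0 / mu) * ((1.0 + mu) ** np.abs(y) - 1.0)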
Example 3
	def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir):
		hparams = self._hparams
		local_cond, global_cond = self._check_conditions()

		#Get True length of audio to be synthesized: audio_len = mel_len * hop_size
		audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms]

		#Prepare local condition batch
		maxlen = max([len(x) for x in mel_spectrograms])
		#[-max, max] or [0,max]
		T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)

		if self._hparams.clip_for_wavenet:
			mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms]

		c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in mel_spectrograms]).astype(np.float32)

		if self._hparams.normalize_for_wavenet:
			#rerange to [0, 1]
			c_batch = np.interp(c_batch, T2_output_range, (0, 1))

		g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
		feed_dict = {}

		if local_cond:
			feed_dict[self.local_conditions] = c_batch
		else:
			feed_dict[self.synthesis_length] = 100

		if global_cond:
			feed_dict[self.global_conditions] = g

		#Generate wavs and clip extra padding to select Real speech parts
		generated_wavs = self.session.run(self.model.y_hat, feed_dict=feed_dict)
		generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]

		audio_filenames = []
		for i, generated_wav in enumerate(generated_wavs):
			#Save wav to disk
			audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
			save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate)
			audio_filenames.append(audio_filename)

			#Save waveplot to disk
			if log_dir is not None:
				plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
				util.waveplot(plot_filename, generated_wav, None, hparams)

		return audio_filenames
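
_pad_inputs is a small helper that is not shown; a plausible sketch, plus a concrete np.interp call mirroring the [0, 1] re-ranging above (the [-4, 4] range is only an illustration of symmetric mels):

import numpy as np

def _pad_inputs(x, maxlen, _pad=0):
    # Pad a [T, num_mels] matrix along the time axis up to `maxlen` frames.
    return np.pad(x, [(0, maxlen - len(x)), (0, 0)],
                  mode='constant', constant_values=_pad)

# Re-ranging symmetric mel values from [-max_abs, max_abs] to [0, 1]:
c = np.array([-4.0, 0.0, 4.0])
print(np.interp(c, (-4, 4), (0, 1)))  # -> [0.  0.5 1. ]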
Example 4
    def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
        '''Adjust time resolution between audio and local condition
		'''
        if local_condition:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                if len(x) < len(c) * audio.get_hop_size(self._hparams):
                    pad_length = audio.get_hop_size(
                        self._hparams) * len(c) - len(x)
                    if pad_length % 2 == 0:
                        x = np.pad(x, (pad_length // 2, pad_length // 2),
                                   mode='constant',
                                   constant_values=_pad)
                    else:
                        x = np.pad(x, (pad_length // 2, (pad_length + 1) // 2),
                                   mode='constant',
                                   constant_values=_pad)
                else:
                    c = self._pad_specs(
                        c,
                        len(x) // audio.get_hop_size(self._hparams))
                self._assert_ready_for_upsample(x, c)
                if max_time_steps is not None:
                    max_steps = _ensure_divisible(
                        max_time_steps, audio.get_hop_size(self._hparams),
                        True)
                    if len(x) > max_time_steps:
                        max_time_frames = max_steps // audio.get_hop_size(
                            self._hparams)
                        start = np.random.randint(0, len(c) - max_time_frames)
                        time_start = start * audio.get_hop_size(self._hparams)
                        x = x[time_start:time_start + max_time_frames *
                              audio.get_hop_size(self._hparams)]
                        c = c[start:start + max_time_frames, :]
                        self._assert_ready_for_upsample(x, c)

                new_batch.append((x, c, g, l))
            return new_batch
        else:
            new_batch = []
            for b in batch:
                x, c, g, l = b
                x = audio.trim(x)
                if max_time_steps is not None and len(x) > max_time_steps:
                    start = np.random.randint(0, len(x) - max_time_steps)
                    x = x[start:start + max_time_steps]
                new_batch.append((x, c, g, l))
            return new_batch
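
_assert_ready_for_upsample and _pad_specs are used but not shown. Plausible standalone versions of what they check and do, written here as free functions with an explicit hop_size argument (an assumption, since the originals are methods on the feeder):

import numpy as np

def assert_ready_for_upsample(x, c, hop_size):
    # Audio must be exactly hop_size samples per conditioning frame,
    # otherwise transposed-convolution upsampling cannot line up.
    assert len(x) % len(c) == 0 and len(x) // len(c) == hop_size

def pad_specs(c, target_frames, pad_value=0.0):
    # Pad a [T, num_mels] local-condition matrix up to target_frames frames.
    return np.pad(c, [(0, target_frames - len(c)), (0, 0)],
                  mode='constant', constant_values=pad_value)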
Example 5
def _process_utterance(wav_dir, mel_dir, index, wav_path, text, hparams):
    """
	Preprocesses a single utterance wav/text pair

	This writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (filename, time_steps, mel_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #mu-law encode the waveform
    out = encode_mu_law(wav, mu=512)

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames or len(
            text) > hparams.max_text_length:
        return None

    #Zero pad for quantized signal
    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    r = mel_frames * audio.get_hop_size(hparams) - len(wav)
    out = np.pad(out, (0, r), mode='constant', constant_values=0.)
    assert len(out) == mel_frames * audio.get_hop_size(hparams)
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    filename = '{}.npy'.format(index)
    np.save(os.path.join(wav_dir, filename),
            out.astype(np.int16),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (filename, time_steps, mel_frames, text)
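
A toy check of the zero-padding arithmetic above, assuming hop_size = 256 and a mel extractor that yields ceil(len(wav) / hop_size) frames (both are illustrative assumptions):

import numpy as np

hop_size = 256
wav = np.zeros(10_000, dtype=np.float32)
mel_frames = -(-len(wav) // hop_size)        # 40 frames (ceil division)
r = mel_frames * hop_size - len(wav)         # 240 samples of right padding
out = np.pad(wav, (0, r), mode='constant')
assert len(out) == mel_frames * hop_size     # conditioning upsamples 1:1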
Example 6
def re_save_all(wav_path, audio_filename, mel_filename, linear_filename):

    try:
        # Load the audio as numpy array
        aud = audio.load_audio(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None
    #Trim lead/trail silences
    if hparams.trim_silence:
        aud = audio.trim_silence(aud, hparams)

    #Pre-emphasize
    preem_aud = audio.preemphasis(aud, hparams.preemphasis,
                                  hparams.preemphasize)

    #rescale audio
    if hparams.rescale:
        aud = aud / np.abs(aud).max() * hparams.rescaling_max
        preem_aud = preem_aud / np.abs(preem_aud).max() * hparams.rescaling_max

        #Assert all audio is in [-1, 1]
        if (aud > 1.).any() or (aud < -1.).any():
            raise RuntimeError('audio has invalid value: {}'.format(wav_path))
        if (preem_aud > 1.).any() or (preem_aud < -1.).any():
            raise RuntimeError('audio has invalid value: {}'.format(wav_path))

    #[-1, 1]
    out = aud
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the audio
    mel_spectrogram = audio.melspectrogram(preem_aud,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    #Compute the linear scale spectrogram from the audio
    linear_spectrogram = audio.linearspectrogram(preem_aud,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustement between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(aud, hparams.n_fft,
                                        audio.get_hop_size(hparams),
                                        hparams.wavenet_pad_sides)

    #Constant-pad the audio signal so its framing stays consistent with librosa's frame count
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0

    # Write the spectrogram and audio to disk
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    np.save(linear_filename, linear_spectrogram.T, allow_pickle=False)
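
audio.preemphasis (and its inverse, applied to generated audio at synthesis time) are presumably the usual first-order filters; a self-contained sketch consistent with the (wav, k, preemphasize) call above, not necessarily the repository's exact implementation:

from scipy.signal import lfilter

def preemphasis(wav, k=0.97, preemphasize=True):
    # y[n] = x[n] - k * x[n-1]; a no-op when pre-emphasis is disabled.
    return lfilter([1, -k], [1], wav) if preemphasize else wav

def inv_preemphasis(wav, k=0.97, preemphasize=True):
    # Inverse filter that undoes the emphasis after synthesis.
    return lfilter([1], [1, -k], wav) if preemphasize else wav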
Example 7
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - out_dir: the directory to write the msgpack into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (npz_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustement between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides)

    # Constant-pad the audio signal so its framing stays consistent with librosa's frame count
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustement
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)
    npz_filename = '{}.npz'.format(index)
    r = hparams.outputs_per_step
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.
    # +2r for head and tail silence
    mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    target_length = len(linear_spec)
    target_frames = (target_length // r + 1) * r
    num_pad = target_frames - target_length
    if num_pad != 0:
        linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
        mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
    stop_token = np.concatenate(
        [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)],
        axis=0)
    data = {
        'mel': mel_spec,
        'linear': linear_spec,
        'audio': out.astype(out_dtype),
        'input_data': np.asarray(text_to_sequence(text)),
        'time_steps': time_steps,
        'mel_frames': target_frames,
        'text': text,
        'stop_token': stop_token,
    }
    dumps_msgpack(data, os.path.join(out_dir, npz_filename))
    # Return a tuple describing this training example
    return npz_filename, time_steps, mel_frames, text
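
The tail of the function pads the spectrograms by the Tacotron reduction factor r and appends a stop-token target. A toy check of that arithmetic with made-up numbers (r and the frame count are not the repository's defaults):

import numpy as np

r = 3
mel_frames = 100
target_length = mel_frames + 2 * r               # +2r head/tail silence frames
target_frames = (target_length // r + 1) * r     # next multiple of r: 108
num_pad = target_frames - target_length          # 2 extra padded frames
stop_token = np.concatenate([np.zeros(target_frames - 1, dtype=np.float32),
                             np.ones(1, dtype=np.float32)])
assert len(stop_token) == target_frames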
Example 8
def _process_utterance(audio_dir, label_dir, index, wav_path, text_path, args):
    """
    Preprocesses a single utterance wav/text_jamo pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - audio_dir: the directory to write the mel spectrograms into
        - label_dir: the directory to write the label sequences into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text_path: path to the transcription file for the input audio
        - args: hyper parameters

    Returns:
        - A tuple: (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
    """
    try:
        # Load the audio as numpy array
        # wav = audio.load_wav(wav_path, sr=args.sample_rate)
        with open(wav_path, 'rb') as pcmfile:
            buf = pcmfile.read()
            wav = np.frombuffer(buf, dtype='int16')
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if args.rescale:
        wav = wav / np.abs(wav).max() * args.rescaling_max

    # M-AILABS extra silence specific
    if args.trim_silence:
        wav = audio.trim_silence(wav, args)

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, args).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustement between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, args.n_fft, audio.get_hop_size(args))

    # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(args)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(args)]
    assert len(out) % audio.get_hop_size(args) == 0
    time_steps = len(out)

    # text_jamo sequence
    with open(text_path, 'r', encoding='CP949') as f:
        line = f.readline()

    # ETRI transcription rule
    line = sentence_filter(line).upper()
    label_sequence = normalize(line)
    print(label_sequence)

    # Write the spectrogram and audio to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.txt'.format(index)
    np.save(os.path.join(audio_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    with open(os.path.join(label_dir, label_filename), 'w', encoding='utf-8') as f_out:
        f_out.write(label_sequence)

    # Return a tuple describing this training example
    return (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
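
The PCM branch reads headerless 16-bit samples with np.frombuffer; nothing in the snippet scales them to float [-1, 1] unless rescale is enabled. A hypothetical helper showing the usual normalization (load_pcm16 is not a function from the repository):

import numpy as np

def load_pcm16(path):
    # Read headerless little-endian 16-bit PCM and normalize to [-1, 1).
    with open(path, 'rb') as pcmfile:
        samples = np.frombuffer(pcmfile.read(), dtype='<i2')
    return samples.astype(np.float32) / 32768.0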
Example 9
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustement between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    #Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
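
audio.start_and_end_indices trims leading/trailing silence on the mu-law-quantized signal but is not shown. A plausible sketch; the silence_code default of 127 assumes 256 quantize channels (roughly mulaw_quantize(0, 256)), so treat it as an assumption:

import numpy as np

def start_and_end_indices(quantized, silence_threshold=2, silence_code=127):
    # First and last samples whose code deviates from the silence code by
    # more than the threshold; returns (start, end) usable as a slice.
    nonsilent = np.where(
        np.abs(quantized.astype(np.int64) - silence_code) > silence_threshold)[0]
    if len(nonsilent) == 0:
        return 0, len(quantized)
    return int(nonsilent[0]), int(nonsilent[-1]) + 1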
Example 10
def _process_utterance(mel_dir, linear_dir, wav_dir, spkid, uttid, wav_path,
                       text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - spkid, uttid: the speaker and utterance ids to use in the spectrogram filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        #Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        #Constant-pad the audio signal so its framing stays consistent with librosa's frame count
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    sub_wav_dir = os.path.join(wav_dir, spkid)
    sub_mel_dir = os.path.join(mel_dir, spkid)
    sub_linear_dir = os.path.join(linear_dir, spkid)

    os.makedirs(sub_wav_dir, exist_ok=True)
    os.makedirs(sub_mel_dir, exist_ok=True)
    os.makedirs(sub_linear_dir, exist_ok=True)

    audio_filename = 'audio-{}.npy'.format(uttid)
    mel_filename = 'mel-{}.npy'.format(uttid)
    linear_filename = 'linear-{}.npy'.format(uttid)
    np.save(os.path.join(sub_wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(sub_mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(sub_linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (spkid, audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
Example 11
        return [None, eliminated]

    if hparams.predict_linear:
        # Compute the linear scale spectrogram from the wav
        linear_spectrogram = audio.linearspectrogram(
            wav, hparams).astype(np.float32)
        linear_frames = linear_spectrogram.shape[1]

        # sanity check
        assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
Example 12
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, ppgs,
                       lf0_path, speaker, refer, hparams):
    """
	Preprocesses a single utterance wav/ppgs pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- ppgs: ppgs spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, refer, time_steps, mel_frames, ppgs, speaker, lf0_path)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)
    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, refer, time_steps,
            mel_frames, ppgs, speaker, lf0_path)
Example 13
    def initialize(self,
                   y,
                   c,
                   g,
                   input_lengths,
                   x=None,
                   synthesis_length=None):
        '''Initialize wavenet graph for train, eval and test cases.
		'''
        hparams = self._hparams
        self.is_training = x is not None
        self.is_evaluating = not self.is_training and y is not None
        #Set all convolutions to corresponding mode
        self.set_mode(self.is_training)

        # split_device = '/cpu:0' if self._hparams.wavenet_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        # with tf.device(split_device):
        # 	hp = self._hparams
        # 	lout_int = [tf.int32] * hp.wavenet_num_gpus
        # 	lout_float = [tf.float32] * hp.wavenet_num_gpus
        #
        # 	tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if input_lengths is not None else [input_lengths] * hp.wavenet_num_gpus
        #
        # 	tower_y = tf.split(y, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if y is not None else [y] * hp.wavenet_num_gpus
        # 	tower_x = tf.split(x, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if x is not None else [x] * hp.wavenet_num_gpus
        # 	tower_c = tf.split(c, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.local_conditioning_enabled() else [None] * hp.wavenet_num_gpus
        # 	tower_g = tf.split(g, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.global_conditioning_enabled() else [None] * hp.wavenet_num_gpus
        # 	tower_test_inputs = tf.split(test_inputs, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if test_inputs is not None else [test_inputs] * hp.wavenet_num_gpus
        #
        # self.tower_y_hat_q = []
        # self.tower_y_hat_train = []
        # self.tower_y = []
        # self.tower_input_lengths = []
        # self.tower_means = []
        # self.tower_log_scales = []
        # self.tower_y_hat_log = []
        # self.tower_y_log = []
        # self.tower_c = []
        # self.tower_y_eval = []
        # self.tower_eval_length = []
        # self.tower_y_hat = []
        # self.tower_y_target = []
        # self.tower_eval_c = []
        # self.tower_mask = []
        # self.tower_upsampled_local_features = []
        # self.tower_eval_upsampled_local_features = []
        # self.tower_synth_upsampled_local_features = []
        #
        log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
        log('  Train mode:                {}'.format(self.is_training))
        log('  Eval mode:                 {}'.format(self.is_evaluating))
        log('  Synthesis mode:            {}'.format(not (
            self.is_training or self.is_evaluating)))

        #1. Declare GPU devices
        #gpus = ['/gpu:{}'.format(i) for i in range(hp.wavenet_num_gpus)]
        #for i in range(hp.wavenet_num_gpus):
        #with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device='/cpu:0', worker_device=gpus[i])):
        with tf.variable_scope('inference') as scope:
            #log('  device:                    {}'.format(i))
            #Training
            if self.is_training:
                batch_size = tf.shape(x)[0]
                #[batch_size, time_length, 1]
                self.mask = self.get_mask(
                    input_lengths,
                    maxlen=tf.shape(x)[-1])  #To be used in loss computation
                #[batch_size, channels, time_length]
                y_hat = self.step(
                    x, c, g, softmax=False
                )  #softmax is automatically computed inside softmax_cross_entropy if needed

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length, channels]
                    self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

                self.y_hat = y_hat
                self.y = y
                self.input_lengths = input_lengths

                #Add mean and scale stats if using Gaussian distribution output (there would be too many logistics if using MoL)
                if self._hparams.out_channels == 2:
                    self.means = self.y_hat[:, 0, :]
                    self.log_scales = y_hat[:, 1, :]
                else:
                    self.means = None

                    #Graph extension for log saving
                    #[batch_size, time_length]
                shape_control = (batch_size, tf.shape(x)[-1], 1)
                with tf.control_dependencies(
                    [tf.assert_equal(tf.shape(y), shape_control)]):
                    y_log = tf.squeeze(y, [-1])
                    if is_mulaw_quantize(hparams.input_type):
                        self.y = y_log

                y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                    lambda: tf.squeeze(y_hat, [-1]),
                                    lambda: y_hat)
                y_hat_log = tf.reshape(y_hat_log,
                                       [batch_size, hparams.out_channels, -1])

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length]
                    y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                    y_hat_log = util.inv_mulaw_quantize(
                        y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw_quantize(y_log,
                                                    hparams.quantize_channels)

                else:
                    #[batch_size, time_length]
                    if hparams.out_channels == 2:
                        y_hat_log = sample_from_gaussian(
                            y_hat_log,
                            log_scale_min_gauss=hparams.log_scale_min_gauss)
                    else:
                        y_hat_log = sample_from_discretized_mix_logistic(
                            y_hat_log, log_scale_min=hparams.log_scale_min)

                    if is_mulaw(hparams.input_type):
                        y_hat_log = util.inv_mulaw(y_hat_log,
                                                   hparams.quantize_channels)
                        y_log = util.inv_mulaw(y_log,
                                               hparams.quantize_channels)

                self.y_hat_log = y_hat_log
                self.y_log = y_log
                # self.tower_c.append(tower_c[i])
                # self.tower_upsampled_local_features.append(self.upsampled_local_features)

                log('  inputs:                    {}'.format(x.shape))
                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_log.shape))
                log('  outputs:                   {}'.format(y_hat_log.shape))

                #evaluating
            elif self.is_evaluating:
                #[time_length, ]
                idx = 0
                length = input_lengths[idx]
                y_target = tf.reshape(y[idx], [-1])[:length]
                #test_inputs = tf.reshape(y_target, [1, -1, 1]) if not hparams.wavenet_natural_eval else None

                if c is not None:
                    c = tf.expand_dims(c[idx, :, :length], axis=0)
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3)]):
                        c = tf.identity(c, name='eval_assert_c_rank_op')

                if g is not None:
                    g = tf.expand_dims(g[idx], axis=0)

                batch_size = tf.shape(c)[0]

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                    #[channels, ]
                if is_mulaw_quantize(hparams.input_type):
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                    #Fast eval
                y_hat = self.incremental(
                    initial_input,
                    c=c,
                    g=g,
                    time_length=length,
                    softmax=False,
                    quantize=True,
                    log_scale_min=hparams.log_scale_min,
                    log_scale_min_gauss=hparams.log_scale_min_gauss)

                #Save targets and length for eval loss computation
                if is_mulaw_quantize(hparams.input_type):
                    self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
                else:
                    self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
                self.eval_length = length

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                    y_hat = inv_mulaw_quantize(y_hat,
                                               hparams.quantize_channels)
                    y_target = inv_mulaw_quantize(y_target,
                                                  hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = inv_mulaw(tf.reshape(y_hat, [-1]),
                                      hparams.quantize_channels)
                    y_target = inv_mulaw(y_target, hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [-1])

                self.y_hat = y_hat
                self.y_target = y_target
                # self.tower_eval_c.append(tower_c[i][idx])
                # self.tower_eval_upsampled_local_features.append(self.upsampled_local_features[idx])

                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_target.shape))
                log('  outputs:                   {}'.format(y_hat.shape))

                #synthesizing
            else:
                batch_size = tf.shape(c)[0]
                if c is None:
                    assert synthesis_length is not None
                else:
                    #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                    message = (
                        'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'
                        .format(hparams.cin_channels, c.shape))
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3, message=message)]):
                        c = tf.identity(c, name='synthesis_assert_c_rank_op')

                    Tc = tf.shape(c)[1]
                    upsample_factor = audio.get_hop_size(self._hparams)

                    #Overwrite length with respect to local condition features
                    synthesis_length = Tc * upsample_factor

                    #[batch_size, local_condition_dimension, local_condition_time]
                    #time_length will be corrected using the upsample network
                    c = tf.transpose(c, [0, 2, 1])

            if g is not None:
                assert g.shape == (batch_size, 1)

                #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            if is_mulaw_quantize(hparams.input_type):
                assert initial_value >= 0 and initial_value < hparams.quantize_channels
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels,
                                           dtype=tf.float32)
                initial_input = tf.tile(
                    tf.reshape(initial_input,
                               [1, 1, hparams.quantize_channels]),
                    [batch_size, 1, 1])
            else:
                initial_input = tf.ones([batch_size, 1, 1],
                                        tf.float32) * initial_value

            y_hat = self.incremental(
                initial_input,
                c=c,
                g=g,
                time_length=synthesis_length,
                softmax=False,
                quantize=True,
                log_scale_min=hparams.log_scale_min,
                log_scale_min_gauss=hparams.log_scale_min_gauss)

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [batch_size, -1])
                y_hat = util.inv_mulaw_quantize(y_hat,
                                                hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]),
                                       hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [batch_size, -1])

            self.y_hat = y_hat
            #self.tower_synth_upsampled_local_features.append(self.upsampled_local_features)

            if self.local_conditioning_enabled():
                log('  local_condition:           {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition:          {}'.format(g.shape))
            log('  outputs:                   {}'.format(y_hat.shape))

        self.variables = tf.trainable_variables()
        log('  Receptive Field:           ({} samples / {:.1f} ms)'.format(
            self.receptive_field,
            self.receptive_field / hparams.sample_rate * 1000.))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  WaveNet Parameters:        {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.variables]) / 1000000))

        self.ema = tf.train.ExponentialMovingAverage(
            decay=hparams.wavenet_ema_decay)
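
The receptive-field figure logged above comes from the dilated-convolution stack. A standalone sketch of the usual arithmetic; layer, stack and kernel values are illustrative, not this model's hparams:

def receptive_field_size(total_layers=30, num_stacks=3, kernel_size=3):
    # Dilations double within each stack (1, 2, 4, ...) and repeat per stack;
    # the receptive field is (kernel_size - 1) * sum(dilations) + 1 samples.
    layers_per_stack = total_layers // num_stacks
    dilations = [2 ** (i % layers_per_stack) for i in range(total_layers)]
    return (kernel_size - 1) * sum(dilations) + 1

rf = receptive_field_size()
print('{} samples = {:.1f} ms at 22.05 kHz'.format(rf, rf / 22050 * 1000.))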
Example 14
def _process_utterance(mel_dir, index, wav_path, start, end, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - start, end: start, end points of speech
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, mel_filename, time_steps, mel_frames, start, end)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    start += 1 * hparams.sample_rate
    end += 1 * hparams.sample_rate

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #[-1, 1]
    out = wav
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustement between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

    # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustement
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    start = round(start/int(time_steps / mel_frames))
    end = round(end/int(time_steps / mel_frames))

    # Write the spectrogram and audio to disk
    mel_filename = 'mel-{}.npy'.format(index)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, mel_filename, time_steps, mel_frames, start, end)
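
Because out was trimmed to an exact multiple of the hop size, time_steps // mel_frames equals the hop size, so the rounding above converts sample offsets into mel-frame indices. A toy check with an assumed hop size of 256:

hop_size = 256                                # == time_steps // mel_frames after trimming
start_sample, end_sample = 24_000, 72_000     # made-up sample offsets
start_frame = round(start_sample / hop_size)  # 94
end_frame = round(end_sample / hop_size)      # 281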
Example 15
def build_from_path_ispl(hparams, input_dirs, mel_dir, label_dir, tqdm=lambda x: x):
    """
    Preprocesses the speech dataset from a given input path to given output directories

    Args:
        - hparams: hyper parameters
        - input_dirs: input directories that contain the files to preprocess
        - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
        - label_dir: the directory to write the label into
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuple describing the train examples. this should be written to train.txt
    """

    # Results are collected sequentially here; ProcessPoolExecutor-style
    # parallelization is only an optimization and is omitted
    futures = []
    index = 1
    for input_dir in input_dirs:
        files = find_files(os.path.join(input_dir))
        for wav_path in files:
            file_name = wav_path.split("\\")[-1]
            if int(file_name.split('.')[0]) <= 10:
                label_path = wav_path.split("\\")[0] + '/label.txt'
                with open(label_path, encoding='utf-8') as f:
                    lines = f.readlines()
                for line in lines:
                    if file_name in line:
                        labels = line.replace('[', '').replace(']', '').split(':')[1].replace(',\n', '').split(',')
                        start = []
                        end = []
                        for idx in range(0, len(labels), 2):
                            start.append(int(labels[idx]))
                            end.append(int(labels[idx+1]))

            try:
                # Load the audio as numpy array
                wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
            except FileNotFoundError:  # catch missing wav exception
                print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
                continue

            # rescale wav
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:
                wav = audio.trim_silence(wav, hparams)

            # [-1, 1]
            out = wav
            out_dtype = np.float32

            if int(file_name.split('.')[0]) <= 10:
                label = np.zeros_like(out)
                for idx in range(len(start)):
                    start[idx] = int(start[idx] / 1000 * hparams.sample_rate)
                    end[idx] = int(end[idx] / 1000 * hparams.sample_rate)
                    label[start[idx]:end[idx]] = 1.
            else:
                label = wav_path.split('.')[0] + '.label'
                with open(label, encoding='utf-8') as f:
                    lines = f.readlines()
                label = np.asarray([int(line.strip('\n')) for line in lines])

            # Compute the mel scale spectrogram from the wav
            mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
            mel_spectrogram = mel_spectrogram[:, -len(label):]
            mel_frames = mel_spectrogram.shape[1]

            # Ensure time resolution adjustement between audio and mel-spectrogram
            pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

            if int(file_name.split('.')[0]) <= 10:
                # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
                out = np.pad(out, (0, pad), mode='reflect')
                label = np.pad(label, (0, pad), mode='reflect')
                assert len(out) >= mel_frames * audio.get_hop_size(hparams)

                # time resolution adjustment
                # ensure length of raw audio is multiple of hop size so that we can use
                # transposed convolution to upsample
                out = out[:mel_frames * audio.get_hop_size(hparams)]
                label = label[:mel_frames * audio.get_hop_size(hparams)]
                assert len(out) % audio.get_hop_size(hparams) == 0
                label = label[::audio.get_hop_size(hparams)]

            time_steps = len(out)

            # Write the spectrogram and audio to disk
            mel_filename = 'mel-{}.npy'.format(index)
            label_filename = 'label-{}.npy'.format(index)
            np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
            np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)
            futures.append((wav_path, mel_filename, time_steps, mel_frames, label_filename))
            index += 1

    return [future for future in tqdm(futures)]
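
The hop-size bookkeeping above (trim the audio/label pair to a whole number of mel frames, then keep one label per frame) is easier to see in isolation. A minimal sketch with hypothetical values, not tied to the hparams used here:

import numpy as np

# Hypothetical values, for illustration only.
hop_size = 256                                       # audio samples per mel frame
mel_frames = 40                                      # frames produced by the STFT
wav = np.random.randn(mel_frames * hop_size + 93)    # raw audio, slightly too long
label = np.zeros_like(wav)                           # per-sample 0/1 speech label

# Trim audio and label so their length is an exact multiple of hop_size...
wav = wav[:mel_frames * hop_size]
label = label[:mel_frames * hop_size]
assert len(wav) % hop_size == 0

# ...then one label value per mel frame is obtained by striding with hop_size,
# just like label[::audio.get_hop_size(hparams)] above.
frame_label = label[::hop_size]
assert len(frame_label) == mel_frames
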
Esempio n. 16
0
def _process_utterance(out_dir, index, wav_path, pinyin, hparams):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of the Chinese spoken in the input audio file
    hparams: The hyper parameters

  Returns:
    An (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin) tuple to write to train.txt
  '''

    mel_dir = os.path.join(out_dir, "mels")
    linear_dir = os.path.join(out_dir, "linear")
    wav_dir = os.path.join(out_dir, "audio")

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    print("debug wav_path:", wav_path)
    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the wav:
    #mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames)
        return None

    # Compute the linear-scale spectrogram from the wav:
    #spectrogram = audio.spectrogram(wav).astype(np.float32)
    #n_frames = spectrogram.shape[1]
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms to disk:
    #spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    #mel_filename = 'thchs30-mel-%05d.npy' % index
    #np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    #np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    print("debug save wav file:", os.path.join(wav_dir, audio_filename))
    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, pinyin)
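
The mulaw / mulaw_quantize helpers used throughout these examples are imported from elsewhere and not shown. A minimal sketch of the standard mu-law companding they are assumed to implement (simplified, hypothetical version):

import numpy as np

def mulaw(x, quantize_channels=256):
    # Compand x in [-1, 1] to [-1, 1] with mu = quantize_channels - 1.
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, quantize_channels=256):
    # Compand, then map [-1, 1] to integer classes in [0, quantize_channels).
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

print(mulaw_quantize(np.array([-1.0, 0.0, 1.0])))   # -> [  0 128 255]
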
Esempio n. 17
0
    def __getitem__(self, indexs):
        if self.data is None:
            self.data = np.load(self.hparams.ds_name + '.npz',
                                allow_pickle=True)

        ret = []
        if not isinstance(indexs, list):
            _indexs = [indexs]
        else:
            _indexs = indexs
        for index in _indexs:
            data = self.data[self.audio_and_text_keys[index]].item()
            text = np.array(data['text'], np.int64)

            if self.hparams.multispeaker:
                spk_id = data.get('spk_id', 0)
            else:
                spk_id = 0

            if self.hparams.add_sil == 2:
                text = text.reshape(-1,
                                    3 if not self.hparams.use_pinyin else 2)
                text = np.concatenate(
                    [text, 2 * np.ones([text.shape[0], 1], np.int64)],
                    -1)  # [L, 4]
                text = text.reshape(-1)  # [L + L//3]
            elif self.hparams.add_sil == 3:
                text = np.stack([text, 128 + text, 256 + text], -1)  # [L, 3]
                text = text.reshape(
                    -1, 9 if not self.hparams.use_pinyin else 2)  # [L/3, 9]
                text = np.concatenate(
                    [text, 2 * np.ones([text.shape[0], 1], np.int64)],
                    -1)  # [L/3, 10]
                text = text.reshape(-1)  # [10L/3]

            text = torch.from_numpy(text)
            mel = torch.from_numpy(np.array(data['mels']).reshape(-1, 80).T)
            if self.hparams.use_linear or self.hparams.linear_directly:
                linear = torch.from_numpy(
                    np.array(data['linear']).reshape(-1,
                                                     self.hparams.num_freq).T)
            else:
                linear = None

            if self.hparams.speech and self.type != 'val':
                mel = mel[:, :1550]
                text = text[:350]
                if linear is not None:
                    linear = linear[:, :1550]

            pitch = None
            if self.hparams.use_pitch:
                pitch_key = 'pitches' if not self.hparams.use_smooth_pitch else 'smooth_pitches'
                pitch = torch.from_numpy(np.array(data[pitch_key], np.int64))

            utt_ids = torch.from_numpy(np.array(data['utt_id'], np.int64))
            if self.hparams.prefix_len > 0:
                text_len = int(self.hparams.prefix_len * text.shape[0] /
                               mel.shape[1])
                text = text[:text_len]
                mel = mel[:, :self.hparams.prefix_len]
                if pitch is not None:
                    pitch = pitch[:self.hparams.prefix_len]

            attn = None
            if self.hparams.use_ali or self.hparams.use_ali_mask:
                attn = np.zeros((mel.shape[1], text.shape[0]))
                if self.hparams.use_phoneme_align:
                    mel_splits = [
                        int(x * self.hparams.audio_sample_rate /
                            self.hparams.hop_size) for x in data['splits']
                    ]
                    last = 0
                    for t_idx, s in enumerate(mel_splits):
                        attn[last:s, t_idx] = 1
                        last = s
                else:
                    splits_begin = np.clip(np.array(data['splits'], np.int64),
                                           0, mel.shape[1] - 1)
                    splits_end = np.clip(np.array(data['splits_end'], np.int64),
                                         0, mel.shape[1] - 1)
                    splits_begin = [0] + list(splits_begin)
                    splits_end = [0] + list(splits_end)
                    if not self.hparams.use_ali_mask2:  # TODO: PINYIN?
                        if self.hparams.use_pinyin:
                            for i in range(text.shape[0] // 3):
                                splits_begin_step = (splits_begin[i + 1] -
                                                     splits_begin[i] - 3) / 2
                                if self.hparams.attn_step_clip10:
                                    splits_begin_step = np.clip(
                                        splits_begin_step, 0, 10)
                                attn[int(splits_begin[i]
                                         ):int(splits_begin[i] +
                                               splits_begin_step), i * 3] += 1
                                attn[int(splits_begin[i] + splits_begin_step
                                         ):int(splits_begin[i] +
                                               splits_begin_step * 2),
                                     i * 3 + 1] += 1
                                attn[int(splits_begin[i + 1]) -
                                     3:int(splits_begin[i + 1]),
                                     i * 3 + 2] += 1
                        else:
                            if self.hparams.add_sil == 0:
                                for i in range(text.shape[0] // 3):
                                    splits_begin_step = (splits_begin[i + 1] -
                                                         splits_begin[i]) / 3
                                    splits_end_step = (splits_end[i + 1] -
                                                       splits_end[i]) / 3
                                    if self.hparams.attn_step_clip10:
                                        splits_begin_step = np.clip(
                                            splits_begin_step, 0, 10)
                                        splits_end_step = np.clip(
                                            splits_end_step, 0, 10)
                                    attn[int(splits_begin[i]
                                             ):int(splits_begin[i] +
                                                   splits_begin_step),
                                         i * 3] += 0.5

                                    attn[int(splits_begin[i] +
                                             splits_begin_step
                                             ):int(splits_begin[i] +
                                                   splits_begin_step * 2),
                                         i * 3 + 1] += 0.5

                                    attn[int(splits_begin[i] +
                                             splits_begin_step *
                                             2):int(splits_begin[i + 1]),
                                         i * 3 + 2] += 0.5

                                    attn[int(splits_end[i]
                                             ):int(splits_end[i] +
                                                   splits_end_step),
                                         i * 3] += 0.5

                                    attn[int(splits_end[i] + splits_end_step
                                             ):int(splits_end[i] +
                                                   splits_end_step * 2),
                                         i * 3 + 1] += 0.5

                                    attn[int(splits_end[i] + splits_end_step *
                                             2):int(splits_end[i + 1]),
                                         i * 3 + 2] += 0.5
                            elif self.hparams.add_sil == 2:
                                for i in range(text.shape[0] // 4):
                                    splits_begin_step = (splits_begin[i + 1] -
                                                         splits_begin[i] -
                                                         3) / 3
                                    splits_end_step = (splits_end[i + 1] -
                                                       splits_end[i] - 3) / 3
                                    if self.hparams.attn_step_clip10:
                                        splits_begin_step = np.clip(
                                            splits_begin_step, 0, 10)
                                        splits_end_step = np.clip(
                                            splits_end_step, 0, 10)

                                    attn[int(splits_begin[i]
                                             ):int(splits_begin[i] +
                                                   splits_begin_step),
                                         i * 4] += 0.5
                                    attn[int(splits_begin[i] +
                                             splits_begin_step
                                             ):int(splits_begin[i] +
                                                   splits_begin_step * 2),
                                         i * 4 + 1] += 0.5
                                    attn[int(splits_begin[i] +
                                             splits_begin_step *
                                             2):int(splits_begin[i + 1]) - 3,
                                         i * 4 + 2] += 0.5
                                    attn[int(splits_begin[i + 1]) -
                                         3:int(splits_begin[i + 1]),
                                         i * 4 + 3] += 0.5

                                    attn[int(splits_end[i]
                                             ):int(splits_end[i] +
                                                   splits_end_step),
                                         i * 4] += 0.5
                                    attn[int(splits_end[i] + splits_end_step
                                             ):int(splits_end[i] +
                                                   splits_end_step * 2),
                                         i * 4 + 1] += 0.5
                                    attn[int(splits_end[i] + splits_end_step *
                                             2):int(splits_end[i + 1]) - 3,
                                         i * 4 + 2] += 0.5
                                    attn[int(splits_end[i + 1]) -
                                         3:int(splits_end[i + 1]),
                                         i * 4 + 3] += 0.5
                    else:
                        for i in range(text.shape[0] // 3):
                            attn[int(splits_begin[i]):int(splits_begin[i + 1]),
                                 i * 3:(i + 1) * 3] = 1
                            attn[int(splits_end[i]):int(splits_end[i + 1]),
                                 i * 3:(i + 1) * 3] = 1
                attn = torch.from_numpy(attn)

            if self.hparams.use_wavenet:
                wav = torch.from_numpy(np.array(data['raw_wav']))
                max_time_steps = self.hparams.wavenet_max_time
                if wav.shape[0] > max_time_steps:
                    max_time_frames = max_time_steps // audio.get_hop_size(
                        self.hparams)
                    start_cond_idx = torch.randint(
                        mel.shape[1] - max_time_frames, [])
                else:
                    start_cond_idx = 0
            else:
                wav = None
                start_cond_idx = None
            if self.hparams.linear_directly:
                mel = linear
            ret.append([
                text, mel, pitch, utt_ids, attn, linear, spk_id, wav,
                start_cond_idx
            ])
        if not isinstance(indexs, list):
            return ret[0]
        else:
            return ret
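
The attn construction above spreads each text token over the mel frames between its begin/end splits. A much-simplified sketch of the same idea (hypothetical boundaries, one token per segment, no sub-phone splitting):

import numpy as np

# Hypothetical boundaries: token i covers mel frames [splits[i], splits[i + 1]).
splits = [0, 3, 7, 12]                       # 3 tokens over 12 mel frames
n_frames, n_tokens = splits[-1], len(splits) - 1

attn = np.zeros((n_frames, n_tokens))        # same layout as above: [frames, tokens]
for i in range(n_tokens):
    attn[splits[i]:splits[i + 1], i] = 1.0

assert np.all(attn.sum(axis=1) == 1.0)       # every frame attends to exactly one token
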
Esempio n. 18
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    wav = _trim_wav(audio.load_wav(wav_path, sr=hparams.sample_rate))
    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    name = os.path.splitext(os.path.basename(wav_path))[0]
    speaker_id = _speaker_re.match(name).group(1)

    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, speaker_id, text)
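
_speaker_re is defined elsewhere in this module. A hypothetical definition for VCTK-style basenames such as p225_001 is shown below purely as an illustration (an assumption, not the repository's actual pattern):

import re

# Hypothetical pattern: capture the leading speaker tag before the underscore.
_speaker_re = re.compile(r'([pP]\d+)_')

name = 'p225_001'                              # os.path.splitext(basename)[0]
speaker_id = _speaker_re.match(name).group(1)  # -> 'p225'
print(speaker_id)
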
Esempio n. 19
0
def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams, speaker_id):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- hparams: hyper parameters
		- speaker_id: speaker id used for global conditioning (if any)

	Returns:
		- A tuple: (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    if hparams.use_lws:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)

    #global condition features
    if hparams.gin_channels > 0:
        # raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
        speaker_id = speaker_id  #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable)
    else:
        speaker_id = speaker_id

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, '_', speaker_id, time_steps,
            mel_frames)
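
Each of these _process_utterance variants returns a metadata tuple, or None when an utterance is skipped, and the caller is expected to collect those tuples into train.txt. A minimal sketch of such a writer, assuming the usual pipe-separated layout (hypothetical helper, not part of the source shown here):

import os

def write_metadata(metadata, out_dir, filename='train.txt'):
    # Drop skipped utterances (None) and write one pipe-separated line per example.
    with open(os.path.join(out_dir, filename), 'w', encoding='utf-8') as f:
        for m in metadata:
            if m is not None:
                f.write('|'.join(str(x) for x in m) + '\n')

# e.g. write_metadata(results, out_dir) once all utterances have been processed.
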
Esempio n. 20
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      mel_dir: The directory to write the mel spectrograms into
      linear_dir: The directory to write the linear spectrograms into
      wav_dir: The directory to write the preprocessed audio into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      hparams: The hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
Esempio n. 21
0
    def initialize(self,
                   y,
                   c,
                   g,
                   input_lengths,
                   x=None,
                   synthesis_length=None):
        '''Initialize wavenet graph for train, eval and test cases.
		'''
        hparams = self._hparams
        self.is_training = x is not None
        self.is_evaluating = not self.is_training and y is not None
        #Set all convolutions to corresponding mode
        self.set_mode(self.is_training)

        log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
        log('  Train mode:                {}'.format(self.is_training))
        log('  Eval mode:                 {}'.format(self.is_evaluating))
        log('  Synthesis mode:            {}'.format(not (
            self.is_training or self.is_evaluating)))
        with tf.variable_scope('inference') as scope:
            #Training
            if self.is_training:
                batch_size = tf.shape(x)[0]
                #[batch_size, time_length, 1]
                self.mask = self.get_mask(
                    input_lengths,
                    maxlen=tf.shape(x)[-1])  #To be used in loss computation
                #[batch_size, channels, time_length]
                y_hat = self.step(
                    x, c, g, softmax=False
                )  #softmax is automatically computed inside softmax_cross_entropy if needed

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length, channels]
                    self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

                self.y_hat = y_hat
                self.y = y
                self.input_lengths = input_lengths

                #Add mean and scale stats if using Gaussian distribution output (a MoL output would have too many components to log)
                if self._hparams.out_channels == 2:
                    self.means = self.y_hat[:, 0, :]
                    self.log_scales = self.y_hat[:, 1, :]
                else:
                    self.means = None

                #Graph extension for log saving
                #[batch_size, time_length]
                shape_control = (batch_size, tf.shape(x)[-1], 1)
                with tf.control_dependencies(
                    [tf.assert_equal(tf.shape(y), shape_control)]):
                    y_log = tf.squeeze(y, [-1])
                    if is_mulaw_quantize(hparams.input_type):
                        self.y = y_log

                y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                    lambda: tf.squeeze(y_hat, [-1]),
                                    lambda: y_hat)
                y_hat_log = tf.reshape(y_hat_log,
                                       [batch_size, hparams.out_channels, -1])

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length]
                    y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                    y_hat_log = util.inv_mulaw_quantize(
                        y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw_quantize(y_log,
                                                    hparams.quantize_channels)

                else:
                    #[batch_size, time_length]
                    if hparams.out_channels == 2:
                        y_hat_log = sample_from_gaussian(
                            y_hat_log,
                            log_scale_min_gauss=hparams.log_scale_min_gauss)
                    else:
                        y_hat_log = sample_from_discretized_mix_logistic(
                            y_hat_log, log_scale_min=hparams.log_scale_min)

                    if is_mulaw(hparams.input_type):
                        y_hat_log = util.inv_mulaw(y_hat_log,
                                                   hparams.quantize_channels)
                        y_log = util.inv_mulaw(y_log,
                                               hparams.quantize_channels)

                self.y_hat_log = y_hat_log
                self.y_log = y_log

                log('  inputs:                    {}'.format(x.shape))
                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_log.shape))
                log('  outputs:                   {}'.format(y_hat_log.shape))

            #evaluating
            elif self.is_evaluating:
                #[time_length, ]
                idx = 0
                length = input_lengths[idx]
                y_target = tf.reshape(y[idx], [-1])[:length]

                if c is not None:
                    c = tf.expand_dims(c[idx, :, :length], axis=0)
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3)]):
                        c = tf.identity(c, name='eval_assert_c_rank_op')
                if g is not None:
                    g = tf.expand_dims(g[idx], axis=0)

                batch_size = tf.shape(c)[0]

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                #[channels, ]
                if is_mulaw_quantize(hparams.input_type):
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                #Fast eval
                y_hat = self.incremental(initial_input,
                                         c=c,
                                         g=g,
                                         time_length=length,
                                         softmax=False,
                                         quantize=True,
                                         log_scale_min=hparams.log_scale_min)

                #Save targets and length for eval loss computation
                if is_mulaw_quantize(hparams.input_type):
                    self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
                else:
                    self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
                self.eval_length = length

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                    y_hat = inv_mulaw_quantize(y_hat,
                                               hparams.quantize_channels)
                    y_target = inv_mulaw_quantize(y_target,
                                                  hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = inv_mulaw(tf.reshape(y_hat, [-1]),
                                      hparams.quantize_channels)
                    y_target = inv_mulaw(y_target, hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [-1])

                self.y_hat = y_hat
                self.y_target = y_target

                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_target.shape))
                log('  outputs:                   {}'.format(y_hat.shape))

            #synthesizing
            else:
                if c is None:
                    assert synthesis_length is not None
                    batch_size = 1
                else:
                    batch_size = tf.shape(c)[0]
                    #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                    message = (
                        'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'
                        .format(hparams.cin_channels, c.shape))
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3, message=message)]):
                        c = tf.identity(c, name='synthesis_assert_c_rank_op')

                    Tc = tf.shape(c)[1]
                    upsample_factor = audio.get_hop_size(self._hparams)

                    #Overwrite length with respect to local condition features
                    synthesis_length = Tc * upsample_factor

                    #[batch_size, local_condition_dimension, local_condition_time]
                    #time_length will be corrected using the upsample network
                    c = tf.transpose(c, [0, 2, 1])

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                if is_mulaw_quantize(hparams.input_type):
                    assert initial_value >= 0 and initial_value < hparams.quantize_channels
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                y_hat = self.incremental(initial_input,
                                         c=c,
                                         g=g,
                                         time_length=synthesis_length,
                                         softmax=False,
                                         quantize=True,
                                         log_scale_min=hparams.log_scale_min)

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1),
                                       [batch_size, -1])
                    self.out_node = y_hat
                    y_hat = util.inv_mulaw_quantize(y_hat,
                                                    hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]),
                                           hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [batch_size, -1])

                self.y_hat = y_hat

                if self.local_conditioning_enabled():
                    log('  local_condition:            {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:           {}'.format(g.shape))
                log('  outputs:                    {}'.format(y_hat.shape))

        self.variables = tf.trainable_variables()
        self.ema = tf.train.ExponentialMovingAverage(
            decay=hparams.wavenet_ema_decay)
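
The "start silence frame" logic above builds the very first input fed to the autoregressive sampler: a one-hot vector of the quantized silence class for categorical inputs, or a constant zero frame for scalar inputs. A NumPy sketch of the same construction (hypothetical batch size and channel count):

import numpy as np

# Hypothetical values for illustration.
quantize_channels, batch_size = 256, 2
initial_value = 128                       # mulaw_quantize(0, 256) lands mid-range

# Categorical (mu-law quantized) case: tile a one-hot silence frame over the batch.
one_hot = np.zeros((1, 1, quantize_channels), np.float32)
one_hot[0, 0, initial_value] = 1.0
initial_input = np.tile(one_hot, (batch_size, 1, 1))   # [batch_size, 1, channels]

# Scalar (raw or mu-law) case: a constant zero frame.
initial_input_scalar = np.zeros((batch_size, 1, 1), np.float32)
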
Esempio n. 22
0
	def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
		'''Initialize wavenet graph for train, eval and test cases.
		'''
		hparams = self._hparams
		self.is_training = x is not None
		self.is_evaluating = not self.is_training and y is not None
		#Set all convolutions to corresponding mode
		self.set_mode(self.is_training)

		log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
		log('  Train mode:                {}'.format(self.is_training))
		log('  Eval mode:                 {}'.format(self.is_evaluating))
		log('  Synthesis mode:            {}'.format(not (self.is_training or self.is_evaluating)))
		with tf.variable_scope('inference') as scope:
			#Training
			if self.is_training:
				batch_size = tf.shape(x)[0]
				#[batch_size, time_length, 1]
				self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation
				#[batch_size, channels, time_length]
				y_hat = self.step(x, c, g, softmax=False) #softmax is automatically computed inside softmax_cross_entropy if needed

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length, channels]
					self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

				self.y_hat = y_hat
				self.y = y
				self.input_lengths = input_lengths

				#Graph extension for log saving
				#[batch_size, time_length]
				shape_control = (batch_size, tf.shape(x)[-1], 1)
				with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
					y_log = tf.squeeze(y, [-1])
					if is_mulaw_quantize(hparams.input_type):
						self.y = y_log

				y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
					lambda: tf.squeeze(y_hat, [-1]),
					lambda: y_hat)
				y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length]
					y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

					y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
					y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)

				else:
					#[batch_size, time_length]
					y_hat_log = sample_from_discretized_mix_logistic(
						y_hat_log, log_scale_min=hparams.log_scale_min)

					if is_mulaw(hparams.input_type):
						y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
						y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

				self.y_hat_log = y_hat_log
				self.y_log = y_log
				
				log('  inputs:                    {}'.format(x.shape))
				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_log.shape))
				log('  outputs:                   {}'.format(y_hat_log.shape))


			#evaluating
			elif self.is_evaluating: 
				#[time_length, ]
				idx = 0
				length = input_lengths[idx]
				y_target = tf.reshape(y[idx], [-1])[:length]

				if c is not None:
					c = tf.expand_dims(c[idx, :, :length], axis=0)
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
						c = tf.identity(c, name='eval_assert_c_rank_op')
				if g is not None:
					g = g[idx]

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				#[channels, ]
				if is_mulaw_quantize(hparams.input_type):
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				#Fast eval
				y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				#Save targets and length for eval loss computation
				if is_mulaw_quantize(hparams.input_type):
					self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
				else:
					self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
				self.eval_length = length

				if is_mulaw_quantize(hparams.input_type):
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
					y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
					y_target = inv_mulaw(y_target, hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat
				self.y_target = y_target

				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_target.shape))
				log('  outputs:                   {}'.format(y_hat.shape))

			#synthesizing
			else:
				if c is None:
					assert synthesis_length is not None
				else:
					#[batch_size, local_condition_time, local_condition_dimension(num_mels)]
					message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
							hparams.cin_channels, c.shape))
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
						c = tf.identity(c, name='synthesis_assert_c_rank_op')

					Tc = tf.shape(c)[1]
					upsample_factor = audio.get_hop_size(self._hparams)

					#Overwrite length with respect to local condition features
					synthesis_length = Tc * upsample_factor

					#[batch_size, local_condition_dimension, local_condition_time]
					#time_length will be corrected using the upsample network
					c = tf.transpose(c, [0, 2, 1])

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				if is_mulaw_quantize(hparams.input_type):
					assert initial_value >= 0 and initial_value < hparams.quantize_channels
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				if is_mulaw_quantize(hparams.input_type):
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat

				if self.local_conditioning_enabled():
					log('  local_condition:            {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:           {}'.format(g.shape))
				log('  outputs:                    {}'.format(y_hat.shape))

		self.variables = tf.trainable_variables()
		self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
Esempio n. 23
0
    def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir,
                   log_dir):
        hparams = self._hparams
        local_cond, global_cond = self._check_conditions()

        #Switch mels in case of debug
        # if self.synth_debug:
        # 	assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs)
        # 	mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels]

        #Get True length of audio to be synthesized: audio_len = mel_len * hop_size
        audio_lengths = [
            len(x) * get_hop_size(self._hparams) for x in mel_spectrograms
        ]

        #Prepare local condition batch
        maxlen = max([len(x) for x in mel_spectrograms])
        #[-max, max] or [0,max]
        T2_output_range = (
            -self._hparams.max_abs_value,
            self._hparams.max_abs_value) if self._hparams.symmetric_mels else (
                0, self._hparams.max_abs_value)

        if self._hparams.clip_for_wavenet:
            mel_spectrograms = [
                np.clip(x, T2_output_range[0], T2_output_range[1])
                for x in mel_spectrograms
            ]

        c_batch = np.stack([
            _pad_inputs(x, maxlen, _pad=T2_output_range[0])
            for x in mel_spectrograms
        ]).astype(np.float32)

        if self._hparams.normalize_for_wavenet:
            #rerange to [0, 1]
            c_batch = np.interp(c_batch, T2_output_range, (0, 1))

        g = None if speaker_ids is None else np.asarray(
            speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
        feed_dict = {}

        if local_cond:
            feed_dict[self.local_conditions] = c_batch
        else:
            feed_dict[self.synthesis_length] = 100

        if global_cond:
            feed_dict[self.global_conditions] = g

        # if self.synth_debug:
        # 	debug_wavs = hparams.wavenet_debug_wavs
        # 	assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
        # 	test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs]
        #
        # 	#pad wavs to same length
        # 	max_test_len = max([len(x) for x in test_wavs])
        # 	test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32)
        #
        # 	assert len(test_wavs) == len(debug_wavs)
        # 	feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1)
        # 	feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

        #Generate wavs and clip extra padding to select Real speech parts
        #generated_wavs, upsampled_features = self.session.run([self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features], feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        #generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs]
        #upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats]

        #generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]
        #upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)]

        generated_wavs = self.session.run(self.model.y_hat,
                                          feed_dict=feed_dict)
        generated_wavs = [
            generated_wav[:length]
            for generated_wav, length in zip(generated_wavs, audio_lengths)
        ]

        audio_filenames = []
        for i, generated_wav in enumerate(generated_wavs):
            #Save wav to disk
            audio_filename = os.path.join(
                out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
            save_wavenet_wav(generated_wav,
                             audio_filename,
                             sr=hparams.sample_rate)
            audio_filenames.append(audio_filename)

            #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
            #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
            # generated_mel = melspectrogram(generated_wav, hparams).T
            # util.plot_spectrogram(generated_mel, os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
            # 	title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel)
            # #Save upsampled features to visualize checkerboard artifacts.
            # util.plot_spectrogram(upsampled_feature.T, os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])),
            # 	title='Upmsampled Local Condition features', auto_aspect=True)

            #Save waveplot to disk
            if log_dir is not None:
                plot_filename = os.path.join(
                    log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
                util.waveplot(plot_filename,
                              generated_wav,
                              None,
                              hparams,
                              title='WaveNet generated Waveform.')

        return audio_filenames
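
The conditioning-batch preparation above (clip to the Tacotron output range, pad every mel to the batch maximum, optionally rescale to [0, 1]) can be summarized in a few NumPy lines. A minimal sketch with hypothetical hparams values:

import numpy as np

# Hypothetical hparams values.
max_abs_value, symmetric_mels, num_mels = 4.0, True, 80
T2_output_range = (-max_abs_value, max_abs_value) if symmetric_mels else (0, max_abs_value)

mel_spectrograms = [np.random.uniform(*T2_output_range, size=(t, num_mels)) for t in (50, 73)]
maxlen = max(len(x) for x in mel_spectrograms)

# Clip to the Tacotron output range and right-pad every mel with the range minimum...
c_batch = np.stack([
    np.pad(np.clip(x, *T2_output_range),
           ((0, maxlen - len(x)), (0, 0)),
           mode='constant', constant_values=T2_output_range[0])
    for x in mel_spectrograms
]).astype(np.float32)                                   # [batch, maxlen, num_mels]

# ...then rescale to [0, 1] before feeding the WaveNet local-condition placeholder.
c_batch = np.interp(c_batch, T2_output_range, (0, 1))
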
Esempio n. 24
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""

	try:
		# Load the audio as numpy array
		wav = audio.load_wav(wav_path)
	except FileNotFoundError:  #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav)

	#[0, quantize_channels)
	out = mulaw_quantize(wav, hparams.quantize_channels)

	#Trim silences
	start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
	wav = wav[start: end]
	out = out[start: end]

	constant_values = mulaw_quantize(0, hparams.quantize_channels)
	out_dtype = np.int16

	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	#Compute the linear scale spectrogram from the wav
	linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
	linear_frames = linear_spectrogram.shape[1] 

	#sanity check
	assert linear_frames == mel_frames

	#Ensure time resolution adjustment between audio and mel-spectrogram
	l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

	#Zero pad for quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size()

	#time resolution adjustment
	#ensure length of raw audio is multiple of hop size so that we can use
	#transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size()]
	assert len(out) % audio.get_hop_size() == 0
	time_steps = len(out)

	# Write the spectrogram and audio to disk
	audio_filename = 'speech-audio-{:05d}.npy'.format(index)
	mel_filename = 'speech-mel-{:05d}.npy'.format(index)
	linear_filename = 'speech-linear-{:05d}.npy'.format(index)
	np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
	np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
	np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

	# Return a tuple describing this training example
	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
Esempio n. 25
0
    def _assert_ready_for_upsample(self, x, c):
        assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(
            self._hparams)
Esempio n. 26
0
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
	try:
		# Load the audio as numpy array
		wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	#rescale wav
	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav, hparams)

	#Mu-law quantize
	if is_mulaw_quantize(hparams.input_type):
		#[0, quantize_channels)
		out = mulaw_quantize(wav, hparams.quantize_channels)

		#Trim silences
		start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
		wav = wav[start: end]
		out = out[start: end]

		constant_values = mulaw_quantize(0, hparams.quantize_channels)
		out_dtype = np.int16

	elif is_mulaw(hparams.input_type):
		#[-1, 1]
		out = mulaw(wav, hparams.quantize_channels)
		constant_values = mulaw(0., hparams.quantize_channels)
		out_dtype = np.float32
	
	else:
		#[-1, 1]
		out = wav
		constant_values = 0.
		out_dtype = np.float32

	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
		return None

	#Compute the linear scale spectrogram from the wav
	linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
	linear_frames = linear_spectrogram.shape[1] 

	#sanity check
	assert linear_frames == mel_frames

	#Ensure time resolution adjustment between audio and mel-spectrogram
	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

	#Zero pad for quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

	#time resolution adjustment
	#ensure length of raw audio is multiple of hop size so that we can use
	#transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size(hparams)]
	assert len(out) % audio.get_hop_size(hparams) == 0
	time_steps = len(out)

	# Write the spectrogram and audio to disk
	audio_filename = 'speech-audio-{:05d}.npy'.format(index)
	mel_filename = 'speech-mel-{:05d}.npy'.format(index)
	linear_filename = 'speech-linear-{:05d}.npy'.format(index)
	np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
	np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
	np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

	# Return a tuple describing this training example
	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)