Code example #1
File: preprocessor.py  Project: duvtedudug/Tacotron-2
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
	"""
	Preprocesses a single utterance wav/text pair.

	This writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file.

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyperparameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
	"""
	try:
		# Load the audio as numpy array
		wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	#rescale wav
	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav, hparams)

	#Mu-law quantize
	if is_mulaw_quantize(hparams.input_type):
		#[0, quantize_channels)
		out = mulaw_quantize(wav, hparams.quantize_channels)

		#Trim silences
		start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
		wav = wav[start: end]
		out = out[start: end]

		constant_values = mulaw_quantize(0, hparams.quantize_channels)
		out_dtype = np.int16

	elif is_mulaw(hparams.input_type):
		#[-1, 1]
		out = mulaw(wav, hparams.quantize_channels)
		constant_values = mulaw(0., hparams.quantize_channels)
		out_dtype = np.float32
	
	else:
		#[-1, 1]
		out = wav
		constant_values = 0.
		out_dtype = np.float32

	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
		return None

	#Compute the linear scale spectrogram from the wav
	linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
	linear_frames = linear_spectrogram.shape[1] 

	#sanity check
	assert linear_frames == mel_frames

	#Ensure time resolution adjustment between audio and mel-spectrogram
	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

	#Zero pad for quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

	#time resolution adjustment
	#ensure length of raw audio is multiple of hop size so that we can use
	#transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size(hparams)]
	assert len(out) % audio.get_hop_size(hparams) == 0
	time_steps = len(out)

	# Write the spectrogram and audio to disk
	audio_filename = 'speech-audio-{:05d}.npy'.format(index)
	mel_filename = 'speech-mel-{:05d}.npy'.format(index)
	linear_filename = 'speech-linear-{:05d}.npy'.format(index)
	np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
	np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
	np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

	# Return a tuple describing this training example
	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
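
For context, a _process_utterance variant like this is normally driven by a build_from_path-style loop over a metadata file, one worker per utterance, and each returned tuple is later joined with '|' into a line of train.txt. The sketch below is a minimal, hypothetical driver; the metadata.csv column layout, the 'wavs' folder name and the build_from_path signature are assumptions, not taken from this project.

# Minimal hypothetical driver for the _process_utterance above (assumptions:
# metadata.csv uses '|'-separated columns with the wav id first and the text
# last, and the wav files live under <input_dir>/wavs).
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path(hparams, input_dir, mel_dir, linear_dir, wav_dir, n_jobs=4):
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = []
    with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            parts = line.strip().split('|')
            wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0]))
            text = parts[-1]
            futures.append(executor.submit(partial(
                _process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams)))
    # _process_utterance returns None for skipped utterances (missing or over-long wavs)
    results = [future.result() for future in futures]
    return [r for r in results if r is not None]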
Code example #2
File: preprocessor.py  Project: yqlihust/Tacotron-3
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel and linear spectrograms plus the processed audio to disk
    as a msgpack record and returns a tuple to write to the train.txt file.

    Args:
        - out_dir: the directory to write the msgpack into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyperparameters

    Returns:
        - A tuple: (npz_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides)

    # Pad the audio signal on both sides with constant values so its framing stays consistent with the spectrogram frames (mirroring librosa's framing)
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)
    npz_filename = '{}.npz'.format(index)
    r = hparams.outputs_per_step
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.
    # +2r for head and tail silence
    mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    target_length = len(linear_spec)
    target_frames = (target_length // r + 1) * r
    num_pad = target_frames - target_length
    if num_pad != 0:
        linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
        mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
    stop_token = np.concatenate(
        [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)],
        axis=0)
    data = {
        'mel': mel_spec,
        'linear': linear_spec,
        'audio': out.astype(out_dtype),
        'input_data': np.asarray(text_to_sequence(text)),
        'time_steps': time_steps,
        'mel_frames': target_frames,
        'text': text,
        'stop_token': stop_token,
    }
    dumps_msgpack(data, os.path.join(out_dir, npz_filename))
    # Return a tuple describing this training example
    return npz_filename, time_steps, mel_frames, text
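
The distinctive part of this variant is the padding of the spectrograms to a multiple of outputs_per_step and the stop-token target. The standalone sketch below reproduces just that logic so it can be inspected in isolation; the shapes and pad value are made up for the example.

# Standalone illustration of the frame padding / stop-token logic above
# (shapes and pad value are made up for the example).
import numpy as np

r = 3                                               # outputs_per_step
pad_value = -4.                                     # -max_abs_value when symmetric_mels
mel = np.random.rand(80, 50).astype(np.float32)     # (num_mels, mel_frames)

# r frames of "silence" on each side (+2r total), as in the snippet
mel_spec = np.pad(mel.T, [[r, r], [0, 0]], 'constant', constant_values=pad_value)

# round the frame count up to the next multiple of r
target_length = len(mel_spec)                       # 56
target_frames = (target_length // r + 1) * r        # 57 (note: this adds a full
                                                    #  extra r even when already divisible)
mel_spec = np.pad(mel_spec, ((0, target_frames - target_length), (0, 0)),
                  'constant', constant_values=pad_value)

# stop token target: 0 on every frame except the last
stop_token = np.concatenate([np.zeros(target_frames - 1, dtype=np.float32),
                             np.ones(1, dtype=np.float32)])
assert mel_spec.shape == (target_frames, 80) and stop_token.sum() == 1.0

Code example #3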
def run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers, Lf0s):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, reference_mels=args.reference_audio)

    if args.reference_audio is not None:
        print('reference_audio:', args.reference_audio)
        ref_wav = load_wav(args.reference_audio.strip(), hparams.sample_rate)
        reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T
    else:
        if hparams.use_style_encoder:
            print("*******************************")
            print(
                "TODO: add style weights when there is no reference audio. Now we use random weights, "
                + "which may generate unintelligible audio sometimes.")
            print("*******************************")
        else:
            #raise ValueError("You must set the reference audio if you don't want to use GSTs.")
            print("233")

    #Split inputs into batches
    ppgs = [
        ppgs[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(ppgs), hparams.tacotron_synthesis_batch_size)
    ]
    Lf0s = [
        Lf0s[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(Lf0s), hparams.tacotron_synthesis_batch_size)
    ]
    if args.reference_audio is not None:
        reference_mels = [reference_mel] * len(ppgs)

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:

        for i, texts in enumerate(tqdm(ppgs)):
            start = time.time()
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
            ]
            if args.reference_audio is not None:
                mel_filenames = synth.synthesize(texts, [speakers[i]],
                                                 basenames, eval_dir, log_dir,
                                                 None, [reference_mels[i]],
                                                 Lf0s[i])
            else:
                mel_filenames = synth.synthesize(texts, [speakers[i]],
                                                 basenames, eval_dir, log_dir,
                                                 None, None, Lf0s[i])

            for elems in zip(texts, mel_filenames, [speakers[i]]):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
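
run_eval splits the PPG and log-F0 inputs into fixed-size batches before synthesis. The toy snippet below shows the slicing idiom on its own (the values are made up):

# Toy illustration of the batch splitting applied to ppgs / Lf0s above.
batch_size = 4                        # hparams.tacotron_synthesis_batch_size
items = list(range(10))               # stand-in for the list of PPG features
batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
print(batches)                        # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]] - last batch may be smaller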
Code example #4
File: preprocessor.py  Project: Jim-Song/Tacotron-2
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams, speaker_id):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyperparameters
        - speaker_id: id of the speaker of this utterance

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, speaker_id)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    #Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, speaker_id)
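
Code examples #1 and #4 branch on is_mulaw_quantize / is_mulaw from the project's wavenet_vocoder utilities. For reference, the standard mu-law companding and quantization look roughly like the sketch below; this is a generic implementation, not the project's code, and the choice of mu = quantize_channels - 1 is one common convention that may differ from the project's.

# Generic mu-law companding / quantization, in the spirit of the mulaw() and
# mulaw_quantize() helpers used above (not the project's implementation).
import numpy as np

def mulaw(x, quantize_channels=256):
    """Compand a signal in [-1, 1] to [-1, 1]."""
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, quantize_channels=256):
    """Map a signal in [-1, 1] to integer bins in [0, quantize_channels)."""
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)                      # [-1, 1]
    return np.round((y + 1) / 2 * mu).astype(np.int16)   # [0, mu]

In the mu-law-quantized branch above, constant_values = mulaw_quantize(0, hparams.quantize_channels) is then simply the integer code for silence, which is why it is used as the padding value.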
Code example #5
File: train.py  Project: templeblock/AIvoices
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
    log('\nSaving intermediate states at step {}'.format(global_step))
    idx = 0
    y_hat, y, loss, length, input_mel, upsampled_features = sess.run([
        model.tower_y_hat_log[0][idx], model.tower_y_log[0][idx], model.loss,
        model.tower_input_lengths[0][idx], model.tower_c[0][idx],
        model.tower_upsampled_local_features[0][idx]
    ])

    #mask by length
    y_hat[length:] = 0
    y[length:] = 0

    #Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir,
                                 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir,
                                   'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir,
                             'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(
        plot_dir,
        'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(
        plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    #Save figure
    util.waveplot(plot_path,
                  y_hat,
                  y,
                  hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(
                      model_name, time_string(), global_step, loss))

    #Compare the mel of the generated wav with the original input mel to evaluate wavenet audio reconstruction performance.
    #Both mels should match on low-frequency information; the wavenet mel should contain more high-frequency detail than the Tacotron mels.
    T2_output_range = (-hparams.max_abs_value,
                       hparams.max_abs_value) if hparams.symmetric_mels else (
                           0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(
        generated_mel,
        mel_path,
        title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'
        .format(global_step, loss),
        target_spectrogram=input_mel.T)
    util.plot_spectrogram(
        upsampled_features.T,
        upsampled_path,
        title='Upsampled Local Condition features, step={}, loss={:.5f}'.
        format(global_step, loss),
        auto_aspect=True)

    #Save audio
    save_wavenet_wav(y_hat,
                     pred_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y,
                     target_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
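
Both save_log here and eval_step below recompute a mel spectrogram from the predicted waveform (melspectrogram(y_hat, hparams)) to compare it against the conditioning mel. melspectrogram() is a project helper; the sketch below is a rough standalone stand-in built directly on librosa, with parameter defaults borrowed from common Tacotron-2 hparams (these values are assumptions, and the project's version additionally applies pre-emphasis and its own normalization settings).

# Rough standalone stand-in for the project's melspectrogram() helper, using
# librosa directly (parameter values are assumed Tacotron-2-style defaults).
import numpy as np
import librosa

def log_mel(wav, sr=22050, n_fft=2048, hop_length=275, win_length=1100,
            num_mels=80, ref_level_db=20, min_level_db=-100):
    mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length,
                                         win_length=win_length,
                                         n_mels=num_mels, power=1.0)
    mel_db = 20 * np.log10(np.maximum(1e-5, mel)) - ref_level_db
    return np.clip((mel_db - min_level_db) / -min_level_db, 0, 1)  # normalized to [0, 1]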
Code example #6
File: train.py  Project: templeblock/AIvoices
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer,
              hparams, model_name):
    '''Evaluates the model during training.
    Assumes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss, input_mel, upsampled_features = sess.run([
        model.tower_y_hat[0], model.tower_y_target[0], model.eval_loss,
        model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0]
    ])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'
        .format(len(y_target), duration,
                len(y_target) / duration))

    #Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir,
                                 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir,
                                   'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir,
                             'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(
        plot_dir,
        'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(
        plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    #Save figure
    util.waveplot(plot_path,
                  y_hat,
                  y_target,
                  model._hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(
                      model_name, time_string(), global_step, loss))
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    #Compare the mel of the generated wav with the original input mel to evaluate wavenet audio reconstruction performance.
    #Both mels should match on low-frequency information; the wavenet mel should contain more high-frequency detail than the Tacotron mels.
    T2_output_range = (-hparams.max_abs_value,
                       hparams.max_abs_value) if hparams.symmetric_mels else (
                           0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(
        generated_mel,
        mel_path,
        title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'
        .format(global_step, loss),
        target_spectrogram=input_mel.T)
    util.plot_spectrogram(
        upsampled_features.T,
        upsampled_path,
        title='Upsampled Local Condition features, step={}, loss={:.5f}'.
        format(global_step, loss),
        auto_aspect=True)

    #Save Audio
    save_wavenet_wav(y_hat,
                     pred_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y_target,
                     target_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)

    #Write eval summary to tensorboard
    log('Writing eval summary!')
    add_test_stats(summary_writer, global_step, loss, hparams=hparams)
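
add_test_stats is defined elsewhere in the project's train.py; with the TF1-style summary_writer used here it would typically boil down to something like the following hypothetical sketch (the tag name is made up):

# Hypothetical shape of add_test_stats() with the TF1 summary API (tag name is
# an assumption; the real implementation lives in the project's train.py).
import tensorflow as tf

def add_test_stats(summary_writer, global_step, eval_loss, hparams=None):
    summary = tf.compat.v1.Summary(value=[
        tf.compat.v1.Summary.Value(tag='Wavenet_eval_model/eval_stats/eval_loss',
                                   simple_value=float(eval_loss)),
    ])
    summary_writer.add_summary(summary, global_step)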