def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format( wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start: end] out = out[start: end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad for quantized signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'speech-audio-{:05d}.npy'.format(index) mel_filename = 'speech-mel-{:05d}.npy'.format(index) linear_filename = 'speech-linear-{:05d}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def _process_utterance(out_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - out_dir: the directory to write the msgpack into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: # catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format( wav_path)) return None # Trim lead/trail silences if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) # Pre-emphasize preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) # rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max # Assert all audio is in [-1, 1] if (wav > 1.).any() or (wav < -1.).any(): raise RuntimeError('wav has invalid value: {}'.format(wav_path)) if (preem_wav > 1.).any() or (preem_wav < -1.).any(): raise RuntimeError('wav has invalid value: {}'.format(wav_path)) # [-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None # Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] # sanity check assert linear_frames == mel_frames # Ensure time resolution adjustement between audio and mel-spectrogram l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides) # Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) # time resolution adjustement # ensure length of raw audio is multiple of hop size so that we can use # transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) npz_filename = '{}.npz'.format(index) r = hparams.outputs_per_step if hparams.symmetric_mels: _pad_value = -hparams.max_abs_value else: _pad_value = 0. # +2r for head and tail silence mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value) linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value) target_length = len(linear_spec) target_frames = (target_length // r + 1) * r num_pad = target_frames - target_length if num_pad != 0: linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value) mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value) stop_token = np.concatenate( [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)], axis=0) data = { 'mel': mel_spec, 'linear': linear_spec, 'audio': out.astype(out_dtype), 'input_data': np.asarray(text_to_sequence(text)), 'time_steps': time_steps, 'mel_frames': target_frames, 'text': text, 'stop_token': stop_token, } dumps_msgpack(data, os.path.join(out_dir, npz_filename)) # Return a tuple describing this training example return npz_filename, time_steps, mel_frames, text
def run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers, Lf0s): eval_dir = os.path.join(output_dir, 'eval') log_dir = os.path.join(output_dir, 'logs-eval') if args.model == 'Tacotron-2': assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) #Create output path if it doesn't exist os.makedirs(eval_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True) os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) log(hparams_debug_string()) synth = Synthesizer() synth.load(checkpoint_path, hparams, reference_mels=args.reference_audio) if args.reference_audio is not None: print('reference_audio:', args.reference_audio) ref_wav = load_wav(args.reference_audio.strip(), hparams.sample_rate) reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T else: if hparams.use_style_encoder == True: print("*******************************") print( "TODO: add style weights when there is no reference audio. Now we use random weights, " + "which may generate unintelligible audio sometimes.") print("*******************************") else: #raise ValueError("You must set the reference audio if you don't want to use GSTs.") print("233") #Set inputs batch wise ppgs = [ ppgs[i:i + hparams.tacotron_synthesis_batch_size] for i in range(0, len(ppgs), hparams.tacotron_synthesis_batch_size) ] Lf0s = [ Lf0s[i:i + hparams.tacotron_synthesis_batch_size] for i in range(0, len(Lf0s), hparams.tacotron_synthesis_batch_size) ] if args.reference_audio is not None: reference_mels = [reference_mel] * len(ppgs) log('Starting Synthesis') with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: for i, texts in enumerate(tqdm(ppgs)): start = time.time() basenames = [ 'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts)) ] if args.reference_audio is not None: mel_filenames = synth.synthesize(texts, [speakers[i]], basenames, eval_dir, log_dir, None, [reference_mels[i]], Lf0s[i]) else: mel_filenames = synth.synthesize(texts, [speakers[i]], basenames, eval_dir, log_dir, None, None, Lf0s[i]) for elems in zip(texts, mel_filenames, [speakers[i]]): file.write('|'.join([str(x) for x in elems]) + '\n') log('synthesized mel spectrograms at {}'.format(eval_dir)) return eval_dir
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams, speaker_id): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad for quantized signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'audio-{}.npy'.format(index) mel_filename = 'mel-{}.npy'.format(index) linear_filename = 'linear-{}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, speaker_id)
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name): log('\nSaving intermediate states at step {}'.format(global_step)) idx = 0 y_hat, y, loss, length, input_mel, upsampled_features = sess.run([ model.tower_y_hat_log[0][idx], model.tower_y_log[0][idx], model.loss, model.tower_input_lengths[0][idx], model.tower_c[0][idx], model.tower_upsampled_local_features[0][idx] ]) #mask by length y_hat[length:] = 0 y[length:] = 0 #Make audio and plot paths pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step)) target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step)) plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) mel_path = os.path.join( plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step)) upsampled_path = os.path.join( plot_dir, 'step-{}-upsampled-features.png'.format(global_step)) #Save figure util.waveplot(plot_path, y_hat, y, hparams, title='{}, {}, step={}, loss={:.5f}'.format( model_name, time_string(), global_step, loss)) #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else ( 0, hparams.max_abs_value) generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range) util.plot_spectrogram( generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}' .format(global_step, loss), target_spectrogram=input_mel.T) util.plot_spectrogram( upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'. format(global_step, loss), auto_aspect=True) #Save audio save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name): '''Evaluate model during training. Supposes that model variables are averaged. ''' start_time = time.time() y_hat, y_target, loss, input_mel, upsampled_features = sess.run([ model.tower_y_hat[0], model.tower_y_target[0], model.eval_loss, model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0] ]) duration = time.time() - start_time log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)' .format(len(y_target), duration, len(y_target) / duration)) #Make audio and plot paths pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step)) target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step)) plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) mel_path = os.path.join( plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step)) upsampled_path = os.path.join( plot_dir, 'step-{}-upsampled-features.png'.format(global_step)) #Save figure util.waveplot(plot_path, y_hat, y_target, model._hparams, title='{}, {}, step={}, loss={:.5f}'.format( model_name, time_string(), global_step, loss)) log('Eval loss for global step {}: {:.3f}'.format(global_step, loss)) #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else ( 0, hparams.max_abs_value) generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range) util.plot_spectrogram( generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}' .format(global_step, loss), target_spectrogram=input_mel.T) util.plot_spectrogram( upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'. format(global_step, loss), auto_aspect=True) #Save Audio save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) #Write eval summary to tensorboard log('Writing eval summary!') add_test_stats(summary_writer, global_step, loss, hparams=hparams)