def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    """Write the evaluation artifacts for one sample at a given training step.

    Saves, in this order: the attention plot, the predicted mel spectrogram as
    a ``.npy`` file, two wavs produced by ``mel2wav`` (prediction and target),
    and a predicted-vs-target mel spectrogram plot. Finally prints the input
    sequence decoded with ``sequence_to_text``.

    ``plot_dir`` and ``mel_output_dir`` must provide ``joinpath``.
    ``wav_dir`` is accepted but not used by this function: the wavs are
    written to a ``log_wavs`` directory relative to the current working
    directory. Their file names contain only ``step`` (not ``sample_num``),
    so a later call with the same ``step`` writes to the same paths.
    """
    # Attention plot for this step/sample.
    attention_name = "attention_step_{}_sample_{}".format(step, sample_num)
    save_attention(attention, str(plot_dir.joinpath(attention_name)))

    # Predicted mel spectrogram dumped to disk (debug).
    mel_name = "mel-prediction-step-{}_sample_{}.npy".format(step, sample_num)
    np.save(str(mel_output_dir.joinpath(mel_name)), mel_prediction, allow_pickle=False)

    # Wavs for the predicted and the target mel (debug); prediction first.
    wav_root = 'log_wavs'
    os.makedirs(wav_root, exist_ok=True)
    for spectrogram, suffix in ((mel_prediction, '_pre.wav'),
                                (target_spectrogram, '_target.wav')):
        mel2wav(spectrogram, wav_name_path=os.path.join(wav_root, str(step) + suffix))

    # Real vs. predicted mel-spectrogram plot (control purposes).
    plot_name = "step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num)
    plot_path = plot_dir.joinpath(plot_name)
    plot_title = "{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss)
    max_frames = target_spectrogram.size // hparams.num_mels
    plot_spectrogram(mel_prediction, str(plot_path), title=plot_title,
                     target_spectrogram=target_spectrogram,
                     max_len=max_frames)

    decoded_input = sequence_to_text(input_seq)
    print("Input at step {}: {}".format(step, decoded_input))
def train(log_dir, args, hparams):
    """Train Tacotron, periodically writing summaries, eval artifacts and checkpoints.

    Creates the output directory tree under ``log_dir``, builds the data
    feeder plus the train-mode and test-mode models, optionally restores the
    latest checkpoint from ``<log_dir>/taco_pretrained``, and then runs the
    optimisation loop until ``args.tacotron_train_steps`` is reached or the
    coordinator requests a stop.

    Args:
        log_dir: Root directory for checkpoints, plots, wavs, mel dumps, eval
            outputs, tensorboard events and embedding metadata.
        args: Run options. This function reads ``gpu_id``,
            ``synthesizer_root``, ``module_name``, ``restore``,
            ``tacotron_train_steps``, ``checkpoint_interval``,
            ``summary_interval``, ``eval_interval`` and
            ``embedding_interval`` (and forwards ``args`` to the model
            builders).
        hparams: Hyper-parameters. This function reads
            ``tacotron_random_seed``, ``predict_linear``, ``sample_rate`` and
            ``outputs_per_step`` (and forwards ``hparams`` to helpers).

    Returns:
        The checkpoint directory (``<log_dir>/taco_pretrained``) when the
        training loop finishes. If an ``Exception`` is raised inside the
        session it is logged, the coordinator is asked to stop, and the
        function returns ``None``.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    # Output directory layout.
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder"):
        feeder = Feeder(coord, metadat_fpath, args.module_name, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    # Named test_model so the local does not shadow a module-level eval_model.
    test_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata: one symbol per line, written only if the file is missing.
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s
                f.write("{}\n".format(symbol))

    # The path handed to the projector has the log_dir prefix replaced by "..".
    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                # The optimize op is run for its side effect; its result is unused.
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                # Abort on divergence; handled by the outer except below.
                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(
                        loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             linear_loss, mel_p, mel_t, t_len, align, lin_p,
                             lin_t) = sess.run(
                                [
                                    test_model.tower_loss[0],
                                    test_model.tower_before_loss[0],
                                    test_model.tower_after_loss[0],
                                    test_model.tower_stop_token_loss[0],
                                    test_model.tower_linear_loss[0],
                                    test_model.tower_mel_outputs[0][0],
                                    test_model.tower_mel_targets[0][0],
                                    test_model.tower_targets_lengths[0][0],
                                    test_model.tower_alignments[0][0],
                                    test_model.tower_linear_outputs[0][0],
                                    test_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        # Only the outputs of the last eval batch are rendered.
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                "step-{}-eval-wave-from-linear.wav".format(
                                    step)),
                            sr=hparams.sample_rate)
                    else:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             mel_p, mel_t, t_len, align) = sess.run(
                                [
                                    test_model.tower_loss[0],
                                    test_model.tower_before_loss[0],
                                    test_model.tower_after_loss[0],
                                    test_model.tower_stop_token_loss[0],
                                    test_model.tower_mel_outputs[0][0],
                                    test_model.tower_mel_targets[0][0],
                                    test_model.tower_targets_lengths[0][0],
                                    test_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    # Best effort: a failure while averaging or rendering eval
                    # artifacts must not abort training. Only Exception is
                    # caught so that KeyboardInterrupt/SystemExit still
                    # propagate, and the cause is logged instead of being
                    # silently replaced by the -1 sentinels.
                    try:
                        eval_loss = sum(eval_losses) / len(eval_losses)
                        before_loss = sum(before_losses) / len(before_losses)
                        after_loss = sum(after_losses) / len(after_losses)
                        stop_token_loss = sum(stop_token_losses) / len(
                            stop_token_losses)

                        log("Saving eval log to {}..".format(eval_dir))
                        # Save some log to monitor model improvement on same unseen sequence
                        wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                "step-{}-eval-wave-from-mel.wav".format(step)),
                            sr=hparams.sample_rate)

                        plot.plot_alignment(
                            align,
                            os.path.join(
                                eval_plot_dir,
                                "step-{}-eval-align.png".format(step)),
                            title="{}, {}, step={}, loss={:.5f}".format(
                                "Tacotron", time_string(), step, eval_loss),
                            max_len=t_len // hparams.outputs_per_step)
                        plot.plot_spectrogram(
                            mel_p,
                            os.path.join(
                                eval_plot_dir,
                                "step-{}-eval-mel-spectrogram.png".format(step)),
                            title="{}, {}, step={}, loss={:.5f}".format(
                                "Tacotron", time_string(), step, eval_loss),
                            target_spectrogram=mel_t,
                            max_len=t_len)
                    except Exception as e:
                        log("Could not save eval log at step {}: {}".format(
                            step, e))
                        traceback.print_exc()
                        # Sentinel values reported to the eval summary below.
                        eval_loss = -1
                        before_loss = -1
                        after_loss = -1
                        stop_token_loss = -1

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                "step-{}-eval-linear-spectrogram.png".format(
                                    step)),
                            title="{}, {}, step={}, loss={:.5f}".format(
                                "Tacotron", time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(
                        step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss)

                if (step % args.checkpoint_interval == 0
                        or step == args.tacotron_train_steps
                        or step == 300):
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")

                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)

                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log("Input at step {}: {}".format(
                        step, sequence_to_text(input_seq)))

                if (step % args.embedding_interval == 0
                        or step == args.tacotron_train_steps
                        or step == 1):
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    log("\nSaving Model Character Embeddings visualization..")
                    add_embedding_stats(summary_writer,
                                        [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            # Top-level boundary: report, then ask the feeder threads to stop.
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)