def run_synthesis(args, checkpoint_path, output_dir, sentences):
    """Synthesize every sentence, conditioned on one reference utterance.

    Loads the checkpoint into a ``Synthesizer``, computes a mel spectrogram
    from ``args.reference_audio`` and, for each sentence, asks the
    synthesizer for a mel file, inverts it to a wav and stores the wav next
    to a text file holding the sentence. Outputs go to ``<output_dir>/gta``
    or ``<output_dir>/natural`` depending on ``args.GTA``.

    Args:
        args: Options object; ``input_dir``, ``GTA`` and ``reference_audio``
            are read.
        checkpoint_path: Checkpoint passed to ``Synthesizer.load``.
        output_dir: Root directory for the synthesis outputs.
        sentences: Iterable of texts to synthesize.
    """
    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    print(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, gta=args.GTA)

    reference_wav = load_wav(args.reference_audio)
    reference_mel = melspectrogram(reference_wav).transpose()

    # The training metadata is only used to report the size of the corpus.
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
    frame_shift = hparams.hop_size / hparams.sample_rate
    hours = sum(int(x[4]) for x in metadata) * frame_shift / 3600
    print('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    # Kept as an equality test so any non-bool value of args.GTA selects the
    # same directory as before.
    if args.GTA == True:  # noqa: E712
        synth_dir = os.path.join(output_dir, 'gta')
    else:
        synth_dir = os.path.join(output_dir, 'natural')

    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)
    os.makedirs(os.path.join(synth_dir, 'wavs/'), exist_ok=True)

    if hparams.predict_linear:
        # synth.synthesize() only returns the path of the saved mel, so no
        # linear spectrogram is available in this function. The previous code
        # read an undefined `linear` here and died with NameError on the first
        # sentence; report the limitation once instead.
        print('warning: predict_linear is set but run_synthesis only receives '
              'mel outputs; linear wavs are not written')

    print('starting synthesis')
    # map.txt is still created (empty) for compatibility; writing
    # 'text|mel_filename|mel_output_filename|wav_filename' rows is disabled.
    with open(os.path.join(synth_dir, 'map.txt'), 'w', encoding='utf-8'):
        for i, text in enumerate(tqdm(sentences)):
            index = i + 1
            mel_output_filename = synth.synthesize(
                text=text,
                index=index,
                out_dir=synth_dir,
                log_dir=None,
                mel_filename=None,
                reference_mel=reference_mel)

            mels = np.load(mel_output_filename)
            wav = audio.inv_mel_spectrogram(mels.T)
            audio.save_wav(
                wav,
                os.path.join(synth_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)))

            # Named text_file (not `tf`) to avoid shadowing the usual
            # TensorFlow alias; UTF-8 so non-ASCII sentences round-trip.
            text_path = os.path.join(synth_dir, 'wavs/speech-wav-{:05d}.txt'.format(index))
            with open(text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(text)

    print('synthesized mel spectrograms at {}'.format(synth_dir))
def synthesize(self, texts):
    """Run the model on ``texts`` and return a waveform for the first item.

    Each text is converted with ``text_to_sequence`` using the cleaners named
    in ``self._hparams.cleaners``; the batch is passed through
    ``self._prepare_inputs`` and fed to the session. Only the first linear
    output is inverted with ``audio.inv_linear_spectrogram`` and returned.
    """
    hp = self._hparams
    cleaners = [name.strip() for name in hp.cleaners.split(',')]

    sequences = []
    for text in texts:
        sequences.append(np.asarray(text_to_sequence(text, cleaners)))
    lengths = [len(sequence) for sequence in sequences]
    batch = self._prepare_inputs(sequences)

    feeds = {
        self.model.inputs: batch,
        self.model.input_lengths: np.asarray(lengths, dtype=np.int32),
    }
    # The stop-token prediction is fetched alongside the linear outputs but
    # is not used afterwards.
    fetches = [self.linear_outputs, self.stop_token_prediction]
    linears, _stop_token = self.session.run(fetches, feed_dict=feeds)

    return audio.inv_linear_spectrogram(linears[0].T, hp)
def train(log_dir, args):
    """Train the model selected by ``args.model`` until the coordinator stops.

    Builds the feeder and model graph, optionally restores the latest
    checkpoint from ``<log_dir>/pretrained/``, then loops: one optimisation
    step, periodic summaries, and periodic checkpoints accompanied by debug
    spectrograms, inverted wavs and plots written under ``log_dir``.

    Args:
        log_dir: Directory receiving checkpoints, plots, wavs, summaries and
            the ``step_counter.txt`` file.
        args: Options object; ``base_dir``, ``input``, ``model``, ``restore``,
            ``summary_interval`` and ``checkpoint_interval`` are read.

    Any exception raised inside the session (including the deliberate
    'Loss exploded' one) is logged and forwarded to the coordinator rather
    than propagated.
    """
    save_dir = os.path.join(log_dir, 'pretrained/')
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    # The checkpoint directory must exist before saver.save() is called.
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    # Set up model. A simple text file keeps count of the global step.
    step_counter_path = os.path.join(log_dir, 'step_counter.txt')
    step_count = 0
    try:
        with open(step_counter_path, 'r') as file:
            step_count = int(file.read())
    except (OSError, ValueError):
        # Missing/unreadable file or non-integer content: start from step 0.
        print('no step_counter file found, assuming there is no saved checkpoint')

    global_step = tf.Variable(step_count, name='global_step', trainable=False)
    with tf.variable_scope('model'):
        model = create_model(args.model, hparams)
        if hparams.predict_linear:
            model.initialize(feeder.inputs, feeder.input_lengths,
                             feeder.mel_targets, feeder.token_targets,
                             feeder.linear_targets)
        else:
            model.initialize(feeder.inputs, feeder.input_lengths,
                             feeder.mel_targets, feeder.token_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Saved model restoring. checkpoint_state must be bound even when
            # restoring is disabled; otherwise the check below raises
            # NameError and the 'Starting new training!' path is never taken.
            checkpoint_state = None
            if args.restore:
                # Restore saved model if the user requested it, Default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            elif not args.restore:
                log('Starting new training!')
            else:
                log('No model to load at {}'.format(save_dir))

            # Initiating feeder
            feeder.start_in_session(sess)

            # Training loop
            while not coord.should_stop():
                start_time = time.time()
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    with open(step_counter_path, 'w') as file:
                        file.write(str(step))
                    log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)

                    log('Saving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target = sess.run([
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.linear_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                        ])

                        # save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T, allow_pickle=False)

                        # save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(linear_prediction.T)
                        audio.save_wav(
                            wav,
                            os.path.join(wav_dir, 'step-{}-waveform-linear.wav'.format(step)))
                    else:
                        input_seq, mel_prediction, alignment, target = sess.run([
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T, allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir, 'step-{}-waveform-mel.wav'.format(step)))

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss))

                    # save real mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        target,
                        os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, Real'.format(
                            args.model, time_string(), step))

                    # save predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(
                            args.model, time_string(), step, loss))

                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

        except Exception as e:
            # Top-level boundary: report, dump the traceback and ask the
            # feeder threads to stop.
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def train(log_dir, args, hparams):
    """Train Tacotron for at most ``args.tacotron_train_steps`` steps.

    Builds the feeder plus a training and an evaluation model, optionally
    restores the latest checkpoint from ``<log_dir>/taco_pretrained``, then
    loops over optimisation steps with periodic summaries, evaluation runs
    over ``feeder.test_steps`` batches, and checkpoints accompanied by debug
    spectrograms, inverted wavs and plots.

    Args:
        log_dir: Root directory for checkpoints, plots, wavs, eval outputs,
            tensorboard events and embedding metadata.
        args: Options object; ``base_dir``, ``tacotron_input``, ``model``,
            ``restore``, ``tacotron_train_steps``, ``summary_interval``,
            ``eval_interval``, ``checkpoint_interval`` and
            ``embedding_interval`` are read.
        hparams: Hyper-parameters; among others ``predict_linear`` and
            ``tacotron_multi_speaker`` select which tensors are evaluated.

    Returns:
        ``save_dir`` when the training loop finishes. If an exception is
        raised inside the session it is logged, the coordinator is asked to
        stop and ``None`` is returned implicitly.
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    for directory in (save_dir, plot_dir, wav_dir, mel_dir, eval_dir,
                      eval_plot_dir, eval_wav_dir, tensorboard_dir,
                      meta_folder):
        os.makedirs(directory, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # TODO Visualize embeddings (character and speaker projector metadata).
    # Embeddings inputs metadata
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  # For visual purposes, swap space with \s
                f.write('{}\n'.format(symbol))
    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log('No model to load at {}'.format(save_dir), slack=True)
                        saver.save(sess, checkpoint_path, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None
                    speaker_losses = []
                    speaker_loss = None

                    # Fetches 0-7 are always present; optional fetches are
                    # appended after them.
                    eval_run = [
                        eval_model.tower_loss[0],
                        eval_model.tower_before_loss[0],
                        eval_model.tower_after_loss[0],
                        eval_model.tower_stop_token_loss[0],
                        eval_model.tower_mel_outputs[0][0],
                        eval_model.tower_mel_targets[0][0],
                        eval_model.tower_targets_lengths[0][0],
                        eval_model.tower_alignments[0][0],
                    ]
                    if hparams.predict_linear:
                        eval_run.append(eval_model.tower_linear_loss[0])
                        eval_run.append(eval_model.tower_linear_outputs[0][0])
                        eval_run.append(eval_model.tower_linear_targets[0][0])
                    if hparams.tacotron_multi_speaker:
                        # The position depends on whether the linear fetches
                        # were added, so remember it instead of assuming 11
                        # (which raised IndexError without predict_linear).
                        speaker_loss_index = len(eval_run)
                        eval_run.append(eval_model.tower_speaker_loss[0])

                    for _ in tqdm(range(feeder.test_steps)):
                        blob = sess.run(eval_run)
                        eval_losses.append(blob[0])
                        before_losses.append(blob[1])
                        after_losses.append(blob[2])
                        stop_token_losses.append(blob[3])
                        # Only the last batch's arrays survive the loop and
                        # are used for the wavs/plots below.
                        mel_p = blob[4]
                        mel_t = blob[5]
                        t_len = blob[6]
                        align = blob[7]
                        if hparams.predict_linear:
                            linear_losses.append(blob[8])
                            lin_p = blob[9]
                            lin_t = blob[10]
                        if hparams.tacotron_multi_speaker:
                            speaker_losses.append(blob[speaker_loss_index])

                    if hparams.predict_linear:
                        linear_loss = sum(linear_losses) / len(linear_losses)
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                'step-{}-eval-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                    if hparams.tacotron_multi_speaker:
                        speaker_loss = sum(speaker_losses) / len(speaker_losses)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                'step-{}-eval-linear-spectrogram.png'.format(step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss, speaker_loss)

                if (step % args.checkpoint_interval == 0
                        or step == args.tacotron_train_steps
                        or step == 300):
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
                    if hparams.predict_linear:
                        (input_seq, mel_prediction, linear_prediction,
                         alignment, target, target_length,
                         linear_target) = sess.run([
                             model.tower_inputs[0][0],
                             model.tower_mel_outputs[0][0],
                             model.tower_linear_outputs[0][0],
                             model.tower_alignments[0][0],
                             model.tower_mel_targets[0][0],
                             model.tower_targets_lengths[0][0],
                             model.tower_linear_targets[0][0],
                         ])

                        # save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T, allow_pickle=False)

                        # save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(
                            linear_prediction.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                        # Save real and predicted linear-spectrogram plot to disk (control purposes)
                        plot.plot_spectrogram(
                            linear_prediction,
                            os.path.join(
                                plot_dir,
                                'step-{}-linear-spectrogram.png'.format(step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, loss),
                            target_spectrogram=linear_target,
                            max_len=target_length,
                            auto_aspect=True)
                    else:
                        (input_seq, mel_prediction, alignment, target,
                         target_length) = sess.run([
                             model.tower_inputs[0][0],
                             model.tower_mel_outputs[0][0],
                             model.tower_alignments[0][0],
                             model.tower_mel_targets[0][0],
                             model.tower_targets_lengths[0][0],
                         ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T, allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)

                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    # TODO Find a way to revert encoded IPA to original IPA or
                    # original text, then log the input sequence again.

                if (step % args.embedding_interval == 0
                        or step == args.tacotron_train_steps
                        or step == 1):
                    # Get current checkpoint state. It is only needed by the
                    # projector update, which is currently disabled.
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # TODO Visualize embeddings: re-enable add_embedding_stats
                    # with char_embedding_meta and
                    # checkpoint_state.model_checkpoint_path.
                    log('\nSaving Model Character Embeddings visualization..')
                    log('Tacotron Character embeddings have been updated on tensorboard!')

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            # Top-level boundary: report, dump the traceback and ask the
            # feeder threads to stop.
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, mel_reference_filenames=None):
    """Synthesize a batch of texts and save one mel spectrogram per text.

    NOTE: ``texts`` and the other list arguments are padded IN PLACE by
    repeating their last element until the batch size is a multiple of
    ``hparams.tacotron_synthesis_batch_size``; callers see the extra items.

    Args:
        texts: List of input strings.
        basenames: List of names used in output file names, or ``None`` to
            play the first synthesized utterance instead of saving files.
        out_dir: Directory receiving the ``mel-<basename>.npy`` files.
        log_dir: If not ``None``, wavs and plots are also written under it.
        mel_filenames: Ground-truth mel files, loaded when ``self.gta`` is set.
        mel_reference_filenames: Optional reference mel files fed to
            ``self.mel_references``.

    Returns:
        ``(saved_mels_paths, speaker_ids)``, or ``None`` when ``basenames``
        is ``None``.

    Raises:
        RuntimeError: If ``hparams.gin_channels > 0`` (no speaker-id rule is
            implemented).
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

    # Repeat last sample until number of samples is dividable by the number
    # of GPUs (last run scenario)
    while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
        texts.append(texts[-1])
        if basenames is not None:
            basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])
        if mel_reference_filenames is not None:
            mel_reference_filenames.append(mel_reference_filenames[-1])

    assert 0 == len(texts) % self._hparams.tacotron_num_gpus

    if not self._hparams.tacotron_phoneme_transcription:
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names))
            for text in texts
        ]
    else:
        seqs = [
            np.asarray(ipa_to_articulatory_sequence(text), dtype=np.int32)
            for text in texts
        ]
    input_lengths = [len(seq) for seq in seqs]

    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        if input_seqs is None:
            input_seqs = device_input
        else:
            input_seqs = np.concatenate((input_seqs, device_input), axis=1)
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if mel_reference_filenames is not None:
        np_references = [
            np.load(mel_reference_filename)
            for mel_reference_filename in mel_reference_filenames
        ]

        # pad references according to each GPU max length
        reference_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_reference = np_references[size_per_device * i:size_per_device * (i + 1)]
            device_reference, _ = self._prepare_targets(
                device_reference, self._hparams.outputs_per_step)
            if reference_seqs is None:
                reference_seqs = device_reference
            else:
                reference_seqs = np.concatenate(
                    (reference_seqs, device_reference), axis=1)
        feed_dict[self.mel_references] = reference_seqs

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]

        # pad targets according to each GPU max length
        target_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(
                device_target, self._hparams.outputs_per_step)
            if target_seqs is None:
                target_seqs = device_target
            else:
                target_seqs = np.concatenate(
                    (target_seqs, device_target), axis=1)
            # Not really used but setting it in case for future development maybe?
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    # Stays None on the mel-only path so the logging code below can tell
    # whether linear outputs were actually fetched.
    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        if not self.gta:
            # Natural batch synthesis
            # Get Mel lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

        # Take off the batch wise padding
        mels = [
            mel[:target_length, :]
            for mel, target_length in zip(mels, target_lengths)
        ]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [
                self.linear_outputs, self.mel_outputs, self.alignments,
                self.stop_token_prediction
            ],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        linears = [linear for gpu_linear in linears for linear in gpu_linear]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        # Natural batch synthesis. Stop-token based length detection
        # (self._get_output_lengths) is disabled here; 9999 effectively keeps
        # every frame. One entry per utterance is required: a single-element
        # list made zip() drop all but the first mel and tripped the assert
        # below for any batch larger than one.
        target_lengths = [9999] * len(mels)

        # Take off the batch wise padding
        mels = [
            mel[:target_length, :]
            for mel, target_length in zip(mels, target_lengths)
        ]
        linears = [
            linear[:target_length, :]
            for linear, target_length in zip(linears, target_lengths)
        ]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        # Generate wav and read it. `mels` is a list at this point, so the
        # first utterance is the one that gets played.
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        chunk = 512
        # Context manager / finally blocks release the file and the audio
        # device even if playback fails.
        with wave.open('temp.wav', 'rb') as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(
                    format=p.get_format_from_width(f.getsampwidth()),
                    channels=f.getnchannels(),
                    rate=f.getframerate(),
                    output=True)
                try:
                    data = f.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                p.terminate()
        return

    saved_mels_paths = []
    speaker_ids = []
    for i, mel in enumerate(mels):
        # Get speaker id for global conditioning (only used with GTA generally)
        if hparams.gin_channels > 0:
            # No rule for deriving the speaker id exists yet (the basename
            # could be used); fail loudly instead of guessing.
            raise RuntimeError(
                'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
            )
        speaker_ids.append('<no_g>')

        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                sr=hparams.sample_rate)

            # save alignments
            plot.plot_alignment(
                alignments[i],
                os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True,
                max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True)

            # Linear outputs are only fetched outside GTA mode; skip instead
            # of failing with NameError when they are absent.
            if hparams.predict_linear and linears is not None:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                    sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(
                    linears[i],
                    os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]),
                    split_title=True,
                    auto_aspect=True)

    return saved_mels_paths, speaker_ids
def synthesize(self, text, index, out_dir, log_dir, mel_filename):
    """Synthesize one text and save its mel spectrogram.

    Args:
        text: Input string, converted with ``text_to_sequence``.
        index: Identifier used in output file names, or ``None`` to play the
            synthesized audio instead of saving files.
        out_dir: Directory receiving ``mel-<index>.npy``.
        log_dir: If not ``None``, wavs and plots are also written under it.
        mel_filename: Ground-truth mel file, loaded when ``self.gta`` is set.

    Returns:
        ``(mel_filename, speaker_id)`` for the saved mel, or ``None`` when
        ``index`` is ``None``.

    Raises:
        RuntimeError: If ``hparams.gin_channels > 0`` (no speaker-id rule is
            implemented).
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        # Use the configured channel count (was a hard-coded 80) so this
        # agrees with the reshape of the predicted mels below.
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
            1, -1, hparams.num_mels)

    # Stays None on the mel-only path so the logging code below can tell
    # whether a linear output was actually fetched.
    linear = None
    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run(
            [self.mel_outputs, self.alignment], feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    if index is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        chunk = 512
        # Context manager / finally blocks release the file and the audio
        # device even if playback fails.
        with wave.open('temp.wav', 'rb') as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(
                    format=p.get_format_from_width(f.getsampwidth()),
                    channels=f.getnchannels(),
                    rate=f.getframerate(),
                    output=True)
                try:
                    data = f.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                p.terminate()
        return

    # Get speaker id for global conditioning (only used with GTA generally)
    if hparams.gin_channels > 0:
        # No rule for deriving the speaker id exists yet; fail loudly
        # instead of guessing.
        raise RuntimeError(
            'Please set the speaker_id rule in line 89 of tacotron/synthesizer.py to allow for global condition usage later.'
        )
    speaker_id = '<no_g>'

    # Write the spectrogram to disk
    # Note: outputs mel-spectrogram files and target ones have same names, just different folders
    mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        # save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(
            wav,
            os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(index)),
            sr=hparams.sample_rate)

        # The linear output is not fetched in GTA mode; skip instead of
        # failing with an unbound-name error.
        if hparams.predict_linear and linear is not None:
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linear.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(index)),
                sr=hparams.sample_rate)

        # save alignments
        plot.plot_alignment(
            alignment,
            os.path.join(log_dir, 'plots/alignment-{}.png'.format(index)),
            info='{}'.format(text),
            split_title=True)

        # save mel spectrogram plot
        plot.plot_spectrogram(
            mels,
            os.path.join(log_dir, 'plots/mel-{}.png'.format(index)),
            info='{}'.format(text),
            split_title=True)

    return mel_filename, speaker_id
def synthesize(self, text, index, out_dir, log_dir, mel_filename):
    """Synthesize one sentence and either play it back or save it to disk.

    Args:
        text: Sentence to synthesize.
        index: Integer used in the zero-padded output file names. When
            ``None`` the audio is written to ``temp.wav`` and played through
            PyAudio instead of being saved to ``out_dir``.
        out_dir: Directory that receives ``speech-mel-<index>.npy``.
        log_dir: Optional directory (with ``wavs/`` and ``plots/``
            sub-folders) for debug audio and plots; ``None`` disables them.
        mel_filename: Path of the ground-truth mel ``.npy`` file. Only read
            when ``self.gta`` is true.

    Returns:
        Path of the saved mel spectrogram, or ``None`` when ``index`` is
        ``None``.
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        # Use the configured mel channel count instead of a hard-coded 80 so
        # the target shape agrees with the reshape of the model output below.
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
            1, -1, hparams.num_mels)

    # Stays None whenever linear outputs are not fetched (GTA mode or
    # predict_linear disabled) so the logging code below can test for it.
    linear = None
    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run(
            [self.mel_outputs, self.alignment], feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  #Thanks to @imdatsolak for pointing this out

    if index is None:
        #Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  #Find a better way

        chunk = 512
        # Context manager / finally blocks make sure the wav file handle and
        # the audio device are released even if playback fails.
        with wave.open('temp.wav', 'rb') as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                                channels=f.getnchannels(),
                                rate=f.getframerate(),
                                output=True)
                try:
                    data = f.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                p.terminate()
        return

    # Write the spectrogram to disk
    # Note: outputs mel-spectrogram files and target ones have same names, just different folders
    mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        #save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav,
                       os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
                       sr=hparams.sample_rate)

        # Only available when the linear outputs were actually fetched above;
        # testing hparams.predict_linear alone raised NameError in GTA mode.
        if linear is not None:
            #save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linear.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                           sr=hparams.sample_rate)

        #save alignments
        plot.plot_alignment(alignment,
                            os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
                            info='{}'.format(text),
                            split_title=True)

        #save mel spectrogram plot
        plot.plot_spectrogram(mels,
                              os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
                              info='{}'.format(text),
                              split_title=True)

    return mel_filename
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    """Synthesize a batch of sentences, split across the configured GPUs.

    Args:
        texts: List of sentences. It is padded IN PLACE (last item repeated)
            until its length is a multiple of
            ``hparams.tacotron_synthesis_batch_size``.
        basenames: List of identifiers used in output file names, padded in
            place alongside ``texts``. When ``None`` the first result is
            written to ``temp.wav`` and played with an OS command instead of
            being saved.
        out_dir: Directory that receives ``mel-<basename>.npy`` files.
        log_dir: Optional directory (with ``wavs/`` and ``plots/``
            sub-folders) for debug audio and plots; ``None`` disables them.
        mel_filenames: Ground-truth mel ``.npy`` paths (read only when
            ``self.gta`` is true); may be ``None``. Padded in place too.

    Returns:
        ``(saved_mels_paths, speaker_ids)``, one entry per (padded) input, or
        ``None`` when ``basenames`` is ``None``.

    Raises:
        RuntimeError: On an unsupported OS in playback mode, or when
            ``hparams.gin_channels > 0`` (no speaker-id rule implemented).
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

    #[-max, max] or [0,max]
    if hparams.symmetric_mels:
        T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value)
    else:
        T2_output_range = (0, hparams.max_abs_value)

    #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
    while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
        texts.append(texts[-1])
        # basenames is None in playback mode; it must not be appended to.
        if basenames is not None:
            basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    num_gpus = hparams.tacotron_num_gpus
    assert 0 == len(texts) % num_gpus
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // num_gpus

    #Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        if input_seqs is None:
            input_seqs = device_input
        else:
            input_seqs = np.concatenate((input_seqs, device_input), axis=1)
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]

        #pad targets according to each GPU max length
        target_seqs = None
        for i in range(num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(
                device_target, hparams.outputs_per_step)
            if target_seqs is None:
                target_seqs = device_target
            else:
                target_seqs = np.concatenate((target_seqs, device_target), axis=1)
            #Not really used but setting it in case for future development maybe?
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    # Stays None whenever linear outputs are not fetched (GTA mode or
    # predict_linear disabled) so the logging code below can test for it.
    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        if not self.gta:
            #Natural batch synthesis
            #Get Mel lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

        #Take off the batch wise padding
        mels = [mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments,
             self.stop_token_prediction],
            feed_dict=feed_dict)

        #Linearize outputs (1D arrays)
        linears = [linear for gpu_linear in linears for linear in gpu_linear]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        #Natural batch synthesis
        #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
        target_lengths = self._get_output_lengths(stop_tokens)

        #Take off the batch wise padding
        mels = [mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :]
                   for linear, target_length in zip(linears, target_lengths)]
        # Clip each spectrogram on its own: after trimming, the arrays may
        # have different lengths and cannot be clipped as one stacked array.
        linears = [np.clip(linear, T2_output_range[0], T2_output_range[1])
                   for linear in linears]
        assert len(mels) == len(linears) == len(texts)

    # Same per-item clipping for the (possibly ragged) mel list.
    mels = [np.clip(mel, T2_output_range[0], T2_output_range[1]) for mel in mels]

    if basenames is None:
        #Generate wav and read it
        if hparams.GL_on_GPU:
            wav = self.session.run(self.GLGPU_mel_outputs,
                                   feed_dict={self.GLGPU_mel_inputs: mels[0]})
            wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        else:
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  #Find a better way

        if platform.system() == 'Linux':
            #Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            #windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!'
            )
        return

    saved_mels_paths = []
    speaker_ids = []
    for i, mel in enumerate(mels):
        #Get speaker id for global conditioning (only used with GTA generally)
        if hparams.gin_channels > 0:
            #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
            #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
            raise RuntimeError(
                'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
            )
        speaker_id = '<no_g>'
        speaker_ids.append(speaker_id)

        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is None:
            continue

        #save wav (mel -> wav)
        if hparams.GL_on_GPU:
            wav = self.session.run(self.GLGPU_mel_outputs,
                                   feed_dict={self.GLGPU_mel_inputs: mel})
            wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        else:
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
        audio.save_wav(wav,
                       os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                       sr=hparams.sample_rate)

        #save alignments
        plot.plot_alignment(alignments[i],
                            os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                            title='{}'.format(texts[i]),
                            split_title=True,
                            max_len=target_lengths[i])

        #save mel spectrogram plot
        plot.plot_spectrogram(mel,
                              os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                              title='{}'.format(texts[i]),
                              split_title=True)

        # Only available when the linear outputs were actually fetched above;
        # testing hparams.predict_linear alone raised NameError in GTA mode.
        if linears is not None:
            #save wav (linear -> wav)
            if hparams.GL_on_GPU:
                wav = self.session.run(self.GLGPU_lin_outputs,
                                       feed_dict={self.GLGPU_lin_inputs: linears[i]})
                wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
            else:
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            #save linear spectrogram plot
            plot.plot_spectrogram(linears[i],
                                  os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True,
                                  auto_aspect=True)

    return saved_mels_paths, speaker_ids
def train(log_dir, args, hparams):
    """Run the Tacotron training loop with periodic summaries, eval and checkpoints.

    Args:
        log_dir: Root directory for checkpoints, plots, wavs, spectrograms
            and evaluation outputs (sub-directories are created here).
        args: Parsed options; this function reads ``base_dir``,
            ``tacotron_input``, ``model``, ``restore``,
            ``tacotron_train_steps``, ``summary_interval``,
            ``eval_interval`` and ``checkpoint_interval``.
        hparams: Hyper-parameter object passed to the feeder, the models and
            the audio helpers.

    Returns:
        The checkpoint directory (``<log_dir>/taco_pretrained/``) when the
        loop ends normally. If an exception is raised inside the session it
        is logged, the coordinator is asked to stop, and ``None`` is returned.
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained/')
    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    # The checkpoint directory is created as well so that saving the first
    # checkpoint does not depend on it already existing.
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            #saved model restoring
            # Must be bound before the check below: it was previously left
            # undefined when args.restore was false (or the lookup raised),
            # which aborted training with a NameError.
            checkpoint_state = None
            if args.restore:
                #Restore saved model if the user requested it, Default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            elif not args.restore:
                log('Starting new training!')
            else:
                log('No model to load at {}'.format(save_dir))

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    #Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss, linear_loss,
                             mel_p, mel_t, t_len, align, lin_p) = sess.run([
                                 eval_model.loss, eval_model.before_loss,
                                 eval_model.after_loss, eval_model.stop_token_loss,
                                 eval_model.linear_loss, eval_model.mel_outputs[0],
                                 eval_model.mel_targets[0], eval_model.targets_lengths[0],
                                 eval_model.alignments[0], eval_model.linear_outputs[0]])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        # Uses the prediction from the last eval batch.
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(wav,
                                       os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)),
                                       sr=hparams.sample_rate)
                    else:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             mel_p, mel_t, t_len, align) = sess.run([
                                 eval_model.loss, eval_model.before_loss,
                                 eval_model.after_loss, eval_model.stop_token_loss,
                                 eval_model.mel_outputs[0], eval_model.mel_targets[0],
                                 eval_model.targets_lengths[0], eval_model.alignments[0]])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    #Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)),
                                   sr=hparams.sample_rate)

                    plot.plot_alignment(align,
                                        os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
                                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss),
                                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(mel_p,
                                          os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
                                          info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, eloss),
                                          target_spectrogram=mel_t,
                                          max_len=t_len)

                    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss, before_loss,
                                   after_loss, stop_token_loss, eval_loss)

                if step % args.checkpoint_interval == 0:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
                    if hparams.predict_linear:
                        (input_seq, mel_prediction, linear_prediction,
                         alignment, target, target_length) = sess.run([
                             model.inputs[0],
                             model.mel_outputs[0],
                             model.linear_outputs[0],
                             model.alignments[0],
                             model.mel_targets[0],
                             model.targets_lengths[0],
                         ])

                        #save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T, allow_pickle=False)

                        #save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
                        audio.save_wav(wav,
                                       os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)),
                                       sr=hparams.sample_rate)
                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run([
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                            model.targets_lengths[0],
                        ])

                    #save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

                    #save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)),
                                   sr=hparams.sample_rate)

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(alignment,
                                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
                                        max_len=target_length // hparams.outputs_per_step)
                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(mel_prediction,
                                          os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
                                          info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss),
                                          target_spectrogram=target,
                                          max_len=target_length)
                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

            log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps))
            return save_dir

        except Exception as e:
            # Top-level boundary: report the failure and stop the feeder threads.
            log('Exiting due to exception: {}'.format(e))
            traceback.print_exc()
            coord.request_stop(e)
def synthesize(self, text, index, out_dir, log_dir, mel_filename, reference_mel):
    """Synthesize one sentence conditioned on a reference mel and save it.

    Note: ``hparams`` is not bound locally here; the module-level
    ``hparams`` object is used.

    Args:
        text: Sentence to synthesize.
        index: Integer used in the zero-padded output file names.
        out_dir: Directory that receives ``speech-mel-<index>.npy``.
        log_dir: Optional directory (with ``wavs/`` and ``plots/``
            sub-folders) for debug audio and plots; ``None`` disables them.
        mel_filename: Path of the ground-truth mel ``.npy`` file. Only read
            when ``self.gta`` is true, where it is fed both as the mel
            target and as the reference mel.
        reference_mel: Reference spectrogram fed to the model when not in
            GTA mode and ``hparams.use_vae`` is set.

    Returns:
        Path of the saved mel spectrogram.
    """
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }

    if self.gta:
        # Load the file once and feed it to both placeholders. The configured
        # mel channel count replaces a hard-coded 80 so the shape agrees with
        # the reshape of the model output below.
        target_mel = np.load(mel_filename).reshape(1, -1, hparams.num_mels)
        feed_dict[self.model.mel_targets] = target_mel
        feed_dict[self.model.reference_mel] = target_mel
    elif hparams.use_vae:
        reference_mel = [np.asarray(reference_mel, dtype=np.float32)]
        feed_dict[self.model.reference_mel] = reference_mel

    # Stays None whenever linear outputs are not fetched (GTA mode or
    # predict_linear disabled) so the logging code below can test for it.
    linear = None
    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run(
            [self.mel_outputs, self.alignment], feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  #Thanks to @imdatsolak for pointing this out

    # Write the spectrogram to disk
    # Note: outputs mel-spectrogram files and target ones have same names, just different folders
    mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        #save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mels.T)
        audio.save_wav(
            wav,
            os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)))

        # Only available when the linear outputs were actually fetched above;
        # testing hparams.predict_linear alone raised NameError in GTA mode.
        if linear is not None:
            #save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linear.T)
            audio.save_wav(
                wav,
                os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)))

        #save alignments
        plot.plot_alignment(
            alignment,
            os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
            info='{}'.format(text),
            split_title=True)

        #save mel spectrogram plot
        plot.plot_spectrogram(
            mels,
            os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
            info='{}'.format(text),
            split_title=True)

    return mel_filename
def synthesize(self, texts, speakers, basenames, out_dir, log_dir,
               mel_filenames, reference_mels, Lf0s):
    """Synthesize a multi-speaker batch with optional reference mels and lf0.

    All list arguments that are not ``None`` are padded IN PLACE (last item
    repeated) until the batch length is a multiple of
    ``hparams.tacotron_synthesis_batch_size``.

    Args:
        texts: List of input sequences. They are used as-is as model inputs
            (the text-to-sequence conversion is disabled in this variant).
        speakers: List of integer speaker ids, one per item.
        basenames: List of identifiers used in output file names. When
            ``None`` the first result is written to ``temp.wav`` and played
            through PyAudio instead of being saved.
        out_dir: Directory that receives ``mel-<basename>.npy`` files.
        log_dir: Optional directory (with ``wavs/`` and ``plots/``
            sub-folders) for debug audio and plots; ``None`` disables them.
        mel_filenames: Ground-truth mel ``.npy`` paths (read only when
            ``self.gta`` is true); may be ``None``.
        reference_mels: Optional list of reference spectrograms.
        Lf0s: Optional list of lf0 feature arrays, one per item.

    Returns:
        List of saved mel spectrogram paths, or ``None`` when ``basenames``
        is ``None``.
    """
    hparams = self._hparams
    num_gpus = hparams.tacotron_num_gpus
    # Stop-token based trimming is disabled in this variant; this large
    # bound keeps every frame when slicing.
    no_trim_length = 9999

    #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
    while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
        texts.append(texts[-1])
        # basenames is None in playback mode; it must not be appended to.
        if basenames is not None:
            basenames.append(basenames[-1])
        speakers.append(speakers[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])
        if reference_mels is not None:
            reference_mels.append(reference_mels[-1])
        # Keep lf0 features aligned with the padded batch; otherwise the
        # length assertion further down fails after padding.
        if Lf0s is not None:
            Lf0s.append(Lf0s[-1])

    assert 0 == len(texts) % num_gpus
    seqs = texts
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // num_gpus

    #Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        if input_seqs is None:
            input_seqs = device_input
        else:
            input_seqs = np.concatenate((input_seqs, device_input), axis=1)
        split_infos.append([max_seq_len, 0, 0, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        self.speakers: np.asarray(speakers, dtype=np.int32)
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]

        #pad targets according to each GPU max length
        target_seqs = None
        for i in range(num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(
                device_target, hparams.outputs_per_step)
            if target_seqs is None:
                target_seqs = device_target
            else:
                target_seqs = np.concatenate((target_seqs, device_target), axis=1)
            #Not really used but setting it in case for future development maybe?
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    if reference_mels is not None:
        np_refs = [np.asarray(reference_mel) for reference_mel in reference_mels]
        reference_lengths = [len(np_ref) for np_ref in np_refs]

        ref_seqs = None
        for i in range(num_gpus):
            device_ref = np_refs[size_per_device * i:size_per_device * (i + 1)]
            device_ref, max_ref_len = self._prepare_targets(
                device_ref, hparams.outputs_per_step)
            if ref_seqs is None:
                ref_seqs = device_ref
            else:
                ref_seqs = np.concatenate((ref_seqs, device_ref), axis=1)
            split_infos[i][-1] = max_ref_len

        feed_dict[self.reference_mels] = ref_seqs
        feed_dict[self.reference_lengths] = reference_lengths
        assert len(np_refs) == len(texts)

    # 2020-07-24: added lf0 input
    if Lf0s is not None:
        np_Lf0s = [np.asarray(Lf0) for Lf0 in Lf0s]

        Lf0_seqs = None
        for i in range(num_gpus):
            device_Lf0 = np_Lf0s[size_per_device * i:size_per_device * (i + 1)]
            # Pad to this device's own input length (recorded in
            # split_infos[i][0]) so framing differences cannot produce
            # mismatched lengths. The variable left over from the input loop
            # only held the LAST device's length.
            device_Lf0, max_Lf0_len = self._prepare_F0_inputs(
                device_Lf0, split_infos[i][0])
            if Lf0_seqs is None:
                Lf0_seqs = device_Lf0
            else:
                Lf0_seqs = np.concatenate((Lf0_seqs, device_Lf0), axis=1)
            # NOTE(review): this writes the same slot as the reference-mel
            # length above, so it overwrites it when both are given —
            # confirm which split_infos column the model expects for lf0.
            split_infos[i][-1] = max_Lf0_len

        feed_dict[self.Lf0s] = Lf0_seqs
        assert len(np_Lf0s) == len(texts)
        # NOTE(review): looks like a leftover debug check on the last lf0
        # dimension; kept so runtime output is unchanged.
        if Lf0_seqs.shape[-1] != 2:
            print(2333)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    # Stays None whenever linear outputs are not fetched (GTA mode or
    # predict_linear disabled) so the logging code below can test for it.
    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        #Linearize outputs (1D arrays)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        # One length per spectrogram: a single-element list made zip() drop
        # every item after the first and broke target_lengths[i] below.
        target_lengths = [no_trim_length] * len(mels)

        #Take off the batch wise padding
        mels = [mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments,
             self.stop_token_prediction],
            feed_dict=feed_dict)

        #Linearize outputs (1D arrays)
        linears = [linear for gpu_linear in linears for linear in gpu_linear]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        #Natural batch synthesis
        # One length per spectrogram (see the comment in the branch above).
        target_lengths = [no_trim_length] * len(mels)

        #Take off the batch wise padding
        mels = [mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :]
                   for linear, target_length in zip(linears, target_lengths)]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        #Generate wav and read it
        # mels is a list of arrays, so play the first item of the batch.
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  #Find a better way

        chunk = 512
        # Context manager / finally blocks make sure the wav file handle and
        # the audio device are released even if playback fails.
        with wave.open('temp.wav', 'rb') as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                                channels=f.getnchannels(),
                                rate=f.getframerate(),
                                output=True)
                try:
                    data = f.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                p.terminate()
        return

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is None:
            continue

        #save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mel.T, hparams)
        audio.save_wav(wav,
                       os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                       sr=hparams.sample_rate)

        #save alignments
        plot.plot_alignment(
            alignments[i],
            os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
            title='speaker_id = {:d}'.format(speakers[i]),
            split_title=True,
            max_len=target_lengths[i])

        #save mel spectrogram plot
        plot.plot_spectrogram(
            mel,
            os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
            title='speaker_id = {:d}'.format(speakers[i]),
            split_title=True)

        # Only available when the linear outputs were actually fetched above;
        # testing hparams.predict_linear alone raised NameError in GTA mode.
        if linears is not None:
            #save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            #save linear spectrogram plot
            plot.plot_spectrogram(
                linears[i],
                os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                title='speaker_id = {:d}'.format(speakers[i]),
                split_title=True,
                auto_aspect=True)

    return saved_mels_paths
def synthesize(self,
               texts,
               basenames,
               out_dir,
               log_dir,
               mel_filenames,
               basenames_refs=None,
               mel_ref_filenames_emt=None,
               mel_ref_filenames_spk=None,
               emb_only=False,
               emt_labels_synth=None,
               spk_labels_synth=None):
    """Synthesize a batch of texts conditioned on emotion/speaker references.

    The inputs are converted to model feeds by ``filenames_to_inputs``; the
    model is then run and, depending on the arguments, the result is returned
    as embeddings, played back, or written to disk.

    Args:
        texts: Texts to synthesize; also used as plot titles.
        basenames: Names used to build output file names. ``None`` (as
            returned by ``filenames_to_inputs``) selects playback mode: the
            first utterance is played and nothing is written besides
            ``temp.wav``.
        out_dir: Directory under which ``mels/`` spectrogram files are saved.
        log_dir: Directory for wavs and plots. When ``None`` nothing is
            written to disk for the batch.
        mel_filenames: Forwarded to ``filenames_to_inputs``.
        basenames_refs: Reference names used as the second half of output
            file names.
        mel_ref_filenames_emt: Forwarded to ``filenames_to_inputs``.
        mel_ref_filenames_spk: Forwarded to ``filenames_to_inputs``.
        emb_only: When true, only run and return the reference-encoder
            outputs.
        emt_labels_synth: Forwarded to ``filenames_to_inputs``.
        spk_labels_synth: Forwarded to ``filenames_to_inputs``.

    Returns:
        * ``emb_only`` set: the list returned by ``session.run`` for the
          reference-encoder tensors.
        * playback mode: ``None``.
        * otherwise: ``(saved_mels_paths, speaker_ids)``, where
          ``saved_mels_paths`` lists the ``.npy`` files written (empty when
          ``log_dir`` is ``None``).

    Raises:
        RuntimeError: on an unsupported OS in playback mode, or when
            ``hparams.gin_channels > 0`` (no speaker-id rule is defined).
    """
    hparams = self._hparams

    # [-max, max] or [0, max]
    T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels else
                       (0, hparams.max_abs_value))

    (basenames, basenames_refs, input_seqs, input_lengths, split_infos,
     mel_ref_seqs_emt, mel_ref_seqs_spk, emt_labels_synth,
     spk_labels_synth) = filenames_to_inputs(
         hparams, texts, basenames, mel_filenames, basenames_refs,
         mel_ref_filenames_emt, mel_ref_filenames_spk, emt_labels_synth,
         spk_labels_synth)

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: input_lengths,
        self.mel_refs_emt: mel_ref_seqs_emt,
        self.mel_refs_spk: mel_ref_seqs_spk,
        self.spk_labels: spk_labels_synth,
        self.emt_labels: emt_labels_synth,
        self.split_infos: split_infos
    }
    # NOTE: ground-truth-aligned mel targets are not fed by this method.

    if emb_only:
        fetches = [
            self.model.tower_refnet_out_emt[0],
            self.model.tower_refnet_out_spk[0],
            self.model.tower_refnet_outputs_mel_out_emt[0],
            self.model.tower_refnet_outputs_mel_out_spk[0],
        ]
        # Without emotion attention there is no context tensor; a constant
        # keeps the returned list at five elements.
        if self.args.emt_attn:
            fetches.append(self.model.tower_context_emt[0])
        else:
            fetches.append(tf.constant(1.))
        return self.session.run(fetches, feed_dict=feed_dict)

    # Only bound by some branches below; default to None so the logging
    # code can test for availability instead of hitting a NameError.
    linears = None
    alignments_emt = None

    if self.gta or not hparams.predict_linear:
        if self.args.attn == 'style_tokens':
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
        else:
            # The two reference-encoder tensors are fetched but not used.
            mels, alignments, stop_tokens, _, _, alignments_emt = \
                self.session.run(
                    [
                        self.mel_outputs, self.alignments,
                        self.stop_token_prediction,
                        self.model.tower_refnet_out_emt[0],
                        self.model.tower_ref_mel_emt[0],
                        self.model.tower_alignments_emt
                    ],
                    feed_dict=feed_dict)

        # Linearize outputs (n_gpus -> 1D)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]
        stop_tokens = [
            token for gpu_token in stop_tokens for token in gpu_token
        ]
        if self.args.emt_attn and not (self.args.attn == 'style_tokens'):
            alignments_emt = [
                align_emt for gpu_aligns_emt in alignments_emt
                for align_emt in gpu_aligns_emt
            ]

        if not self.gta:
            # Natural batch synthesis: get mel lengths for the entire batch
            # from the stop-token predictions and drop batch-wise padding.
            target_lengths = get_output_lengths(stop_tokens)
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)
        else:
            # No trimming happens in GTA mode; keep the full lengths so the
            # alignment plots below still have a max_len to use.
            target_lengths = [len(mel) for mel in mels]
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [
                self.linear_outputs, self.mel_outputs, self.alignments,
                self.stop_token_prediction
            ],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        linears = [
            linear for gpu_linear in linears for linear in gpu_linear
        ]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]
        stop_tokens = [
            token for gpu_token in stop_tokens for token in gpu_token
        ]

        # Natural batch synthesis: get mel/linear lengths for the entire
        # batch from the stop-token predictions.
        target_lengths = get_output_lengths(stop_tokens)

        # Take off the batch-wise padding.
        mels = [
            mel[:target_length, :]
            for mel, target_length in zip(mels, target_lengths)
        ]
        # Clip each utterance separately: after trimming the spectrograms
        # have different lengths, so they cannot be clipped as one array.
        linears = [
            np.clip(linear[:target_length, :], T2_output_range[0],
                    T2_output_range[1])
            for linear, target_length in zip(linears, target_lengths)
        ]
        assert len(mels) == len(linears) == len(texts)

    mels = [
        np.clip(m, T2_output_range[0], T2_output_range[1]) for m in mels
    ]

    if basenames is None:
        # Generate wav and read it
        if hparams.GL_on_GPU:
            wav = self.session.run(
                self.GLGPU_mel_outputs,
                feed_dict={self.GLGPU_mel_inputs: mels[0]})
            wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                        hparams.preemphasize)
        else:
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav',
                       sr=hparams.sample_rate)  # Find a better way

        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # Windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!'
            )
        return

    saved_mels_paths = []
    speaker_ids = []
    for i, mel in enumerate(mels):
        # Speaker id for global conditioning (only used with GTA generally).
        # No rule is defined for it yet, hence the hard failure.
        if hparams.gin_channels > 0:
            raise RuntimeError(
                'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
            )
        speaker_ids.append('<no_g>')

        if log_dir is None:
            continue

        os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
        os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
        os.makedirs(os.path.join(log_dir, 'mels'), exist_ok=True)
        # The spectrogram itself goes under out_dir, which may differ from
        # log_dir, so make sure that directory exists as well.
        os.makedirs(os.path.join(out_dir, 'mels'), exist_ok=True)

        name = basenames[i]
        ref_name = basenames_refs[i]

        mel_filename = os.path.join(
            out_dir, 'mels', 'mel-{}_{}.npy'.format(name, ref_name))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        # save wav (mel -> wav)
        if hparams.GL_on_GPU:
            wav = self.session.run(
                self.GLGPU_mel_outputs,
                feed_dict={self.GLGPU_mel_inputs: mel})
            wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                        hparams.preemphasize)
        else:
            wav = audio.inv_mel_spectrogram(mel.T, hparams)

        # add silence to make ending of file more noticeable
        silence = np.zeros(int(.5 * hparams.sample_rate))
        wav = np.append(np.append(silence, wav), silence)
        audio.save_wav(wav,
                       os.path.join(
                           log_dir,
                           'wavs/wav-{}_{}.wav'.format(name, ref_name)),
                       sr=hparams.sample_rate)

        # save alignments
        plot.plot_alignment(alignments[i],
                            os.path.join(
                                log_dir,
                                'plots/alignment-{}_{}.png'.format(
                                    name, ref_name)),
                            title='{}'.format(texts[i]),
                            split_title=True,
                            max_len=target_lengths[i])

        if (self.args.emt_attn and self.args.attn == 'simple'
                and alignments_emt is not None):
            plot.plot_alignment(
                alignments_emt[i],
                os.path.join(
                    log_dir, 'plots/alignment_emt-{}_{}.png'.format(
                        name, ref_name)),
                title='{}'.format(texts[i]),
                split_title=True,
                max_len=target_lengths[i])

        # save mel spectrogram plot
        plot.plot_spectrogram(mel,
                              os.path.join(
                                  log_dir,
                                  'plots/mel-{}_{}.png'.format(
                                      name, ref_name)),
                              title='{}'.format(texts[i]),
                              split_title=True)
        print("Finished saving {}_{}".format(name, ref_name))

        # Linear outputs only exist when the linear branch above ran.
        if hparams.predict_linear and linears is not None:
            # save wav (linear -> wav)
            if hparams.GL_on_GPU:
                wav = self.session.run(
                    self.GLGPU_lin_outputs,
                    feed_dict={self.GLGPU_lin_inputs: linears[i]})
                wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                            hparams.preemphasize)
            else:
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(
                               log_dir,
                               'wavs/wav-{}-linear_{}.wav'.format(
                                   name, ref_name)),
                           sr=hparams.sample_rate)

            # save linear spectrogram plot
            plot.plot_spectrogram(linears[i],
                                  os.path.join(
                                      log_dir,
                                      'plots/linear-{}_{}.png'.format(
                                          name, ref_name)),
                                  title='{}'.format(texts[i]),
                                  split_title=True,
                                  auto_aspect=True)

    return saved_mels_paths, speaker_ids
for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([ eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0], eval_model.tower_linear_targets[0][0], ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) linear_losses.append(linear_loss) linear_loss = sum(linear_losses) / len(linear_losses) wav = audio.inv_linear_spectrogram(lin_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) else: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run([ eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0] ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) eval_loss = sum(eval_losses) / len(eval_losses)
def spec2wav(linears, hparams, save_path):
    """Invert a linear spectrogram to audio and write it to ``save_path``.

    ``linears`` is transposed before being handed to
    ``audio.inv_linear_spectrogram``; the resulting waveform is saved at
    ``hparams.sample_rate``.
    """
    waveform = audio.inv_linear_spectrogram(linears.T, hparams)
    audio.save_wav(
        waveform,
        save_path,
        sr=hparams.sample_rate,
    )
def synthesize(self, text, index, out_dir, log_dir, mel_filename, speaker_id):
    """Synthesize a single text for one speaker.

    Args:
        text: Text to synthesize. Korean text (per ``is_korean_text``) is
            number-normalized and split to jamo before being sequenced.
        index: Integer used in output file names. ``None`` selects playback
            mode: the audio is written to ``temp.wav``, played through
            PyAudio and nothing else is saved.
        out_dir: Directory where the predicted mel spectrogram is saved.
        log_dir: Directory for wavs and plots; ``None`` disables them.
        mel_filename: Path of the ground-truth mel to feed when
            ``self.gta`` is set.
        speaker_id: Speaker id fed to the model.

    Returns:
        Path of the saved mel ``.npy`` file, or ``None`` in playback mode.
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

    if is_korean_text(text):
        text = normalize_number(text)
        text = split_to_jamo(text, cleaner_names)

    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32)
    }

    if self.gta:
        # Use the configured mel width (the output below is reshaped with
        # the same value) rather than a hard-coded 80.
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
            1, -1, hparams.num_mels)

    # The linear output is only produced outside GTA mode; keep it None
    # otherwise so the logging code below can skip it safely.
    linear = None
    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run(
            [self.mel_outputs, self.alignment], feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    if index is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        chunk = 512
        # Release the file, the stream and PortAudio even if playback fails.
        with wave.open('temp.wav', 'rb') as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(
                    format=p.get_format_from_width(f.getsampwidth()),
                    channels=f.getnchannels(),
                    rate=f.getframerate(),
                    output=True)
                try:
                    data = f.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                p.terminate()
        return

    # Write the spectrogram to disk
    # Note: outputs mel-spectrogram files and target ones have same names, just different folders
    mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        # save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(
            wav,
            os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
            sr=hparams.sample_rate)

        if hparams.predict_linear and linear is not None:
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linear.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                sr=hparams.sample_rate)

        # Convert the text back with j2h when is_korean_char reports Korean.
        if is_korean_char(text):
            text = j2h(text)

        # save alignments
        plot.plot_alignment(
            alignment,
            os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
            info='{}'.format(text),
            split_title=True)

        # save mel spectrogram plot
        plot.plot_spectrogram(
            mels,
            os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
            info='{}'.format(text),
            split_title=True)

    return mel_filename
def train(log_dir, args, hparams):
    """Train the Tacotron model with periodic evaluation and checkpointing.

    Creates the output directories under ``log_dir``, builds the feeder and
    the train/eval models, optionally restores the latest checkpoint, and
    runs the optimisation loop until ``args.tacotron_train_steps`` is reached
    or the coordinator requests a stop.

    Args:
        log_dir: Root directory for checkpoints, plots, wavs, spectrograms,
            eval outputs, tensorboard events and embedding metadata.
        args: Run options. Read here: ``base_dir``, ``tacotron_input``,
            ``model``, ``restore``, ``tacotron_train_steps``,
            ``checkpoint_interval``, ``summary_interval``, ``eval_interval``
            and ``embedding_interval``.
        hparams: Hyper-parameters. Read here: ``predict_linear``,
            ``GL_on_GPU``, ``num_mels``, ``num_freq``, ``preemphasis``,
            ``preemphasize``, ``sample_rate``, ``outputs_per_step`` and
            ``tacotron_random_seed``.

    Returns:
        The checkpoint directory once the training loop finishes. If an
        exception interrupts training it is logged, the coordinator is asked
        to stop and ``None`` is returned.
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                # For visual purposes, swap space with \s
                f.write('{}\n'.format('\\s' if symbol == ' ' else symbol))

    # Rewrite the log_dir prefix of the metadata path to '..'.
    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    # Potential Griffin-Lim GPU setup
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32,
                                          (None, hparams.num_mels),
                                          name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32,
                                          (None, hparams.num_freq),
                                          name='GLGPU_lin_inputs')

        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            GLGPU_lin_inputs, hparams)

    def _spectrogram_to_wav(sess, spectrogram, linear=False):
        """Invert a predicted mel (default) or linear spectrogram to audio."""
        if hparams.GL_on_GPU:
            if linear:
                gl_inputs, gl_outputs = GLGPU_lin_inputs, GLGPU_lin_outputs
            else:
                gl_inputs, gl_outputs = GLGPU_mel_inputs, GLGPU_mel_outputs
            wav = sess.run(gl_outputs, feed_dict={gl_inputs: spectrogram})
            return audio.inv_preemphasis(wav, hparams.preemphasis,
                                         hparams.preemphasize)
        if linear:
            return audio.inv_linear_spectrogram(spectrogram.T, hparams)
        return audio.inv_mel_spectrogram(spectrogram.T, hparams)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        summary_writer = None
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)
                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_path,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             linear_loss, mel_p, mel_t, t_len, align, lin_p,
                             lin_t) = sess.run([
                                 eval_model.tower_loss[0],
                                 eval_model.tower_before_loss[0],
                                 eval_model.tower_after_loss[0],
                                 eval_model.tower_stop_token_loss[0],
                                 eval_model.tower_linear_loss[0],
                                 eval_model.tower_mel_outputs[0][0],
                                 eval_model.tower_mel_targets[0][0],
                                 eval_model.tower_targets_lengths[0][0],
                                 eval_model.tower_alignments[0][0],
                                 eval_model.tower_linear_outputs[0][0],
                                 eval_model.tower_linear_targets[0][0],
                             ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        # The audio/plots below use the last eval batch.
                        wav = _spectrogram_to_wav(sess, lin_p, linear=True)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                'step-{}-eval-wave-from-linear.wav'.format(
                                    step)),
                            sr=hparams.sample_rate)
                    else:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             mel_p, mel_t, t_len, align) = sess.run([
                                 eval_model.tower_loss[0],
                                 eval_model.tower_before_loss[0],
                                 eval_model.tower_after_loss[0],
                                 eval_model.tower_stop_token_loss[0],
                                 eval_model.tower_mel_outputs[0][0],
                                 eval_model.tower_mel_targets[0][0],
                                 eval_model.tower_targets_lengths[0][0],
                                 eval_model.tower_alignments[0][0]
                             ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = _spectrogram_to_wav(sess, mel_p)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                'step-{}-eval-linear-spectrogram.png'.format(
                                    step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    if hparams.predict_linear:
                        (input_seq, mel_prediction, linear_prediction,
                         alignment, target, target_length,
                         linear_target) = sess.run([
                             model.tower_inputs[0][0],
                             model.tower_mel_outputs[0][0],
                             model.tower_linear_outputs[0][0],
                             model.tower_alignments[0][0],
                             model.tower_mel_targets[0][0],
                             model.tower_targets_lengths[0][0],
                             model.tower_linear_targets[0][0],
                         ])

                        # save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        # save griffin lim inverted wav for debug (linear -> wav)
                        wav = _spectrogram_to_wav(sess,
                                                  linear_prediction,
                                                  linear=True)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                        # Save real and predicted linear-spectrogram plot to disk (control purposes)
                        plot.plot_spectrogram(
                            linear_prediction,
                            os.path.join(
                                plot_dir,
                                'step-{}-linear-spectrogram.png'.format(step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, loss),
                            target_spectrogram=linear_target,
                            max_len=target_length,
                            auto_aspect=True)
                    else:
                        (input_seq, mel_prediction, alignment, target,
                         target_length) = sess.run([
                             model.tower_inputs[0][0],
                             model.tower_mel_outputs[0][0],
                             model.tower_alignments[0][0],
                             model.tower_mel_targets[0][0],
                             model.tower_targets_lengths[0][0],
                         ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = _spectrogram_to_wav(sess, mel_prediction)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    log('\nSaving Model Character Embeddings visualization..')
                    add_embedding_stats(summary_writer,
                                        [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log('Tacotron Character embeddings have been updated on tensorboard!'
                        )

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
        finally:
            # Flush pending summaries whether training finished or failed.
            if summary_writer is not None:
                summary_writer.close()
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    """Synthesize a batch of texts and save the predicted mel spectrograms.

    Args:
        texts: Texts to synthesize; also used as plot titles.
        basenames: Names used to build output file names. ``None`` selects
            playback mode: the first utterance is written to ``temp.wav``
            and played through PyAudio, nothing else is saved.
        out_dir: Directory where ``mel-<basename>.npy`` files are written.
        log_dir: Directory for wavs and plots; ``None`` disables them.
        mel_filenames: Ground-truth mel files, loaded and fed as targets
            when ``self.gta`` is set.

    Returns:
        ``(saved_mels_paths, speaker_ids)``, or ``None`` in playback mode.

    Raises:
        RuntimeError: when ``hparams.gin_channels > 0`` (no speaker-id rule
            is defined).
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    seqs = self._prepare_inputs(seqs)
    feed_dict = {
        self.model.inputs: seqs,
        self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]
        padded_targets = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
        # NOTE(review): 80 is presumably hparams.num_mels - confirm.
        feed_dict[self.model.mel_targets] = padded_targets.reshape(len(np_targets), -1, 80)

    # Linear outputs are only produced outside GTA mode; keep the name bound
    # so the logging code below can skip them safely.
    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments = self.session.run([self.mel_outputs, self.alignments], feed_dict=feed_dict)
        if self.gta:
            # Take off the reduction factor padding frames for time consistency with wavenet
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            assert len(mels) == len(np_targets)
    else:
        linears, mels, alignments = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments], feed_dict=feed_dict)

    if basenames is None:
        # Generate wav and read it. Only the first utterance is played:
        # ``mels`` holds the whole batch, so it cannot be inverted directly.
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', hparams.sample_rate)  # Find a better way

        chunk = 512
        # Release the file, the stream and PortAudio even if playback fails.
        with wave.open('temp.wav', 'rb') as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(
                    format=p.get_format_from_width(f.getsampwidth()),
                    channels=f.getnchannels(),
                    rate=f.getframerate(),
                    output=True)
                try:
                    data = f.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                p.terminate()
        return

    saved_mels_paths = []
    speaker_ids = []
    for i, mel in enumerate(mels):
        # Speaker id for global conditioning (only used with GTA generally).
        # No rule is defined for it yet, hence the hard failure.
        if hparams.gin_channels > 0:
            raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.')
        speaker_ids.append('<no_g>')

        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is None:
            continue

        # save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mel.T, hparams)
        audio.save_wav(
            wav,
            os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
            hparams.sample_rate)

        # save alignments
        plot.plot_alignment(
            alignments[i],
            os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
            title='{}'.format(texts[i]),
            split_title=True)

        # save mel spectrogram plot
        plot.plot_spectrogram(
            mel,
            os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
            title='{}'.format(texts[i]),
            split_title=True)

        if hparams.predict_linear and linears is not None:
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                hparams.sample_rate)

            # save linear spectrogram plot
            plot.plot_spectrogram(
                linears[i],
                os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True,
                auto_aspect=True)

    return saved_mels_paths, speaker_ids
def train(log_dir, args, hparams):
    """Train the Tacotron model, periodically evaluating and checkpointing.

    Creates the logging directory layout under ``log_dir``, builds the data
    feeder plus the train and eval graphs, optionally restores the latest
    checkpoint, then loops until ``args.tacotron_train_steps`` is reached or
    the coordinator requests a stop.

    Args:
        log_dir: Root directory for checkpoints, plots, wavs and summaries.
        args: Run configuration. Attributes read here: ``base_dir``,
            ``tacotron_input``, ``model``, ``restore``,
            ``tacotron_train_steps``, ``summary_interval``,
            ``eval_interval`` and ``checkpoint_interval``.
        hparams: Hyper-parameters. Attributes read here: ``predict_linear``,
            ``tacotron_random_seed``, ``sample_rate`` and
            ``outputs_per_step``.

    Returns:
        The checkpoint directory (``<log_dir>/taco_pretrained/``) when the
        loop finishes normally. If an exception is raised during training it
        is logged, the coordinator is asked to stop and ``None`` is returned
        implicitly.
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained/')
    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    # The checkpoint directory is created up front like every other output
    # directory so saving never depends on it pre-existing.
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # saved model restoring
            # Pre-bind the name: it is read below even when restoring was not
            # requested or when looking the checkpoint up failed.
            checkpoint_state = None
            if args.restore:
                # Restore saved model if the user requested it, Default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            elif not args.restore:
                log('Starting new training!')
            else:
                log('No model to load at {}'.format(save_dir))

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if loss > 1000 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss, linear_loss,
                             mel_p, mel_t, t_len, align, lin_p) = sess.run([
                                 eval_model.loss, eval_model.before_loss,
                                 eval_model.after_loss, eval_model.stop_token_loss,
                                 eval_model.linear_loss, eval_model.mel_outputs[0],
                                 eval_model.mel_targets[0], eval_model.targets_lengths[0],
                                 eval_model.alignments[0], eval_model.linear_outputs[0]])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        # Only the outputs of the last eval batch are inverted.
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)),
                            sr=hparams.sample_rate)
                    else:
                        for _ in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             mel_p, mel_t, t_len, align) = sess.run([
                                 eval_model.loss, eval_model.before_loss,
                                 eval_model.after_loss, eval_model.stop_token_loss,
                                 eval_model.mel_outputs[0], eval_model.mel_targets[0],
                                 eval_model.targets_lengths[0], eval_model.alignments[0]])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, eloss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss, before_loss,
                                   after_loss, stop_token_loss, eval_loss)

                if step % args.checkpoint_interval == 0:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
                    if hparams.predict_linear:
                        (input_seq, mel_prediction, linear_prediction,
                         alignment, target, target_length) = sess.run([
                             model.inputs[0],
                             model.mel_outputs[0],
                             model.linear_outputs[0],
                             model.alignments[0],
                             model.mel_targets[0],
                             model.targets_lengths[0],
                         ])

                        # save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T, allow_pickle=False)

                        # save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)
                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run([
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                            model.targets_lengths[0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T, allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

            log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps))
            return save_dir

        except Exception as e:
            # Top-level boundary: report the failure and stop the feeder
            # threads instead of propagating.
            log('Exiting due to exception: {}'.format(e))
            traceback.print_exc()
            coord.request_stop(e)
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    """Batch-synthesize mel (and optionally linear) spectrograms.

    Note that ``texts`` (and ``basenames`` / ``mel_filenames`` when given) are
    padded *in place* by repeating their last element until the batch size is
    a multiple of ``hparams.synthesis_batch_size``; the padded duplicates are
    synthesized and saved like any other entry.

    Args:
        texts: Input sentences, converted through ``text_to_sequence``.
        basenames: Per-text identifiers used to build output file names, or
            ``None`` to play the first synthesized utterance with the system
            audio player instead of saving anything.
        out_dir: Directory receiving the ``mel-<basename>.npy`` files.
        log_dir: Directory that already contains ``wavs/`` and ``plots/``
            sub-folders, or ``None`` to skip the debugging artefacts.
        mel_filenames: Paths of ground-truth mel ``.npy`` files (read only when
            ``self.gta`` is set), or ``None``.

    Returns:
        ``None`` in playback mode, otherwise the list of saved mel file paths.

    Raises:
        RuntimeError: In playback mode on a platform other than Linux or
            Windows.
    """
    hparams = self._hparams

    # [-max, max] or [0,max]
    if hparams.symmetric_mels:
        t2_output_range = (-hparams.max_abs_value, hparams.max_abs_value)
    else:
        t2_output_range = (0, hparams.max_abs_value)
    clip_min, clip_max = t2_output_range

    # Repeat last sample until number of samples is dividable by the number of
    # GPUs (last run scenario)
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        # basenames is None in playback mode; there is nothing to pad then.
        if basenames is not None:
            basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    seqs = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    input_seqs, _ = self._prepare_inputs(seqs)
    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]
        target_seqs, _ = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        if not self.gta:
            # Natural batch synthesis
            # Get Mel lengths for the entire batch from stop_tokens predictions.
            # In GTA mode the ground-truth lengths computed above are kept so
            # the outputs stay frame-aligned with their targets.
            target_lengths = self._get_output_lengths(stop_tokens)

        # Take off the batch wise padding; clip every utterance on its own
        # because the trimmed arrays generally differ in length.
        mels = [np.clip(mel[:target_length, :], clip_min, clip_max)
                for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments,
             self.stop_token_prediction],
            feed_dict=feed_dict)

        # Natural batch synthesis
        # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
        target_lengths = self._get_output_lengths(stop_tokens)

        # Take off the batch wise padding
        mels = [np.clip(mel[:target_length, :], clip_min, clip_max)
                for mel, target_length in zip(mels, target_lengths)]
        linears = [np.clip(linear[:target_length, :], clip_min, clip_max)
                   for linear, target_length in zip(linears, target_lengths)]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        system_name = platform.system()
        if system_name == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif system_name == 'Windows':
            # windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to "synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names,
        # just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                sr=hparams.sample_rate)

            # save alignments
            plot.plot_alignment(
                alignments[i],
                os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]), split_title=True,
                max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]), split_title=True)

            # Explicit None test: the truth value of a multi-element array is
            # ambiguous and raises ValueError.
            if linears is not None:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                    sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(
                    linears[i],
                    os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]), split_title=True, auto_aspect=True)

    return saved_mels_paths