def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    hparams = self.hparams

    # Repeat the last sample until the number of samples is divisible by the
    # number of GPUs (last-run scenario).
    while len(texts) % hparams.tacotron_num_gpus != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])
    assert len(texts) % hparams.tacotron_num_gpus == 0

    seqs = [np.asarray(hangul_to_sequence(dir=hparams.base_dir, hangul_text=text,
                                          hangul_type=hparams.hangul_type))
            for text in texts]
    input_lengths = [len(seq) for seq in seqs]

    # Number of sequences handled by each GPU device.
    sequence_size_per_device = len(seqs) // hparams.tacotron_num_gpus

    # Pad the input sequences per device, then concatenate along axis 1;
    # split_infos records each device's padded length so the model can split
    # the batch back apart.
    split_infos = []
    input_sequence = None
    for i in range(hparams.tacotron_num_gpus):
        on_device_input_sequence = seqs[sequence_size_per_device * i:
                                        sequence_size_per_device * (i + 1)]
        on_device_input_sequence_padded, input_length = _prepare_inputs(
            inputs=on_device_input_sequence)
        input_sequence = (np.concatenate((input_sequence, on_device_input_sequence_padded), axis=1)
                          if input_sequence is not None else on_device_input_sequence_padded)
        split_infos.append([input_length, 0, 0, 0])

    # Add the inputs to the feed dict.
    feed_dict = {
        self.inputs: input_sequence,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.GTA:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        assert len(np_targets) == len(texts)

        # Build the target sequence from the mel targets on each GPU, padded
        # and concatenated the same way as the inputs.
        target_sequence = None
        for i in range(hparams.tacotron_num_gpus):
            on_device_target_sequence = np_targets[sequence_size_per_device * i:
                                                   sequence_size_per_device * (i + 1)]
            on_device_target_sequence_padded, target_length = _prepare_targets(
                targets=on_device_target_sequence, alignment=hparams.outputs_per_step)
            target_sequence = (np.concatenate((target_sequence, on_device_target_sequence_padded), axis=1)
                               if target_sequence is not None else on_device_target_sequence_padded)
            split_infos[i][1] = target_length

        # Add the target mel sequence to the feed dict.
        feed_dict[self.targets] = target_sequence

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    # Synthesize.
    mels, alignments, stop_tokens, encoder_outputs = self.session.run(
        [self.mel_outputs, self.alignment, self.stop_token, self.encoder_outputs],
        feed_dict=feed_dict)

    # Flatten the per-GPU outputs (n_gpus lists -> one flat list).
    mels = [mel for gpu_mels in mels for mel in gpu_mels]
    alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
    stop_tokens = [token for gpu_tokens in stop_tokens for token in gpu_tokens]

    # Cut off the silence past the predicted stop token.
    # TODO: this part needs more work.
    target_lengths = _get_output_lengths(stop_tokens)
    mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]

    # Clip to the model output range: [-max, max] if symmetric, else [0, max].
    T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels else (0, hparams.max_abs_value))
    mels = [np.clip(mel, T2_output_range[0], T2_output_range[1]) for mel in mels]

    if basenames is None:
        # Generate a wav and play it back.
        wav = mel_to_audio_serie(mels[0].T, hparams)
        save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if system() == 'Linux':
            # Linux wav player
            os.system('aplay temp.wav')
        elif system() == 'Windows':
            # Windows wav player
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError('Your OS type is not supported yet, please add it to '
                               '"tacotron/synthesizer.py" and feel free to make a '
                               'Pull Request ;) Thanks!')
        return

    saved_mels_paths = []
    speaker_ids = []
    for (i, mel), text in zip(enumerate(mels), texts):
        # Get the speaker id for global conditioning (generally only used with GTA).
        if hparams.gin_channels > 0:
            # Multi-speaker case (this part needs editing for a multi-speaker
            # model): set a rule to determine the speaker id, e.g. from the
            # file basename (available in the "basenames" variable).
            speaker_id = '<no_g>'
            speaker_ids.append(speaker_id)  # allows different speakers per batch
        else:
            # Single-speaker case.
            speaker_id = '<no_g>'
            speaker_ids.append(speaker_id)

        # Write the spectrogram to disk.
        # Note: output mel-spectrogram files and target ones have the same
        # names, just different folders.
        mel_filename = os.path.join(out_dir, 'speech-mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # Save wav (mel -> wav).
            wav = mel_to_audio_serie(mel.T, hparams)
            wav = inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
            save_wav(wav,
                     os.path.join(log_dir, 'wavs/speech-wav-{}-mel.wav'.format(basenames[i])),
                     sr=hparams.sample_rate)

            # Save the alignment plot.
            plot_alignment(alignments[i],
                           os.path.join(log_dir, 'plots/speech-alignment-{}.png'.format(basenames[i])),
                           info='{}'.format(texts[i]), split_title=True)

            # Save the mel-spectrogram plot.
            plot_spectrogram(mel,
                             os.path.join(log_dir, 'plots/speech-mel-{}.png'.format(basenames[i])),
                             info='{}'.format(texts[i]), split_title=True)

    return saved_mels_paths, speaker_ids
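# The multi-GPU `synthesize` above relies on three padding/length helpers that
# are not defined in this file: `_prepare_inputs`, `_prepare_targets`, and
# `_get_output_lengths`. The following is a minimal sketch of what they might
# look like, inferred only from how they are called; the actual repository
# versions may differ (for instance, targets may be padded with
# -hparams.max_abs_value when symmetric_mels is enabled, rather than with
# zeros as assumed here).

def _prepare_inputs(inputs):
    # Right-pad every id sequence in the device batch with 0 (assumed PAD id)
    # up to the batch's max length; return the stacked batch and that length.
    max_len = max(len(x) for x in inputs)
    padded = [np.pad(x, (0, max_len - len(x)), mode='constant') for x in inputs]
    return np.stack(padded), max_len


def _prepare_targets(targets, alignment):
    # Pad mel targets along the time axis to a common length that is a
    # multiple of `alignment` (= hparams.outputs_per_step).
    max_len = max(len(t) for t in targets)
    if max_len % alignment != 0:
        max_len += alignment - max_len % alignment
    padded = [np.pad(t, ((0, max_len - len(t)), (0, 0)), mode='constant')
              for t in targets]
    return np.stack(padded), max_len


def _get_output_lengths(stop_tokens):
    # One length per utterance: the first frame whose stop-token prediction
    # crosses 0.5, or the full length if no stop is ever predicted.
    lengths = []
    for token in stop_tokens:
        stop_frames = np.where(np.asarray(token) > 0.5)[0]
        lengths.append(int(stop_frames[0]) + 1 if len(stop_frames) else len(token))
    return lengths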
def synthesize(self, text, index, out_dir, log_dir, mel_filename):
    hparams = self._hparams
    seq = hangul_to_sequence(dir=hparams.base_dir, hangul_text=text,
                             hangul_type=hparams.hangul_type)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
            1, -1, hparams.num_mels)

    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                           feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    if index is None:
        # Generate a wav and play it back through PyAudio.
        wav = mel_to_audio_serie(mels.T, hparams)
        save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        chunk = 512
        f = wave.open('temp.wav', 'rb')
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
        data = f.readframes(chunk)
        while data:
            stream.write(data)
            data = f.readframes(chunk)
        stream.stop_stream()
        stream.close()
        p.terminate()
        return

    # Write the spectrogram to disk.
    # Note: output mel-spectrogram files and target ones have the same names,
    # just different folders.
    mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        # Save wav (mel -> wav).
        wav = mel_to_audio_serie(mels.T, hparams)
        save_wav(wav,
                 os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
                 sr=hparams.sample_rate)

        # `linear` is only computed outside GTA mode, so guard on that too.
        if hparams.predict_linear and not self.gta:
            # Save wav (linear -> wav).
            wav = linear_to_audio_serie(linear.T, hparams)
            save_wav(wav,
                     os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                     sr=hparams.sample_rate)

        # Save the alignment plot.
        plot_alignment(alignment,
                       os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
                       info='{}'.format(text), split_title=True)

        # Save the mel-spectrogram plot.
        plot_spectrogram(mels,
                         os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
                         info='{}'.format(text), split_title=True)

    return mel_filename
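# Example call for the single-utterance `synthesize` above, assuming it lives
# on a synthesizer object (here called `synth`; the surrounding class and its
# checkpoint loading are not shown in this file):
#
#   mel_path = synth.synthesize(text='안녕하세요', index=1,
#                               out_dir='output/mels', log_dir='output/logs',
#                               mel_filename=None)
#
# Passing index=None plays the audio through PyAudio instead of writing files;
# in GTA mode, mel_filename must point to a saved ground-truth mel .npy file.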
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained/')
    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)

    # Check whether a post-processing network will be used for linear
    # spectrogram prediction.
    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    # Log run info.
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))

    # Start by setting a seed for reproducibility.
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up the data feeder: a Feeder object feeds preprocessed data (audio
    # time series, mel-spectrogram matrices, text sequences) to the training model.
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    # Set up the model chosen by the '--model' argument
    # ('Tacotron', 'Tacotron2', 'WaveNet', 'Both'). The global step is also
    # used by tf.train.cosine_decay() when scheduling teacher forcing.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)

    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)
    log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

    # Allocate GPU memory as needed.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Train.
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Restore the saved model if the user requested it (default: True).
            checkpoint_state = None  # so the check below is safe when --restore is off
            if args.restore:
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                if not args.restore:
                    log('Starting new training!')
                else:
                    log('No model to load at {}'.format(save_dir))

            # Initialize the feeder: start the threads that feed preprocessed data.
            feeder.start_threads(sess)

            # Training loop.
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                # Save a checkpoint when the checkpoint interval has been reached.
                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps:
                    # Save the model and the current global step.
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving mel-spectrograms and Griffin-Lim inverted waveform..')
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.linear_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                            model.targets_lengths[0],
                        ])

                        # Save the predicted linear spectrogram to disk (debug).
                        linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T, allow_pickle=False)

                        # Save a Griffin-Lim inverted wav for debugging (linear -> wav).
                        wav = linear_to_audio_serie(linear_prediction.T, hparams)
                        save_wav(wav,
                                 os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)),
                                 sr=hparams.sample_rate)
                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run([
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                            model.targets_lengths[0],
                        ])

                    # Save the predicted mel spectrogram to disk (debug).
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T, allow_pickle=False)

                    # Save a Griffin-Lim inverted wav for debugging (mel -> wav).
                    wav = mel_to_audio_serie(mel_prediction.T, hparams)
                    save_wav(wav,
                             os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)),
                             sr=hparams.sample_rate)

                    # Save the alignment plot to disk (control purposes).
                    plot_alignment(alignment,
                                   os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                                   info='{}, {}, step={}, loss={:.5f}'.format(
                                       args.model, datetime.now().strftime('%Y-%m-%d %H:%M'),
                                       step, loss),
                                   max_len=target_length // hparams.outputs_per_step)

                    # Save real and predicted mel-spectrogram plots to disk (control purposes).
                    plot_spectrogram(mel_prediction,
                                     os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
                                     info='{}, {}, step={}, loss={:.5f}'.format(
                                         args.model, datetime.now().strftime('%Y-%m-%d %H:%M'),
                                         step, loss),
                                     target_spectrogram=target,
                                     max_len=target_length)

                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

            # Training finished; testing would start here, once step reaches
            # the maximum number of Tacotron training steps.
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e))
            traceback.print_exc()
            coord.request_stop(e)
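# `train` calls `model_train_mode`, which is not defined in this file. Below is
# a minimal sketch of how it might build the model and its summary ops, based
# on how the returned values are used above (model.loss, model.optimize, and
# `stats` fed to summary_writer). `create_model`, `add_train_stats`, and the
# exact `initialize` signature are assumptions from the wider repo, not
# confirmed by this file.

def model_train_mode(args, feeder, hparams, global_step):
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        model = create_model(args.model, hparams)  # assumed factory: name -> model
        # Wire the feeder tensors into the model graph (argument names assumed).
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                         targets_lengths=feeder.targets_lengths,
                         global_step=global_step, is_training=True)
        model.add_loss()                  # builds model.loss
        model.add_optimizer(global_step)  # builds model.optimize
        stats = add_train_stats(model)    # assumed: returns a merged tf.summary op
        return model, stats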