def tacotron_synthesize(args, hparams, checkpoint):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        # Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:
            raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint))

        try:
            # Try loading again with the swapped directory name
            checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except AttributeError:
            raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    return run_synthesis(args, checkpoint_path, output_dir, hparams)
def load(self, checkpoint_path, hparams, model_name='WaveNet'):
    log('Constructing model: {}'.format(model_name))
    self._hparams = hparams
    local_cond, global_cond = self._check_conditions()

    self.local_conditions = tf.placeholder(
        tf.float32, shape=[1, None, hparams.num_mels],
        name='local_condition_features') if local_cond else None
    self.global_conditions = tf.placeholder(
        tf.int32, shape=(), name='global_condition_features') if global_cond else None
    self.synthesis_length = tf.placeholder(
        tf.int32, shape=(), name='synthesis_length') if not local_cond else None

    with tf.variable_scope('model') as scope:
        self.model = Wavenet(hparams)
        self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions,
                              input_lengths=None, synthesis_length=self.synthesis_length)

    sh_saver = create_shadow_saver(self.model)

    log('Loading checkpoint: {}'.format(checkpoint_path))
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    load_averaged_model(self.session, sh_saver, checkpoint_path)
def run_inference(checkpoint_path, output_dir, hparams, sentences):
    inference_dir = os.path.join(output_dir, 'inference')
    log_dir = os.path.join(output_dir, 'logs-inference')
    os.makedirs(inference_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log('running inference..')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, GTA=False)

    # Split sentences into batches of tacotron_synthesis_batch_size
    sentences = [sentences[i:i + hparams.tacotron_synthesis_batch_size]
                 for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    # Save synthesized info to map.txt and the inference folder
    with open(os.path.join(inference_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))]
            mel_filename = synth.synthesize(texts, basenames, inference_dir, log_dir, None)
            file.write('{}|{}\n'.format(texts, mel_filename))

    log('synthesized mel spectrograms of "{}" at {}'.format(sentences, inference_dir))
    return inference_dir
def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir):
    hparams = self._hparams
    local_cond, global_cond = self._check_conditions()

    # Get true length of the audio to be synthesized: audio_len = mel_len * hop_size
    audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms]

    # Prepare local condition batch
    maxlen = max([len(x) for x in mel_spectrograms])
    # [-max, max] or [0, max]
    T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) \
        if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)

    c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0])
                        for x in mel_spectrograms]).astype(np.float32)

    if self._hparams.normalize_for_wavenet:
        # Rescale to [0, 1]
        c_batch = np.interp(c_batch, T2_output_range, (0, 1))

    g = None if speaker_ids is None else np.asarray(
        speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)

    feed_dict = {}
    if local_cond:
        feed_dict[self.local_conditions] = c_batch
    else:
        feed_dict[self.synthesis_length] = 100
    if global_cond:
        feed_dict[self.global_conditions] = g

    # Generate wavs and clip extra padding to select the real speech parts
    # TODO: generation occasionally fails here; needs investigation
    generated_wavs = self.session.run(self.model.y_hat, feed_dict=feed_dict)
    generated_wavs = [generated_wav[:length]
                      for generated_wav, length in zip(generated_wavs, audio_lengths)]

    audio_filenames = []
    for i, generated_wav in enumerate(generated_wavs):
        # Save wav to disk
        audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
        save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate)
        audio_filenames.append(audio_filename)

        # Save waveplot to disk
        if log_dir is not None:
            plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
            waveplot(plot_filename, generated_wav, None, hparams)

    return audio_filenames
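# A minimal, self-contained sketch (not repo code; max_abs_value is an assumed hparam
# value) of the mel rescaling performed above when normalize_for_wavenet is set:
# Tacotron mels in [-max_abs_value, max_abs_value] (or [0, max_abs_value]) are
# linearly mapped to [0, 1] with np.interp before conditioning WaveNet.
import numpy as np

max_abs_value = 4.0  # assumed hparams.max_abs_value
mels = np.random.uniform(-max_abs_value, max_abs_value, size=(2, 80)).astype(np.float32)
rescaled = np.interp(mels, (-max_abs_value, max_abs_value), (0, 1))
assert rescaled.min() >= 0.0 and rescaled.max() <= 1.0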
def tacotron_synthesize(args, hparams, checkpoint):
    output_dir = args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    return run_synthesis(args, checkpoint_path, output_dir, hparams)
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.tacotron_batch_size
    r = self._hparams.outputs_per_step

    # Test on entire test set
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]

    # Bucket examples based on similar output sequence length for efficiency
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)

    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches, r
def tacotron_inference(args, hparams, checkpoint, sentences):
    output_dir = args.output_dir
    if sentences is None:
        raise RuntimeError('Inference mode requires input sentence(s), make sure you put sentences in sentences.txt!')

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    return run_inference(checkpoint_path, output_dir, hparams, sentences)
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        examples = [self._get_next_example() for i in range(n * _batches_per_group)]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
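# A self-contained sketch (made-up example data, not repo code) of the bucketing
# strategy used by make_test_batches and _enqueue_next_train_group above: sort
# examples by output length so each batch pads to a similar length, then shuffle
# the batches (not the examples) so buckets stay intact while order is randomized.
import numpy as np

examples = [('ex{}'.format(i), np.random.randint(50, 500)) for i in range(32)]  # (data, length)
batch_size = 8
examples.sort(key=lambda x: x[-1])
batches = [examples[i:i + batch_size] for i in range(0, len(examples), batch_size)]
np.random.shuffle(batches)
# Within a batch, lengths are now close, so little compute is wasted on padding.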
def make_test_batches(self):
    start = time.time()

    # Read one example per batch for evaluation
    n = 1

    # Test on the entire test set (one sample per evaluation step)
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)

    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches
def load(self, checkpoint_path, hparams, GTA=False, reference_mel=None, model_name='Tacotron'):
    log('Constructing model: %s' % model_name)

    inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
    input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
    targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets')
    # One scalar length per target sequence
    targets_lengths = tf.placeholder(tf.int32, (None,), name='targets_lengths')
    if reference_mel is not None:
        reference_mel = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'reference_mel')
    split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos')

    with tf.variable_scope('model'):
        self.model = Tacotron(hparams)
        if GTA:
            self.model.initialize(inputs=inputs, input_lengths=input_lengths, mel_targets=targets,
                                  GTA=GTA, split_infos=split_infos, reference_mel=reference_mel)
        else:
            self.model.initialize(inputs=inputs, input_lengths=input_lengths,
                                  split_infos=split_infos, reference_mel=reference_mel)
        self.mel_outputs = self.model.tower_mel_outputs
        self.alignment = self.model.tower_alignments
        self.stop_token = self.model.tower_stop_token_prediction
        self.targets = targets
        self.encoder_outputs = self.model.encoder_outputs

    self.GTA = GTA
    self.hparams = hparams

    log('Loading checkpoint: %s' % checkpoint_path)
    # Allocate memory on the GPUs as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)

    self.inputs = inputs
    self.input_lengths = input_lengths
    self.mel_targets = targets
    self.split_infos = split_infos
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams):
    log('\nSaving intermediate states at step {}'.format(global_step))
    idx = 0
    y_hat, y, length = sess.run([model.y_hat_log[idx], model.y_log[idx], model.input_lengths[idx]])

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))

    # Save audio
    librosa.output.write_wav(pred_wav_path, y_hat, sr=hparams.sample_rate)
    librosa.output.write_wav(target_wav_path, y, sr=hparams.sample_rate)

    # Save figure
    waveplot(plot_path, y_hat, y, hparams)
def add_loss(self):
    '''Adds loss computation to the graph. Supposes that the initialize function has already been called.
    '''
    with tf.variable_scope('loss') as scope:
        if self.is_training:
            if is_mulaw_quantize(self._hparams.input_type):
                self.loss = MaskedCrossEntropyLoss(self.y_hat_q[:, :-1, :], self.y[:, 1:], mask=self.mask)
            else:
                if self._hparams.out_channels == 2:
                    self.loss = GaussianMaximumLikelihoodEstimation(
                        self.y_hat[:, :, :-1], self.y[:, 1:, :],
                        hparams=self._hparams, mask=self.mask)
                else:
                    self.loss = DiscretizedMixtureLogisticLoss(
                        self.y_hat[:, :, :-1], self.y[:, 1:, :],
                        hparams=self._hparams, mask=self.mask)
        elif self.is_evaluating:
            if is_mulaw_quantize(self._hparams.input_type):
                self.eval_loss = MaskedCrossEntropyLoss(
                    self.y_hat_eval, self.y_eval, lengths=[self.eval_length])
            else:
                if self._hparams.out_channels == 2:
                    self.eval_loss = GaussianMaximumLikelihoodEstimation(
                        self.y_hat_eval, self.y_eval,
                        hparams=self._hparams, lengths=[self.eval_length])
                else:
                    self.eval_loss = DiscretizedMixtureLogisticLoss(
                        self.y_hat_eval, self.y_eval,
                        hparams=self._hparams, lengths=[self.eval_length])
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams):
    '''Evaluate model during training. Supposes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss = sess.run([model.y_hat, model.y_target, model.eval_loss])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format(
        len(y_target), duration, len(y_target) / duration))

    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))

    # Save audio
    wavfile.write(pred_wav_path, hparams.sample_rate, y_hat)
    wavfile.write(target_wav_path, hparams.sample_rate, y_target)

    # Save figure
    util.waveplot(plot_path, y_hat, y_target, model._hparams)

    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    if summary_writer is not None:
        log('Writing eval summary!')
        add_test_stats(summary_writer, global_step, loss)
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    GTA = (args.GTA == 'True')
    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')
    else:
        synth_dir = os.path.join(output_dir, 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, GTA=GTA)

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    time.sleep(1)
    log('starting synthesis..')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')

    with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            text = meta[5]
            mel_filename = os.path.join(mel_dir, meta[1])
            wav_filename = os.path.join(wav_dir, meta[0])
            mel_output_filename = synth.synthesize(text, i + 1, synth_dir, None, mel_filename)
            file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text))

    print('done!')
    time.sleep(1)
    print('Predicted mel spectrograms are saved in {}'.format(synth_dir))
    print('Exiting...')
    time.sleep(3)
    return os.path.join(synth_dir, 'map.txt')
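# A tiny worked example (made-up numbers, not repo data) of the duration bookkeeping
# above: the frame-count column (index 4) of train.txt holds mel frame counts, and
# each frame advances hop_size samples, so
# total_hours = sum(frames) * hop_size / sample_rate / 3600.
hop_size, sample_rate = 275, 22050   # assumed hparams values
frame_counts = [800, 1200, 950]      # made-up metadata column 4
hours = sum(frame_counts) * (hop_size / sample_rate) / 3600
print('{:.4f} hours'.format(hours))  # ~0.0102 hours (~37 seconds)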
def run_inference(checkpoint_path, output_dir, hparams, sentences):
    print('creating folders for inference..')
    inference_dir = os.path.join(output_dir, 'inference')
    log_dir = os.path.join(output_dir, 'logs-inference')
    os.makedirs(inference_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
    time.sleep(1)
    print('done!')

    time.sleep(1)
    print('running inference..')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(inference_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            mel_filename = synth.synthesize(text, i + 1, inference_dir, log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms of "{}" at {}'.format(sentences, inference_dir))
    return inference_dir
def load(self, checkpoint_path, hparams, GTA=False, model_name='Tacotron'):
    log('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')

    with tf.variable_scope('model') as scope:
        self.model = Tacotron(hparams)
        if GTA:
            self.model.initialize(inputs, input_lengths, targets, GTA=GTA)
        else:
            self.model.initialize(inputs, input_lengths)
        self.mel_outputs = self.model.mel_outputs
        self.alignment = self.model.alignments[0]

    self.gta = GTA
    self._hparams = hparams

    log('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
def load(self, checkpoint_path, hparams, model_name='wavenet'):
    log('Constructing model: {}'.format(model_name))
    self._hparams = hparams
    local_cond, global_cond = self._check_conditions()

    self.local_conditions = tf.placeholder(
        tf.float32, shape=[None, None, hparams.num_mels],
        name='local_condition_features') if local_cond else None
    self.global_conditions = tf.placeholder(
        tf.int32, shape=(None, 1), name='global_condition_features') if global_cond else None
    self.synthesis_length = tf.placeholder(
        tf.int32, shape=(), name='synthesis_length') if not local_cond else None

    with tf.variable_scope('model') as scope:
        self.model = Wavenet(hparams)
        self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions,
                              input_lengths=None, synthesis_length=self.synthesis_length)

    sh_saver = create_shadow_saver(self.model)

    log('Loading checkpoint: {}'.format(checkpoint_path))
    # Allocate memory on the GPU as needed
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True

    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())
    load_averaged_model(self.session, sh_saver, checkpoint_path)
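# A hedged usage sketch of the WaveNet Synthesizer API above. The checkpoint and
# .npy paths are placeholders (assumptions, not repo defaults); `hparams` is the
# repo's hyperparameter container (typically `from hparams import hparams`).
import os
import numpy as np

def vocode_one_mel(checkpoint_path, mel_path, hparams, out_dir='wav_out'):
    os.makedirs(out_dir, exist_ok=True)
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)
    mel = np.load(mel_path)  # [T, num_mels] Tacotron output
    # speaker_ids=None (no global conditioning), log_dir=None (skip waveplots)
    return synth.synthesize([mel], None, ['example'], out_dir, None)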
def train(log_dir, args, hparams, input_path):
    save_dir = os.path.join(log_dir, 'wave_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    # Checkpoint and input paths
    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')
    input_path = os.path.join(args.base_dir, input_path)

    log('Checkpoint_path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.wavenet_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, args.base_dir, hparams)

    # Set up model
    training_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, training_step)
    eval_model = model_test_mode(args, feeder, hparams, training_step)

    # Windows for tracking loss and step execution time
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    sh_saver = create_shadow_saver(model, training_step)

    log('wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps),
        end='\n==================================================================\n')

    # Allocate GPU memory as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    with tf.Session(config=config) as sess:
        try:
            # Initialize variables
            sess.run(tf.global_variables_initializer())

            # Restore model from checkpoint if requested
            if args.restore:
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                        load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training..', slack=True)

            # Start the Feeder threads from the session
            feeder.start_threads(sess)

            # Loop over training steps
            while not coord.should_stop() and step < args.wavenet_train_steps:
                # Save the current time (to measure step duration)
                start_time = time.time()
                step, y_hat, loss, opt = sess.run(
                    [training_step, model.y_hat, model.loss, model.optimize])

                # Add step duration to the time window and loss to the loss window
                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                # Print info to console
                message = 'Step = {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

                # Exit if loss exploded
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                # Save a checkpoint at every checkpoint interval
                if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
                    save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams)
                    save_checkpoint(sess, sh_saver, checkpoint_path, training_step)

                # Save an inference result at every eval interval
                if step % args.eval_interval == 0:
                    log('Evaluating at step {}'.format(step))
                    eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir,
                              summary_writer=None, hparams=model._hparams)

            log('wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            # Stop the data feeder to free resources
            coord.request_stop(e)
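# A minimal sketch of the ValueWindow helper used in the training loop above. The
# repo ships its own implementation; this reimplementation only illustrates the
# assumed semantics: a rolling average over the last N appended values.
class ValueWindow:
    def __init__(self, window_size=100):
        self._size = window_size
        self._values = []

    def append(self, x):
        # Keep at most the last `window_size` values
        self._values = self._values[-(self._size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)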
def wavenet_synthesize(args, hparams, checkpoint):
    output_dir = 'wavenet_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    if args.model == 'Tacotron':
        raise RuntimeError('Please run Tacotron synthesis from the Tacotron folder, not here..')

    # Get mel files (inference results from the Tacotron model) from the input mels_dir
    mel_files = [os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir)
                 if f.split('.')[-1] == 'npy']
    texts = None

    # Create result folders
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    log('Starting wavenet synthesis! (this will take a while..)')

    # Split the mel files into batches of wavenet_synthesis_batch_size (from hyperparams)
    mel_files = [mel_files[i:i + hparams.wavenet_synthesis_batch_size]
                 for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)]

    # Open map.txt and write down the results
    with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
        # Outer loop over batches of mel files
        for i, mel_batch in enumerate(tqdm(mel_files)):
            # Load the numpy matrix of each mel file
            mel_spectros = [np.load(mel) for mel in mel_batch]

            # Get the npy file names
            basenames = [os.path.basename(mel).replace('.npy', '') for mel in mel_batch]

            # Generate audio and save it in wav_dir
            audio_files = synth.synthesize(mel_spectros, None, basenames, wav_dir, log_dir)
            speaker_logs = ['<no_g>'] * len(mel_batch)

            # Inner loop: write down the result for each mel file in the batch
            for j, mel_file in enumerate(mel_batch):
                if texts is None:
                    file.write('{}|{}|{}\n'.format(mel_file, audio_files[j], speaker_logs[j]))
                else:
                    file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file, audio_files[j], speaker_logs[j]))

    log('synthesized audio waveforms at {}'.format(wav_dir))
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
    '''Initialize wavenet graph for train, eval and test cases.
    '''
    hparams = self._hparams
    self.is_training = x is not None
    self.is_evaluating = not self.is_training and y is not None

    # Set all convolutions to the corresponding mode
    self.set_mode(self.is_training)

    log('Initializing wavenet model. Dimensions (? = dynamic shape): ')
    log('  Train mode: {}'.format(self.is_training))
    log('  Eval mode: {}'.format(self.is_evaluating))
    log('  Synthesis mode: {}'.format(not (self.is_training or self.is_evaluating)))

    with tf.variable_scope('inference') as scope:
        # Training
        log('wavenet model current mode: {}'.format(self.is_training))
        if self.is_training:
            batch_size = tf.shape(x)[0]
            # [batch_size, time_length, 1]
            self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1])  # To be used in loss computation
            # [batch_size, channels, time_length]
            y_hat = self.step(x, c, g, softmax=False)  # softmax is automatically computed inside softmax_cross_entropy if needed

            if is_mulaw_quantize(hparams.input_type):
                # [batch_size, time_length, channels]
                self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

            self.y_hat = y_hat
            self.y = y
            self.input_lengths = input_lengths

            # Add mean and scale stats if using a Gaussian distribution output
            # (there would be too many logistics if using MoL)
            if self._hparams.out_channels == 2:
                self.means = self.y_hat[:, 0, :]
                self.log_scales = self.y_hat[:, 1, :]
            else:
                self.means = None

            # Graph extension for log saving
            # [batch_size, time_length]
            shape_control = (batch_size, tf.shape(x)[-1], 1)
            with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
                y_log = tf.squeeze(y, [-1])
                if is_mulaw_quantize(hparams.input_type):
                    self.y = y_log

            y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                lambda: tf.squeeze(y_hat, [-1]),
                                lambda: y_hat)
            y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

            if is_mulaw_quantize(hparams.input_type):
                # [batch_size, time_length]
                y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)
                y_hat_log = inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
                y_log = inv_mulaw_quantize(y_log, hparams.quantize_channels)
            else:
                # [batch_size, time_length]
                if hparams.out_channels == 2:
                    y_hat_log = sample_from_gaussian(
                        y_hat_log, log_scale_min_gauss=hparams.log_scale_min_gauss)
                else:
                    y_hat_log = sample_from_discretized_mix_logistic(
                        y_hat_log, log_scale_min=hparams.log_scale_min)

                if is_mulaw(hparams.input_type):
                    y_hat_log = inv_mulaw(y_hat_log, hparams.quantize_channels)
                    y_log = inv_mulaw(y_log, hparams.quantize_channels)

            self.y_hat_log = y_hat_log
            self.y_log = y_log

            log('  inputs: {}'.format(x.shape))
            if self.local_conditioning_enabled():
                log('  local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition: {}'.format(g.shape))
            log('  targets: {}'.format(y_log.shape))
            log('  outputs: {}'.format(y_hat_log.shape))

        # Evaluating
        elif self.is_evaluating:
            # [time_length, ]
            idx = 0
            length = input_lengths[idx]
            y_target = tf.reshape(y[idx], [-1])[:length]

            if c is not None:
                c = tf.expand_dims(c[idx, :, :length], axis=0)
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
                    c = tf.identity(c, name='eval_assert_c_rank_op')
            if g is not None:
                g = tf.expand_dims(g[idx], axis=0)

            batch_size = tf.shape(c)[0]

            # Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            # [channels, ]
            if is_mulaw_quantize(hparams.input_type):
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels, dtype=tf.float32)
                initial_input = tf.tile(
                    tf.reshape(initial_input, [1, 1, hparams.quantize_channels]),
                    [batch_size, 1, 1])
            else:
                initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value

            # Fast eval
            y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
                                     softmax=False, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            # Save targets and length for eval loss computation
            if is_mulaw_quantize(hparams.input_type):
                self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
            else:
                self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
            self.eval_length = length

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
                y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
                y_target = inv_mulaw(y_target, hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat
            self.y_target = y_target

            if self.local_conditioning_enabled():
                log('  local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition: {}'.format(g.shape))
            log('  targets: {}'.format(y_target.shape))
            log('  outputs: {}'.format(y_hat.shape))

        # Synthesizing
        else:
            batch_size = tf.shape(c)[0]
            if c is None:
                assert synthesis_length is not None
            else:
                # [batch_size, local_condition_time, local_condition_dimension(num_mels)]
                message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local '
                           'condition features but found {}'.format(hparams.cin_channels, c.shape))
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
                    c = tf.identity(c, name='synthesis_assert_c_rank_op')

                Tc = tf.shape(c)[1]
                upsample_factor = get_hop_size(self._hparams)

                # Overwrite length with respect to local condition features
                synthesis_length = Tc * upsample_factor

                # [batch_size, local_condition_dimension, local_condition_time]
                # time_length will be corrected using the upsample network
                c = tf.transpose(c, [0, 2, 1])

            # Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            if is_mulaw_quantize(hparams.input_type):
                assert initial_value >= 0 and initial_value < hparams.quantize_channels
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels, dtype=tf.float32)
                initial_input = tf.tile(
                    tf.reshape(initial_input, [1, 1, hparams.quantize_channels]),
                    [batch_size, 1, 1])
            else:
                initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value

            y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
                                     softmax=False, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [batch_size, -1])
                y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = inv_mulaw(tf.reshape(y_hat, [batch_size, -1]), hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [batch_size, -1])

            self.y_hat = y_hat

            if self.local_conditioning_enabled():
                log('  local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log('  global_condition: {}'.format(g.shape))
            log('  outputs: {}'.format(y_hat.shape))

    self.variables = tf.trainable_variables()
    self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
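# A self-contained numpy sketch of the mu-law companding that initialize() relies
# on via mulaw/mulaw_quantize/inv_mulaw_quantize. This is the standard formulation;
# the repo's actual audio helpers may differ in rounding and edge handling. With
# quantize_channels = 256, silence (0.0) quantizes to 128, the initial input frame
# used above.
import numpy as np

def mulaw_quantize(x, quantize_channels=256):
    """Compand x in [-1, 1] with mu-law, then quantize to [0, quantize_channels - 1]."""
    mu = quantize_channels - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return np.clip(((y + 1) / 2 * mu + 0.5).astype(np.int32), 0, mu)

def inv_mulaw_quantize(q, quantize_channels=256):
    """Invert mulaw_quantize back to a float waveform in [-1, 1]."""
    mu = quantize_channels - 1
    y = 2 * q.astype(np.float32) / mu - 1
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

x = np.array([0.0, 0.25, -0.5])
q = mulaw_quantize(x)          # silence maps to 128 with 256 channels
x_rec = inv_mulaw_quantize(q)  # round-trips up to quantization error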
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    '''Generate mel spectrograms from text using a trained model.

    :param args: runtime params
    :param checkpoint_path: path to the checkpoint of the pretrained model
    :param output_dir: output dir to save spectrograms (can be got from args)
    :param hparams: hyper params
    :return: path to the generated map.txt
    '''
    GTA = (args.GTA == 'True')
    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')
    else:
        synth_dir = os.path.join(output_dir, 'natural')
    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, GTA=GTA)

    # Read data from the train.txt file <-- this file is generated by preprocessing
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    # Generate batches from metadata; the batch size is fixed at 512
    # (hparams.tacotron_synthesis_batch_size is assumed to be 512 here)
    metadata = [metadata[i:i + 512] for i in range(0, len(metadata), 512)]

    log('starting synthesis..')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')

    # Batch synthesis. Needs more effort.
    with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
            basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '')
                         for m in mel_filenames]
            mel_output_filenames, speaker_ids = synth.synthesize(
                texts, basenames, synth_dir, None, mel_filenames)

            for elems in zip(wav_filenames, mel_filenames, mel_output_filenames,
                             speaker_ids, texts):
                file.write('|'.join([str(x) for x in elems]) + '\n')

    log('Predicted mel spectrograms are saved in {}'.format(synth_dir))
    return os.path.join(synth_dir, 'map.txt')
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, GTA=False, global_step=None,
               is_training=False, is_evaluating=False):
    """
    Initializes the model for inference. Sets the "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
    """
    # Check preconditions
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not GTA:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not GTA and self.hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if GTA and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self.hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    # Declare variables
    with tf.variable_scope('inference') as scope:
        batch_size = tf.shape(inputs)[0]
        hparams = self.hparams
        assert hparams.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
        if hparams.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
            assert global_step is not None

        # Pick the symbol set used to create the embedding lookup table
        if self.hparams.hangul_type == 1:
            hangul_symbol = hangul_symbol_1
        elif self.hparams.hangul_type == 2:
            hangul_symbol = hangul_symbol_2
        elif self.hparams.hangul_type == 3:
            hangul_symbol = hangul_symbol_3
        elif self.hparams.hangul_type == 4:
            hangul_symbol = hangul_symbol_4
        else:
            hangul_symbol = hangul_symbol_5

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        # Create an embedding lookup table with shape
        # [number of symbols, embedding dimension (declared in hparams)]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(hangul_symbol), hparams.embedding_dim], dtype=tf.float32)
        # `inputs` is a tensor of ID sequences (created using text_to_sequence), loaded
        # through the feeder class (its _metadata variable) from train.txt in the
        # training_data folder. `embedded_inputs` is a Tensor with the same type as
        # embedding_table.
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Create the Encoder object
        encoder_cell = Encoder.EncoderCell(
            EncoderConvolution=Encoder.EncoderConvolution(
                is_training=is_training, hparams=hparams, scope='encoder_convolutions'),
            EncoderLSTM=Encoder.EncoderLSTM(
                is_training=is_training, size=256, zoneout=0.1, scope='encoder_lstm'))

        # Extract the encoder output
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths=input_lengths)

        # Store the convolution output shape for visualization
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Create the Decoder object
        decoder_cell = Decoder.TacotronDecoderCell(
            prenet=Decoder.Prenet(is_training=is_training, layers_sizes=[256, 256],
                                  drop_rate=hparams.dropout_rate, scope='decoder_prenet'),
            attention_mechanism=Decoder.LocationSensitiveAttention(
                num_units=hparams.attention_dim, memory=encoder_outputs, hparams=hparams,
                mask_encoder=True, memory_sequence_length=input_lengths, cumulate_weights=True),
            rnn_cell=Decoder.DecoderRNN(is_training=is_training, layers=2,
                                        zoneout=hparams.zoneout_rate, scope='decoder_lstm'),
            frame_projection=Decoder.FrameProjection(
                shape=hparams.num_mels * 2, scope='linear_transform'),
            stop_projection=Decoder.StopProjection(
                is_training=is_training or is_evaluating, shape=2, scope='stop_token_projection'),
        )

        # Initialize the first state of the decoder
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Define the helper for our decoder
        if is_training or is_evaluating or GTA:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets,
                                             hparams, GTA, is_evaluating, global_step)
        else:
            self.helper = TacoTestHelper(batch_size, hparams)

        # Only use max iterations at synthesis time
        max_iters = hparams.max_iters if not (is_training or is_evaluating) else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            Decoder.CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=False,
            maximum_iterations=max_iters,
            swap_memory=hparams.tacotron_swap_with_cpu)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hparams.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet.Postnet(is_training, hparams=hparams, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to same dimension as mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = Decoder.FrameProjection(hparams.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        # Time-domain waveforms are only needed when predicting mels to train the wavenet
        # vocoder, so we omit post processing when doing GTA synthesis
        post_condition = hparams.predict_linear and not GTA
        if post_condition:
            # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
            # Post-processing network to map mels to linear spectrograms, using the same
            # architecture as the encoder
            post_processing_cell = Encoder.EncoderCell(
                Encoder.EncoderConvolution(is_training, hparams=hparams,
                                           scope='post_processing_convolutions'),
                Encoder.EncoderLSTM(is_training, size=hparams.enc_lstm_hidden_size,
                                    zoneout=hparams.zoneout_rate, scope='post_processing_LSTM'))
            expand_outputs = post_processing_cell(mel_outputs)
            linear_outputs = Decoder.FrameProjection(
                hparams.num_freq, scope='post_processing_projection')(expand_outputs)
            self.linear_outputs = linear_outputs
            self.linear_targets = linear_targets

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        if is_training:
            self.ratio = self.helper._ratio
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets
        self.targets_lengths = targets_lengths

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode: {}'.format(is_training))
        log('  Eval mode: {}'.format(is_evaluating))
        log('  GTA mode: {}'.format(GTA))
        log('  Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log('  embedding: {}'.format(embedded_inputs.shape))
        log('  enc conv out: {}'.format(enc_conv_output_shape))
        log('  encoder out: {}'.format(encoder_outputs.shape))
        log('  decoder out: {}'.format(decoder_output.shape))
        log('  residual out: {}'.format(residual.shape))
        log('  projected residual out: {}'.format(projected_residual.shape))
        log('  mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log('  linear out: {}'.format(linear_outputs.shape))
        log('  <stop_token> out: {}'.format(stop_token_prediction.shape))
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._train_offset = 0
    self._test_offset = 0

    # Load metadata
    # Mel spectrogram numpy matrices live in 'mels', linear spectrograms in 'linear'
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    # Load the text metadata stored in train.txt
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]

    # Compute the total audio length (for logging information):
    # frame_shift_ms is the duration of one hop_size step; the length in hours is
    # derived from the frame count in the 5th column of train.txt
    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

    # Train/test split:
    #   training dataset: _train_meta
    #   test dataset: _test_meta
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches is not None
    test_size = (hparams.tacotron_test_size
                 if hparams.tacotron_test_size is not None
                 else hparams.tacotron_test_batches * hparams.tacotron_batch_size)

    # Create an integer array from 0 to len(metadata)
    indices = np.arange(len(self._metadata))
    # Pick train and test indices from the above array
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size, random_state=hparams.tacotron_data_random_state)

    # Make sure test_indices is a multiple of batch_size, else round up
    len_test_indices = self._round_up(len(test_indices), hparams.tacotron_batch_size)
    # Redundant test indices
    extra_test = test_indices[len_test_indices:]
    # New test_indices based on the new length
    test_indices = test_indices[:len_test_indices]
    # New train_indices: the old ones joined with the redundant test indices
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])

    self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches == self.test_steps

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the target padding to a value that doesn't originally exist in the
    # spectrogram, to avoid any possible conflicts without affecting the output range of
    # the model too much
    if hparams.symmetric_mels:
        self._target_pad = -(hparams.max_abs_value + .1)
    else:
        self._target_pad = -0.1
    # Mark finished sequences with 1s
    self._token_pad = 1.

    with tf.device('/cpu:0'):
        # Create placeholders for inputs and targets. Don't specify batch size because we
        # want to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
            tf.placeholder(tf.int32, shape=(None,), name='input_lengths'),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
            tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
            tf.placeholder(tf.int32, shape=(None,), name='targets_lengths'),
        ]

        # Create a queue for buffering data
        queue = tf.FIFOQueue(
            8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
            name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.linear_targets, self.targets_lengths = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)
        self.targets_lengths.set_shape(self._placeholders[5].shape)

        # Create an eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(
            1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32],
            name='eval_queue')
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_linear_targets, \
            self.eval_targets_lengths = eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_linear_targets.set_shape(self._placeholders[4].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
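# A small sketch (assumed hparam values, not repo code) of the Feeder's padding
# conventions above: inputs pad with 0, mel targets pad with a value just outside
# the model's output range so padded frames stay distinguishable, and stop tokens
# pad with 1 (= "finished").
max_abs_value, symmetric_mels = 4.0, True  # assumed hparams
target_pad = -(max_abs_value + 0.1) if symmetric_mels else -0.1
token_pad = 1.0
print(target_pad, token_pad)               # -4.1 1.0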
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, GTA=False, global_step=None,
               is_training=False, is_evaluating=False, split_infos=None, reference_mel=None):
    """
    Initializes the model for inference. Sets the "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
    """
    # Check preconditions
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not GTA:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not GTA and self.hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
    if GTA and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self.hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

    # Pick the symbol set used to create the embedding lookup table
    if self.hparams.hangul_type == 1:
        hangul_symbol = hangul_symbol_1
    elif self.hparams.hangul_type == 2:
        hangul_symbol = hangul_symbol_2
    elif self.hparams.hangul_type == 3:
        hangul_symbol = hangul_symbol_3
    elif self.hparams.hangul_type == 4:
        hangul_symbol = hangul_symbol_4
    else:
        hangul_symbol = hangul_symbol_5

    split_device = '/cpu:0'
    with tf.device(split_device):
        hp = self.hparams
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []
        # TODO: reference audio handling is still work in progress
        tower_ref_audio = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))
            if is_training:
                # When training, use mel_targets as the reference audio
                tower_ref_audio.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))

    T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else (0, hp.max_abs_value)

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU devices
    gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(tf.train.replica_device_setter(
                ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
            with tf.variable_scope('inference') as scope:
                assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                if hp.use_gst:
                    # Global style tokens (GST)
                    gst_tokens = tf.get_variable(
                        'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(stddev=0.5))
                    self.gst_tokens = gst_tokens

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.get_variable(
                    'inputs_embedding', [len(hangul_symbol), hp.embedding_dim], dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
                self.embedded_inputs_ = embedded_inputs

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = Encoder.EncoderCell(
                    Encoder.EncoderConvolution(is_training, hparams=hp, scope='encoder_convolutions'),
                    Encoder.EncoderLSTM(is_training, size=hp.enc_lstm_hidden_size,
                                        zoneout=hp.zoneout_rate, scope='encoder_LSTM'))
                encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

                # Append the reference (style) embedding to the encoder output
                if is_training:
                    # During training, reference_mel is None, so set it to the target mels
                    reference_mel = mel_targets

                if reference_mel is not None:
                    # Reference encoder
                    refnet_outputs = reference_encoder(
                        reference_mel,
                        filters=hp.reference_filters,
                        kernel_size=(3, 3),
                        strides=(2, 2),
                        encoder_cell=GRUCell(hp.reference_depth),
                        is_training=is_training)  # [N, 128]
                    self.refnet_outputs = refnet_outputs

                    if hp.use_gst:
                        style_attention = Attention.MultiheadAttention(
                            query=tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                            value=tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                                  [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                            num_heads=hp.num_heads,
                            num_units=hp.style_att_dim,
                            attention_type=hp.style_att_type)
                        style_embeddings = style_attention.multi_head_attention()  # [N, 1, 256]
                    else:
                        style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
                    style_embeddings = tf.tile(
                        style_embeddings, [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
                else:
                    print("Use random weight for GST.")
                    random_weights = tf.random_uniform(
                        [hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
                    random_weights = tf.nn.softmax(random_weights, name="random_weights")
                    style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
                    style_embeddings = tf.reshape(
                        style_embeddings,
                        [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])
                    style_embeddings = tf.tile(
                        style_embeddings,
                        [shape_list(encoder_outputs)[0], shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]

                encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)
                self.encoder_outputs = encoder_outputs
                print('encoder_outputs.shape after {}'.format(encoder_outputs.shape))

                # For shape visualization purposes
                enc_conv_output_shape = self.encoder_outputs.shape

                # Decoder parts
                # Attention decoder prenet
                prenet = Decoder.Prenet(is_training, layers_sizes=hp.prenet_layers,
                                        drop_rate=hp.dropout_rate, scope='decoder_prenet')
                print('memory.shape {}'.format(encoder_outputs.shape))

                attention_mechanism = Attention.LocationSensitiveAttention(
                    num_units=hp.attention_dim,
                    memory=encoder_outputs,
                    hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)

                # Decoder LSTM cells
                decoder_lstm = Decoder.DecoderRNN(is_training=is_training, layers=2,
                                                  size=hp.decoder_lstm_units,
                                                  zoneout=hp.zoneout_rate, scope='decoder_LSTM')

                # Frames projection layer
                frame_projection = Decoder.FrameProjection(
                    shape=hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection')

                # <stop_token> projection layer
                stop_projection = Decoder.StopProjection(
                    is_training, shape=hp.outputs_per_step, scope='stop_token_projection')

                # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = Decoder.TacotronDecoderCell(
                    prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or GTA:
                    self.helper = TacoTrainingHelper(
                        batch_size, tower_mel_targets[i], hp, GTA, is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # Initial decoder state
                decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                    Decoder.CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                    maximum_iterations=max_iters,
                    swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

                if hp.clip_outputs:
                    decoder_output = tf.minimum(
                        tf.maximum(decoder_output, T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                # Postnet
                postnet = Postnet.Postnet(is_training, hparams=hp, scope='postnet_convolutions')

                # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                print('decoder_output.shape {}'.format(decoder_output.shape))
                residual = postnet(decoder_output)
                print('residual.shape {}'.format(residual.shape))

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = Decoder.FrameProjection(hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)
                print('projected_residual.shape {}'.format(projected_residual.shape))

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual
                print('mel_outputs.shape {}'.format(mel_outputs.shape))

                if hp.clip_outputs:
                    mel_outputs = tf.minimum(
                        tf.maximum(mel_outputs, T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                # Grab alignments from the final decoder state
                alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                log('initialization done {}'.format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log('  Train mode: {}'.format(is_training))
    log('  Eval mode: {}'.format(is_evaluating))
    log('  Synthesis mode: {}'.format(not (is_training or is_evaluating)))
    log('  Input: {}'.format(inputs.shape))
    for i in range(hp.tacotron_num_gpus):
        log('  device: {}'.format(i))
        log('    embedding: {}'.format(tower_embedded_inputs[i].shape))
        log('    enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log('    encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log('    decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log('    residual out: {}'.format(tower_residual[i].shape))
        log('    projected residual out: {}'.format(tower_projected_residual[i].shape))
        log('    mel out: {}'.format(self.tower_mel_outputs[i].shape))
        log('    <stop_token> out: {}'.format(self.tower_stop_token_prediction[i].shape))
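# A numpy sketch (dimensions are assumptions based on the shape comments above, not
# repo values) of the GST fallback used when no reference mel is given: draw random
# combination weights over the style tokens and build a fixed style embedding that
# is then broadcast over the encoder timesteps.
import numpy as np

num_heads, num_gst, token_dim = 4, 10, 64  # assumed hp.num_heads, hp.num_gst, token depth
gst_tokens = np.random.normal(0, 0.5, (num_gst, token_dim)).astype(np.float32)
logits = np.random.uniform(0, 1, (num_heads, num_gst)).astype(np.float32)
weights = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)  # softmax over tokens
style_embedding = weights @ np.tanh(gst_tokens)                        # [num_heads, token_dim]
style_embedding = style_embedding.reshape(1, 1, num_heads * token_dim)
# Tiled over [N, T_in, 1] in the graph so every encoder step sees the same style vector.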