def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        examples = [self._get_next_example() for i in range(n * _batches_per_group)]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(
            len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            if not self._coord.should_stop():
                #log("enqueue op started (train).")
                self._session.run(self._enqueue_op, feed_dict=feed_dict)
                #log("enqueue op finished (train).")
    log("_enqueue_next_train_group finished.")
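
# The bucketing trick above (sort examples by output length, cut them into
# batches, then shuffle the batches) keeps per-batch padding small while still
# randomizing batch order. A self-contained sketch of the same idea on dummy
# data (illustration only; names and values are hypothetical):
def bucket_batches_demo():
    import numpy as np
    n = 4  # batch size
    examples = [('ex_{}'.format(i), np.random.randint(100, 900)) for i in range(32)]
    examples.sort(key=lambda x: x[-1])  # similar lengths land in the same batch
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)  # randomize batch order, not batch contents
    return batches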
def run_synthesis(args, checkpoint, hparams):
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! '
            'Please verify your synthesis batch size choice.'.format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! '
            'Please verify your synthesis batch size choice!'.format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    gta = args.GTA == 'True'
    synth_dir = get_synth_dir(args.caching_dir, gta)
    gta_map_file = get_gta_map_file(synth_dir)

    # Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_path = get_train_txt(args.caching_dir)
    metadata = load_meta(metadata_path)

    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams, gta=gta)

    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[2]) for x in metadata]) * frame_shift_ms / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    # Set inputs batch-wise
    metadata = [
        metadata[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')
    txt_dir = get_txt_dir(args.caching_dir)
    mel_dir = get_mel_dir(args.caching_dir)
    wav_dir = get_wav_dir(args.caching_dir)
    symbol_file = get_symbols_file(args.caching_dir)
    conv = get_from_file(symbol_file)
    with open(gta_map_file, 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            # Note: only every 10th batch is synthesized and written to the map file.
            if i % 10 == 0:
                text_paths = [os.path.join(txt_dir, "{}.npy".format(m[0])) for m in meta]
                text_symbols = [np.load(pth) for pth in text_paths]
                # Trim the padding symbol (~) at the end
                texts = [conv.sequence_to_original_text(x) for x in text_symbols]
                #texts = [m[5] for m in meta]
                mel_filenames = [os.path.join(mel_dir, "{}.npy".format(m[0])) for m in meta]
                wav_filenames = [os.path.join(wav_dir, "{}.npy".format(m[0])) for m in meta]
                basenames = [m[0] for m in meta]
                mel_output_filenames, speaker_ids = synth.synthesize(
                    texts, basenames, synth_dir, None, mel_filenames)

                for elems in zip(wav_filenames, mel_filenames, mel_output_filenames,
                                 speaker_ids, texts):
                    file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return gta_map_file
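
# Each line written to the GTA map file above is pipe-separated, in the order
# of the zip() call:
#   <wav_filename>|<mel_filename>|<mel_output_filename>|<speaker_id>|<text>
# For example (hypothetical values; <no_g> is the "no global condition" marker
# consumed by the WaveNet synthesis step):
#   .../wavs/123.npy|.../mels/123.npy|.../gta/mel-123.npy|<no_g>|Hello world.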
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caching_dir', default='/datasets/models/tacotron/cache')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    accepted_modes = ['eval', 'synthesis', 'live']
    parser.add_argument('--mode', default='eval',
                        help='mode of run: can be one of {}'.format(accepted_modes))
    parser.add_argument('--GTA', default='True',
                        help='Ground truth aligned synthesis, defaults to True, '
                             'only considered in synthesis mode')
    parser.add_argument('--text_list', default='',
                        help='Text file containing the list of texts to be synthesized. '
                             'Valid if mode=eval')
    parser.add_argument('--speaker_id', default=None,
                        help='Speaker ids to use when running standalone Wavenet on a folder '
                             'of mels. Must be a comma-separated list of ids')
    args = parser.parse_args()

    if args.mode not in accepted_modes:
        raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode))

    if args.GTA not in ('True', 'False'):
        raise ValueError('GTA option must be either True or False')

    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    from src.tac.training.wav_training import get_log_dir
    from src.tac.training.wav_training import get_save_dir

    wavenet_log_dir = get_log_dir(args.caching_dir)
    wavenet_pretrained = get_save_dir(wavenet_log_dir)

    log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
    run_synthesis(args, wavenet_pretrained, args.caching_dir, modified_hp)
    log('Tacotron-2 TTS synthesis complete!')
def run_eval(args, checkpoint, hparams, sentences):
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! '
            'Please verify your synthesis batch size choice.'.format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! '
            'Please verify your synthesis batch size choice!'.format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    output_dir = get_synthesis_output_dir(args.caching_dir)
    eval_dir = get_evals_dir(args.caching_dir)
    log_dir = os.path.join(output_dir, 'logs-eval')

    #if args.model == 'Tacotron-2':
    #    assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output paths if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    # Set inputs batch-wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))]
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir,
                                                          log_dir, None)

            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def make_test_batches(self):
    start = time.time()

    # Read one example for evaluation
    n = 1

    # Test on the entire test set (one sample per evaluation step)
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)

    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
        len(batches), n, time.time() - start))
    return batches
def load(self, checkpoint_path, hparams):
    log('Constructing model: {}'.format('WaveNet'))
    self._hparams = hparams
    local_cond, global_cond = self._check_conditions()

    self.local_conditions = tf.placeholder(
        tf.float32, shape=(None, None, hparams.num_mels),
        name='local_condition_features') if local_cond else None
    self.global_conditions = tf.placeholder(
        tf.int32, shape=(None, 1),
        name='global_condition_features') if global_cond else None
    self.synthesis_length = tf.placeholder(
        tf.int32, shape=(),
        name='synthesis_length') if not local_cond else None
    self.targets = tf.placeholder(
        tf.float32, shape=(1, None, 1),
        name='audio_targets') if hparams.wavenet_synth_debug else None  # Debug only with 1 wav
    self.input_lengths = tf.placeholder(
        tf.int32, shape=(1,),
        name='input_lengths') if hparams.wavenet_synth_debug else None
    self.synth_debug = hparams.wavenet_synth_debug

    with tf.variable_scope('WaveNet_model') as scope:
        self.model = create_model(hparams)
        self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions,
                              input_lengths=self.input_lengths,
                              synthesis_length=self.synthesis_length,
                              test_inputs=self.targets)

    sh_saver = create_shadow_saver(self.model)

    log('Loading checkpoint: {}'.format(checkpoint_path))
    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())

    load_averaged_model(self.session, sh_saver, checkpoint_path)
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.tacotron_batch_size
    r = self._hparams.outputs_per_step

    # Test on the entire test set
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]

    # Bucket examples based on similar output sequence length for efficiency
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)

    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
        len(batches), n, time.time() - start))
    return batches, r
def run_live(args, checkpoint, hparams):
    # Used when args.mode is neither eval nor synthesis
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! '
            'Please verify your synthesis batch size choice.'.format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! '
            'Please verify your synthesis batch size choice!'.format(
                hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    # Log to terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    # Generate fast greeting message
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    # Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)
        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. See you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
def _enqueue_next_test_group(self):
    # Create test batches once and evaluate on them for all test steps
    test_batches, r = self.make_test_batches()
    while not self._coord.should_stop():
        for batch in test_batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
            if not self._coord.should_stop():
                log("enqueue op started (test).")
                self._session.run(self._eval_enqueue_op, feed_dict=feed_dict)
                log("enqueue op finished (test).")
    log("_enqueue_next_test_group finished.")
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caching_dir', default='/datasets/models/tacotron/cache')
    parser.add_argument('--mode', default='synthesis',
                        help='mode for synthesis of tacotron after training')
    parser.add_argument('--GTA', default='True',
                        help='Ground truth aligned synthesis, defaults to True, '
                             'only considered in Tacotron synthesis mode')
    parser.add_argument('--tf_log_level', type=int, default=1,
                        help='Tensorflow C++ log level.')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    args = parser.parse_args()

    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)

    log_dir = get_log_dir(args.caching_dir)
    os.makedirs(log_dir, exist_ok=True)
    infolog_path = get_infolog_path(log_dir)
    infolog.init(infolog_path, 'tacotron')

    log('\n##########################################################\n')
    log('Tacotron GTA Synthesis\n')
    log('##########################################################\n')

    tacotron_pretrained = get_save_dir(log_dir)
    run_synthesis(args, tacotron_pretrained, modified_hp)
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
    log('\nSaving intermediate states at step {}'.format(global_step))
    idx = 0
    y_hat, y, loss, length, input_mel, upsampled_features = sess.run([
        model.tower_y_hat_log[0][idx],
        model.tower_y_log[0][idx],
        model.loss,
        model.tower_input_lengths[0][idx],
        model.tower_c[0][idx],
        model.tower_upsampled_local_features[0][idx],
    ])

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(plot_dir,
                            'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(plot_dir,
                                  'step-{}-upsampled-features.png'.format(global_step))

    # Save figure
    util.waveplot(plot_path, y_hat, y, hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(
                      model_name, time_string(), global_step, loss))

    # Compare the mel of the generated wav with the original input mel to evaluate
    # WaveNet's audio reconstruction performance. Both mels should match on
    # low-frequency information; the WaveNet mel should contain more high-frequency
    # detail than Tacotron mels.
    T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) \
        if hparams.symmetric_mels else (0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(
        generated_mel, mel_path,
        title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
            global_step, loss),
        target_spectrogram=input_mel.T)
    util.plot_spectrogram(
        upsampled_features.T, upsampled_path,
        title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
            global_step, loss),
        auto_aspect=True)

    # Save audio
    save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
    save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
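
# _interp is referenced above but not defined in this excerpt. A minimal
# hypothetical sketch, assuming it linearly rescales a mel spectrogram into the
# Tacotron output range (the real helper may clip or scale differently):
def _interp_sketch(mel, output_range):
    lo, hi = output_range
    # Rescale values to [lo, hi]; the epsilon guards against a constant input.
    return lo + (hi - lo) * (mel - mel.min()) / (mel.max() - mel.min() + 1e-8)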
def __init__(self, coordinator, caching_dir, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._train_offset = 0
    self._test_offset = 0

    # Load metadata
    self._txt_dir = get_txt_dir(caching_dir)
    self._mel_dir = get_mel_dir(caching_dir)
    self._linear_dir = get_lin_dir(caching_dir)
    metadata_path = get_train_txt(caching_dir)
    self._metadata = load_meta(metadata_path)
    #self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    #self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    # with open(metadata_filename, encoding='utf-8') as f:
    #     self._metadata = [line.strip().split('|') for line in f]
    frame_shift_ms = hparams.hop_size / hparams.sample_rate
    hours = sum([int(x[2]) for x in self._metadata]) * frame_shift_ms / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(
        len(self._metadata), hours))

    # Train/test split
    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches is not None

    test_size = (hparams.tacotron_test_size
                 if hparams.tacotron_test_size is not None
                 else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
    indices = np.arange(len(self._metadata))
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size,
        random_state=hparams.tacotron_data_random_state)

    # Make sure test_indices is a multiple of batch_size, else round down
    len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
    extra_test = test_indices[len_test_indices:]
    test_indices = test_indices[:len_test_indices]
    train_indices = np.concatenate([train_indices, extra_test])

    self._train_meta = list(np.array(self._metadata)[train_indices])
    self._test_meta = list(np.array(self._metadata)[test_indices])

    self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size
    log('test_steps = {}'.format(self.test_steps))

    if hparams.tacotron_test_size is None:
        assert hparams.tacotron_test_batches == self.test_steps

    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the padding to a value that doesn't originally exist in the
    # spectrogram, to avoid any possible conflicts, without affecting the output
    # range of the model too much
    if hparams.symmetric_mels:
        self._target_pad = -hparams.max_abs_value
    else:
        self._target_pad = 0.
    # Mark finished sequences with 1s
    self._token_pad = 1.

    with tf.device('/cpu:0'):
        # Create placeholders for inputs and targets. Don't specify batch size because
        # we want to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
            tf.placeholder(tf.int32, shape=(None,), name='input_lengths'),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                           name='mel_targets'),
            tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
            tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq),
                           name='linear_targets'),
            tf.placeholder(tf.int32, shape=(None,), name='targets_lengths'),
            tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                           name='split_infos'),
        ]

        # Create queue for buffering data
        self.input_queue = tf.FIFOQueue(
            8,
            [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32],
            name='input_queue')
        self._enqueue_op = self.input_queue.enqueue(self._placeholders)
        log("dequeue input_queue")
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
            self.linear_targets, self.targets_lengths, self.split_infos = \
            self.input_queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)
        self.targets_lengths.set_shape(self._placeholders[5].shape)
        self.split_infos.set_shape(self._placeholders[6].shape)

        # Create eval queue for buffering eval data
        self.eval_queue = tf.FIFOQueue(
            1,
            [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32],
            name='eval_queue')
        # TODO: here is a bug with session
        self._eval_enqueue_op = self.eval_queue.enqueue(self._placeholders)
        log("dequeue eval_queue")
        self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
            self.eval_token_targets, self.eval_linear_targets, \
            self.eval_targets_lengths, self.eval_split_infos = self.eval_queue.dequeue()

        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_mel_targets.set_shape(self._placeholders[2].shape)
        self.eval_token_targets.set_shape(self._placeholders[3].shape)
        self.eval_linear_targets.set_shape(self._placeholders[4].shape)
        self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
        self.eval_split_infos.set_shape(self._placeholders[6].shape)
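
# _round_down is referenced in __init__ above but not defined in this excerpt.
# A minimal hypothetical sketch, assuming it floors a count to the nearest
# multiple of the batch size (e.g. _round_down(103, 32) -> 96); the real
# helper may be implemented differently:
def _round_down_sketch(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x - remainder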
def initialize(self, inputs, input_lengths, symbols_count: int, mel_targets=None,
               stop_token_targets=None, linear_targets=None, targets_lengths=None,
               gta=False, global_step=None, is_training=False, is_evaluating=False,
               split_infos=None):
    """
    Initializes the model for inference. Sets the "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number
          of steps in the input time series, and values are character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are
          the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
          T_out is number of steps in the output time series, M is num_mels, and values
          are entries in the mel spectrogram. Only needed for training.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError('Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
        raise ValueError('Model is set to use post processing to predict linear '
                         'spectrograms in training but no linear targets given!')
    if gta and linear_targets is not None:
        raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError('Model set to mask paddings but no targets lengths provided '
                           'for the mask!')
    if is_training and is_evaluating:
        raise RuntimeError('Model can not be in training and evaluation modes at the '
                           'same time!')

    split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 \
        or self._hparams.split_on_cpu else '/gpu:0'
    with tf.device(split_device):
        hp = self._hparams
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(input_lengths,
                                       num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]],
            lout_float) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))

    T2_output_range = (-hp.max_abs_value, hp.max_abs_value) \
        if hp.symmetric_mels else (0, hp.max_abs_value)

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU devices
    gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                      worker_device=gpus[i])):
            with tf.variable_scope('inference') as scope:
                assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train the WaveNet vocoder,
                # so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.get_variable(
                    'inputs_embedding', [symbols_count, hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                         tower_inputs[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training, hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training, size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))
                encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

                # For shape visualization purposes
                enc_conv_output_shape = encoder_cell.conv_output_shape

                # Decoder parts
                # Attention decoder prenet
                prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope='decoder_prenet')
                # Attention mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim, encoder_outputs, hparams=hp,
                    is_training=is_training, mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM cells
                decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope='decoder_LSTM')
                # Frames projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope='linear_transform_projection')
                # <stop_token> projection layer
                stop_projection = StopProjection(is_training or is_evaluating,
                                                 shape=hp.outputs_per_step,
                                                 scope='stop_token_projection')

                # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                                   decoder_lstm, frame_projection,
                                                   stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i],
                                                     hp, gta, is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # Initial decoder state
                decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                             dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = \
                    dynamic_decode(
                        CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                        impute_finished=False,
                        maximum_iterations=max_iters,
                        swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                if hp.clip_outputs:
                    decoder_output = tf.minimum(
                        tf.maximum(decoder_output,
                                   T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                # Postnet
                postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

                # Compute residual using post-net
                # ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(hp.num_mels,
                                                      scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if hp.clip_outputs:
                    mel_outputs = tf.minimum(
                        tf.maximum(mel_outputs,
                                   T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

                if post_condition:
                    # Add post-processing CBHG. This does a great job at extracting
                    # features from mels before projection to linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers, hp.cbhg_highway_units,
                                     hp.cbhg_rnn_units, hp.batch_norm_position,
                                     is_training, name='CBHG_postnet')

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope='cbhg_linear_specs_projection')

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                    if hp.clip_outputs:
                        linear_outputs = tf.minimum(
                            tf.maximum(linear_outputs,
                                       T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                # Grab alignments from the final decoder state
                alignments = tf.transpose(final_decoder_state.alignment_history.stack(),
                                          [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
        log('initialisation done {}'.format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.trainable_variables()

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log('  Train mode:               {}'.format(is_training))
    log('  Eval mode:                {}'.format(is_evaluating))
    log('  GTA mode:                 {}'.format(gta))
    log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
    log('  Input:                    {}'.format(inputs.shape))
    for i in range(hp.tacotron_num_gpus):
        log('  device:                   {}'.format(i))
        log('  embedding:                {}'.format(tower_embedded_inputs[i].shape))
        log('  enc conv out:             {}'.format(tower_enc_conv_output_shape[i]))
        log('  encoder out:              {}'.format(tower_encoder_outputs[i].shape))
        log('  decoder out:              {}'.format(self.tower_decoder_output[i].shape))
        log('  residual out:             {}'.format(tower_residual[i].shape))
        log('  projected residual out:   {}'.format(tower_projected_residual[i].shape))
        log('  mel out:                  {}'.format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log('  linear out:               {}'.format(self.tower_linear_outputs[i].shape))
        log('  <stop_token> out:         {}'.format(
            self.tower_stop_token_prediction[i].shape))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    log('  Tacotron Parameters       {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
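
# split_func is used above via tf.py_func but is not part of this excerpt.
# A minimal hypothetical sketch, assuming it cuts the padded batch along the
# time axis into one chunk per GPU, with split_infos carrying each chunk's
# width (the real helper may differ):
def split_func_sketch(x, split_pos):
    # x is the numpy value of the placeholder; split_pos holds per-GPU widths.
    rst = []
    start = 0
    for i in range(split_pos.shape[0]):
        rst.append(x[:, start:start + split_pos[i]])
        start += split_pos[i]
    return rst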
def load(self, checkpoint_path, hparams, gta=False):
    log('Constructing model: Tacotron')
    # Force the batch size to be known in order to use attention masking in batch synthesis
    inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
    input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
    targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels),
                             name='mel_targets')
    split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                                 name='split_infos')
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
        self.model = Tacotron(hparams)
        symbols_count = self._symbol_converter.get_symbols_count()
        if gta:
            self.model.initialize(inputs, input_lengths, symbols_count, targets,
                                  gta=gta, split_infos=split_infos)
        else:
            self.model.initialize(inputs, input_lengths, symbols_count,
                                  split_infos=split_infos)
        self.mel_outputs = self.model.tower_mel_outputs
        self.linear_outputs = self.model.tower_linear_outputs \
            if (hparams.predict_linear and not gta) else None
        self.alignments = self.model.tower_alignments
        self.stop_token_prediction = self.model.tower_stop_token_prediction
        self.charsets = targets

    if hparams.GL_on_GPU:
        self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels),
                                               name='GLGPU_mel_inputs')
        self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq),
                                               name='GLGPU_lin_inputs')
        self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            self.GLGPU_mel_inputs, hparams)
        self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            self.GLGPU_lin_inputs, hparams)

    self.gta = gta
    self._hparams = hparams
    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0
    # Explicitly set the padding to a value that doesn't originally exist in the
    # spectrogram, to avoid any possible conflicts, without affecting the output
    # range of the model too much
    if hparams.symmetric_mels:
        self._target_pad = -hparams.max_abs_value
    else:
        self._target_pad = 0.

    self.inputs = inputs
    self.input_lengths = input_lengths
    self.targets = targets
    self.split_infos = split_infos

    log('Loading checkpoint: %s' % checkpoint_path)
    # Memory allocation on the GPUs as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
def run(testrun: bool = False):
    import argparse
    parser = argparse.ArgumentParser()

    train_steps = 5000
    checkpoint_interval = 100
    if testrun:
        train_steps = 20
        checkpoint_interval = 1

    parser.add_argument('--caching_dir', default='/datasets/models/tacotron/cache')
    parser.add_argument('--GTA', default='True',
                        help='Ground truth aligned synthesis, defaults to True, '
                             'only considered in Tacotron synthesis mode')
    parser.add_argument('--tf_log_level', type=int, default=1,
                        help='Tensorflow C++ log level.')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    parser.add_argument('--restore', type=bool, default=False,
                        help='Set this to False to do a fresh training')
    parser.add_argument('--checkpoint_interval', type=int, default=checkpoint_interval,
                        help='Steps between writing checkpoints')  # 2500
    parser.add_argument('--eval_interval', type=int, default=100000,
                        help='Steps between eval on test data')
    parser.add_argument('--summary_interval', type=int, default=10000,
                        help='Steps between running summary ops')
    parser.add_argument('--embedding_interval', type=int, default=10000,
                        help='Steps between updating embeddings projection visualization')
    parser.add_argument('--wavenet_train_steps', type=int, default=train_steps,
                        help='total number of wavenet training steps')
    args = parser.parse_args()
    modified_hp = hparams.parse(args.hparams)

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)

    log_dir = get_log_dir(args.caching_dir)
    os.makedirs(log_dir, exist_ok=True)
    infolog_path = get_infolog_path(log_dir)
    infolog.init(infolog_path, 'tacotron')

    log('\n##########################################################\n')
    log('Wavenet Train\n')
    log('##########################################################\n')

    train(log_dir, args, modified_hp)
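
# Caveat on the --restore flag above: argparse's type=bool converts via bool(str),
# so any non-empty string (including "False") parses as True; only --restore with
# an empty value yields False. A common workaround is a string-to-bool converter
# passed as type= (hypothetical helper, not in the original source):
def _str2bool_sketch(value):
    import argparse
    if value.lower() in ('true', '1', 'yes'):
        return True
    if value.lower() in ('false', '0', 'no'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))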
def run_synthesis(args, checkpoint, caching_dir, hparams):
    output_dir = get_output_dir(caching_dir)

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')

    # We suppose the user will provide the correct folder depending on the training method
    log(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    #if args.model == 'Tacotron-2':
    # If running all of Tacotron-2, synthesize audio from evaluated mels
    evals_dir = get_evals_dir(args.caching_dir)
    metadata_filename = os.path.join(evals_dir, 'map.txt')
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = np.array([line.strip().split('|') for line in f])

    speaker_ids = metadata[:, 2]
    mel_files = metadata[:, 1]
    texts = metadata[:, 0]

    speaker_ids = None if (speaker_ids == '<no_g>').all() else speaker_ids
    # else:
    #     # else get all npy files in input_dir (supposing they are mels)
    #     mel_files = sorted([os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy'])
    #     speaker_ids = None if args.speaker_id is None else args.speaker_id.replace(' ', '').split(',')
    #     if speaker_ids is not None:
    #         assert len(speaker_ids) == len(mel_files)
    #     texts = None

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    mel_files = [
        mel_files[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)
    ]
    speaker_ids = None if speaker_ids is None else [
        speaker_ids[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)
    ]
    texts = None if texts is None else [
        texts[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)
    ]

    with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
        for i, mel_batch in enumerate(tqdm(mel_files)):
            mel_spectros = [np.load(mel_file) for mel_file in mel_batch]

            basenames = [os.path.basename(mel_file).replace('.npy', '')
                         for mel_file in mel_batch]
            speaker_id_batch = None if speaker_ids is None else speaker_ids[i]
            audio_files = synth.synthesize(mel_spectros, speaker_id_batch, basenames,
                                           wav_dir, log_dir)

            speaker_logs = ['<no_g>'] * len(mel_batch) \
                if speaker_id_batch is None else speaker_id_batch

            for j, mel_file in enumerate(mel_batch):
                if texts is None:
                    file.write('{}|{}|{}\n'.format(mel_file, audio_files[j],
                                                   speaker_logs[j]))
                else:
                    file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file,
                                                      audio_files[j], speaker_logs[j]))
    log('synthesized audio waveforms at {}'.format(wav_dir))
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams,
              model_name):
    '''Evaluate model during training.
    Supposes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss, input_mel, upsampled_features = sess.run([
        model.tower_y_hat[0],
        model.tower_y_target[0],
        model.eval_loss,
        model.tower_eval_c[0],
        model.tower_eval_upsampled_local_features[0],
    ])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec '
        '({:.3f} frames/sec)'.format(len(y_target), duration, len(y_target) / duration))

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(plot_dir,
                            'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(plot_dir,
                                  'step-{}-upsampled-features.png'.format(global_step))

    # Save figure
    util.waveplot(plot_path, y_hat, y_target, model._hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(
                      model_name, time_string(), global_step, loss))
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    # Compare the mel of the generated wav with the original input mel to evaluate
    # WaveNet's audio reconstruction performance. Both mels should match on
    # low-frequency information; the WaveNet mel should contain more high-frequency
    # detail than Tacotron mels.
    T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) \
        if hparams.symmetric_mels else (0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(
        generated_mel, mel_path,
        title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
            global_step, loss),
        target_spectrogram=input_mel.T)
    util.plot_spectrogram(
        upsampled_features.T, upsampled_path,
        title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
            global_step, loss),
        auto_aspect=True)

    # Save audio
    save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
    save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)

    # Write eval summary to tensorboard
    log('Writing eval summary!')
    add_test_stats(summary_writer, global_step, loss, hparams=hparams)
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caching_dir', default='/datasets/models/tacotron/cache')
    parser.add_argument('--checkpoint', default='pretrained/',
                        help='Path to model checkpoint')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    #parser.add_argument('--name', help='Name of logging directory if the two models were trained together.')
    #parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately')
    #parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. If trained separately')
    #parser.add_argument('--model', default='Tacotron-2')
    #parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets')
    #parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet')
    #parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
    accepted_modes = ['eval', 'synthesis', 'live']
    parser.add_argument('--mode', default='eval',
                        help='mode of run: can be one of {}'.format(accepted_modes))
    parser.add_argument('--GTA', default='True',
                        help='Ground truth aligned synthesis, defaults to True, '
                             'only considered in synthesis mode')
    parser.add_argument('--text_list', default='',
                        help='Text file containing the list of texts to be synthesized. '
                             'Valid if mode=eval')
    parser.add_argument('--speaker_id', default=None,
                        help='Speaker ids to use when running standalone Wavenet on a folder '
                             'of mels. Must be a comma-separated list of ids')
    args = parser.parse_args()

    if args.mode not in accepted_modes:
        raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode))

    if args.GTA not in ('True', 'False'):
        raise ValueError('GTA option must be either True or False')

    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    from src.tac.training.tacotron_training import get_log_dir
    from src.tac.training.tacotron_training import get_save_dir

    taco_log_dir = get_log_dir(args.caching_dir)
    tacotron_pretrained = get_save_dir(taco_log_dir)
    #run_name = args.name or args.tacotron_name or args.model
    #taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint)

    sentences = get_sentences(args)

    log('Synthesizing mel-spectrograms from text..')
    run_eval(args, tacotron_pretrained, modified_hp, sentences)
def train(log_dir, args, hparams):
    save_dir = get_save_dir(log_dir)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')

    gta = args.GTA == 'True'
    synth_dir = get_synth_dir(args.caching_dir, gta)
    gta_map_file = get_gta_map_file(synth_dir)

    log('Checkpoint_path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(gta_map_file))
    log('Using model: {}'.format('WaveNet'))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.wavenet_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, gta_map_file, hparams)

    # Set up model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Speaker embeddings metadata
    if hparams.speakers_path is not None:
        speaker_embedding_meta = hparams.speakers_path
    else:
        speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv')
        if not os.path.isfile(speaker_embedding_meta):
            with open(speaker_embedding_meta, 'w', encoding='utf-8') as f:
                for speaker in hparams.speakers:
                    f.write('{}\n'.format(speaker))
        speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..')

    # Bookkeeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    sh_saver = create_shadow_saver(model, global_step)

    log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    run_init = False

    # Train
    sess = tf.Session(config=config)
    summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
    sess.run(tf.global_variables_initializer())

    # Saved model restoring
    if args.restore:
        # Restore saved model if the user requested it, default = True
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)
            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path), slack=True)
                load_averaged_model(sess, sh_saver,
                                    checkpoint_state.model_checkpoint_path)
            else:
                log('No model to load at {}'.format(save_dir), slack=True)
                if hparams.wavenet_weight_normalization:
                    run_init = True
        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
        if hparams.wavenet_weight_normalization:
            run_init = True

    if run_init:
        log('\nApplying Weight normalization in fresh training. '
            'Applying data dependent initialization forward pass..')
        # Create init_model
        init_model, _ = model_train_mode(args, feeder, hparams, global_step, init=True)

    # Initializing feeder
    feeder.start_threads(sess)

    if run_init:
        # Run one forward pass for model parameter initialization
        # (make prediction on init_batch)
        _ = sess.run(init_model.tower_y_hat)
        log('Data dependent initialization done. Starting training!')

    # Training loop
    while not coord.should_stop() and step < args.wavenet_train_steps:
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)

        message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
            step, time_window.average, loss, loss_window.average)
        log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

        if np.isnan(loss) or loss > 100:
            log('Loss exploded to {:.5f} at step {}'.format(loss, step))
            raise Exception('Loss exploded')

        if step % args.summary_interval == 0:
            log('\nWriting summary at step {}'.format(step))
            summary_writer.add_summary(sess.run(stats), step)

        if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
            save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams,
                     model_name='WaveNet')
            save_checkpoint(sess, sh_saver, checkpoint_path, global_step)

        if step % args.eval_interval == 0:
            log('\nEvaluating at step {}'.format(step))
            eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir,
                      summary_writer=summary_writer, hparams=model._hparams,
                      model_name='WaveNet')

        if hparams.gin_channels > 0 and (step % args.embedding_interval == 0
                                         or step == args.wavenet_train_steps
                                         or step == 1):
            # Get current checkpoint state
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            # Update projector
            log('\nSaving Model Speaker Embeddings visualization..')
            add_embedding_stats(summary_writer, [model.embedding_table.name],
                                [speaker_embedding_meta],
                                checkpoint_state.model_checkpoint_path)
            log('WaveNet Speaker embeddings have been updated on tensorboard!')

    log('Wavenet training complete after {} global steps'.format(
        args.wavenet_train_steps), slack=True)
    coord.request_stop()
    coord.wait_for_stop()

    try:
        sess.close()
        tf.reset_default_graph()
    except Exception:
        log("Session bug occurred.")

    # except Exception as e:
    #     log('Exiting due to exception: {}'.format(e), slack=True)
    #     traceback.print_exc()
    #     coord.request_stop(e)
    #     coord.wait_for_stop()
    #     raise Exception('Exception occured.')
    sleep(0.5)
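
# create_shadow_saver / load_averaged_model / save_checkpoint are referenced
# throughout this file but not shown in this excerpt. A minimal sketch of the
# "shadow saver" pattern they appear to follow (and that eval_step's docstring
# alludes to): a tf.train.Saver over exponential-moving-average (shadow) copies
# of the weights, so that restores load averaged parameters. This is an
# illustration under that assumption, not the project's actual implementation.
def create_shadow_saver_sketch(global_step=None):
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    # Map each shadow variable's checkpoint name to the live variable so that
    # restoring a checkpoint loads the averaged weights directly into the model.
    variables = {ema.average_name(v): v for v in tf.trainable_variables()}
    if global_step is not None:
        variables['global_step'] = global_step
    return tf.train.Saver(variables, max_to_keep=5)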