def generate_data(label_type, model, batch_size=1, splice=1):
    """Make a toy mini-batch from the bundled LDC93S1 sample utterance.

    The same utterance is repeated ``batch_size`` times, so this is meant
    for sanity checks / overfitting tests rather than real training.

    Args:
        label_type (string): character or phone or multitask
        model (string): ctc or attention or joint_ctc_attention
        batch_size (int, optional): the size of mini-batch
        splice (int, optional): frames to splice. Default is 1 frame.
    Returns:
        inputs: `[B, T, input_size]`
        labels: `[B]`
        inputs_seq_len: `[B, frame_num]`
        labels_seq_len: `[B]` (if model is attention)
    """
    # Make input data (40-dim log filterbank with delta and delta-delta,
    # no energy)
    inputs, inputs_seq_len = wav2feature(
        ['./sample/LDC93S1.wav'] * batch_size,
        feature_type='logfbank', feature_dim=40,
        energy=False, delta1=True, delta2=True)

    # Splice neighboring frames into each frame's feature vector
    inputs = do_splice(inputs, splice=splice)

    # Phone-to-index converters; CTC and attention use separate mapping files
    phone2idx_ctc = Phone2idx(
        map_file_path='../../experiments/timit/metrics/mapping_files/ctc/phone61.txt')
    phone2idx_att = Phone2idx(
        map_file_path='../../experiments/timit/metrics/mapping_files/attention/phone61.txt')

    # Make transcripts
    if model == 'ctc':
        if label_type == 'character':
            transcript = _read_text('./sample/LDC93S1.txt')
            # Strip periods and pad both ends with a space delimiter
            transcript = ' ' + transcript.replace('.', '') + ' '
            labels = [alpha2idx(transcript)] * batch_size

            # Convert to SparseTensor
            labels = list2sparsetensor(labels, padded_value=-1)

            return inputs, labels, inputs_seq_len

        elif label_type == 'phone':
            transcript = _read_phone('./sample/LDC93S1.phn')
            labels = [phone2idx_ctc(transcript.split(' '))] * batch_size

            # Convert to SparseTensor
            labels = list2sparsetensor(labels, padded_value=-1)

            return inputs, labels, inputs_seq_len

        elif label_type == 'multitask':
            transcript_char = _read_text('./sample/LDC93S1.txt')
            transcript_phone = _read_phone('./sample/LDC93S1.phn')
            transcript_char = ' ' + transcript_char.replace('.', '') + ' '
            labels_char = [alpha2idx(transcript_char)] * batch_size
            labels_phone = [
                phone2idx_ctc(transcript_phone.split(' '))] * batch_size

            # Convert to SparseTensor
            labels_char = list2sparsetensor(labels_char, padded_value=-1)
            labels_phone = list2sparsetensor(labels_phone, padded_value=-1)

            return inputs, labels_char, labels_phone, inputs_seq_len

    elif model == 'attention':
        if label_type == 'character':
            transcript = _read_text('./sample/LDC93S1.txt')
            # '<' / '>' presumably mark SOS / EOS for the attention decoder
            # — confirm against the attention mapping file
            transcript = '<' + transcript.replace('.', '') + '>'
            labels = [alpha2idx(transcript)] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size

            return inputs, labels, inputs_seq_len, labels_seq_len

        elif label_type == 'phone':
            transcript = _read_phone('./sample/LDC93S1.phn')
            transcript = '< ' + transcript + ' >'
            labels = [phone2idx_att(transcript.split(' '))] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size

            return inputs, labels, inputs_seq_len, labels_seq_len

        elif label_type == 'multitask':
            transcript_char = _read_text('./sample/LDC93S1.txt')
            transcript_phone = _read_phone('./sample/LDC93S1.phn')
            transcript_char = '<' + transcript_char.replace('.', '') + '>'
            transcript_phone = '< ' + transcript_phone + ' >'
            labels_char = [alpha2idx(transcript_char)] * batch_size
            labels_phone = [
                phone2idx_att(transcript_phone.split(' '))] * batch_size
            target_len_char = [len(labels_char[0])] * batch_size
            target_len_phone = [len(labels_phone[0])] * batch_size

            return (inputs, labels_char, labels_phone,
                    inputs_seq_len, target_len_char, target_len_phone)

    elif model == 'joint_ctc_attention':
        if label_type == 'character':
            transcript = _read_text('./sample/LDC93S1.txt')
            # Attention targets get SOS/EOS brackets; CTC targets get space
            # padding — the two heads use different label formats
            att_transcript = '<' + transcript.replace('.', '') + '>'
            ctc_transcript = ' ' + transcript.replace('.', '') + ' '
            att_labels = [alpha2idx(att_transcript)] * batch_size
            labels_seq_len = [len(att_labels[0])] * batch_size
            ctc_labels = [alpha2idx(ctc_transcript)] * batch_size

            # Convert to SparseTensor
            ctc_labels = list2sparsetensor(ctc_labels, padded_value=-1)

            return inputs, att_labels, inputs_seq_len, labels_seq_len, ctc_labels

        elif label_type == 'phone':
            transcript = _read_phone('./sample/LDC93S1.phn')
            att_transcript = '< ' + transcript + ' >'
            att_labels = [
                phone2idx_att(att_transcript.split(' '))] * batch_size
            labels_seq_len = [len(att_labels[0])] * batch_size
            ctc_labels = [phone2idx_ctc(transcript.split(' '))] * batch_size

            # Convert to SparseTensor
            ctc_labels = list2sparsetensor(ctc_labels, padded_value=-1)

            return inputs, att_labels, inputs_seq_len, labels_seq_len, ctc_labels
def _make_tower_feed_dict(model, num_gpu, inputs, labels, inputs_seq_len,
                          labels_seq_len, keep_prob_encoder,
                          keep_prob_decoder, keep_prob_embedding):
    """Build a feed dict covering the placeholders of every GPU tower."""
    feed_dict = {}
    for i_gpu in range(num_gpu):
        feed_dict[model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
        feed_dict[model.labels_pl_list[i_gpu]] = labels[i_gpu]
        feed_dict[model.inputs_seq_len_pl_list[i_gpu]] = inputs_seq_len[i_gpu]
        feed_dict[model.labels_seq_len_pl_list[i_gpu]] = labels_seq_len[i_gpu]
        feed_dict[model.keep_prob_encoder_pl_list[i_gpu]] = keep_prob_encoder
        feed_dict[model.keep_prob_decoder_pl_list[i_gpu]] = keep_prob_decoder
        feed_dict[model.keep_prob_embedding_pl_list[i_gpu]] = keep_prob_embedding
    return feed_dict


def do_train(model, params, gpu_indices):
    """Run multi-tower (multi-GPU) training.

    Args:
        model: the model to train
        params (dict): A dictionary of parameters
        gpu_indices (list): GPU indices

    Raises:
        ValueError: if params['label_type'] contains neither 'kanji' nor
            'kana' (previously map_file_path was silently left unbound and
            a NameError surfaced later at the Dataset call).
    """
    if 'kanji' in params['label_type']:
        map_file_path = '../metrics/mapping_files/' + \
            params['label_type'] + '_' + params['train_data_size'] + '.txt'
    elif 'kana' in params['label_type']:
        map_file_path = '../metrics/mapping_files/' + \
            params['label_type'] + '.txt'
    else:
        raise ValueError(
            "Unexpected label_type: %s (must contain 'kanji' or 'kana')" %
            params['label_type'])

    # Load dataset
    train_data = Dataset(
        data_type='train', train_data_size=params['train_data_size'],
        label_type=params['label_type'], map_file_path=map_file_path,
        batch_size=params['batch_size'], max_epoch=params['num_epoch'],
        splice=params['splice'], num_stack=params['num_stack'],
        num_skip=params['num_skip'], sort_utt=True,
        sort_stop_epoch=params['sort_stop_epoch'],
        num_gpu=len(gpu_indices))
    dev_data = Dataset(
        data_type='dev', train_data_size=params['train_data_size'],
        label_type=params['label_type'], map_file_path=map_file_path,
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=False, num_gpu=len(gpu_indices))

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set optimizer
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = model._set_optimizer(
            params['optimizer'], learning_rate_pl)

        # Calculate the gradients for each model tower
        total_grads_and_vars, total_losses = [], []
        decode_ops_infer, ler_ops = [], []
        all_devices = ['/gpu:%d' % i_gpu for i_gpu in range(len(gpu_indices))]
        # NOTE: /cpu:0 is prepared for evaluation
        with tf.variable_scope(tf.get_variable_scope()):
            for i_gpu in range(len(all_devices)):
                with tf.device(all_devices[i_gpu]):
                    with tf.name_scope('tower_gpu%d' % i_gpu) as scope:
                        # Define placeholders in each tower
                        model.create_placeholders()

                        # Calculate the total loss for the current tower of
                        # the model. This function constructs the entire model
                        # but shares the variables across all towers.
                        tower_loss, tower_logits, tower_decoder_outputs_train, tower_decoder_outputs_infer = model.compute_loss(
                            model.inputs_pl_list[i_gpu],
                            model.labels_pl_list[i_gpu],
                            model.inputs_seq_len_pl_list[i_gpu],
                            model.labels_seq_len_pl_list[i_gpu],
                            model.keep_prob_encoder_pl_list[i_gpu],
                            model.keep_prob_decoder_pl_list[i_gpu],
                            model.keep_prob_embedding_pl_list[i_gpu],
                            scope)
                        tower_loss = tf.expand_dims(tower_loss, axis=0)
                        total_losses.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on
                        # this tower
                        tower_grads_and_vars = optimizer.compute_gradients(
                            tower_loss)

                        # Gradient clipping
                        tower_grads_and_vars = model._clip_gradients(
                            tower_grads_and_vars)

                        # TODO: Optionally add gradient noise

                        # Keep track of the gradients across all towers
                        total_grads_and_vars.append(tower_grads_and_vars)

                        # Add to the graph each operation per tower
                        _, decode_op_tower_infer = model.decode(
                            tower_decoder_outputs_train,
                            tower_decoder_outputs_infer)
                        decode_ops_infer.append(decode_op_tower_infer)
                        ler_op_tower = model.compute_ler(
                            model.labels_st_true_pl_list[i_gpu],
                            model.labels_st_pred_pl_list[i_gpu])
                        ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
                        ler_ops.append(ler_op_tower)

        # Aggregate losses, then calculate average loss
        total_losses = tf.concat(axis=0, values=total_losses)
        loss_op = tf.reduce_mean(total_losses, axis=0)
        ler_ops = tf.concat(axis=0, values=ler_ops)
        ler_op = tf.reduce_mean(ler_ops, axis=0)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        average_grads_and_vars = average_gradients(total_grads_and_vars)

        # Apply the gradients to adjust the shared variables.
        train_op = optimizer.apply_gradients(
            average_grads_and_vars, global_step=global_step)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []

        # Create a session for running operation on the graph
        # NOTE: Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the ops do
        # not have GPU implementations.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False)) as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize param
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            cer_dev_best = 1
            not_improved_epoch = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, labels_train, inputs_seq_len, labels_seq_len, _ = data
                feed_dict_train = _make_tower_feed_dict(
                    model, len(gpu_indices), inputs, labels_train,
                    inputs_seq_len, labels_seq_len,
                    1 - float(params['dropout_encoder']),
                    1 - float(params['dropout_decoder']),
                    1 - float(params['dropout_embedding']))
                feed_dict_train[learning_rate_pl] = learning_rate

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % int(params['print_step'] / len(gpu_indices)) == 0:

                    # Create feed dictionary for next mini batch (dev);
                    # all keep probabilities are 1.0 (no dropout) here
                    inputs, labels_dev, inputs_seq_len, labels_seq_len, _ = dev_data.next()[
                        0]
                    feed_dict_dev = _make_tower_feed_dict(
                        model, len(gpu_indices), inputs, labels_dev,
                        inputs_seq_len, labels_seq_len, 1.0, 1.0, 1.0)

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_train[model.keep_prob_encoder_pl_list[i_gpu]] = 1.0
                        feed_dict_train[model.keep_prob_decoder_pl_list[i_gpu]] = 1.0
                        feed_dict_train[model.keep_prob_embedding_pl_list[i_gpu]] = 1.0

                    # Predict class ids
                    predicted_ids_train_list, summary_str_train = sess.run(
                        [decode_ops_infer, summary_train],
                        feed_dict=feed_dict_train)
                    predicted_ids_dev_list, summary_str_dev = sess.run(
                        [decode_ops_infer, summary_dev],
                        feed_dict=feed_dict_dev)

                    # Convert to sparsetensor to compute LER.
                    # FIX: the original assignments for the true labels had a
                    # trailing comma, which stored a 1-tuple instead of the
                    # SparseTensor itself.
                    feed_dict_ler_train = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_ler_train[model.labels_st_true_pl_list[i_gpu]] = list2sparsetensor(
                            labels_train[i_gpu],
                            padded_value=train_data.padded_value)
                        feed_dict_ler_train[model.labels_st_pred_pl_list[i_gpu]] = list2sparsetensor(
                            predicted_ids_train_list[i_gpu],
                            padded_value=train_data.padded_value)
                    feed_dict_ler_dev = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_ler_dev[model.labels_st_true_pl_list[i_gpu]] = list2sparsetensor(
                            labels_dev[i_gpu],
                            padded_value=dev_data.padded_value)
                        feed_dict_ler_dev[model.labels_st_pred_pl_list[i_gpu]] = list2sparsetensor(
                            predicted_ids_dev_list[i_gpu],
                            padded_value=dev_data.padded_value)

                    # Compute accuracy
                    # ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train)
                    # ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev)
                    ler_train = 1
                    ler_dev = 1
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    # TODO: fix this

                    # Update event files
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail,
                           loss_train, loss_dev, ler_train, ler_dev,
                           learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save fugure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        print('=== Dev Data Evaluation ===')
                        cer_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_ops=decode_ops_infer,
                            model=model,
                            dataset=dev_data,
                            label_type=params['label_type'],
                            train_data_size=params['train_data_size'],
                            eval_batch_size=1)
                        print('  CER: %f %%' % (cer_dev_epoch * 100))

                        if cer_dev_epoch < cer_dev_best:
                            cer_dev_best = cer_dev_epoch
                            # FIX: reset the patience counter on improvement;
                            # previously it was never reset, so early stopping
                            # counted total (not consecutive) non-best epochs.
                            not_improved_epoch = 0
                            print('■■■ ↑Best Score (CER)↑ ■■■')

                            # Save model (check point)
                            checkpoint_file = join(
                                model.save_path, 'model.ckpt')
                            save_path = saver.save(
                                sess, checkpoint_file,
                                global_step=train_data.epoch)
                            print("Model saved in file: %s" % save_path)
                        else:
                            not_improved_epoch += 1

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Early stopping ('>=' rather than '==' so the loop
                        # still stops even if the counter skips the exact
                        # threshold value)
                        if not_improved_epoch >= params['not_improved_patient_epoch']:
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=cer_dev_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
def do_train(model, params):
    """Run training of the joint CTC-attention model.

    If target labels are phone, the model is evaluated by PER with 39 phones.

    Args:
        model: the model to train
        params (dict): A dictionary of parameters
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         label_type=params['label_type'],
                         batch_size=params['batch_size'],
                         eos_index=params['eos_index'],
                         max_epoch=params['num_epoch'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True)
    dev_data = Dataset(data_type='dev',
                       label_type=params['label_type'],
                       batch_size=params['batch_size'],
                       eos_index=params['eos_index'],
                       splice=params['splice'],
                       num_stack=params['num_stack'],
                       num_skip=params['num_skip'],
                       sort_utt=False)
    if 'char' in params['label_type']:
        test_data = Dataset(data_type='test',
                            label_type=params['label_type'],
                            batch_size=1,
                            eos_index=params['eos_index'],
                            splice=params['splice'],
                            num_stack=params['num_stack'],
                            num_skip=params['num_skip'],
                            sort_utt=False)
    else:
        # Phone models are always scored against the collapsed 39-phone set
        test_data = Dataset(data_type='test',
                            label_type='phone39',
                            batch_size=1,
                            eos_index=params['eos_index'],
                            splice=params['splice'],
                            num_stack=params['num_stack'],
                            num_skip=params['num_skip'],
                            sort_utt=False)
    # TODO(hirofumi): add frame_stacking and splice

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation (including model definition)
        loss_op, att_logits, ctc_logits, decoder_outputs_train, decoder_outputs_infer = model.compute_loss(
            model.inputs_pl_list[0],
            model.att_labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.att_labels_seq_len_pl_list[0],
            model.ctc_labels_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer=params['optimizer'],
                               learning_rate=learning_rate_pl)
        _, decode_op_infer = model.decoder(decoder_outputs_train,
                                           decoder_outputs_infer,
                                           decode_type='greedy',
                                           beam_width=20)
        ler_op = model.compute_ler(model.att_labels_st_true_pl,
                                   model.att_labels_st_pred_pl)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total param
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M param" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(model.save_path,
                                                   sess.graph)

            # Initialize param
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_dev_best = 1
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, att_labels_train, ctc_labels, inputs_seq_len, att_labels_seq_len, _ = data
                feed_dict_train = {
                    model.inputs_pl_list[0]: inputs,
                    model.att_labels_pl_list[0]: att_labels_train,
                    model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                    model.att_labels_seq_len_pl_list[0]: att_labels_seq_len,
                    model.ctc_labels_pl_list[0]: list2sparsetensor(
                        ctc_labels,
                        padded_value=train_data.ctc_padded_value),
                    model.keep_prob_input_pl_list[0]: params['dropout_input'],
                    model.keep_prob_hidden_pl_list[0]: params['dropout_hidden'],
                    model.keep_prob_output_pl_list[0]: params['dropout_output'],
                    learning_rate_pl: learning_rate
                }

                # Update param
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % params['print_step'] == 0:

                    # Create feed dictionary for next mini batch (dev).
                    # FIX: was `dev_data().next()`, which called the Dataset
                    # object itself; every sibling trainer uses
                    # `dev_data.next()`.
                    (inputs, att_labels_dev, ctc_labels, inputs_seq_len,
                     att_labels_seq_len, _), _ = dev_data.next()
                    feed_dict_dev = {
                        model.inputs_pl_list[0]: inputs,
                        model.att_labels_pl_list[0]: att_labels_dev,
                        model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                        model.att_labels_seq_len_pl_list[0]: att_labels_seq_len,
                        model.ctc_labels_pl_list[0]: list2sparsetensor(
                            ctc_labels,
                            padded_value=dev_data.ctc_padded_value),
                        model.keep_prob_input_pl_list[0]: 1.0,
                        model.keep_prob_hidden_pl_list[0]: 1.0,
                        model.keep_prob_output_pl_list[0]: 1.0
                    }

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    feed_dict_train[model.keep_prob_input_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_hidden_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_output_pl_list[0]] = 1.0

                    # Predict class ids & update event files
                    predicted_ids_train, summary_str_train = sess.run(
                        [decode_op_infer, summary_train],
                        feed_dict=feed_dict_train)
                    predicted_ids_dev, summary_str_dev = sess.run(
                        [decode_op_infer, summary_dev],
                        feed_dict=feed_dict_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    # Convert to sparsetensor to compute LER.
                    # FIX: the true-label key was `model.att_labels_true_st`,
                    # but ler_op above is built from
                    # `model.att_labels_st_true_pl`, so the placeholder that
                    # ler_op depends on was never fed.
                    feed_dict_ler_train = {
                        model.att_labels_st_true_pl: list2sparsetensor(
                            att_labels_train,
                            padded_value=params['eos_index']),
                        model.att_labels_st_pred_pl: list2sparsetensor(
                            predicted_ids_train,
                            padded_value=params['eos_index'])
                    }
                    feed_dict_ler_dev = {
                        model.att_labels_st_true_pl: list2sparsetensor(
                            att_labels_dev,
                            padded_value=params['eos_index']),
                        model.att_labels_st_pred_pl: list2sparsetensor(
                            predicted_ids_dev,
                            padded_value=params['eos_index'])
                    }

                    # Compute accuracy
                    ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train)
                    ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail,
                           loss_train, loss_dev, ler_train, ler_dev,
                           learning_rate, duration_step / 60))
                    # sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save fugure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        if 'char' in params['label_type']:
                            print('=== Dev Data Evaluation ===')
                            ler_dev_epoch = do_eval_cer(
                                session=sess,
                                decode_op=decode_op_infer,
                                model=model,
                                dataset=dev_data,
                                eval_batch_size=1)
                            print('  CER: %f %%' % (ler_dev_epoch * 100))

                            if ler_dev_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_epoch
                                print('■■■ ↑Best Score (CER)↑ ■■■')

                                # Save model only when best accuracy is
                                # obtained (check point)
                                checkpoint_file = join(model.save_path,
                                                       'model.ckpt')
                                save_path = saver.save(
                                    sess, checkpoint_file,
                                    global_step=train_data.epoch)
                                print("Model saved in file: %s" % save_path)

                            print('=== Test Data Evaluation ===')
                            ler_test = do_eval_cer(
                                session=sess,
                                decode_op=decode_op_infer,
                                model=model,
                                dataset=test_data,
                                eval_batch_size=1)
                            print('  CER: %f %%' % (ler_test * 100))

                        else:
                            print('=== Dev Data Evaluation ===')
                            ler_dev_epoch = do_eval_per(
                                session=sess,
                                decode_op=decode_op_infer,
                                per_op=ler_op,
                                model=model,
                                dataset=dev_data,
                                label_type=params['label_type'],
                                eval_batch_size=1)
                            print('  PER: %f %%' % (ler_dev_epoch * 100))

                            if ler_dev_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_epoch
                                print('■■■ ↑Best Score (PER)↑ ■■■')

                                # Save model only when best accuracy is
                                # obtained (check point)
                                checkpoint_file = join(model.save_path,
                                                       'model.ckpt')
                                save_path = saver.save(
                                    sess, checkpoint_file,
                                    global_step=train_data.epoch)
                                print("Model saved in file: %s" % save_path)

                            print('=== Test Data Evaluation ===')
                            ler_test = do_eval_per(
                                session=sess,
                                decode_op=decode_op_infer,
                                per_op=ler_op,
                                model=model,
                                dataset=test_data,
                                label_type=params['label_type'],
                                eval_batch_size=1)
                            print('  PER: %f %%' % (ler_test * 100))

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=ler_dev_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
def do_train(model, params):
    """Run CTC training evaluated by frame-level F-measure.

    Args:
        model: the model to train
        params (dict): A dictionary of parameters
    """
    # Load dataset
    train_data = Dataset(
        data_type='train', label_type=params['label_type'],
        batch_size=params['batch_size'], max_epoch=params['num_epoch'],
        splice=params['splice'], num_stack=params['num_stack'],
        num_skip=params['num_skip'], shuffle=True)
    dev_data = Dataset(
        data_type='dev', label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=False)
    # NOTE(review): test_data loads the 'dev' split, so the "Test Data
    # Evaluation" below re-scores dev — confirm whether data_type='test'
    # was intended.
    test_data = Dataset(
        data_type='dev', label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation (including model definition)
        loss_op, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        train_op = model.train(
            loss_op,
            optimizer=params['optimizer'],
            learning_rate=learning_rate_pl)
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=params['beam_width'])
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
        # Kept for the commented-out time F-measure evaluation further down
        posteriors_op = model.posteriors(logits, blank_prior=1)

        # Define learning rate controller.
        # lower_better=False because the monitored metric (F-measure) is
        # higher-better, unlike the CER/PER trainers.
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=False)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            fmean_dev_best = 0
            # Only referenced by the commented-out time F-measure block below
            fmean_time_dev_best = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train).
                # NOTE(review): only element [0] of each batch list is fed —
                # presumably single-device data; confirm against Dataset.
                inputs, labels, inputs_seq_len, _ = data
                feed_dict_train = {
                    model.inputs_pl_list[0]: inputs[0],
                    model.labels_pl_list[0]: list2sparsetensor(
                        labels[0], padded_value=train_data.padded_value),
                    model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                    model.keep_prob_pl_list[0]: 1 - float(params['dropout']),
                    learning_rate_pl: learning_rate
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % params['print_step'] == 0:

                    # Create feed dictionary for next mini batch (dev)
                    (inputs, labels, inputs_seq_len, _), _ = dev_data.next()
                    feed_dict_dev = {
                        model.inputs_pl_list[0]: inputs[0],
                        model.labels_pl_list[0]: list2sparsetensor(
                            labels[0], padded_value=dev_data.padded_value),
                        model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                        model.keep_prob_pl_list[0]: 1.0
                    }

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode (disable dropout)
                    feed_dict_train[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy & update event files
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail,
                           loss_train, loss_dev, ler_train, ler_dev,
                           learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save fugure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        print('=== Dev Data Evaluation ===')
                        fmean_dev_epoch, df_acc = do_eval_fmeasure(
                            session=sess,
                            decode_op=decode_op,
                            model=model,
                            dataset=dev_data,
                            eval_batch_size=params['batch_size'])
                        print(df_acc)
                        print('  F-measure: %f %%' % (fmean_dev_epoch))

                        # Higher F-measure is better (note '>', unlike the
                        # '<' comparisons in the CER/PER trainers)
                        if fmean_dev_epoch > fmean_dev_best:
                            fmean_dev_best = fmean_dev_epoch
                            print('■■■ ↑Best Score (F-measure)↑ ■■■')

                            # Save model only when best accuracy is
                            # obtained (check point)
                            checkpoint_file = join(
                                model.save_path, 'model.ckpt')
                            save_path = saver.save(
                                sess, checkpoint_file,
                                global_step=train_data.epoch)
                            print("Model saved in file: %s" % save_path)

                            print('=== Test Data Evaluation ===')
                            fmean_test_epoch, df_acc = do_eval_fmeasure(
                                session=sess,
                                decode_op=decode_op,
                                model=model,
                                dataset=test_data,
                                eval_batch_size=params['batch_size'])
                            print(df_acc)
                            print('  F-measure: %f %%' % (fmean_test_epoch))

                        # fmean_time_dev_epoch, df_acc = do_eval_fmeasure_time(
                        #     session=sess,
                        #     decode_op=decode_op,
                        #     posteriors_op=posteriors_op,
                        #     model=model,
                        #     dataset=dev_data,
                        #     eval_batch_size=params['batch_size'])
                        # print(df_acc)
                        # print('  Time F-measure: %f %%' %
                        #       (fmean_time_dev_epoch))

                        # if fmean_time_dev_best < fmean_time_dev_epoch:
                        #     fmean_time_dev_best = fmean_time_dev_epoch
                        #     print('■■■ ↑Best Score (Time F-measure)↑ ■■■')

                        # fmean_time_test_epoch, df_acc = do_eval_fmeasure_time(
                        #     session=sess,
                        #     decode_op=decode_op,
                        #     posteriors_op=posteriors_op,
                        #     model=model,
                        #     dataset=test_data,
                        #     eval_batch_size=params['batch_size'])
                        # print(df_acc)
                        # print('  Time F-measure: %f %%' %
                        #       (fmean_time_test_epoch))

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=fmean_dev_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
def check(self, encoder_type, attention_type, label_type='character'):
    """Smoke-test training of an AttentionSeq2Seq model on one synthetic batch.

    Builds the graph, trains for up to 1000 steps on data from
    ``generate_data``, and every 10 steps prints loss/LER and the decoded
    hypotheses until LER drops below 0.1.

    Args:
        encoder_type (string): type of the encoder (passed to AttentionSeq2Seq)
        attention_type (string): type of the attention mechanism
        label_type (string, optional): 'character' or 'phone'
    """
    print('==================================================')
    print(' encoder_type: %s' % encoder_type)
    print(' attention_type: %s' % attention_type)
    print(' label_type: %s' % label_type)
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 4
        inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
            label_type=label_type,
            model='attention',
            batch_size=batch_size)

        # Define model graph
        # 27 = 26 letters + space for characters; 61 = TIMIT phone set
        num_classes = 27 if label_type == 'character' else 61
        model = AttentionSeq2Seq(input_size=inputs[0].shape[1],
                                 encoder_type=encoder_type,
                                 encoder_num_units=256,
                                 encoder_num_layers=2,
                                 encoder_num_proj=None,
                                 attention_type=attention_type,
                                 attention_dim=128,
                                 decoder_type='lstm',
                                 decoder_num_units=256,
                                 decoder_num_layers=1,
                                 embedding_dim=64,
                                 num_classes=num_classes,
                                 sos_index=num_classes,
                                 eos_index=num_classes + 1,
                                 max_decode_length=100,
                                 use_peephole=True,
                                 splice=1,
                                 parameter_init=0.1,
                                 clip_grad_norm=5.0,
                                 clip_activation_encoder=50,
                                 clip_activation_decoder=50,
                                 weight_decay=1e-8,
                                 time_major=True,
                                 sharpening_factor=1.0,
                                 logits_temperature=1.0)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits, decoder_outputs_train, decoder_outputs_infer = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.labels_seq_len_pl_list[0],
            model.keep_prob_encoder_pl_list[0],
            model.keep_prob_decoder_pl_list[0],
            model.keep_prob_embedding_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer='adam',
                               learning_rate=learning_rate_pl)
        decode_op_train, decode_op_infer = model.decode(
            decoder_outputs_train, decoder_outputs_infer)
        ler_op = model.compute_ler(model.labels_st_true_pl,
                                   model.labels_st_pred_pl)

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=20,
                                   decay_rate=0.9,
                                   decay_patient_epoch=10,
                                   lower_better=True)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (0.8 = training-mode encoder keep-prob)
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: labels,
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.labels_seq_len_pl_list[0]: labels_seq_len,
            model.keep_prob_encoder_pl_list[0]: 0.8,
            model.keep_prob_decoder_pl_list[0]: 1.0,
            model.keep_prob_embedding_pl_list[0]: 1.0,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(model.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode (disable all dropout)
                    feed_dict[model.keep_prob_encoder_pl_list[0]] = 1.0
                    feed_dict[model.keep_prob_decoder_pl_list[0]] = 1.0
                    feed_dict[model.keep_prob_embedding_pl_list[0]] = 1.0

                    # Predict class ids
                    predicted_ids_train, predicted_ids_infer = sess.run(
                        [decode_op_train, decode_op_infer],
                        feed_dict=feed_dict)

                    # Compute accuracy
                    try:
                        feed_dict_ler = {
                            model.labels_st_true_pl: list2sparsetensor(
                                labels, padded_value=model.eos_index),
                            model.labels_st_pred_pl: list2sparsetensor(
                                predicted_ids_infer,
                                padded_value=model.eos_index)
                        }
                        ler_train = sess.run(
                            ler_op, feed_dict=feed_dict_ler)
                    except IndexError:
                        # No prediction yet; treat as 100% error rate
                        ler_train = 1

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                          (step + 1, loss_train, ler_train, duration_step,
                           learning_rate))
                    start_time_step = time.time()

                    # Visualize
                    if label_type == 'character':
                        print('True : %s' % idx2alpha(labels[0]))
                        print('Pred (Training) : <%s' %
                              idx2alpha(predicted_ids_train[0]))
                        print('Pred (Inference): <%s' %
                              idx2alpha(predicted_ids_infer[0]))
                    else:
                        print('True : %s' % idx2phone(labels[0]))
                        print('Pred (Training) : < %s' %
                              idx2phone(predicted_ids_train[0]))
                        print('Pred (Inference): < %s' %
                              idx2phone(predicted_ids_infer[0]))

                    if ler_train < 0.1:
                        print('Model is Converged.')
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train)
                    feed_dict[learning_rate_pl] = learning_rate
def check(self, decoder_type):
    """Test CTC decoding with a given decoder on a restored checkpoint.

    Restores the latest checkpoint from ``./`` and decodes one synthetic
    batch, comparing TF decoders against NumPy greedy/beam-search decoders.

    Args:
        decoder_type (string): 'tf_greedy', 'tf_beam_search', 'np_greedy',
            or 'np_beam_search'

    Raises:
        ValueError: if no checkpoint is found in the current directory
    """
    print('==================================================')
    print(' decoder_type: %s' % decoder_type)
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        num_stack = 2
        inputs, labels, inputs_seq_len = generate_data(
            label_type='character',
            model='ctc',
            batch_size=batch_size,
            num_stack=num_stack,
            splice=1)
        max_time = inputs.shape[1]

        # Define model graph
        model = CTC(encoder_type='blstm',
                    input_size=inputs[0].shape[-1],
                    splice=1,
                    num_stack=num_stack,
                    num_units=256,
                    num_layers=2,
                    num_classes=27,
                    lstm_impl='LSTMBlockCell',
                    parameter_init=0.1,
                    clip_grad_norm=5.0,
                    clip_activation=50,
                    num_proj=256,
                    weight_decay=1e-6)

        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        beam_width = 20 if 'beam_search' in decoder_type else 1
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=beam_width)
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
        posteriors_op = model.posteriors(logits, blank_prior=1)

        # NumPy decoders operate on posteriors outside the TF graph
        # NOTE(review): blank_index differs between the two branches
        # (num_classes vs num_classes - 1) — confirm which is intended.
        if decoder_type == 'np_greedy':
            decoder = GreedyDecoder(blank_index=model.num_classes)
        elif decoder_type == 'np_beam_search':
            decoder = BeamSearchDecoder(space_index=26,
                                        blank_index=model.num_classes - 1)

        # Make feed dict (keep_prob 1.0 = evaluation mode)
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels,
                                                       padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0
        }

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state('./')

            # If check point exists
            if ckpt:
                model_path = ckpt.model_checkpoint_path
                saver.restore(sess, model_path)
                print("Model restored: " + model_path)
            else:
                raise ValueError('There are not any checkpoints.')

            if decoder_type in ['tf_greedy', 'tf_beam_search']:
                # Decode
                labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                labels_pred = sparsetensor2list(
                    labels_pred_st, batch_size=batch_size)

                # Compute accuracy
                cer = sess.run(ler_op, feed_dict=feed_dict)
            else:
                # Compute CTC posteriors
                probs = sess.run(posteriors_op, feed_dict=feed_dict)
                probs = probs.reshape(-1, max_time, model.num_classes)

                if decoder_type == 'np_greedy':
                    # Decode
                    labels_pred = decoder(probs=probs,
                                          seq_len=inputs_seq_len)
                elif decoder_type == 'np_beam_search':
                    # Decode
                    labels_pred, scores = decoder(probs=probs,
                                                  seq_len=inputs_seq_len,
                                                  beam_width=beam_width)

                # Compute accuracy (on the first utterance only)
                cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                  str_true=idx2alpha(labels[0]),
                                  normalize=True)

            # Visualize
            print('CER: %.3f %%' % (cer * 100))
            print('Ref: %s' % idx2alpha(labels[0]))
            print('Hyp: %s' % idx2alpha(labels_pred[0]))
def check_training(self, label_type):
    """Smoke-test training of a JointCTCAttention model on one synthetic batch.

    Trains for up to 400 steps, printing loss/LER and decoded hypotheses
    every 10 steps, and stops early after 10 consecutive evaluations
    without LER improvement.

    Args:
        label_type (string): 'character' or 'phone'
    """
    print('----- ' + label_type + ' -----')
    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 1
        inputs, att_labels, inputs_seq_len, att_labels_seq_len, ctc_labels_st = generate_data(
            label_type=label_type,
            model='joint_ctc_attention',
            batch_size=batch_size)

        # Define model graph
        # +2 for the attention decoder's SOS/EOS symbols
        att_num_classes = 26 + 2 if label_type == 'character' else 61 + 2
        ctc_num_classes = 26 if label_type == 'character' else 61
        # model = load(model_type=model_type)
        network = JointCTCAttention(input_size=inputs[0].shape[1],
                                    encoder_num_unit=256,
                                    encoder_num_layer=2,
                                    attention_dim=128,
                                    attention_type='content',
                                    decoder_num_unit=256,
                                    decoder_num_layer=1,
                                    embedding_dim=20,
                                    att_num_classes=att_num_classes,
                                    ctc_num_classes=ctc_num_classes,
                                    att_task_weight=0.5,
                                    sos_index=att_num_classes - 2,
                                    eos_index=att_num_classes - 1,
                                    max_decode_length=50,
                                    attention_weights_tempareture=1.0,
                                    logits_tempareture=1.0,
                                    parameter_init=0.1,
                                    clip_grad=5.0,
                                    clip_activation_encoder=50,
                                    clip_activation_decoder=50,
                                    dropout_ratio_input=0.9,
                                    dropout_ratio_hidden=0.9,
                                    dropout_ratio_output=1.0,
                                    weight_decay=1e-8,
                                    beam_width=1,
                                    time_major=False)

        # Define placeholders
        network.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, att_logits, ctc_logits, decoder_outputs_train, decoder_outputs_infer = network.compute_loss(
            network.inputs_pl_list[0],
            network.att_labels_pl_list[0],
            network.inputs_seq_len_pl_list[0],
            network.att_labels_seq_len_pl_list[0],
            network.ctc_labels_pl_list[0],
            network.keep_prob_input_pl_list[0],
            network.keep_prob_hidden_pl_list[0],
            network.keep_prob_output_pl_list[0])
        train_op = network.train(loss_op,
                                 optimizer='adam',
                                 learning_rate=learning_rate_pl)
        decode_op_train, decode_op_infer = network.decoder(
            decoder_outputs_train, decoder_outputs_infer)
        ler_op = network.compute_ler(network.att_labels_st_true_pl,
                                     network.att_labels_st_pred_pl)

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=10,
                                   decay_rate=0.99,
                                   decay_patient_epoch=5,
                                   lower_better=True)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (training-mode keep-probs taken from the network)
        feed_dict = {
            network.inputs_pl_list[0]: inputs,
            network.att_labels_pl_list[0]: att_labels,
            network.inputs_seq_len_pl_list[0]: inputs_seq_len,
            network.att_labels_seq_len_pl_list[0]: att_labels_seq_len,
            network.ctc_labels_pl_list[0]: ctc_labels_st,
            network.keep_prob_input_pl_list[0]: network.dropout_ratio_input,
            network.keep_prob_hidden_pl_list[0]: network.dropout_ratio_hidden,
            network.keep_prob_output_pl_list[0]: network.dropout_ratio_output,
            learning_rate_pl: learning_rate
        }

        map_file_path = '../../experiments/timit/metrics/mapping_files/attention/phone61_to_num.txt'

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 400
            start_time_global = time.time()
            start_time_step = time.time()
            ler_train_pre = 1
            not_improved_count = 0
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(network.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode (disable all dropout)
                    feed_dict[network.keep_prob_input_pl_list[0]] = 1.0
                    feed_dict[network.keep_prob_hidden_pl_list[0]] = 1.0
                    feed_dict[network.keep_prob_output_pl_list[0]] = 1.0

                    # Predict class ids
                    predicted_ids_train, predicted_ids_infer = sess.run(
                        [decode_op_train, decode_op_infer],
                        feed_dict=feed_dict)

                    # Compute accuracy
                    try:
                        feed_dict_ler = {
                            network.att_labels_st_true_pl:
                                list2sparsetensor(att_labels,
                                                  padded_value=27),
                            network.att_labels_st_pred_pl:
                                list2sparsetensor(predicted_ids_infer,
                                                  padded_value=27)
                        }
                        ler_train = sess.run(ler_op,
                                             feed_dict=feed_dict_ler)
                    except ValueError:
                        # No valid prediction yet; treat as 100% error rate
                        ler_train = 1

                    duration_step = time.time() - start_time_step
                    print(
                        'Step %d: loss = %.3f / ler = %.4f (%.3f sec) / lr = %.5f' %
                        (step + 1, loss_train, ler_train, duration_step,
                         learning_rate))
                    start_time_step = time.time()

                    # Visualize
                    if label_type == 'character':
                        print('True : %s' % idx2alpha(att_labels[0]))
                        print('Pred (Training) : <%s' %
                              idx2alpha(predicted_ids_train[0]))
                        print('Pred (Inference): <%s' %
                              idx2alpha(predicted_ids_infer[0]))
                    else:
                        print('True : %s' %
                              idx2phone(att_labels[0], map_file_path))
                        print('Pred (Training) : < %s' % idx2phone(
                            predicted_ids_train[0], map_file_path))
                        print('Pred (Inference): < %s' % idx2phone(
                            predicted_ids_infer[0], map_file_path))

                    # Early stopping: count evaluations without improvement
                    if ler_train >= ler_train_pre:
                        not_improved_count += 1
                    else:
                        not_improved_count = 0
                    if not_improved_count >= 10:
                        print('Model is Converged.')
                        break
                    ler_train_pre = ler_train

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train)
                    feed_dict[learning_rate_pl] = learning_rate

            duration_global = time.time() - start_time_global
            print('Total time: %.3f sec' % (duration_global))
def do_train(model, params, gpu_indices):
    """Run multi-GPU (tower-replicated) training.

    Builds one model tower per GPU with shared variables, averages
    gradients across towers, evaluates CER on the dev set each epoch,
    checkpoints on improvement, and applies early stopping and learning
    rate decay.

    Args:
        model: the model to train
        params (dict): A dictionary of parameters
        gpu_indices (list): GPU indices
    """
    # NOTE(review): map_file_path is unbound if label_type contains neither
    # 'kanji' nor 'kana' — presumably callers guarantee one of them.
    if 'kanji' in params['label_type']:
        map_file_path = '../metrics/mapping_files/' + \
            params['label_type'] + '_' + params['train_data_size'] + '.txt'
    elif 'kana' in params['label_type']:
        map_file_path = '../metrics/mapping_files/' + \
            params['label_type'] + '.txt'

    # Load dataset
    train_data = Dataset(data_type='train',
                         train_data_size=params['train_data_size'],
                         label_type=params['label_type'],
                         map_file_path=map_file_path,
                         batch_size=params['batch_size'],
                         max_epoch=params['num_epoch'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True,
                         sort_stop_epoch=params['sort_stop_epoch'],
                         num_gpu=len(gpu_indices))
    dev_data = Dataset(data_type='dev',
                       train_data_size=params['train_data_size'],
                       label_type=params['label_type'],
                       map_file_path=map_file_path,
                       batch_size=params['batch_size'],
                       splice=params['splice'],
                       num_stack=params['num_stack'],
                       num_skip=params['num_skip'],
                       sort_utt=False,
                       num_gpu=len(gpu_indices))

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set optimizer
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = model._set_optimizer(params['optimizer'],
                                         learning_rate_pl)

        # Calculate the gradients for each model tower
        total_grads_and_vars, total_losses = [], []
        decode_ops_infer, ler_ops = [], []
        all_devices = ['/gpu:%d' % i_gpu
                       for i_gpu in range(len(gpu_indices))]
        # NOTE: /cpu:0 is prepared for evaluation
        with tf.variable_scope(tf.get_variable_scope()):
            for i_gpu in range(len(all_devices)):
                with tf.device(all_devices[i_gpu]):
                    with tf.name_scope('tower_gpu%d' % i_gpu) as scope:
                        # Define placeholders in each tower
                        model.create_placeholders()

                        # Calculate the total loss for the current tower of
                        # the model. This function constructs the entire
                        # model but shares the variables across all towers.
                        tower_loss, tower_logits, tower_decoder_outputs_train, tower_decoder_outputs_infer = model.compute_loss(
                            model.inputs_pl_list[i_gpu],
                            model.labels_pl_list[i_gpu],
                            model.inputs_seq_len_pl_list[i_gpu],
                            model.labels_seq_len_pl_list[i_gpu],
                            model.keep_prob_encoder_pl_list[i_gpu],
                            model.keep_prob_decoder_pl_list[i_gpu],
                            model.keep_prob_embedding_pl_list[i_gpu],
                            scope)
                        tower_loss = tf.expand_dims(tower_loss, axis=0)
                        total_losses.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on
                        # this tower
                        tower_grads_and_vars = optimizer.compute_gradients(
                            tower_loss)

                        # Gradient clipping
                        tower_grads_and_vars = model._clip_gradients(
                            tower_grads_and_vars)

                        # TODO: Optionally add gradient noise

                        # Keep track of the gradients across all towers
                        total_grads_and_vars.append(tower_grads_and_vars)

                        # Add to the graph each operation per tower
                        _, decode_op_tower_infer = model.decode(
                            tower_decoder_outputs_train,
                            tower_decoder_outputs_infer)
                        decode_ops_infer.append(decode_op_tower_infer)
                        # ler_op_tower = model.compute_ler(
                        #     decode_op_tower, model.labels_pl_list[i_gpu])
                        ler_op_tower = model.compute_ler(
                            model.labels_st_true_pl_list[i_gpu],
                            model.labels_st_pred_pl_list[i_gpu])
                        ler_op_tower = tf.expand_dims(ler_op_tower, axis=0)
                        ler_ops.append(ler_op_tower)

        # Aggregate losses, then calculate average loss
        total_losses = tf.concat(axis=0, values=total_losses)
        loss_op = tf.reduce_mean(total_losses, axis=0)
        ler_ops = tf.concat(axis=0, values=ler_ops)
        ler_op = tf.reduce_mean(ler_ops, axis=0)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        average_grads_and_vars = average_gradients(total_grads_and_vars)

        # Apply the gradients to adjust the shared variables.
        train_op = optimizer.apply_gradients(average_grads_and_vars,
                                             global_step=global_step)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" %
                  (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        # NOTE: Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the ops do
        # not have GPU implementations.
        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:
            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(model.save_path,
                                                   sess.graph)

            # Initialize param
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            cer_dev_best = 1
            not_improved_epoch = 0
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, labels_train, inputs_seq_len, labels_seq_len, _ = data
                feed_dict_train = {}
                for i_gpu in range(len(gpu_indices)):
                    feed_dict_train[
                        model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
                    feed_dict_train[
                        model.labels_pl_list[i_gpu]] = labels_train[i_gpu]
                    feed_dict_train[model.inputs_seq_len_pl_list[
                        i_gpu]] = inputs_seq_len[i_gpu]
                    feed_dict_train[model.labels_seq_len_pl_list[
                        i_gpu]] = labels_seq_len[i_gpu]
                    feed_dict_train[
                        model.keep_prob_encoder_pl_list[i_gpu]] = 1 - float(
                            params['dropout_encoder'])
                    feed_dict_train[
                        model.keep_prob_decoder_pl_list[i_gpu]] = 1 - float(
                            params['dropout_decoder'])
                    feed_dict_train[
                        model.keep_prob_embedding_pl_list[i_gpu]] = 1 - float(
                            params['dropout_embedding'])
                feed_dict_train[learning_rate_pl] = learning_rate

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % int(
                        params['print_step'] / len(gpu_indices)) == 0:

                    # Create feed dictionary for next mini batch (dev)
                    inputs, labels_dev, inputs_seq_len, labels_seq_len, _ = dev_data.next(
                    )[0]
                    feed_dict_dev = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_dev[
                            model.inputs_pl_list[i_gpu]] = inputs[i_gpu]
                        feed_dict_dev[
                            model.labels_pl_list[i_gpu]] = labels_dev[i_gpu]
                        feed_dict_dev[model.inputs_seq_len_pl_list[
                            i_gpu]] = inputs_seq_len[i_gpu]
                        feed_dict_dev[model.labels_seq_len_pl_list[
                            i_gpu]] = labels_seq_len[i_gpu]
                        feed_dict_dev[
                            model.keep_prob_encoder_pl_list[i_gpu]] = 1.0
                        feed_dict_dev[
                            model.keep_prob_decoder_pl_list[i_gpu]] = 1.0
                        feed_dict_dev[
                            model.keep_prob_embedding_pl_list[i_gpu]] = 1.0

                    # Compute loss
                    loss_train = sess.run(loss_op,
                                          feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode (disable all dropout)
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_train[
                            model.keep_prob_encoder_pl_list[i_gpu]] = 1.0
                        feed_dict_train[
                            model.keep_prob_decoder_pl_list[i_gpu]] = 1.0
                        feed_dict_train[
                            model.keep_prob_embedding_pl_list[i_gpu]] = 1.0

                    # Predict class ids
                    predicted_ids_train_list, summary_str_train = sess.run(
                        [decode_ops_infer, summary_train],
                        feed_dict=feed_dict_train)
                    predicted_ids_dev_list, summary_str_dev = sess.run(
                        [decode_ops_infer, summary_dev],
                        feed_dict=feed_dict_dev)

                    # Convert to sparsetensor to compute LER
                    # BUG FIX: the original assignments for the *_st_true
                    # placeholders had a trailing comma, which wrapped the
                    # SparseTensorValue in a 1-tuple and made the feed value
                    # unusable. The commas are removed here.
                    feed_dict_ler_train = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_ler_train[model.labels_st_true_pl_list[
                            i_gpu]] = list2sparsetensor(
                                labels_train[i_gpu],
                                padded_value=train_data.padded_value)
                        feed_dict_ler_train[model.labels_st_pred_pl_list[
                            i_gpu]] = list2sparsetensor(
                                predicted_ids_train_list[i_gpu],
                                padded_value=train_data.padded_value)
                    feed_dict_ler_dev = {}
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_ler_dev[model.labels_st_true_pl_list[
                            i_gpu]] = list2sparsetensor(
                                labels_dev[i_gpu],
                                padded_value=dev_data.padded_value)
                        feed_dict_ler_dev[model.labels_st_pred_pl_list[
                            i_gpu]] = list2sparsetensor(
                                predicted_ids_dev_list[i_gpu],
                                padded_value=dev_data.padded_value)

                    # Compute accuracy
                    # ler_train = sess.run(ler_op,
                    #                      feed_dict=feed_dict_ler_train)
                    # ler_dev = sess.run(ler_op,
                    #                    feed_dict=feed_dict_ler_dev)
                    ler_train = 1
                    ler_dev = 1
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    # TODO: fix this

                    # Update event files
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print(
                        "Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                        (step + 1, train_data.epoch_detail, loss_train,
                         loss_dev, ler_train, ler_dev, learning_rate,
                         duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        print('=== Dev Data Evaluation ===')
                        cer_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_ops=decode_ops_infer,
                            model=model,
                            dataset=dev_data,
                            label_type=params['label_type'],
                            train_data_size=params['train_data_size'],
                            eval_batch_size=1)
                        print(' CER: %f %%' % (cer_dev_epoch * 100))

                        if cer_dev_epoch < cer_dev_best:
                            cer_dev_best = cer_dev_epoch
                            print('■■■ ↑Best Score (CER)↑ ■■■')

                            # Save model (check point)
                            checkpoint_file = join(model.save_path,
                                                   'model.ckpt')
                            save_path = saver.save(
                                sess, checkpoint_file,
                                global_step=train_data.epoch)
                            print("Model saved in file: %s" % save_path)
                        else:
                            not_improved_epoch += 1

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Early stopping
                        if not_improved_epoch == params[
                                'not_improved_patient_epoch']:
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=cer_dev_epoch)

                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
def check(self, encoder_type, lstm_impl, time_major=False):
    """Smoke-test training of a MultitaskCTC model on one synthetic batch.

    Trains for up to 1000 steps on character (main) + phone (sub) labels,
    printing loss/CER/PER and decoded hypotheses every 10 steps, and stops
    once the character error rate drops below 0.1.

    Args:
        encoder_type (string): type of the encoder (passed to MultitaskCTC)
        lstm_impl (string): LSTM implementation name
        time_major (bool, optional): whether the model uses time-major tensors
    """
    print('==================================================')
    print(' encoder_type: %s' % str(encoder_type))
    print(' lstm_impl: %s' % str(lstm_impl))
    print(' time_major: %s' % str(time_major))
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        inputs, labels_char, labels_phone, inputs_seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)

        # Define model graph
        # 27 = 26 letters + space; 61 = TIMIT phone set
        num_classes_main = 27
        num_classes_sub = 61
        model = MultitaskCTC(
            encoder_type=encoder_type,
            input_size=inputs[0].shape[1],
            num_units=256,
            num_layers_main=2,
            num_layers_sub=1,
            num_classes_main=num_classes_main,
            num_classes_sub=num_classes_sub,
            main_task_weight=0.8,
            lstm_impl=lstm_impl,
            parameter_init=0.1,
            clip_grad_norm=5.0,
            clip_activation=50,
            num_proj=256,
            weight_decay=1e-8,
            # bottleneck_dim=50,
            bottleneck_dim=None,
            time_major=time_major)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits_main, logits_sub = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.labels_sub_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer='adam',
                               learning_rate=learning_rate_pl)
        decode_op_main, decode_op_sub = model.decoder(
            logits_main,
            logits_sub,
            model.inputs_seq_len_pl_list[0],
            beam_width=20)
        ler_op_main, ler_op_sub = model.compute_ler(
            decode_op_main, decode_op_sub,
            model.labels_pl_list[0], model.labels_sub_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=20,
                                   decay_rate=0.9,
                                   decay_patient_epoch=5,
                                   lower_better=True)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" %
                  (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (0.9 = training-mode keep-prob)
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels_char,
                                                       padded_value=-1),
            model.labels_sub_pl_list[0]: list2sparsetensor(labels_phone,
                                                           padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 0.9,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run([train_op, loss_op],
                                         feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(model.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode (disable dropout)
                    feed_dict[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_sub], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print(
                        'Step %d: loss = %.3f / cer = %.3f / per = %.3f (%.3f sec) / lr = %.5f' %
                        (step + 1, loss_train, ler_train_char,
                         ler_train_phone, duration_step, learning_rate))
                    start_time_step = time.time()

                    # Visualize
                    labels_pred_char_st, labels_pred_phone_st = sess.run(
                        [decode_op_main, decode_op_sub],
                        feed_dict=feed_dict)
                    labels_pred_char = sparsetensor2list(
                        labels_pred_char_st, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_pred_phone_st, batch_size=batch_size)

                    print('Character')
                    try:
                        print(' Ref: %s' % idx2alpha(labels_char[0]))
                        print(' Hyp: %s' % idx2alpha(labels_pred_char[0]))
                    except IndexError:
                        print('Character')
                        print(' Ref: %s' % idx2alpha(labels_char[0]))
                        print(' Hyp: %s' % '')
                    print('Phone')
                    try:
                        print(' Ref: %s' % idx2phone(labels_phone[0]))
                        print(' Hyp: %s' % idx2phone(labels_pred_phone[0]))
                    except IndexError:
                        print(' Ref: %s' % idx2phone(labels_phone[0]))
                        print(' Hyp: %s' % '')
                        # NOTE: This is for no prediction
                    print('-' * 30)

                    if ler_train_char < 0.1:
                        # BUG FIX: message was misspelled 'Modle is
                        # Converged.' — corrected to match the sibling
                        # attention test's message.
                        print('Model is Converged.')
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train_char)
                    feed_dict[learning_rate_pl] = learning_rate
def check(self, encoder_type, lstm_impl, time_major=False):
    """Smoke-test training of a MultitaskCTC model on one synthetic batch.

    Trains for up to 1000 steps on character (main) + phone (sub) labels,
    printing loss/CER/PER and decoded hypotheses every 10 steps, and stops
    once the character error rate drops below 0.1.

    Args:
        encoder_type (string): type of the encoder (passed to MultitaskCTC)
        lstm_impl (string): LSTM implementation name
        time_major (bool, optional): whether the model uses time-major tensors
    """
    print('==================================================')
    print(' encoder_type: %s' % str(encoder_type))
    print(' lstm_impl: %s' % str(lstm_impl))
    print(' time_major: %s' % str(time_major))
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        inputs, labels_char, labels_phone, inputs_seq_len = generate_data(
            label_type='multitask',
            model='ctc',
            batch_size=batch_size)

        # Define model graph
        # 27 = 26 letters + space; 61 = TIMIT phone set
        num_classes_main = 27
        num_classes_sub = 61
        model = MultitaskCTC(
            encoder_type=encoder_type,
            input_size=inputs[0].shape[1],
            num_units=256,
            num_layers_main=2,
            num_layers_sub=1,
            num_classes_main=num_classes_main,
            num_classes_sub=num_classes_sub,
            main_task_weight=0.8,
            lstm_impl=lstm_impl,
            parameter_init=0.1,
            clip_grad_norm=5.0,
            clip_activation=50,
            num_proj=256,
            weight_decay=1e-8,
            # bottleneck_dim=50,
            bottleneck_dim=None,
            time_major=time_major)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits_main, logits_sub = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.labels_sub_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        train_op = model.train(
            loss_op, optimizer='adam', learning_rate=learning_rate_pl)
        decode_op_main, decode_op_sub = model.decoder(
            logits_main,
            logits_sub,
            model.inputs_seq_len_pl_list[0],
            beam_width=20)
        ler_op_main, ler_op_sub = model.compute_ler(
            decode_op_main, decode_op_sub,
            model.labels_pl_list[0], model.labels_sub_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-3
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=20,
                                   decay_rate=0.9,
                                   decay_patient_epoch=5,
                                   lower_better=True)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" %
                  (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (0.9 = training-mode keep-prob)
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels_char,
                                                       padded_value=-1),
            model.labels_sub_pl_list[0]: list2sparsetensor(labels_phone,
                                                           padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 0.9,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):
                # Compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(model.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode (disable dropout)
                    feed_dict[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy
                    ler_train_char, ler_train_phone = sess.run(
                        [ler_op_main, ler_op_sub], feed_dict=feed_dict)

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / cer = %.3f / per = %.3f (%.3f sec) / lr = %.5f' %
                          (step + 1, loss_train, ler_train_char,
                           ler_train_phone, duration_step, learning_rate))
                    start_time_step = time.time()

                    # Visualize
                    labels_pred_char_st, labels_pred_phone_st = sess.run(
                        [decode_op_main, decode_op_sub],
                        feed_dict=feed_dict)
                    labels_pred_char = sparsetensor2list(
                        labels_pred_char_st, batch_size=batch_size)
                    labels_pred_phone = sparsetensor2list(
                        labels_pred_phone_st, batch_size=batch_size)

                    print('Character')
                    try:
                        print(' Ref: %s' % idx2alpha(labels_char[0]))
                        print(' Hyp: %s' % idx2alpha(labels_pred_char[0]))
                    except IndexError:
                        print('Character')
                        print(' Ref: %s' % idx2alpha(labels_char[0]))
                        print(' Hyp: %s' % '')
                    print('Phone')
                    try:
                        print(' Ref: %s' % idx2phone(labels_phone[0]))
                        print(' Hyp: %s' % idx2phone(labels_pred_phone[0]))
                    except IndexError:
                        print(' Ref: %s' % idx2phone(labels_phone[0]))
                        print(' Hyp: %s' % '')
                        # NOTE: This is for no prediction
                    print('-' * 30)

                    if ler_train_char < 0.1:
                        # BUG FIX: message was misspelled 'Modle is
                        # Converged.' — corrected to match the sibling
                        # attention test's message.
                        print('Model is Converged.')
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train_char)
                    feed_dict[learning_rate_pl] = learning_rate
def do_train(model, params):
    """Run training of an attention-based model on TIMIT.

    If target labels are phone, the model is evaluated by PER with 39 phones
    (the eval mapping file is switched to phone39.txt below).

    Args:
        model: the model to train; expected to expose placeholder lists
            (inputs_pl_list, labels_pl_list, ...), compute_loss/train/decode/
            compute_ler, summaries_train/summaries_dev, and save_path.
        params (dict): A dictionary of parameters (label_type, batch_size,
            num_epoch, splice, num_stack, num_skip, learning_rate and decay
            settings, dropout ratios, print_step, eval_start_epoch, ...).
    """
    # Training always uses the label set named by params['label_type'];
    # evaluation collapses to 39 phones when the label type is phone-based.
    map_file_path_train = '../metrics/mapping_files/' + \
        params['label_type'] + '.txt'
    if 'phone' in params['label_type']:
        map_file_path_eval = '../metrics/mapping_files/phone39.txt'
    else:
        map_file_path_eval = '../metrics/mapping_files/' + \
            params['label_type'] + '.txt'

    # Load dataset
    # NOTE(review): sort_utt=True sorts training utterances (presumably by
    # length) until sort_stop_epoch — confirm against Dataset implementation.
    train_data = Dataset(
        data_type='train', label_type=params['label_type'],
        batch_size=params['batch_size'], map_file_path=map_file_path_train,
        max_epoch=params['num_epoch'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=True, sort_stop_epoch=params['sort_stop_epoch'])
    dev_data = Dataset(
        data_type='dev', label_type=params['label_type'],
        batch_size=params['batch_size'], map_file_path=map_file_path_train,
        splice=params['splice'], num_stack=params['num_stack'],
        num_skip=params['num_skip'], sort_utt=False)
    if 'char' in params['label_type']:
        test_data = Dataset(
            data_type='test', label_type=params['label_type'],
            batch_size=1, map_file_path=map_file_path_eval,
            splice=params['splice'], num_stack=params['num_stack'],
            num_skip=params['num_skip'], sort_utt=False)
    else:
        # Phone-based training is always tested against the 39-phone set.
        test_data = Dataset(
            data_type='test', label_type='phone39',
            batch_size=1, map_file_path=map_file_path_eval,
            splice=params['splice'], num_stack=params['num_stack'],
            num_skip=params['num_skip'], sort_utt=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation (including model definition)
        loss_op, logits, decoder_outputs_train, decoder_outputs_infer = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.labels_seq_len_pl_list[0],
            model.keep_prob_encoder_pl_list[0],
            model.keep_prob_decoder_pl_list[0],
            model.keep_prob_embedding_pl_list[0])
        train_op = model.train(loss_op,
                               optimizer=params['optimizer'],
                               learning_rate=learning_rate_pl)
        _, decode_op_infer = model.decode(
            decoder_outputs_train,
            decoder_outputs_infer)
        # LER is computed from sparse-tensor placeholders fed explicitly below
        # (not from the decode op directly).
        ler_op = model.compute_ler(model.labels_st_true_pl,
                                   model.labels_st_pred_pl)

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=params['decay_patient_epoch'],
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M param" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_dev_best = 1
            learning_rate = float(params['learning_rate'])
            for step, (data, is_new_epoch) in enumerate(train_data):

                # Create feed dictionary for next mini batch (train)
                inputs, labels_train, inputs_seq_len, labels_seq_len, _ = data
                # NOTE(review): only element [0] of each batch list is fed —
                # presumably one device/GPU split; confirm against Dataset.
                feed_dict_train = {
                    model.inputs_pl_list[0]: inputs[0],
                    model.labels_pl_list[0]: labels_train[0],
                    model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                    model.labels_seq_len_pl_list[0]: labels_seq_len[0],
                    model.keep_prob_encoder_pl_list[0]: 1 - float(params['dropout_encoder']),
                    model.keep_prob_decoder_pl_list[0]: 1 - float(params['dropout_decoder']),
                    model.keep_prob_embedding_pl_list[0]: 1 - float(params['dropout_embedding']),
                    learning_rate_pl: learning_rate
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % params['print_step'] == 0:

                    # Create feed dictionary for next mini batch (dev)
                    (inputs, labels_dev, inputs_seq_len,
                     labels_seq_len, _), _ = dev_data.next()
                    feed_dict_dev = {
                        model.inputs_pl_list[0]: inputs[0],
                        model.labels_pl_list[0]: labels_dev[0],
                        model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                        model.labels_seq_len_pl_list[0]: labels_seq_len[0],
                        model.keep_prob_encoder_pl_list[0]: 1.0,
                        model.keep_prob_decoder_pl_list[0]: 1.0,
                        model.keep_prob_embedding_pl_list[0]: 1.0
                    }

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode (disable dropout on the
                    # training feed for the diagnostic decode below)
                    feed_dict_train[model.keep_prob_encoder_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_decoder_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_embedding_pl_list[0]] = 1.0

                    # Predict class ids & update event files
                    predicted_ids_train, summary_str_train = sess.run(
                        [decode_op_infer, summary_train],
                        feed_dict=feed_dict_train)
                    predicted_ids_dev, summary_str_dev = sess.run(
                        [decode_op_infer, summary_dev],
                        feed_dict=feed_dict_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    # Convert to sparsetensor to compute LER
                    feed_dict_ler_train = {
                        model.labels_st_true_pl: list2sparsetensor(
                            labels_train[0],
                            padded_value=train_data.padded_value),
                        model.labels_st_pred_pl: list2sparsetensor(
                            predicted_ids_train,
                            padded_value=train_data.padded_value)
                    }
                    feed_dict_ler_dev = {
                        model.labels_st_true_pl: list2sparsetensor(
                            labels_dev[0],
                            padded_value=dev_data.padded_value),
                        model.labels_st_pred_pl: list2sparsetensor(
                            predicted_ids_dev,
                            padded_value=dev_data.padded_value)
                    }

                    # Compute accuracy
                    ler_train = sess.run(ler_op, feed_dict=feed_dict_ler_train)
                    ler_dev = sess.run(ler_op, feed_dict=feed_dict_ler_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)

                    duration_step = time.time() - start_time_step
                    print("Step %d (epoch: %.3f): loss = %.3f (%.3f) / ler = %.3f (%.3f) / lr = %.5f (%.3f min)" %
                          (step + 1, train_data.epoch_detail,
                           loss_train, loss_dev, ler_train, ler_dev,
                           learning_rate, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if is_new_epoch:
                    duration_epoch = time.time() - start_time_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (train_data.epoch, duration_epoch / 60))

                    # Save figure of loss & ler
                    plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                              save_path=model.save_path)
                    plot_ler(csv_ler_train, csv_ler_dev, csv_steps,
                             label_type=params['label_type'],
                             save_path=model.save_path)

                    if train_data.epoch >= params['eval_start_epoch']:
                        start_time_eval = time.time()
                        if 'char' in params['label_type']:
                            print('=== Dev Data Evaluation ===')
                            ler_dev_epoch, wer_dev_epoch = do_eval_cer(
                                session=sess,
                                decode_op=decode_op_infer,
                                model=model,
                                dataset=dev_data,
                                label_type=params['label_type'],
                                eval_batch_size=1)
                            print(' CER: %f %%' % (ler_dev_epoch * 100))
                            print(' WER: %f %%' % (wer_dev_epoch * 100))

                            if ler_dev_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_epoch
                                print('■■■ ↑Best Score (CER)↑ ■■■')

                                # Save model only when best accuracy is
                                # obtained (check point)
                                checkpoint_file = join(
                                    model.save_path, 'model.ckpt')
                                save_path = saver.save(
                                    sess, checkpoint_file,
                                    global_step=train_data.epoch)
                                print("Model saved in file: %s" % save_path)

                                # Test set is evaluated only on a new dev best
                                print('=== Test Data Evaluation ===')
                                ler_test, wer_test = do_eval_cer(
                                    session=sess,
                                    decode_op=decode_op_infer,
                                    model=model,
                                    dataset=test_data,
                                    label_type=params['label_type'],
                                    is_test=True,
                                    eval_batch_size=1)
                                print(' CER: %f %%' % (ler_test * 100))
                                print(' WER: %f %%' % (wer_test * 100))
                        else:
                            print('=== Dev Data Evaluation ===')
                            ler_dev_epoch = do_eval_per(
                                session=sess,
                                decode_op=decode_op_infer,
                                per_op=ler_op,
                                model=model,
                                dataset=dev_data,
                                label_type=params['label_type'],
                                eval_batch_size=1)
                            print(' PER: %f %%' % (ler_dev_epoch * 100))

                            if ler_dev_epoch < ler_dev_best:
                                ler_dev_best = ler_dev_epoch
                                print('■■■ ↑Best Score (PER)↑ ■■■')

                                # Save model only when best accuracy is
                                # obtained (check point)
                                checkpoint_file = join(
                                    model.save_path, 'model.ckpt')
                                save_path = saver.save(
                                    sess, checkpoint_file,
                                    global_step=train_data.epoch)
                                print("Model saved in file: %s" % save_path)

                                print('=== Test Data Evaluation ===')
                                ler_test = do_eval_per(
                                    session=sess,
                                    decode_op=decode_op_infer,
                                    per_op=ler_op,
                                    model=model,
                                    dataset=test_data,
                                    label_type=params['label_type'],
                                    is_test=True,
                                    eval_batch_size=1)
                                print(' PER: %f %%' % (ler_test * 100))

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Update learning rate (decayed by dev-set score)
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=train_data.epoch,
                            value=ler_dev_epoch)

                    start_time_step = time.time()
                    start_time_epoch = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')
def check(self, decoder_type):
    """Restore a trained CTC model and decode with the given decoder.

    Args:
        decoder_type (string): one of 'tf_greedy', 'tf_beam_search'
            (TensorFlow decode ops) or 'np_greedy', 'np_beam_search'
            (NumPy decoders applied to CTC posteriors).

    Raises:
        ValueError: if no checkpoint is found in the current directory.
    """
    print('==================================================')
    print(' decoder_type: %s' % decoder_type)
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data
        batch_size = 2
        num_stack = 2
        # NOTE(review): num_stack= is passed here, but the generate_data
        # visible earlier in this file takes no such parameter — confirm
        # which generate_data this module actually imports.
        inputs, labels, inputs_seq_len = generate_data(
            label_type='character',
            model='ctc',
            batch_size=batch_size,
            num_stack=num_stack,
            splice=1)
        max_time = inputs.shape[1]

        # Define model graph (must match the architecture of the checkpoint
        # being restored below)
        model = CTC(encoder_type='blstm',
                    input_size=inputs[0].shape[-1],
                    splice=1,
                    num_stack=num_stack,
                    num_units=256,
                    num_layers=2,
                    num_classes=27,
                    lstm_impl='LSTMBlockCell',
                    parameter_init=0.1,
                    clip_grad_norm=5.0,
                    clip_activation=50,
                    num_proj=256,
                    weight_decay=1e-6)

        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation
        _, logits = model.compute_loss(model.inputs_pl_list[0],
                                       model.labels_pl_list[0],
                                       model.inputs_seq_len_pl_list[0],
                                       model.keep_prob_pl_list[0])
        beam_width = 20 if 'beam_search' in decoder_type else 1
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=beam_width)
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
        posteriors_op = model.posteriors(logits, blank_prior=1)

        # NumPy decoders operate on posteriors outside the graph.
        # NOTE(review): greedy uses blank_index=model.num_classes while beam
        # search uses model.num_classes - 1 — confirm which convention each
        # decoder class expects.
        if decoder_type == 'np_greedy':
            decoder = GreedyDecoder(blank_index=model.num_classes)
        elif decoder_type == 'np_beam_search':
            decoder = BeamSearchDecoder(space_index=26,
                                        blank_index=model.num_classes - 1)

        # Make feed dict (keep_prob 1.0 == evaluation mode)
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels,
                                                       padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0
        }

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state('./')

            # If check point exists
            if ckpt:
                model_path = ckpt.model_checkpoint_path
                saver.restore(sess, model_path)
                print("Model restored: " + model_path)
            else:
                raise ValueError('There are not any checkpoints.')

            if decoder_type in ['tf_greedy', 'tf_beam_search']:
                # Decode
                labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                labels_pred = sparsetensor2list(labels_pred_st,
                                                batch_size=batch_size)

                # Compute accuracy
                cer = sess.run(ler_op, feed_dict=feed_dict)
            else:
                # Compute CTC posteriors, then decode in NumPy
                probs = sess.run(posteriors_op, feed_dict=feed_dict)
                probs = probs.reshape(-1, max_time, model.num_classes)

                if decoder_type == 'np_greedy':
                    # Decode
                    labels_pred = decoder(probs=probs,
                                          seq_len=inputs_seq_len)

                elif decoder_type == 'np_beam_search':
                    # Decode
                    labels_pred, scores = decoder(probs=probs,
                                                  seq_len=inputs_seq_len,
                                                  beam_width=beam_width)

                # Compute accuracy (only the first utterance is scored)
                cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                  str_true=idx2alpha(labels[0]),
                                  normalize=True)

            # Visualize
            print('CER: %.3f %%' % (cer * 100))
            print('Ref: %s' % idx2alpha(labels[0]))
            print('Hyp: %s' % idx2alpha(labels_pred[0]))
def check(self, encoder_type, label_type='character',
          lstm_impl=None, time_major=True, save_params=False):
    """Build a CTC model with the given encoder and overfit a tiny batch.

    Trains on a fixed two-utterance batch until the label error rate drops
    below 0.1 (sanity check that the encoder/optimizer wiring can learn),
    optionally saving a checkpoint on convergence.

    Args:
        encoder_type (string): encoder architecture to test (e.g. 'blstm',
            'vgg_blstm', 'cnn_zhang', ...).
        label_type (string): 'character' or a phone label set.
        lstm_impl (string, optional): LSTM implementation (e.g.
            'LSTMBlockCell', 'CudnnLSTM').
        time_major (bool): whether the model graph uses time-major tensors.
        save_params (bool): if True, save a checkpoint when converged.
    """
    print('==================================================')
    print(' encoder_type: %s' % encoder_type)
    print(' label_type: %s' % label_type)
    print(' lstm_impl: %s' % lstm_impl)
    print(' time_major: %s' % str(time_major))
    print(' save_params: %s' % str(save_params))
    print('==================================================')

    tf.reset_default_graph()
    with tf.Graph().as_default():
        # Load batch data. CNN/VGG-style encoders consume spliced frames.
        batch_size = 2
        splice = 11 if encoder_type in ['vgg_blstm', 'vgg_lstm', 'cnn_zhang',
                                        'vgg_wang', 'resnet_wang', 'cldnn_wang'] else 1
        num_stack = 2
        # NOTE(review): num_stack= is passed here, but the generate_data
        # visible earlier in this file takes no such parameter — confirm
        # which generate_data this module actually imports.
        inputs, labels, inputs_seq_len = generate_data(
            label_type=label_type,
            model='ctc',
            batch_size=batch_size,
            num_stack=num_stack,
            splice=splice)
        # NOTE: input_size must be even number when using CudnnLSTM

        # Define model graph
        num_classes = 27 if label_type == 'character' else 61
        model = CTC(encoder_type=encoder_type,
                    input_size=inputs[0].shape[-1] // splice // num_stack,
                    splice=splice,
                    num_stack=num_stack,
                    num_units=256,
                    num_layers=2,
                    num_classes=num_classes,
                    lstm_impl=lstm_impl,
                    parameter_init=0.1,
                    clip_grad_norm=5.0,
                    clip_activation=50,
                    num_proj=256,
                    weight_decay=1e-10,
                    # bottleneck_dim=50,
                    bottleneck_dim=None,
                    time_major=time_major)

        # Define placeholders
        model.create_placeholders()
        learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

        # Add to the graph each operation
        loss_op, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        # NOTE(review): 'nestrov' looks like a misspelling of 'nesterov' but
        # may be the key the project's optimizer table actually uses — left
        # unchanged; confirm against model.train's optimizer mapping.
        train_op = model.train(loss_op,
                               optimizer='nestrov',
                               learning_rate=learning_rate_pl)
        # NOTE: Adam does not run on CudnnLSTM
        decode_op = model.decoder(logits,
                                  model.inputs_seq_len_pl_list[0],
                                  beam_width=20)
        ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

        # Define learning rate controller
        learning_rate = 1e-4
        lr_controller = Controller(learning_rate_init=learning_rate,
                                   decay_start_epoch=50,
                                   decay_rate=0.9,
                                   decay_patient_epoch=10,
                                   lower_better=True)

        if save_params:
            # Create a saver for writing training checkpoints
            saver = tf.train.Saver(max_to_keep=None)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Count total parameters (CudnnLSTM variables are not visible here)
        if lstm_impl != 'CudnnLSTM':
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()),
                   "{:,}".format(total_parameters / 1000000)))

        # Make feed dict (reused for every training step on the fixed batch)
        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.labels_pl_list[0]: list2sparsetensor(labels,
                                                       padded_value=-1),
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0,
            learning_rate_pl: learning_rate
        }

        idx2phone = Idx2phone(map_file_path='./phone61.txt')

        with tf.Session() as sess:
            # Initialize parameters
            sess.run(init_op)

            # Wrapper for tfdbg
            # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Train model
            max_steps = 1000
            start_time_step = time.time()
            for step in range(max_steps):

                # for debug
                # encoder_outputs = sess.run(
                #     model.encoder_outputs, feed_dict)
                # print(encoder_outputs.shape)

                # Compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict)

                # Gradient check
                # grads = sess.run(model.clipped_grads,
                #                  feed_dict=feed_dict)
                # for grad in grads:
                #     print(np.max(grad))

                if (step + 1) % 10 == 0:
                    # Change to evaluation mode
                    feed_dict[model.keep_prob_pl_list[0]] = 1.0

                    # Compute accuracy
                    ler_train = sess.run(ler_op, feed_dict=feed_dict)
                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                          (step + 1, loss_train, ler_train,
                           duration_step, learning_rate))
                    start_time_step = time.time()

                    # Decode
                    labels_pred_st = sess.run(
                        decode_op, feed_dict=feed_dict)

                    # Visualize
                    try:
                        labels_pred = sparsetensor2list(
                            labels_pred_st, batch_size=batch_size)
                        if label_type == 'character':
                            print('Ref: %s' % idx2alpha(labels[0]))
                            print('Hyp: %s' % idx2alpha(labels_pred[0]))
                        else:
                            print('Ref: %s' % idx2phone(labels[0]))
                            print('Hyp: %s' % idx2phone(labels_pred[0]))
                    except IndexError:
                        if label_type == 'character':
                            print('Ref: %s' % idx2alpha(labels[0]))
                            print('Hyp: %s' % '')
                        else:
                            print('Ref: %s' % idx2phone(labels[0]))
                            print('Hyp: %s' % '')
                        # NOTE: This is for no prediction

                    if ler_train < 0.1:
                        # FIX: corrected message typo ('Modle is Converged.')
                        print('Model is converged.')
                        if save_params:
                            # Save model (check point)
                            checkpoint_file = './model.ckpt'
                            save_path = saver.save(
                                sess, checkpoint_file, global_step=2)
                            print("Model saved in file: %s" % save_path)
                        break

                    # Update learning rate
                    learning_rate = lr_controller.decay_lr(
                        learning_rate=learning_rate,
                        epoch=step,
                        value=ler_train)
                    feed_dict[learning_rate_pl] = learning_rate
def do_train(model, params):
    """Run training of a multitask CTC model (main + sub label sets).

    Trains with a generator of mini-batches, logs loss/LER for both tasks
    every 200 steps, checkpoints every epoch, and (from epoch 5) evaluates
    CER on the dev set for both tasks, decaying the learning rate on the
    main-task dev CER.

    Args:
        model: model to train; expected to expose placeholder lists,
            compute_loss/train/decoder/compute_ler, dropout ratios,
            summaries, and save_path.
        params: A dictionary of parameters (label_type_main, label_type_sub,
            train_data_size, batch_size, num_stack, num_skip, num_epoch,
            optimizer, learning_rate and decay settings, ...).
    """
    # Load dataset
    train_data = Dataset(data_type='train',
                         label_type_main=params['label_type_main'],
                         label_type_sub=params['label_type_sub'],
                         train_data_size=params['train_data_size'],
                         batch_size=params['batch_size'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True)
    # Two dev datasets: one consumed stepwise for logging, one re-iterated
    # per epoch for full evaluation.
    dev_data_step = Dataset(data_type='dev',
                            label_type_main=params['label_type_main'],
                            label_type_sub=params['label_type_sub'],
                            train_data_size=params['train_data_size'],
                            batch_size=params['batch_size'],
                            num_stack=params['num_stack'],
                            num_skip=params['num_skip'],
                            sort_utt=False)
    dev_data_epoch = Dataset(data_type='dev',
                             label_type_main=params['label_type_main'],
                             label_type_sub=params['label_type_sub'],
                             train_data_size=params['train_data_size'],
                             batch_size=params['batch_size'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             sort_utt=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        model.create_placeholders(gpu_index=0)

        # Add to the graph each operation
        loss_op, logits_main, logits_sub = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.labels_sub_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0])
        train_op = model.train(
            loss_op,
            optimizer=params['optimizer'],
            learning_rate=model.learning_rate_pl_list[0])
        decode_op_main, decode_op_sub = model.decoder(
            logits_main,
            logits_sub,
            model.inputs_seq_len_pl_list[0],
            decode_type='beam_search',
            beam_width=20)
        ler_op_main, ler_op_sub = model.compute_ler(
            decode_op_main, decode_op_sub,
            model.labels_pl_list[0], model.labels_sub_pl_list[0])

        # Define learning rate controller
        lr_controller = Controller(
            learning_rate_init=params['learning_rate'],
            decay_start_epoch=params['decay_start_epoch'],
            decay_rate=params['decay_rate'],
            decay_patient_epoch=1,
            lower_better=True)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(model.summaries_train)
        summary_dev = tf.summary.merge(model.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_main_train, csv_ler_main_dev = [], []
        csv_ler_sub_train, csv_ler_sub_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                model.save_path, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Make mini-batch generator
            mini_batch_train = train_data.next_batch()
            mini_batch_dev = dev_data_step.next_batch()

            # Train model. Round iterations-per-epoch up when the data size
            # is not a multiple of the batch size.
            iter_per_epoch = int(train_data.data_num / params['batch_size'])
            train_step = train_data.data_num / params['batch_size']
            if (train_step) != int(train_step):
                iter_per_epoch += 1
            max_steps = iter_per_epoch * params['num_epoch']
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            ler_main_dev_best = 1
            learning_rate = float(params['learning_rate'])
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                inputs, labels_main, labels_sub, inputs_seq_len, _ = mini_batch_train.__next__()
                feed_dict_train = {
                    model.inputs_pl_list[0]: inputs,
                    model.labels_pl_list[0]: list2sparsetensor(
                        labels_main, padded_value=-1),
                    model.labels_sub_pl_list[0]: list2sparsetensor(
                        labels_sub, padded_value=-1),
                    model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                    model.keep_prob_input_pl_list[0]: model.dropout_ratio_input,
                    model.keep_prob_hidden_pl_list[0]: model.dropout_ratio_hidden,
                    model.keep_prob_output_pl_list[0]: model.dropout_ratio_output,
                    model.learning_rate_pl_list[0]: learning_rate
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % 200 == 0:

                    # Create feed dictionary for next mini batch (dev)
                    inputs, labels_main, labels_sub, inputs_seq_len, _ = mini_batch_dev.__next__()
                    feed_dict_dev = {
                        model.inputs_pl_list[0]: inputs,
                        model.labels_pl_list[0]: list2sparsetensor(
                            labels_main, padded_value=-1),
                        model.labels_sub_pl_list[0]: list2sparsetensor(
                            labels_sub, padded_value=-1),
                        model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                        model.keep_prob_input_pl_list[0]: 1.0,
                        model.keep_prob_hidden_pl_list[0]: 1.0,
                        model.keep_prob_output_pl_list[0]: 1.0
                    }

                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    feed_dict_train[model.keep_prob_input_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_hidden_pl_list[0]] = 1.0
                    feed_dict_train[model.keep_prob_output_pl_list[0]] = 1.0

                    # Compute accuracy & update event file
                    ler_main_train, ler_sub_train, summary_str_train = sess.run(
                        [ler_op_main, ler_op_sub, summary_train],
                        feed_dict=feed_dict_train)
                    ler_main_dev, ler_sub_dev, summary_str_dev = sess.run(
                        [ler_op_main, ler_op_sub, summary_dev],
                        feed_dict=feed_dict_dev)
                    csv_ler_main_train.append(ler_main_train)
                    csv_ler_main_dev.append(ler_main_dev)
                    csv_ler_sub_train.append(ler_sub_train)
                    csv_ler_sub_dev.append(ler_sub_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f (%.3f) / ler_main = %.4f (%.4f) / ler_sub = %.4f (%.4f) (%.3f min)' %
                          (step + 1, loss_train, loss_dev, ler_main_train,
                           ler_main_dev, ler_sub_train, ler_sub_dev,
                           duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(model.save_path, 'model.ckpt')
                    save_path = saver.save(
                        sess, checkpoint_file, global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    if epoch >= 5:
                        start_time_eval = time.time()
                        print('=== Dev Evaluation ===')
                        ler_main_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op_main,
                            model=model,
                            dataset=dev_data_epoch,
                            label_type=params['label_type_main'],
                            eval_batch_size=params['batch_size'],
                            is_multitask=True,
                            is_main=True)
                        print(' CER (main): %f %%' %
                              (ler_main_dev_epoch * 100))
                        ler_sub_dev_epoch = do_eval_cer(
                            session=sess,
                            decode_op=decode_op_sub,
                            model=model,
                            dataset=dev_data_epoch,
                            label_type=params['label_type_sub'],
                            eval_batch_size=params['batch_size'],
                            is_multitask=True,
                            is_main=False)
                        print(' CER (sub): %f %%' %
                              (ler_sub_dev_epoch * 100))

                        if ler_main_dev_epoch < ler_main_dev_best:
                            ler_main_dev_best = ler_main_dev_epoch
                            print('■■■ ↑Best Score (CER main)↑ ■■■')

                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))

                        # Update learning rate (decayed by main-task dev CER)
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=epoch,
                            value=ler_main_dev_epoch)

                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss, ler
            save_loss(csv_steps, csv_loss_train, csv_loss_dev,
                      save_path=model.save_path)
            # FIX: the main-task curve was saved with csv_ler_sub_dev as its
            # dev series (csv_ler_main_dev was collected but never saved).
            save_ler(csv_steps, csv_ler_main_train, csv_ler_main_dev,
                     save_path=model.save_path)
            save_ler(csv_steps, csv_ler_sub_train, csv_ler_sub_dev,
                     save_path=model.save_path)

            # Training was finished correctly
            with open(join(model.save_path, 'complete.txt'), 'w') as f:
                f.write('')