def do_train(network, optimizer, learning_rate, batch_size, epoch_num,
             label_type, num_stack, num_skip):
    """Run training.

    Builds the model graph, then alternates parameter updates with periodic
    logging, checkpointing and per-epoch evaluation.

    Args:
        network: network to train
        optimizer: string, the name of optimizer. ex.) adam, rmsprop
        learning_rate: initial learning rate
        batch_size: size of mini batch
        epoch_num: epoch num to train
        label_type: phone39 or phone48 or phone61 or character
        num_stack: int, the number of frames to stack
        num_skip: int, the number of frames to skip
    """
    # Load dataset.
    # Training data is length-sorted (is_sorted=True); dev/test are not.
    train_data = DataSet(data_type='train', label_type=label_type,
                         num_stack=num_stack, num_skip=num_skip,
                         is_sorted=True)
    if label_type == 'character':
        dev_data = DataSet(data_type='dev', label_type='character',
                           num_stack=num_stack, num_skip=num_skip,
                           is_sorted=False)
        test_data = DataSet(data_type='test', label_type='character',
                            num_stack=num_stack, num_skip=num_skip,
                            is_sorted=False)
    else:
        # Phone models are evaluated with the reduced 39-phone set,
        # regardless of the training label set (48/61 phones).
        dev_data = DataSet(data_type='dev', label_type='phone39',
                           num_stack=num_stack, num_skip=num_skip,
                           is_sorted=False)
        test_data = DataSet(data_type='test', label_type='phone39',
                            num_stack=num_stack, num_skip=num_skip,
                            is_sorted=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define model
        network.define()
        # NOTE: define model under tf.Graph()

        # Add to the graph each operation
        loss_op = network.loss()
        train_op = network.train(optimizer=optimizer,
                                 learning_rate_init=learning_rate,
                                 is_scheduled=False)
        decode_op = network.decoder(decode_type='beam_search',
                                    beam_width=20)
        per_op = network.ler(decode_op)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        # (max_to_keep=None keeps every epoch's checkpoint)
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Per-step loss histories, written out by save_loss() at the end
        csv_steps = []
        csv_train_loss = []
        csv_dev_loss = []
        # Create a session for running operation on the graph
        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(
                network.model_dir, sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            iter_per_epoch = int(train_data.data_num / batch_size)
            # Count a trailing fractional batch as one more step
            if (train_data.data_num / batch_size) != int(train_data.data_num / batch_size):
                iter_per_epoch += 1
            max_steps = iter_per_epoch * epoch_num
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            # Best dev-set error seen so far (1 == 100%, i.e. worst case)
            error_best = 1
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                inputs, labels, seq_len, _ = train_data.next_batch(
                    batch_size=batch_size)
                indices, values, dense_shape = list2sparsetensor(labels)
                feed_dict_train = {
                    network.inputs_pl: inputs,
                    network.label_indices_pl: indices,
                    network.label_values_pl: values,
                    network.label_shape_pl: dense_shape,
                    network.seq_len_pl: seq_len,
                    network.keep_prob_input_pl: network.dropout_ratio_input,
                    network.keep_prob_hidden_pl: network.dropout_ratio_hidden,
                    network.lr_pl: learning_rate
                }

                # Create feed dictionary for next mini batch (dev)
                inputs, labels, seq_len, _ = dev_data.next_batch(
                    batch_size=batch_size)
                indices, values, dense_shape = list2sparsetensor(labels)
                feed_dict_dev = {
                    network.inputs_pl: inputs,
                    network.label_indices_pl: indices,
                    network.label_values_pl: values,
                    network.label_shape_pl: dense_shape,
                    network.seq_len_pl: seq_len,
                    network.keep_prob_input_pl: network.dropout_ratio_input,
                    network.keep_prob_hidden_pl: network.dropout_ratio_hidden
                }

                # Update parameters & compute loss
                _, loss_train = sess.run(
                    [train_op, loss_op], feed_dict=feed_dict_train)
                loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                csv_steps.append(step)
                csv_train_loss.append(loss_train)
                csv_dev_loss.append(loss_dev)

                if (step + 1) % 10 == 0:
                    # Change feed dict for evaluation
                    # (keep_prob = 1.0 disables dropout)
                    feed_dict_train[network.keep_prob_input_pl] = 1.0
                    feed_dict_train[network.keep_prob_hidden_pl] = 1.0
                    feed_dict_dev[network.keep_prob_input_pl] = 1.0
                    feed_dict_dev[network.keep_prob_hidden_pl] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [per_op, summary_train], feed_dict=feed_dict_train)
                    # NOTE: labels_st (decoded hypotheses) is fetched but not
                    # used here
                    ler_dev, summary_str_dev, labels_st = sess.run(
                        [per_op, summary_dev, decode_op],
                        feed_dict=feed_dict_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print('Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)' %
                          (step + 1, loss_train, loss_dev,
                           ler_train, ler_dev, duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(network.model_dir, 'model.ckpt')
                    save_path = saver.save(
                        sess, checkpoint_file, global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    # Skip the (slow) full evaluation during early epochs
                    if epoch >= 10:
                        start_time_eval = time.time()
                        if label_type == 'character':
                            print('■Dev Data Evaluation:■')
                            error_epoch = do_eval_cer(session=sess,
                                                      decode_op=decode_op,
                                                      network=network,
                                                      dataset=dev_data,
                                                      eval_batch_size=1)
                            # Evaluate on test data only when dev improved
                            if error_epoch < error_best:
                                error_best = error_epoch
                                print('■■■ ↑Best Score (CER)↑ ■■■')
                                print('■Test Data Evaluation:■')
                                do_eval_cer(session=sess,
                                            decode_op=decode_op,
                                            network=network,
                                            dataset=test_data,
                                            eval_batch_size=1)
                        else:
                            print('■Dev Data Evaluation:■')
                            error_epoch = do_eval_per(session=sess,
                                                      decode_op=decode_op,
                                                      per_op=per_op,
                                                      network=network,
                                                      dataset=dev_data,
                                                      label_type=label_type,
                                                      eval_batch_size=1)
                            # Evaluate on test data only when dev improved
                            if error_epoch < error_best:
                                error_best = error_epoch
                                print('■■■ ↑Best Score (PER)↑ ■■■')
                                print('■Test Data Evaluation:■')
                                do_eval_per(session=sess,
                                            decode_op=decode_op,
                                            per_op=per_op,
                                            network=network,
                                            dataset=test_data,
                                            label_type=label_type,
                                            eval_batch_size=1)
                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))
                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss
            save_loss(csv_steps, csv_train_loss, csv_dev_loss,
                      save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
def do_train(network, optimizer, learning_rate, batch_size, epoch_num,
             label_type, num_stack, num_skip, gpu_indices):
    """Run multi-GPU training with one model tower per GPU.

    If target labels are phone, the model is evaluated by PER with 39 phones.
    Each tower computes loss/gradients on its own mini-batch shard; gradients
    are averaged on the CPU and applied to the shared variables.

    Args:
        network: network to train
        optimizer: string, the name of optimizer. ex.) adam, rmsprop
            NOTE(review): this argument is currently ignored — RMSProp is
            hard-coded below. Kept for interface compatibility.
        learning_rate: A float value, the initial learning rate
        batch_size: int, the size of mini-batch (per GPU)
        epoch_num: int, the number of epochs to train
        label_type: string, phone39 or phone48 or phone61 or character
        num_stack: int, the number of frames to stack
        num_skip: int, the number of frames to skip
        gpu_indices: list of int, the GPU indices to build towers on
    """
    # Load dataset (train/dev are sharded across GPUs; test stays on one
    # device because evaluation runs single-stream)
    train_data = DataSet(data_type='train', label_type=label_type,
                         batch_size=batch_size, num_stack=num_stack,
                         num_skip=num_skip, is_sorted=True,
                         num_gpu=len(gpu_indices))
    dev_data = DataSet(data_type='dev', label_type=label_type,
                       batch_size=batch_size, num_stack=num_stack,
                       num_skip=num_skip, is_sorted=False,
                       num_gpu=len(gpu_indices))
    if label_type == 'character':
        # TODO: decide how to handle this at evaluation time
        test_data = DataSet(data_type='test', label_type='character',
                            batch_size=batch_size, num_stack=num_stack,
                            num_skip=num_skip, is_sorted=False, num_gpu=1)
    else:
        # Phone models are evaluated with the reduced 39-phone set
        test_data = DataSet(data_type='test', label_type='phone39',
                            batch_size=batch_size, num_stack=num_stack,
                            num_skip=num_skip, is_sorted=False, num_gpu=1)

    # Tell TensorFlow that the model will be built into the default graph.
    # Variables and gradient averaging live on the CPU; towers on the GPUs.
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to track the global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # NOTE(review): the `optimizer` string parameter is ignored here;
        # RMSProp is hard-coded (shadowing the parameter is intentional).
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

        # Per-tower gradients and per-tower placeholders
        tower_grads = []
        network.inputs = []
        network.labels = []
        network.inputs_seq_len = []
        network.keep_prob_input = []
        network.keep_prob_hidden = []

        # TODO: also prepare a CPU tower (for evaluation)
        all_devices = ['/gpu:%d' % i_gpu
                       for i_gpu in range(len(gpu_indices))]

        total_loss = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i_device in range(len(all_devices)):
                with tf.device(all_devices[i_device]):
                    with tf.name_scope('%s_%d' % ('tower', i_device)) as scope:
                        # Define placeholders in each tower
                        network.inputs.append(
                            tf.placeholder(
                                tf.float32,
                                shape=[None, None, network.input_size],
                                name='input' + str(i_device)))
                        indices_pl = tf.placeholder(
                            tf.int64, name='indices%d' % i_device)
                        values_pl = tf.placeholder(
                            tf.int32, name='values%d' % i_device)
                        shape_pl = tf.placeholder(
                            tf.int64, name='shape%d' % i_device)
                        network.labels.append(
                            tf.SparseTensor(indices_pl, values_pl, shape_pl))
                        network.inputs_seq_len.append(
                            tf.placeholder(tf.int64, shape=[None],
                                           name='inputs_seq_len%d' % i_device))
                        network.keep_prob_input.append(
                            tf.placeholder(tf.float32,
                                           name='keep_prob_input%d' % i_device))
                        network.keep_prob_hidden.append(
                            tf.placeholder(tf.float32,
                                           name='keep_prob_hidden%d' % i_device))

                        # Calculate the loss for one tower of the model. This
                        # function constructs the entire model but shares the
                        # variables across all towers
                        loss, logits = network.compute_loss(
                            network.inputs[i_device],
                            network.labels[i_device],
                            network.inputs_seq_len[i_device],
                            network.keep_prob_input[i_device],
                            network.keep_prob_hidden[i_device])

                        # Assemble all of the losses for the current tower
                        # only
                        losses = tf.get_collection('losses', scope)

                        # Calculate the total loss for the current tower
                        tower_loss = tf.add_n(losses, name='tower_loss')
                        total_loss.append(tower_loss)

                        # Reuse variables for the next tower
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on
                        # this tower
                        grads = optimizer.compute_gradients(tower_loss)
                        # TODO: gradient clipping

                        # Keep track of the gradients across all towers
                        tower_grads.append(grads)

        # Aggregate losses, then calculate average loss
        loss_op = tf.add_n(total_loss) / len(gpu_indices)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers
        grads = average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables
        train_op = optimizer.apply_gradients(grads, global_step=global_step)

        # Add to the graph each operation (use the last tower's placeholders)
        decode_op = network.decoder(logits,
                                    network.inputs_seq_len[-1],
                                    decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.compute_ler(decode_op, network.labels[-1])

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Per-step histories, written out by save_loss()/save_ler() at the end
        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []

        # Create a session for running operation on the graph.
        # NOTE: allow_soft_placement must be set to True to build towers on
        # GPU, as some of the ops do not have GPU implementations.
        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Make generator
            mini_batch_train = train_data.next_batch(session=sess)
            mini_batch_dev = dev_data.next_batch(session=sess)

            # Train model
            iter_per_epoch = int(train_data.data_num /
                                 (batch_size * len(gpu_indices)))
            # FIX: the fractional-epoch check must use the same divisor as
            # iter_per_epoch (batch_size * num_gpu), otherwise a phantom
            # extra step is added whenever data_num % batch_size != 0
            train_step = train_data.data_num / (batch_size * len(gpu_indices))
            if train_step != int(train_step):
                iter_per_epoch += 1
            max_steps = iter_per_epoch * epoch_num
            # FIX: guard against int(10 / num_gpu) == 0 (ZeroDivisionError in
            # the modulo below when more than 10 GPUs are used)
            log_interval = max(1, int(10 / len(gpu_indices)))
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            error_best = 1
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train);
                # each tower gets its own shard
                inputs, labels_st, inputs_seq_len, _ = \
                    mini_batch_train.__next__()
                feed_dict_train, feed_dict_dev = {}, {}
                for i_gpu in range(len(gpu_indices)):
                    feed_dict_train[network.inputs[i_gpu]] = inputs[i_gpu]
                    feed_dict_train[network.labels[i_gpu]] = labels_st[i_gpu]
                    feed_dict_train[
                        network.inputs_seq_len[i_gpu]] = inputs_seq_len[i_gpu]
                    feed_dict_train[network.keep_prob_input[
                        i_gpu]] = network.dropout_ratio_input
                    feed_dict_train[network.keep_prob_hidden[
                        i_gpu]] = network.dropout_ratio_hidden

                # Create feed dictionary for next mini batch (dev)
                inputs, labels_st, inputs_seq_len, _ = \
                    mini_batch_dev.__next__()
                for i_gpu in range(len(gpu_indices)):
                    feed_dict_dev[network.inputs[i_gpu]] = inputs[i_gpu]
                    feed_dict_dev[network.labels[i_gpu]] = labels_st[i_gpu]
                    feed_dict_dev[
                        network.inputs_seq_len[i_gpu]] = inputs_seq_len[i_gpu]
                    feed_dict_dev[network.keep_prob_input[
                        i_gpu]] = network.dropout_ratio_input
                    feed_dict_dev[network.keep_prob_hidden[
                        i_gpu]] = network.dropout_ratio_hidden

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % log_interval == 0:
                    # Compute loss
                    # (FIX: removed leftover debug `print(loss_op)` that
                    # printed the tensor object every logging step)
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    # loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    # csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode (keep_prob = 1.0 disables
                    # dropout)
                    for i_gpu in range(len(gpu_indices)):
                        feed_dict_train[network.keep_prob_input[i_gpu]] = 1.0
                        feed_dict_train[network.keep_prob_hidden[i_gpu]] = 1.0
                        feed_dict_dev[network.keep_prob_input[i_gpu]] = 1.0
                        feed_dict_dev[network.keep_prob_hidden[i_gpu]] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    # ler_dev, summary_str_dev = sess.run(
                    #     [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    # csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    # summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    # NOTE: dev loss/LER logging is disabled above, so the
                    # dev columns print the placeholder value 1
                    print(
                        "Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)" %
                        (step + 1, loss_train, 1, ler_train, 1,
                         duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # TODO: checkpoint saving is temporarily disabled here;
                    # restore saver.save(sess, join(network.model_dir,
                    # 'model.ckpt'), global_step=epoch) when re-enabling

                    if epoch >= 10:
                        start_time_eval = time.time()
                        # TODO: re-enable per-epoch CER/PER evaluation on the
                        # dev/test sets (see the single-GPU trainer for the
                        # do_eval_cer / do_eval_per call pattern)
                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))
                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss
            save_loss(csv_steps, csv_loss_train, csv_loss_dev,
                      save_path=network.model_dir)
            save_ler(csv_steps, csv_ler_train, csv_ler_dev,
                     save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')
def do_train(network, optimizer, learning_rate, batch_size, epoch_num,
             label_type, num_stack, num_skip):
    """Run training. If target labels are phone, the model is evaluated by
    PER with 39 phones.

    Args:
        network: network to train
        optimizer: string, the name of optimizer. ex.) adam, rmsprop
        learning_rate: A float value, the initial learning rate
        batch_size: int, the size of mini-batch
        epoch_num: int, the number of epochs to train
        label_type: string, phone39 or phone48 or phone61 or character
        num_stack: int, the number of frames to stack
        num_skip: int, the number of frames to skip
    """
    # Load dataset.
    # Training data is length-sorted (is_sorted=True); dev/test are not.
    train_data = DataSet(data_type='train', label_type=label_type,
                         batch_size=batch_size, num_stack=num_stack,
                         num_skip=num_skip, is_sorted=True)
    dev_data = DataSet(data_type='dev', label_type=label_type,
                       batch_size=batch_size, num_stack=num_stack,
                       num_skip=num_skip, is_sorted=False)
    if label_type == 'character':
        # Test evaluation runs one utterance at a time (batch_size=1)
        test_data = DataSet(data_type='test', label_type='character',
                            batch_size=1, num_stack=num_stack,
                            num_skip=num_skip, is_sorted=False)
    else:
        # Phone models are evaluated with the reduced 39-phone set
        test_data = DataSet(data_type='test', label_type='phone39',
                            batch_size=1, num_stack=num_stack,
                            num_skip=num_skip, is_sorted=False)

    # Tell TensorFlow that the model will be built into the default graph
    with tf.Graph().as_default():

        # Define placeholders
        network.inputs = tf.placeholder(tf.float32,
                                        shape=[None, None,
                                               network.input_size],
                                        name='input')
        indices_pl = tf.placeholder(tf.int64, name='indices')
        values_pl = tf.placeholder(tf.int32, name='values')
        shape_pl = tf.placeholder(tf.int64, name='shape')
        # CTC labels are fed as a SparseTensor built from three placeholders
        network.labels = tf.SparseTensor(indices_pl, values_pl, shape_pl)
        network.inputs_seq_len = tf.placeholder(tf.int64,
                                                shape=[None],
                                                name='inputs_seq_len')
        network.keep_prob_input = tf.placeholder(tf.float32,
                                                 name='keep_prob_input')
        network.keep_prob_hidden = tf.placeholder(tf.float32,
                                                  name='keep_prob_hidden')

        # Add to the graph each operation (including model definition)
        loss_op, logits = network.compute_loss(network.inputs,
                                               network.labels,
                                               network.inputs_seq_len,
                                               network.keep_prob_input,
                                               network.keep_prob_hidden)
        train_op = network.train(loss_op,
                                 optimizer=optimizer,
                                 learning_rate_init=float(learning_rate),
                                 is_scheduled=False)
        decode_op = network.decoder(logits,
                                    network.inputs_seq_len,
                                    decode_type='beam_search',
                                    beam_width=20)
        ler_op = network.compute_ler(decode_op, network.labels)

        # Build the summary tensor based on the TensorFlow collection of
        # summaries
        summary_train = tf.summary.merge(network.summaries_train)
        summary_dev = tf.summary.merge(network.summaries_dev)

        # Add the variable initializer operation
        init_op = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        # (max_to_keep=None keeps every epoch's checkpoint)
        saver = tf.train.Saver(max_to_keep=None)

        # Count total parameters
        parameters_dict, total_parameters = count_total_parameters(
            tf.trainable_variables())
        for parameter_name in sorted(parameters_dict.keys()):
            print("%s %d" % (parameter_name, parameters_dict[parameter_name]))
        print("Total %d variables, %s M parameters" %
              (len(parameters_dict.keys()),
               "{:,}".format(total_parameters / 1000000)))

        # Make mini-batch generator
        mini_batch_train = train_data.next_batch()
        mini_batch_dev = dev_data.next_batch()

        # Per-step loss/LER histories, written out at the end
        csv_steps, csv_loss_train, csv_loss_dev = [], [], []
        csv_ler_train, csv_ler_dev = [], []
        # Create a session for running operation on the graph
        with tf.Session() as sess:

            # Instantiate a SummaryWriter to output summaries and the graph
            summary_writer = tf.summary.FileWriter(network.model_dir,
                                                   sess.graph)

            # Initialize parameters
            sess.run(init_op)

            # Train model
            iter_per_epoch = int(train_data.data_num / batch_size)
            train_step = train_data.data_num / batch_size
            # Count a trailing fractional batch as one more step
            if train_step != int(train_step):
                iter_per_epoch += 1
            max_steps = iter_per_epoch * epoch_num
            start_time_train = time.time()
            start_time_epoch = time.time()
            start_time_step = time.time()
            # Best dev-set error seen so far (1 == 100%, i.e. worst case)
            error_best = 1
            for step in range(max_steps):

                # Create feed dictionary for next mini batch (train)
                inputs, labels_st, inputs_seq_len, _ = \
                    mini_batch_train.__next__()
                feed_dict_train = {
                    network.inputs: inputs,
                    network.labels: labels_st,
                    network.inputs_seq_len: inputs_seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden,
                    network.lr: learning_rate
                }

                # Create feed dictionary for next mini batch (dev)
                inputs, labels_st, inputs_seq_len, _ = \
                    mini_batch_dev.__next__()
                feed_dict_dev = {
                    network.inputs: inputs,
                    network.labels: labels_st,
                    network.inputs_seq_len: inputs_seq_len,
                    network.keep_prob_input: network.dropout_ratio_input,
                    network.keep_prob_hidden: network.dropout_ratio_hidden
                }

                # Update parameters
                sess.run(train_op, feed_dict=feed_dict_train)

                if (step + 1) % 10 == 0:
                    # Compute loss
                    loss_train = sess.run(loss_op, feed_dict=feed_dict_train)
                    loss_dev = sess.run(loss_op, feed_dict=feed_dict_dev)
                    csv_steps.append(step)
                    csv_loss_train.append(loss_train)
                    csv_loss_dev.append(loss_dev)

                    # Change to evaluation mode
                    # (keep_prob = 1.0 disables dropout)
                    feed_dict_train[network.keep_prob_input] = 1.0
                    feed_dict_train[network.keep_prob_hidden] = 1.0
                    feed_dict_dev[network.keep_prob_input] = 1.0
                    feed_dict_dev[network.keep_prob_hidden] = 1.0

                    # Compute accuracy & update event file
                    ler_train, summary_str_train = sess.run(
                        [ler_op, summary_train], feed_dict=feed_dict_train)
                    ler_dev, summary_str_dev = sess.run(
                        [ler_op, summary_dev], feed_dict=feed_dict_dev)
                    csv_ler_train.append(ler_train)
                    csv_ler_dev.append(ler_dev)
                    summary_writer.add_summary(summary_str_train, step + 1)
                    summary_writer.add_summary(summary_str_dev, step + 1)
                    summary_writer.flush()

                    duration_step = time.time() - start_time_step
                    print(
                        "Step %d: loss = %.3f (%.3f) / ler = %.4f (%.4f) (%.3f min)" %
                        (step + 1, loss_train, loss_dev, ler_train, ler_dev,
                         duration_step / 60))
                    sys.stdout.flush()
                    start_time_step = time.time()

                # Save checkpoint and evaluate model per epoch
                if (step + 1) % iter_per_epoch == 0 or (step + 1) == max_steps:
                    duration_epoch = time.time() - start_time_epoch
                    epoch = (step + 1) // iter_per_epoch
                    print('-----EPOCH:%d (%.3f min)-----' %
                          (epoch, duration_epoch / 60))

                    # Save model (check point)
                    checkpoint_file = join(network.model_dir, 'model.ckpt')
                    save_path = saver.save(sess, checkpoint_file,
                                           global_step=epoch)
                    print("Model saved in file: %s" % save_path)

                    # Skip the (slow) full evaluation during early epochs
                    if epoch >= 10:
                        start_time_eval = time.time()
                        if label_type == 'character':
                            print('=== Dev Data Evaluation ===')
                            cer_dev_epoch = do_eval_cer(session=sess,
                                                        decode_op=decode_op,
                                                        network=network,
                                                        dataset=dev_data)
                            print(' CER: %f %%' % (cer_dev_epoch * 100))
                            # Evaluate on test data only when dev improved
                            if cer_dev_epoch < error_best:
                                error_best = cer_dev_epoch
                                print('■■■ ↑Best Score (CER)↑ ■■■')
                                print('=== Test Data Evaluation ===')
                                cer_test = do_eval_cer(session=sess,
                                                       decode_op=decode_op,
                                                       network=network,
                                                       dataset=test_data,
                                                       eval_batch_size=1)
                                print(' CER: %f %%' % (cer_test * 100))
                        else:
                            print('=== Dev Data Evaluation ===')
                            per_dev_epoch = do_eval_per(
                                session=sess,
                                decode_op=decode_op,
                                per_op=ler_op,
                                network=network,
                                dataset=dev_data,
                                train_label_type=label_type)
                            print(' PER: %f %%' % (per_dev_epoch * 100))
                            # Evaluate on test data only when dev improved
                            if per_dev_epoch < error_best:
                                error_best = per_dev_epoch
                                print('■■■ ↑Best Score (PER)↑ ■■■')
                                print('=== Test Data Evaluation ===')
                                per_test = do_eval_per(
                                    session=sess,
                                    decode_op=decode_op,
                                    per_op=ler_op,
                                    network=network,
                                    dataset=test_data,
                                    train_label_type=label_type,
                                    eval_batch_size=1)
                                print(' PER: %f %%' % (per_test * 100))
                        duration_eval = time.time() - start_time_eval
                        print('Evaluation time: %.3f min' %
                              (duration_eval / 60))
                    start_time_epoch = time.time()
                    start_time_step = time.time()

            duration_train = time.time() - start_time_train
            print('Total time: %.3f hour' % (duration_train / 3600))

            # Save train & dev loss, ler
            save_loss(csv_steps, csv_loss_train, csv_loss_dev,
                      save_path=network.model_dir)
            save_ler(csv_steps, csv_ler_train, csv_ler_dev,
                     save_path=network.model_dir)

            # Training was finished correctly
            with open(join(network.model_dir, 'complete.txt'), 'w') as f:
                f.write('')