def eval_single_checkpoint(ckpt_name, output_path, model, datasets,
                           embedding_matrices):
    """Runs quantitative evaluation on a single checkpoint.

    Evaluates `model` with `utils.do_evaluation`, emits a `tf.summary` scalar
    for every metric whose name is in `METRICS_TO_SAVE`, and writes those
    metrics (together with the checkpoint name, epoch and step) as a one-row
    CSV file at `output_path`. Returns immediately if `output_path` already
    exists.

    Args:
      ckpt_name: Checkpoint name; must contain `ep<digits>_step<digits>`.
      output_path: Path of the CSV results file to create.
      model: Model to evaluate; must provide `create_metrics()`.
      datasets: Forwarded unchanged to `utils.do_evaluation`.
      embedding_matrices: Forwarded unchanged to `utils.do_evaluation`.

    Raises:
      ValueError: If the epoch and step cannot be parsed from `ckpt_name`.
    """
    if gfile.exists(output_path):
        logging.info('Skipping already exists: "%s"', output_path)
        return

    # The checkpoint name is assumed to contain the epoch and step in the
    # form "ep<epoch>_step<step>". Parse it before evaluating so that a
    # malformed name fails fast instead of after the expensive evaluation.
    path_search = re.search(r'ep(\d+)_step(\d+)', ckpt_name)
    if path_search is None:
        raise ValueError(
            'Could not parse epoch and step from checkpoint name "%s"; '
            'expected it to contain "ep<epoch>_step<step>".' % ckpt_name)
    epoch = int(path_search.group(1))
    step = int(path_search.group(2))

    metrics = model.create_metrics()
    logging.info('Evaluating: "%s"', ckpt_name)
    utils.do_evaluation(model, metrics, datasets, embedding_matrices)

    to_write = collections.OrderedDict()
    to_write['checkpoint'] = ckpt_name
    to_write['epoch'] = epoch
    to_write['step'] = step

    for metric in metrics.values():
        if metric.name in METRICS_TO_SAVE:
            # Compute the result once and reuse it for both outputs.
            result = metric.result()
            tf.summary.scalar(metric.name, result, step=step)
            to_write[metric.name] = result.numpy()
        metric.reset_states()

    # Save the results to a text file (CSV with a header and a single row).
    with gfile.GFile(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=list(to_write))
        writer.writeheader()
        writer.writerow(to_write)
def train(save_dir, num_epochs=300, learning_rate=0.0001,
          save_every_n_epochs=25):
    """Train pipeline for next sentence embedding prediction on ROCStories.

    Builds the model from the datasets returned by `prepare_datasets()`,
    trains it with Adam for `num_epochs` epochs, evaluates after every epoch,
    and finally writes one last evaluation to `<save_dir>/final_eval.tsv`.

    Args:
      save_dir: Directory receiving the `summaries` sub-directory, the
        checkpoints and the `final_eval.tsv` results file.
      num_epochs: Number of passes over `datasets['train']`.
      learning_rate: Learning rate given to the Adam optimizer.
      save_every_n_epochs: A checkpoint named `ep<epoch>_step<step>.ckpt` is
        saved whenever the epoch number is a multiple of this value.
    """
    #### LOAD DATA ####
    datasets, embedding_matrices = prepare_datasets()

    #### CREATE MODEL AND OPTIMIZER ####
    num_input_sentences = tf.compat.v1.data.get_output_shapes(
        datasets['train'])[0][1]
    model = models.build_model(
        num_input_sentences=num_input_sentences,
        embedding_matrix=embedding_matrices['train'])
    metrics = model.create_metrics()
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    num_train_steps = 0

    #### DO TRAINING ####
    summary_writer = tf.summary.create_file_writer(
        os.path.join(save_dir, 'summaries'))
    with summary_writer.as_default():
        logging.info('Starting training.')
        for epoch in range(1, num_epochs + 1):
            for x, labels in datasets['train']:
                utils.train_step(model, optimizer, x, labels, metrics)
                num_train_steps += 1

            start_time = time.time()
            utils.do_evaluation(model, metrics, datasets, embedding_matrices)
            logging.info('Eval took %f seconds.', (time.time() - start_time))

            to_log = ['%s=%f, ' % (m.name, m.result())
                      for m in metrics.values()]
            logging.info('Epoch %d, %s ', epoch, ''.join(to_log))

            # Add each metric to the TensorBoard and then reset it for the
            # next epoch.
            for metric in metrics.values():
                tf.summary.scalar(
                    metric.name, metric.result(), step=optimizer.iterations)
                metric.reset_states()

            # lr = cur_learning_rate(optimizer)
            # tf.summary.scalar('learning_rate', lr, step=optimizer.iterations)

            if epoch % save_every_n_epochs == 0:
                prefix = os.path.join(
                    save_dir,
                    'ep%04d_step%05d.ckpt' % (epoch, num_train_steps))
                logging.info('Saving checkpoint: %s', prefix)
                checkpoint.save(file_prefix=prefix)

    #### SAVE FINAL EVAL RESULTS TO FILE ####
    # Start from clean metric state so the final numbers reflect only this
    # last evaluation pass.
    for metric in metrics.values():
        metric.reset_states()
    utils.do_evaluation(model, metrics, datasets, embedding_matrices)

    to_save = {}
    for metric in metrics.values():
        to_save['metric_' + metric.name] = metric.result().numpy()

    results_file_path = os.path.join(save_dir, 'final_eval.tsv')
    with gfile.GFile(results_file_path, 'w') as f:
        # `dict.iteritems()` does not exist on Python 3; `items()` works on
        # both Python 2 and 3.
        for name, value in to_save.items():
            f.write('%s\t%s\n' % (name, str(value)))