def main(_):
  # With SeqIO summaries, each task logs to its own subdirectory of
  # summary_dir; otherwise a single summary directory is parsed.
  if FLAGS.seqio_summaries:
    subdirs = tf.io.gfile.listdir(FLAGS.summary_dir)
    summary_dirs = [os.path.join(FLAGS.summary_dir, d) for d in subdirs]
  else:
    summary_dirs = [FLAGS.summary_dir]

  scores = None
  for d in summary_dirs:
    events = eval_utils.parse_events_files(d, FLAGS.seqio_summaries)
    if FLAGS.perplexity_eval:
      task_metrics = events
    else:
      task_metrics = eval_utils.get_eval_metric_values(
          events,
          task_name=os.path.basename(d) if FLAGS.seqio_summaries else None)
    # Merge this task's metrics into the running collection.
    if scores:
      scores.update(task_metrics)
    else:
      scores = task_metrics

  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return

  df = eval_utils.scores_to_df(scores)
  df = eval_utils.compute_avg_glue(df)
  df = eval_utils.sort_columns(df)
  eval_utils.log_csv(df, output_file=FLAGS.out_file)
def main(_):
  events = eval_utils.parse_events_files(FLAGS.summary_dir)
  scores = eval_utils.get_eval_metric_values(events)
  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return
  scores = eval_utils.compute_avg_glue(scores)
  eval_utils.log_csv(scores, output_file=FLAGS.out_file)
def main(_):
  events = eval_utils.parse_events_files(FLAGS.summary_dir)
  if FLAGS.perplexity_eval:
    scores = events
  else:
    scores = eval_utils.get_eval_metric_values(events)
  if not scores:
    logging.info("No evaluation events found in %s", FLAGS.summary_dir)
    return
  df = eval_utils.scores_to_df(scores)
  df = eval_utils.compute_avg_glue(df)
  df = eval_utils.sort_columns(df)
  eval_utils.log_csv(df, output_file=FLAGS.out_file)
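# The main() variants above rely on absl flags named summary_dir, out_file,
# perplexity_eval, and seqio_summaries, plus an eval_utils module. A minimal
# sketch of that surrounding scaffolding follows; the defaults, help strings,
# and the eval_utils import path are assumptions for illustration, not the
# repository's actual definitions.
import os

from absl import app
from absl import flags
from absl import logging
import tensorflow.compat.v1 as tf

from t5.evaluation import eval_utils  # assumed import path

FLAGS = flags.FLAGS

flags.DEFINE_string("summary_dir", None,
                    "Directory containing TensorBoard summary files.")
flags.DEFINE_string("out_file", None,
                    "Output path for the results CSV.")
flags.DEFINE_bool("perplexity_eval", False,
                  "Whether the summaries hold perplexity values rather than "
                  "eval metrics.")
flags.DEFINE_bool("seqio_summaries", False,
                  "Whether summaries were written by SeqIO, one subdirectory "
                  "per task.")

if __name__ == "__main__":
  app.run(main)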
def test_log_csv(self):
  with self.assertRaises(ValueError):
    eval_utils.log_csv({"foo_task/unknown_metric": [(10, 30.)]})

  metric_keys = list(eval_utils.METRIC_NAMES.keys())
  metric_names = list(eval_utils.METRIC_NAMES.values())
  scores = {
      metric_keys[0]: [(20, 1.), (30, 2.)],
      metric_keys[1]: [(10, 3.)],
      metric_keys[2]: [(10, 4.)],
  }
  output_file = os.path.join(self.create_tempdir().full_path, "results.csv")
  eval_utils.log_csv(scores, output_file=output_file)
  with tf.gfile.Open(output_file) as f:
    output = f.read()
  # The CSV ends with a `max` row holding the best value of each metric and a
  # final `step` row giving the step at which that best value occurred.
  expected = """step,{},{},{}
10,,3.000,4.000
20,1.000,,
30,2.000,,
max,2.000,3.000,4.000
step,30,10,10""".format(*[m.name for m in metric_names[:3]])
  self.assertEqual(output, expected)
def test_log_csv(self):
  metric_names = list(eval_utils.METRIC_NAMES.values())
  df = pd.DataFrame(
      collections.OrderedDict([
          (metric_names[0].name, [np.nan, 1., 2.]),
          (metric_names[1].name, [3., np.nan, np.nan]),
          (metric_names[2].name, [4., np.nan, np.nan]),
      ]),
      index=[10, 20, 30],
  )
  df.index.name = "step"
  output_file = os.path.join(self.create_tempdir().full_path, "results.csv")
  eval_utils.log_csv(df, output_file=output_file)
  with tf.io.gfile.GFile(output_file) as f:
    output = f.read()
  expected = """step,{},{},{}
10,,3.000,4.000
20,1.000,,
30,2.000,,
max,2.000,3.000,4.000
step,30,10,10""".format(*[m.name for m in metric_names[:3]])
  self.assertEqual(output, expected)
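# The test_log_csv variants above assume an absltest test case plus the
# imports their bodies use (collections, numpy, pandas, os, TensorFlow's
# gfile) and eval_utils.METRIC_NAMES, which the tests only require to be an
# ordered mapping whose values expose a .name attribute. A minimal sketch of
# that harness follows; the class name and the eval_utils import path are
# assumptions for illustration.
import collections
import os

from absl.testing import absltest
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf

from t5.evaluation import eval_utils  # assumed import path


class EvalUtilsTest(absltest.TestCase):
  # The test_log_csv methods above would live in a class like this one;
  # self.create_tempdir() is provided by absltest.TestCase.
  pass


if __name__ == "__main__":
  absltest.main()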