def test_parent(self):
  for pred, score in zip(TEST_PREDS, TEST_SCORES):
    print("prediction = %s" % " ".join(pred))
    _, _, parent_score, _ = table_text_eval.parent(
        [pred], [[TEST_REF]], [TEST_TABLE], lambda_weight=None)
    print(parent_score)
    self.assertAlmostEqual(score, parent_score, places=3)
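# A hedged companion sketch (not part of the original test): it spells out the
# nesting that `table_text_eval.parent` expects -- a list of predictions, a
# list of reference *lists*, and a list of tables -- and checks only properties
# implied by the call above. The names given to the first two return values and
# this method name are assumptions.
def test_parent_single_prediction(self):
  pred = TEST_PREDS[0]
  precision, recall, f_score, per_example = table_text_eval.parent(
      [pred], [[TEST_REF]], [TEST_TABLE], lambda_weight=None)
  # One prediction in, one per-example score out.
  self.assertEqual(len(per_example), 1)
  # F-scores are bounded in [0, 1].
  self.assertGreaterEqual(f_score, 0.0)
  self.assertLessEqual(f_score, 1.0)
  del precision, recall  # Unused here; kept only to show the return arity.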
def main(_):
  keys_to_exclude = ["reference"]
  input_json = FLAGS.data_file

  # Read raw data.
  with tf.gfile.Open(input_json, "r") as f:
    eval_data = json.load(f)
  uniq_keys = set([k[:-5] for k in eval_data[0] if k.endswith("-pred")])
  uniq_keys.add("reference")
  uniq_keys = list(uniq_keys)

  if FLAGS.entailment_fn == "cooccurrence":
    assert FLAGS.cooccurrence_counts is not None
    logging.info("Reading %s...", FLAGS.cooccurrence_counts)
    with tf.gfile.Open(FLAGS.cooccurrence_counts) as f:
      cooccur_counts = json.load(f)
    entail_method = table_text_eval.cooccur_probability_fn(cooccur_counts)
  else:
    entail_method = table_text_eval.overlap_probability

  # Compute PARENT scores for each system.
  # pylint: disable=g-complex-comprehension
  logging.info("Computing scores for each system.")
  all_parent_scores = {k: [] for k in uniq_keys}
  for key in uniq_keys:
    if key in keys_to_exclude:
      continue
    sentences = [_text(item[key + "-pred"]) for item in eval_data]
    references = [[_text(reference) for reference in item["references"]]
                  for item in eval_data]
    tables = [_table(item["table"]) for item in eval_data]
    logging.info("System %s", key)
    _, _, _, parent_scores = table_text_eval.parent(
        sentences, references, tables,
        lambda_weight=None, entailment_fn=entail_method)
    all_parent_scores[key] = parent_scores
  logging.info("Done.")

  # Bootstrap sampling.
  metrics = ["grammar", "fluency", "semantics", "parent"]
  human_metrics = ["grammar", "fluency", "semantics"]
  metric_to_scores = {m: {k: [] for k in uniq_keys} for m in metrics}
  metric_to_correlations = {m: {m_: [] for m_ in metrics} for m in metrics}
  for m in metrics:
    metric_to_correlations[m]["average"] = []

  for _ in tqdm(range(FLAGS.num_bootstrap)):

    # Get the bootstrap sample based on the eval_subset.
    all_keys = range(len(eval_data))
    bootstrap_sample = [
        random.choice(all_keys) for _ in range(len(eval_data))
    ]

    # Collect the human scores available for each system.
    key_to_grammar = {k: [] for k in uniq_keys}
    key_to_fluency = {k: [] for k in uniq_keys}
    key_to_semantics = {k: [] for k in uniq_keys}
    for ii in bootstrap_sample:
      for k in uniq_keys:
        if k in keys_to_exclude:
          continue
        key_to_grammar[k].append(float(eval_data[ii][k + "-grammar"]))
        key_to_fluency[k].append(float(eval_data[ii][k + "-fluency"]))
        key_to_semantics[k].append(float(eval_data[ii][k + "-semantics"]))

    key_to_parent = {
        k: [all_parent_scores[k][n] for n in bootstrap_sample]
        for k in uniq_keys if k not in keys_to_exclude
    }

    # Compute average scores.
    for k in uniq_keys:
      if k in keys_to_exclude:
        continue
      metric_to_scores["grammar"][k].append(
          sum(key_to_grammar[k]) / len(key_to_grammar[k]))
      metric_to_scores["fluency"][k].append(
          sum(key_to_fluency[k]) / len(key_to_fluency[k]))
      metric_to_scores["semantics"][k].append(
          sum(key_to_semantics[k]) / len(key_to_semantics[k]))
      # PARENT.
      metric_to_scores["parent"][k].append(np.mean(key_to_parent[k]))

    # Correlations.
    for m1 in metrics:
      scores_1 = [
          metric_to_scores[m1][k][-1]
          for k in uniq_keys if k not in keys_to_exclude
      ]
      for m2 in metrics:
        scores_2 = [
            metric_to_scores[m2][k][-1]
            for k in uniq_keys if k not in keys_to_exclude
        ]
        metric_to_correlations[m1][m2].append(
            pearsonr(scores_1, scores_2)[0])
      # Average over the three human metrics.
      metric_to_correlations[m1]["average"].append(
          sum([metric_to_correlations[m1][m2][-1]
               for m2 in human_metrics]) / 3)

  # Mean and 95% CI for each model on each metric.
  all_models = [k for k in uniq_keys if k not in keys_to_exclude]
  print("Model," + ",".join(metrics))
  for model in all_models:
    means = []
    for metric in metrics:
      scores = sorted(metric_to_scores[metric][model])
      means.append(np.mean(scores))
    print(model + "," +
          ",".join("%.3f" % means[ii] for ii in range(len(means))))

  # Average correlation and 95% CI for each metric's correlation.
  human_metrics += ["average"]
  print("Correlations," + ",".join(human_metrics))
  for metric in metric_to_correlations:
    corrs = []
    for hm in human_metrics:
      scores = sorted(metric_to_correlations[metric][hm])
      mean = np.mean(scores)
      corrs.append(mean)
    print(metric + "," + ",".join("%.3f" % mean for mean in corrs))

  # Save correlations to JSON.
  json.dump(
      {
          m: {m_: str(v_) for m_, v_ in v.items()}
          for m, v in metric_to_correlations.items()
      },
      tf.gfile.Open(FLAGS.save_output + ".correlations.json", "w"))
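# Hedged sketch (not in the original script): the comments above mention a 95%
# CI, but only bootstrap means are printed. A percentile-bootstrap interval
# over the per-sample lists in `metric_to_scores` or `metric_to_correlations`
# could be reported with a helper like this; `_percentile_ci` is a
# hypothetical name, not part of the original code.
def _percentile_ci(samples, alpha=0.05):
  """Returns (mean, lower, upper) of a percentile bootstrap interval."""
  scores = sorted(samples)
  lo_idx = int((alpha / 2) * len(scores))
  hi_idx = max(lo_idx, int((1 - alpha / 2) * len(scores)) - 1)
  return np.mean(scores), scores[lo_idx], scores[hi_idx]
# Example: mean, low, high = _percentile_ci(metric_to_scores["parent"][model])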
def main(_):
  # Read the data.
  with tf.gfile.Open(FLAGS.data_file, "r") as f:
    raw_data = json.load(f)
  with tf.gfile.Open(FLAGS.bootstrap_file, "r") as f:
    bootstrap = json.load(f)
  uniq_keys = list(raw_data["all_sentences"].keys())

  if FLAGS.entailment_fn == "entailment":
    assert FLAGS.entailment_counts is not None
    logging.info("Reading %s...", FLAGS.entailment_counts)
    with tf.gfile.Open(FLAGS.entailment_counts) as f:
      cooccur_counts = json.load(f)
    entail_method = table_text_eval._entailment_probability_fn(cooccur_counts)
  else:
    entail_method = table_text_eval._overlap_probability

  # Compute PARENT scores for each system.
  # pylint: disable=g-complex-comprehension
  logging.info("Computing PARENT scores for each system.")
  all_parent_scores = {k: [] for k in uniq_keys}
  for key in uniq_keys:
    if key == "reference":
      continue
    logging.info("System %s", key)
    _, _, _, parent_scores = table_text_eval.parent(
        raw_data["all_sentences"][key],
        raw_data["all_references"],
        raw_data["all_tables_tokenized"],
        lambda_weight=None,
        entailment_fn=entail_method)
    all_parent_scores[key] = parent_scores
  logging.info("Done.")

  # Correlations for each bootstrap sample.
  metrics = ["human", "parent"]
  metric_to_scores = {m: {k: [] for k in uniq_keys} for m in metrics}
  metric_to_correlations = {m: {m_: [] for m_ in metrics} for m in metrics}

  for ii in range(len(bootstrap)):
    bootstrap_sample = bootstrap[ii]["ids"]
    quality_scores = bootstrap[ii]["human_eval"]
    key_to_parent = {
        k: [all_parent_scores[k][n] for n in bootstrap_sample]
        for k in uniq_keys if k != "reference"
    }

    # Scores.
    for k in uniq_keys:
      if k == "reference":
        continue
      metric_to_scores["parent"][k].append(np.mean(key_to_parent[k]))
      metric_to_scores["human"][k].append(quality_scores[k])

    # Correlations.
    for m1 in metrics:
      scores_1 = [
          metric_to_scores[m1][k][-1] for k in uniq_keys if k != "reference"
      ]
      for m2 in metrics:
        scores_2 = [
            metric_to_scores[m2][k][-1] for k in uniq_keys if k != "reference"
        ]
        metric_to_correlations[m1][m2].append(
            pearsonr(scores_1, scores_2)[0])

  # Mean for each model on each metric.
  all_models = [k for k in uniq_keys if k != "reference"]
  print("Model," + ",".join(metrics))
  for model in all_models:
    means = []
    for metric in metrics:
      scores = sorted(metric_to_scores[metric][model])
      means.append(np.mean(scores))
    print(model + "," +
          ",".join("%.3f" % means[ii] for ii in range(len(means))))

  # Average correlation and std for each metric's correlation.
  print("Correlations")
  for metric in metric_to_correlations:
    scores = sorted(metric_to_correlations[metric]["human"])
    mean = np.mean(scores)
    std = np.std(scores)
    print(metric + "," + "%.3f,%.3f" % (mean, std))

  # Save correlations and scores to JSON.
  json.dump(
      {
          m: {m_: str(v_) for m_, v_ in v.items()}
          for m, v in metric_to_correlations.items()
      },
      tf.gfile.Open(FLAGS.save_output + ".correlations.json", "w"))
  json.dump(
      {
          m: {m_: str(v_) for m_, v_ in v.items()}
          for m, v in metric_to_scores.items()
      },
      tf.gfile.Open(FLAGS.save_output + ".scores.json", "w"))
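# Hedged note (inferred from the reads in main() above, not from a documented
# schema): FLAGS.bootstrap_file appears to be a JSON list with one entry per
# bootstrap sample, each holding the sampled example ids and per-system human
# scores, roughly:
#   [{"ids": [3, 17, 17, ...], "human_eval": {"systemA": 0.71, ...}}, ...]
# The system names and numbers are illustrative placeholders. A small sanity
# check along these lines (hypothetical helper) could catch malformed files.
def _check_bootstrap_entry(entry):
  """Asserts one bootstrap entry has the fields read in main() above."""
  assert "ids" in entry and "human_eval" in entry
  assert all(isinstance(i, int) for i in entry["ids"])
  assert all(
      isinstance(v, (int, float)) for v in entry["human_eval"].values())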
def main(_):
  # Filenames.
  raw_data_json = os.path.join(FLAGS.data_dir, "generations.json")
  input_csv = os.path.join(FLAGS.data_dir, "inputs.csv")
  annotation_csv = os.path.join(FLAGS.data_dir, "annotations.csv")
  key_csv = os.path.join(FLAGS.data_dir, "keys.csv")

  # Read raw data.
  with tf.gfile.Open(raw_data_json, "r") as f:
    eval_data = json.load(f)

  # Read the item descriptions and their keys.
  item_to_keys = {}
  uniq_keys = set()
  f_in = tf.gfile.Open(input_csv, "r")
  f_key = tf.gfile.Open(key_csv, "r")
  input_reader = csv.reader(f_in)
  input_headers = next(input_reader)
  description_index = input_headers.index("a.description")
  # pylint: disable=g-complex-comprehension
  for data, key in zip(input_reader, f_key):
    item_to_keys[data[description_index]] = [
        [k.split(",") for k in kk.split("..")]
        for kk in key.strip().split("\t")
    ]
    uniq_keys.update([
        k for key in item_to_keys[data[description_index]]
        for kk in key for k in kk
    ])
  uniq_keys = list(uniq_keys)

  # Read annotations.
  item_to_annotations = {}
  with tf.gfile.Open(annotation_csv, "r") as f:
    reader = csv.reader(f)
    headers = next(reader)
    description_index = headers.index("i.description")
    status_index = headers.index("t.status")
    annotation_indices = []
    for i, header in enumerate(headers):
      if "t.s.pair_sentence_selection__sentence_" in header:
        annotation_indices.append(i)
    assert len(annotation_indices) == len(next(iter(item_to_keys.values())))
    for row in reader:
      if row[status_index] != "Completed":
        continue
      if row[description_index] not in item_to_keys:
        continue
      item_to_annotations[row[description_index]] = [
          int(row[ii]) for ii in annotation_indices
      ]

  # Collect sentences and references for each key.
  all_sentences = {k: [] for k in uniq_keys}
  all_references = []
  all_tables_tokenized = []
  for n in range(len(eval_data)):
    for key in uniq_keys:
      if key == "reference":
        continue
      all_sentences[key].append(eval_data[n][key].split())
    all_references.append([eval_data[n]["reference"].split()])
    all_tables_tokenized.append(_tokenize_table(eval_data[n]["table"]))

  # Compute PARENT scores for each lambda.
  logging.info("Computing PARENT scores for each system.")
  all_parent_scores = {k: {lbd: [] for lbd in LAMBDAS} for k in uniq_keys}
  for key in uniq_keys:
    if key == "reference":
      continue
    for lbd in LAMBDAS:
      logging.info("System %s Lambda %.1f", key, lbd)
      _, _, _, parent_scores = table_text_eval.parent(
          all_sentences[key], all_references, all_tables_tokenized,
          lambda_weight=lbd)
      all_parent_scores[key][lbd] = parent_scores
  logging.info("Done.")

  # Compute accuracy of each metric.
  metrics = ["parent-%.1f" % lbd for lbd in LAMBDAS]
  accuracy = {m: 0. for m in metrics}
  total = 0
  for item in tqdm(range(len(eval_data))):
    if str(item) not in item_to_annotations:
      continue
    annotations = item_to_annotations[str(item)]
    list_of_key_pairs = item_to_keys[str(item)]
    for ii, key_pairs in enumerate(list_of_key_pairs):
      annotation = annotations[ii]
      key_pair = key_pairs[0]
      if "reference" in key_pair:
        continue
      # Compute metrics.
      scores = {}
      # PARENT.
      for lbd in LAMBDAS:
        scores["parent-%.1f" % lbd] = [
            all_parent_scores[key_pair[0]][lbd][item],
            all_parent_scores[key_pair[1]][lbd][item]
        ]
      # Accuracies.
      predictions = {}
      for metric in scores:
        pred = 0 if scores[metric][0] >= scores[metric][1] else 1
        predictions[metric] = pred
        if pred == annotation:
          accuracy[metric] += 1.
      total += 1

  print("Accuracies")
  for metric in metrics:
    print(metric + "," + str(accuracy[metric] / total))
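# Hedged follow-up sketch (not in the original script): given the accuracies
# printed above, the lambda settings can be ranked directly; the helper name
# and the binomial standard error are additions for illustration only.
def _summarize_accuracies(accuracy, total):
  """Prints each metric with its accuracy and a binomial standard error."""
  for metric, correct in sorted(
      accuracy.items(), key=lambda kv: kv[1], reverse=True):
    p = correct / total
    stderr = np.sqrt(p * (1.0 - p) / total)
    print("%s,%.3f,%.3f" % (metric, p, stderr))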