Example #1
def test_parent(self):
  for pred, score in zip(TEST_PREDS, TEST_SCORES):
    print("prediction = %s" % " ".join(pred))
    _, _, parent_score, _ = table_text_eval.parent([pred], [[TEST_REF]],
                                                   [TEST_TABLE],
                                                   lambda_weight=None)
    print(parent_score)
    self.assertAlmostEqual(score, parent_score, places=3)
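The test unpacks a 4-tuple from table_text_eval.parent and reads the third element as the overall score, while the later examples use the fourth element as a per-example score list. The sketch below illustrates that calling convention under those assumptions; the token lists, the (attribute_tokens, value_tokens) table layout, and the import line are illustrative, not taken from the test fixtures.

import table_text_eval  # module providing parent(), as used throughout these examples

# Illustrative inputs (assumed format): tokenized prediction, list of
# tokenized references, and a table given as (attribute_tokens, value_tokens) pairs.
prediction = ["michael", "dahlquist", "was", "a", "drummer", "."]
references = [["michael", "dahlquist", "was", "an", "american", "drummer", "."]]
table = [(["name"], ["michael", "dahlquist"]),
         (["occupation"], ["drummer"])]

# Third return value treated as the corpus-level score, fourth as the
# per-example list, matching how the examples above and below unpack the tuple.
_, _, overall_score, per_example_scores = table_text_eval.parent(
    [prediction], [references], [table], lambda_weight=None)
print(overall_score, per_example_scores[0])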
Example #2
def main(_):
    keys_to_exclude = ["reference"]
    input_json = FLAGS.data_file

    # Read raw data
    with tf.gfile.Open(input_json, "r") as f:
        eval_data = json.load(f)
    uniq_keys = set([k[:-5] for k in eval_data[0] if k.endswith("-pred")])
    uniq_keys.add("reference")
    uniq_keys = list(uniq_keys)

    if FLAGS.entailment_fn == "cooccurrence":
        assert FLAGS.cooccurrence_counts is not None
        logging.info("Reading %s...", FLAGS.cooccurrence_counts)
        with tf.gfile.Open(FLAGS.cooccurrence_counts) as f:
            cooccur_counts = json.load(f)
        entail_method = table_text_eval.cooccur_probability_fn(cooccur_counts)
    else:
        entail_method = table_text_eval.overlap_probability

    # Compute scores for each lambda.
    # pylint: disable=g-complex-comprehension
    logging.info("Computing scores for each system.")
    all_parent_scores = {k: [] for k in uniq_keys}
    for key in uniq_keys:
        if key in keys_to_exclude:
            continue
        sentences = [_text(item[key + "-pred"]) for item in eval_data]
        references = [[_text(reference) for reference in item["references"]]
                      for item in eval_data]
        tables = [_table(item["table"]) for item in eval_data]
        logging.info("System %s", key)
        _, _, _, parent_scores = table_text_eval.parent(
            sentences,
            references,
            tables,
            lambda_weight=None,
            entailment_fn=entail_method)
        all_parent_scores[key] = parent_scores
    logging.info("Done.")

    # Bootstrap sampling.
    metrics = ["grammar", "fluency", "semantics", "parent"]
    human_metrics = ["grammar", "fluency", "semantics"]
    metric_to_scores = {m: {k: [] for k in uniq_keys} for m in metrics}
    metric_to_correlations = {m: {m_: [] for m_ in metrics} for m in metrics}
    for m in metrics:
        metric_to_correlations[m]["average"] = []

    for _ in tqdm(range(FLAGS.num_bootstrap)):

        # Get the bootstrap sample based on the eval_subset.
        all_keys = range(len(eval_data))
        bootstrap_sample = [
            random.choice(all_keys) for _ in range(len(eval_data))
        ]

        # Compute average scores available.
        key_to_grammar = {k: [] for k in uniq_keys}
        key_to_fluency = {k: [] for k in uniq_keys}
        key_to_semantics = {k: [] for k in uniq_keys}
        for ii in bootstrap_sample:
            for k in uniq_keys:
                if k in keys_to_exclude:
                    continue
                key_to_grammar[k].append(float(eval_data[ii][k + "-grammar"]))
                key_to_fluency[k].append(float(eval_data[ii][k + "-fluency"]))
                key_to_semantics[k].append(
                    float(eval_data[ii][k + "-semantics"]))
        key_to_parent = {
            k: [all_parent_scores[k][n] for n in bootstrap_sample]
            for k in uniq_keys if k not in keys_to_exclude
        }

        # Compute average scores.
        for k in uniq_keys:
            if k in keys_to_exclude:
                continue
            metric_to_scores["grammar"][k].append(
                sum(key_to_grammar[k]) / len(key_to_grammar[k]))
            metric_to_scores["fluency"][k].append(
                sum(key_to_fluency[k]) / len(key_to_fluency[k]))
            metric_to_scores["semantics"][k].append(
                sum(key_to_semantics[k]) / len(key_to_semantics[k]))
            # PARENT.
            metric_to_scores["parent"][k].append(np.mean(key_to_parent[k]))

        # Correlations.
        for m1 in metrics:
            scores_1 = [
                metric_to_scores[m1][k][-1] for k in uniq_keys
                if k not in keys_to_exclude
            ]
            for m2 in metrics:
                scores_2 = [
                    metric_to_scores[m2][k][-1] for k in uniq_keys
                    if k not in keys_to_exclude
                ]
                metric_to_correlations[m1][m2].append(
                    pearsonr(scores_1, scores_2)[0])
            metric_to_correlations[m1]["average"].append(
                sum([
                    metric_to_correlations[m1][m2][-1] for m2 in human_metrics
                ]) / 3)

    # Mean and 95% CI for each model on each metric.
    all_models = [k for k in uniq_keys if k not in keys_to_exclude]
    print("Model," + ",".join(metrics))
    for model in all_models:
        means = []
        for metric in metrics:
            scores = sorted(metric_to_scores[metric][model])
            means.append(np.mean(scores))
        print(model + "," + ",".join("%.3f" % means[ii]
                                     for ii in range(len(means))))

    # Average correlation and 95% CI for each metric's correlation.
    human_metrics += ["average"]
    print("Correlations," + ",".join(human_metrics))
    for metric in metric_to_correlations:
        corrs = []
        for hm in human_metrics:
            scores = sorted(metric_to_correlations[metric][hm])
            mean = np.mean(scores)
            corrs.append(mean)
        print(metric + "," + ",".join("%.3f" % mean for mean in corrs))

    # Save correlations to JSON.
    json.dump(
        {
            m: {m_: str(v_)
                for m_, v_ in v.items()}
            for m, v in metric_to_correlations.items()
        }, tf.gfile.Open(FLAGS.save_output + ".correlations.json", "w"))
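The comments above mention a 95% CI, but the script only prints bootstrap means after sorting the per-sample scores. A small sketch, not part of the original script, of how a percentile interval could be read off the same sorted bootstrap lists; metric_to_scores and metric_to_correlations refer to the structures built above.

import numpy as np

def percentile_ci(bootstrap_scores, alpha=0.05):
    """Mean and (1 - alpha) percentile interval over bootstrap replicates."""
    scores = np.asarray(bootstrap_scores)
    lower = np.percentile(scores, 100 * alpha / 2)
    upper = np.percentile(scores, 100 * (1 - alpha / 2))
    return scores.mean(), lower, upper

# Example usage with the structures from the script above:
# mean, lo, hi = percentile_ci(metric_to_scores["parent"][model])
# mean, lo, hi = percentile_ci(metric_to_correlations["parent"]["average"])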
Example #3
def main(_):
    # Read the data.
    with tf.gfile.Open(FLAGS.data_file, "r") as f:
        raw_data = json.load(f)
    with tf.gfile.Open(FLAGS.bootstrap_file, "r") as f:
        bootstrap = json.load(f)

    uniq_keys = raw_data["all_sentences"].keys()

    if FLAGS.entailment_fn == "entailment":
        assert FLAGS.entailment_counts is not None
        logging.info("Reading %s...", FLAGS.entailment_counts)
        with tf.gfile.Open(FLAGS.entailment_counts) as f:
            cooccur_counts = json.load(f)
        entail_method = table_text_eval._entailment_probability_fn(
            cooccur_counts)
    else:
        entail_method = table_text_eval._overlap_probability

    # Compute PARENT scores for each lambda.
    # pylint: disable=g-complex-comprehension
    logging.info("Computing PARENT scores for each system.")
    all_parent_scores = {k: [] for k in uniq_keys}
    for key in uniq_keys:
        if key == "reference":
            continue
        logging.info("System %s", key)
        _, _, _, parent_scores = table_text_eval.parent(
            raw_data["all_sentences"][key],
            raw_data["all_references"],
            raw_data["all_tables_tokenized"],
            lambda_weight=None,
            entailment_fn=entail_method)
        all_parent_scores[key] = parent_scores
    logging.info("Done.")

    # Correlations for each bootstrap sample.
    metrics = ["human", "parent"]
    metric_to_scores = {m: {k: [] for k in uniq_keys} for m in metrics}
    metric_to_correlations = {m: {m_: [] for m_ in metrics} for m in metrics}
    for ii in range(len(bootstrap)):
        bootstrap_sample = bootstrap[ii]["ids"]
        quality_scores = bootstrap[ii]["human_eval"]
        key_to_parent = {
            k: [all_parent_scores[k][n] for n in bootstrap_sample]
            for k in uniq_keys if k != "reference"
        }

        # Scores.
        for k in uniq_keys:
            if k == "reference":
                continue
            metric_to_scores["parent"][k].append(np.mean(key_to_parent[k]))
            metric_to_scores["human"][k].append(quality_scores[k])

        # Correlations.
        for m1 in metrics:
            scores_1 = [
                metric_to_scores[m1][k][-1] for k in uniq_keys
                if k != "reference"
            ]
            for m2 in metrics:
                scores_2 = [
                    metric_to_scores[m2][k][-1] for k in uniq_keys
                    if k != "reference"
                ]
                metric_to_correlations[m1][m2].append(
                    pearsonr(scores_1, scores_2)[0])

    # Mean for each model on each metric.
    all_models = [k for k in uniq_keys if k != "reference"]
    print("Model," + ",".join(metrics))
    for model in all_models:
        means = []
        for metric in metrics:
            scores = sorted(metric_to_scores[metric][model])
            means.append(np.mean(scores))
        print(model + "," + ",".join("%.3f" % means[ii]
                                     for ii in range(len(means))))

    # Average correlation and std for each metric's correlation.
    print("Correlations")
    for metric in metric_to_correlations:
        scores = sorted(metric_to_correlations[metric]["human"])
        mean = np.mean(scores)
        std = np.std(scores)
        print(metric + "," + "%.3f,%.3f" % (mean, std))

    # Save correlations to JSON.
    json.dump(
        {
            m: {m_: str(v_)
                for m_, v_ in v.items()}
            for m, v in metric_to_correlations.items()
        }, tf.gfile.Open(FLAGS.save_output + ".correlations.json", "w"))
    json.dump(
        {
            m: {m_: str(v_)
                for m_, v_ in v.items()}
            for m, v in metric_to_scores.items()
        }, tf.gfile.Open(FLAGS.save_output + ".scores.json", "w"))
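Both bootstrap scripts keep only the first element of scipy.stats.pearsonr, which returns the Pearson correlation coefficient together with a two-sided p-value. A standalone illustration of that convention with made-up system-level scores:

from scipy.stats import pearsonr

# Made-up system-level scores, one value per system.
metric_scores = [0.41, 0.55, 0.38, 0.62]
human_scores = [3.1, 3.9, 2.8, 4.2]

r, p_value = pearsonr(metric_scores, human_scores)
print("Pearson r = %.3f (p = %.3f)" % (r, p_value))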
Example #4
def main(_):
    # Filenames.
    raw_data_json = os.path.join(FLAGS.data_dir, "generations.json")
    input_csv = os.path.join(FLAGS.data_dir, "inputs.csv")
    annotation_csv = os.path.join(FLAGS.data_dir, "annotations.csv")
    key_csv = os.path.join(FLAGS.data_dir, "keys.csv")

    # Read raw data
    with tf.gfile.Open(raw_data_json, "r") as f:
        eval_data = json.load(f)

    # Read the item descriptions and their keys.
    item_to_keys = {}
    uniq_keys = set()
    f_in = tf.gfile.Open(input_csv, "r")
    f_key = tf.gfile.Open(key_csv, "r")
    input_reader = csv.reader(f_in)
    input_headers = next(input_reader)
    description_index = input_headers.index("a.description")

    # pylint: disable=g-complex-comprehension
    for data, key in zip(input_reader, f_key):
        item_to_keys[data[description_index]] = [[
            k.split(",") for k in kk.split("..")
        ] for kk in key.strip().split("\t")]
        uniq_keys.update([
            k for key in item_to_keys[data[description_index]] for kk in key
            for k in kk
        ])

    uniq_keys = list(uniq_keys)

    # Read annotations.
    item_to_annotations = {}
    with tf.gfile.Open(annotation_csv, "r") as f:
        reader = csv.reader(f)
        headers = next(reader)
        description_index = headers.index("i.description")
        status_index = headers.index("t.status")
        annotation_indices = []
        for i, header in enumerate(headers):
            if "t.s.pair_sentence_selection__sentence_" in header:
                annotation_indices.append(i)
        assert len(annotation_indices) == len(next(iter(item_to_keys.values())))
        for row in reader:
            if row[status_index] != "Completed":
                continue
            if row[description_index] not in item_to_keys:
                continue
            item_to_annotations[row[description_index]] = [
                int(row[ii]) for ii in annotation_indices
            ]

    # Collect sentences and references for each key.
    all_sentences = {k: [] for k in uniq_keys}
    all_references = []
    all_tables_tokenized = []
    for n in range(len(eval_data)):
        for key in uniq_keys:
            if key == "reference":
                continue
            all_sentences[key].append(eval_data[n][key].split())
        all_references.append([eval_data[n]["reference"].split()])
        all_tables_tokenized.append(_tokenize_table(eval_data[n]["table"]))

    # Compute PARENT scores for each lambda.
    logging.info("Computing PARENT scores for each system.")
    all_parent_scores = {k: {lbd: [] for lbd in LAMBDAS} for k in uniq_keys}
    for key in uniq_keys:
        if key == "reference":
            continue
        for lbd in LAMBDAS:
            logging.info("System %s Lambda %.1f", key, lbd)
            _, _, _, parent_scores = table_text_eval.parent(
                all_sentences[key],
                all_references,
                all_tables_tokenized,
                lambda_weight=lbd)
            all_parent_scores[key][lbd] = parent_scores
    logging.info("Done.")

    # Compute accuracy of each metric.
    metrics = ["parent-%.1f" % lbd for lbd in LAMBDAS]
    accuracy = {m: 0. for m in metrics}
    total = 0
    for item in tqdm(range(len(eval_data))):
        if str(item) not in item_to_annotations:
            continue
        annotations = item_to_annotations[str(item)]
        list_of_key_pairs = item_to_keys[str(item)]
        for ii, key_pairs in enumerate(list_of_key_pairs):
            annotation = annotations[ii]
            key_pair = key_pairs[0]
            if "reference" in key_pair:
                continue

            # Compute metrics.
            scores = {}
            # PARENT.
            for lbd in LAMBDAS:
                scores["parent-%.1f" % lbd] = [
                    all_parent_scores[key_pair[0]][lbd][item],
                    all_parent_scores[key_pair[1]][lbd][item]
                ]

            # Accuracies.
            predictions = {}
            for metric in scores:
                pred = 0 if scores[metric][0] >= scores[metric][1] else 1
                predictions[metric] = pred
                if pred == annotation:
                    accuracy[metric] += 1.

            total += 1

    print("Accuracies")
    for metric in metrics:
        print(metric + "," + str(accuracy[metric] / total))