Example #1
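# Note: the imports and class declaration below are not part of the original
# snippet; they are an assumption (following tilse's module layout) added so
# that the setUp fixture reads as a complete unittest.TestCase.
import datetime
import unittest

from tilse.data import timelines
from tilse.evaluation import rouge


class TimelineRougeEvaluatorTest(unittest.TestCase):  # hypothetical class name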
    def setUp(self):
        self.evaluator = rouge.TimelineRougeEvaluator()

        self.ground_truth = timelines.GroundTruth(
            [
                timelines.Timeline(
                    {
                        datetime.date(2010, 1, 1): ["timeline summarization ."],
                        datetime.date(2010, 1, 2): ["timeline summarization is awesome .",
                                                    "coreference resolution is , too ."],
                        datetime.date(2010, 1, 4): ["alignments are really nice"]
                    }
                ),
                timelines.Timeline(
                    {
                        datetime.date(2010, 1, 2): ["metrics are complicated ."]
                    }
                ),
            ]
        )

        self.output_same_number_of_dates = timelines.Timeline(
            {
                datetime.date(2010, 1, 2): ["timeline summarization ."],
                datetime.date(2010, 1, 3): ["doing some metric checks ."],
                datetime.date(2010, 1, 5): ["checks for alignments ."],
            }
        )

        self.output_same_number_of_dates_scores_higher_with_date_content_costs = timelines.Timeline(
            {
                datetime.date(2010, 1, 2): ["timeline summarization ."],
                datetime.date(2010, 1, 3): ["alignments are really nice"],
                datetime.date(2010, 1, 5): ["timeline summarization ."],
            }
        )

        self.output_less_dates = timelines.Timeline(
            {
                datetime.date(2010, 1, 2): ["timeline summarization ."],
                datetime.date(2010, 1, 3): ["doing some metric checks ."],
            }
        )

        self.output_more_dates = timelines.Timeline(
            {
                datetime.date(2010, 1, 2): ["timeline summarization ."],
                datetime.date(2010, 1, 3): ["doing some metric checks ."],
                datetime.date(2010, 1, 5): ["checks for alignments ."],
                datetime.date(2010, 1, 6): ["asdf"],
            }
        )
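
    # A minimal sketch of a test method built on the fixtures above (not part
    # of the original snippet). It assumes the two-argument
    # evaluate_concat(system_timeline, ground_truth) call shown in the
    # write_results_file example further below, and that the evaluator's
    # default measures include rouge_1.
    def test_concat_scores_are_in_unit_interval(self):
        results = self.evaluator.evaluate_concat(
            self.output_same_number_of_dates, self.ground_truth)
        self.assertGreaterEqual(results["rouge_1"]["f_score"], 0.0)
        self.assertLessEqual(results["rouge_1"]["f_score"], 1.0)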
Example #2
    #     news_corpora[topic] = news_corpora[topic].filter_by_keywords_contained(keyword_mapping[topic])

    # read ground-truth timelines
    timelines_dir = os.path.join(raw_directory, topic, "timelines")
    for filename in sorted(os.listdir(timelines_dir)):
        full_path = os.path.join(timelines_dir, filename)

        temp_reference_timelines[topic].append(
            timelines.Timeline.from_file(
                codecs.open(full_path, "r", "utf-8", "replace")))

for topic in temp_reference_timelines:
    reference_timelines[topic] = timelines.GroundTruth(
        temp_reference_timelines[topic])

evaluator = rouge.TimelineRougeEvaluator(measures=["rouge_1", "rouge_2"],
                                         beta=1)

model_avg_scores = {}

for _config in configs:
    config_file = modelbase_dir + _config
    print(_config)
    print(config_file)
    config = json.load(open(config_file))
    config["rouge_computation"] = "reimpl"

    logging.info(config)

    algorithm = None

    if config["algorithm"] == "chieu":
Example #3
def evaluate_tl_main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        dest="filter_corpus",
                        default=False,
                        action="store_true")
    parser.add_argument("-c", dest="constraint", default="sent")
    parser.add_argument("-t", dest="timelines", nargs="+")
    parser.add_argument("-m",
                        dest="num_multi_selection_runs",
                        type=int,
                        default=None)
    parser.add_argument("--queryfile")
    parser.add_argument("corpus_pickle")
    parser.add_argument("config")

    args = parser.parse_args()

    if args.constraint == "sent":
        use_token_count = False
    elif args.constraint == "tok":
        use_token_count = True
    else:
        raise ValueError("Unknown constraint {}".format(args.constraint))

    corpus = load_corpus(args.corpus_pickle,
                         filter_blacklist=args.filter_corpus)

    timelines = []

    for tl_fname in args.timelines:
        with open(tl_fname, errors="ignore") as f:
            timeline = Timeline.from_file(f)
            timelines.append((os.path.basename(tl_fname), timeline))

    #tl_gen = APClusteringTimelineGenerator(True)

    with open(args.config) as f:
        config = json.load(f)

    tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)

    corpus_basename = os.path.basename(corpus.name).split(".")[0]
    print(corpus_basename)
    config_basename = os.path.basename(args.config)

    results_basename = config_basename
    if args.queryfile:
        results_basename += "+queryfilter"

    out_timelines_dir = os.path.join("system_timelines",
                                     results_basename + "+" + args.constraint,
                                     corpus_basename)
    results_dir = os.path.join("evaluation_results",
                               results_basename + "+" + args.constraint)

    if not os.path.isdir(out_timelines_dir):
        os.makedirs(out_timelines_dir)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    query_words = None
    if args.queryfile is not None:
        with open(args.queryfile) as f:
            query_words = [l.strip() for l in f]

    debug_identifier = results_basename + "+" + corpus_basename

    if use_token_count:
        config["scoring"]["use_length"] = True

    if args.num_multi_selection_runs is None:
        sys_timelines = tl_gen.generate_timelines(
            corpus, [
                determine_tl_parameters(tl, use_token_count=use_token_count)
                for _, tl in timelines
            ],
            reference_timelines=list(map(lambda x: x[1], timelines)),
            query_words=query_words,
            debug_identifier=debug_identifier)

        write_results_file(os.path.join(results_dir, corpus_basename + ".txt"),
                           out_timelines_dir, timelines, sys_timelines)

    else:
        with open("multirun-results+{}.txt".format(config_basename),
                  "a") as f_out:
            print(timelines)

            evaluator = rouge.TimelineRougeEvaluator(
                measures=["rouge_1", "rouge_2"])
            all_run_timelines = tl_gen.generate_timelines(
                corpus, [
                    determine_tl_parameters(tl,
                                            use_token_count=use_token_count)
                    for _, tl in timelines
                ],
                reference_timelines=list(map(lambda x: x[1], timelines)),
                query_words=query_words,
                debug_identifier=debug_identifier,
                num_selection_runs=args.num_multi_selection_runs)
            for sys_timelines in all_run_timelines:
                for (timeline_name, gold_timeline), sys_timeline in zip(
                        timelines, sys_timelines):
                    reference_timeline = GroundTruth([gold_timeline])
                    eval_results = evaluator.evaluate_concat(
                        "TL", sys_timeline, reference_timeline)
                    eval_results_agree = evaluator.evaluate_agreement(
                        "TL", sys_timeline, reference_timeline)
                    eval_results_align = evaluator.evaluate_align_date_content_costs_many_to_one(
                        "TL", sys_timeline, reference_timeline)

                    f_out.write(" ".join(
                        map(str, [
                            eval_results["rouge_1"]["f_score"],
                            eval_results["rouge_2"]["f_score"],
                            eval_results_agree["rouge_1"]["f_score"],
                            eval_results_agree["rouge_2"]["f_score"],
                            eval_results_align["rouge_1"]["f_score"],
                            eval_results_align["rouge_2"]["f_score"]
                        ])))
                    f_out.write("\n")
                f_out.write("--------\n")

            f_out.write("========\n")
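
evaluate_tl_main is an argparse entry point. A hypothetical way to drive it programmatically is to set sys.argv before calling it; all file paths below are placeholders, and the positional arguments are given before -t so that the variable-length -t list does not swallow them:

import sys

sys.argv = ["evaluate_tl", "corpus.pkl", "config.json",
            "-c", "tok",
            "-t", "timelines/libya.txt", "timelines/syria.txt"]
evaluate_tl_main()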
Example #4
def write_results_file(outfilename, out_timelines_dir, timelines,
                       sys_timelines):
    evaluator = rouge.TimelineRougeEvaluator(measures=["rouge_1", "rouge_2"])

    rouge_1_sum = 0
    rouge_1_r_sum = 0
    rouge_1_p_sum = 0
    rouge_2_sum = 0
    rouge_2_r_sum = 0
    rouge_2_p_sum = 0

    agree_rouge_1_sum = 0
    agree_rouge_1_r_sum = 0
    agree_rouge_1_p_sum = 0
    agree_rouge_2_sum = 0
    agree_rouge_2_r_sum = 0
    agree_rouge_2_p_sum = 0

    align_rouge_1_sum = 0
    align_rouge_1_r_sum = 0
    align_rouge_1_p_sum = 0
    align_rouge_2_sum = 0
    align_rouge_2_r_sum = 0
    align_rouge_2_p_sum = 0

    date_f1_sum = 0
    date_f1_r_sum = 0
    date_f1_p_sum = 0

    with open(outfilename, "w") as f_out:
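        # Header column blocks: date selection (R/P/F1), followed by R1/R2
        # recall/precision/F1 for concat, agreement and date-alignment ROUGE,
        # in the order they are computed below.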
        f_out.write(
            "Timeline        \tDate R\tDate P\tDate F1\tR1 R\tR1 P\tR1 F1\tR2 R\tR2 P\tR2 F1\tR1 R\tR1 P\tR1 F1\tR2 R\tR2 P\tR2 F1\tR1 R\tR1 P\tR1 F1\tR2 R\tR2 P\tR2 F1\n"
        )

        for (timeline_name,
             gold_timeline), sys_timeline in zip(timelines, sys_timelines):
            with open(os.path.join(out_timelines_dir, timeline_name),
                      "w") as f_tl:
                f_tl.write(str(sys_timeline))

            reference_timeline = GroundTruth([gold_timeline])
            eval_results = evaluator.evaluate_concat(sys_timeline,
                                                     reference_timeline)
            rouge_1_sum += eval_results["rouge_1"]["f_score"]
            rouge_1_r_sum += eval_results["rouge_1"]["recall"]
            rouge_1_p_sum += eval_results["rouge_1"]["precision"]
            rouge_2_sum += eval_results["rouge_2"]["f_score"]
            rouge_2_r_sum += eval_results["rouge_2"]["recall"]
            rouge_2_p_sum += eval_results["rouge_2"]["precision"]

            eval_results_agree = evaluator.evaluate_agreement(
                sys_timeline, reference_timeline)
            agree_rouge_1_sum += eval_results_agree["rouge_1"]["f_score"]
            agree_rouge_1_r_sum += eval_results_agree["rouge_1"]["recall"]
            agree_rouge_1_p_sum += eval_results_agree["rouge_1"]["precision"]
            agree_rouge_2_sum += eval_results_agree["rouge_2"]["f_score"]
            agree_rouge_2_r_sum += eval_results_agree["rouge_2"]["recall"]
            agree_rouge_2_p_sum += eval_results_agree["rouge_2"]["precision"]

            eval_results_align = evaluator.evaluate_align_date_content_costs_many_to_one(
                sys_timeline, reference_timeline)
            align_rouge_1_sum += eval_results_align["rouge_1"]["f_score"]
            align_rouge_1_r_sum += eval_results_align["rouge_1"]["recall"]
            align_rouge_1_p_sum += eval_results_align["rouge_1"]["precision"]
            align_rouge_2_sum += eval_results_align["rouge_2"]["f_score"]
            align_rouge_2_r_sum += eval_results_align["rouge_2"]["recall"]
            align_rouge_2_p_sum += eval_results_align["rouge_2"]["precision"]

            print(" ".join(
                map(lambda x: "{}-{}-{}".format(x.year, x.month, x.day),
                    sorted(sys_timeline))))
            print(" ".join(
                map(lambda x: "{}-{}-{}".format(x.year, x.month, x.day),
                    sorted(gold_timeline))))

            date_recall = len(set(sys_timeline)
                              & set(gold_timeline)) / len(gold_timeline)
            date_precision = len(set(sys_timeline)
                                 & set(gold_timeline)) / len(sys_timeline)

            if date_recall + date_precision > 0:
                date_f1 = 2 * (date_recall * date_precision) / (date_recall +
                                                                date_precision)
            else:
                date_f1 = 0.0

            date_f1_sum += date_f1
            date_f1_r_sum += date_recall
            date_f1_p_sum += date_precision

            f_out.write(
                "{:<16}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n"
                .format(timeline_name, date_recall, date_precision, date_f1,
                        eval_results["rouge_1"]["recall"],
                        eval_results["rouge_1"]["precision"],
                        eval_results["rouge_1"]["f_score"],
                        eval_results["rouge_2"]["recall"],
                        eval_results["rouge_2"]["precision"],
                        eval_results["rouge_2"]["f_score"],
                        eval_results_agree["rouge_1"]["recall"],
                        eval_results_agree["rouge_1"]["precision"],
                        eval_results_agree["rouge_1"]["f_score"],
                        eval_results_agree["rouge_2"]["recall"],
                        eval_results_agree["rouge_2"]["precision"],
                        eval_results_agree["rouge_2"]["f_score"],
                        eval_results_align["rouge_1"]["recall"],
                        eval_results_align["rouge_1"]["precision"],
                        eval_results_align["rouge_1"]["f_score"],
                        eval_results_align["rouge_2"]["recall"],
                        eval_results_align["rouge_2"]["precision"],
                        eval_results_align["rouge_2"]["f_score"]))

        f_out.write(
            "{:<16}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n"
            .format(
                "All", date_f1_r_sum / len(timelines),
                date_f1_p_sum / len(timelines), date_f1_sum / len(timelines),
                rouge_1_r_sum / len(timelines), rouge_1_p_sum / len(timelines),
                rouge_1_sum / len(timelines), rouge_2_r_sum / len(timelines),
                rouge_2_p_sum / len(timelines), rouge_2_sum / len(timelines),
                agree_rouge_1_r_sum / len(timelines), agree_rouge_1_p_sum /
                len(timelines), agree_rouge_1_sum / len(timelines),
                agree_rouge_2_r_sum / len(timelines), agree_rouge_2_p_sum /
                len(timelines), agree_rouge_2_sum / len(timelines),
                align_rouge_1_r_sum / len(timelines), align_rouge_1_p_sum /
                len(timelines), align_rouge_1_sum / len(timelines),
                align_rouge_2_r_sum / len(timelines), align_rouge_2_p_sum /
                len(timelines), align_rouge_2_sum / len(timelines)))
        #print(sys_timeline)

    print("ROUGE 1", rouge_1_sum / len(timelines))
    print("ROUGE 2", rouge_2_sum / len(timelines))
    print("Date F1", date_f1_sum / len(timelines))
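
The date selection scores above are computed inline per timeline; the same computation could be factored into a small helper, shown here as a sketch that mirrors the code above (it is not part of the original):

def date_selection_scores(sys_timeline, gold_timeline):
    # Precision, recall and F1 over the sets of dates covered by the system
    # and gold timelines, mirroring the inline computation in
    # write_results_file (no guard against an empty system timeline, as above).
    sys_dates, gold_dates = set(sys_timeline), set(gold_timeline)
    overlap = len(sys_dates & gold_dates)
    recall = overlap / len(gold_dates)
    precision = overlap / len(sys_dates)
    f1 = 0.0
    if recall + precision > 0:
        f1 = 2 * (recall * precision) / (recall + precision)
    return recall, precision, f1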
Example #5
def evaluate(tls_model,
             dataset,
             result_path,
             trunc_timelines=False,
             time_span_extension=0,
             word_mover_stop_words='nltk'):
    results = []
    metric = 'align_date_content_costs_many_to_one'
    evaluator = tilse_rouge.TimelineRougeEvaluator(
        measures=["rouge_1", "rouge_2"])
    n_topics = len(dataset.collections)

    for i, collection in enumerate(dataset.collections):

        ref_timelines = [
            TilseTimeline(tl.date_to_summaries) for tl in collection.timelines
        ]
        topic = collection.name
        n_ref = len(ref_timelines)

        if trunc_timelines:
            ref_timelines = data.truncate_timelines(ref_timelines, collection)

        for j, ref_timeline in enumerate(ref_timelines):
            print(
                f'topic {i + 1}/{n_topics}: {topic}, ref timeline {j + 1}/{n_ref}'
            )

            tls_model.load(ignored_topics=[collection.name])

            ref_dates = sorted(ref_timeline.dates_to_summaries)

            start, end = data.get_input_time_span(ref_dates,
                                                  time_span_extension)

            collection.start = start
            collection.end = end

            # utils.plot_date_stats(collection, ref_dates)

            l = len(ref_dates)
            k = data.get_average_summary_length(ref_timeline)

            pred_timeline_ = tls_model.predict(
                collection,
                max_dates=l,
                max_summary_sents=k,
                ref_tl=ref_timeline  # only oracles need this
            )

            print('*** PREDICTED ***')
            utils.print_tl(pred_timeline_)

            print('timeline done')
            pred_timeline = TilseTimeline(pred_timeline_.date_to_summaries)
            sys_len = len(pred_timeline.get_dates())
            ground_truth = TilseGroundTruth([ref_timeline])

            rouge_scores = get_scores(metric, pred_timeline, ground_truth,
                                      evaluator)
            date_scores = evaluate_dates(pred_timeline, ground_truth)
            wm_scores = get_wordmover_score(pred_timeline,
                                            ground_truth,
                                            word_mover_stop_words,
                                            device='cpu')
            dd_scores = date_dist_scores(pred_timeline, ground_truth)

            print('sys-len:', sys_len, 'gold-len:', l, 'gold-k:', k)

            print('Alignment-based ROUGE:')
            pprint(rouge_scores)
            print('Date selection:')
            pprint(date_scores)
            pprint(dd_scores)
            print('WordMover scores:')
            pprint(wm_scores)
            print('-' * 100)
            results.append((rouge_scores, date_scores, wm_scores, dd_scores,
                            pred_timeline_.to_dict()))

            print("Running average:")
            print(get_average_results(results))
            print()

    avg_results = get_average_results(results)
    print('Average results:')
    pprint(avg_results)
    output = {
        'average': avg_results,
        'results': results,
    }
    utils.write_json(output, result_path)
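
get_scores is not defined in any of these snippets; judging from the evaluator methods used in the other examples, it presumably dispatches on the metric name roughly as follows (an assumption, not the actual implementation):

def get_scores(metric, pred_timeline, ground_truth, evaluator):
    # Hypothetical dispatcher over the tilse evaluator methods seen above.
    if metric == 'align_date_content_costs_many_to_one':
        return evaluator.evaluate_align_date_content_costs_many_to_one(
            pred_timeline, ground_truth)
    if metric == 'concat':
        return evaluator.evaluate_concat(pred_timeline, ground_truth)
    if metric == 'agreement':
        return evaluator.evaluate_agreement(pred_timeline, ground_truth)
    raise ValueError('Unknown metric: {}'.format(metric))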
Example #6
def evaluate(tls_model,
             dataset,
             result_path,
             trunc_timelines=False,
             time_span_extension=0):

    results = []
    metric = 'align_date_content_costs_many_to_one'
    evaluator = rouge.TimelineRougeEvaluator(measures=["rouge_1", "rouge_2"])
    n_topics = len(dataset.collections)
    ave_cluster = 0

    for i, collection in enumerate(dataset.collections):

        ref_timelines = [
            TilseTimeline(tl.date_to_summaries) for tl in collection.timelines
        ]
        topic = collection.name
        n_ref = len(ref_timelines)

        # truncation is only needed for the entities dataset
        if trunc_timelines:
            ref_timelines = data.truncate_timelines(ref_timelines, collection)

        for j, ref_timeline in enumerate(ref_timelines):

            print(
                f'topic {i+1}/{n_topics}: {topic}, ref timeline {j+1}/{n_ref}')

            tls_model.load(ignored_topics=[collection.name])

            ref_dates = sorted(ref_timeline.dates_to_summaries)
            #print("data to summaries = {}".format(ref_dates))

            start, end = data.get_input_time_span(ref_dates,
                                                  time_span_extension)

            collection.start = start
            collection.end = end
            print("name = {} start = {} end = {}".format(topic, start, end))

            #utils.plot_date_stats(collection, ref_dates)

            l = len(ref_dates)
            k = data.get_average_summary_length(ref_timeline)

            pred_timeline_, n_clusters = tls_model.predict(
                collection,
                max_dates=l,
                max_summary_sents=k,
                ref_tl=ref_timeline  # only oracles need this
            )
            ave_cluster = ave_cluster + n_clusters

            # print('*** PREDICTED ***')
            # utils.print_tl(pred_timeline_)

            print('timeline done')
            pred_timeline = TilseTimeline(pred_timeline_.date_to_summaries)
            sys_len = len(pred_timeline.get_dates())
            ground_truth = TilseGroundTruth([ref_timeline])

            rouge_scores = get_scores(metric, pred_timeline, ground_truth,
                                      evaluator)
            date_scores = evaluate_dates(pred_timeline, ground_truth)

            print('sys-len:', sys_len, 'gold-len:', l, 'gold-k:', k)

            print('Alignment-based ROUGE:')
            pprint(rouge_scores)
            print('Date selection:')
            pprint(date_scores)
            print('-' * 100)
            results.append(
                (rouge_scores, date_scores, pred_timeline_.to_dict()))

    avg_results = get_average_results(results)
    print('Average results:')
    pprint(avg_results)
    output = {
        'average_clusters': ave_cluster / len(dataset.collections),
        'average': avg_results,
        'results': results,
    }
    utils.write_json(output, result_path)