from pprint import pprint

from tilse.data.timelines import Timeline as TilseTimeline
from tilse.data.timelines import GroundTruth as TilseGroundTruth
from tilse.evaluation import rouge as tilse_rouge

# `data`, `utils` and the scoring helpers (get_scores, evaluate_dates,
# get_wordmover_score, date_dist_scores, get_average_results) are
# project-local modules/functions assumed to be importable from this package.


def evaluate(tls_model,
             dataset,
             result_path,
             trunc_timelines=False,
             time_span_extension=0,
             word_mover_stop_words='nltk'):
    results = []
    metric = 'align_date_content_costs_many_to_one'
    evaluator = tilse_rouge.TimelineRougeEvaluator(
        measures=["rouge_1", "rouge_2"])
    n_topics = len(dataset.collections)

    for i, collection in enumerate(dataset.collections):
        ref_timelines = [
            TilseTimeline(tl.date_to_summaries) for tl in collection.timelines
        ]
        topic = collection.name
        n_ref = len(ref_timelines)

        if trunc_timelines:
            ref_timelines = data.truncate_timelines(ref_timelines, collection)

        for j, ref_timeline in enumerate(ref_timelines):
            print(f'topic {i + 1}/{n_topics}: {topic}, '
                  f'ref timeline {j + 1}/{n_ref}')

            # Leave-one-out: the model must not use the topic it is
            # evaluated on.
            tls_model.load(ignored_topics=[collection.name])

            # Restrict the input articles to the (optionally extended) span
            # covered by the reference timeline.
            ref_dates = sorted(ref_timeline.dates_to_summaries)
            start, end = data.get_input_time_span(ref_dates,
                                                  time_span_extension)
            collection.start = start
            collection.end = end
            # utils.plot_date_stats(collection, ref_dates)

            # Output budget: as many dates as the reference timeline, with
            # the reference's average summary length (sentences per date).
            l = len(ref_dates)
            k = data.get_average_summary_length(ref_timeline)

            pred_timeline_ = tls_model.predict(
                collection,
                max_dates=l,
                max_summary_sents=k,
                ref_tl=ref_timeline  # only oracles need this
            )

            print('*** PREDICTED ***')
            utils.print_tl(pred_timeline_)
            print('timeline done')

            pred_timeline = TilseTimeline(pred_timeline_.date_to_summaries)
            sys_len = len(pred_timeline.get_dates())
            ground_truth = TilseGroundTruth([ref_timeline])

            rouge_scores = get_scores(metric, pred_timeline, ground_truth,
                                      evaluator)
            date_scores = evaluate_dates(pred_timeline, ground_truth)
            wm_scores = get_wordmover_score(pred_timeline, ground_truth,
                                            word_mover_stop_words,
                                            device='cpu')
            dd_scores = date_dist_scores(pred_timeline, ground_truth)

            print('sys-len:', sys_len, 'gold-len:', l, 'gold-k:', k)
            print('Alignment-based ROUGE:')
            pprint(rouge_scores)
            print('Date selection:')
            pprint(date_scores)
            pprint(dd_scores)
            print('WordMover scores:')
            pprint(wm_scores)
            print('-' * 100)

            results.append((rouge_scores, date_scores, wm_scores, dd_scores,
                            pred_timeline_.to_dict()))

            print('Running average:')
            print(get_average_results(results))
            print()

    avg_results = get_average_results(results)
    print('Average results:')
    pprint(avg_results)
    output = {
        'average': avg_results,
        'results': results,
    }
    utils.write_json(output, result_path)
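# For reference, a minimal driver for evaluate() might look like the sketch
# below. NOTE: `load_dataset` and `MyTlsModel` are hypothetical placeholder
# names, not confirmed project API; substitute the real dataset loader and a
# model object exposing load(ignored_topics=...) and predict(...).
#
# def main():
#     dataset = load_dataset('datasets/t17')      # placeholder loader
#     model = MyTlsModel()                        # placeholder TLS model
#     evaluate(model, dataset, 'results/t17.json',
#              trunc_timelines=False,
#              time_span_extension=7,
#              word_mover_stop_words='nltk')
#
# if __name__ == '__main__':
#     main()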
from pprint import pprint

from tilse.data.timelines import Timeline as TilseTimeline
from tilse.data.timelines import GroundTruth as TilseGroundTruth
from tilse.evaluation import rouge

# Variant of evaluate() for clustering-based models whose predict() also
# returns the number of clusters built for the topic; the average cluster
# count across topics is added to the result file.


def evaluate(tls_model,
             dataset,
             result_path,
             trunc_timelines=False,
             time_span_extension=0):
    results = []
    metric = 'align_date_content_costs_many_to_one'
    evaluator = rouge.TimelineRougeEvaluator(measures=["rouge_1", "rouge_2"])
    n_topics = len(dataset.collections)
    total_clusters = 0

    for i, collection in enumerate(dataset.collections):
        ref_timelines = [
            TilseTimeline(tl.date_to_summaries) for tl in collection.timelines
        ]
        topic = collection.name
        n_ref = len(ref_timelines)

        # Timeline truncation is only used for the Entities dataset.
        if trunc_timelines:
            ref_timelines = data.truncate_timelines(ref_timelines, collection)

        for j, ref_timeline in enumerate(ref_timelines):
            print(f'topic {i + 1}/{n_topics}: {topic}, '
                  f'ref timeline {j + 1}/{n_ref}')

            # Leave-one-out: ignore the topic under evaluation when loading.
            tls_model.load(ignored_topics=[collection.name])

            # Restrict the input articles to the (optionally extended) span
            # covered by the reference timeline.
            ref_dates = sorted(ref_timeline.dates_to_summaries)
            start, end = data.get_input_time_span(ref_dates,
                                                  time_span_extension)
            collection.start = start
            collection.end = end
            print(f'name = {topic} start = {start} end = {end}')
            # utils.plot_date_stats(collection, ref_dates)

            # Output budget: number of dates and average summary length of
            # the reference timeline.
            l = len(ref_dates)
            k = data.get_average_summary_length(ref_timeline)

            pred_timeline_, n_clusters = tls_model.predict(
                collection,
                max_dates=l,
                max_summary_sents=k,
                ref_tl=ref_timeline  # only oracles need this
            )
            total_clusters += n_clusters

            # print('*** PREDICTED ***')
            # utils.print_tl(pred_timeline_)
            print('timeline done')

            pred_timeline = TilseTimeline(pred_timeline_.date_to_summaries)
            sys_len = len(pred_timeline.get_dates())
            ground_truth = TilseGroundTruth([ref_timeline])

            rouge_scores = get_scores(metric, pred_timeline, ground_truth,
                                      evaluator)
            date_scores = evaluate_dates(pred_timeline, ground_truth)

            print('sys-len:', sys_len, 'gold-len:', l, 'gold-k:', k)
            print('Alignment-based ROUGE:')
            pprint(rouge_scores)
            print('Date selection:')
            pprint(date_scores)
            print('-' * 100)

            results.append(
                (rouge_scores, date_scores, pred_timeline_.to_dict()))

    avg_results = get_average_results(results)
    print('Average results:')
    pprint(avg_results)
    output = {
        'average_clusters': total_clusters / n_topics,
        'average': avg_results,
        'results': results,
    }
    utils.write_json(output, result_path)
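# Sketch of inspecting the written result file afterwards. This assumes
# utils.write_json emits plain JSON; the key names below match what this
# variant stores ('average_clusters', 'average', 'results').

import json


def summarize_results(path='results/t17.json'):
    with open(path) as f:
        output = json.load(f)
    print('avg clusters per topic:', output['average_clusters'])
    print('average metrics:')
    print(output['average'])
    # Each entry in output['results'] is one (rouge_scores, date_scores,
    # timeline-dict) triple, serialized as a JSON list.
    print('n evaluated timelines:', len(output['results']))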