def _compute_average(self):
    """Macro-average all per-topic scores and store them under 'average_score'."""
    topics = sorted(list(self.mapping.keys()))
    modes = [k for k in self.mapping[topics[0]]]

    self.mapping["average_score"] = OrderedDict()

    for mode in modes:
        self.mapping["average_score"][mode] = {}

        for measure in self.mapping[topics[0]][mode]:
            self.mapping["average_score"][mode][measure] = {}

            mode_scores = [self.mapping[t][mode][measure] for t in topics]

            self.mapping["average_score"][mode][measure]["precision"] = \
                sum([s["precision"] for s in mode_scores]) / len(mode_scores)
            self.mapping["average_score"][mode][measure]["recall"] = \
                sum([s["recall"] for s in mode_scores]) / len(mode_scores)

            # The averaged F-score is recomputed from the averaged precision
            # and recall rather than averaged directly.
            self.mapping["average_score"][mode][measure]["f_score"] = util.get_f_score(
                self.mapping["average_score"][mode][measure]["precision"],
                self.mapping["average_score"][mode][measure]["recall"],
                beta=self.beta
            )

    all_date_scores = [self.date_mapping[t] for t in topics]

    self.date_mapping["average_score"] = OrderedDict()
    self.date_mapping["average_score"]["precision"] = sum(
        [x["precision"] for x in all_date_scores]) / len(topics)
    self.date_mapping["average_score"]["recall"] = sum(
        [x["recall"] for x in all_date_scores]) / len(topics)
    self.date_mapping["average_score"]["f_score"] = util.get_f_score(
        self.date_mapping["average_score"]["precision"],
        self.date_mapping["average_score"]["recall"],
        beta=self.beta
    )
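# Illustrative sketch (not part of the original module; the topic name and
# score values are hypothetical) of the nested layout this method expects and
# extends:
#
#     self.mapping = {
#         "some_topic": {
#             "concat": {
#                 "rouge_1": {"precision": 0.4, "recall": 0.3, "f_score": 0.34},
#             },
#         },
#         ...
#     }
#
# After the call, self.mapping["average_score"][mode][measure] holds the macro
# average over topics: precision and recall are arithmetic means, while the
# F-score is recomputed from those means rather than averaged itself. The same
# scheme is applied to the per-topic date scores in self.date_mapping.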
def evaluate_concat(self, predicted_timeline, reference_timelines):
    """
    Evaluate a predicted timeline w.r.t. a set of reference timelines
    using the 'concat' ROUGE variant.

    This variant first concatenates all daily summaries of the respective
    timelines. The resulting documents are then evaluated using the ROUGE
    measure.

    Args:
        predicted_timeline (data.timelines.Timeline): A timeline.
        reference_timelines (data.timelines.GroundTruth): A ground truth of
            timelines.

    Returns:
        A dict(str, dict(str, float)) object mapping each ROUGE measure in
        `self.measures` to a dict that maps 'precision', 'recall' and
        'f_score' to the corresponding values, e.g.
        {"rouge_1": {"precision": 1.0, "recall": 1.0, "f_score": 1.0}}
    """
    pred_sents = []
    for date in sorted(list(predicted_timeline.get_dates())):
        pred_sents.extend(
            [sent.split() for sent in predicted_timeline[date]])

    ref_sents = {}
    for i, timeline in enumerate(reference_timelines.timelines):
        ref_sents[str(i)] = []
        timeline_dates = sorted(list(timeline.get_dates()))
        for date in timeline_dates:
            ref_sents[str(i)].extend(
                [sent.split() for sent in timeline[date]])

    scores = self._get_rouge_counts(pred_sents, ref_sents)

    output_scores = {}

    for measure in self.measures:
        if scores[measure]["prec_denom"] > 0:
            prec = scores[measure]["prec_num"] / scores[measure]["prec_denom"]
        else:
            prec = 0

        if scores[measure]["rec_denom"] > 0:
            rec = scores[measure]["rec_num"] / scores[measure]["rec_denom"]
        else:
            rec = 0

        output_scores[measure] = {
            "precision": prec,
            "recall": rec,
            "f_score": util.get_f_score(prec, rec, beta=self.beta)
        }

    return output_scores
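# Hedged usage sketch (the evaluator and timeline objects below are assumed to
# exist in the caller's scope; only the method above is taken from the source):
#
#     scores = evaluator.evaluate_concat(predicted_timeline, groundtruth)
#     print(scores["rouge_1"]["f_score"])
#
# Because all daily summaries are concatenated before counting, the concat
# variant is insensitive to which date a sentence was assigned to; date
# placement only matters for variants such as 'agreement' and the per-day
# mappings.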
def _evaluate_per_day_mapping_micro(
        self, predicted_timeline, reference_timelines, compute_costs,
        optimize_assignment):
    """
    Micro-averaged per-day evaluation: predicted and reference dates are
    aligned via `optimize_assignment` over the costs returned by
    `compute_costs`, and matched n-gram counts are weighted by
    1 / (day distance between the aligned dates + 1).
    """
    precision_numerator = collections.defaultdict(list)
    precision_denominator = collections.defaultdict(list)
    recall_numerator = collections.defaultdict(list)
    recall_denominator = collections.defaultdict(list)

    pred_dates = sorted(list(predicted_timeline.get_dates()))
    ref_dates = sorted(list(reference_timelines.get_dates()))

    prec_costs = compute_costs(pred_dates, ref_dates, predicted_timeline,
                               reference_timelines, axis=0)
    rec_costs = compute_costs(pred_dates, ref_dates, predicted_timeline,
                              reference_timelines, axis=1)

    prec_row, prec_col = optimize_assignment(prec_costs)
    rec_row, rec_col = optimize_assignment(rec_costs)

    # precision
    for row, col in zip(prec_row, prec_col):
        pred_date = pred_dates[row]
        ref_date = ref_dates[col]

        temp_groundtruth = reference_timelines[ref_date]
        groundtruth = {}
        for name in temp_groundtruth:
            groundtruth[name] = [sent.split()
                                 for sent in temp_groundtruth[name]]

        scores = self._get_rouge_counts(
            [sent.split() for sent in predicted_timeline[pred_date]],
            groundtruth
        )

        for measure in self.measures:
            precision_numerator[measure].append(
                (1 / (abs(pred_date.toordinal() - ref_date.toordinal()) + 1))
                * scores[measure]["prec_num"])
            precision_denominator[measure].append(
                scores[measure]["prec_denom"])

    # predicted dates that were not matched to any reference date are scored
    # against empty references
    matched_prec = set(list(prec_row))

    for i, pred_date in enumerate(pred_dates):
        if i not in matched_prec:
            scores = self._get_rouge_counts(
                [sent.split() for sent in predicted_timeline[pred_date]],
                {str(j): [[""]]
                 for j, _ in enumerate(reference_timelines.timelines)}
            )

            for measure in self.measures:
                precision_numerator[measure].append(
                    scores[measure]["prec_num"])
                precision_denominator[measure].append(
                    scores[measure]["prec_denom"])

    # recall
    for row, col in zip(rec_row, rec_col):
        pred_date = pred_dates[col]
        ref_date = ref_dates[row]

        temp_groundtruth = reference_timelines[ref_date]
        groundtruth = {}
        for name in temp_groundtruth:
            groundtruth[name] = [sent.split()
                                 for sent in temp_groundtruth[name]]

        scores = self._get_rouge_counts(
            [sent.split() for sent in predicted_timeline[pred_date]],
            groundtruth
        )

        for measure in self.measures:
            recall_numerator[measure].append(
                (1 / (abs(pred_date.toordinal() - ref_date.toordinal()) + 1))
                * scores[measure]["rec_num"])
            recall_denominator[measure].append(scores[measure]["rec_denom"])

    # reference dates that were not matched to any predicted date are scored
    # against an empty prediction
    matched_rec = set(list(rec_row))

    for i, ref_date in enumerate(ref_dates):
        if i not in matched_rec:
            temp_groundtruth = reference_timelines[ref_date]
            groundtruth = {}
            for name in temp_groundtruth:
                groundtruth[name] = [sent.split()
                                     for sent in temp_groundtruth[name]]

            scores = self._get_rouge_counts([[""]], groundtruth)

            for measure in self.measures:
                recall_numerator[measure].append(scores[measure]["rec_num"])
                recall_denominator[measure].append(
                    scores[measure]["rec_denom"])

    output_scores = {}

    for measure in self.measures:
        prec_denom_sum = sum(precision_denominator[measure])
        if prec_denom_sum == 0:
            prec = 0
        else:
            prec = sum(precision_numerator[measure]) / prec_denom_sum

        rec_denom_sum = sum(recall_denominator[measure])
        if rec_denom_sum == 0:
            rec = 0
        else:
            rec = sum(recall_numerator[measure]) / rec_denom_sum

        output_scores[measure] = {
            "precision": prec,
            "recall": rec,
            "f_score": util.get_f_score(prec, rec, beta=self.beta)
        }

    return output_scores
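# Note added for clarity: in the matched pairs above, the ROUGE numerator of a
# predicted/reference date pair is scaled by 1 / (|d_pred - d_ref| + 1), with
# the distance taken in days via date.toordinal(). An exact date match keeps
# its full matched count, a prediction placed three days off keeps only a
# quarter of it, and so on. Unmatched predicted dates still contribute their
# full token count to the precision denominator (scored against empty
# references), and unmatched reference dates contribute to the recall
# denominator, so both spurious and missing dates pull the micro-averaged
# scores down.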
def evaluate_agreement(self, predicted_timeline, reference_timelines):
    """
    Evaluate a predicted timeline w.r.t. a set of reference timelines
    using the 'agreement' ROUGE variant.

    This variant compares the daily summaries of a date if the date
    appears in both the predicted timeline and in one of the reference
    timelines.

    Args:
        predicted_timeline (data.timelines.Timeline): A timeline.
        reference_timelines (data.timelines.GroundTruth): A ground truth of
            timelines.

    Returns:
        A dict(str, dict(str, float)) object mapping each ROUGE measure in
        `self.measures` to a dict that maps 'precision', 'recall' and
        'f_score' to the corresponding values, e.g.
        {"rouge_1": {"precision": 1.0, "recall": 1.0, "f_score": 1.0}}
    """
    precision_numerator = collections.defaultdict(list)
    precision_denominator = collections.defaultdict(list)
    recall_numerator = collections.defaultdict(list)
    recall_denominator = collections.defaultdict(list)

    pred_dates = predicted_timeline.get_dates()
    ref_dates = reference_timelines.get_dates()
    all_dates = pred_dates.union(ref_dates)

    for date in all_dates:
        temp_groundtruth = reference_timelines[date]
        groundtruth = {}
        for name in temp_groundtruth:
            groundtruth[name] = [sent.split()
                                 for sent in temp_groundtruth[name]]

        scores = self._get_rouge_counts(
            [sent.split() for sent in predicted_timeline[date]],
            groundtruth
        )

        for measure in self.measures:
            if date in pred_dates:
                precision_numerator[measure].append(
                    scores[measure]["prec_num"])
                precision_denominator[measure].append(
                    scores[measure]["prec_denom"])
            if date in ref_dates:
                recall_numerator[measure].append(scores[measure]["rec_num"])
                recall_denominator[measure].append(
                    scores[measure]["rec_denom"])

    output_scores = {}

    for measure in self.measures:
        prec_denom_sum = sum(precision_denominator[measure])
        if prec_denom_sum == 0:
            prec = 0
        else:
            prec = sum(precision_numerator[measure]) / prec_denom_sum

        rec_denom_sum = sum(recall_denominator[measure])
        if rec_denom_sum == 0:
            rec = 0
        else:
            rec = sum(recall_numerator[measure]) / rec_denom_sum

        output_scores[measure] = {
            "precision": prec,
            "recall": rec,
            "f_score": util.get_f_score(prec, rec, beta=self.beta)
        }

    return output_scores
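# Clarifying note (not in the original source): the scores are micro-averaged
# over dates. Every date of the predicted timeline adds its ROUGE counts to the
# precision numerator and denominator, every date of the reference timelines
# adds to the recall counts, and the sums are divided only at the end. A date
# that occurs on one side only is therefore scored against what the other side
# returns for that date, presumably an empty daily summary, so missing or
# spurious dates lower recall or precision respectively.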
def test_get_f_score(self):
    self.assertEqual(0, util.get_f_score(0, 1, 1))
    self.assertEqual(1, util.get_f_score(1, 1, 1))
    self.assertAlmostEqual(0.555555556, util.get_f_score(0.5, 1, 0.5))
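# The expected values follow from the F-beta score that get_f_score is assumed
# to compute, with precision P, recall R and weight beta (positional third
# argument above):
#
#     F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
#
# For get_f_score(0.5, 1, 0.5): 1.25 * 0.5 * 1 / (0.25 * 0.5 + 1)
#                             = 0.625 / 1.125 ≈ 0.5555556.
# get_f_score(0, 1, 1) is 0 because the numerator vanishes when P = 0.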
def train(self, corpora, preprocessed_information, timelines,
          topic_to_evaluate):
    """
    Computes per-day ROUGE F1 for each sentence in the corpus for
    `topic_to_evaluate` (this is quite a misuse of the semantics of this
    function).

    Params:
        corpora (dict(str, tilse.data.corpora.Corpus)): A mapping of topic
            names to corresponding corpora.
        preprocessed_information (object): Arbitrary information obtained
            from preprocessing.
        timelines (dict(str, tilse.data.timelines.Groundtruth)): A mapping
            of topic names to corresponding reference timelines.
        topic_to_evaluate (str): The topic to evaluate (must be a key in
            `corpora`). The given topic will not be used during training
            (such that it can serve as evaluation data later).

    Returns:
        A mapping of timeline properties for each of the timelines in
        `timelines[topic_to_evaluate]` to a numpy array of per-day ROUGE-1
        F1 scores for all sentences in the corresponding corpus.
    """
    rouge = RougeReimplementation()

    corpus = corpora[topic_to_evaluate]
    reference_timelines = timelines[topic_to_evaluate]

    rouge_vals = {}

    for tl in reference_timelines.timelines:
        tp = self.get_timeline_properties(tl)
        rouge_vals[tp] = []

        for doc in corpus.docs:
            for sent in doc:
                sent_processed = [[x.content for x in sent]]
                # Score the sentence against the reference's daily summary
                # for the sentence's own date.
                ref_processed = {
                    "0": [s.split() for s in tl[sent.date]]
                }

                rouge_computed = rouge.score_summary(
                    sent_processed, ref_processed)

                if rouge_computed["rouge_1_p_count"] == 0:
                    prec = 0
                else:
                    prec = rouge_computed["rouge_1_h_count"] / \
                           rouge_computed["rouge_1_p_count"]

                if rouge_computed["rouge_1_m_count"] == 0:
                    rec = 0
                else:
                    rec = rouge_computed["rouge_1_h_count"] / \
                          rouge_computed["rouge_1_m_count"]

                f1 = get_f_score(prec, rec)

                rouge_vals[tp].append(f1)

        rouge_vals[tp] = numpy.array(rouge_vals[tp])

    return rouge_vals
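# Illustrative sketch of the returned mapping (shapes only; the key type is
# whatever self.get_timeline_properties returns, and the values are
# hypothetical): one entry per reference timeline, whose value holds one
# ROUGE-1 F1 score per sentence in the corpus:
#
#     {
#         <properties of reference timeline 1>:
#             numpy.array([0.0, 0.12, ..., 0.31]),
#         ...
#     }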
def train(self, corpora, preprocessed_information, reference_timelines,
          timeline_to_evaluate):
    """
    Trains the model. For details on training, see the docstring of this
    class.

    Params:
        corpora (dict(str, tilse.data.corpora.Corpus)): A mapping of topic
            names to corresponding corpora.
        preprocessed_information (object): Arbitrary information obtained
            from preprocessing.
        reference_timelines (dict(str, tilse.data.timelines.Groundtruth)):
            A mapping of topic names to corresponding reference timelines.
        timeline_to_evaluate (str): The topic to evaluate (must be a key in
            `corpora`). The given topic will not be used during training
            (such that it can serve as evaluation data later).

    Returns:
        Nothing, `self.model` is updated.
    """
    rouge = RougeReimplementation()

    features = []
    f1_scores = []

    for t in corpora:
        # Hold out the evaluation topic.
        if t == timeline_to_evaluate:
            continue

        corpus = corpora[t]
        sum_tfidf, avg_tfidf = preprocessed_information[t]

        i = 0
        for doc in corpus:
            for sent in doc:
                sent_processed = [[x.content for x in sent]]

                ref_temp = reference_timelines[t][sent.date]
                ref_processed = {}
                for k, sents in ref_temp.items():
                    ref_processed[k] = [s.split() for s in sents]

                rouge_computed = rouge.score_summary(
                    sent_processed, ref_processed)

                if rouge_computed["rouge_1_p_count"] == 0:
                    prec = 0
                else:
                    prec = rouge_computed["rouge_1_h_count"] / \
                           rouge_computed["rouge_1_p_count"]

                if rouge_computed["rouge_1_m_count"] == 0:
                    rec = 0
                else:
                    rec = rouge_computed["rouge_1_h_count"] / \
                          rouge_computed["rouge_1_m_count"]

                f1 = util.get_f_score(prec, rec)

                features.append(
                    Regression._compute_features_for_sent(
                        sent, i, sum_tfidf, avg_tfidf))
                f1_scores.append(f1)

                i += 1

    vectorized = self.vectorizer.fit_transform(features)
    self.model.fit(vectorized, f1_scores)
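# Hedged summary of the training setup (the concrete types of self.vectorizer
# and self.model are assumptions, e.g. a scikit-learn DictVectorizer and a
# linear regressor; they are not shown in the source above): each training
# sentence from the non-held-out topics is paired with its ROUGE-1 F1 against
# the reference daily summaries of its date, featurized via
# Regression._compute_features_for_sent, vectorized, and used to fit a
# regression model that can later score sentences of the held-out topic.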