def evaluate_instances(instances: List[EvalInstance], metrics: List[Metric]) -> Tuple[MetricsDict, List[Metrics]]:
    macro = MetricsDict()
    micro_list = get_initial_micro_list(instances)

    for metric in metrics:
        # Prepare the input arguments
        summary_args = []
        for field in metric.required_summary_fields:
            summary_args.append([instance.fields[field].to_input() for instance in instances])

        context_args = []
        for field in metric.required_context_fields:
            context_args.append([instance.fields[field].to_input() for instance in instances])

        # Score all the summaries
        this_macro, this_micro_list = metric.evaluate(*summary_args, *context_args)

        # Update the global metrics dictionaries
        macro.update(this_macro)
        for micro, this_micro in zip(micro_list, this_micro_list):
            micro.metrics.update(this_micro)

    return macro, micro_list
def _run(self,
         summary_index_to_scus: List[Set[int]],
         reference_index_to_scus_list: List[List[Set[int]]],
         matches_list: List[List[Tuple[int, int, float]]],
         precision_weights: List[float],
         recall_weights_list: List[List[float]]):
    standard_counts = MetricsDict({'weight': 0, 'summary_weight': 0, 'reference_weight': 0, 'scu_weight': 0, 'non_scu_weight': 0})
    scu_counts = MetricsDict({'weight': 0, 'summary_weight': 0, 'reference_weight': 0})
    non_scu_counts = MetricsDict({'weight': 0, 'summary_weight': 0, 'reference_weight': 0})

    for matches, reference_index_to_scus, recall_weights in zip(matches_list, reference_index_to_scus_list, recall_weights_list):
        # Filter the SCUs to just those which the summary and reference have in common
        valid_scus = self._get_scu_intersection(summary_index_to_scus, reference_index_to_scus)
        this_summary_index_to_scus = self._filter_index_to_scus(summary_index_to_scus, valid_scus)
        this_reference_index_to_scus = self._filter_index_to_scus(reference_index_to_scus, valid_scus)

        standard_counts += self.backend.calculate_standard_metric(this_summary_index_to_scus, this_reference_index_to_scus, precision_weights, recall_weights, matches)
        scu_counts += self.backend.calculate_scu_metric(this_summary_index_to_scus, this_reference_index_to_scus, precision_weights, recall_weights, matches)
        non_scu_counts += self.backend.calculate_non_scu_metric(this_summary_index_to_scus, this_reference_index_to_scus, precision_weights, recall_weights, matches)

    self._add_pr(standard_counts)
    self._add_pr(scu_counts)
    self._add_pr(non_scu_counts)

    return MetricsDict({
        f'{self.name}-standard': standard_counts,
        f'{self.name}-scu': scu_counts,
        f'{self.name}-non-scu': non_scu_counts
    })
def test_init_with_metrics_dict(self):
    a = MetricsDict({'k1': 1, 'k2': {'k3': [1, 2, 3]}})
    b = MetricsDict(a)
    b['k2']['k3'].append(4)
    assert a == {'k1': 1, 'k2': {'k3': [1, 2, 3]}}
    assert b == {'k1': 1, 'k2': {'k3': [1, 2, 3, 4]}}
def test_add(self):
    a = MetricsDict({'k1': 1, 'k2': {'k3': 4}})
    b = MetricsDict({'k1': 2, 'k2': {'k3': 5}})
    c = MetricsDict({'k1': 3, 'k2': {'k3': 6}})

    assert a + b == {'k1': 3, 'k2': {'k3': 9}}
    assert a == {'k1': 1, 'k2': {'k3': 4}}
    assert b == {'k1': 2, 'k2': {'k3': 5}}

    assert sum([a, b, c]) == {'k1': 6, 'k2': {'k3': 15}}
    assert a == {'k1': 1, 'k2': {'k3': 4}}
    assert b == {'k1': 2, 'k2': {'k3': 5}}
    assert c == {'k1': 3, 'k2': {'k3': 6}}
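# The following is a minimal, hypothetical sketch of a nested metrics dictionary
# consistent with the behavior exercised by the two tests above (copy-on-init,
# recursive element-wise addition, and support for sum()). It is not the
# library's actual MetricsDict implementation, which may differ.
import copy
from typing import Any, Dict


class SimpleMetricsDict(dict):
    def __init__(self, initial: Dict = None) -> None:
        super().__init__()
        if initial:
            for key, value in initial.items():
                self[key] = value  # Goes through __setitem__, which copies nested values

    def __setitem__(self, key: Any, value: Any) -> None:
        # Recursively convert plain dicts so nested values behave like metrics dicts,
        # and deep-copy everything else so the source object is never aliased
        if isinstance(value, dict):
            value = SimpleMetricsDict(value)
        else:
            value = copy.deepcopy(value)
        dict.__setitem__(self, key, value)

    def __add__(self, other: Dict) -> 'SimpleMetricsDict':
        # Element-wise addition over the union of keys
        result = SimpleMetricsDict(self)
        for key, value in other.items():
            result[key] = result[key] + value if key in result else value
        return result

    def __radd__(self, other: Any) -> 'SimpleMetricsDict':
        # Allows sum([a, b, c]), which starts the accumulation from 0
        return SimpleMetricsDict(self) if other == 0 else self.__add__(other)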
def _load_generic_scores(input_file: str):
    data = json.load(open(input_file, 'r'))
    instances = []
    metrics_list = []
    documents = {}
    for i, instance in enumerate(data):
        document = instance['text'].strip()
        summary = instance['summary'].strip()
        scores = instance['scores']
        summarizer_id = str(i)

        if document not in documents:
            documents[document] = str(len(documents))
        instance_id = documents[document]

        instances.append({
            'instance_id': instance_id,
            'summarizer_id': summarizer_id,
            'summarizer_type': 'peer',
            'summary': {'text': summary},
            'document': {'text': document}
        })
        metrics_list.append(Metrics(instance_id, summarizer_id, 'peer', MetricsDict({'generic_quality': scores})))

    return instances, metrics_list
def select_matches(self, summary_tokens: List[Token], reference_tokens: List[Token], matches: List[Tuple[int, int]], intersection: int):
    common_matches = []
    for i, j in matches:
        summary_token = summary_tokens[i]
        reference_token = reference_tokens[j]
        if self.is_match(summary_token, reference_token):
            common_matches.append((i, j, 1.0))

    num_matches = calculate_maximum_matching(common_matches)
    num_summary_tokens = sum(1 for token in summary_tokens if self.is_candidate(token))
    num_reference_tokens = sum(1 for token in reference_tokens if self.is_candidate(token))

    precision = num_matches / num_summary_tokens * 100 if num_summary_tokens > 0 else 0
    recall = num_matches / num_reference_tokens * 100 if num_reference_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    contribution = num_matches / intersection * 100 if len(matches) > 0 else 0

    return MetricsDict({
        self.name: {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'contribution': contribution
        }
    })
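# calculate_maximum_matching is used throughout this code to score only the best
# one-to-one alignment between summary and reference tokens. The helper below is
# a hypothetical sketch of how a weighted maximum bipartite matching could be
# computed with scipy; the repository's actual implementation may differ.
import numpy as np
from scipy.optimize import linear_sum_assignment
from typing import List, Tuple


def maximum_matching_weight(matches: List[Tuple[int, int, float]]) -> float:
    """Returns the total weight of a maximum-weight one-to-one matching."""
    if not matches:
        return 0.0
    num_rows = max(i for i, _, _ in matches) + 1
    num_cols = max(j for _, j, _ in matches) + 1
    weights = np.zeros((num_rows, num_cols))
    for i, j, weight in matches:
        weights[i, j] = weight
    # The assignment maximizes the sum of the selected weights; unmatched pairs contribute 0
    row_indices, col_indices = linear_sum_assignment(weights, maximize=True)
    return float(weights[row_indices, col_indices].sum())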
def write_table(metrics1: MetricsDict,
                metrics2: MetricsDict,
                difference: MetricsDict,
                rel_difference: MetricsDict,
                output_path: str) -> None:
    # Put all of the data into tuples and sort by the relative difference
    data = []
    for key in rel_difference.keys():
        if 'f1' in rel_difference[key]:
            value1 = metrics1[key]['f1']
            value2 = metrics2[key]['f1']
            data.append((key, value1, value2, difference[key]['f1'], rel_difference[key]['f1']))
    data.sort(key=lambda t: -t[4])

    # Prepare the lines for writing
    lines = []
    for category, value1, value2, diff, rel_diff in data:
        lines.append(' & '.join([
            category,
            f'{value1:.1f}',
            f'{value2:.1f}',
            f'{diff:.1f}',
            f'{rel_diff:.1f}'
        ]) + ' \\\\')

    dirname = os.path.dirname(output_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(output_path, 'w') as out:
        out.write('\n'.join(lines))
def calculate_scu_metric(self,
                         summary_index_to_scus: List[Set[int]],
                         reference_index_to_scus: List[Set[int]],
                         summary_weights: List[float],
                         reference_weights: List[float],
                         matches: List[Tuple[int, int, float]]) -> MetricsDict:
    summary_scu_to_indices = self._get_scu_to_indices(summary_index_to_scus)
    reference_scu_to_indices = self._get_scu_to_indices(reference_index_to_scus)

    all_matches = []
    for scu in summary_scu_to_indices.keys():
        summary_indices = summary_scu_to_indices[scu]
        reference_indices = reference_scu_to_indices[scu]
        scu_matches = self._get_matches(summary_indices, reference_indices, matches)
        all_matches.extend(scu_matches)

    intersection = calculate_maximum_matching(all_matches)
    return MetricsDict({
        'weight': intersection,
        'summary_weight': self._sum_scu_token_weight(summary_index_to_scus, summary_weights),
        'reference_weight': self._sum_scu_token_weight(reference_index_to_scus, reference_weights)
    })
def _parse_output_file(self, file_path: str) -> List[List[MetricsDict]]:
    metrics_dicts = defaultdict(dict)
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            # Header
            if i == 0:
                continue
            columns = line.split('\t')
            if len(columns) != 5:
                raise Exception(f'Expected 5 columns: {line}')
            instance_index = int(columns[0])
            summarizer_index = int(columns[1])
            metrics_dicts[instance_index][summarizer_index] = MetricsDict({
                'AutoSummENG': float(columns[2]),
                'MeMoG': float(columns[3]),
                'NPowER': float(columns[4])
            })

    metrics_lists = []
    for i in range(len(metrics_dicts)):
        metrics_lists.append([])
        for j in range(len(metrics_dicts[i])):
            metrics_lists[-1].append(metrics_dicts[i][j])
    return metrics_lists
def select_matches(self,
                   summary_tokens: List[Token],
                   reference_tokens: List[Token],
                   matches: List[Tuple[int, int, float]],
                   weights: List[float],
                   metric: str,
                   backend: Backend) -> Tuple[List[Tuple[int, int, float]], MetricsDict]:
    common_matches = []
    for i, j, weight in matches:
        summary_token = summary_tokens[i]
        reference_token = reference_tokens[j]
        if self.is_match(summary_token, reference_token):
            common_matches.append((i, j, weight))

    norm_weight = 0
    if metric == 'precision':
        tokens = summary_tokens
    else:
        tokens = reference_tokens
    for i, token in enumerate(tokens):
        if self.is_candidate(token):
            norm_weight += weights[i]

    matching_weight = backend.get_total_weight(common_matches)
    metrics = MetricsDict({
        self.name: {
            f'{metric}_weight': matching_weight,
            f'{metric}_norm_weight': norm_weight,
        }
    })
    return common_matches, metrics
def select_matches(self,
                   summary_tokens: List[Token],
                   reference_tokens: List[Token],
                   matches: List[Tuple[int, int]],
                   weights: List[float],
                   metric: str,
                   backend: Backend) -> Tuple[List[Tuple[int, int]], MetricsDict]:
    summary_tuples = self.get_tuples(summary_tokens)
    reference_tuples = self.get_tuples(reference_tokens)

    if isinstance(backend, BertScoreBackend):
        common_matches, total_weight = self._select_matches_bert(summary_tuples, reference_tuples, matches)
    else:
        common_matches, total_weight = self._select_matches_rouge(summary_tuples, reference_tuples, matches)

    if metric == 'precision':
        tuples = summary_tuples
    else:
        tuples = reference_tuples

    norm_weight = 0
    norm_indices = set()
    for tup in tuples:
        for i in tup.values():
            norm_indices.add(i)
    for i in norm_indices:
        norm_weight += weights[i]

    return common_matches, MetricsDict({
        self.name: {
            f'{metric}_weight': total_weight,
            f'{metric}_norm_weight': norm_weight,
        }
    })
def test_pyramid_score(self):
    # This is a regression test, not necessarily a test for correctness
    pyramids = {pyramid.instance_id: pyramid for pyramid in JsonlReader(_pyramid_file_path, Pyramid).read()}
    annotations = JsonlReader(_annotation_file_path, PyramidAnnotation).read()
    annotation_pyramids = [pyramids[annotation.instance_id] for annotation in annotations]

    metric = PyramidScore()
    actual_output = metric.score_all(annotations, annotation_pyramids)[:5]
    expected_output = [
        {'modified_pyramid_score': 0.2413793103448276},
        {'modified_pyramid_score': 0.0},
        {'modified_pyramid_score': 0.06896551724137931},
        {'modified_pyramid_score': 0.034482758620689655},
        {'modified_pyramid_score': 0.1724137931034483}
    ]
    for i, (expected, actual) in enumerate(zip(expected_output, actual_output)):
        assert actual.approx_equal(MetricsDict(expected), abs=1e-4), \
            f'Instance {i} not equal. Expected {expected}, actual {actual}'
def _run(self, summary: SummaryType, annotation: PyramidAnnotation, pyramid: Pyramid) -> MetricsDict:
    summary_all_scus_to_offsets = self._get_summary_scu_to_offsets(annotation)

    standard_counts = MetricsDict({'intersection': 0, 'num_summary_tokens': 0, 'num_reference_tokens': 0, 'num_scu_matches': 0, 'num_non_scu_matches': 0})
    scu_counts = MetricsDict({'intersection': 0, 'num_summary_tokens': 0, 'num_reference_tokens': 0})
    non_scu_counts = MetricsDict({'intersection': 0, 'num_summary_tokens': 0, 'num_reference_tokens': 0})

    total_common_scus = 0
    for i, reference in enumerate(pyramid.summaries):
        reference_all_scus_to_offsets = self._get_reference_scu_to_offsets(pyramid, i)
        valid_scus = self._get_scu_intersection(annotation, pyramid, i)
        total_common_scus += len(valid_scus)

        # Take only the SCUs which are common between the summary and reference
        summary_scus_to_offsets = self._filter_scu_to_offsets(summary_all_scus_to_offsets, valid_scus)
        reference_scus_to_offsets = self._filter_scu_to_offsets(reference_all_scus_to_offsets, valid_scus)

        # Tokenize each
        summary_tokens, summary_index_to_scus = self._tokenize(annotation.summary, summary_scus_to_offsets)
        reference_tokens, reference_index_to_scus = self._tokenize(reference, reference_scus_to_offsets)

        # Compute ROUGE
        standard_counts += self._compute_standard_rouge(summary_tokens, summary_index_to_scus, reference_tokens, reference_index_to_scus)
        scu_counts += self._compute_scu_rouge(summary_tokens, summary_index_to_scus, reference_tokens, reference_index_to_scus)
        non_scu_counts += self._compute_non_scu_rouge(summary_tokens, summary_index_to_scus, reference_tokens, reference_index_to_scus)

    avg_common_scus = total_common_scus / len(pyramid.summaries)
    self._add_pr(standard_counts)
    self._add_pr(scu_counts)
    self._add_pr(non_scu_counts)

    return MetricsDict({
        'common_scus': avg_common_scus,
        'standard-rouge': standard_counts,
        'scu-rouge': scu_counts,
        'non-scu-rouge': non_scu_counts,
    })
def _score(self, answered_questions_list: List[List[AnsweredQuestion]]) -> MetricsDict:
    # Average over references
    metrics = []
    for answered_questions in answered_questions_list:
        metrics.append(self._score_reference(answered_questions))
    final_metrics = sum(metrics) / len(metrics)
    return MetricsDict({'qa-eval': final_metrics})
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]],
                    **kwargs) -> List[List[MetricsDict]]:
    summaries_list = self._flatten_summaries(summaries_list)
    references_list = self._flatten_summaries(references_list)

    logger.info('Serializing the summaries and references to a file')
    num_summaries = 0
    with TemporaryDirectory() as temp_dir:
        input_file = f'{temp_dir}/input.jsonl'
        output_file = f'{temp_dir}/output.jsonl'

        with JsonlWriter(input_file) as out:
            for summaries, references in zip(summaries_list, references_list):
                for summary in summaries:
                    out.write({'summary': summary, 'references': references})
                    num_summaries += 1
        logger.info(f'Wrote {num_summaries} (summary, references) pairs')

        commands = [f'cd {self.s3_root}/S3']
        if self.environment_name is not None:
            commands.append(f'source {os.environ["CONDA_INIT"]}')
            commands.append(f'conda activate {self.environment_name}')
        commands.append(f'python2.7 run_batch.py {input_file} {output_file} {self.embeddings_file} {self.model_dir}')
        command = ' && '.join(commands)

        logger.info(f'Running command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        process.communicate()

        scores = JsonlReader(output_file).read()
        assert len(scores) == num_summaries

        metrics_list = []
        index = 0
        for summaries in summaries_list:
            metrics_list.append([])
            for _ in summaries:
                metrics_list[-1].append(MetricsDict({
                    's3': {
                        'pyr': scores[index]['pyr'],
                        'resp': scores[index]['resp'],
                    }
                }))
                index += 1
        return metrics_list
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    summaries_list = [[self.preprocess_summary(summary) for summary in summaries] for summaries in summaries_list]
    references_list = [[self.preprocess_summary(reference) for reference in references] for references in references_list]

    metrics_lists = []
    for summaries, references in zip(summaries_list, references_list):
        metrics_list = [MetricsDict() for _ in summaries]

        for n in self.ngram_orders:
            reference_ngrams_list = [self._count_ngrams(reference, n) for reference in references]
            for i, summary in enumerate(summaries):
                total_reference_count = 0
                total_summary_count = 0
                total_intersection = 0

                summary_ngrams = self._count_ngrams(summary, n)
                for reference_ngrams in reference_ngrams_list:
                    reference_total, summary_total, intersection = self._calculate_intersection(reference_ngrams, summary_ngrams)
                    total_reference_count += reference_total
                    total_summary_count += summary_total
                    total_intersection += intersection

                precision, recall, f1 = self._calculate_pr_f1(total_reference_count, total_summary_count, total_intersection)
                metrics_list[i][f'python-rouge-{n}'] = {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                }

        if self.compute_rouge_l:
            for i, summary in enumerate(summaries):
                precision, recall, f1 = self._calculate_rouge_l(references, summary)
                metrics_list[i]['python-rouge-l'] = {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                }

        metrics_lists.append(metrics_list)

    return metrics_lists
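# The helpers referenced above (_count_ngrams, _calculate_intersection,
# _calculate_pr_f1) are not shown here. The functions below are a hypothetical
# sketch of the kind of n-gram counting and precision/recall/F1 computation they
# could perform; the exact tokenization and scaling in the library may differ.
from collections import Counter
from typing import List, Tuple


def count_ngrams(tokens: List[str], n: int) -> Counter:
    """Counts the n-grams of the given order in a tokenized summary."""
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def calculate_pr_f1(reference_count: int, summary_count: int, intersection: int) -> Tuple[float, float, float]:
    """Computes precision, recall, and F1 (scaled to 0-100) from n-gram counts."""
    precision = intersection / summary_count * 100 if summary_count > 0 else 0.0
    recall = intersection / reference_count * 100 if reference_count > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1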
def main(args):
    metrics1 = MetricsDict(json.loads(open(args.metrics_json1, 'r').read())['metrics'])
    metrics2 = MetricsDict(json.loads(open(args.metrics_json2, 'r').read())['metrics'])

    # If you calculate the differences based on the true values and then do rounding, the results
    # look a little weird in the table because the rounded values no longer make sense.
    # For example, 26.61 - 21.95 = 4.66, but the table would show 26.6 - 22.0 = 4.7. Therefore, we
    # first round the numbers, then calculate the differences. This shouldn't have any major
    # impact on the results, but it will avoid any confusion for the reader.
    round_metrics(metrics1, 1)
    round_metrics(metrics2, 1)

    difference = metrics2 - metrics1
    rel_difference = calculate_relative_difference(metrics1, difference)

    write_table(metrics1, metrics2, difference, rel_difference, args.output_tex)
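# round_metrics and calculate_relative_difference are helpers not shown here.
# The sketch below shows one plausible implementation: recursive rounding of
# every numeric value, and the relative difference expressed as a percentage of
# the baseline metrics. The repository's actual helpers may differ.
def round_metrics_sketch(metrics: MetricsDict, ndigits: int) -> None:
    """Rounds every numeric value in the (possibly nested) metrics in place."""
    for key, value in metrics.items():
        if isinstance(value, dict):
            round_metrics_sketch(value, ndigits)
        elif isinstance(value, (int, float)):
            metrics[key] = round(value, ndigits)


def calculate_relative_difference_sketch(baseline: MetricsDict, difference: MetricsDict) -> MetricsDict:
    """Computes the difference as a percentage of the baseline value, recursively."""
    result = MetricsDict()
    for key, diff in difference.items():
        if isinstance(diff, dict):
            result[key] = calculate_relative_difference_sketch(baseline[key], diff)
        else:
            result[key] = diff / baseline[key] * 100 if baseline[key] != 0 else 0.0
    return result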
def run(self, args):
    params = Params.from_file(args.config, args.overrides)
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    metrics = load_metrics(params)

    instances = dataset_reader.read()
    summaries = [instance.summary for instance in instances]

    macro = MetricsDict()
    micro_list = get_initial_micro_list(instances)

    for metric in metrics:
        # Prepare the extra input arguments
        eval_args = []
        for field in metric.required_fields:
            eval_args.append([instance.fields[field] for instance in instances])

        # Score all the summaries
        this_macro, this_micro_list = metric.evaluate(summaries, *eval_args)

        # Update the global metrics dictionaries
        macro.update(this_macro)
        for micro, this_micro in zip(micro_list, this_micro_list):
            micro.metrics.update(this_micro)

    dirname = os.path.dirname(args.macro_output_json)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    serialized_macro = jsons.dumps({'metrics': macro}, jdkwargs={'indent': 2})
    with open(args.macro_output_json, 'w') as out:
        out.write(serialized_macro)
    if not args.silent:
        print(serialized_macro)

    with JsonlWriter(args.micro_output_jsonl) as out:
        for metrics_dict in micro_list:
            out.write(metrics_dict)
def _compute_standard_rouge(self,
                            summary_tokens: List[str],
                            summary_index_to_scus: List[Set[int]],
                            reference_tokens: List[str],
                            reference_index_to_scus: List[Set[int]]) -> MetricsDict:
    # This is the standard ROUGE calculation except the SCU-based matches are
    # given priority over non-SCU matches to maximize the percentage of the
    # ROUGE score the SCU matches contribute.
    summary_scu_to_indices = self._get_scu_to_indices(summary_index_to_scus)
    reference_scu_to_indices = self._get_scu_to_indices(reference_index_to_scus)

    all_matches = []
    for scu in summary_scu_to_indices.keys():
        summary_indices = summary_scu_to_indices[scu]
        reference_indices = reference_scu_to_indices[scu]
        matches = self._get_matches(summary_tokens, summary_indices, reference_tokens, reference_indices)
        all_matches.extend(matches)
    num_scu_matches, matching = calculate_maximum_matching(all_matches, return_matching=True)

    # Mark which tokens were matched and are therefore no longer eligible
    summary_matches = [False] * len(summary_tokens)
    references_matches = [False] * len(reference_tokens)
    for i, j in matching:
        summary_matches[i] = True
        references_matches[j] = True

    summary_indices = [i for i in range(len(summary_tokens)) if not summary_matches[i]]
    reference_indices = [i for i in range(len(reference_tokens)) if not references_matches[i]]
    matches = self._get_matches(summary_tokens, summary_indices, reference_tokens, reference_indices)
    num_non_scu_matches = calculate_maximum_matching(matches)

    intersection = num_scu_matches + num_non_scu_matches
    return MetricsDict({
        'intersection': intersection,
        'num_summary_tokens': len(summary_tokens),
        'num_reference_tokens': len(reference_tokens),
        'num_scu_matches': num_scu_matches,
        'num_non_scu_matches': num_non_scu_matches,
    })
def _assert_expected_output(self, metric: Metric, expected_output: List[MetricsDict], *args):
    """Ensures that the output from `score_all` is equal to the `expected_output`."""
    assert len(self.summaries) == len(expected_output)
    actual_output = metric.score_all(self.summaries, *args)
    assert len(actual_output) == len(expected_output)
    for i, (expected, actual) in enumerate(zip(expected_output, actual_output)):
        assert actual.approx_equal(MetricsDict(expected), abs=1e-4), \
            f'Instance {i} not equal. Expected {expected}, actual {actual}'
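# approx_equal compares two metrics dictionaries with a numeric tolerance. The
# function below is a hypothetical sketch of such a recursive check built on
# pytest.approx; the library's actual method may differ.
from pytest import approx


def approx_equal_dicts(actual: dict, expected: dict, abs: float = 1e-4) -> bool:
    """Returns True if the two nested dictionaries match within the absolute tolerance."""
    if actual.keys() != expected.keys():
        return False
    for key, expected_value in expected.items():
        actual_value = actual[key]
        if isinstance(expected_value, dict):
            # Recurse into nested metrics
            if not approx_equal_dicts(actual_value, expected_value, abs=abs):
                return False
        elif actual_value != approx(expected_value, abs=abs):
            return False
    return True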
def _run(self,
         summaries_list: List[List[SummaryType]],
         references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    summaries_list = [[flatten(summary) for summary in summaries] for summaries in summaries_list]
    references_list = [[flatten(reference) for reference in references] for references in references_list]

    # Create the candidate and reference lists for passing to the scoring function
    input_candidates = []
    input_references = []
    empty_inputs = set()
    for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
        for j, summary in enumerate(summaries):
            if len(summary) == 0:
                empty_inputs.add((i, j))
            else:
                input_candidates.append(summary)
                input_references.append(references)

    # Score the summaries
    precisions, recalls, f1s = bert_score.score(
        input_candidates,
        input_references,
        model_type=self.model_type,
        num_layers=self.num_layers,
        idf=False,
        nthreads=self.nthreads,
        batch_size=self.batch_size,
        lang=self.lang,
        verbose=self.verbose
    )

    # Remap the scores to the summaries
    index = 0
    metrics_lists = []
    for i, summaries in enumerate(summaries_list):
        metrics_lists.append([])
        for j, summary in enumerate(summaries):
            if (i, j) in empty_inputs:
                precision, recall, f1 = 0.0, 0.0, 0.0
            else:
                precision = precisions[index].item()
                recall = recalls[index].item()
                f1 = f1s[index].item()
                index += 1
            metrics_lists[-1].append(MetricsDict({
                'bertscore': {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                }
            }))

    return metrics_lists
def _run(self,
         summaries_list: List[List[SummaryType]],
         references_list: List[List[SummaryType]]) -> List[List[MetricsDict]]:
    summaries_list = self._flatten_summaries(summaries_list)
    references_list = self._flatten_summaries(references_list)

    unique_summaries = self._get_unique_summaries(summaries_list)
    unique_references = self._get_unique_summaries(references_list)
    idf_dict_summaries = get_idf_dict(unique_summaries)
    idf_dict_references = get_idf_dict(unique_references)

    # Prepare the inputs into flat lists for faster processing. The
    # indices will keep track of which item the score belongs to
    indices = []
    input_summaries = []
    input_references = []
    for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
        for j, summary in enumerate(summaries):
            for reference in references:
                indices.append((i, j))
                input_summaries.append(summary)
                input_references.append(reference)

    # Score all of the data
    scores = word_mover_score(input_references, input_summaries,
                              idf_dict_references, idf_dict_summaries,
                              self.stopwords, n_gram=1, remove_subwords=True,
                              batch_size=48)

    # Compute the mean over the references
    indices_to_scores = defaultdict(list)
    for pair, score in zip(indices, scores):
        indices_to_scores[pair].append(score)
    indices_to_score = {}
    for pair, scores in indices_to_scores.items():
        indices_to_score[pair] = np.mean(scores)

    # Put back into lists
    metrics_dict_lists = []
    for i in range(len(summaries_list)):
        metrics_dict_lists.append([])
        for j in range(len(summaries_list[i])):
            metrics_dict_lists[-1].append(MetricsDict({'MoverScore': indices_to_score[(i, j)]}))

    return metrics_dict_lists
def select_matches(self, summary_tokens: List[Token], reference_tokens: List[Token], matches: List[Tuple[int, int]], intersection: int):
    summary_tuples = self.get_tuples(summary_tokens)
    reference_tuples = self.get_tuples(reference_tokens)
    matches = set(matches)

    # Figure out the list of tuple-level matches based on whether the tuples
    # match each other completely
    tuple_matches = []
    for s_i, summary_tuple in enumerate(summary_tuples):
        for r_j, reference_tuple in enumerate(reference_tuples):
            assert len(summary_tuple) == len(reference_tuple)
            # See if each component of these two tuples can be aligned
            matched = True
            for key, i in summary_tuple.items():
                if key not in reference_tuple:
                    matched = False
                    break
                j = reference_tuple[key]
                if (i, j) not in matches:
                    matched = False
                    break
            if matched:
                tuple_matches.append((s_i, r_j, len(summary_tuple)))

    # Calculate the weight of the matched tuples, only allowing each tuple to be matched once.
    # The tuples form an equivalence class, so it doesn't matter exactly which match we use. This is
    # equivalent to calculating the size of the maximum matching in a bipartite graph where the
    # two disjoint sets of vertices are the summary and reference tuples, and an edge exists between
    # them if they were matched
    total_weight = calculate_maximum_matching(tuple_matches)

    summary_tuples_weight = sum(len(tup) for tup in summary_tuples)
    reference_tuples_weight = sum(len(tup) for tup in reference_tuples)

    precision = total_weight / summary_tuples_weight * 100 if summary_tuples_weight > 0 else 0
    recall = total_weight / reference_tuples_weight * 100 if reference_tuples_weight > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    contribution = total_weight / intersection * 100 if len(matches) > 0 else 0

    return MetricsDict({
        self.name: {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'contribution': contribution
        }
    })
def _run(self, summaries_list: List[List[SummaryType]]) -> List[List[MetricsDict]]:
    with TemporaryDirectory() as temp_dir:
        summaries_file = f'{temp_dir}/summaries.jsonl'
        predictions_file = f'{temp_dir}/predictions.json'

        # Save all of the summaries to a file
        with JsonlWriter(summaries_file) as out:
            for summaries in summaries_list:
                for summary in summaries:
                    out.write({'summary': self._flatten_summary(summary)})

        commands = [f'cd {self.sum_qe_root}']
        if self.environment_name:
            commands += [f'source activate {self.environment_name}']
        commands += [' '.join([
            'python', '-m', 'src.BERT_experiments.predict',
            summaries_file,
            self.model_file,
            predictions_file
        ])]

        redirect = None if self.verbose else PIPE
        process = Popen(' && '.join(commands), stdout=redirect, stderr=redirect, shell=True)
        stdout, stderr = process.communicate()

        predictions = json.loads(open(predictions_file, 'r').read())

        index = 0
        metrics_lists = []
        for summaries in summaries_list:
            metrics_lists.append([])
            for summary in summaries:
                preds = predictions[index]
                metrics_lists[-1].append(MetricsDict({
                    'SumQE': {
                        'Q1': preds[0],
                        'Q2': preds[1],
                        'Q3': preds[2],
                        'Q4': preds[3],
                        'Q5': preds[4]
                    }
                }))
                index += 1
        return metrics_lists
def score_multi_all(self,
                    summaries_list: List[List[SummaryType]],
                    references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
    metrics_dict_lists = []
    for summaries, references in zip(summaries_list, references_list):
        metrics_dict_lists.append([])
        for summary in summaries:
            summary_value = float(summary)
            total = 0
            for reference in references:
                reference_value = float(reference)
                total += summary_value * reference_value
            metrics_dict_lists[-1].append(MetricsDict({'test': total}))
    return metrics_dict_lists
def calculate_standard_metric(self,
                              summary_index_to_scus: List[Set[int]],
                              reference_index_to_scus: List[Set[int]],
                              summary_weights: List[float],
                              reference_weights: List[float],
                              matches: List[Tuple[int, int, float]]) -> MetricsDict:
    # This is the standard ROUGE calculation except the SCU-based matches are
    # given priority over non-SCU matches to maximize the percentage of the
    # ROUGE score the SCU matches contribute.
    summary_scu_to_indices = self._get_scu_to_indices(summary_index_to_scus)
    reference_scu_to_indices = self._get_scu_to_indices(reference_index_to_scus)

    all_matches = []
    for scu in summary_scu_to_indices.keys():
        summary_indices = summary_scu_to_indices[scu]
        reference_indices = reference_scu_to_indices[scu]
        scu_matches = self._get_matches(summary_indices, reference_indices, matches)
        all_matches.extend(scu_matches)
    num_scu_matches, matching = calculate_maximum_matching(all_matches, return_matching=True)

    # Mark which tokens were matched and are therefore no longer eligible
    summary_matches = [False] * len(summary_index_to_scus)
    references_matches = [False] * len(reference_index_to_scus)
    for i, j in matching:
        summary_matches[i] = True
        references_matches[j] = True

    summary_indices = [i for i in range(len(summary_index_to_scus)) if not summary_matches[i]]
    reference_indices = [i for i in range(len(reference_index_to_scus)) if not references_matches[i]]
    non_scus_matches = self._get_matches(summary_indices, reference_indices, matches)
    num_non_scu_matches = calculate_maximum_matching(non_scus_matches)

    intersection = num_scu_matches + num_non_scu_matches
    return MetricsDict({
        'weight': intersection,
        'summary_weight': sum(summary_weights),
        'reference_weight': sum(reference_weights),
        'scu_weight': num_scu_matches,
        'non_scu_weight': num_non_scu_matches,
    })
def _score_summaries(self, array_index_to_tgt_index: List[int]) -> List[MetricsDict]:
    logging.info('Building pyramids and scoring peers')

    # Each step can be run by piping its ID into the pyreval.py program.
    # 4: pyramid
    # 5 -t: score (-t means to write the results to file)
    for args in ['4', '5 -t']:
        commands = [f'cd {self.pyreval_root}']
        if self.environment_name is not None:
            commands.append(f'source {os.environ["CONDA_INIT"]}')
            commands.append(f'conda activate {self.environment_name}')
        commands.append(f'echo {args} | python2.7 pyreval.py')
        command = ' && '.join(commands)

        logger.info(f'Running command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        process.communicate()

    # Parse the results
    results_path = f'{self.pyreval_root}/results.csv'
    if not os.path.exists(results_path):
        raise Exception(f'PyrEval results file does not exist: "{results_path}"')

    # First line is the name of the pyramid
    # Second line is the header
    lines = open(results_path, 'r').read().splitlines()
    metrics_dicts = {}
    for line in lines[2:]:
        index, raw, quality, coverage, comprehensive = line.split(',')
        metrics_dicts[int(index)] = MetricsDict({
            'pyreval': {
                'raw': int(raw),
                'quality': float(quality),
                'coverage': float(coverage),
                'comprehensive': float(comprehensive),
            }
        })

    metrics_list = []
    for index in array_index_to_tgt_index:
        metrics_list.append(metrics_dicts[index])

    logging.info('Finished building pyramids and scoring peers')
    return metrics_list
def _aggregate_summary_scores(self,
                              summaries_list: List[List[str]],
                              references_list: List[List[str]],
                              tuple_to_indices: Dict[Tuple[int, int], List[int]],
                              individual_scores: List[float]) -> List[List[MetricsDict]]:
    metrics_lists = []
    for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
        metrics_lists.append([])
        for j, summary in enumerate(summaries):
            scores = [individual_scores[index] for index in tuple_to_indices[(i, j)]]
            metrics_lists[-1].append(MetricsDict({
                'METEOR': sum(scores) / len(scores)
            }))
    return metrics_lists
def test_get_set_item(self):
    metrics = MetricsDict()
    metrics['a'] = 4
    assert metrics['a'] == 4

    metrics['b']['c'] = [1, 2]
    assert metrics['b']['c'] == [1, 2]
    assert isinstance(metrics['b'], MetricsDict)

    metrics['d'] = {'e': 4, 'f': {'g': 4}}
    assert metrics['d'] == {'e': 4, 'f': {'g': 4}}
    assert isinstance(metrics['d'], MetricsDict)
    assert isinstance(metrics['d']['f'], MetricsDict)
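# The auto-nesting behavior tested above (assigning to metrics['b']['c'] without
# first creating metrics['b']) can be supported with a __missing__ hook. This is
# a hypothetical sketch, not the library's implementation.
class AutoNestingDict(dict):
    def __missing__(self, key):
        # Create and store an empty nested dictionary on first access to a missing key
        value = AutoNestingDict()
        self[key] = value
        return value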
def calculate_non_scu_metric(self,
                             summary_index_to_scus: List[Set[int]],
                             reference_index_to_scus: List[Set[int]],
                             summary_weights: List[float],
                             reference_weights: List[float],
                             matches: List[Tuple[int, int, float]]) -> MetricsDict:
    total_weight = 0
    for i, j, weight in matches:
        if len(summary_index_to_scus[i] & reference_index_to_scus[j]) == 0:
            total_weight += weight
    return MetricsDict({
        'weight': total_weight,
        'summary_weight': sum(summary_weights),
        'reference_weight': sum(reference_weights)
    })