def _assert_expected_output(self, metric: Metric, expected_output: List[MetricsDict], *args):
    """Ensures that the output from `score_all` is equal to the `expected_output`."""
    assert len(self.summaries) == len(expected_output)
    actual_output = metric.score_all(self.summaries, *args)
    assert len(actual_output) == len(expected_output)
    for i, (expected, actual) in enumerate(zip(expected_output, actual_output)):
        assert actual.approx_equal(MetricsDict(expected), abs=1e-4), \
            f'Instance {i} not equal. Expected {expected}, actual {actual}'

def _assert_order_invariant(self, metric: Metric, *args):
    """Ensures that the output from `score_multi_all` returns the same results, no matter the order."""
    # Reverse the summaries to create a new fake set of summaries that will be grouped together
    # with the same references. It does not matter that they are paired with the wrong references
    # since we are only testing to make sure the output doesn't change.
    faked = list(reversed(self.summaries))

    summaries_list = list(zip(*[self.summaries, faked]))
    metrics_lists1 = metric.score_multi_all(summaries_list, *args)
    metrics_lists1 = list(zip(*metrics_lists1))

    summaries_list = list(zip(*[faked, self.summaries]))
    metrics_lists2 = metric.score_multi_all(summaries_list, *args)
    metrics_lists2 = list(zip(*metrics_lists2))
    metrics_lists2 = list(reversed(metrics_lists2))

    for metrics_list1, metrics_list2 in zip(metrics_lists1, metrics_lists2):
        for metrics1, metrics2 in zip(metrics_list1, metrics_list2):
            assert metrics1.approx_equal(metrics2)

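# Minimal, self-contained sketch (not part of the original file) of the regrouping idea checked by
# `_assert_order_invariant` above: `score_multi_all` returns one list of results per group, and
# transposing with `zip(*...)` regroups the results by position within each group. Plain strings
# stand in for the real metrics objects here.
groups = [['s0', 'f0'], ['s1', 'f1']]                 # results per group for (summary, faked) pairs
per_position = list(zip(*groups))                     # -> [('s0', 's1'), ('f0', 'f1')]
assert per_position[0] == ('s0', 's1')

# Swapping the order inside every group and then reversing the transposed output recovers the same
# per-position grouping, which is exactly the invariance the helper asserts.
groups_swapped = [['f0', 's0'], ['f1', 's1']]
per_position_swapped = list(reversed(list(zip(*groups_swapped))))
assert per_position_swapped[0] == ('s0', 's1')
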
def _score_with_metric(metric: Metric,
                       instances: List[EvalInstance],
                       metrics_dicts: Dict[str, Dict[str, Metrics]]) -> None:
    # The summaries need to be grouped based on identical context. For instance, we group all of the
    # summaries that have the same reference documents together. This can sometimes make calculating
    # the metric faster. The following variables assist doing this.
    #
    # Maintains a list of the unique contexts which group the summaries
    fields_list = []
    # A mapping from the context to its index in `fields_list`
    field_to_index = {}
    # A nested list that will be parallel to `fields_list`. The entry at index `i` is a list of
    # instances which should be scored with `fields_list[i]`
    instances_list = []
    # A nested list that will be parallel to `instances_list` which contains the summary-specific
    # fields for the corresponding instance
    summary_fields_lists = []
    # A nested list that will be parallel to `instances_list` which marks whether the calculation for
    # that (summary, context) pair represents jackknifing or not
    jackknifing_flags = []

    for instance in instances:
        # Select just the relevant fields for this metric
        summary_fields = instance.fields.select_fields(metric.required_summary_fields)
        context_fields = instance.fields.select_fields(metric.required_context_fields)

        # Score the instance normally using all of the fields. However, if the metric requires
        # jackknifing and this is a reference summary, the result is already comparable to the
        # jackknifed metrics of the peers, so mark it as a jackknifing calculation.
        is_jackknifing = metric.requires_jackknifing() and instance.summarizer_type == 'reference'

        if context_fields not in field_to_index:
            field_to_index[context_fields] = len(field_to_index)
            fields_list.append(context_fields)
            instances_list.append([])
            summary_fields_lists.append([])
            jackknifing_flags.append([])
        index = field_to_index[context_fields]
        instances_list[index].append(instance)
        summary_fields_lists[index].append(summary_fields)
        jackknifing_flags[index].append(is_jackknifing)

        # Potentially run jackknifing for the peers
        if metric.requires_jackknifing() and instance.summarizer_type == 'peer':
            jk_fields_list = metric.jackknifer.get_jackknifing_fields_list(context_fields)
            if jk_fields_list:
                for jk_fields in jk_fields_list:
                    if jk_fields not in field_to_index:
                        field_to_index[jk_fields] = len(field_to_index)
                        fields_list.append(jk_fields)
                        instances_list.append([])
                        summary_fields_lists.append([])
                        jackknifing_flags.append([])
                    index = field_to_index[jk_fields]
                    instances_list[index].append(instance)
                    summary_fields_lists[index].append(summary_fields)
                    jackknifing_flags[index].append(True)

    # Construct the arguments that will be passed to the scoring method
    summary_args = []
    for name in metric.required_summary_fields:
        summary_args.append([
            [summary_fields[name].to_input() for summary_fields in summary_fields_list]
            for summary_fields_list in summary_fields_lists
        ])

    context_args = []
    for name in metric.required_context_fields:
        context_args.append([fields[name].to_input() for fields in fields_list])

    # Score the summaries
    results_lists = metric.score_multi_all(*summary_args, *context_args)

    # Used to aggregate the jackknifing (jk) results
    jk_results = defaultdict(lambda: defaultdict(list))

    for i, results_list in enumerate(results_lists):
        for j, results in enumerate(results_list):
            instance = instances_list[i][j]
            is_jackknifing = jackknifing_flags[i][j]
            if is_jackknifing:
                jk_results[instance.instance_id][instance.summarizer_id].append(results)
            else:
                metrics_dicts[instance.instance_id][instance.summarizer_id].metrics.update(results)

    # Aggregate the jk results
    for instance_id in jk_results.keys():
        for summarizer_id, results in jk_results[instance_id].items():
            result = sum(results) / len(results)
            for name, value in result.items():
                metrics_dicts[instance_id][summarizer_id].metrics[name + '_jk'] = value

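# Minimal, self-contained sketch (not part of the original file) of the grouping pattern used in
# `_score_with_metric` above: instances that share the same context (e.g. the same references) are
# collected into one group so the metric can score them together. Plain tuples and strings stand in
# for the real field and instance objects.
fields_list = []       # unique contexts, in first-seen order
field_to_index = {}    # context -> position in `fields_list`
instances_list = []    # parallel to `fields_list`; the instances scored against each context

for instance_id, context in [('i1', ('ref-A',)), ('i2', ('ref-A',)), ('i3', ('ref-B',))]:
    if context not in field_to_index:
        field_to_index[context] = len(field_to_index)
        fields_list.append(context)
        instances_list.append([])
    instances_list[field_to_index[context]].append(instance_id)

assert fields_list == [('ref-A',), ('ref-B',)]
assert instances_list == [['i1', 'i2'], ['i3']]
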
def _load_metrics(params: Params) -> List[Metric]:
    metrics = []
    for metric_params in params.pop('metrics'):
        metric = Metric.from_params(metric_params)
        metrics.append(metric)
    return metrics

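# Illustrative sketch only (not part of the original file): the shape of configuration that
# `_load_metrics` expects -- a `Params` object whose 'metrics' entry is a list of per-metric
# parameter blocks, each consumed by `Metric.from_params`. The metric names and the option below
# are hypothetical placeholders, and wrapping the dict with `Params(...)` is an assumption about
# its constructor.
example_config = {
    'metrics': [
        {'type': 'my-metric'},                        # hypothetical registered metric name
        {'type': 'another-metric', 'some_option': 1}  # hypothetical metric with an extra option
    ]
}
# metrics = _load_metrics(Params(example_config))
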
def _score_with_metric(metric: Metric,
                       instances: List[EvalInstance],
                       metrics_dicts: Dict[str, Dict[str, Metrics]]) -> None:
    fields_list = []
    field_to_index = {}
    instances_list = []
    jackknifing_flags = []

    for instance in instances:
        # Select just the relevant fields for this metric
        fields = instance.fields.select_fields(metric.required_fields)

        # Score the instance normally using all of the fields. However, if the metric requires
        # jackknifing and this is a reference summary, the result is already comparable to the
        # jackknifed metrics of the peers, so mark it as a jackknifing calculation.
        is_jackknifing = metric.requires_jackknifing() and instance.summarizer_type == 'reference'

        if fields not in field_to_index:
            field_to_index[fields] = len(field_to_index)
            fields_list.append(fields)
            instances_list.append([])
            jackknifing_flags.append([])
        index = field_to_index[fields]
        instances_list[index].append(instance)
        jackknifing_flags[index].append(is_jackknifing)

        # Potentially run jackknifing for the peers
        if metric.requires_jackknifing() and instance.summarizer_type == 'peer':
            jk_fields_list = metric.jackknifer.get_jackknifing_fields_list(fields)
            if jk_fields_list:
                for jk_fields in jk_fields_list:
                    if jk_fields not in field_to_index:
                        field_to_index[jk_fields] = len(field_to_index)
                        fields_list.append(jk_fields)
                        instances_list.append([])
                        jackknifing_flags.append([])
                    index = field_to_index[jk_fields]
                    instances_list[index].append(instance)
                    jackknifing_flags[index].append(True)

    # Score the summaries
    summaries_lists = [[instance.summary for instance in instances] for instances in instances_list]
    args = [[fields[name] for fields in fields_list] for name in metric.required_fields]
    results_lists = metric.score_multi_all(summaries_lists, *args)

    # Used to aggregate the jackknifing (jk) results
    jk_results = defaultdict(lambda: defaultdict(list))

    for i, results_list in enumerate(results_lists):
        for j, results in enumerate(results_list):
            instance = instances_list[i][j]
            is_jackknifing = jackknifing_flags[i][j]
            if is_jackknifing:
                jk_results[instance.instance_id][instance.summarizer_id].append(results)
            else:
                metrics_dicts[instance.instance_id][instance.summarizer_id].metrics.update(results)

    # Aggregate the jk results
    for instance_id in jk_results.keys():
        for summarizer_id, results in jk_results[instance_id].items():
            result = sum(results) / len(results)
            for name, value in result.items():
                metrics_dicts[instance_id][summarizer_id].metrics[name + '_jk'] = value

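# Minimal, self-contained sketch (not part of the original file) of the jackknifing aggregation at the
# end of `_score_with_metric`: the results collected from each jackknifing pass are averaged and stored
# under a '_jk'-suffixed key. Plain dicts of floats stand in for the MetricsDict arithmetic used above.
jk_passes = [{'rouge-1': 0.25}, {'rouge-1': 0.75}]    # hypothetical per-pass scores for one summary
averaged = {name: sum(p[name] for p in jk_passes) / len(jk_passes) for name in jk_passes[0]}
final_metrics = {name + '_jk': value for name, value in averaged.items()}
assert final_metrics == {'rouge-1_jk': 0.5}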