Example #1
    def _assert_expected_output(self, metric: Metric,
                                expected_output: List[MetricsDict], *args):
        """Ensures that the output from `score_all` is equal to the `expected_output`."""
        assert len(self.summaries) == len(expected_output)
        actual_output = metric.score_all(self.summaries, *args)
        assert len(actual_output) == len(expected_output)
        for i, (expected, actual) in enumerate(zip(expected_output, actual_output)):
            assert actual.approx_equal(MetricsDict(expected), abs=1e-4), \
                f'Instance {i} not equal. Expected {expected}, actual {actual}'
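
As a purely illustrative, self-contained sketch of the pattern this helper checks (not the library's API: the word-overlap "metric" below and pytest.approx stand in for a real Metric and MetricsDict.approx_equal), the per-summary output of score_all is compared against hard-coded expected values:

import pytest

def score_all(summaries, references):
    # Stand-in "metric": the fraction of reference tokens that also appear in the summary.
    results = []
    for summary, reference in zip(summaries, references):
        ref_tokens = reference.split()
        overlap = sum(1 for token in ref_tokens if token in summary.split())
        results.append({'overlap': {'recall': overlap / len(ref_tokens)}})
    return results

summaries = ['the cat sat', 'a dog ran home']
references = ['the cat sat on a mat', 'a dog ran home quickly']
expected_output = [
    {'overlap': {'recall': 0.5}},  # 3 of 6 reference tokens appear in the summary
    {'overlap': {'recall': 0.8}},  # 4 of 5 reference tokens appear in the summary
]
actual_output = score_all(summaries, references)
assert len(actual_output) == len(expected_output)
for i, (expected, actual) in enumerate(zip(expected_output, actual_output)):
    assert actual['overlap']['recall'] == pytest.approx(expected['overlap']['recall'], abs=1e-4), \
        f'Instance {i} not equal'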
Example #2
    def _assert_order_invariant(self, metric: Metric, *args):
        """Ensures that the output from `score_multi_all` returns the same results, no matter the order."""
        # Reverse the summaries to create a new fake set of summaries that will be grouped together
        # with the same references. It does not matter that they're not the right references since we are
        # only testing to make sure the output doesn't change.
        faked = list(reversed(self.summaries))

        summaries_list = list(zip(*[self.summaries, faked]))
        metrics_lists1 = metric.score_multi_all(summaries_list, *args)
        metrics_lists1 = list(zip(*metrics_lists1))

        summaries_list = list(zip(*[faked, self.summaries]))
        metrics_lists2 = metric.score_multi_all(summaries_list, *args)
        metrics_lists2 = list(zip(*metrics_lists2))

        metrics_lists2 = list(reversed(metrics_lists2))
        for metrics_list1, metrics_list2 in zip(metrics_lists1,
                                                metrics_lists2):
            for metrics1, metrics2 in zip(metrics_list1, metrics_list2):
                assert metrics1.approx_equal(metrics2)
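
To make the reordering bookkeeping concrete, here is a self-contained toy version of the same check (not from the original code): a fake length-based "metric" is scored on two orderings of the grouped summaries, and the outputs are realigned with the same transpose-and-reverse steps before comparison.

summaries = ['a short one', 'a slightly longer summary', 'the longest summary of them all']
faked = list(reversed(summaries))

def score_multi_all(summaries_list):
    # Stand-in "metric": score every summary in every group by its length.
    return [[{'length': len(summary)} for summary in group] for group in summaries_list]

# Order 1: (summaries[i], faked[i]) per group, then transpose so row 0 holds the `summaries` scores.
lists1 = list(zip(*score_multi_all(list(zip(summaries, faked)))))

# Order 2: (faked[i], summaries[i]) per group, transposed and then reversed to line up with order 1.
lists2 = list(reversed(list(zip(*score_multi_all(list(zip(faked, summaries)))))))

assert lists1 == lists2  # same scores regardless of how the summaries were ordered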
Example #3
def _score_with_metric(metric: Metric, instances: List[EvalInstance],
                       metrics_dicts: Dict[str, Dict[str, Metrics]]) -> None:
    # The summaries need to be grouped by identical context. For instance, we group all of the
    # summaries that share the same reference documents. Grouping can sometimes make calculating
    # the metric faster. The following variables maintain that grouping.
    #
    # Maintains a list of the unique contexts which group the summaries
    fields_list = []

    # A mapping from the context to its index in `fields_list`
    field_to_index = {}

    # A nested list that will be parallel to `fields_list`. The entry at index `i` is a list of instances which should
    # be scored with `fields_list[i]`
    instances_list = []

    # A nested list that will be parallel to `instances_list` which contains the summary-specific fields
    # for the corresponding instance
    summary_fields_lists = []

    # A nested list that will be parallel to `instances_list` which marks whether the calculation
    # for that (summary, context) pair represents jackknifing
    jackknifing_flags = []

    for instance in instances:
        # Select just the relevant fields for this metric
        summary_fields = instance.fields.select_fields(
            metric.required_summary_fields)
        context_fields = instance.fields.select_fields(
            metric.required_context_fields)

        # Score the instance normally using all of the fields. However, if the metric requires
        # jackknifing and this is a reference summary, the resulting score is marked so it is
        # comparable to the jackknifed scores computed for the peer summaries.
        is_jackknifing = (metric.requires_jackknifing()
                          and instance.summarizer_type == 'reference')

        if context_fields not in field_to_index:
            field_to_index[context_fields] = len(field_to_index)
            fields_list.append(context_fields)
            instances_list.append([])
            summary_fields_lists.append([])
            jackknifing_flags.append([])

        index = field_to_index[context_fields]
        instances_list[index].append(instance)
        summary_fields_lists[index].append(summary_fields)
        jackknifing_flags[index].append(is_jackknifing)

        # Potentially run jackknifing for the peers
        if (metric.requires_jackknifing()
                and instance.summarizer_type == 'peer'):
            jk_fields_list = metric.jackknifer.get_jackknifing_fields_list(
                context_fields)
            if jk_fields_list:
                for jk_fields in jk_fields_list:
                    if jk_fields not in field_to_index:
                        field_to_index[jk_fields] = len(field_to_index)
                        fields_list.append(jk_fields)
                        instances_list.append([])
                        summary_fields_lists.append([])
                        jackknifing_flags.append([])

                    index = field_to_index[jk_fields]
                    instances_list[index].append(instance)
                    summary_fields_lists[index].append(summary_fields)
                    jackknifing_flags[index].append(True)

    # Construct the arguments that will be passed to the scoring method
    summary_args = []
    for name in metric.required_summary_fields:
        summary_args.append([[
            summary_fields[name].to_input()
            for summary_fields in summary_fields_list
        ] for summary_fields_list in summary_fields_lists])

    context_args = []
    for name in metric.required_context_fields:
        context_args.append(
            [fields[name].to_input() for fields in fields_list])

    # Score the summaries
    results_lists = metric.score_multi_all(*summary_args, *context_args)

    # Used to aggregate the jackknifing (jk) results before they are averaged
    jk_results = defaultdict(lambda: defaultdict(list))

    for i, results_list in enumerate(results_lists):
        for j, results in enumerate(results_list):
            instance = instances_list[i][j]
            is_jackknifing = jackknifing_flags[i][j]
            if is_jackknifing:
                jk_results[instance.instance_id][
                    instance.summarizer_id].append(results)
            else:
                metrics_dicts[instance.instance_id][
                    instance.summarizer_id].metrics.update(results)

    # Aggregate the jk results
    for instance_id in jk_results.keys():
        for summarizer_id, results in jk_results[instance_id].items():
            result = sum(results) / len(results)
            for name, value in result.items():
                metrics_dicts[instance_id][summarizer_id].metrics[
                    name + '_jk'] = value
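
A self-contained toy (with made-up data and a stand-in for score_multi_all) showing how the parallel lists line up: instances that share a context land in the same group, the metric is invoked once per unique context, and results_lists[i][j] maps back to instances_list[i][j].

instances = [
    {'id': 'd1-A', 'summary': 'summary A', 'references': ('ref 1',)},
    {'id': 'd1-B', 'summary': 'summary B', 'references': ('ref 1',)},  # same context as d1-A
    {'id': 'd2-A', 'summary': 'summary C!', 'references': ('ref 2',)},
]

fields_list, field_to_index, instances_list = [], {}, []
for instance in instances:
    context = instance['references']
    if context not in field_to_index:
        field_to_index[context] = len(field_to_index)
        fields_list.append(context)
        instances_list.append([])
    instances_list[field_to_index[context]].append(instance)

# Two unique contexts -> two groups: [['d1-A', 'd1-B'], ['d2-A']]
print([[instance['id'] for instance in group] for group in instances_list])

# Stand-in for metric.score_multi_all: one result per (group, summary) pair, in the same order,
# so the (i, j) indices can be used to route each result back to its instance.
results_lists = [[{'score': len(instance['summary'])} for instance in group]
                 for group in instances_list]
for i, results_list in enumerate(results_lists):
    for j, results in enumerate(results_list):
        assert results == {'score': len(instances_list[i][j]['summary'])}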
Example #4
def _load_metrics(params: Params) -> List[Metric]:
    metrics = []
    for metric_params in params.pop('metrics'):
        metric = Metric.from_params(metric_params)
        metrics.append(metric)
    return metrics
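
This relies on the library's registrable-from-parameters pattern. The sketch below is a simplified, self-contained stand-in for that mechanism (it is not the actual Params/Metric implementation, and the 'word-overlap' metric and its lowercase option are invented for illustration):

from typing import Any, Dict


class Metric:
    _registry: Dict[str, type] = {}

    @classmethod
    def register(cls, name: str):
        # Decorator that records a Metric subclass under a configuration name.
        def _register(subclass: type) -> type:
            cls._registry[name] = subclass
            return subclass
        return _register

    @classmethod
    def from_params(cls, params: Dict[str, Any]) -> 'Metric':
        # Look up the registered class via the 'type' key and build it from the remaining kwargs.
        kwargs = dict(params)
        metric_cls = cls._registry[kwargs.pop('type')]
        return metric_cls(**kwargs)


@Metric.register('word-overlap')
class WordOverlap(Metric):
    def __init__(self, lowercase: bool = True):
        self.lowercase = lowercase


config = {'metrics': [{'type': 'word-overlap', 'lowercase': False}]}
metrics = [Metric.from_params(metric_params) for metric_params in config['metrics']]
assert isinstance(metrics[0], WordOverlap) and metrics[0].lowercase is False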
Example #5
def _score_with_metric(metric: Metric, instances: List[EvalInstance],
                       metrics_dicts: Dict[str, Dict[str, Metrics]]) -> None:
    # Group the summaries by identical context (e.g., the same references) so the metric can
    # score each group with a single call. These parallel lists maintain that grouping.
    fields_list = []  # the unique contexts, in insertion order
    field_to_index = {}  # maps a context to its index in `fields_list`
    instances_list = []  # instances_list[i] holds the instances scored with fields_list[i]
    jackknifing_flags = []  # parallel to `instances_list`; True if that score represents jackknifing

    for instance in instances:
        # Select just the relevant fields for this metric
        fields = instance.fields.select_fields(metric.required_fields)

        # Score the instance normally using all of the fields. However, if the metric requires
        # jackknifing and this is a reference summary, the resulting score is marked so it is
        # comparable to the jackknifed scores computed for the peer summaries.
        is_jackknifing = (metric.requires_jackknifing()
                          and instance.summarizer_type == 'reference')

        if fields not in field_to_index:
            field_to_index[fields] = len(field_to_index)
            fields_list.append(fields)
            instances_list.append([])
            jackknifing_flags.append([])

        index = field_to_index[fields]
        instances_list[index].append(instance)
        jackknifing_flags[index].append(is_jackknifing)

        # Potentially run jackknifing for the peers
        if (metric.requires_jackknifing()
                and instance.summarizer_type == 'peer'):
            jk_fields_list = metric.jackknifer.get_jackknifing_fields_list(
                fields)
            if jk_fields_list:
                for jk_fields in jk_fields_list:
                    if jk_fields not in field_to_index:
                        field_to_index[jk_fields] = len(field_to_index)
                        fields_list.append(jk_fields)
                        instances_list.append([])
                        jackknifing_flags.append([])

                    index = field_to_index[jk_fields]
                    instances_list[index].append(instance)
                    jackknifing_flags[index].append(True)

    # Score the summaries
    summaries_lists = [[instance.summary for instance in instances]
                       for instances in instances_list]
    args = [[fields[name] for fields in fields_list]
            for name in metric.required_fields]
    results_lists = metric.score_multi_all(summaries_lists, *args)

    # Used to aggregate the jk results
    jk_results = defaultdict(lambda: defaultdict(list))

    for i, results_list in enumerate(results_lists):
        for j, results in enumerate(results_list):
            instance = instances_list[i][j]
            is_jackknifing = jackknifing_flags[i][j]
            if is_jackknifing:
                jk_results[instance.instance_id][
                    instance.summarizer_id].append(results)
            else:
                metrics_dicts[instance.instance_id][
                    instance.summarizer_id].metrics.update(results)

    # Aggregate the jk results
    for instance_id in jk_results.keys():
        for summarizer_id, results in jk_results[instance_id].items():
            result = sum(results) / len(results)
            for name, value in result.items():
                metrics_dicts[instance_id][summarizer_id].metrics[
                    name + '_jk'] = value
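
A self-contained numeric sketch of the final aggregation step (the values and the TinyMetricsDict stand-in are made up; the real code averages MetricsDict objects): each peer summary gets one score per leave-one-reference-out subset, and the averaged result is stored under a '_jk' suffix.

class TinyMetricsDict(dict):
    # Minimal stand-in supporting the sum(...) / len(...) averaging used above.
    def __add__(self, other):
        return TinyMetricsDict({name: self[name] + other[name] for name in self})

    def __radd__(self, other):
        # Lets sum() start from its default of 0.
        return self if other == 0 else self.__add__(other)

    def __truediv__(self, denominator):
        return TinyMetricsDict({name: value / denominator for name, value in self.items()})


# One score per leave-one-reference-out subset for a single (instance, summarizer) pair.
results = [TinyMetricsDict({'rouge-1_f1': 0.25}),
           TinyMetricsDict({'rouge-1_f1': 0.50}),
           TinyMetricsDict({'rouge-1_f1': 0.75})]

result = sum(results) / len(results)
jackknifed = {name + '_jk': value for name, value in result.items()}
print(jackknifed)  # {'rouge-1_f1_jk': 0.5}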