Example #1
def evaluate_instances(
        instances: List[EvalInstance],
        metrics: List[Metric]) -> Tuple[MetricsDict, List[Metrics]]:
    macro = MetricsDict()
    micro_list = get_initial_micro_list(instances)

    for metric in metrics:
        # Prepare the input arguments
        summary_args = []
        for field in metric.required_summary_fields:
            summary_args.append(
                [instance.fields[field].to_input() for instance in instances])

        context_args = []
        for field in metric.required_context_fields:
            context_args.append(
                [instance.fields[field].to_input() for instance in instances])

        # Score all the summaries
        this_macro, this_micro_list = metric.evaluate(*summary_args,
                                                      *context_args)

        # Update the global metrics dictionaries
        macro.update(this_macro)
        for micro, this_micro in zip(micro_list, this_micro_list):
            micro.metrics.update(this_micro)

    return macro, micro_list
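
For context, each entry of `summary_args` and `context_args` is a per-field list with one input per instance, so `metric.evaluate(*summary_args, *context_args)` receives one positional list per required field. The sketch below shows a toy metric that would fit this calling convention; the class, its field names, and the string inputs are hypothetical and not part of the library, only `MetricsDict` is reused from the examples on this page.

# Hypothetical metric, only to illustrate the calling convention used above:
# evaluate() receives one list per required field, aligned by instance.
class LengthRatioMetric:
    required_summary_fields = ['summary']      # hypothetical field name
    required_context_fields = ['reference']    # hypothetical field name

    def evaluate(self, summaries, references):
        # Assumes the field inputs are plain strings.
        micro_list = []
        for summary, reference in zip(summaries, references):
            ratio = len(summary.split()) / max(len(reference.split()), 1)
            micro_list.append(MetricsDict({'length-ratio': ratio}))
        # Macro score: average of the per-instance scores
        macro = MetricsDict({
            'length-ratio': sum(m['length-ratio'] for m in micro_list) / len(micro_list)
        })
        return macro, micro_list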
Example #2
    def _combine_metrics(self, recall_metrics: MetricsDict,
                         precision_metrics: MetricsDict) -> MetricsDict:
        combined = MetricsDict()
        combined.update(recall_metrics)
        combined.update(precision_metrics)

        for key in combined.keys():
            if 'precision' in combined[key] and 'recall' in combined[key]:
                precision = combined[key]['precision']
                recall = combined[key]['recall']
                f1 = 0.0
                if precision + recall != 0.0:
                    f1 = 2 * (precision * recall) / (precision + recall)
                combined[key]['f1'] = f1

        return combined
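
As a concrete illustration of what `_combine_metrics` produces, consider a single hypothetical key (the metric name 'rouge-1' and the numbers are made up for the example):

# Hypothetical inputs; 'rouge-1' is just an illustrative key.
recall_metrics = MetricsDict({'rouge-1': {'recall': 50.0}})
precision_metrics = MetricsDict({'rouge-1': {'precision': 25.0}})

# After _combine_metrics, the two nested dicts are merged and F1 is added:
#   f1 = 2 * (25.0 * 50.0) / (25.0 + 50.0)  # = 33.33...
# combined == {'rouge-1': {'recall': 50.0, 'precision': 25.0, 'f1': 33.33...}}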
Example #3
    def run(self, args):
        params = Params.from_file(args.config, args.overrides)
        dataset_reader = DatasetReader.from_params(
            params.pop('dataset_reader'))
        metrics = load_metrics(params)

        instances = dataset_reader.read()
        summaries = [instance.summary for instance in instances]

        macro = MetricsDict()
        micro_list = get_initial_micro_list(instances)

        for metric in metrics:
            # Prepare the extra input arguments
            eval_args = []
            for field in metric.required_fields:
                eval_args.append(
                    [instance.fields[field] for instance in instances])

            # Score all the summaries
            this_macro, this_micro_list = metric.evaluate(
                summaries, *eval_args)

            # Update the global metrics dictionaries
            macro.update(this_macro)
            for micro, this_micro in zip(micro_list, this_micro_list):
                micro.metrics.update(this_micro)

        dirname = os.path.dirname(args.macro_output_json)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        serialized_macro = jsons.dumps({'metrics': macro},
                                       jdkwargs={'indent': 2})
        with open(args.macro_output_json, 'w') as out:
            out.write(serialized_macro)
        if not args.silent:
            print(serialized_macro)

        with JsonlWriter(args.micro_output_jsonl) as out:
            for metrics_dict in micro_list:
                out.write(metrics_dict)
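
The command therefore writes two files: the aggregated (macro) metrics as a single JSON object under a top-level "metrics" key, and the per-instance (micro) metrics as one JSON object per line. A sketch of the macro file with hypothetical metric names and values:

# Sketch of macro_output_json; the metric names and values are hypothetical.
#   {
#     "metrics": {
#       "rouge-1": {"recall": 50.0, "precision": 25.0, "f1": 33.3}
#     }
#   }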
Example #4
    def _run_metric(self, summary_tokens: List[Token],
                    reference_tokens_list: List[List[Token]],
                    matches_list: List[List[Tuple[int, int, float]]],
                    token_weights_list: List[List[float]], metric: str):
        total_weight = 0
        total_normalization_weight = 0

        total_matches = 0
        content_type_to_total_matches = defaultdict(float)
        matcher_metrics = [[] for _ in self.matchers]
        for reference_tokens, matches, weights in zip(reference_tokens_list,
                                                      matches_list,
                                                      token_weights_list):
            total_weight += self.backend.get_total_weight(matches)
            total_normalization_weight += sum(weights)

            all_matches = []
            content_type_to_matches = defaultdict(list)
            for i, matcher in enumerate(self.matchers):
                category_matches, metrics = matcher.select_matches(
                    summary_tokens, reference_tokens, matches, weights, metric,
                    self.backend)
                content_type_to_matches[matcher.content_type].extend(
                    category_matches)
                all_matches.extend(category_matches)
                matcher_metrics[i].append(metrics)

            total_matches += self.backend.get_total_weight(all_matches)
            for content_type, content_matches in content_type_to_matches.items():
                content_type_to_total_matches[content_type] += \
                    self.backend.get_total_weight(content_matches)

        # Compute the aggregated metrics for each matcher
        metrics = MetricsDict()
        for matcher, metrics_list in zip(self.matchers, matcher_metrics):
            metrics.update(matcher.finalize(metrics_list, total_weight,
                                            metric))

        # Add the standard rouge score
        measure = 0.0
        if total_normalization_weight != 0.0:
            measure = total_weight / total_normalization_weight * 100
        metrics[self.name] = {
            f'{metric}_total_weight': total_weight,
            f'{metric}_total_norm_weight': total_normalization_weight,
            metric: measure
        }

        # Compute the metric for just the edges that the categories selected
        measure = 0.0
        if total_normalization_weight != 0.0:
            measure = total_matches / total_normalization_weight * 100

        coverage = 0.0
        if total_weight != 0.0:
            coverage = total_matches / total_weight * 100

        # Compute the coverage for each content type
        for content_type, content_total_matches in content_type_to_total_matches.items():
            content_coverage = 0.0
            if total_weight != 0.0:
                content_coverage = content_total_matches / total_weight * 100
            metrics[f'{metric}_content-coverages'][content_type] = content_coverage

        metrics[f'interpretable-{self.name}'] = {
            f'{metric}_total_weight': total_matches,
            f'{metric}_total_norm_weight': total_normalization_weight,
            metric: measure,
            f'{metric}_coverage': coverage
        }

        return metrics
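
To make the final bookkeeping concrete, here is a small worked example with hypothetical totals plugged into the formulas above:

# Hypothetical totals, purely to illustrate the formulas in _run_metric.
total_weight = 12.0                 # weight of all matched edges
total_normalization_weight = 20.0   # e.g. sum of the reference token weights
total_matches = 9.0                 # weight of the matches the matchers selected

standard = total_weight / total_normalization_weight * 100         # 60.0
interpretable = total_matches / total_normalization_weight * 100   # 45.0
coverage = total_matches / total_weight * 100                      # 75.0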
Example #5
    def test_update(self):
        m1 = MetricsDict({'k1': 1, 'k2': {'k3': [1, 2, 3]}})
        m2 = MetricsDict({'k4': 4, 'k2': {'k3': 5, 'k5': 8}})
        m1.update(m2)
        assert m1 == {'k1': 1, 'k2': {'k3': 5, 'k5': 8}, 'k4': 4}
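
For reference, here is a standalone recursive-merge helper that reproduces the behaviour checked by this test. Note that plain `dict.update` would also satisfy this particular assertion, since every key of `m1['k2']` also appears in `m2['k2']`; this is only a sketch, not the library's actual implementation.

def deep_update(target: dict, other: dict) -> dict:
    """Recursively merge `other` into `target`, overwriting non-dict values."""
    for key, value in other.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            deep_update(target[key], value)
        else:
            target[key] = value
    return target

# Reproduces the assertion above:
m1 = {'k1': 1, 'k2': {'k3': [1, 2, 3]}}
m2 = {'k4': 4, 'k2': {'k3': 5, 'k5': 8}}
assert deep_update(m1, m2) == {'k1': 1, 'k2': {'k3': 5, 'k5': 8}, 'k4': 4}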