Example 1
def get_pairwise_performance_gap(sys1: SysOutputInfo,
                                 sys2: SysOutputInfo) -> SysOutputInfo:
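    """
    Return a copy of `sys1` in which every overall and fine-grained metric value is
    replaced by the difference between `sys1` and `sys2` (sys1 minus sys2), and
    confidence intervals are cleared.
    """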

    sys = copy.deepcopy(sys1)

    orm, or1, or2 = (unwrap(x.results.overall) for x in (sys, sys1, sys2))
    for metric_name, performance in orm.items():
        performance.value = float(or1[metric_name].value) - float(
            or2[metric_name].value)
        performance.confidence_score_low = None
        performance.confidence_score_high = None

    fgr, fgr1, fgr2 = (unwrap(x.results.fine_grained)
                       for x in (sys, sys1, sys2))
    for bucket_attr, buckets in fgr.items():
        for bucket_id, bucket in enumerate(buckets):
            for perf_id, perf in enumerate(bucket.performances):
                perf.value = float(
                    fgr1[bucket_attr][bucket_id].performances[perf_id].value
                ) - float(
                    fgr2[bucket_attr][bucket_id].performances[perf_id].value)
                # TODO(gneubig): these could be done via pairwise bootstraps
                perf.confidence_score_low = None
                perf.confidence_score_high = None

    return sys
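A minimal usage sketch for the function above; it assumes `SysOutputInfo`, `json`, and
`get_pairwise_performance_gap` are importable as in these snippets, and the report file
names are placeholders:

# Load two ExplainaBoard reports and compute their metric-by-metric difference.
with open('report_sys1.json') as f1, open('report_sys2.json') as f2:
    sys1 = SysOutputInfo.from_dict(json.load(f1))
    sys2 = SysOutputInfo.from_dict(json.load(f2))
gap_report = get_pairwise_performance_gap(sys1, sys2)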
Example 2
def get_tasks(task: TaskType, system_outputs: list[str]) -> list[TaskType]:
    """
    Get the task for each system output.
    :param task: Explicitly specified task; used if present
    :param system_outputs: System output files; if no explicit task is given, the task
      is loaded from the metadata in these files
    :return: A list of task types for each system
    """
    real_tasks: list[TaskType] = []
    if task:
        if task not in TaskType.list():
            raise ValueError(
                f'Task name {task} was not recognized. ExplainaBoard currently '
                f'supports: {TaskType.list()}')
        real_tasks = [task] * len(system_outputs)
        return real_tasks
    else:
        for sys_output in system_outputs:
            # give me a task, or give me death (by exception)
            task_or_die: TaskType | None = None
            msg: str = ''
            try:
                metadata = FileLoaderMetadata.from_file(sys_output)
                task_or_die = TaskType(unwrap(metadata.task_name))
            except Exception as e:
                msg = str(e)
            if task_or_die is None:
                raise ValueError(
                    'Must either specify a task explicitly or have one '
                    'specified in metadata, but could find neither for '
                    f'{sys_output}. {msg}')
            real_tasks.append(unwrap(task_or_die))
    return real_tasks
Example 3
    def _complete_features(self,
                           sys_info: SysOutputInfo,
                           sys_output: list[dict],
                           external_stats=None) -> list[str]:
        """
        Take in metadata about the system outputs, the system outputs themselves, and a
        few other optional pieces of information, then calculate feature functions and
        modify `sys_output` in place to add these feature values.

        :param sys_info: Information about the system output
        :param sys_output: The system output itself
        :param external_stats: Training set statistics that are used to calculate
            training set specific features
        :return: The features that are active (e.g. skipping training set features when
            no training set available)
        """
        sys_features = unwrap(sys_info.features)
        active_features = list(
            sys_features.get_bucket_features(
                include_training_dependent=external_stats is not None))

        # One pass over the test set to find token test frequency
        all_tokens = [
            unwrap(sys_info.source_tokenizer)(x['output']) for x in sys_output
        ]
        all_log_probs = [self._get_predicted_label(x) for x in sys_output]
        test_freq: dict[str, int] = {}
        for tokens in all_tokens:
            for tok in tokens:
                test_freq[tok] = test_freq.get(tok, 0) + 1

        sent_feats: list[str] = []
        tok_feats: list[str] = []
        for x in active_features:
            (sent_feats if (x in sys_features) else tok_feats).append(x)

        for _id, (dict_sysout, tokens, log_probs) in progress(
                enumerate(zip(sys_output, all_tokens, all_log_probs)),
                desc="featurizing"):
            # Get values of bucketing features
            text = dict_sysout["output"]

            # text length in tokens and characters
            dict_sysout["text_length"] = len(tokens)
            dict_sysout["text_chars"] = len(text)

            # sentence-level training set dependent features
            if external_stats is not None:
                dict_sysout["num_oov"] = self._get_num_oov(
                    tokens, external_stats)
                dict_sysout["fre_rank"] = self._get_fre_rank(
                    tokens, external_stats)

            # span features for true and predicted spans
            dict_sysout["tok_info"] = self._complete_tok_features(
                tokens, log_probs, test_freq, statistics=external_stats)

        return active_features
Example 4
 def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
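     """Accumulate source- and target-side vocabulary counts over the given samples."""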
     return {
         'source_vocab':
         accumulate_vocab_from_samples(samples, lambda x: x['source'],
                                       unwrap(sys_info.source_tokenizer)),
         'target_vocab':
         accumulate_vocab_from_samples(samples, lambda x: x['reference'],
                                       unwrap(sys_info.target_tokenizer)),
     }
Example 5
 def _get_answer_length(self, sys_info: SysOutputInfo,
                        existing_features: dict):
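     """Return the length in tokens of the (first) answer text."""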
     texts = existing_features["answers"]["text"]
     text = texts[0] if isinstance(texts, list) else texts
     return len(unwrap(sys_info.source_tokenizer)(text))
Example 6
def bucket_attribute_specified_bucket_interval(
    sample_features: list[tuple[BucketCase, T]],
    bucket_number: int,
    bucket_setting: list[tuple],
) -> list[BucketCaseCollection]:
    intervals = unwrap(bucket_setting)
    bucket2examp: dict[tuple,
                       list[BucketCase]] = {k: list()
                                            for k in intervals}

    if isinstance(list(intervals)[0][0],
                  str):  # discrete value, such as entity tags
        for k, v in sample_features:
            if (v, ) in bucket2examp:
                bucket2examp[(v, )].append(k)
    else:
        for examp, value in sample_features:
            res_key = find_key(bucket2examp, value)
            if res_key is None:
                continue
            bucket2examp[res_key].append(examp)

    bucket_collections = [
        BucketCaseCollection((k, ), v) for k, v in bucket2examp.items()
    ]

    return bucket_collections
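A rough standalone illustration of the continuous branch above. The `toy_find_key`
helper only mimics what `find_key` is assumed to do here (return the interval key whose
range contains the value); it is not the library implementation:

def toy_find_key(bucket2examp, value):
    # Return the first (low, high) interval key whose range contains `value`.
    for low, high in bucket2examp:
        if low <= value <= high:
            return (low, high)
    return None

buckets = {(0, 5): [], (6, 10): []}
for examp, value in [('sent-1', 3), ('sent-2', 8), ('sent-3', 42)]:
    key = toy_find_key(buckets, value)
    if key is not None:
        buckets[key].append(examp)
# buckets == {(0, 5): ['sent-1'], (6, 10): ['sent-2']}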
Example 7
    def _statistics_func(self, samples, sys_info: SysOutputInfo):
        vocab: dict[str, float] = {}
        length_fre: dict[int, float] = {}
        total_samps = 0
        tokenizer = unwrap(sys_info.source_tokenizer)
        for sample in progress(samples):
            text = sample["text"]
            tokens = tokenizer(text)
            length = len(tokens)

            length_fre[length] = length_fre.get(length, 0.0) + 1.0

            # update vocabulary
            for w in tokens:
                vocab[w] = vocab.get(w, 0.0) + 1.0

            total_samps += 1

        # the rank of each word based on its frequency
        sorted_dict = {
            key: rank
            for rank, key in enumerate(
                sorted(set(vocab.values()), reverse=True), 1)
        }
        vocab_rank = {k: sorted_dict[v] for k, v in vocab.items()}

        for k, v in length_fre.items():
            length_fre[k] = v * 1.0 / total_samps

        return {
            "vocab": vocab,
            "vocab_rank": vocab_rank,
            "length_fre": length_fre
        }
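The frequency-to-rank mapping above ranks distinct frequency values, so words with the
same count share a rank. A small standalone sketch of the same computation on toy data:

vocab = {"the": 3.0, "cat": 1.0, "sat": 1.0, "on": 2.0}
# Rank each distinct frequency value, highest frequency first.
sorted_dict = {
    freq: rank
    for rank, freq in enumerate(sorted(set(vocab.values()), reverse=True), 1)
}
vocab_rank = {w: sorted_dict[f] for w, f in vocab.items()}
# vocab_rank == {"the": 1, "on": 2, "cat": 3, "sat": 3}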
Example 8
    def _gen_metric_stats(self, sys_info: SysOutputInfo,
                          sys_output: list[dict]) -> list[MetricStats]:
        """Generate sufficient statistics for scoring different metrics.
        :param sys_info: Information about the system outputs
        :param sys_output: The system output itself
        :return: Statistics sufficient for scoring
        """

        metrics = unwrap(self._get_metrics(sys_info))
        true_data = [self._get_true_label(x) for x in sys_output]
        pred_data = [self._get_predicted_label(x) for x in sys_output]
        rank_data = [self._get_rank_data(x)
                     for x in sys_output]  # rank of true entity in predictions

        if any(item is None for item in rank_data):
            raise ValueError(
                'Some data points do not have rank information; check system outputs.'
            )

        metric_stats = []
        for metric in metrics:
            if isinstance(metric, (MeanReciprocalRank, MeanRank, Hits)):
                metric_stats.append(metric.calc_stats_from_rank(rank_data))
            else:
                metric_stats.append(
                    metric.calc_stats_from_data(true_data, pred_data))
        return metric_stats
Example 9
 def _get_fre_rank(self, sys_info: SysOutputInfo, existing_features: dict,
                   statistics: Any):
     return explainaboard.utils.feature_funcs.feat_freq_rank(
         existing_features,
         statistics,
         lambda x: x['context'],
         unwrap(sys_info.source_tokenizer),
     )
Example 10
 def _get_ref_fre_rank(self, sys_info: SysOutputInfo,
                       existing_features: dict, statistics: Any):
     return explainaboard.utils.feature_funcs.feat_freq_rank(
         existing_features,
         statistics['target_vocab'],
         lambda x: x['reference'],
         unwrap(sys_info.target_tokenizer),
     )
Example 11
 def _get_src_num_oov(self, sys_info: SysOutputInfo,
                      existing_features: dict, statistics: Any):
     return explainaboard.utils.feature_funcs.feat_num_oov(
         existing_features,
         statistics['source_vocab'],
         lambda x: x['source'],
         unwrap(sys_info.source_tokenizer),
     )
Example 12
 def _get_absolute_blank_position(self, sys_info: SysOutputInfo,
                                  existing_features: dict):
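     """
     Return the position of the `question_mark` token in the tokenized context,
     or 0 if it does not appear.
     """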
     source_tokens = unwrap(sys_info.source_tokenizer)(
         existing_features["context"]).strs
     if existing_features["question_mark"] not in source_tokens:
         return 0
     else:
         return source_tokens.index(existing_features["question_mark"])
Example 13
 def _get_config(self, config: Optional[MetricConfig] = None) -> MetricConfig:
     """
     Get the configuration or overwritten configuration
     :param config: Optional configuration to override the default configuration
     :return: Either the default or overridden configuration
     """
     return config if config is not None else self.config
Example 14
    def _complete_features(self,
                           sys_info: SysOutputInfo,
                           sys_output: list[dict],
                           external_stats=None) -> list[str]:
        """
        Take in metadata about the system outputs, the system outputs themselves, and a
        few other optional pieces of information, then calculate feature functions and
        modify `sys_output` in place to add these feature values.

        :param sys_info: Information about the system output
        :param sys_output: The system output itself
        :param external_stats: Training set statistics that are used to calculate
            training set specific features
        :return: The features that are active (e.g. skipping training set features when
            no training set available)
        """
        sys_features = unwrap(sys_info.features)
        active_features = list(
            sys_features.get_bucket_features(
                include_training_dependent=external_stats is not None))

        sent_feats: list[str] = []
        tok_feats: list[str] = []
        for x in active_features:
            (sent_feats if (x in sys_features) else tok_feats).append(x)

        for _id, dict_sysout in progress(enumerate(sys_output),
                                         desc="featurizing"):
            # Get values of bucketing features
            tokens = dict_sysout["tokens"]

            # sentence_length
            dict_sysout["sentence_length"] = len(tokens)

            # entity density
            dict_sysout["span_density"] = len(
                self._span_ops.get_spans_simple(
                    tags=dict_sysout["true_tags"])) / len(tokens)

            # sentence-level training set dependent features
            if external_stats is not None:
                dict_sysout["num_oov"] = self._get_num_oov(
                    tokens, external_stats)
                dict_sysout["fre_rank"] = self._get_fre_rank(
                    tokens, external_stats)

            # span features for true and predicted spans
            dict_sysout["span_info"] = self._complete_span_features(
                tokens,
                dict_sysout["true_tags"],
                dict_sysout["pred_tags"],
                statistics=external_stats,
            )

        # This is not used elsewhere, so just keep it as-is
        return active_features
Example 15
    def _get_length_fre(self, sys_info: SysOutputInfo, existing_features: dict,
                        statistics: Any):
        length = len(
            unwrap(sys_info.source_tokenizer)(existing_features["text"]))
        return statistics['length_fre'].get(length, 0)
Example 16
    def get_bucket_performance_lm(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        samples_over_bucket: list[BucketCaseCollection],
    ) -> list[BucketPerformance]:
        """
        Calculate bucket-level performance with respect to a given feature
        (e.g., sentence length)
        :param sys_info: Information about the system output
        :param sys_output: The system output itself
        :param samples_over_bucket: a list of bucket collections, each containing the
            samples that fall into one bucket
        :return: a list of `BucketPerformance` objects, one per bucket
        """

        bucket_performances = []
        for bucket_collection in samples_over_bucket:
            bucket_metrics = [
                x.to_metric() for x in unwrap(sys_info.metric_configs)
            ]
            log_probs = []
            for bucket_case in bucket_collection.samples:
                bcp = cast(BucketCaseSpan, bucket_case)
                log_probs.append(sys_output[bcp.sample_id]['tok_info'][
                    bcp.token_span[0]]['tok_log_prob'])

            bucket_samples = self._subsample_bucket_cases(
                bucket_collection.samples)

            bucket_performance = BucketPerformance(
                bucket_interval=bucket_collection.interval,
                n_samples=len(bucket_collection),
                bucket_samples=bucket_samples,
            )
            for metric in bucket_metrics:
                metric_val = metric.evaluate(None,
                                             log_probs,
                                             conf_value=sys_info.conf_value)
                conf_low, conf_high = (metric_val.conf_interval
                                       if metric_val.conf_interval else (None, None))
                performance = Performance(
                    metric_name=metric.config.name,
                    value=metric_val.value,
                    confidence_score_low=conf_low,
                    confidence_score_high=conf_high,
                )
                bucket_performance.performances.append(performance)

            bucket_performances.append(bucket_performance)
        bucket_performances.sort(key=lambda x: x.bucket_interval)

        return bucket_performances
Example 17
 def process(self, metadata: dict, sys_output: list[dict]) -> SysOutputInfo:
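     """
     Run the full analysis over `sys_output`: compute the overall statistics, bucket
     the samples by feature, sort the buckets, and return the populated `SysOutputInfo`.
     """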
     # TODO(Pengfei): Rethink if this is a good way to manipulate `system_output`
     overall_statistics = self.get_overall_statistics(metadata, sys_output)
     sys_info = unwrap(overall_statistics.sys_info)
     metric_stats = overall_statistics.metric_stats
     active_features = unwrap(overall_statistics.active_features)
     overall_results = sys_info.results.overall
     performance_over_bucket = self.bucketing_samples(
         sys_info, sys_output, active_features, metric_stats=metric_stats
     )
     self.sort_bucket_info(
         performance_over_bucket,
         sort_by=metadata.get('sort_by', 'key'),
         sort_by_metric=metadata.get('sort_by_metric', 'first'),
         sort_ascending=metadata.get('sort_ascending', False),
     )
     sys_info.results = Result(
         overall=overall_results, fine_grained=performance_over_bucket
     )
     return sys_info
Example 18
 def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
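     """
     Accumulate source- and target-side vocabularies from translation samples.
     Both source and target languages must be set in `sys_info`.
     """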
     if sys_info.source_language is None or sys_info.target_language is None:
         raise ValueError(
             'source and target languages must be specified to load '
             f'translation data, but source={sys_info.source_language} '
             f', target={sys_info.target_language}'
         )
     src = FileLoaderField(('translation', sys_info.source_language), '', str)
     trg = FileLoaderField(('translation', sys_info.target_language), '', str)
     return {
         'source_vocab': accumulate_vocab_from_samples(
             samples,
             lambda x: FileLoader.find_field(x, src),
             unwrap(sys_info.source_tokenizer),
         ),
         'target_vocab': accumulate_vocab_from_samples(
             samples,
             lambda x: FileLoader.find_field(x, trg),
             unwrap(sys_info.target_tokenizer),
         ),
     }
Example 19
 def _fetch_metric_stats(self, metric_stats: dict[str, Any]):
     """
     Lazily fetch the actual scoring statistics from the EaaS client when a request
     ID is present, replacing the contents of `metric_stats` in place.
     """
     if 'request_id' in metric_stats:
         client: AsyncClient = unwrap(self._eaas_client)
         eaas_stats: dict[str, Any] = client.wait_and_get_result(
             metric_stats['request_id'])
         metric_stats.clear()
         for k, v in eaas_stats.items():
             metric_stats[k] = v
Example 20
    def get_overall_statistics(
        self, metadata: dict, sys_output: list[dict]
    ) -> OverallStatistics:
        """
        Get the overall statistics information, including performance, of the system
        output
        :param metadata: The metadata of the system
        :param sys_output: The system output itself
        :return: The overall statistics of the system output
        """
        if metadata is None:
            metadata = {}
        if "task_name" not in metadata.keys():
            metadata["task_name"] = self.task_type().value

        sys_info = SysOutputInfo.from_dict(metadata)
        if sys_info.metric_configs is None:
            sys_info.metric_configs = self.default_metrics(
                source_language=sys_info.source_language,
                target_language=sys_info.target_language,
            )
        if sys_info.target_tokenizer is None:
            sys_info.target_tokenizer = get_default_tokenizer(
                task_type=self.task_type(), lang=sys_info.target_language
            )
        if sys_info.source_tokenizer is None:
            sys_info.source_tokenizer = (
                sys_info.target_tokenizer
                if sys_info.source_language == sys_info.target_language
                else get_default_tokenizer(
                    task_type=self.task_type(), lang=sys_info.source_language
                )
            )

        # declare customized features: _features will be updated
        custom_features: dict = metadata.get('custom_features', {})
        sys_info.features = self._customize_features(custom_features)

        # get scoring statistics
        metric_stats = unwrap(self._gen_metric_stats(sys_info, sys_output))
        external_stats = self._gen_external_stats(sys_info, self._statistics_func)
        active_features = self._complete_features(
            sys_info, sys_output, external_stats=external_stats
        )
        overall_results = self.get_overall_performance(
            sys_info, sys_output, metric_stats=metric_stats
        )
        sys_info.results = Result(
            overall=overall_results, calibration=None, fine_grained=None
        )
        return OverallStatistics(sys_info, metric_stats, active_features)
Example 21
    def _gen_metric_stats(
        self, sys_info: SysOutputInfo, sys_output: list[dict]
    ) -> list[MetricStats]:
        """Generate sufficient statistics for scoring different metrics.

        :param sys_info: Information about the system outputs
        :param sys_output: The system output itself
        :return: Statistics sufficient for scoring
        """
        metrics = unwrap(self._get_metrics(sys_info))
        true_data = [self._get_true_label(x) for x in sys_output]
        pred_data = [self._get_predicted_label(x) for x in sys_output]
        metric_stats = []

        for metric in metrics:
            metric_stats.append(metric.calc_stats_from_data(true_data, pred_data))
        return metric_stats
Example 22
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:
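        """
        Separate samples into buckets and calculate performance over them;
        sentence-level features are bucketed via the parent class, token-level
        features here.
        """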

        features = unwrap(sys_info.features)

        sent_feats: list[str] = []
        tok_feats: list[str] = []
        for x in active_features:
            (sent_feats if (x in features) else tok_feats).append(x)

        # First, get the buckets for sentences using the standard protocol
        performances_over_bucket = super().bucketing_samples(
            sys_info, sys_output, sent_feats, metric_stats)

        # Bucketing
        feature_lists = self._get_feature_lists(sys_output, tok_feats)

        for i, feature_name in enumerate(
                progress(tok_feats, desc="token-level bucketing")):
            my_feature = features["tok_info"].feature.feature[feature_name]
            bucket_info = my_feature.bucket_info

            # Get buckets for true spans
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                bucketing, bucket_info.method)

            samples_over_bucket = bucket_func(
                sample_features=feature_lists[i],
                bucket_number=bucket_info.number,
                bucket_setting=bucket_info.setting,
            )

            # evaluating bucket: get bucket performance
            performances_over_bucket[
                feature_name] = self.get_bucket_performance_lm(
                    sys_info,
                    sys_output,
                    samples_over_bucket,
                )
        return performances_over_bucket
Example 23
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:
        """
        Separate samples into buckets and calculate performance over them
        :param sys_info: Information about the system output
        :param sys_output: The system output itself, already annotated with features
        :param active_features: The features to perform bucketing over
        :param metric_stats: The stats from which to calculate performance
        :return:
            performances_over_bucket:
                a dictionary of feature name -> list of performances by bucket
        """
        sys_features = unwrap(sys_info.features)

        # Bucketing
        performances_over_bucket: dict[str, list[BucketPerformance]] = {}
        for feature_name in progress(active_features, desc="sample-level bucketing"):
            # Preparation for bucketing
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                explainaboard.utils.bucketing,
                sys_features[feature_name].bucket_info.method,
            )
            samples_over_bucket = bucket_func(
                sample_features=[
                    (BucketCase(x), sys_output[x][feature_name])
                    for x in range(len(sys_output))
                ],
                bucket_number=sys_features[feature_name].bucket_info.number,
                bucket_setting=sys_features[feature_name].bucket_info.setting,
            )

            # evaluating bucket: get bucket performance
            performances_over_bucket[feature_name] = self.get_bucket_performance(
                sys_info,
                sys_output,
                samples_over_bucket,
                metric_stats=metric_stats,
            )
        return performances_over_bucket
Example 24
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:
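        """
        Separate samples into buckets and calculate performance over them;
        sentence-level features are bucketed via the parent class, reference-token
        features here.
        """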

        features = unwrap(sys_info.features)
        sent_feats: list[str] = []
        tok_feats: list[str] = []
        for x in active_features:
            (sent_feats if (x in features) else tok_feats).append(x)

        # First, get the buckets for sentences using the standard protocol
        performances_over_bucket = super().bucketing_samples(
            sys_info, sys_output, sent_feats, metric_stats)

        all_sample_features = self._get_sample_features(sys_output, tok_feats)

        # Second, get the buckets for tokens
        for feature_id, feature_name in enumerate(
                progress(tok_feats, desc="bucketing token features")):

            # Choose behavior based on whether this is a feature of samples or spans
            my_feature = features["ref_tok_info"].feature.feature[feature_name]
            bucket_info = my_feature.bucket_info

            # Get buckets for true spans
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                bucketing, bucket_info.method)

            sample_features = [(case, feats[feature_id])
                               for case, feats in all_sample_features]

            samples_over_bucket = bucket_func(
Example 25
    def _complete_span_features(self,
                                sentence,
                                true_tags,
                                pred_tags,
                                statistics=None) -> list[Span]:
        # Get training set stats if they exist
        has_stats = statistics is not None and len(statistics) > 0
        econ_dic = statistics["econ_dic"] if has_stats else None
        efre_dic = statistics["efre_dic"] if has_stats else None

        self._span_ops.set_resources(resources={
            "has_stats": has_stats,
            "econ_dic": econ_dic,
            "efre_dic": efre_dic,
        })

        # Merge the spans together so that the span tag is "true_tag pred_tag"
        # using "_DEFAULT_TAG" if that span doesn't exist in the true or predicted tags
        # respectively
        # TODO(gneubig): This is probably calculating features twice, could be just once
        true_spans = self._span_ops.get_spans(toks=sentence, tags=true_tags)
        pred_spans = self._span_ops.get_spans(toks=sentence, tags=pred_tags)
        merged_spans = {}
        for span in true_spans:
            span.span_tag = f'{span.span_tag} {self._DEFAULT_TAG}'
            merged_spans[span.span_pos] = span
        for span in pred_spans:
            merged_span = merged_spans.get(span.span_pos)
            if not merged_span:
                span.span_tag = f'{self._DEFAULT_TAG} {span.span_tag}'
                merged_spans[span.span_pos] = span
            else:
                true_tag, _ = unwrap(merged_span.span_tag).split(' ')
                merged_span.span_tag = f'{true_tag} {span.span_tag}'

        return list(merged_spans.values())
Example 26
    def _complete_features(
        self, sys_info: SysOutputInfo, sys_output: list[dict], external_stats=None
    ) -> list[str]:
        """
        Take in metadata about the system outputs, the system outputs themselves, and a
        few other optional pieces of information, then calculate feature functions and
        modify `sys_output` in place to add these feature values.

        :param sys_info: Information about the system output
        :param sys_output: The system output itself
        :param external_stats: External statistics that are used to calculate training
            set specific features
        :return: The features that are active (e.g. skipping training set features when
            no training set available)
        """
        bucket_feature_funcs: dict[str, tuple[Callable, bool]] = {}
        sys_features = unwrap(sys_info.features)

        for bucket_feature in sys_features.get_bucket_features():

            feature_info = sys_features[bucket_feature]

            # Skip training set features if no stats
            if external_stats is None and feature_info.require_training_set:
                continue

            feature_func = self._get_feature_func(
                bucket_feature, feature_info.is_custom
            )

            bucket_feature_funcs[bucket_feature] = (
                feature_func,
                feature_info.require_training_set,
            )

        for _id, dict_sysout in progress(enumerate(sys_output), desc="featurizing"):
            # Get values of bucketing features
            for (
                bucket_key,
                (
                    bucket_func,
                    training_dependent,
                ),
            ) in bucket_feature_funcs.items():

                feature_info = sys_features[bucket_key]

                # handles user-defined features
                if feature_info.is_custom:
                    # TODO(Pengfei): this should be generalized
                    feature_value = (
                        "_".join(dict_sysout[bucket_key])
                        if isinstance(dict_sysout[bucket_key], list)
                        else dict_sysout[bucket_key]
                    )
                    dict_sysout[bucket_key] = feature_value

                # handles all other features
                else:
                    dict_sysout[bucket_key] = (
                        bucket_func(sys_info, dict_sysout, external_stats)
                        if training_dependent
                        else bucket_func(sys_info, dict_sysout)
                    )

        return list(bucket_feature_funcs.keys())
Example 27
 def _get_metrics(self, sys_info: SysOutputInfo) -> list[Metric]:
     return [config.to_metric() for config in unwrap(sys_info.metric_configs)]
Example 28
 def _get_head_entity_length(self, sys_info: SysOutputInfo,
                             existing_features: dict):
     return len(
         unwrap(sys_info.source_tokenizer)(
             existing_features["true_head_decipher"]))
Example 29
def draw_bar_chart_from_reports(reports: list[str],
                                output_dir: str,
                                sys_names: list[str] | None = None) -> None:
    """
    Draw bar charts from report files generated by ExplainaBoard
    :param reports: Reports to plot
    :param output_dir: Directory to which the output figures are written
    :param sys_names: Names to use for the systems; defaults to the report file names
    """

    # TODO(gneubig): This should get the system name from inside the report
    if sys_names is None:
        sys_names = [os.path.basename(x).replace('.json', '') for x in reports]
    elif len(sys_names) != len(reports):
        raise ValueError('Length of sys_names must equal that of reports')

    report_info: list[SysOutputInfo] = []
    for report in reports:
        with open(report) as fin:
            report_info.append(SysOutputInfo.from_dict(json.load(fin)))
    overall_results = [
        list(unwrap(x.results.overall).values()) for x in report_info
    ]
    overall_metric_names = list(unwrap(report_info[0].results.overall).keys())
    fg_results = [unwrap(x.results.fine_grained) for x in report_info]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Overall performance
    ys = [[x.value for x in y] for y in overall_results]
    y_errs = None
    if overall_results[0][0].confidence_score_low is not None:
        y_errs = [(
            [x.value - unwrap(x.confidence_score_low) for x in y],
            [unwrap(x.confidence_score_high) - x.value for x in y],
        ) for y in overall_results]

    make_bar_chart(
        ys,
        output_dir,
        'overall',
        output_fig_format='png',
        fig_size=(8, 6),
        sys_names=sys_names,
        errs=y_errs,
        title=None,
        xticklabels=overall_metric_names,
        ylabel='metric value',
    )

    # Bucket performance, plotted per feature (for example, sentence length)
    for feature_name in progress(fg_results[0].keys()):
        # Make sure that buckets exist
        buckets: list[list[BucketPerformance]] = []
        for i, fg_result in enumerate(fg_results):
            if feature_name not in fg_result:
                get_logger().error(
                    f'error: feature {feature_name} not in {reports[i]}')
            else:
                buckets.append(fg_result[feature_name])
                bnames0 = [x.bucket_interval for x in buckets[0]]
                bnames = [x.bucket_interval for x in buckets[-1]]
                if len(bnames0) != len(bnames):
                    get_logger().error(
                        f'error: different number of buckets for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
                    buckets = []
                elif bnames0 != bnames:
                    get_logger().warning(
                        f'warning: different bucket labels for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
            if len(buckets) != i + 1:
                break
        if len(buckets) != len(reports):
            continue

        bucket0_intervals = [x.bucket_interval for x in buckets[0]]
        bucket_metrics = [x.metric_name for x in buckets[0][0].performances]
        for metric_id, metric_name in enumerate(bucket_metrics):

            performances: list[list[Performance]] = [
                [x.performances[metric_id] for x in y] for y in buckets
            ]
            ys = [[x.value for x in y] for y in performances]

            y_errs = None
            if performances[0][0].confidence_score_low is not None:
                y_errs = [(
                    [x.value - unwrap(x.confidence_score_low) for x in y],
                    [unwrap(x.confidence_score_high) - x.value for x in y],
                ) for y in performances]

            make_bar_chart(
                ys,
                output_dir,
                f'{feature_name}_{metric_name}',
                output_fig_format='png',
                fig_size=(8, 6),
                sys_names=sys_names,
                errs=y_errs,
                title=None,
                xlabel=feature_name,
                xticklabels=bucket0_intervals,
                ylabel=metric_name,
            )
Example 30
 def _get_attr_compression(self, sys_info: SysOutputInfo, existing_features: dict):
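     """Return the compression ratio: source length divided by reference length, in tokens."""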
     return len(
         unwrap(sys_info.source_tokenizer)(existing_features["source"])
     ) / len(unwrap(sys_info.target_tokenizer)(existing_features["reference"]))