def _statistics_func(self, samples, sys_info: SysOutputInfo):
        """
        Collect vocabulary and length statistics over a set of samples

        :param samples: An iterable of samples, each providing a "text" entry
        :param sys_info: Information about the system output; its source tokenizer
            is used to split every text into tokens
        :return: A dictionary with three entries:
            vocab: token -> number of occurrences (stored as a float)
            vocab_rank: token -> rank of its count, where the highest count has
                rank 1 and tokens with equal counts share a rank
            length_fre: length in tokens -> fraction of samples with that length
        """
        tokenize = unwrap(sys_info.source_tokenizer)
        token_counts: dict[str, float] = {}
        length_counts: dict[int, float] = {}
        num_samples = 0
        for sample in progress(samples):
            sample_tokens = tokenize(sample["text"])

            # one more sample of this length
            n_tokens = len(sample_tokens)
            length_counts[n_tokens] = length_counts.get(n_tokens, 0.0) + 1.0

            # one more occurrence of each token
            for token in sample_tokens:
                token_counts[token] = token_counts.get(token, 0.0) + 1.0

            num_samples += 1

        # Rank the distinct counts from largest to smallest, starting at 1, then
        # look up the rank of each token's count
        distinct_counts = sorted(set(token_counts.values()), reverse=True)
        rank_of_count = {
            count: rank
            for rank, count in enumerate(distinct_counts, start=1)
        }
        token_ranks = {
            token: rank_of_count[count]
            for token, count in token_counts.items()
        }

        # Turn the per-length counts into fractions of all samples
        length_fractions = {
            length: count * 1.0 / num_samples
            for length, count in length_counts.items()
        }

        return {
            "vocab": token_counts,
            "vocab_rank": token_ranks,
            "length_fre": length_fractions
        }
    def _complete_features(self,
                           sys_info: SysOutputInfo,
                           sys_output: list[dict],
                           external_stats=None) -> list[str]:
        """
        This function takes in meta-data about system outputs, system outputs, and a few
        other optional pieces of information, then calculates feature functions and
        modifies `sys_output` to add these feature values

        Every sample gains the keys "text_length", "text_chars" and "tok_info"; when
        `external_stats` is given it also gains "num_oov" and "fre_rank".

        :param sys_info: Information about the system output
        :param sys_output: The system output itself; each sample provides an 'output'
            text
        :param external_stats: Training set statistics that are used to calculate
            training set specific features
        :return: The features that are active (e.g. skipping training set features when
            no training set available)
        """
        sys_features = unwrap(sys_info.features)
        active_features = list(
            sys_features.get_bucket_features(
                include_training_dependent=external_stats is not None))

        # Resolve the tokenizer once instead of once per sample. It is only looked
        # up when there is something to tokenize, so empty input still does not
        # require a tokenizer to be configured.
        all_tokens = []
        if sys_output:
            tokenizer = unwrap(sys_info.source_tokenizer)
            all_tokens = [tokenizer(x['output']) for x in sys_output]
        all_log_probs = [self._get_predicted_label(x) for x in sys_output]

        # One pass over the test set to find how often each token occurs in it
        test_freq: dict[str, int] = {}
        for tokens in all_tokens:
            for tok in tokens:
                test_freq[tok] = test_freq.get(tok, 0) + 1

        for dict_sysout, tokens, log_probs in progress(
                zip(sys_output, all_tokens, all_log_probs),
                desc="featurizing"):
            # Get values of bucketing features
            text = dict_sysout["output"]

            # length of the text in tokens and in characters
            dict_sysout["text_length"] = len(tokens)
            dict_sysout["text_chars"] = len(text)

            # sentence-level training set dependent features
            if external_stats is not None:
                dict_sysout["num_oov"] = self._get_num_oov(
                    tokens, external_stats)
                dict_sysout["fre_rank"] = self._get_fre_rank(
                    tokens, external_stats)

            # token-level features
            dict_sysout["tok_info"] = self._complete_tok_features(
                tokens, log_probs, test_freq, statistics=external_stats)

        return active_features
Beispiel #3
0
    def _complete_features(self,
                           sys_info: SysOutputInfo,
                           sys_output: list[dict],
                           external_stats=None) -> list[str]:
        """
        This function takes in meta-data about system outputs, system outputs, and a few
        other optional pieces of information, then calculates feature functions and
        modifies `sys_output` to add these feature values

        Every sample gains the keys "sentence_length", "span_density" and "span_info";
        when `external_stats` is given it also gains "num_oov" and "fre_rank".

        :param sys_info: Information about the system output
        :param sys_output: The system output itself; each sample provides "tokens",
            "true_tags" and "pred_tags"
        :param external_stats: Training set statistics that are used to calculate
            training set specific features
        :return: The features that are active (e.g. skipping training set features when
            no training set available)
        """
        sys_features = unwrap(sys_info.features)
        active_features = list(
            sys_features.get_bucket_features(
                include_training_dependent=external_stats is not None))

        for dict_sysout in progress(sys_output, desc="featurizing"):
            # Get values of bucketing features
            tokens = dict_sysout["tokens"]

            # sentence_length
            num_tokens = len(tokens)
            dict_sysout["sentence_length"] = num_tokens

            # entity density: number of gold spans per token. A sample without
            # tokens gets a density of 0 instead of raising ZeroDivisionError.
            num_spans = len(
                self._span_ops.get_spans_simple(tags=dict_sysout["true_tags"]))
            dict_sysout["span_density"] = (num_spans /
                                           num_tokens if num_tokens else 0.0)

            # sentence-level training set dependent features
            if external_stats is not None:
                dict_sysout["num_oov"] = self._get_num_oov(
                    tokens, external_stats)
                dict_sysout["fre_rank"] = self._get_fre_rank(
                    tokens, external_stats)

            # span features for true and predicted spans
            dict_sysout["span_info"] = self._complete_span_features(
                tokens,
                dict_sysout["true_tags"],
                dict_sysout["pred_tags"],
                statistics=external_stats,
            )

        # This is not used elsewhere, so just keep it as-is
        return active_features
Beispiel #4
0
    def get_econ_efre_dic(
            self, words: list[str],
            bio_tags: list[str]) -> tuple[dict[str, float], dict[str, int]]:
        """
        Calculate the entity label consistency and frequency features from this paper
        https://aclanthology.org/2020.emnlp-main.489.pdf

        Span texts are lowercased words joined by single spaces.

        :param words: a list of all words in the corpus
        :param bio_tags: a list of all tags in the corpus
        :return: Returns two dictionaries:
                    econ: 'span|||tag' pointing to entity consistency values, i.e. the
                        fraction of the occurrences of the span text in `words` that
                        carry this tag (occurrences that are not a tagged chunk count
                        towards the default tag)
                    efre: 'span' pointing to entity frequency values, i.e. the number
                        of tagged chunks with this span text
        """
        chunks_train = self._span_ops.get_spans_simple(bio_tags)

        # First pass over the tagged chunks: remember the tag of every chunk position,
        # every prefix of every chunk text (a pseudo-trie used for pruning below), and
        # how many chunks share each text
        known_prefixes: set[str] = set()
        tag_at: dict[tuple[int, int], str] = {}
        tag_counts_by_span: dict[str, dict[str, int]] = {}
        efre_dic: dict[str, int] = {}
        for chunk in progress(chunks_train):
            chunk_tag, start, end = chunk[0], chunk[1], chunk[2]
            tag_at[(start, end)] = chunk_tag
            span_str = ''
            for pos in range(start, end):
                lowered = words[pos].lower()
                span_str = lowered if pos == start else f'{span_str} {lowered}'
                known_prefixes.add(span_str)
            tag_counts_by_span[span_str] = {}
            efre_dic[span_str] = efre_dic.get(span_str, 0) + 1

        # Second pass over every start position in the corpus: grow the span one word
        # at a time for as long as it is still a prefix of some chunk text, and count
        # the tag of every occurrence of a full chunk text
        num_words = len(words)
        for start in range(num_words):
            span_str = ''
            for end in range(start + 1, num_words + 1):
                lowered = words[end - 1].lower()
                if end == start + 1:
                    span_str = lowered
                else:
                    span_str = f'{span_str} {lowered}'
                if span_str not in known_prefixes:
                    break
                tag_counts = tag_counts_by_span.get(span_str)
                if tag_counts is None:
                    continue
                # Occurrences that are not a tagged chunk get the default tag
                tag = tag_at.get((start, end), self._DEFAULT_TAG)
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

        # Normalize the per-span tag counts into fractions
        econ_dic: dict[str, float] = {}
        for span_str, tag_counts in tag_counts_by_span.items():
            total = float(sum(tag_counts.values()))
            for tag, count in tag_counts.items():
                econ_dic[f'{span_str}|||{tag}'] = count / total
        return econ_dic, efre_dic
    def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
        """
        Count how often each head, link and tail occurs in the samples

        `Samples` is a dataset iterator: List[Dict], to know more about it, you can:
        # pip install datalabs
        dataset = load_dataset("fb15k_237", 'readable')
        print(dataset['train'])

        A value that appears as a key in the downloaded entity2wikidata mapping is
        replaced by that entry's 'label' before counting; any other value is counted
        as-is.

        :param samples: Samples providing 'head', 'link' and 'tail' entries
        :param sys_info: Information about the system output (not read here)
        :return: A dictionary with "head_fre", "link_fre" and "tail_fre", each mapping
            a (possibly relabeled) value to its number of occurrences
        """
        file_path = cache_api.cache_online_file(
            'http://phontron.com/download/explainaboard/pre_computed/kg/entity2wikidata.json',  # noqa
            'pre_computed/kg/entity2wikidata.json',
        )
        # Read the JSON file as UTF-8 instead of relying on the platform's default
        # encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            entity_dic = json.load(file)

        # One counter per sample field, visited in the order tail, head, link
        counts: dict[str, dict[str, int]] = {'tail': {}, 'head': {}, 'link': {}}
        for sample in progress(samples):
            for field, field_counts in counts.items():
                value = sample[field]
                label = (entity_dic[value]['label']
                         if value in entity_dic else value)
                field_counts[label] = field_counts.get(label, 0) + 1

        return {
            "head_fre": counts['head'],
            "link_fre": counts['link'],
            "tail_fre": counts['tail'],
        }
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:
        """
        Separate samples, and then tokens, into buckets and calculate performance
        over them

        :param sys_info: Information about the system output
        :param sys_output: The system output itself, already annotated with features
        :param active_features: The features to perform bucketing over
        :param metric_stats: The stats passed on to the sentence-level bucketing
        :return: a dictionary of feature name -> list of performances by bucket
        """
        features = unwrap(sys_info.features)

        # Active features that are registered directly in `features` are handled as
        # sentence-level features, all remaining ones as token-level features
        sent_feats: list[str] = []
        tok_feats: list[str] = []
        for name in active_features:
            if name in features:
                sent_feats.append(name)
            else:
                tok_feats.append(name)

        # First, get the buckets for sentences using the standard protocol
        performances_over_bucket = super().bucketing_samples(
            sys_info, sys_output, sent_feats, metric_stats)

        # Second, bucket the token-level features; `feature_lists` is indexed in the
        # same order as `tok_feats`
        feature_lists = self._get_feature_lists(sys_output, tok_feats)
        for feature_idx, feature_name in enumerate(
                progress(tok_feats, desc="token-level bucketing")):
            bucket_info = (
                features["tok_info"].feature.feature[feature_name].bucket_info)

            # The bucketing method is looked up by name in the bucketing module
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                bucketing, bucket_info.method)
            bucketed_samples = bucket_func(
                sample_features=feature_lists[feature_idx],
                bucket_number=bucket_info.number,
                bucket_setting=bucket_info.setting,
            )

            # evaluating bucket: get bucket performance
            performances_over_bucket[
                feature_name] = self.get_bucket_performance_lm(
                    sys_info,
                    sys_output,
                    bucketed_samples,
                )
        return performances_over_bucket
Beispiel #7
0
def accumulate_vocab_from_samples(samples: Iterator,
                                  text_from_sample: Callable,
                                  tokenizer: Tokenizer):
    """
    Count token occurrences over a set of samples and rank tokens by frequency

    :param samples: The samples to iterate over
    :param text_from_sample: Called on each sample to obtain the text to tokenize
    :param tokenizer: Called on each text to obtain its tokens
    :return: A dictionary with two entries:
        vocab: token -> number of occurrences
        vocab_rank: token -> rank of its count, where the highest count has rank 1
            and tokens with equal counts share a rank
    """
    token_counts: dict[str, int] = {}
    for sample in progress(samples):
        text = text_from_sample(sample)
        for token in tokenizer(text):
            token_counts[token] = token_counts.get(token, 0) + 1

    # Rank the distinct counts from largest to smallest, starting at 1, then look up
    # the rank of each token's count
    distinct_counts = sorted(set(token_counts.values()), reverse=True)
    rank_of_count = {
        count: rank
        for rank, count in enumerate(distinct_counts, start=1)
    }
    token_ranks = {
        token: rank_of_count[count]
        for token, count in token_counts.items()
    }
    return {
        "vocab": token_counts,
        "vocab_rank": token_ranks,
    }
Beispiel #8
0
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:
        """
        Separate samples into buckets and calculate performance over them

        :param sys_info: Information about the system output
        :param sys_output: The system output itself, already annotated with features
        :param active_features: The features to perform bucketing over
        :param metric_stats: The stats from which to calculate performance
        :return: a dictionary of feature name -> list of performances by bucket
        """
        sys_features = unwrap(sys_info.features)

        performances_over_bucket: dict[str, list[BucketPerformance]] = {}
        for feature_name in progress(active_features, desc="sample-level bucketing"):
            bucket_info = sys_features[feature_name].bucket_info

            # The bucketing method is looked up by name in the bucketing module
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                explainaboard.utils.bucketing,
                bucket_info.method,
            )

            # Pair every sample (identified by its position) with its feature value
            sample_features = [
                (BucketCase(sample_id), sample[feature_name])
                for sample_id, sample in enumerate(sys_output)
            ]
            samples_over_bucket = bucket_func(
                sample_features=sample_features,
                bucket_number=bucket_info.number,
                bucket_setting=bucket_info.setting,
            )

            # evaluating bucket: get bucket performance
            performances_over_bucket[feature_name] = self.get_bucket_performance(
                sys_info,
                sys_output,
                samples_over_bucket,
                metric_stats=metric_stats,
            )
        return performances_over_bucket
Beispiel #9
0
    def _statistics_func(self, samples: Dataset, sys_info: SysOutputInfo):
        """
        Collect vocabulary and entity statistics over a dataset

        :param samples: The dataset to iterate over; its `info.features` are passed to
            `DatalabFileLoader.replace_labels` for every sample, and the result
            provides "tokens" and "tags"
        :param sys_info: Information about the system output (not read here)
        :return: A dictionary with four entries:
            efre_dic, econ_dic: the results of `get_econ_efre_dic` over the
                concatenated tokens and tags of all samples
            vocab: token -> number of occurrences
            vocab_rank: token -> rank of its count, where the highest count has rank
                1 and tokens with equal counts share a rank
        """
        dl_features = samples.info.features

        tokens_sequences = []
        tags_sequences = []

        vocab: dict[str, int] = {}
        for sample in progress(samples):
            rep_sample = DatalabFileLoader.replace_labels(dl_features, sample)
            tokens, tags = rep_sample["tokens"], rep_sample["tags"]

            # update vocabulary; tokens stay paired with tags as before, so tokens
            # beyond the end of `tags` are not counted
            for token, _ in zip(tokens, tags):
                vocab[token] = vocab.get(token, 0) + 1

            tokens_sequences += tokens
            tags_sequences += tags

        # econ and efre dictionaries
        econ_dic, efre_dic = self.get_econ_efre_dic(tokens_sequences,
                                                    tags_sequences)
        # vocab_rank: the rank of each word based on its frequency
        sorted_dict = {
            key: rank
            for rank, key in enumerate(
                sorted(set(vocab.values()), reverse=True), 1)
        }
        vocab_rank = {k: sorted_dict[v] for k, v in vocab.items()}

        return {
            "efre_dic": efre_dic,
            "econ_dic": econ_dic,
            "vocab": vocab,
            "vocab_rank": vocab_rank,
        }
Beispiel #10
0
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:

        features = unwrap(sys_info.features)
        sent_feats: list[str] = []
        tok_feats: list[str] = []
        for x in active_features:
            (sent_feats if (x in features) else tok_feats).append(x)

        # First, get the buckets for sentences using the standard protocol
        performances_over_bucket = super().bucketing_samples(
            sys_info, sys_output, sent_feats, metric_stats)

        all_sample_features = self._get_sample_features(sys_output, tok_feats)

        # Second, get the buckets for tokens
        for feature_id, feature_name in enumerate(
                progress(tok_feats, desc="bucketing token features")):

            # Choose behavior based on whether this is a feature of samples or spans
            my_feature = features["ref_tok_info"].feature.feature[feature_name]
            bucket_info = my_feature.bucket_info

            # Get buckets for true spans
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                bucketing, bucket_info.method)

            sample_features = [(case, feats[feature_id])
                               for case, feats in all_sample_features]

            samples_over_bucket = bucket_func(
Beispiel #11
0
    def _complete_features(
        self, sys_info: SysOutputInfo, sys_output: list[dict], external_stats=None
    ) -> list[str]:
        """
        Compute the values of all applicable bucketing features and store them on the
        samples of `sys_output`, which is modified in place

        :param sys_info: Information about the system output
        :param sys_output: The system output itself
        :param external_stats: External statistics that are used to calculate training
            set specific features
        :return: The features that are active (e.g. skipping training set features when
            no training set available)
        """
        sys_features = unwrap(sys_info.features)

        # Feature name -> (function computing it, whether it needs external stats)
        bucket_feature_funcs: dict[str, tuple[Callable, bool]] = {}
        for bucket_feature in sys_features.get_bucket_features():
            feature_info = sys_features[bucket_feature]
            needs_training_set = feature_info.require_training_set

            # Skip training set features if no stats
            if external_stats is None and needs_training_set:
                continue

            bucket_feature_funcs[bucket_feature] = (
                self._get_feature_func(bucket_feature, feature_info.is_custom),
                needs_training_set,
            )

        for _, dict_sysout in progress(enumerate(sys_output), desc="featurizing"):
            # Get values of bucketing features
            for bucket_key, func_and_flag in bucket_feature_funcs.items():
                bucket_func, training_dependent = func_and_flag

                if sys_features[bucket_key].is_custom:
                    # handles user-defined features: the value is already present on
                    # the sample, lists are flattened into one "_"-joined string
                    # TODO(Pengfei): this should be generalized
                    feature_value = dict_sysout[bucket_key]
                    if isinstance(feature_value, list):
                        feature_value = "_".join(feature_value)
                    dict_sysout[bucket_key] = feature_value
                elif training_dependent:
                    # handles all other features that need the external statistics
                    dict_sysout[bucket_key] = bucket_func(
                        sys_info, dict_sysout, external_stats
                    )
                else:
                    # handles all other features
                    dict_sysout[bucket_key] = bucket_func(sys_info, dict_sysout)

        return list(bucket_feature_funcs)
Beispiel #12
0
def _confidence_error_bars(performances):
    """
    Compute asymmetric error bars from confidence intervals

    :param performances: One list of performances per system
    :return: None if the first performance of the first system has no lower
        confidence bound; otherwise, for every system, a tuple of (distances from
        each value down to its lower bound, distances from each value up to its upper
        bound)
    """
    if performances[0][0].confidence_score_low is None:
        return None
    return [(
        [x.value - unwrap(x.confidence_score_low) for x in y],
        [unwrap(x.confidence_score_high) - x.value for x in y],
    ) for y in performances]


def draw_bar_chart_from_reports(reports: list[str],
                                output_dir: str,
                                sys_names: list[str] | None = None) -> None:
    """
    Draw bar charts from report file generated from ExplainaBoard

    One chart named 'overall' is made for the overall metrics, plus one chart named
    '<feature>_<metric>' for every fine-grained feature of the first report and every
    metric of its buckets. Features that are missing from a report, or that have a
    different number of buckets than in the first report, are logged and skipped.

    :param reports: Paths of the JSON reports to plot
    :param output_dir: Directory passed to the chart writer; created if missing
    :param sys_names: Names of the systems, one per report; defaults to the report
        file names without '.json'
    :return:
    :raises ValueError: If `sys_names` is given but differs in length from `reports`
    """

    # TODO(gneubig): This should get the system name from inside the report
    if sys_names is None:
        sys_names = [os.path.basename(x).replace('.json', '') for x in reports]
    elif len(sys_names) != len(reports):
        raise ValueError('Length of sys_names must equal that of reports')

    report_info: list[SysOutputInfo] = []
    for report in reports:
        # Read the JSON report as UTF-8 instead of relying on the platform's default
        # encoding
        with open(report, encoding='utf-8') as fin:
            report_info.append(SysOutputInfo.from_dict(json.load(fin)))
    overall_results = [
        list(unwrap(x.results.overall).values()) for x in report_info
    ]
    overall_metric_names = list(unwrap(report_info[0].results.overall).keys())
    fg_results = [unwrap(x.results.fine_grained) for x in report_info]

    # exist_ok avoids failing when the directory is created by someone else between
    # an existence check and the creation
    os.makedirs(output_dir, exist_ok=True)

    # Overall performance
    ys = [[x.value for x in y] for y in overall_results]
    y_errs = _confidence_error_bars(overall_results)

    make_bar_chart(
        ys,
        output_dir,
        'overall',
        output_fig_format='png',
        fig_size=(8, 6),
        sys_names=sys_names,
        errs=y_errs,
        title=None,
        xticklabels=overall_metric_names,
        ylabel='metric value',
    )

    # Bucket performance: feature name, for example, sentence length
    for feature_name in progress(fg_results[0].keys()):
        # Make sure that buckets exist; after report i has been accepted, `buckets`
        # holds exactly i + 1 entries
        buckets: list[list[BucketPerformance]] = []
        for i, fg_result in enumerate(fg_results):
            if feature_name not in fg_result:
                get_logger().error(
                    f'error: feature {feature_name} not in {reports[i]}')
            else:
                buckets.append(fg_result[feature_name])
                # Compare the bucket labels of this report with the first report
                bnames0, bnames = [x.bucket_interval for x in buckets[0]
                                   ], [x.bucket_interval for x in buckets[-1]]
                if len(bnames0) != len(bnames):
                    get_logger().error(
                        f'error: different number of buckets for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
                    buckets = []
                elif bnames0 != bnames:
                    get_logger().warning(
                        f'warning: different bucket labels for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
            # Stop at the first report that was not accepted
            if len(buckets) != i + 1:
                break
        if len(buckets) != len(reports):
            continue

        bucket0_intervals = [x.bucket_interval for x in buckets[0]]
        bucket_metrics = [x.metric_name for x in buckets[0][0].performances]
        for metric_id, metric_name in enumerate(bucket_metrics):

            # For every system, the performance on this metric in each bucket
            performances: list[list[Performance]] = [
                [x.performances[metric_id] for x in y] for y in buckets
            ]
            ys = [[x.value for x in y] for y in performances]
            y_errs = _confidence_error_bars(performances)

            make_bar_chart(
                ys,
                output_dir,
                f'{feature_name}_{metric_name}',
                output_fig_format='png',
                fig_size=(8, 6),
                sys_names=sys_names,
                errs=y_errs,
                title=None,
                xlabel=feature_name,
                xticklabels=bucket0_intervals,
                ylabel=metric_name,
            )
Beispiel #13
0
    def bucketing_samples(
        self,
        sys_info: SysOutputInfo,
        sys_output: list[dict],
        active_features: list[str],
        metric_stats: list[MetricStats],
    ) -> dict[str, list[BucketPerformance]]:
        """
        Separate samples, and then spans, into buckets and calculate performance over
        them

        :param sys_info: Information about the system output
        :param sys_output: The system output itself, already annotated with features;
            each sample provides its spans under 'span_info'
        :param active_features: The features to perform bucketing over
        :param metric_stats: The stats passed on to the sentence-level bucketing
        :return: a dictionary of feature name -> list of performances by bucket
        """
        features = unwrap(sys_info.features)

        # Active features that are registered directly in `features` are handled as
        # sentence-level features, all remaining ones as span-level features
        sent_feats: list[str] = []
        span_feats: list[str] = []
        for name in active_features:
            if name in features:
                sent_feats.append(name)
            else:
                span_feats.append(name)

        # First, get the buckets for sentences using the standard protocol
        performances_over_bucket = super().bucketing_samples(
            sys_info, sys_output, sent_feats, metric_stats)

        # Build one bucket case per span of every sample, kept next to the span so
        # that feature values can be read from it below
        case_spans: list[tuple[BucketCaseLabeledSpan, Span]] = []
        for sample_id, my_output in enumerate(sys_output):
            for span_info in my_output['span_info']:
                span = cast(Span, span_info)
                # The span tag holds the true and the predicted tag, space-separated
                true_tag, pred_tag = unwrap(span.span_tag).split(' ')
                case = BucketCaseLabeledSpan(
                    sample_id=sample_id,
                    token_span=unwrap(span.span_pos),
                    char_span=unwrap(span.span_char_pos),
                    orig_str='tokens',
                    text=unwrap(span.span_text),
                    true_label=true_tag,
                    predicted_label=pred_tag,
                )
                case_spans.append((case, span))

        # Second, bucket the span-level features
        for feature_name in progress(span_feats, desc="span-level bucketing"):
            bucket_info = (features["true_span_info"].feature.
                           feature[feature_name].bucket_info)

            # The bucketing method is looked up by name in the bucketing module
            bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
                bucketing, bucket_info.method)

            if feature_name != 'span_tag':
                sample_features = [(case, getattr(span, feature_name))
                                   for case, span in case_spans]
            else:
                # Span tag is special because we keep track of both labels, keep
                # just gold
                sample_features = [(case, unwrap(span.span_tag).split(' ')[0])
                                   for case, span in case_spans]

            bucketed_samples = bucket_func(
                sample_features=sample_features,
                bucket_number=bucket_info.number,
                bucket_setting=bucket_info.setting,
            )

            # evaluating bucket: get bucket performance
            performances_over_bucket[
                feature_name] = self.get_bucket_performance_seqlab(
                    sys_info,
                    sys_output,
                    bucketed_samples,
                )
        return performances_over_bucket