def get_pairwise_performance_gap(
    sys1: SysOutputInfo, sys2: SysOutputInfo
) -> SysOutputInfo:
    sys = copy.deepcopy(sys1)

    orm, or1, or2 = (unwrap(x.results.overall) for x in (sys, sys1, sys2))
    for metric_name, performance_unit in orm.items():
        orm[metric_name].value = float(or1[metric_name].value) - float(
            or2[metric_name].value
        )
        orm[metric_name].confidence_score_low = None
        orm[metric_name].confidence_score_high = None

    fgr, fgr1, fgr2 = (unwrap(x.results.fine_grained) for x in (sys, sys1, sys2))
    for bucket_attr, buckets in fgr.items():
        for bucket_id, bucket in enumerate(buckets):
            for perf_id, perf in enumerate(bucket.performances):
                perf.value = float(
                    fgr1[bucket_attr][bucket_id].performances[perf_id].value
                ) - float(fgr2[bucket_attr][bucket_id].performances[perf_id].value)
                # TODO(gneubig): these could be done via pairwise bootstraps
                perf.confidence_score_low = None
                perf.confidence_score_high = None

    return sys
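# Illustration (not from the library): a minimal sketch of the same
# "difference of overall scores" arithmetic on plain dicts, so the intent of
# get_pairwise_performance_gap is visible without constructing
# SysOutputInfo/Performance objects. The metric names and values are made up.
sys1_overall = {'Accuracy': 0.83, 'F1': 0.79}
sys2_overall = {'Accuracy': 0.78, 'F1': 0.81}
gap = {m: round(sys1_overall[m] - sys2_overall[m], 6) for m in sys1_overall}
print(gap)  # {'Accuracy': 0.05, 'F1': -0.02}; positive means sys1 is better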
def get_tasks(task: TaskType, system_outputs: list[str]) -> list[TaskType]:
    """
    Get the task for each system output.
    :param task: Explicitly specified task. Used if present.
    :param system_outputs: System output files; the task is loaded from the
        metadata in these files if an explicit task is not set
    :return: A list of task types, one for each system
    """
    real_tasks: list[TaskType] = []
    if task:
        real_tasks = [task] * len(system_outputs)
        if task not in TaskType.list():
            raise ValueError(
                f'Task name {task} was not recognized. ExplainaBoard currently '
                f'supports: {TaskType.list()}')
        return real_tasks
    else:
        for sys_output in system_outputs:
            # give me a task, or give me death (by exception)
            task_or_die: TaskType | None = None
            msg: str = ''
            try:
                metadata = FileLoaderMetadata.from_file(sys_output)
                task_or_die = TaskType(unwrap(metadata.task_name))
            except Exception as e:
                msg = str(e)
            if task_or_die is None:
                raise ValueError(
                    'Must either specify a task explicitly or have one '
                    'specified in metadata, but could find neither for '
                    f'{sys_output}. {msg}')
            real_tasks.append(unwrap(task_or_die))
        return real_tasks
def _complete_features(self, sys_info: SysOutputInfo, sys_output: list[dict],
                       external_stats=None) -> list[str]:
    """
    This function takes in meta-data about system outputs, system outputs, and a
    few other optional pieces of information, then calculates feature functions
    and modifies `sys_output` to add these feature values
    :param sys_info: Information about the system output
    :param sys_output: The system output itself
    :param external_stats: Training set statistics that are used to calculate
        training set specific features
    :return: The features that are active (e.g. skipping training set features
        when no training set available)
    """
    sys_features = unwrap(sys_info.features)
    active_features = list(
        sys_features.get_bucket_features(
            include_training_dependent=external_stats is not None))

    # One pass over the test set to find token test frequency
    all_tokens = [
        unwrap(sys_info.source_tokenizer)(x['output']) for x in sys_output
    ]
    all_log_probs = [self._get_predicted_label(x) for x in sys_output]
    test_freq: dict[str, int] = {}
    for tokens in all_tokens:
        for tok in tokens:
            test_freq[tok] = test_freq.get(tok, 0) + 1

    sent_feats: list[str] = []
    tok_feats: list[str] = []
    for x in active_features:
        (sent_feats if (x in sys_features) else tok_feats).append(x)

    for _id, (dict_sysout, tokens, log_probs) in progress(
            enumerate(zip(sys_output, all_tokens, all_log_probs)),
            desc="featurizing"):
        # Get values of bucketing features
        text = dict_sysout["output"]

        # sentence length
        dict_sysout["text_length"] = len(tokens)
        dict_sysout["text_chars"] = len(text)

        # sentence-level training set dependent features
        if external_stats is not None:
            dict_sysout["num_oov"] = self._get_num_oov(tokens, external_stats)
            dict_sysout["fre_rank"] = self._get_fre_rank(tokens, external_stats)

        # token-level features, including per-token log probabilities
        dict_sysout["tok_info"] = self._complete_tok_features(
            tokens, log_probs, test_freq, statistics=external_stats)

    return active_features
def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
    return {
        'source_vocab': accumulate_vocab_from_samples(
            samples, lambda x: x['source'], unwrap(sys_info.source_tokenizer)
        ),
        'target_vocab': accumulate_vocab_from_samples(
            samples, lambda x: x['reference'], unwrap(sys_info.target_tokenizer)
        ),
    }
def _get_answer_length(self, sys_info: SysOutputInfo, existing_features: dict):
    if isinstance(existing_features["answers"]["text"], list):
        return len(
            unwrap(sys_info.source_tokenizer)(
                existing_features["answers"]["text"][0]))
    else:
        return len(
            unwrap(sys_info.source_tokenizer)(
                existing_features["answers"]["text"]))
def bucket_attribute_specified_bucket_interval(
    sample_features: list[tuple[BucketCase, T]],
    bucket_number: int,
    bucket_setting: list[tuple],
) -> list[BucketCaseCollection]:
    intervals = unwrap(bucket_setting)
    bucket2examp: dict[tuple, list[BucketCase]] = {k: list() for k in intervals}

    if isinstance(list(intervals)[0][0], str):
        # discrete value, such as entity tags
        for k, v in sample_features:
            if v in bucket2examp:
                bucket2examp[(v, )].append(k)
    else:
        for examp, value in sample_features:
            res_key = find_key(bucket2examp, value)
            if res_key is None:
                continue
            bucket2examp[res_key].append(examp)

    bucket_collections = [
        BucketCaseCollection((k, ), v) for k, v in bucket2examp.items()
    ]
    return bucket_collections
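# Illustration (not from the library): how a find_key-style interval lookup
# assigns continuous feature values to pre-specified buckets. The interval
# tuples and sample values below are made-up stand-ins for a bucket_setting.
intervals = [(0, 10), (11, 20), (21, 100)]

def _find_interval(value):
    # return the first (low, high) interval containing value, or None
    return next((iv for iv in intervals if iv[0] <= value <= iv[1]), None)

buckets = {iv: [] for iv in intervals}
for sample_id, feature_value in [(0, 3), (1, 15), (2, 42), (3, 200)]:
    key = _find_interval(feature_value)
    if key is None:
        continue  # out-of-range samples are skipped, as in the function above
    buckets[key].append(sample_id)
# buckets == {(0, 10): [0], (11, 20): [1], (21, 100): [2]}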
def _statistics_func(self, samples, sys_info: SysOutputInfo):
    vocab: dict[str, float] = {}
    length_fre: dict[int, float] = {}
    total_samps = 0
    tokenizer = unwrap(sys_info.source_tokenizer)
    for sample in progress(samples):
        text = sample["text"]
        tokens = tokenizer(text)
        length = len(tokens)

        length_fre[length] = length_fre.get(length, 0.0) + 1.0

        # update vocabulary
        for w in tokens:
            vocab[w] = vocab.get(w, 0.0) + 1.0

        total_samps += 1

    # the rank of each word based on its frequency
    sorted_dict = {
        key: rank
        for rank, key in enumerate(sorted(set(vocab.values()), reverse=True), 1)
    }
    vocab_rank = {k: sorted_dict[v] for k, v in vocab.items()}

    for k, v in length_fre.items():
        length_fre[k] = v * 1.0 / total_samps

    return {
        "vocab": vocab,
        "vocab_rank": vocab_rank,
        "length_fre": length_fre,
    }
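# Illustration (not from the library): the dense "rank by frequency" trick used
# above, worked on a toy vocabulary. Words sharing a count share a rank, and
# rank 1 is the most frequent word.
vocab = {"the": 5.0, "cat": 2.0, "sat": 2.0, "mat": 1.0}
rank_of_count = {
    count: rank
    for rank, count in enumerate(sorted(set(vocab.values()), reverse=True), 1)
}
vocab_rank = {word: rank_of_count[count] for word, count in vocab.items()}
# vocab_rank == {"the": 1, "cat": 2, "sat": 2, "mat": 3}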
def _gen_metric_stats(self, sys_info: SysOutputInfo,
                      sys_output: list[dict]) -> list[MetricStats]:
    """Generate sufficient statistics for scoring different metrics.

    :param sys_info: Information about the system outputs
    :param sys_output: The system output itself
    :return: Statistics sufficient for scoring
    """
    metrics = unwrap(self._get_metrics(sys_info))
    true_data = [self._get_true_label(x) for x in sys_output]
    pred_data = [self._get_predicted_label(x) for x in sys_output]
    # rank of the true entity in the predictions
    rank_data = [self._get_rank_data(x) for x in sys_output]

    if any(item is None for item in rank_data):
        raise ValueError(
            'Some data points do not have rank information; check system outputs.'
        )

    metric_stats = []
    for metric in metrics:
        if isinstance(metric, (MeanReciprocalRank, MeanRank, Hits)):
            metric_stats.append(metric.calc_stats_from_rank(rank_data))
        else:
            metric_stats.append(
                metric.calc_stats_from_data(true_data, pred_data))
    return metric_stats
def _get_fre_rank(self, sys_info: SysOutputInfo, existing_features: dict,
                  statistics: Any):
    return explainaboard.utils.feature_funcs.feat_freq_rank(
        existing_features,
        statistics,
        lambda x: x['context'],
        unwrap(sys_info.source_tokenizer),
    )
def _get_ref_fre_rank(self, sys_info: SysOutputInfo, existing_features: dict,
                      statistics: Any):
    return explainaboard.utils.feature_funcs.feat_freq_rank(
        existing_features,
        statistics['target_vocab'],
        lambda x: x['reference'],
        unwrap(sys_info.target_tokenizer),
    )
def _get_src_num_oov(self, sys_info: SysOutputInfo, existing_features: dict,
                     statistics: Any):
    return explainaboard.utils.feature_funcs.feat_num_oov(
        existing_features,
        statistics['source_vocab'],
        lambda x: x['source'],
        unwrap(sys_info.source_tokenizer),
    )
def _get_absolute_blank_position(self, sys_info: SysOutputInfo,
                                 existing_features: dict):
    source_tokens = unwrap(sys_info.source_tokenizer)(
        existing_features["context"]).strs
    if existing_features["question_mark"] not in source_tokens:
        return 0
    else:
        return source_tokens.index(existing_features["question_mark"])
def _get_config(self, config: Optional[MetricConfig] = None) -> MetricConfig:
    """
    Get the configuration, or the overriding configuration if one is passed in
    :param config: Optional configuration to override the default configuration
    :return: Either the default or overriding configuration
    """
    ret_config: MetricConfig = (
        unwrap(config) if config is not None else self.config
    )
    return ret_config
def _complete_features(self, sys_info: SysOutputInfo, sys_output: list[dict],
                       external_stats=None) -> list[str]:
    """
    This function takes in meta-data about system outputs, system outputs, and a
    few other optional pieces of information, then calculates feature functions
    and modifies `sys_output` to add these feature values
    :param sys_info: Information about the system output
    :param sys_output: The system output itself
    :param external_stats: Training set statistics that are used to calculate
        training set specific features
    :return: The features that are active (e.g. skipping training set features
        when no training set available)
    """
    sys_features = unwrap(sys_info.features)
    active_features = list(
        sys_features.get_bucket_features(
            include_training_dependent=external_stats is not None))

    sent_feats: list[str] = []
    tok_feats: list[str] = []
    for x in active_features:
        (sent_feats if (x in sys_features) else tok_feats).append(x)

    for _id, dict_sysout in progress(enumerate(sys_output), desc="featurizing"):
        # Get values of bucketing features
        tokens = dict_sysout["tokens"]

        # sentence_length
        dict_sysout["sentence_length"] = len(tokens)

        # entity density
        dict_sysout["span_density"] = len(
            self._span_ops.get_spans_simple(
                tags=dict_sysout["true_tags"])) / len(tokens)

        # sentence-level training set dependent features
        if external_stats is not None:
            dict_sysout["num_oov"] = self._get_num_oov(tokens, external_stats)
            dict_sysout["fre_rank"] = self._get_fre_rank(tokens, external_stats)

        # span features for true and predicted spans
        dict_sysout["span_info"] = self._complete_span_features(
            tokens,
            dict_sysout["true_tags"],
            dict_sysout["pred_tags"],
            statistics=external_stats,
        )

    # This is not used elsewhere, so just keep it as-is
    return active_features
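# Illustration (not from the library): what the span-density feature measures,
# using a toy BIO tag counter in place of the real _span_ops helper. The counter
# assumes well-formed BIO tags; it is not the library's span extraction.
def _count_bio_spans(tags: list[str]) -> int:
    # each "B-" tag opens one contiguous span
    return sum(1 for tag in tags if tag.startswith("B-"))

true_tags = ["B-PER", "I-PER", "O", "B-LOC", "O", "O"]
span_density = _count_bio_spans(true_tags) / len(true_tags)  # 2 / 6 ≈ 0.33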
def _get_length_fre(self, sys_info: SysOutputInfo, existing_features: dict,
                    statistics: Any):
    length_fre = 0
    length = len(
        unwrap(sys_info.source_tokenizer)(existing_features["text"]))

    if length in statistics['length_fre'].keys():
        length_fre = statistics['length_fre'][length]

    return length_fre
def get_bucket_performance_lm(
    self,
    sys_info: SysOutputInfo,
    sys_output: list[dict],
    samples_over_bucket: list[BucketCaseCollection],
) -> list[BucketPerformance]:
    """
    This function defines how to get bucket-level performance w.r.t a given
    feature (e.g., sentence length)
    :param sys_info: Information about the system output
    :param sys_output: The system output itself
    :param samples_over_bucket: a list of bucket collections, each holding the
        cases that fall into that bucket
    :return: bucket_performances: a list of bucket performances, one per bucket
    """
    bucket_performances = []
    for bucket_collection in samples_over_bucket:
        bucket_metrics = [
            x.to_metric() for x in unwrap(sys_info.metric_configs)
        ]

        log_probs = []
        for bucket_case in bucket_collection.samples:
            bcp = cast(BucketCaseSpan, bucket_case)
            log_probs.append(sys_output[bcp.sample_id]['tok_info'][
                bcp.token_span[0]]['tok_log_prob'])

        bucket_samples = self._subsample_bucket_cases(bucket_collection.samples)

        bucket_performance = BucketPerformance(
            bucket_interval=bucket_collection.interval,
            n_samples=len(bucket_collection),
            bucket_samples=bucket_samples,
        )
        for metric in bucket_metrics:
            metric_val = metric.evaluate(None, log_probs,
                                         conf_value=sys_info.conf_value)
            conf_low, conf_high = (metric_val.conf_interval
                                   if metric_val.conf_interval else (None, None))
            performance = Performance(
                metric_name=metric.config.name,
                value=metric_val.value,
                confidence_score_low=conf_low,
                confidence_score_high=conf_high,
            )
            bucket_performance.performances.append(performance)

        bucket_performances.append(bucket_performance)
    bucket_performances.sort(key=lambda x: x.bucket_interval)

    return bucket_performances
def process(self, metadata: dict, sys_output: list[dict]) -> SysOutputInfo:
    # TODO(Pengfei): Rethink if this is a good way to manipulate `system_output`
    overall_statistics = self.get_overall_statistics(metadata, sys_output)
    sys_info = unwrap(overall_statistics.sys_info)
    metric_stats = overall_statistics.metric_stats
    active_features = unwrap(overall_statistics.active_features)
    overall_results = sys_info.results.overall
    performance_over_bucket = self.bucketing_samples(
        sys_info, sys_output, active_features, metric_stats=metric_stats
    )
    self.sort_bucket_info(
        performance_over_bucket,
        sort_by=metadata.get('sort_by', 'key'),
        sort_by_metric=metadata.get('sort_by_metric', 'first'),
        sort_ascending=metadata.get('sort_ascending', False),
    )
    sys_info.results = Result(
        overall=overall_results, fine_grained=performance_over_bucket
    )
    return sys_info
def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
    if sys_info.source_language is None or sys_info.target_language is None:
        raise ValueError(
            'source or target languages must be specified to load '
            f'translation data, but source={sys_info.source_language} '
            f', target={sys_info.target_language}'
        )
    src = FileLoaderField(('translation', sys_info.source_language), '', str)
    trg = FileLoaderField(('translation', sys_info.target_language), '', str)
    return {
        'source_vocab': accumulate_vocab_from_samples(
            samples,
            lambda x: FileLoader.find_field(x, src),
            unwrap(sys_info.source_tokenizer),
        ),
        'target_vocab': accumulate_vocab_from_samples(
            samples,
            lambda x: FileLoader.find_field(x, trg),
            unwrap(sys_info.target_tokenizer),
        ),
    }
def _fetch_metric_stats(self, metric_stats: dict[str, Any]):
    """
    A utility function used to lazily fetch the actual scoring dict when
    it's necessary.
    """
    if 'request_id' in metric_stats:
        client: AsyncClient = unwrap(self._eaas_client)
        eaas_stats: dict[str, Any] = client.wait_and_get_result(
            metric_stats['request_id'])
        metric_stats.clear()
        for k, v in eaas_stats.items():
            metric_stats[k] = v
def get_overall_statistics(
    self, metadata: dict, sys_output: list[dict]
) -> OverallStatistics:
    """
    Get the overall statistics information, including performance, of the
    system output
    :param metadata: The metadata of the system
    :param sys_output: The system output itself
    """
    if metadata is None:
        metadata = {}
    if "task_name" not in metadata.keys():
        metadata["task_name"] = self.task_type().value

    sys_info = SysOutputInfo.from_dict(metadata)
    if sys_info.metric_configs is None:
        sys_info.metric_configs = self.default_metrics(
            source_language=sys_info.source_language,
            target_language=sys_info.target_language,
        )
    if sys_info.target_tokenizer is None:
        sys_info.target_tokenizer = get_default_tokenizer(
            task_type=self.task_type(), lang=sys_info.target_language
        )
    if sys_info.source_tokenizer is None:
        sys_info.source_tokenizer = (
            sys_info.target_tokenizer
            if sys_info.source_language == sys_info.target_language
            else get_default_tokenizer(
                task_type=self.task_type(), lang=sys_info.source_language
            )
        )

    # declare customized features: _features will be updated
    custom_features: dict = metadata.get('custom_features', {})
    sys_info.features = self._customize_features(custom_features)

    # get scoring statistics
    metric_stats = unwrap(self._gen_metric_stats(sys_info, sys_output))
    external_stats = self._gen_external_stats(sys_info, self._statistics_func)
    active_features = self._complete_features(
        sys_info, sys_output, external_stats=external_stats
    )
    overall_results = self.get_overall_performance(
        sys_info, sys_output, metric_stats=metric_stats
    )
    sys_info.results = Result(
        overall=overall_results, calibration=None, fine_grained=None
    )
    return OverallStatistics(sys_info, metric_stats, active_features)
def _gen_metric_stats(
    self, sys_info: SysOutputInfo, sys_output: list[dict]
) -> list[MetricStats]:
    """Generate sufficient statistics for scoring different metrics.

    :param sys_info: Information about the system outputs
    :param sys_output: The system output itself
    :return: Statistics sufficient for scoring
    """
    metrics = unwrap(self._get_metrics(sys_info))
    true_data = [self._get_true_label(x) for x in sys_output]
    pred_data = [self._get_predicted_label(x) for x in sys_output]
    metric_stats = []
    for metric in metrics:
        metric_stats.append(metric.calc_stats_from_data(true_data, pred_data))
    return metric_stats
def bucketing_samples(
    self,
    sys_info: SysOutputInfo,
    sys_output: list[dict],
    active_features: list[str],
    metric_stats: list[MetricStats],
) -> dict[str, list[BucketPerformance]]:
    features = unwrap(sys_info.features)

    sent_feats: list[str] = []
    tok_feats: list[str] = []
    for x in active_features:
        (sent_feats if (x in features) else tok_feats).append(x)

    # First, get the buckets for sentences using the standard protocol
    performances_over_bucket = super().bucketing_samples(
        sys_info, sys_output, sent_feats, metric_stats)

    # Bucketing
    feature_lists = self._get_feature_lists(sys_output, tok_feats)

    for i, feature_name in enumerate(
            progress(tok_feats, desc="token-level bucketing")):
        my_feature = features["tok_info"].feature.feature[feature_name]
        bucket_info = my_feature.bucket_info

        # Get buckets for true spans
        bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
            bucketing, bucket_info.method)

        samples_over_bucket = bucket_func(
            sample_features=feature_lists[i],
            bucket_number=bucket_info.number,
            bucket_setting=bucket_info.setting,
        )

        # evaluating bucket: get bucket performance
        performances_over_bucket[feature_name] = self.get_bucket_performance_lm(
            sys_info,
            sys_output,
            samples_over_bucket,
        )
    return performances_over_bucket
def bucketing_samples( self, sys_info: SysOutputInfo, sys_output: list[dict], active_features: list[str], metric_stats: list[MetricStats], ) -> dict[str, list[BucketPerformance]]: """ Separate samples into buckets and calculate performance over them :param sys_info: Information about the system output :param sys_output: The system output itself, already annotated with features :param active_features: The features to perform bucketing over :param metric_stats: The stats from which to calculate performance :return: performances_over_bucket: a dictionary of feature name -> list of performances by bucket """ sys_features = unwrap(sys_info.features) # Bucketing performances_over_bucket: dict[str, list[BucketPerformance]] = {} for feature_name in progress(active_features, desc="sample-level bucketing"): # Preparation for bucketing bucket_func: Callable[..., list[BucketCaseCollection]] = getattr( explainaboard.utils.bucketing, sys_features[feature_name].bucket_info.method, ) samples_over_bucket = bucket_func( sample_features=[ (BucketCase(x), sys_output[x][feature_name]) for x in range(len(sys_output)) ], bucket_number=sys_features[feature_name].bucket_info.number, bucket_setting=sys_features[feature_name].bucket_info.setting, ) # evaluating bucket: get bucket performance performances_over_bucket[feature_name] = self.get_bucket_performance( sys_info, sys_output, samples_over_bucket, metric_stats=metric_stats, ) return performances_over_bucket
def bucketing_samples(
    self,
    sys_info: SysOutputInfo,
    sys_output: list[dict],
    active_features: list[str],
    metric_stats: list[MetricStats],
) -> dict[str, list[BucketPerformance]]:
    features = unwrap(sys_info.features)

    sent_feats: list[str] = []
    tok_feats: list[str] = []
    for x in active_features:
        (sent_feats if (x in features) else tok_feats).append(x)

    # First, get the buckets for sentences using the standard protocol
    performances_over_bucket = super().bucketing_samples(
        sys_info, sys_output, sent_feats, metric_stats)

    all_sample_features = self._get_sample_features(sys_output, tok_feats)

    # Second, get the buckets for tokens
    for feature_id, feature_name in enumerate(
            progress(tok_feats, desc="bucketing token features")):
        # Choose behavior based on whether this is a feature of samples or spans
        my_feature = features["ref_tok_info"].feature.feature[feature_name]
        bucket_info = my_feature.bucket_info

        # Get buckets for true spans
        bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
            bucketing, bucket_info.method)

        sample_features = [(case, feats[feature_id])
                           for case, feats in all_sample_features]

        # NOTE (assumption): the original snippet ends mid-call here; the keyword
        # arguments below mirror the token-level bucketing variant above, and the
        # remainder of the function (scoring each bucket into
        # performances_over_bucket and returning it) is not shown.
        samples_over_bucket = bucket_func(
            sample_features=sample_features,
            bucket_number=bucket_info.number,
            bucket_setting=bucket_info.setting,
        )
def _complete_span_features(self, sentence, true_tags, pred_tags,
                            statistics=None) -> list[Span]:
    # Get training set stats if they exist
    has_stats = statistics is not None and len(statistics) > 0
    econ_dic = statistics["econ_dic"] if has_stats else None
    efre_dic = statistics["efre_dic"] if has_stats else None

    self._span_ops.set_resources(resources={
        "has_stats": has_stats,
        "econ_dic": econ_dic,
        "efre_dic": efre_dic,
    })

    # Merge the spans together so that the span tag is "true_tag pred_tag",
    # using "_DEFAULT_TAG" if that span doesn't exist in the true or predicted
    # tags respectively
    # TODO(gneubig): This is probably calculating features twice, could be just once
    true_spans = self._span_ops.get_spans(toks=sentence, tags=true_tags)
    pred_spans = self._span_ops.get_spans(toks=sentence, tags=pred_tags)
    merged_spans = {}
    for span in true_spans:
        span.span_tag = f'{span.span_tag} {self._DEFAULT_TAG}'
        merged_spans[span.span_pos] = span
    for span in pred_spans:
        merged_span = merged_spans.get(span.span_pos)
        if not merged_span:
            span.span_tag = f'{self._DEFAULT_TAG} {span.span_tag}'
            merged_spans[span.span_pos] = span
        else:
            true_tag, _ = unwrap(merged_span.span_tag).split(' ')
            merged_span.span_tag = f'{true_tag} {span.span_tag}'

    return list(merged_spans.values())
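# Illustration (not from the library): the "true_tag pred_tag" merge on toy
# spans, where each span is identified by its (start, end) position and "O"
# stands in for the default tag used when a span exists on only one side.
DEFAULT = "O"
true_spans = {(0, 2): "PER", (5, 6): "LOC"}
pred_spans = {(0, 2): "ORG", (8, 9): "LOC"}

merged = {pos: f"{tag} {DEFAULT}" for pos, tag in true_spans.items()}
for pos, tag in pred_spans.items():
    if pos in merged:
        merged[pos] = f"{merged[pos].split(' ')[0]} {tag}"
    else:
        merged[pos] = f"{DEFAULT} {tag}"
# merged == {(0, 2): "PER ORG", (5, 6): "LOC O", (8, 9): "O LOC"}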
def _complete_features(
    self, sys_info: SysOutputInfo, sys_output: list[dict], external_stats=None
) -> list[str]:
    """
    This function takes in meta-data about system outputs, system outputs, and a
    few other optional pieces of information, then calculates feature functions
    and modifies `sys_output` to add these feature values
    :param sys_info: Information about the system output
    :param sys_output: The system output itself
    :param external_stats: External statistics that are used to calculate training
        set specific features
    :return: The features that are active (e.g. skipping training set features
        when no training set available)
    """
    bucket_feature_funcs: dict[str, tuple[Callable, bool]] = {}
    sys_features = unwrap(sys_info.features)

    for bucket_feature in sys_features.get_bucket_features():
        feature_info = sys_features[bucket_feature]

        # Skip training set features if no stats
        if external_stats is None and feature_info.require_training_set:
            continue

        feature_func = self._get_feature_func(
            bucket_feature, feature_info.is_custom
        )
        bucket_feature_funcs[bucket_feature] = (
            feature_func,
            feature_info.require_training_set,
        )

    for _id, dict_sysout in progress(enumerate(sys_output), desc="featurizing"):
        # Get values of bucketing features
        for (
            bucket_key,
            (
                bucket_func,
                training_dependent,
            ),
        ) in bucket_feature_funcs.items():
            feature_info = sys_features[bucket_key]

            # handles user-defined features
            if feature_info.is_custom:
                # TODO(Pengfei): this should be generalized
                feature_value = (
                    "_".join(dict_sysout[bucket_key])
                    if isinstance(dict_sysout[bucket_key], list)
                    else dict_sysout[bucket_key]
                )
                dict_sysout[bucket_key] = feature_value

            # handles all other features
            else:
                dict_sysout[bucket_key] = (
                    bucket_func(sys_info, dict_sysout, external_stats)
                    if training_dependent
                    else bucket_func(sys_info, dict_sysout)
                )
    return list(bucket_feature_funcs.keys())
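# Illustration (not from the library): the (feature_func, requires_training_set)
# dispatch table used above, applied to a toy example dict. The feature names
# and functions here are made up.
example = {"text": "the cat sat"}
training_vocab = {"the", "cat"}

feature_funcs = {
    "text_length": (lambda ex: len(ex["text"].split()), False),
    "num_oov": (lambda ex, stats: sum(w not in stats for w in ex["text"].split()), True),
}

for name, (func, needs_training_set) in feature_funcs.items():
    example[name] = (
        func(example, training_vocab) if needs_training_set else func(example)
    )
# example now also holds {"text_length": 3, "num_oov": 1}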
def _get_metrics(self, sys_info: SysOutputInfo) -> list[Metric]:
    return [config.to_metric() for config in unwrap(sys_info.metric_configs)]
def _get_head_entity_length(self, sys_info: SysOutputInfo,
                            existing_features: dict):
    return len(
        unwrap(sys_info.source_tokenizer)(
            existing_features["true_head_decipher"]))
def draw_bar_chart_from_reports(reports: list[str], output_dir: str,
                                sys_names: list[str] | None = None) -> None:
    """
    Draw bar charts from report files generated by ExplainaBoard
    :param reports: Reports to plot
    :param output_dir: Directory to write the charts to
    :param sys_names: System names to use in the charts; defaults to the report
        file names
    :return: None
    """
    # TODO(gneubig): This should get the system name from inside the report
    if sys_names is None:
        sys_names = [os.path.basename(x).replace('.json', '') for x in reports]
    elif len(sys_names) != len(reports):
        raise ValueError('Length of sys_names must equal that of reports')

    report_info: list[SysOutputInfo] = []
    for report in reports:
        with open(report) as fin:
            report_info.append(SysOutputInfo.from_dict(json.load(fin)))
    overall_results = [
        list(unwrap(x.results.overall).values()) for x in report_info
    ]
    overall_metric_names = list(unwrap(report_info[0].results.overall).keys())
    fg_results = [unwrap(x.results.fine_grained) for x in report_info]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Overall performance
    ys = [[x.value for x in y] for y in overall_results]
    y_errs = None
    if overall_results[0][0].confidence_score_low is not None:
        y_errs = [(
            [x.value - unwrap(x.confidence_score_low) for x in y],
            [unwrap(x.confidence_score_high) - x.value for x in y],
        ) for y in overall_results]

    make_bar_chart(
        ys,
        output_dir,
        'overall',
        output_fig_format='png',
        fig_size=(8, 6),
        sys_names=sys_names,
        errs=y_errs,
        title=None,
        xticklabels=overall_metric_names,
        ylabel='metric value',
    )

    # Bucket performance: feature name, for example, sentence length
    for feature_name in progress(fg_results[0].keys()):
        # Make sure that buckets exist
        buckets: list[list[BucketPerformance]] = []
        for i, fg_result in enumerate(fg_results):
            if feature_name not in fg_result:
                get_logger().error(
                    f'error: feature {feature_name} not in {reports[i]}')
            else:
                buckets.append(fg_result[feature_name])
                bnames0 = [x.bucket_interval for x in buckets[0]]
                bnames = [x.bucket_interval for x in buckets[-1]]
                if len(bnames0) != len(bnames):
                    get_logger().error(
                        f'error: different number of buckets for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
                    buckets = []
                elif bnames0 != bnames:
                    get_logger().warning(
                        f'warning: different bucket labels for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
            if len(buckets) != i + 1:
                break
        if len(buckets) != len(reports):
            continue

        bucket0_intervals = [x.bucket_interval for x in buckets[0]]
        bucket_metrics = [x.metric_name for x in buckets[0][0].performances]
        for metric_id, metric_name in enumerate(bucket_metrics):
            performances: list[list[Performance]] = [
                [x.performances[metric_id] for x in y] for y in buckets
            ]
            ys = [[x.value for x in y] for y in performances]

            y_errs = None
            if performances[0][0].confidence_score_low is not None:
                y_errs = [(
                    [x.value - unwrap(x.confidence_score_low) for x in y],
                    [unwrap(x.confidence_score_high) - x.value for x in y],
                ) for y in performances]

            make_bar_chart(
                ys,
                output_dir,
                f'{feature_name}_{metric_name}',
                output_fig_format='png',
                fig_size=(8, 6),
                sys_names=sys_names,
                errs=y_errs,
                title=None,
                xlabel=feature_name,
                xticklabels=bucket0_intervals,
                ylabel=metric_name,
            )
def _get_attr_compression(self, sys_info: SysOutputInfo,
                          existing_features: dict):
    return len(
        unwrap(sys_info.source_tokenizer)(existing_features["source"])
    ) / len(unwrap(sys_info.target_tokenizer)(existing_features["reference"]))