def aggregate_stats(self, stats: MetricStats) -> np.ndarray:
    """
    Aggregate sufficient statistics from multiple examples into a single example
    :param stats: stats for every example
    :return: aggregated stats
    """
    if self.config.name in {'bleu', 'chrf'}:
        return np.sum(stats.get_data(), axis=0)
    else:
        return np.mean(stats.get_data(), axis=0)
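
# Illustrative sketch (made-up data, not library code): corpus-level metrics
# such as BLEU and chrF sum their per-example sufficient statistics before
# computing the final score, while accuracy-style metrics simply average a
# per-example value.
import numpy as np

per_example_stats = np.array([[3.0, 5.0],   # e.g. matches, total for example 1
                              [2.0, 4.0]])  # e.g. matches, total for example 2
summed = np.sum(per_example_stats, axis=0)     # -> [5., 9.]  (BLEU/chrF-style)
averaged = np.mean(per_example_stats, axis=0)  # -> [2.5, 4.5] (mean-style)
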
def calc_stats_from_data(
        self, true_data: list, pred_data: list,
        config: Optional[MetricConfig] = None) -> MetricStats:
    return MetricStats(
        np.array([(1.0 if x == y else 0.0)
                  for x, y in zip(true_data, pred_data)]))
def calc_stats_from_data(
        self, true_data: list, pred_data: list,
        config: Optional[MetricConfig] = None) -> MetricStats:
    """
    Take in either a list of floats (one value per example) or a list of lists
    of floats (one value per token for each example), and return stats with
    either a single value per example or sum+length rows.
    """
    if len(pred_data) == 0 or isinstance(pred_data[0], float):
        return MetricStats(np.array(pred_data))
    elif isinstance(pred_data[0], list):
        return MetricStats(np.array([[sum(x), len(x)] for x in pred_data]))
    else:
        t = type(pred_data[0])
        raise ValueError(
            f'Invalid type of pred_data for calc_stats_from_data {t}')
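
# Hypothetical usage sketch (made-up data, not part of the library): the two
# accepted input shapes and the sufficient statistics they would produce.
import numpy as np

sentence_scores = [-1.2, -0.7]            # one float per example
token_scores = [[-0.5, -0.7], [-1.0]]     # one list of floats per example

stats_a = np.array(sentence_scores)                           # [-1.2, -0.7]
stats_b = np.array([[sum(x), len(x)] for x in token_scores])  # [[-1.2, 2.], [-1., 1.]]
# Each row of stats_b stores (sum of token values, token count), so a later
# aggregation step can compute a length-weighted average.
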
def filter(self, indices: Union[list[int], np.ndarray]) -> MetricStats:
    """
    Return a view of these stats filtered down to the indicated indices
    """
    sdata: np.ndarray = self.get_data()
    if not isinstance(indices, np.ndarray):
        indices = np.array(indices)
    return MetricStats(sdata[indices])
def calc_stats_from_data(
        self, true_data: list, pred_data: list,
        config: Optional[MetricConfig] = None) -> MetricStats:
    return MetricStats(
        np.array(
            [self.mrr_val(t, p) for t, p in zip(true_data, pred_data)]))
def calc_stats_from_data(
    self,
    true_data: list[list[str]],
    pred_data: list[list[str]],
    config: Optional[MetricConfig] = None,
) -> MetricStats:
    """
    Return sufficient statistics necessary to compute f-score.
    :param true_data: True outputs
    :param pred_data: Predicted outputs
    :param config: Configuration, if overriding the default
    :return: Returns stats for each class (integer id c) in the following
        columns of MetricStats
        * c*stat_mult + 0: occurrences in the true output
        * c*stat_mult + 1: occurrences in the predicted output
        * c*stat_mult + 2: number of matches with the true output
    """
    # 1. Get span ops
    seq_config = cast(SeqF1ScoreConfig, config or self.config)
    if seq_config.tag_schema == 'bio':
        span_ops: SpanOps = BIOSpanOps()
    elif seq_config.tag_schema == 'bmes':
        span_ops = BMESSpanOps()
    else:
        raise ValueError(f'Illegal tag_schema {seq_config.tag_schema}')
    true_spans_list: list[list[tuple[str, int, int]]] = [
        span_ops.get_spans_simple(true_tags) for true_tags in true_data
    ]
    pred_spans_list: list[list[tuple[str, int, int]]] = [
        span_ops.get_spans_simple(pred_tags) for pred_tags in pred_data
    ]

    # 2. Get tag space
    all_classes = {
        span[0]
        for span in itertools.chain(
            itertools.chain.from_iterable(true_spans_list),
            itertools.chain.from_iterable(pred_spans_list),
        )
    }
    tag_ids = {k: v for v, k in enumerate(all_classes)}

    # 3. Create the sufficient statistics
    stat_mult = 3
    n_data, n_classes = len(true_data), len(tag_ids)
    # This is a bit memory inefficient if there's a large number of classes
    stats = np.zeros((n_data, n_classes * stat_mult))
    for i, (true_spans, pred_spans) in enumerate(
            zip(true_spans_list, pred_spans_list)):
        matched_spans = set(true_spans).intersection(pred_spans)
        for offset, spans in enumerate(
                (true_spans, pred_spans, matched_spans)):
            for span in spans:
                c = tag_ids[span[0]]
                stats[i, c * stat_mult + offset] += 1
    return MetricStats(stats)
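
# Minimal sketch of BIO span extraction, for illustration only -- this is not
# the library's BIOSpanOps.get_spans_simple implementation, just an example of
# the kind of (label, start, end) tuples consumed above.
def bio_spans(tags: list[str]) -> list[tuple[str, int, int]]:
    spans, start, label = [], None, None
    for i, tag in enumerate(tags):
        if tag.startswith('B-') or tag == 'O':
            if label is not None:
                spans.append((label, start, i))  # close the open span
            start, label = (i, tag[2:]) if tag.startswith('B-') else (None, None)
        # 'I-' tags simply extend the currently open span
    if label is not None:
        spans.append((label, start, len(tags)))
    return spans

# bio_spans(['B-PER', 'I-PER', 'O', 'B-LOC']) -> [('PER', 0, 2), ('LOC', 3, 4)]
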
def calc_stats_from_rank(
    self, rank_data: list, config: Optional[MetricConfig] = None
) -> MetricStats:
    # TODO(Pengfei): why do we need the 3rd argument?
    config = cast(HitsConfig, self._get_config(config))
    return MetricStats(
        np.array([(1.0 if rank <= config.hits_k else 0.0)
                  for rank in rank_data]))
def calc_stats_from_data(
    self, true_data: list, pred_data: list,
    config: Optional[MetricConfig] = None
) -> MetricStats:
    # TODO(Pengfei): why do we need the 3rd argument?
    config = cast(HitsConfig, self._get_config(config))
    return MetricStats(
        np.array([(1.0 if t in p[:config.hits_k] else 0.0)
                  for t, p in zip(true_data, pred_data)]))
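
# Illustrative sketch (made-up data, not library code): Hits@k gives each
# example a 1.0 if the true item appears in the top-k predictions, else 0.0;
# averaging these statistics yields the Hits@k score.
import numpy as np

true_items = ['a', 'b', 'c']
ranked_predictions = [['a', 'x', 'y'],   # hit at rank 1
                      ['x', 'y', 'b'],   # hit at rank 3 (outside k=2)
                      ['x', 'c', 'y']]   # hit at rank 2
k = 2
hits = np.array([1.0 if t in p[:k] else 0.0
                 for t, p in zip(true_items, ranked_predictions)])
# hits -> [1., 0., 1.]; Hits@2 = hits.mean() = 2/3
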
def aggregate_stats(self, stats: MetricStats) -> np.ndarray:
    """
    Aggregate sufficient statistics from multiple examples into a single example
    :param stats: stats for every example
    :return: aggregated stats
    """
    data = stats.get_data()
    if data.size == 0:
        return np.array(0.0)
    else:
        return np.sum(data, axis=0)
def calc_stats_from_data(
    self,
    true_data: list[Union[str, list[str]]],
    pred_data: list[str],
    config: Optional[MetricConfig] = None,
) -> MetricStats:
    # Normalize so that every example has a list of references
    true_data = [[x] if isinstance(x, str) else x for x in true_data]
    config = self._get_config(config)
    preprocessor = ExtractiveQAPreprocessor(language=config.source_language)
    # Score the prediction against each reference and keep the best match
    return MetricStats(
        np.array([
            max(self.sample_level_metric(t, p, preprocessor) for t in ts)
            for ts, p in zip(true_data, pred_data)
        ]))
def calc_stats_from_data(
        self, true_data: list, pred_data: list,
        config: Optional[MetricConfig] = None) -> MetricStats:
    """
    Return sufficient statistics necessary to compute f-score.
    :param true_data: True outputs
    :param pred_data: Predicted outputs
    :param config: Configuration, if overriding the default for this object
    :return: Returns stats for each class (integer id c) in the following
        columns of MetricStats
        * c*stat_mult + 0: occurrences in the true output
        * c*stat_mult + 1: occurrences in the predicted output
        * c*stat_mult + 2: number of matches with the true output
        * c*stat_mult + 3: number of matches with the predicted output
          (when self.separate_match=True only)
    """
    config = cast(F1ScoreConfig, self._get_config(config))
    stat_mult: int = 4 if config.separate_match else 3

    # Map each class to an integer id; ignored classes map to -1
    id_map: dict[str, int] = {}
    if config.ignore_classes is not None:
        for ignore_class in config.ignore_classes:
            id_map[ignore_class] = -1
    for word in itertools.chain(true_data, pred_data):
        if word not in id_map:
            id_map[word] = len(id_map)

    n_data = len(true_data)
    n_classes = len(id_map)
    # This is a bit memory inefficient if there's a large number of classes
    stats = np.zeros((n_data, n_classes * stat_mult))
    for i, (t, p) in enumerate(zip(true_data, pred_data)):
        tid, pid = id_map[t], id_map[p]
        if tid != -1:
            stats[i, tid * stat_mult + 0] += 1
        if pid != -1:
            stats[i, pid * stat_mult + 1] += 1
        # Only count matches for classes that are not ignored
        if tid == pid and tid != -1:
            stats[i, tid * stat_mult + 2] += 1
            if config.separate_match:
                stats[i, tid * stat_mult + 3] += 1
    return MetricStats(stats)
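
# Hypothetical follow-up sketch (made-up numbers, not library code): once the
# per-example rows are summed, micro precision/recall/F1 can be read off the
# aggregated columns described in the docstring above.
import numpy as np

stat_mult = 3
agg = np.array([4, 5, 3,    # class 0: true count, pred count, matches
                6, 5, 4])   # class 1: true count, pred count, matches
true_counts = agg[0::stat_mult].sum()    # 10
pred_counts = agg[1::stat_mult].sum()    # 10
matches = agg[2::stat_mult].sum()        # 7
precision = matches / pred_counts        # 0.7
recall = matches / true_counts           # 0.7
f1 = 2 * precision * recall / (precision + recall)  # 0.7
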
def calc_stats_from_data(
    self,
    true_edits_ldl: list[dict[str, list]],
    pred_edits_ldl: list[dict[str, list]],
    config: Optional[MetricConfig] = None,
) -> MetricStats:
    def _get_flatten_edits(edits: list[dict]):
        flatten_edits = []
        for edit in edits:
            start_idx, end_idx, corrections = (
                edit["start_idx"],
                edit["end_idx"],
                edit["corrections"],
            )
            for correction in corrections:
                flatten_edits.append((start_idx, end_idx, correction))
        return flatten_edits

    recall = []
    for true_edits_dl, pred_edits_dl in zip(true_edits_ldl, pred_edits_ldl):
        # Convert each dict of parallel lists into a list of per-edit dicts
        true_edits_ld = [
            dict(zip(true_edits_dl, t)) for t in zip(*true_edits_dl.values())
        ]
        pred_dicts_ld = [
            dict(zip(pred_edits_dl, t)) for t in zip(*pred_edits_dl.values())
        ]
        gold_flatten_edits = _get_flatten_edits(true_edits_ld)
        pred_flatten_edits = _get_flatten_edits(pred_dicts_ld)
        # Each gold edit counts as recalled if it appears among the predictions
        for gold_flatten_edit in gold_flatten_edits:
            if gold_flatten_edit in pred_flatten_edits:
                recall.append(1.0)
            else:
                recall.append(0.0)
    return MetricStats(np.array(recall))
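
# Illustrative sketch of the dict-of-lists to list-of-dicts conversion used
# above (made-up data, not part of the library).
edits_dl = {"start_idx": [0, 3],
            "end_idx": [1, 4],
            "corrections": [["the"], ["a", "an"]]}
edits_ld = [dict(zip(edits_dl, t)) for t in zip(*edits_dl.values())]
# edits_ld -> [{'start_idx': 0, 'end_idx': 1, 'corrections': ['the']},
#              {'start_idx': 3, 'end_idx': 4, 'corrections': ['a', 'an']}]
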
def calc_stats_from_rank(
        self, rank_data: list,
        config: Optional[MetricConfig] = None) -> MetricStats:
    return MetricStats(
        np.array([1.0 / rank for rank in rank_data if rank is not None]))
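
# Illustrative sketch (made-up data): the reciprocal-rank statistic is 1/rank
# of the first correct answer for each query; averaging these statistics gives
# the mean reciprocal rank (MRR).
import numpy as np

ranks = [1, 3, 2]                        # rank of the correct item per query
rr = np.array([1.0 / r for r in ranks])  # [1.0, 0.333..., 0.5]
mrr = rr.mean()                          # ~0.611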