def filter_texts(self, dataset):
    """Collect the unique text values across all dataset splits.

    Walks every split in ``dataset`` and gathers the values of columns
    whose key appears in ``self.text_columns``, then flattens and
    de-duplicates the result.

    * Args:
        dataset: mapping of data_type (e.g. "train"/"valid") to a split
            object exposing ``.features`` (an iterable of dicts).
            # assumes each feature dict maps column name -> text value — TODO confirm

    * Returns:
        list of unique texts (order not guaranteed, since a ``set`` is used)
    """
    texts = []

    def append_texts(datas):
        for data in datas:
            for key, value in data.items():
                if key in self.text_columns:
                    texts.append(value)

    # Bug fix: the original rebound the `dataset` parameter as the loop
    # variable (`for data_type, dataset in dataset.items()`), shadowing it.
    for _data_type, split in dataset.items():
        append_texts(split.features)
        # append_texts(split.labels)

    texts = list(common_utils.flatten(texts))
    return list(set(texts))  # remove duplicates
def _tokenize(self, text, unit="text"): """ Text -> word tokens """ if type(text) != str: raise ValueError(f"text type is must be str. not {type(text)}") if unit == "sentence": tokens = getattr(self, f"_{self.name}")(text) else: sentences = self.sent_tokenizer.tokenize(text) tokens = [ getattr(self, f"_{self.name}")(sentence) for sentence in sentences ] if self.split_with_regex and self.name != "spacy_en": tokens = self._split_with_regex(tokens) return list(common_utils.flatten(tokens))
def _make_token_counter(self, texts, tokenizer, config=None, desc=None):
    """Count token frequencies over ``texts``, with optional disk caching.

    * Args:
        texts: iterable of strings to tokenize
        tokenizer: tokenizer object exposing ``.name`` and ``.tokenize(text)``
        config: optional config whose ``data_reader`` section keys the cache;
            when None, no cache is read or written
        desc: optional tqdm progress-bar description

    * Returns:
        collections.Counter mapping token -> frequency
    """
    tokenizer_name = tokenizer.name

    data_reader_config = None
    if config is not None:
        data_reader_config = config.data_reader
        cached = self.data_handler.cache_token_counter(
            data_reader_config, tokenizer_name)
        if cached:
            return cached

    tokens = [
        token for text in tqdm(texts, desc=desc)
        for token in tokenizer.tokenize(text)
    ]
    token_counter = Counter(common_utils.flatten(tokens))

    # Bug fix: only write to the cache when a config was supplied.
    # Previously `data_reader_config` was unbound here when config was None,
    # raising UnboundLocalError on the cache-write call.
    if config is not None:
        self.data_handler.cache_token_counter(
            data_reader_config, tokenizer_name, obj=token_counter)
    return token_counter
def make_metrics(self, predictions):
    """ Make metrics with prediction dictionary

    * Args:
        predictions: prediction dictionary consisting of
          - key: 'id' (sequence id)
          - value: dictionary consisting of
            - tag_idxs

    * Returns:
        metrics: metric dictionary consisting of
          - 'accuracy': sequence level accuracy
          - 'tag_accuracy': tag level accuracy
          - 'macro_f1': tag prediction macro(unweighted mean) f1
          - 'macro_precision': tag prediction macro(unweighted mean) precision
          - 'macro_recall': tag prediction macro(unweighted mean) recall
    """
    pred_idx_seqs, gold_idx_seqs, exact_matches = [], [], []

    for data_idx, pred in predictions.items():
        gold = self._dataset.get_ground_truth(data_idx)
        pred_idx_seqs.append(pred["tag_idxs"])
        gold_idx_seqs.append(gold["tag_idxs"])
        # sequence counts as accurate only when every tag matches
        is_exact = (
            np.asarray(gold["tag_idxs"]) == np.asarray(pred["tag_idxs"])
        ).all()
        exact_matches.append(1 if is_exact else 0)

    def to_tag_texts(idx_seqs):
        # map tag indices back to their text labels, per sequence
        return [
            [self._dataset.tag_idx2text[idx] for idx in seq]
            for seq in idx_seqs
        ]

    pred_tags = to_tag_texts(pred_idx_seqs)
    target_tags = to_tag_texts(gold_idx_seqs)

    flat_pred_tags = list(common_utils.flatten(pred_tags))
    flat_target_tags = list(common_utils.flatten(target_tags))

    # confusion matrix
    try:
        pycm_obj = pycm.ConfusionMatrix(
            actual_vector=flat_target_tags, predict_vector=flat_pred_tags)
    except pycmVectorError as e:
        if str(e) == "Number of the classes is lower than 2":
            logger.warning(
                "Number of tags in the batch is 1. Sanity check is highly recommended."
            )
            # degenerate batch: report perfect scores, as upstream expects
            return {
                "accuracy": 1.,
                "tag_accuracy": 1.,
                "macro_f1": 1.,
                "macro_precision": 1.,
                "macro_recall": 1.,
                "conlleval_accuracy": 1.,
                "conlleval_f1": 1.,
            }
        raise

    self.write_predictions(
        {
            "target": flat_target_tags,
            "predict": flat_pred_tags
        },
        pycm_obj=pycm_obj)

    sequence_accuracy = sum(exact_matches) / len(exact_matches)

    return {
        "accuracy": sequence_accuracy,
        "tag_accuracy": pycm_obj.Overall_ACC,
        "macro_f1": macro_f1(pycm_obj),
        "macro_precision": macro_precision(pycm_obj),
        "macro_recall": macro_recall(pycm_obj),
        "conlleval_accuracy": conlleval_accuracy(target_tags, pred_tags),
        "conlleval_f1": conlleval_f1(target_tags, pred_tags),
    }