Example #1
Score: 0
File: base.py — Project: min9kwak/claf
    def filter_texts(self, dataset):
        """Collect the unique text values from every data split.

        * Args:
            dataset: dict mapping a data_type key (e.g. "train") to a
                dataset object exposing a ``features`` iterable of dicts.

        * Returns:
            texts: flattened, de-duplicated list of the values stored
                under the keys listed in ``self.text_columns``.
        """
        texts = []

        def append_texts(datas):
            # Keep only values whose key is a registered text column.
            for data in datas:
                for key, value in data.items():
                    if key in self.text_columns:
                        texts.append(value)

        # FIX: the original loop variable shadowed the `dataset` argument;
        # `split_dataset` keeps the parameter intact while iterating.
        for _data_type, split_dataset in dataset.items():
            append_texts(split_dataset.features)
            # append_texts(split_dataset.labels)  # TODO: confirm labels stay excluded

        texts = list(common_utils.flatten(texts))
        texts = list(set(texts))  # remove duplicates
        return texts
Example #2
Score: 0
File: word.py — Project: paulsunnypark/claf
    def _tokenize(self, text, unit="text"):
        """Text -> word tokens.

        * Args:
            text: input string to tokenize.
            unit: "sentence" tokenizes `text` directly as one sentence;
                any other value splits it into sentences first, then
                tokenizes each sentence.

        * Returns:
            flat list of word tokens.

        * Raises:
            ValueError: if `text` is not a str.
        """
        # FIX: isinstance is the idiomatic type check and, unlike
        # `type(text) != str`, accepts str subclasses.
        if not isinstance(text, str):
            raise ValueError(f"text type is must be str. not {type(text)}")

        # Hoist the dispatch lookup so it runs once, not per sentence.
        word_tokenize_fn = getattr(self, f"_{self.name}")

        if unit == "sentence":
            tokens = word_tokenize_fn(text)
        else:
            sentences = self.sent_tokenizer.tokenize(text)
            tokens = [word_tokenize_fn(sentence) for sentence in sentences]

        # spacy_en already splits on punctuation, so the regex pass is skipped.
        if self.split_with_regex and self.name != "spacy_en":
            tokens = self._split_with_regex(tokens)

        return list(common_utils.flatten(tokens))
Example #3
Score: 0
File: text_handler.py — Project: seongl/claf
    def _make_token_counter(self, texts, tokenizer, config=None, desc=None):
        """Build (or load from cache) a Counter of token frequencies.

        * Args:
            texts: iterable of strings to tokenize.
            tokenizer: tokenizer exposing `.name` and `.tokenize(text)`.

        * Kwargs:
            config: experiment config; when given, its `data_reader`
                section keys the on-disk token-counter cache.
            desc: tqdm progress-bar description.

        * Returns:
            collections.Counter mapping token -> frequency.
        """
        tokenizer_name = tokenizer.name

        data_reader_config = None
        cache_token_counter = None
        if config is not None:
            data_reader_config = config.data_reader
            cache_token_counter = self.data_handler.cache_token_counter(
                data_reader_config, tokenizer_name)

        if cache_token_counter:
            return cache_token_counter

        tokens = [
            token for text in tqdm(texts, desc=desc)
            for token in tokenizer.tokenize(text)
        ]
        flatten_list = list(common_utils.flatten(tokens))
        token_counter = Counter(flatten_list)

        # BUG FIX: the original always attempted the cache write, which
        # raised NameError on `data_reader_config` whenever `config` was
        # None and the cache missed. Only cache when a config was given.
        if data_reader_config is not None:
            self.data_handler.cache_token_counter(data_reader_config,
                                                  tokenizer_name,
                                                  obj=token_counter)
        return token_counter
Example #4
Score: 0
    def make_metrics(self, predictions):
        """
        Make metrics with prediction dictionary

        * Args:
            predictions: prediction dictionary consisting of
                - key: 'id' (sequence id)
                - value: dictionary consisting of
                    - tag_idxs

        * Returns:
            metrics: metric dictionary consisting of
                - 'accuracy': sequence level accuracy
                - 'tag_accuracy': tag level accuracy
                - 'macro_f1': tag prediction macro(unweighted mean) f1
                - 'macro_precision': tag prediction macro(unweighted mean) precision
                - 'macro_recall': tag prediction macro(unweighted mean) recall
        """

        pred_tag_idxs_list = []
        target_tag_idxs_list = []

        # Per-sequence hit indicator: 1 when every predicted tag matches
        # the target tag, else 0 (used for sequence-level accuracy below).
        accurate_sequence = []

        for data_idx, pred in predictions.items():
            # Ground truth looked up from the dataset by sequence id.
            target = self._dataset.get_ground_truth(data_idx)

            pred_tag_idxs_list.append(pred["tag_idxs"])
            target_tag_idxs_list.append(target["tag_idxs"])

            # Element-wise compare of the whole tag sequence at once.
            accurate_sequence.append(1 if (
                np.asarray(target["tag_idxs"]) == np.asarray(pred["tag_idxs"])
            ).all() else 0)

        # Map tag indices back to their text labels for metric computation.
        pred_tags = [[
            self._dataset.tag_idx2text[tag_idx] for tag_idx in tag_idxs
        ] for tag_idxs in pred_tag_idxs_list]
        target_tags = [[
            self._dataset.tag_idx2text[tag_idx] for tag_idx in tag_idxs
        ] for tag_idxs in target_tag_idxs_list]

        # Tag-level metrics ignore sequence boundaries, so flatten both sides.
        flat_pred_tags = list(common_utils.flatten(pred_tags))
        flat_target_tags = list(common_utils.flatten(target_tags))

        # confusion matrix
        try:
            pycm_obj = pycm.ConfusionMatrix(actual_vector=flat_target_tags,
                                            predict_vector=flat_pred_tags)
        except pycmVectorError as e:
            # pycm cannot build a confusion matrix from a single class;
            # NOTE: this matches on the exact exception message string, so
            # it is sensitive to pycm version changes.
            if str(e) == "Number of the classes is lower than 2":
                logger.warning(
                    "Number of tags in the batch is 1. Sanity check is highly recommended."
                )
                # Degenerate batch: report perfect scores rather than crash.
                return {
                    "accuracy": 1.,
                    "tag_accuracy": 1.,
                    "macro_f1": 1.,
                    "macro_precision": 1.,
                    "macro_recall": 1.,
                    "conlleval_accuracy": 1.,
                    "conlleval_f1": 1.,
                }
            # Any other pycm vector error is unexpected; propagate it.
            raise

        # Persist target/prediction pairs (plus the confusion matrix) for
        # later inspection.
        self.write_predictions(
            {
                "target": flat_target_tags,
                "predict": flat_pred_tags
            },
            pycm_obj=pycm_obj)

        # NOTE(review): raises ZeroDivisionError if `predictions` is empty —
        # confirm callers never pass an empty dict.
        sequence_accuracy = sum(accurate_sequence) / len(accurate_sequence)

        metrics = {
            "accuracy": sequence_accuracy,
            "tag_accuracy": pycm_obj.Overall_ACC,
            "macro_f1": macro_f1(pycm_obj),
            "macro_precision": macro_precision(pycm_obj),
            "macro_recall": macro_recall(pycm_obj),
            "conlleval_accuracy": conlleval_accuracy(target_tags, pred_tags),
            "conlleval_f1": conlleval_f1(target_tags, pred_tags),
        }

        return metrics