Example 1
    def _read(self, file_path, data_type=None):
        """
        The .json file structure should look something like this:

        {
            "data": [
                {
                    "sequence": "i'm looking for a flight from New York to London.",
                    "slots": ["O", "O", "O", "O", "O", "O", "B-city.dept", "I-city.dept" "O", "B-city.dest"]
                    // the number of tokens in sequence.split() and tags must match
                },
                ...
            ],
            "slots": [  // tag_key
                "O",    // tags should be in IOB format
                "B-city.dept",
                "I-city.dept",
                "B-city.dest",
                "I-city.dest",
                ...
            ]
        }
        """

        data = self._get_data(file_path)
        tag_idx2text, tag_text2idx = self._get_tag_dicts(data=data)

        helper = Helper(
            file_path=file_path,
            tag_idx2text=tag_idx2text,
            ignore_tag_idx=self.ignore_tag_idx,
            cls_token=self.cls_token,
            sep_token=self.sep_token,
        )
        helper.set_model_parameter({
            "num_tags": len(tag_idx2text),
            "ignore_tag_idx": self.ignore_tag_idx,
        })
        helper.set_predict_helper({
            "tag_idx2text": tag_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_text = example["sequence"].strip().replace("\n", "")

            sequence_tokens = self.word_tokenizer.tokenize(sequence_text)
            naive_tokens = sequence_text.split()
            is_head_word = utils.get_is_head_of_word(naive_tokens,
                                                     sequence_tokens)

            # Record, for each head word, the index of its first sub-token,
            # offset by 1 because position 0 will be the [CLS] token.
            sequence_sub_tokens = []
            tagged_sub_token_idxs = []
            curr_sub_token_idx = 1  # skip CLS_TOKEN
            for token_idx, token in enumerate(sequence_tokens):
                for sub_token_pos, sub_token in enumerate(
                        self.subword_tokenizer.tokenize(token, unit="word")):
                    sequence_sub_tokens.append(sub_token)
                    if is_head_word[token_idx] and sub_token_pos == 0:
                        tagged_sub_token_idxs.append(curr_sub_token_idx)
                    curr_sub_token_idx += 1

            bert_input = [self.cls_token] + sequence_sub_tokens + [self.sep_token]

            if (self.sequence_max_length is not None and data_type == "train"
                    and len(bert_input) > self.sequence_max_length):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            tag_texts = example[self.tag_key]
            tag_idxs = [tag_text2idx[tag_text] for tag_text in tag_texts]

            utils.sanity_check_iob(naive_tokens, tag_texts)
            assert len(naive_tokens) == len(tagged_sub_token_idxs), \
                f"""Wrong tagged_sub_token_idxs: followings mismatch.
                naive_tokens: {naive_tokens}
                sequence_sub_tokens: {sequence_sub_tokens}
                tagged_sub_token_idxs: {tagged_sub_token_idxs}"""

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
                "tagged_sub_token_idxs": tagged_sub_token_idxs,
                "num_tokens": len(naive_tokens),
            }
            features.append(feature_row)

            label_row = {
                "id": data_uid,
                "tag_idxs": tag_idxs,
                "tag_texts": tag_texts,
            }
            labels.append(label_row)

            helper.set_example(
                data_uid, {
                    "sequence": sequence_text,
                    "sequence_sub_tokens": sequence_sub_tokens,
                    "tag_idxs": tag_idxs,
                    "tag_texts": tag_texts,
                })

        return utils.make_batch(features, labels), helper.to_dict()
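
The core of this reader is the head-word bookkeeping in the loop above: each whitespace token contributes exactly one tagged position, namely the index of its first sub-token, shifted by one for the leading [CLS]. Below is a minimal, self-contained sketch of that alignment; toy_subwords and align_tag_positions are illustrative names only, and the sketch skips the is_head_word check that the real reader needs because its word_tokenizer may split a whitespace token further.

# Illustrative sketch of the head-word -> first-sub-token alignment used above.
def toy_subwords(word, size=3):
    # Naive stand-in subword split: fixed-size character chunks.
    return [word[i:i + size] for i in range(0, len(word), size)]

def align_tag_positions(naive_tokens):
    sub_tokens, tagged_idxs = [], []
    curr_idx = 1  # position 0 is reserved for [CLS]
    for token in naive_tokens:
        for pos, sub in enumerate(toy_subwords(token)):
            sub_tokens.append(sub)
            if pos == 0:  # the first sub-token of each word carries the tag
                tagged_idxs.append(curr_idx)
            curr_idx += 1
    return sub_tokens, tagged_idxs

tokens = "flight from denver".split()
subs, idxs = align_tag_positions(tokens)
print(subs)  # ['fli', 'ght', 'fro', 'm', 'den', 'ver']
print(idxs)  # [1, 3, 5] -> one tagged sub-token index per original token
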
Example 2
    def _read(self, file_path, data_type=None):
        """
        The .json file structure should look something like this:

        {
            "data": [
                {
                    "sequence": "what a wonderful day!",
                    "emotion": "happy"
                },
                ...
            ],
            "emotion": [  // class_key
                "angry",
                "happy",
                "sad",
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)
        class_idx2text, class_text2idx = self._get_class_dicts(data=data)

        helper = Helper(
            file_path=file_path,
            class_idx2text=class_idx2text,
            class_text2idx=class_text2idx,
        )
        helper.set_model_parameter({
            "num_classes": len(class_idx2text),
        })
        helper.set_predict_helper({
            "class_idx2text": class_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence = example["sequence"].strip().replace("\n", "")
            sequence_words = self.word_tokenizer.tokenize(sequence)

            if (
                    self.sequence_max_length is not None
                    and data_type == "train"
                    and len(sequence_words) > self.sequence_max_length
            ):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "sequence": sequence,
            }
            features.append(feature_row)

            class_text = example[self.class_key]
            label_row = {
                "id": data_uid,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence": sequence,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            })

        return utils.make_batch(features, labels), helper.to_dict()
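
For reference, class_idx2text and class_text2idx act here as index-to-label and label-to-index lookups built from the class list in the .json file; the actual _get_class_dicts implementation is not shown, so the sketch below only illustrates the shape implied by their usage.

# Hypothetical sketch only: the real _get_class_dicts is not reproduced here.
emotions = ["angry", "happy", "sad"]  # the "emotion" list from the .json file

class_idx2text = dict(enumerate(emotions))  # {0: "angry", 1: "happy", 2: "sad"}
class_text2idx = {text: idx for idx, text in enumerate(emotions)}

example = {"sequence": "what a wonderful day!", "emotion": "happy"}
label_row = {
    "class_idx": class_text2idx[example["emotion"]],  # 1
    "class_text": example["emotion"],                  # "happy"
}
print(label_row)
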
Example 3
    def _read(self, file_path, data_type=None):
        """
        The .json file structure should look something like this:

        {
            "data": [
                {
                    "sequence": "what a wonderful day!",
                    "emotion": "happy"
                },
                ...
            ],
            "emotion": [  // class_key
                "angry",
                "happy",
                "sad",
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)
        class_idx2text, class_text2idx = self._get_class_dicts(data=data)

        helper = Helper(
            file_path=file_path,
            class_idx2text=class_idx2text,
            class_text2idx=class_text2idx,
            cls_token=self.cls_token,
            sep_token=self.sep_token,
            dataset=SeqClsBertDataset,
            metric_key=self.METRIC_KEY,
        )
        helper.set_model_parameter({
            "num_classes": len(class_idx2text),
        })
        helper.set_predict_helper({
            "class_idx2text": class_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
            sequence_b_tokens = None
            if sequence_b:
                sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

            bert_input = utils.make_bert_input(
                sequence_a,
                sequence_b,
                self.tokenizer,
                max_seq_length=self.sequence_max_length,
                data_type=data_type,
                cls_token=self.cls_token,
                sep_token=self.sep_token,
                input_type=self.input_type,
            )

            if bert_input is None:
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            # token_type (segment_ids) will be added in the dataset
            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            class_text = example[self.class_key]
            label_row = {
                "id": data_uid,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence_a": sequence_a,
                "sequence_a_tokens": sequence_a_tokens,
                "sequence_b": sequence_b,
                "sequence_b_tokens": sequence_b_tokens,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            })

            if self.is_test and len(features) >= 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()
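
utils.make_bert_input is expected to return the standard BERT layout, [CLS] sequence_a [SEP], optionally followed by sequence_b [SEP], or None when the example should be skipped. Its real truncation and input_type handling are not reproduced here; the sketch below only illustrates that layout, and sketch_bert_input and WhitespaceTokenizer are hypothetical names.

# Hypothetical sketch only: not the real utils.make_bert_input.
class WhitespaceTokenizer:
    # Stand-in for self.tokenizer; only a .tokenize() method is needed here.
    def tokenize(self, text):
        return text.split()

def sketch_bert_input(tokenizer, sequence_a, sequence_b=None,
                      cls_token="[CLS]", sep_token="[SEP]"):
    bert_input = [cls_token] + tokenizer.tokenize(sequence_a) + [sep_token]
    if sequence_b:
        bert_input += tokenizer.tokenize(sequence_b) + [sep_token]
    return bert_input

print(sketch_bert_input(WhitespaceTokenizer(), "what a wonderful day!", "so happy"))
# ['[CLS]', 'what', 'a', 'wonderful', 'day!', '[SEP]', 'so', 'happy', '[SEP]']
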