Example #1
def test_make_batch():
    features = {
        "f1": 0,
        "f2": 1,
        "f3": 3,
    }

    labels = {
        "l1": 0,
        "l2": 1,
        "l3": 2,
    }

    batch = make_batch(features, labels)

    assert batch.features == features
    assert batch.labels == labels
Example #2
def test_batch_sort_by_key():

    features = [{"f1": "long long long"}, {"f1": "short"}, {"f1": "mid mid"}]

    labels = [
        {
            "l1": 3
        },
        {
            "l1": 1
        },
        {
            "l1": 2
        },
    ]

    batch = make_batch(features, labels)
    batch.sort_by_key("f1")

    assert batch.features == sorted(features, key=lambda x: len(x["f1"]))
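For context, here is a minimal sketch of the container these two tests pin down, assuming make_batch simply wraps parallel features and labels and that sort_by_key reorders both by the length of the named feature value (the class name and sorting rule are inferred from the assertions above, not taken from the project):

# Hypothetical sketch only -- inferred from the two tests above, not the
# project's actual utils.make_batch implementation.
class Batch:
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def sort_by_key(self, key):
        # Reorder features and labels together, by the length of the value
        # stored under `key` in each feature row (assumed from the test).
        order = sorted(range(len(self.features)),
                       key=lambda i: len(self.features[i][key]))
        self.features = [self.features[i] for i in order]
        self.labels = [self.labels[i] for i in order]


def make_batch(features, labels):
    return Batch(features, labels)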
Example #3
    def _read(self, file_path, data_type=None):
        word_tokenized_error_count, sub_level_tokenized_error_count = 0, 0

        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        helper = Helper(
            **{
                "file_path": file_path,
                "raw_dataset": squad,
                "cls_token": self.cls_token,
                "sep_token": self.sep_token,
                "dataset": SQuADBertDataset,
            })
        helper.set_model_parameter({
            "lang_code": self.lang_code,
        })

        features, labels = [], []
        is_training = data_type == "train"

        for article in tqdm(squad, desc=data_type):
            for paragraph in article["paragraphs"]:
                context_text = (paragraph["context"]
                                .replace("``", '" ')
                                .replace("''", '" '))
                context_tokens = self.word_tokenizer.tokenize(context_text)

                context_spans, char_to_word_offset = self._convert_to_spans(
                    context_text, context_tokens)
                context_tokens = [
                    Token(text, span)
                    for (text, span) in zip(context_tokens, context_spans)
                ]

                context_sub_tokens = []
                for token in context_tokens:
                    for sub_token in self.sub_level_tokenizer.tokenize(
                            token.text):
                        context_sub_tokens.append(
                            Token(sub_token, token.text_span))

                for qa in paragraph["qas"]:
                    question_text = qa["question"]
                    question_text = " ".join(
                        self.word_tokenizer.tokenize(question_text))
                    question_sub_tokens = [
                        Token(sub_token) for sub_token in
                        self.sub_level_tokenizer.tokenize(question_text)
                    ]

                    id_ = qa["id"]
                    answers = qa["answers"]

                    answer_texts, answer_indices = [], []

                    if qa.get("is_impossible", None):
                        answers = qa["plausible_answers"]
                        answerable = 0
                    else:
                        answers = qa["answers"]
                        answerable = 1

                    for answer in answers:
                        answer_start = answer["answer_start"]
                        answer_end = answer_start + len(answer["text"]) - 1

                        answer_texts.append(answer["text"])
                        answer_indices.append((answer_start, answer_end))

                    if len(answer_indices) > 0:
                        answer_char_start, answer_char_end = self._find_one_most_common(
                            answer_indices)
                        answer_word_start = char_to_word_offset[
                            answer_char_start]
                        answer_word_end = char_to_word_offset[answer_char_end]

                        char_answer_text = context_text[
                            answer_char_start:answer_char_end + 1]
                        word_answer_text = context_text[
                            context_spans[answer_word_start][0]:
                            context_spans[answer_word_end][1]]

                        if not self._is_rebuild(char_answer_text,
                                                word_answer_text):
                            logger.warning(
                                f"word_tokenized_error: {char_answer_text}  ###  {word_answer_text}"
                            )
                            word_tokenized_error_count += 1
                    else:
                        # Unanswerable
                        answers = ["<noanswer>"]
                        answer_char_start, answer_char_end = -1, -1
                        answer_word_start, answer_word_end = -1, -1

                    bert_features, bert_labels = self._make_features_and_labels(
                        context_sub_tokens,
                        question_sub_tokens,
                        answer_char_start,
                        answer_char_end + 1,
                    )

                    for (index, (feature, label)) in enumerate(
                            zip(bert_features, bert_labels)):
                        bert_tokens = feature
                        answer_start, answer_end = label

                        if is_training and (
                                answer_start < 0
                                or answer_start >= len(bert_tokens)
                                or answer_end >= len(bert_tokens)
                                or bert_tokens[answer_start].text_span is None
                                or bert_tokens[answer_end].text_span is None):
                            continue

                        if is_training:
                            char_start = bert_tokens[answer_start].text_span[0]
                            char_end = bert_tokens[answer_end].text_span[1]
                            bert_answer = context_text[char_start:char_end]

                            if char_answer_text != bert_answer:
                                logger.warning(
                                    f"sub_level_tokenized_error: {char_answer_text} ### {word_answer_text})"
                                )
                                sub_level_tokenized_error_count += 1

                        feature_row = {
                            "bert_input": [token.text for token in bert_tokens],
                            "bert_token": bert_tokens,
                        }
                        features.append(feature_row)

                        bert_id = id_ + f"#{index}"
                        label_row = {
                            "id": bert_id,  # question_id + bert_index
                            "answer_texts": "\t".join(answer_texts),
                            "answer_start": answer_start,
                            "answer_end": answer_end,
                            "answerable": answerable,
                        }
                        labels.append(label_row)

                        if id_ not in helper.examples:
                            helper.set_example(
                                id_, {
                                    "context": context_text,
                                    "question": question_text,
                                    "answers": answer_texts,
                                })
                        helper.set_example(
                            id_,
                            {f"bert_tokens_{index}": bert_tokens},
                            update=True,
                        )

        logger.info(
            f"tokenized_error_count - word: {word_tokenized_error_count} | sub_level: {sub_level_tokenized_error_count}"
        )
        return utils.make_batch(features, labels), helper.to_dict()
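The SQuAD-style readers in this listing rely on a _find_one_most_common helper to collapse the annotated answer spans into a single (start, end) pair. A plausible stand-in (the tie-breaking rule is an assumption) could be:

# Hypothetical stand-in for _find_one_most_common: keep the span annotators
# chose most often; ties fall back to the first span seen.
from collections import Counter

def find_one_most_common(answer_indices):
    most_common_span, _ = Counter(answer_indices).most_common(1)[0]
    return most_common_span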
Example #4
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence": "what a wonderful day!",
                    "emotion": "happy"
                },
                ...
            ],
            "emotion": [  // class_key
                "angry",
                "happy",
                "sad",
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)
        class_idx2text, class_text2idx = self._get_class_dicts(data=data)

        helper = Helper(**{
            "file_path": file_path,
            "class_idx2text": class_idx2text,
            "class_text2idx": class_text2idx,
        })
        helper.set_model_parameter({
            "num_classes": len(class_idx2text),
        })
        helper.set_predict_helper({
            "class_idx2text": class_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence = example["sequence"].strip().replace("\n", "")
            sequence_words = self.word_tokenizer.tokenize(sequence)

            if (
                    self.sequence_max_length is not None
                    and data_type == "train"
                    and len(sequence_words) > self.sequence_max_length
            ):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "sequence": sequence,
            }
            features.append(feature_row)

            class_text = example[self.class_key]
            label_row = {
                "id": data_uid,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence": sequence,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            })

        return utils.make_batch(features, labels), helper.to_dict()
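The docstring above fixes the expected .json layout, so the class dictionaries this reader consumes can be derived directly from the list stored under class_key. The following is a hedged sketch; build_class_dicts is a hypothetical stand-in for _get_class_dicts:

# Hypothetical stand-in for _get_class_dicts, based only on the docstring's
# file layout: the top-level list under class_key enumerates the labels.
import json

def build_class_dicts(raw_json, class_key="emotion"):
    class_texts = json.loads(raw_json)[class_key]
    class_idx2text = {idx: text for idx, text in enumerate(class_texts)}
    class_text2idx = {text: idx for idx, text in enumerate(class_texts)}
    return class_idx2text, class_text2idx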
Example #5
    def __call__(self, features, labels, apply_pad_labels=(), apply_pad_values=()):
        self.collate(features)
        self.collate(labels, apply_pad=False,
                     apply_pad_labels=apply_pad_labels, apply_pad_values=apply_pad_values)

        return utils.make_batch(features, labels)
Example #6
    def __call__(self, features, labels):
        self.collate(features, pad_value=self.pad_value)
        self.collate(labels, apply_pad=False, pad_value=self.pad_value)

        return utils.make_batch(features, labels)
Example #7
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence": "i'm looking for a flight from New York to London.",
                    "slots": ["O", "O", "O", "O", "O", "O", "B-city.dept", "I-city.dept" "O", "B-city.dest"]
                    // the number of tokens in sequence.split() and tags must match
                },
                ...
            ],
            "slots": [  // tag_key
                "O",    // tags should be in IOB format
                "B-city.dept",
                "I-city.dept",
                "B-city.dest",
                "I-city.dest",
                ...
            ]
        }
        """

        data = self._get_data(file_path)
        tag_idx2text, tag_text2idx = self._get_tag_dicts(data=data)

        helper = Helper(
            **{
                "file_path": file_path,
                "tag_idx2text": tag_idx2text,
                "ignore_tag_idx": self.ignore_tag_idx,
                "cls_token": self.cls_token,
                "sep_token": self.sep_token,
            })
        helper.set_model_parameter({
            "num_tags": len(tag_idx2text),
            "ignore_tag_idx": self.ignore_tag_idx,
        })
        helper.set_predict_helper({
            "tag_idx2text": tag_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_text = example["sequence"].strip().replace("\n", "")

            sequence_tokens = self.word_tokenizer.tokenize(sequence_text)
            naive_tokens = sequence_text.split()
            is_head_word = utils.get_is_head_of_word(naive_tokens,
                                                     sequence_tokens)

            sequence_sub_tokens = []
            tagged_sub_token_idxs = []
            curr_sub_token_idx = 1  # skip CLS_TOKEN
            for token_idx, token in enumerate(sequence_tokens):
                for sub_token_pos, sub_token in enumerate(
                        self.subword_tokenizer.tokenize(token, unit="word")):
                    sequence_sub_tokens.append(sub_token)
                    if is_head_word[token_idx] and sub_token_pos == 0:
                        tagged_sub_token_idxs.append(curr_sub_token_idx)
                    curr_sub_token_idx += 1

            bert_input = [self.cls_token] + sequence_sub_tokens + [self.sep_token]

            if (self.sequence_max_length is not None and data_type == "train"
                    and len(bert_input) > self.sequence_max_length):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            tag_texts = example[self.tag_key]
            tag_idxs = [tag_text2idx[tag_text] for tag_text in tag_texts]

            utils.sanity_check_iob(naive_tokens, tag_texts)
            assert len(naive_tokens) == len(tagged_sub_token_idxs), \
                f"""Wrong tagged_sub_token_idxs: followings mismatch.
                naive_tokens: {naive_tokens}
                sequence_sub_tokens: {sequence_sub_tokens}
                tagged_sub_token_idxs: {tagged_sub_token_idxs}"""

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
                "tagged_sub_token_idxs": tagged_sub_token_idxs,
                "num_tokens": len(naive_tokens),
            }
            features.append(feature_row)

            label_row = {
                "id": data_uid,
                "tag_idxs": tag_idxs,
                "tag_texts": tag_texts,
            }
            labels.append(label_row)

            helper.set_example(
                data_uid, {
                    "sequence": sequence_text,
                    "sequence_sub_tokens": sequence_sub_tokens,
                    "tag_idxs": tag_idxs,
                    "tag_texts": tag_texts,
                })

        return utils.make_batch(features, labels), helper.to_dict()
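Before emitting a row, the reader validates its tags with utils.sanity_check_iob. A minimal sketch of such a check, assuming standard IOB constraints (one tag per whitespace token, every I- tag continuing a same-type B-/I- tag), might look like this; it is not the project's actual helper:

# Hypothetical IOB sanity check, not the project's utils.sanity_check_iob.
def sanity_check_iob(naive_tokens, tag_texts):
    assert len(naive_tokens) == len(tag_texts), \
        f"token/tag length mismatch: {len(naive_tokens)} vs {len(tag_texts)}"
    prev_tag = "O"
    for tag in tag_texts:
        if tag.startswith("I-"):
            # An I- tag must continue a B-/I- tag of the same type.
            assert prev_tag != "O" and prev_tag[2:] == tag[2:], \
                f"dangling inside-tag: {tag} after {prev_tag}"
        prev_tag = tag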
Example #8
    def _read(self, file_path, data_type=None):
        tokenized_error_count = 0

        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        helper = Helper(**{
            "file_path": file_path,
            "raw_dataset": squad,
        })
        helper.set_model_parameter({
            "lang_code": self.lang_code,
        })

        features, labels = [], []

        for article in tqdm(squad, desc=data_type):
            for paragraph in article["paragraphs"]:
                context = (paragraph["context"]
                           .replace("``", '" ')
                           .replace("''", '" '))
                context_words = self.word_tokenizer.tokenize(context)

                if (self.context_max_length is not None
                        and data_type == "train"
                        and len(context_words) > self.context_max_length):
                    continue

                for qa in paragraph["qas"]:
                    question = qa["question"].strip().replace("\n", "")
                    id_ = qa["id"]

                    answer_texts, answer_indices = [], []

                    if qa.get("is_impossible", None):
                        answers = qa["plausible_answers"]
                        answerable = 0
                    else:
                        answers = qa["answers"]
                        answerable = 1

                    for answer in answers:
                        answer_start = answer["answer_start"]
                        answer_end = answer_start + len(answer["text"])

                        answer_texts.append(answer["text"])
                        answer_indices.append((answer_start, answer_end))

                    feature_row = {
                        "context": self._clean_text(context),
                        "question": question,
                    }
                    features.append(feature_row)

                    if len(answer_indices) > 0:
                        answer_start, answer_end = self._find_one_most_common(
                            answer_indices)
                        text_spans = self._convert_to_spans(
                            context, context_words)
                        word_idxs = self._get_word_span_idxs(
                            text_spans, answer_start, answer_end)

                        word_answer_start = word_idxs[0]
                        word_answer_end = word_idxs[-1]

                        # To check rebuild answer: char_answer_text - word_answer_text
                        char_answer_text = context[answer_start:answer_end]
                        word_answer_text = context[
                            text_spans[word_answer_start][0]:
                            text_spans[word_answer_end][1]]

                        if not self._is_rebuild(char_answer_text,
                                                word_answer_text):
                            logger.warning(
                                f"word_tokenized_error: {char_answer_text}  ###  {word_answer_text}"
                            )
                            tokenized_error_count += 1

                    else:
                        # Unanswerable
                        answers = ["<noanswer>"]
                        text_spans = []
                        answer_start, answer_end = 0, 0
                        word_answer_start, word_answer_end = 0, 0

                    label_row = {
                        "id": id_,
                        "answer_start": word_answer_start,
                        "answer_end": word_answer_end,
                        "answerable": answerable,
                    }
                    labels.append(label_row)

                    helper.set_example(
                        id_, {
                            "context": context,
                            "text_span": text_spans,
                            "question": question,
                            "answers": answer_texts,
                        })

        logger.info(f"tokenized_error_count: {tokenized_error_count} ")
        return utils.make_batch(features, labels), helper.to_dict()
Example #9
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence_a": "what a wonderful day!",
                    "sequence_b": "what a great day!",
                    "score": 0.9
                },
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)

        helper = Helper(**{
            "file_path": file_path,
            "cls_token": self.cls_token,
            "sep_token": self.sep_token,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
            sequence_b_tokens = None
            if sequence_b:
                sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

            bert_input = utils.make_bert_input(
                sequence_a,
                sequence_b,
                self.tokenizer,
                max_seq_length=self.sequence_max_length,
                data_type=data_type,
                cls_token=self.cls_token,
                sep_token=self.sep_token,
                input_type=self.input_type,
            )

            if bert_input is None:
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            score = example[self.label_key]
            label_row = {
                "id": data_uid,
                "score": score,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence_a": sequence_a,
                "sequence_a_tokens": sequence_a_tokens,
                "sequence_b": sequence_b,
                "sequence_b_tokens": sequence_b_tokens,
                "score": score,
            })

            if self.is_test and len(features) >= 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()
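This reader and the one in the next example both delegate to utils.make_bert_input. A simplified sketch under the usual BERT pairing convention ([CLS] a [SEP] b [SEP]), with length handling reduced to returning None so the caller can skip the example, is shown below; the real helper also takes data_type and input_type arguments that are omitted here:

# Hypothetical simplification of utils.make_bert_input: tokenize one or two
# sequences, wrap them with CLS/SEP, and drop examples that exceed the limit.
def make_bert_input(sequence_a, sequence_b, tokenizer, max_seq_length=None,
                    cls_token="[CLS]", sep_token="[SEP]"):
    tokens = [cls_token] + tokenizer.tokenize(sequence_a) + [sep_token]
    if sequence_b:
        tokens += tokenizer.tokenize(sequence_b) + [sep_token]
    if max_seq_length is not None and len(tokens) > max_seq_length:
        return None  # caller skips this example
    return tokens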
Example #10
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence": "what a wonderful day!",
                    "emotion": "happy"
                },
                ...
            ],
            "emotion": [  // class_key
                "angry",
                "happy",
                "sad",
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)
        class_idx2text, class_text2idx = self._get_class_dicts(data=data)

        helper = Helper(**{
            "file_path": file_path,
            "class_idx2text": class_idx2text,
            "class_text2idx": class_text2idx,
            "cls_token": self.cls_token,
            "sep_token": self.sep_token,
            "dataset": SeqClsBertDataset,
            "metric_key": self.METRIC_KEY,
        })
        helper.set_model_parameter({
            "num_classes": len(class_idx2text),
        })
        helper.set_predict_helper({
            "class_idx2text": class_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
            sequence_b_tokens = None
            if sequence_b:
                sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

            bert_input = utils.make_bert_input(
                sequence_a,
                sequence_b,
                self.tokenizer,
                max_seq_length=self.sequence_max_length,
                data_type=data_type,
                cls_token=self.cls_token,
                sep_token=self.sep_token,
                input_type=self.input_type,
            )

            if bert_input is None:
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            # token_type(segment_ids) will be added in dataset
            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            class_text = example[self.class_key]
            label_row = {
                "id": data_uid,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence_a": sequence_a,
                "sequence_a_tokens": sequence_a_tokens,
                "sequence_b": sequence_b,
                "sequence_b_tokens": sequence_b_tokens,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            })

            if self.is_test and len(features) >= 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()
Example #11
    def _read(self, file_path, data_type=None):
        file_path = self.data_handler.read(file_path, return_path=True)
        file_path = Path(file_path)

        data_dir = file_path.parent
        file_name = file_path.stem

        db_path = data_dir / f"{file_name}.db"
        table_path = data_dir / f"{file_name}.tables.jsonl"

        self.dbengine = DBEngine(db_path)

        helper = Helper(**{
            "file_path": file_path,
            "db_path": db_path,
        })

        features, labels = [], []

        sql_datas, table_data = self.load_data(file_path,
                                               table_path,
                                               data_type=data_type)
        for sql_data in tqdm(sql_datas, desc=data_type):
            question = sql_data["question"]
            table_id = sql_data["table_id"]
            column_headers = table_data[table_id]["header"]

            feature_row = {"column": column_headers, "question": question}

            data_uid = str(uuid.uuid1())
            conditions_value_position = self.get_coditions_value_position(
                sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]])

            sql_query = Query.from_dict(sql_data["sql"], ordered=True)
            execution_result = self.dbengine.execute_query(table_id,
                                                           sql_query,
                                                           lower=True)

            label_row = {
                "id": data_uid,
                "table_id": table_id,
                "tokenized_question": self.word_tokenizer.tokenize(question),
                "aggregator_idx": sql_data["sql"]["agg"],
                "select_column_idx": sql_data["sql"]["sel"],
                "conditions_num": len(sql_data["sql"]["conds"]),
                "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
                "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
                "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
                "conditions_value_position": conditions_value_position,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            features.append(feature_row)
            labels.append(label_row)

            helper.set_example(
                data_uid, {
                    "question": question,
                    "sql_query": sql_query,
                    "execution_result": execution_result,
                })

            if self.is_test and len(labels) == 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()
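get_coditions_value_position is expected to locate each WHERE-clause value inside the question text. One plausible sketch, returning case-insensitive character spans, is given below; this is an assumption about the real helper, whose exact output format is not shown here:

# Hypothetical sketch: return (start, end) character spans of each condition
# value inside the question, found by case-insensitive substring search.
def get_conditions_value_position(question, condition_values):
    lowered_question = question.lower()
    positions = []
    for value in condition_values:
        start = lowered_question.find(str(value).lower())
        end = start + len(str(value)) if start != -1 else -1
        positions.append((start, end))
    return positions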