Example no. 1
    def _read(self, file_path, data_type=None):
        file_path = self.data_handler.read(file_path, return_path=True)
        file_path = Path(file_path)

        data_dir = file_path.parent
        file_name = file_path.stem

        db_path = data_dir / f"{file_name}.db"
        table_path = data_dir / f"{file_name}.tables.jsonl"

        self.dbengine = DBEngine(db_path)

        helper = {"file_path": file_path, "db_path": db_path, "examples": {}}
        features, labels = [], []

        sql_datas, table_data = self.load_data(file_path, table_path, data_type=data_type)
        for sql_data in tqdm(sql_datas, desc=data_type):
            question = sql_data["question"]
            table_id = sql_data["table_id"]
            column_headers = table_data[table_id]["header"]

            feature_row = {"column": column_headers, "question": question}

            data_uid = str(uuid.uuid1())
            conditions_value_position = self.get_coditions_value_position(
                sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]]
            )

            sql_query = Query.from_dict(sql_data["sql"], ordered=True)
            execution_result = self.dbengine.execute_query(table_id, sql_query, lower=True)

            label_row = {
                "id": data_uid,
                "table_id": table_id,
                "tokenized_question": self.word_tokenizer.tokenize(question),
                "aggregator_idx": sql_data["sql"]["agg"],
                "select_column_idx": sql_data["sql"]["sel"],
                "conditions_num": len(sql_data["sql"]["conds"]),
                "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
                "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
                "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
                "conditions_value_position": conditions_value_position,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            features.append(feature_row)
            labels.append(label_row)

            helper["examples"][data_uid] = {
                "question": question,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            if self.is_test and len(labels) == 10:
                break

        return make_batch(features, labels), helper
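
The reader above addresses a fixed set of keys in each record. For reference, the following is a rough sketch (values invented, not taken from any real dataset file) of the shape of sql_data and table_data implied by those accesses: a WikiSQL-style query with an aggregator index, a selected-column index, and a list of [column_idx, operator_idx, value] conditions, plus a table entry carrying its column headers.

# Hypothetical record shapes (values invented) matching the field accesses above.
sql_data = {
    "question": "Which position does the player from school X hold?",
    "table_id": "table-001",
    "sql": {
        "agg": 0,                       # aggregator index
        "sel": 3,                       # selected column index
        "conds": [[5, 0, "school X"]],  # [column_idx, operator_idx, value] per condition
    },
}

table_data = {
    "table-001": {
        "header": ["Player", "No.", "Nationality", "Position", "Years", "School"],
    },
}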
Example no. 2
    def __call__(self,
                 features,
                 labels,
                 apply_pad_labels=(),
                 apply_pad_values=()):
        self.collate(features)
        self.collate(labels,
                     apply_pad=False,
                     apply_pad_labels=apply_pad_labels,
                     apply_pad_values=apply_pad_values)

        return make_batch(features, labels)
Example no. 3
def test_make_batch():
    features = {
        "f1": 0,
        "f2": 1,
        "f3": 3,
    }

    labels = {
        "l1": 0,
        "l2": 1,
        "l3": 2,
    }

    batch = make_batch(features, labels)

    assert batch.features == features
    assert batch.labels == labels
Example no. 4
def test_batch_sort_by_key():

    features = [
        {"f1": "long long long"},
        {"f1": "short"},
        {"f1": "mid mid"}
    ]

    labels = [
        {"l1": 3},
        {"l1": 1},
        {"l1": 2},
    ]

    batch = make_batch(features, labels)
    batch.sort_by_key("f1")

    assert batch.features == sorted(features, key=lambda x: len(x["f1"]))
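
Read together, these two tests pin down the contract the readers rely on: make_batch(features, labels) returns an object exposing the inputs as .features and .labels, and sort_by_key reorders by the length of the value stored under the given feature key. The sketch below is only one implementation consistent with the tests, not the project's actual make_batch; keeping labels in lock-step with features during the sort is an assumption.

# Minimal sketch consistent with the two tests above (not the real implementation).
class Batch:
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def sort_by_key(self, key):
        # sort by the length of the value under `key`; reorder labels in lock-step
        order = sorted(range(len(self.features)),
                       key=lambda i: len(self.features[i][key]))
        self.features = [self.features[i] for i in order]
        self.labels = [self.labels[i] for i in order]


def make_batch(features, labels):
    return Batch(features, labels)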
Example no. 5
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence": "i'm looking for a flight from New York to London.",
                    "slots": ["O", "O", "O", "O", "O", "O", "B-city.dept", "I-city.dept" "O", "B-city.dest"]
                    // the number of tokens in sequence.split() and tags must match
                },
                ...
            ],
            "slots": [  // tag_key
                "O",    // tags should be in IOB format
                "B-city.dept",
                "I-city.dept",
                "B-city.dest",
                "I-city.dest",
                ...
            ]
        }
        """

        data, raw_dataset = self._get_data(file_path)
        tag_idx2text, tag_text2idx = self._get_tag_dicts(data=data)

        helper = {
            "file_path": file_path,
            "examples": {},
            "raw_dataset": raw_dataset,
            "tag_idx2text": tag_idx2text,
            "ignore_tag_idx": self.ignore_tag_idx,
            "cls_token": self.CLS_TOKEN,
            "sep_token": self.SEP_TOKEN,
            "unk_token": self.UNK_TOKEN,
            "model": {
                "num_tags": len(tag_idx2text),
                "ignore_tag_idx": self.ignore_tag_idx,
            },
            "predict_helper": {
                "tag_idx2text": tag_idx2text,
            }
        }
        features, labels = [], []

        for example in tqdm(raw_dataset, desc=data_type):
            sequence_text = example["sequence"].strip().replace("\n", "")

            sequence_tokens = self.word_tokenizer.tokenize(sequence_text)
            naive_tokens = sequence_text.split()
            is_head_word = utils.get_is_head_of_word(naive_tokens, sequence_tokens)

            sequence_sub_tokens = []
            tagged_sub_token_idxs = []
            curr_sub_token_idx = 1  # skip CLS_TOKEN
            for token_idx, token in enumerate(sequence_tokens):
                for sub_token_pos, sub_token in enumerate(
                        self.subword_tokenizer.tokenize(token, unit="word")
                ):
                    sequence_sub_tokens.append(sub_token)
                    if is_head_word[token_idx] and sub_token_pos == 0:
                        tagged_sub_token_idxs.append(curr_sub_token_idx)
                    curr_sub_token_idx += 1

            bert_input = [self.CLS_TOKEN] + sequence_sub_tokens + [self.SEP_TOKEN]

            if (
                    self.sequence_max_length is not None
                    and data_type == "train"
                    and len(bert_input) > self.sequence_max_length
            ):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            tag_texts = example[self.tag_key]
            tag_idxs = [tag_text2idx[tag_text] for tag_text in tag_texts]

            utils.sanity_check_iob(naive_tokens, tag_texts)
            assert len(naive_tokens) == len(tagged_sub_token_idxs), \
                f"""Wrong tagged_sub_token_idxs: followings mismatch.
                naive_tokens: {naive_tokens}
                sequence_sub_tokens: {sequence_sub_tokens}
                tagged_sub_token_idxs: {tagged_sub_token_idxs}"""

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
                "tagged_sub_token_idxs": tagged_sub_token_idxs,
                "num_tokens": len(naive_tokens),
            }
            features.append(feature_row)

            label_row = {
                "id": data_uid,
                "tag_idxs": tag_idxs,
                "tag_texts": tag_texts,
            }
            labels.append(label_row)

            helper["examples"][data_uid] = {
                "sequence": sequence_text,
                "sequence_sub_tokens": sequence_sub_tokens,
                "tag_idxs": tag_idxs,
                "tag_texts": tag_texts,
            }

        return make_batch(features, labels), helper
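
To make the sub-token bookkeeping in the loop above concrete, here is a small illustration with invented tokenizations, assuming the subword tokenizer leaves every piece whole and using the conventional "[CLS]"/"[SEP]" strings to stand in for self.CLS_TOKEN and self.SEP_TOKEN: each whitespace token contributes the bert_input position of its first sub-token, and the leading [CLS] accounts for the offset of 1.

# Illustration only; tokenizer outputs are invented.
naive_tokens = ["flight", "to", "London."]          # sequence_text.split()
sequence_tokens = ["flight", "to", "London", "."]   # word-tokenizer output
is_head_word = [True, True, True, False]            # "." continues "London."

bert_input = ["[CLS]", "flight", "to", "London", ".", "[SEP]"]
# position of each naive token's first sub-token, offset by 1 for [CLS]
tagged_sub_token_idxs = [1, 2, 3]
assert len(naive_tokens) == len(tagged_sub_token_idxs)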
Example no. 6
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence_a": "what a wonderful day!",
                    "sequence_b": "what a great day!",
                    "score": 0.9
                },
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)

        helper = {
            "file_path": file_path,
            "examples": {},
            "cls_token": self.CLS_TOKEN,
            "sep_token": self.SEP_TOKEN,
            "unk_token": self.UNK_TOKEN,
            "model": {},
            "predict_helper": {}
        }
        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_sub_tokens = self.subword_tokenizer.tokenize(sequence_a)
            sequence_b_sub_tokens = None
            bert_input = [self.CLS_TOKEN] + sequence_a_sub_tokens + [self.SEP_TOKEN]

            if sequence_b is not None:
                sequence_b_sub_tokens = self.subword_tokenizer.tokenize(sequence_b)
                bert_input += sequence_b_sub_tokens + [self.SEP_TOKEN]

            if (self.sequence_max_length is not None and data_type == "train"
                    and len(bert_input) > self.sequence_max_length):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            score = example[self.label_key]
            label_row = {
                "id": data_uid,
                "score": score,
            }
            labels.append(label_row)

            helper["examples"][data_uid] = {
                "sequence_a": sequence_a,
                "sequence_a_sub_tokens": sequence_a_sub_tokens,
                "sequence_b": sequence_b,
                "sequence_b_sub_tokens": sequence_b_sub_tokens,
                "score": score,
            }

            if self.is_test and len(features) >= 10:
                break

        return make_batch(features, labels), helper
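
For the pair case in the reader above, the resulting bert_input follows the usual [CLS] A [SEP] B [SEP] layout. A small illustration using the docstring's example sentences, with an invented sub-token split and "[CLS]"/"[SEP]" standing in for the reader's token constants:

# Illustration only; the sub-token splits are invented.
sequence_a_sub_tokens = ["what", "a", "wonderful", "day", "!"]
sequence_b_sub_tokens = ["what", "a", "great", "day", "!"]

bert_input = (["[CLS]"] + sequence_a_sub_tokens + ["[SEP]"]
              + sequence_b_sub_tokens + ["[SEP]"])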
Example no. 7
    def __call__(self, features, labels):
        self.collate(features)
        self.collate(labels, apply_pad=False)

        return make_batch(features, labels)
Example no. 8
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence": "what a wonderful day!",
                    "emotion": "happy"
                },
                ...
            ],
            "emotion": [  // class_key
                "angry",
                "happy",
                "sad",
                ...
            ]
        }
        """

        data, raw_dataset = self._get_data(file_path, data_type=data_type)
        class_idx2text, class_text2idx = self._get_class_dicts(data=data)

        helper = {
            "file_path": file_path,
            "examples": {},
            "raw_dataset": raw_dataset,
            "class_idx2text": class_idx2text,
            "class_text2idx": class_text2idx,
            "cls_token": self.CLS_TOKEN,
            "sep_token": self.SEP_TOKEN,
            "unk_token": self.UNK_TOKEN,
            "continue_symbol": self.CONTINUE_SYMBOL,
            "model": {
                "num_classes": len(class_idx2text),
            },
            "predict_helper": {
                "class_idx2text": class_idx2text,
            }
        }
        features, labels = [], []

        for example in tqdm(raw_dataset, desc=data_type):
            sequence_text = example["sequence"].strip().replace("\n", "")
            sequence_sub_tokens = self.subword_tokenizer.tokenize(sequence_text)

            bert_input = [self.CLS_TOKEN] + sequence_sub_tokens + [self.SEP_TOKEN]

            if (self.sequence_max_length is not None and data_type == "train"
                    and len(bert_input) > self.sequence_max_length):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            class_text = example[self.class_key]
            label_row = {
                "id": data_uid,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }
            labels.append(label_row)

            helper["examples"][data_uid] = {
                "sequence": sequence_text,
                "sequence_sub_tokens": sequence_sub_tokens,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }

        return make_batch(features, labels), helper
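
With the class list from the docstring above, the derived dictionaries and one resulting label row would look roughly as follows (the index order is an assumption, since it depends on _get_class_dicts, and the id is a stand-in for the uuid1 string the reader generates):

# Illustration only; index order and the id value are assumptions.
class_idx2text = {0: "angry", 1: "happy", 2: "sad"}
class_text2idx = {"angry": 0, "happy": 1, "sad": 2}

label_row = {
    "id": "example-uid",
    "class_idx": class_text2idx["happy"],  # -> 1
    "class_text": "happy",
}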
Example no. 9
    def _read(self, file_path, data_type=None):
        word_tokenized_error_count, subword_tokenized_error_count = 0, 0

        if data_type != "train":
            self.context_stride = 64  # NOTE: hard-code

        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        helper = {
            "file_path": file_path,
            "examples": {},
            "raw_dataset": squad,
            "cls_token": self.CLS_TOKEN,
            "sep_token": self.SEP_TOKEN,
            "model": {
                "lang_code": self.lang_code,
            },
        }
        features, labels = [], []

        for article in tqdm(squad, desc=data_type):
            for paragraph in article["paragraphs"]:
                context_text = paragraph["context"].replace("``",
                                                            '" ').replace(
                                                                "''", '" ')
                context_tokens = self.word_tokenizer.tokenize(context_text)

                context_spans, char_to_word_offset = self._convert_to_spans(
                    context_text, context_tokens)
                context_tokens = [
                    Token(text, span)
                    for (text, span) in zip(context_tokens, context_spans)
                ]

                context_sub_tokens = []
                for token in context_tokens:
                    for sub_token in self.subword_tokenizer.tokenize(
                            token.text):
                        context_sub_tokens.append(
                            Token(sub_token, token.text_span))

                for qa in paragraph["qas"]:
                    question_text = qa["question"]
                    question_text = " ".join(
                        self.word_tokenizer.tokenize(question_text))
                    question_sub_tokens = [
                        Token(subword) for subword in
                        self.subword_tokenizer.tokenize(question_text)
                    ]

                    id_ = qa["id"]
                    answers = qa["answers"]

                    answer_texts, answer_indices = [], []

                    if qa.get("is_impossible", None):
                        answers = qa["plausible_answers"]
                        answerable = 0
                    else:
                        answers = qa["answers"]
                        answerable = 1

                    for answer in answers:
                        answer_start = answer["answer_start"]
                        answer_end = answer_start + len(answer["text"]) - 1

                        answer_texts.append(answer["text"])
                        answer_indices.append((answer_start, answer_end))

                    if len(answer_indices) > 0:
                        answer_char_start, answer_char_end = self._find_one_most_common(
                            answer_indices)
                        answer_word_start = char_to_word_offset[
                            answer_char_start]
                        answer_word_end = char_to_word_offset[answer_char_end]

                        char_answer_text = context_text[
                            answer_char_start:answer_char_end + 1]
                        word_answer_text = context_text[
                            context_spans[answer_word_start][0]:
                            context_spans[answer_word_end][1]]

                        if not self._is_rebuild(char_answer_text,
                                                word_answer_text):
                            logger.warning(
                                f"word_tokenized_error: {char_answer_text}  ###  {word_answer_text}"
                            )
                            word_tokenized_error_count += 1
                    else:
                        # Unanswerable
                        answers = ["<noanswer>"]
                        answer_char_start, answer_char_end = -1, -1
                        answer_word_start, answer_word_end = -1, -1
                        # ensure these exist for the consistency check/logging below
                        char_answer_text, word_answer_text = "", ""

                    bert_features, bert_labels = self._make_features_and_labels(
                        context_sub_tokens,
                        question_sub_tokens,
                        answer_char_start,
                        answer_char_end + 1,
                    )

                    for (index, (feature, label)) in enumerate(
                            zip(bert_features, bert_labels)):
                        bert_tokens = feature
                        answer_start, answer_end = label

                        if (answer_start < 0
                                or answer_start >= len(bert_tokens)
                                or answer_end >= len(bert_tokens)
                                or bert_tokens[answer_start].text_span is None
                                or bert_tokens[answer_end].text_span is None):
                            continue

                        char_start = bert_tokens[answer_start].text_span[0]
                        char_end = bert_tokens[answer_end].text_span[1]
                        bert_answer = context_text[char_start:char_end]

                        if char_answer_text != bert_answer:
                            logger.warning(
                                f"subword_tokenized_error: {char_answer_text} ### {bert_answer}"
                            )
                            subword_tokenized_error_count += 1

                        feature_row = {
                            "bert_input":
                            [token.text for token in bert_tokens],
                            "bert_token": bert_tokens,
                        }
                        features.append(feature_row)

                        bert_id = id_ + f"#{index}"
                        label_row = {
                            "id": bert_id,  # question_id + bert_index
                            "answer_texts": "\t".join(answer_texts),
                            "answer_start": answer_start,
                            "answer_end": answer_end,
                            "answerable": answerable,
                        }
                        labels.append(label_row)

                        if id_ not in helper["examples"]:
                            helper["examples"][id_] = {
                                "context": context_text,
                                "question": question_text,
                                "answers": answer_texts,
                            }
                        helper["examples"][id_][
                            f"bert_tokens_{index}"] = bert_tokens

        logger.info(
            f"tokenized_error_count - word: {word_tokenized_error_count} | subword: {subword_tokenized_error_count}"
        )
        return make_batch(features, labels), helper
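
Both this reader and the one below walk the same SQuAD-style structure: a list of articles, each with paragraphs holding a context and its qas, where unanswerable questions carry is_impossible and plausible_answers instead of answers (the readers also accept the same list wrapped as {"data": [...]}). A hedged sketch of that input, with invented texts:

# Hypothetical SQuAD-style input (texts invented) showing the keys accessed above.
squad_like = [
    {
        "paragraphs": [
            {
                "context": "Claude Shannon founded information theory in 1948.",
                "qas": [
                    {
                        "id": "q-0001",
                        "question": "When was information theory founded?",
                        "answers": [{"text": "1948", "answer_start": 45}],
                    },
                    {
                        "id": "q-0002",
                        "question": "Who disproved information theory?",
                        "is_impossible": True,
                        "answers": [],
                        "plausible_answers": [{"text": "Claude Shannon", "answer_start": 0}],
                    },
                ],
            }
        ]
    }
]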
Example no. 10
    def _read(self, file_path, data_type=None):
        tokenized_error_count = 0

        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        helper = {
            "file_path": file_path,
            "examples": {},  # qid: {context: ..., text_span: ..., question: ..., answer_texts}
            "raw_dataset": squad,

            "model": {
                "lang_code": self.lang_code,
            },
        }

        features, labels = [], []

        for article in tqdm(squad, desc=data_type):
            for paragraph in article["paragraphs"]:
                context = paragraph["context"].replace("``", '" ').replace("''", '" ')
                context_words = self.word_tokenizer.tokenize(context)

                if (
                    self.context_max_length is not None
                    and data_type == "train"
                    and len(context_words) > self.context_max_length
                ):
                    continue

                for qa in paragraph["qas"]:
                    question = qa["question"].strip().replace("\n", "")
                    id_ = qa["id"]

                    answer_texts, answer_indices = [], []

                    if qa.get("is_impossible", None):
                        answers = qa["plausible_answers"]
                        answerable = 0
                    else:
                        answers = qa["answers"]
                        answerable = 1

                    for answer in answers:
                        answer_start = answer["answer_start"]
                        answer_end = answer_start + len(answer["text"])

                        answer_texts.append(answer["text"])
                        answer_indices.append((answer_start, answer_end))

                    feature_row = {
                        "context": self._clean_text(context),
                        "question": question,
                    }
                    features.append(feature_row)

                    if len(answer_indices) > 0:
                        answer_start, answer_end = self._find_one_most_common(answer_indices)
                        text_spans = self._convert_to_spans(context, context_words)
                        word_idxs = self._get_word_span_idxs(text_spans, answer_start, answer_end)

                        word_answer_start = word_idxs[0]
                        word_answer_end = word_idxs[-1]

                        # To check rebuild answer: char_answer_text - word_answer_text
                        char_answer_text = context[answer_start:answer_end]
                        word_answer_text = context[
                            text_spans[word_answer_start][0] : text_spans[word_answer_end][1]
                        ]

                        if not self._is_rebuild(char_answer_text, word_answer_text):
                            logger.warning(f"word_tokenized_error: {char_answer_text}  ###  {word_answer_text}")
                            tokenized_error_count += 1

                    else:
                        # Unanswerable
                        answers = ["<noanswer>"]
                        text_spans = []
                        answer_start, answer_end = 0, 0
                        word_answer_start, word_answer_end = 0, 0

                    label_row = {
                        "id": id_,
                        "answer_start": word_answer_start,
                        "answer_end": word_answer_end,
                        "answerable": answerable,
                    }
                    labels.append(label_row)

                    helper["examples"][id_] = {
                        "context": context,
                        "text_span": text_spans,
                        "question": question,
                        "answers": answer_texts,
                    }

        logger.info(f"tokenized_error_count: {tokenized_error_count} ")
        return make_batch(features, labels), helper
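
The answer-span bookkeeping above hinges on mapping a character-level span back to word indices via the tokenized context's text spans. The sketch below illustrates that mapping under two assumptions not confirmed by the code shown: _convert_to_spans returns exclusive-end (start, end) character spans per word, and _get_word_span_idxs returns the indices of all words overlapping the answer span.

# Illustration only; helper behavior is assumed, not taken from the project.
context = "The cat sat on the mat."
context_words = ["The", "cat", "sat", "on", "the", "mat", "."]
text_spans = [(0, 3), (4, 7), (8, 11), (12, 14), (15, 18), (19, 22), (22, 23)]

answer_start, answer_end = 19, 22   # character span of "mat" (exclusive end)
word_idxs = [i for i, (start, end) in enumerate(text_spans)
             if start < answer_end and end > answer_start]
word_answer_start, word_answer_end = word_idxs[0], word_idxs[-1]  # both 5

# rebuilding the answer from the word spans recovers the original text
assert context[text_spans[word_answer_start][0]:text_spans[word_answer_end][1]] == "mat"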