Example #1
    def _add_examples(self, examples, example_failures, paragraph, split):
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = self._tokenizer.tokenize(token)
            for j, sub_token in enumerate(sub_tokens):
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        for qa in paragraph["qas"]:
            qas_id = qa["id"] if "id" in qa else None
            qid = qa["qid"] if "qid" in qa else None
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
            if split == "train":
                if self.v2:
                    is_impossible = qa["is_impossible"]
                if not is_impossible:
                    if "detected_answers" in qa:  # MRQA format
                        answer = qa["detected_answers"][0]
                        answer_offset = answer["char_spans"][0][0]
                    else:  # SQuAD format
                        answer = qa["answers"][0]
                        answer_offset = answer["answer_start"]
                    orig_answer_text = answer["text"]
                    answer_length = len(orig_answer_text)
                    # Skip answers whose character span falls outside the document.
                    if answer_offset + answer_length - 1 >= len(
                            char_to_word_offset):
                        utils.log("End position is out of document!")
                        example_failures[0] += 1
                        continue
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]

                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means that, in training mode, not every
                    # example is guaranteed to be preserved.
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    actual_text = actual_text.lower()
                    cleaned_answer_text = cleaned_answer_text.lower()
                    if actual_text.find(cleaned_answer_text) == -1:
                        utils.log("Could not find answer: '{:}' in doc vs. "
                                  "'{:}' in provided answer".format(
                                      tokenization.printable_text(actual_text),
                                      tokenization.printable_text(
                                          cleaned_answer_text)))
                        example_failures[0] += 1
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                    plausible_answers = qa.get("plausible_answers", None)
                    if plausible_answers:
                        plau_answer_text = plausible_answers[0]["text"]
                        plau_answer_start = plausible_answers[0][
                            "answer_start"]
                        plau_answer_length = len(plau_answer_text)
                        if plau_answer_start + plau_answer_length - 1 >= len(
                                char_to_word_offset):
                            tf.logging.warning("plausible answer error, pass.")
                            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
                        else:
                            plau_answer_start_w = char_to_word_offset[
                                plau_answer_start]
                            plau_answer_end_w = char_to_word_offset[
                                plau_answer_start + plau_answer_length - 1]

                            actual_text = " ".join(
                                doc_tokens[plau_answer_start_w:(
                                    plau_answer_end_w + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(
                                    plau_answer_text))
                            actual_text = actual_text.lower()
                            cleaned_answer_text = cleaned_answer_text.lower()
                            if actual_text.find(cleaned_answer_text) == -1:
                                tf.logging.warning(
                                    "Could not recover plausible answer text "
                                    "from document; skipping plausible answer.")
                                plau_answer_text = plau_answer_start_w = plau_answer_end_w = None

            example = QAExample(
                task_name=self.name,
                eid=len(examples),
                qas_id=qas_id,
                qid=qid,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible,
                all_doc_tokens=all_doc_tokens,
                orig_to_tok_index=orig_to_tok_index,
                tok_to_orig_index=tok_to_orig_index,
                plau_answer_start=plau_answer_start_w,
                plau_answer_text=plau_answer_text,
                plau_answer_end=plau_answer_end_w,
            )
            examples.append(example)
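
The example above relies on two pieces of bookkeeping: char_to_word_offset, which maps every character of the context to the index of the whitespace-delimited word it falls in, and the orig_to_tok_index / tok_to_orig_index pair, which aligns those words with their sub-tokens. A minimal self-contained sketch of the character-to-word mapping follows; the is_whitespace helper shown here is an assumed stand-in for the one defined alongside the original method.

# Sketch of the offset bookkeeping used above. The is_whitespace helper is an
# assumed stand-in for the project's own helper, written here for illustration.

def is_whitespace(c):
    # Space, tab, CR/LF and the narrow no-break space count as separators.
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F


def split_with_offsets(text):
    doc_tokens, char_to_word_offset = [], []
    prev_is_whitespace = True
    for c in text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)   # start a new word
            else:
                doc_tokens[-1] += c    # extend the current word
            prev_is_whitespace = False
        # Every character (whitespace included) points at the current word.
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset


doc_tokens, c2w = split_with_offsets("The quick brown fox")
answer_offset, answer_text = 10, "brown"        # character-level annotation
start_position = c2w[answer_offset]
end_position = c2w[answer_offset + len(answer_text) - 1]
assert doc_tokens[start_position:end_position + 1] == ["brown"]

With this mapping, an answer given as a character offset plus length is converted to inclusive word positions exactly as in the listing above.
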
Example #2
    def _add_examples(self, examples, example_failures, paragraph, split):
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        if self.name in [
                "sacqa", "cmrc2018", "ccks42ee", "ccks42single", "ccks42multi"
        ]:  # for chinese
            prev_is_chinese = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace or prev_is_chinese or is_chinese_char(
                            c):
                        doc_tokens.append(c)
                        prev_is_chinese = True if is_chinese_char(c) else False
                    else:
                        doc_tokens[-1] += c
                        prev_is_chinese = False
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
        else:
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in paragraph["qas"]:
            qas_id = qa["id"] if "id" in qa else None
            qid = qa["qid"] if "qid" in qa else None
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if split == "train":
                if self.v2:
                    is_impossible = qa["is_impossible"]
                if not is_impossible:
                    if "detected_answers" in qa:  # MRQA format
                        answer = qa["detected_answers"][0]
                        answer_offset = answer["char_spans"][0][0]
                    else:  # SQuAD format
                        answer = qa["answers"][0]
                        answer_offset = answer["answer_start"]
                    orig_answer_text = answer["text"]
                    answer_length = len(orig_answer_text)
                    # Skip answers whose character span falls outside the document.
                    if answer_offset + answer_length - 1 >= len(
                            char_to_word_offset):
                        utils.log("End position is out of document!")
                        example_failures[0] += 1
                        continue
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]

                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means that, in training mode, not every
                    # example is guaranteed to be preserved.
                    if self.name in [
                            "sacqa", "cmrc2018", "ccks42ee", "ccks42single",
                            "ccks42multi"
                    ]:  # for chinese, no whitespace needed
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                    else:
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                    actual_text = actual_text.lower()
                    cleaned_answer_text = cleaned_answer_text.lower()
                    if actual_text.find(cleaned_answer_text) == -1:
                        utils.log(
                            "Could not find answer: '{:}': '{:}' in doc vs. "
                            "'{:}' in provided answer".format(
                                qas_id,
                                tokenization.printable_text(actual_text),
                                tokenization.printable_text(
                                    cleaned_answer_text)))
                        example_failures[0] += 1
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

            example = QAExample(task_name=self.name,
                                eid=len(examples),
                                qas_id=qas_id,
                                qid=qid,
                                question_text=question_text,
                                doc_tokens=doc_tokens,
                                orig_answer_text=orig_answer_text,
                                start_position=start_position,
                                end_position=end_position,
                                is_impossible=is_impossible)
            examples.append(example)
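
The Chinese branch in this example gives every CJK character its own entry in doc_tokens, so character-level answer offsets stay aligned even though the text contains no spaces. It depends on an is_chinese_char helper that the listing does not show; a plausible sketch, assuming it follows the usual BERT-style CJK code-point ranges, is:

# Assumed helper: BERT-style check for CJK ideograph code points.
# Treat this as a sketch, not the project's actual implementation.

def is_chinese_char(c):
    cp = ord(c)
    return ((0x4E00 <= cp <= 0x9FFF) or      # CJK Unified Ideographs
            (0x3400 <= cp <= 0x4DBF) or      # Extension A
            (0x20000 <= cp <= 0x2A6DF) or    # Extension B
            (0x2A700 <= cp <= 0x2B73F) or    # Extension C
            (0x2B740 <= cp <= 0x2B81F) or    # Extension D
            (0x2B820 <= cp <= 0x2CEAF) or    # Extension E
            (0xF900 <= cp <= 0xFAFF) or      # Compatibility Ideographs
            (0x2F800 <= cp <= 0x2FA1F))      # Compatibility Supplement

With such a helper, the splitting loop turns "北京is大" into ["北", "京", "is", "大"]: each CJK character becomes its own word, while an embedded Latin run stays a single token.
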
Example #3
    def _add_examples(self, examples, example_failures, paragraph, split):
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # def parse(sentence):
        #     """ 解析一个句子,返回dependence heads etc """
        #     doc = nlp(sentence)
        #     heads = []
        #     words = []
        #     for sent in doc.sentences:
        #         heads_tmp = []
        #         num_tmp = sum([len(x) if x else 0 for x in heads])
        #         for word in sent.words:
        #             words.append(word.text)
        #             if word.head == 0:
        #                 heads_tmp.append(0)
        #             else:
        #                 heads_tmp.append(word.head + num_tmp)
        #         heads.append(heads_tmp)
        #     heads = reduce(lambda x, y: x + y, heads)
        #     return heads, words
        #
        # def parse_and_trim(tokens):
        #     """ 输入空格分词后的tokens list, parse后按照输入调整heads """
        #     heads, words = parse(" ".join(tokens))
        #     t2w = {}
        #     w2t = {}
        #     ti = 0
        #     wi = 0
        #     last_move = None  # controls alternating advancement of the two pointers
        #     while (ti < len(tokens)) and (wi < len(words)):
        #         if tokens[ti] == words[wi]:
        #             t2w[ti] = wi
        #             w2t[wi] = ti
        #             ti += 1
        #             wi += 1
        #             last_move = None
        #         elif tokens[ti] in words[wi]:
        #             t2w[ti] = wi
        #             if wi not in w2t:
        #                 w2t[wi] = ti
        #             ti += 1
        #             last_move = 't'
        #         elif words[wi] in tokens[ti]:
        #             w2t[wi] = ti
        #             if ti not in t2w:
        #                 t2w[ti] = wi
        #             wi += 1
        #             last_move = 'w'
        #         else:
        #             if last_move == 'w':
        #                 ti += 1
        #                 last_move = 't'
        #             elif last_move == 't':
        #                 wi += 1
        #                 last_move = 'w'
        #             else:
        #                 wi += 1
        #                 ti += 1
        #                 last_move = None
        #     heads_ = []
        #     for ti in range(len(tokens)):
        #         wi = t2w.get(ti, None)
        #         if wi is not None:
        #             h = heads[wi]
        #             if h == 0:
        #                 heads_.append(0)
        #             else:
        #                 h_ = w2t.get(h - 1, None)
        #                 if h_ is not None:
        #                     heads_.append(h_ + 1)
        #                 else:
        #                     heads_.append(ti + 1)
        #         else:
        #             heads_.append(ti + 1)
        #     return heads_
        #
        # def heads_2_dep_matrix(heads):
        #     """ 将dependence heads转换为dependence matrix """
        #     arr = np.diag((1,) * len(heads))
        #     for i, j in enumerate(heads):
        #         if j != 0:
        #             arr[i, j - 1] = 1
        #     while True:  # propagate transitive dependencies
        #         arr1 = np.matmul(arr, arr)
        #         arr1[arr1 > 1] = 1
        #         if (arr1 == arr).all():
        #             break
        #         else:
        #             arr = arr1
        #     return arr

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        # heads = parse_and_trim(doc_tokens)  # dependency heads
        for (i, token) in enumerate(doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = self._tokenizer.tokenize(token)
            for j, sub_token in enumerate(sub_tokens):
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # heads_piece = []
        # last_orig_index = None
        # for ind in range(len(all_doc_tokens)):
        #     orig_index = tok_to_orig_index[ind]
        #     if orig_index == last_orig_index:
        #         heads_piece.append(ind)
        #     else:
        #         h = heads[orig_index]
        #         if h == 0:
        #             heads_piece.append(0)
        #         else:
        #             heads_piece.append(orig_to_tok_index[h - 1] + 1)
        #         last_orig_index = orig_index

        # all_doc_tokens_dep_mask = heads_2_dep_matrix(heads_piece)

        for qa in paragraph["qas"]:
            qas_id = qa["id"] if "id" in qa else None
            qid = qa["qid"] if "qid" in qa else None
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
            if split == "train":
                if self.v2:
                    is_impossible = qa["is_impossible"]
                if not is_impossible:
                    if "detected_answers" in qa:  # MRQA format
                        answer = qa["detected_answers"][0]
                        answer_offset = answer["char_spans"][0][0]
                    else:  # SQuAD format
                        answer = qa["answers"][0]
                        answer_offset = answer["answer_start"]
                    orig_answer_text = answer["text"]
                    answer_length = len(orig_answer_text)
                    # Skip answers whose character span falls outside the document.
                    if answer_offset + answer_length - 1 >= len(
                            char_to_word_offset):
                        utils.log("End position is out of document!")
                        example_failures[0] += 1
                        continue
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]

                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means that, in training mode, not every
                    # example is guaranteed to be preserved.
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    actual_text = actual_text.lower()
                    cleaned_answer_text = cleaned_answer_text.lower()
                    if actual_text.find(cleaned_answer_text) == -1:
                        utils.log("Could not find answer: '{:}' in doc vs. "
                                  "'{:}' in provided answer".format(
                                      tokenization.printable_text(actual_text),
                                      tokenization.printable_text(
                                          cleaned_answer_text)))
                        example_failures[0] += 1
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                    plausible_answers = qa.get("plausible_answers", None)
                    if plausible_answers:
                        plau_answer_text = plausible_answers[0]["text"]
                        plau_answer_start = plausible_answers[0][
                            "answer_start"]
                        plau_answer_length = len(plau_answer_text)
                        if plau_answer_start + plau_answer_length - 1 >= len(
                                char_to_word_offset):
                            tf.logging.waring("plausible answer error, pass.")
                            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
                        else:
                            plau_answer_start_w = char_to_word_offset[
                                plau_answer_start]
                            plau_answer_end_w = char_to_word_offset[
                                plau_answer_start + plau_answer_length - 1]

                            actual_text = " ".join(
                                doc_tokens[plau_answer_start_w:(
                                    plau_answer_end_w + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(
                                    plau_answer_text))
                            actual_text = actual_text.lower()
                            cleaned_answer_text = cleaned_answer_text.lower()
                            if actual_text.find(cleaned_answer_text) == -1:
                                tf.logging.warning(
                                    "Could not recover plausible answer text "
                                    "from document; skipping plausible answer.")
                                plau_answer_text = plau_answer_start_w = plau_answer_end_w = None

            example = QAExample(
                task_name=self.name,
                eid=len(examples),
                qas_id=qas_id,
                qid=qid,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible,
                all_doc_tokens=all_doc_tokens,
                orig_to_tok_index=orig_to_tok_index,
                tok_to_orig_index=tok_to_orig_index,
                # all_doc_tokens_dep_mask=all_doc_tokens_dep_mask,
                plau_answer_start=plau_answer_start_w,
                plau_answer_text=plau_answer_text,
                plau_answer_end=plau_answer_end_w,
            )
            examples.append(example)
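
None of the listings show how _add_examples is invoked. A hedged sketch of a typical driver for SQuAD-style JSON follows; the function name build_examples, the file-path argument, and the surrounding task object are illustrative assumptions, not part of the original code.

import json

def build_examples(task, json_path, split):
    """Illustrative driver: collects QAExamples from a SQuAD-style file."""
    examples = []
    example_failures = [0]   # mutable counter so _add_examples can bump it
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]
    for article in data:
        for paragraph in article["paragraphs"]:
            task._add_examples(examples, example_failures, paragraph, split)
    print("Built {} examples ({} skipped)".format(len(examples),
                                                  example_failures[0]))
    return examples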