    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm over the jieba word
        segmentation of the input: the first character of each matched word is
        kept and the remaining characters are prefixed with "##", while
        non-Chinese tokens are passed through unchanged.

        For example, if jieba segments the input into ["中国", "人"]:
          input = "中国人"
          output = ["中", "##国", "人"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        seq_cws_dict = {}
        output_tokens = []
        if self.count % 10000 == 0:
            logger.info(f"count ={self.count}, processing text: {text}")
        self.count += 1
        for ind, token in enumerate(whitespace_tokenize(text)):
            seq_cws = jieba.lcut(token)
            seq_cws_dict.update({x: 1 for x in seq_cws})

        for token in whitespace_tokenize(text):

            chars = list(token)
            i = 0

            while i < len(chars):
                if len(CH_RE.findall(chars[i])) == 0:  # Not a Chinese character: keep the original token unchanged and stop splitting it.
                    output_tokens.append(token)
                    break

                has_add = False
                for length in range(5, 0, -1):
                    if i + length > len(chars):
                        continue
                    if ''.join(chars[i:i + length]) in seq_cws_dict:
                        output_tokens.append(chars[i])
                        for l in range(1, length):
                            output_tokens.append('##' + chars[i + l])
                        i += length
                        has_add = True
                        break
                if not has_add:
                    output_tokens.append(chars[i])
                    i += 1

        return output_tokens
    def tokenize(self, text, never_split=None):
        """ Basic Tokenization of a piece of text.
            Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.

        Args:
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
        """
        never_split = self.never_split + (never_split
                                          if never_split is not None else [])
        text = self._clean_text(text)
        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            # pass MASK forward
            if MASK in token:
                split_tokens.append(MASK)
                if token != MASK:
                    remaining_chars = token.replace(MASK, "").strip()
                    if remaining_chars:
                        split_tokens.append(remaining_chars)
                continue

            if self.do_lower_case and token not in never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
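
# A minimal, self-contained sketch (added for clarity, not part of the original
# snippet) of the greedy longest-match-first splitting performed by the first,
# jieba-driven tokenize() method above: every piece found in the segmentation
# dictionary keeps its first character and prefixes the remaining characters
# with "##". The toy `seg_dict` below stands in for the jieba output; the real
# code builds it from `jieba.lcut(token)`.
def _demo_greedy_wordpiece(token, seg_dict, max_len=5):
    chars = list(token)
    out, i = [], 0
    while i < len(chars):
        matched = False
        for length in range(min(max_len, len(chars) - i), 0, -1):
            if "".join(chars[i:i + length]) in seg_dict:
                out.append(chars[i])
                out.extend("##" + c for c in chars[i + 1:i + length])
                i += length
                matched = True
                break
        if not matched:
            out.append(chars[i])
            i += 1
    return out

# _demo_greedy_wordpiece("北京大学", {"北京": 1, "大学": 1})
# -> ['北', '##京', '大', '##学']
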
def read_trivia_examples(input_file, is_training=True):
    total_cnt = 0
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    no_answer_cnt = 0
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["qid"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if qa["answers"] == []:
                    no_answer_cnt += 1
                    continue
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    # word position
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))
                    cleaned_start = actual_text.lower().find(
                        cleaned_answer_text)
                    #if actual_text.find(cleaned_answer_text) == -1:
                    if cleaned_start == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue
                    else:
                        # cleaned_answer_text may already be lower-cased, so rebuild the original casing from actual_text
                        orig_answer_text = actual_text[
                            cleaned_start:cleaned_start +
                            len(cleaned_answer_text)]
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                example = TriviaExample(qas_id=qas_id,
                                        question_text=question_text,
                                        doc_tokens=doc_tokens,
                                        orig_answer_text=orig_answer_text,
                                        start_position=start_position,
                                        end_position=end_position)
                examples.append(example)
    print("# of questions without an answer: {}".format(no_answer_cnt))
    return examples
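
# Hedged illustration (added, not from the source): the char_to_word_offset
# bookkeeping used above maps every character of the context to the index of
# the whitespace-delimited token it falls in, so a character-level answer span
# can be converted to word-level start/end positions. A standalone toy version:
def _demo_char_to_word_offset(text):
    doc_tokens, char_to_word_offset = [], []
    prev_is_whitespace = True
    for c in text:
        if c in " \t\r\n" or ord(c) == 0x202F:
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset

# doc_tokens, offsets = _demo_char_to_word_offset("the quick brown fox")
# The answer "brown" starts at character 10 and is 5 characters long, so
# offsets[10] == 2 and offsets[10 + 5 - 1] == 2, i.e. word span (2, 2).
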
# Example 4
    def _create_examples(self, input_data, set_type, language):
        is_training = set_type == "train"
        paragraph_id = 0
        examples = []
        for entry in tqdm(input_data):
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                sentence_breaks = list(
                    infer_sentence_breaks(paragraph_text)
                )  # TODO can also get sentence_breaks from json directly.
                paragraph_id += 1
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if _is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qas in paragraph["qas"]:
                    qas_id = qas["id"]
                    question_text = qas["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    # If a question has multiple answers, we only use the first.
                    answer = qas["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    sentence_text = None
                    for start, end in sentence_breaks:
                        if start <= answer_offset < end:
                            sentence_text = paragraph_text[start:end]
                            break
                    # A potential problem here is that the sentence might break
                    # around the answer fragment. In that case, we skip the example.
                    if not sentence_text:
                        continue
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue

                    example = RetrievalSquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        answer_text=actual_text,
                        sentence_text=sentence_text,
                        paragraph_text=paragraph_text,
                        paragraph_id=paragraph_id)
                    examples.append(example)
        return examples
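
# Hedged sketch (added, not from the source): _create_examples() relies on an
# infer_sentence_breaks() helper, not shown here, to find the sentence that
# contains the answer span. A naive stand-in that yields (start, end) character
# offsets for period-delimited sentences, plus the containment test used above:
def _demo_sentence_for_offset(paragraph_text, answer_offset):
    breaks, start = [], 0
    for i, ch in enumerate(paragraph_text):
        if ch == ".":
            breaks.append((start, i + 1))
            start = i + 1
    if start < len(paragraph_text):
        breaks.append((start, len(paragraph_text)))
    for s, e in breaks:
        if s <= answer_offset < e:
            return paragraph_text[s:e]
    return None

# _demo_sentence_for_offset("Alpha one. Beta two.", 12) -> " Beta two."
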
# Example 5
def read_nq_examples(input_file_or_data, is_training):
    """Read a NQ json file into a list of NQExample. Refer to `nq_to_squad.py`
       to convert the `simplified-nq-t*.jsonl` files to NQ json."""
    if isinstance(input_file_or_data, str):
        with open(input_file_or_data, "r", encoding='utf-8') as f:
            input_data = json.load(f)["data"]

    else:
        input_data = input_file_or_data

    for entry_index, entry in enumerate(tqdm(input_data, total=len(input_data))):
        # if entry_index >= 2:
        #     break
        assert len(entry["paragraphs"]) == 1
        paragraph = entry["paragraphs"][0]
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        assert len(paragraph["qas"]) == 1
        qa = paragraph["qas"][0]
        start_position = None
        end_position = None
        long_position = None
        orig_answer_text = None
        short_is_impossible = False
        long_is_impossible = False
        if is_training:
            short_is_impossible = qa["short_is_impossible"]
            short_answers = qa["short_answers"]
            if len(short_answers) >= 2:
                # logger.info(f"Choosing leftmost of "
                #     f"{len(short_answers)} short answer")
                short_answers = sorted(short_answers, key=lambda sa: sa["answer_start"])
                short_answers = short_answers[0: 1]

            if not short_is_impossible:
                answer = short_answers[0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[
                    answer_offset + answer_length - 1]
                # Only add answers where the text can be exactly
                # recovered from the document. If this CAN'T
                # happen it's likely due to weird Unicode stuff
                # so we will just skip the example.
                #
                # Note that this means for training mode, every
                # example is NOT guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:
                    end_position + 1])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning(
                        "Could not find answer: '%s' vs. '%s'",
                        actual_text, cleaned_answer_text)
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

            long_is_impossible = qa["long_is_impossible"]
            long_answers = qa["long_answers"]
            if (len(long_answers) != 1) and not long_is_impossible:
                raise ValueError("For training, each question"
                                 " should have exactly 1 long answer.")

            if not long_is_impossible:
                long_answer = long_answers[0]
                long_answer_offset = long_answer["answer_start"]
                long_position = char_to_word_offset[long_answer_offset]
            else:
                long_position = -1

            # print(f'Q:{question_text}')
            # print(f'A:{start_position}, {end_position},
            # {orig_answer_text}')
            # print(f'R:{doc_tokens[start_position: end_position]}')

            if not short_is_impossible and not long_is_impossible:
                assert long_position <= start_position

            if not short_is_impossible and long_is_impossible:
                assert False, 'Invalid combination: short answer present but long answer marked impossible'

        example = NQExample(
            qas_id=qa["id"],
            question_text=qa["question"],
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            long_position=long_position,
            short_is_impossible=short_is_impossible,
            long_is_impossible=long_is_impossible,
            crop_start=qa["crop_start"])

        yield example
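
# Hedged usage note (added): unlike the other readers in this file,
# read_nq_examples() is a generator, so callers typically stream it or
# materialize it, e.g.
#
#     examples = list(read_nq_examples(nq_json_path, is_training=True))
#
# where nq_json_path points at an NQ-as-SQuAD json produced by nq_to_squad.py.
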
# Example 6
def read_nq_entry(entry, is_training):
    """
    Converts a NQ entry into a list of NqExamples.
    :param entry: dict
    :param is_training: bool
    :return: list[NqExample]
    """
    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    contexts_id = entry["id"]
    contexts = entry["contexts"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in contexts:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    questions = []
    for i, question in enumerate(entry["questions"]):
        qas_id = "{}".format(contexts_id)
        question_text = question["input_text"]
        start_position = None
        end_position = None
        answer = None
        if is_training:
            answer_dict = entry["answers"][i]
            answer = make_nq_answer(contexts, answer_dict)

            # For now, only handle extractive, yes, and no.
            if answer is None or answer.offset is None:
                continue
            start_position = char_to_word_offset[answer.offset]
            end_position = char_to_word_offset[answer.offset +
                                               len(answer.text) - 1]

            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position +
                                                              1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(answer.text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'",
                               actual_text, cleaned_answer_text)
                continue

        questions.append(question_text)
        example = NqExample(example_id=int(contexts_id),
                            qas_id=qas_id,
                            questions=questions[:],
                            doc_tokens=doc_tokens,
                            doc_tokens_map=entry.get("contexts_map", None),
                            answer=answer,
                            start_position=start_position,
                            end_position=end_position)
        examples.append(example)
    return examples
# Example 7
    def to_feature_list(
        self,
        tokenizer,
        max_seq_length,
        doc_stride,
        max_query_length,
        set_type,
    ):
        is_training = set_type == PHASE.TRAIN
        features = []
        if is_training and not self.is_impossible:
            # Get start and end position
            start_position = self.start_position
            end_position = self.end_position

            # If the answer cannot be found in the text, then skip this example.
            actual_text = " ".join(
                self.doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                whitespace_tokenize(self.answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'",
                               actual_text, cleaned_answer_text)
                return []

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(self.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        if is_training and not self.is_impossible:
            tok_start_position = orig_to_tok_index[self.start_position]
            if self.end_position < len(self.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[self.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1

            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, self.answer_text)

        spans = []

        truncated_query = tokenizer.encode(
            self.question_text,
            add_special_tokens=False,
            truncation=True,
            max_length=max_query_length,
        )
        sequence_added_tokens = (
            tokenizer.max_len - tokenizer.max_len_single_sentence +
            1 if "roberta" in str(type(tokenizer))
            or "camembert" in str(type(tokenizer)) else tokenizer.max_len -
            tokenizer.max_len_single_sentence)
        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

        span_doc_tokens = all_doc_tokens
        while len(spans) * doc_stride < len(all_doc_tokens):

            encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
                truncated_query
                if tokenizer.padding_side == "right" else span_doc_tokens,
                span_doc_tokens
                if tokenizer.padding_side == "right" else truncated_query,
                truncation="only_second"
                if tokenizer.padding_side == "right" else "only_first",
                pad_to_max_length=True,
                max_length=max_seq_length,
                return_overflowing_tokens=True,
                stride=max_seq_length - doc_stride - len(truncated_query) -
                sequence_pair_added_tokens,
                return_token_type_ids=True,
            )

            paragraph_len = min(
                len(all_doc_tokens) - len(spans) * doc_stride,
                max_seq_length - len(truncated_query) -
                sequence_pair_added_tokens,
            )

            if tokenizer.pad_token_id in encoded_dict["input_ids"]:
                if tokenizer.padding_side == "right":
                    non_padded_ids = encoded_dict[
                        "input_ids"][:encoded_dict["input_ids"].
                                     index(tokenizer.pad_token_id)]
                else:
                    last_padding_id_position = (
                        len(encoded_dict["input_ids"]) - 1 -
                        encoded_dict["input_ids"][::-1].index(
                            tokenizer.pad_token_id))
                    non_padded_ids = encoded_dict["input_ids"][
                        last_padding_id_position + 1:]

            else:
                non_padded_ids = encoded_dict["input_ids"]

            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

            token_to_orig_map = {}
            for i in range(paragraph_len):
                index = (len(truncated_query) + sequence_added_tokens +
                         i if tokenizer.padding_side == "right" else i)
                token_to_orig_map[index] = tok_to_orig_index[len(spans) *
                                                             doc_stride + i]

            encoded_dict["paragraph_len"] = paragraph_len
            encoded_dict["tokens"] = tokens
            encoded_dict["token_to_orig_map"] = token_to_orig_map
            encoded_dict["truncated_query_with_special_tokens_length"] = (
                len(truncated_query) + sequence_added_tokens)
            encoded_dict["token_is_max_context"] = {}
            encoded_dict["start"] = len(spans) * doc_stride
            encoded_dict["length"] = paragraph_len

            spans.append(encoded_dict)

            if "overflowing_tokens" not in encoded_dict or (
                    "overflowing_tokens" in encoded_dict
                    and len(encoded_dict["overflowing_tokens"]) == 0):
                break
            span_doc_tokens = encoded_dict["overflowing_tokens"]

        for doc_span_index in range(len(spans)):
            for j in range(spans[doc_span_index]["paragraph_len"]):
                is_max_context = _new_check_is_max_context(
                    spans, doc_span_index, doc_span_index * doc_stride + j)
                index = (j if tokenizer.padding_side == "left" else
                         spans[doc_span_index]
                         ["truncated_query_with_special_tokens_length"] + j)
                spans[doc_span_index]["token_is_max_context"][
                    index] = is_max_context

        for span in spans:
            # Identify the position of the CLS token
            cls_index = span["input_ids"].index(tokenizer.cls_token_id)

            # p_mask: mask with 1 for tokens that cannot be in the answer
            #         (0 for tokens which can be in an answer)
            # The original TF implementation also keeps the classification token (set to 0) (not sure why...)
            p_mask = np.ones_like(span["token_type_ids"])
            if tokenizer.padding_side == "right":
                p_mask[len(truncated_query) + sequence_added_tokens:] = 0
            else:
                p_mask[-len(span["tokens"]):-(len(truncated_query) +
                                              sequence_added_tokens)] = 0

            pad_token_indices = np.where(
                span["input_ids"] == tokenizer.pad_token_id)
            special_token_indices = np.asarray(
                tokenizer.get_special_tokens_mask(
                    span["input_ids"],
                    already_has_special_tokens=True)).nonzero()

            p_mask[pad_token_indices] = 1
            p_mask[special_token_indices] = 1

            # Set the cls index to 0: the CLS index can be used for impossible answers
            p_mask[cls_index] = 0

            span_is_impossible = self.is_impossible
            start_position = 0
            end_position = 0
            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = span["start"]
                doc_end = span["start"] + span["length"] - 1
                out_of_span = False

                # noinspection PyUnboundLocalVariable
                if not (tok_start_position >= doc_start
                        and tok_end_position <= doc_end):
                    out_of_span = True

                if out_of_span:
                    start_position = cls_index
                    end_position = cls_index

                    # We store "is_impossible" at an example level instead
                    # noinspection PyUnusedLocal
                    span_is_impossible = True
                else:
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = len(
                            truncated_query) + sequence_added_tokens

                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            features.append(
                DataRow(
                    unique_id="",
                    qas_id=self.qas_id,
                    tokens=span["tokens"],
                    token_to_orig_map=span["token_to_orig_map"],
                    token_is_max_context=span["token_is_max_context"],
                    input_ids=np.array(span["input_ids"]),
                    input_mask=np.array(span["attention_mask"]),
                    segment_ids=np.array(span["token_type_ids"]),
                    cls_index=np.array(cls_index),
                    p_mask=np.array(p_mask.tolist()),
                    paragraph_len=span["paragraph_len"],
                    start_position=start_position,
                    end_position=end_position,
                    answers=self.answers,
                    doc_tokens=self.doc_tokens,
                ))
        return features
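
# Hedged sketch (added): to_feature_list() above assumes an
# _improve_answer_span() helper. The version below follows the reference
# BERT/transformers implementation, which shrinks the token span to the
# smallest sub-span whose detokenization matches the tokenized answer text;
# the project's actual helper may differ in detail.
def _demo_improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                              orig_answer_text):
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
    return (input_start, input_end)
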
# Example 8
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    reader = open(input_file, "r", encoding='utf-8')
    reader.readline()
    input_data = []

    for line in reader:
        input_data.append(json.loads(line))

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in paragraph["qas"]:
            try:
                qas_id = qa["id"]
            except KeyError:
                qas_id = qa["qid"]

            question_text = qa["question"]
            start_positions = []
            end_positions = []
            orig_answer_texts = []
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if is_training:
                if version_2_with_negative:
                    is_impossible = qa["is_impossible"]
                # if (len(qa["answers"]) != 1) and (not is_impossible):
                #     raise ValueError(
                #         "For training, each question should have exactly 1 answer.")
                if not is_impossible:
                    flag = True
                    for answer in qa["detected_answers"]:
                        orig_answer_text = paragraph_text[
                            answer["char_spans"][0][0]:
                            answer["char_spans"][0][1] + 1]  #answer["text"]
                        answer_offset = answer["char_spans"][0][0]
                        # answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer["char_spans"]
                                                           [0][1]]

                        # end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            flag = False
                            break
                        start_positions.append(start_position)
                        end_positions.append(end_position)
                        orig_answer_texts.append(orig_answer_text)

                    if not flag and is_training:
                        continue
                # else:
                #     start_position = -1
                #     end_position = -1
                #     orig_answer_text = ""

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                # orig_answer_text=orig_answer_text,
                # start_position=start_position,
                # end_position=end_position,
                orig_answer_text=orig_answer_texts,
                start_position=start_positions,
                end_position=end_positions,
                is_impossible=is_impossible)
            examples.append(example)
    return examples
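
# Hedged data-shape note (added, inferred from the reader above rather than an
# official spec): each qa entry in this MRQA-style format is assumed to carry
# inclusive character spans, e.g.
#
#     {"qid": "abc123",
#      "question": "Who wrote it?",
#      "detected_answers": [{"char_spans": [[17, 23]], "text": "Tolstoy"}]}
#
# so answer["char_spans"][0] is [start_char, end_char] and
# paragraph_text[start_char:end_char + 1] recovers the answer string.
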
# Example 9
def squad_convert_example_to_features(example, max_seq_length, doc_stride,
                                      max_query_length, padding_strategy,
                                      is_training):
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(
            example.doc_tokens[start_position:(end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(
            example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning(
                f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'"
            )
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        if tokenizer.__class__.__name__ in [
                "RobertaTokenizer",
                "LongformerTokenizer",
                "BartTokenizer",
                "RobertaTokenizerFast",
                "LongformerTokenizerFast",
                "BartTokenizerFast",
        ]:
            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
        else:
            sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.answer_text)

    spans = []
    truncated_query = tokenizer.encode(example.question_text,
                                       add_special_tokens=False,
                                       truncation=True,
                                       max_length=max_query_length)

    # Tokenizers that insert 2 SEP tokens in between <context> & <question> need
    # special handling when computing the mask of added tokens.
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_single_sentence +
        1 if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET else
        tokenizer.model_max_length - tokenizer.max_len_single_sentence)
    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):

        # Define the side we want to truncate / pad and the text/pair sorting
        if tokenizer.padding_side == "right":
            texts = truncated_query
            pairs = span_doc_tokens
            truncation = TruncationStrategy.ONLY_SECOND
        else:
            texts = span_doc_tokens
            pairs = truncated_query
            truncation = TruncationStrategy.ONLY_FIRST

        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
            texts,
            pairs,
            truncation=truncation,
            padding=padding_strategy,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) -
            sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict[
                    "input_ids"][:encoded_dict["input_ids"].
                                 index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1 -
                    encoded_dict["input_ids"][::-1].index(
                        tokenizer.pad_token_id))
                non_padded_ids = encoded_dict["input_ids"][
                    last_padding_id_position + 1:]

        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(
                truncated_query
            ) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) *
                                                         doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(
            truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict or (
                "overflowing_tokens" in encoded_dict
                and len(encoded_dict["overflowing_tokens"]) == 0):
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(
                spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (j if tokenizer.padding_side == "left" else
                     spans[doc_span_index]
                     ["truncated_query_with_special_tokens_length"] + j)
            spans[doc_span_index]["token_is_max_context"][
                index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
        # The original TF implementation also keeps the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens:] = 0
        else:
            p_mask[-len(span["tokens"]):-(len(truncated_query) +
                                          sequence_added_tokens)] = 0

        pad_token_indices = np.where(
            span["input_ids"] == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(
                span["input_ids"], already_has_special_tokens=True)).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start
                    and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # unique_id and example_index cannot be set here; they are filled in after multiprocessing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                guid=example.guid,
            ))
    return features
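
# Hedged sketch (added): both feature builders above call
# _new_check_is_max_context() to decide, for a token that appears in several
# overlapping doc_stride windows, which window gives it the most surrounding
# context. The version below mirrors the reference transformers implementation
# (score = min(left context, right context) + 0.01 * span length); the helper
# actually used in this project may differ.
def _demo_new_check_is_max_context(doc_spans, cur_span_index, position):
    best_score, best_span_index = None, None
    for span_index, doc_span in enumerate(doc_spans):
        end = doc_span["start"] + doc_span["length"] - 1
        if position < doc_span["start"] or position > end:
            continue
        num_left_context = position - doc_span["start"]
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index
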
def read_squad_example(example: QASample):
    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    paragraph_text = example.context
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    qas_id = example.sample_id
    question_text = example.question
    sup_ids = example.sup_ids
    sup_token_pos_ids = []

    answer = example.answer_dict
    orig_answer_text = answer["text"]
    answer_offset = answer["answer_start"]
    answer_length = len(orig_answer_text)
    start_position = char_to_word_offset[answer_offset]
    end_position = char_to_word_offset[answer_offset + answer_length - 1]

    if sup_ids:
        for sup in sup_ids:
            sup_start_position = char_to_word_offset[sup[0]]
            sup_end_position = char_to_word_offset[sup[1] - 1]

            sup_token_pos_ids.append((sup_start_position, sup_end_position))

    # Only add answers where the text can be exactly recovered from the
    # document. If this CAN'T happen it's likely due to weird Unicode
    # stuff so we will just skip the example.
    #
    # Note that this means for training mode, every example is NOT
    # guaranteed to be preserved.
    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
    cleaned_answer_text = " ".join(
        whitespace_tokenize(orig_answer_text))
    if actual_text.find(cleaned_answer_text) == -1:
        logger.warning("Could not find answer: '%s' vs. '%s'",
                       actual_text, cleaned_answer_text)

    return SquadExample(
        qas_id=qas_id,
        question_text=question_text,
        doc_tokens=doc_tokens,
        orig_answer_text=orig_answer_text,
        start_position=start_position,
        end_position=end_position,
        sup_ids=sup_token_pos_ids)
# Example 11
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]

            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]

                question_text = qa["question"]
                start_positions = []
                end_positions = []
                orig_answer_texts = []
                is_impossible = False

                if is_training:  # for debug
                    if version_2_with_negative:
                        is_impossible = qa.get("is_impossible", False)

                    if not is_impossible:
                        flag = True

                        for answer in qa["answers"]:
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset +
                                                               answer_length -
                                                               1]
                            # Only add answers where the text can be exactly recovered from the
                            # document. If this CAN'T happen it's likely due to weird Unicode
                            # stuff so we will just skip the example.
                            #
                            # Note that this means for training mode, every example is NOT
                            # guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                whitespace_tokenize(orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                logger.warning(
                                    "Could not find answer: '%s' vs. '%s'",
                                    actual_text, cleaned_answer_text)

                                flag = False
                                break
                            start_positions.append(start_position)
                            end_positions.append(end_position)
                            orig_answer_texts.append(orig_answer_text)

                        if not flag and is_training:
                            continue

                    # else:
                    #     start_position = -1
                    #     end_position = -1
                    #     orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_texts,
                                       start_position=start_positions,
                                       end_position=end_positions,
                                       is_impossible=is_impossible)
                examples.append(example)

    return examples
def read_coqa_examples(input_file,
                       is_training=True,
                       use_history=False,
                       n_history=-1):
    """
    read a CoQA json file into a list of QA examples
    """
    total_cnt = 0
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        # process story text
        paragraph_text = entry["story"]
        paragraph_id = entry["id"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            # each char is mapped to word position
            char_to_word_offset.append(len(doc_tokens) - 1)

        # process questions
        question_history_texts = []
        for (question, ans) in zip(entry['questions'], entry['answers']):
            total_cnt += 1
            cur_question_text = question["input_text"]
            question_history_texts.append(cur_question_text)
            question_id = question["turn_id"]
            ans_id = ans["turn_id"]
            start_position = None
            end_position = None
            yes_no_flag = None
            yes_no_ans = None
            orig_answer_text = None
            if (question_id != ans_id):
                print("question turns are not ordered!")
                print("mismatched question {}".format(cur_question_text))
            if is_training:
                orig_answer_text = ans["text"]
                answer_offset = ans["span_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if (answer_offset + answer_length >= len(char_to_word_offset)):
                    end_position = char_to_word_offset[-1]
                else:
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length]
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                yes_no_flag = int(ans["yes_no_flag"])
                yes_no_ans = int(ans["yes_no_ans"])
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue

            if (use_history):
                if (n_history == -1
                        or n_history > len(question_history_texts)):
                    question_texts = question_history_texts[:]
                else:
                    question_texts = question_history_texts[-1 * n_history:]
            else:
                question_texts = question_history_texts[-1]

            example = CoQAExample(paragraph_id=paragraph_id,
                                  turn_id=question_id,
                                  question_texts=question_texts,
                                  doc_tokens=doc_tokens,
                                  orig_answer_text=orig_answer_text,
                                  start_position=start_position,
                                  end_position=end_position,
                                  yes_no_flag=yes_no_flag,
                                  yes_no_ans=yes_no_ans)
            examples.append(example)
    logger.info("Total raw examples: {}".format(total_cnt))
    return examples
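
# Hedged illustration (added): with use_history=True, read_coqa_examples()
# pairs each answer with the current question plus up to n_history previous
# turns (all turns when n_history == -1); with use_history=False it keeps only
# the latest question as a single string. A standalone toy version of that
# windowing:
def _demo_history_window(question_history_texts, use_history, n_history):
    if use_history:
        if n_history == -1 or n_history > len(question_history_texts):
            return question_history_texts[:]
        return question_history_texts[-n_history:]
    return question_history_texts[-1]

# _demo_history_window(["q1", "q2", "q3"], True, 2)  -> ["q2", "q3"]
# _demo_history_window(["q1", "q2", "q3"], False, 2) -> "q3" (a single string)
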
# Example 13
def read_newsqa_examples(input_file, is_training, version_2_with_negative, group):
    """Read a SQuAD json file into a list of NewsqaExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for paragraph in input_data:
        if paragraph['type']==group:
            paragraph_text = paragraph["text"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            for qid, qa in enumerate(paragraph["questions"]):
                qas_id = paragraph["storyId"]+str(qid)
                question_text = qa["q"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible=False
                if 'consensus' not in qa:
                    is_impossible=True
                elif 's' not in qa["consensus"] or 'e' not in qa["consensus"]:
                    is_impossible=True
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if not is_impossible:
                        answer = qa["consensus"]
                        answer_offset = answer["s"]
                        answer_end_offset = answer["e"]
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_end_offset-1]
                        answer_length = answer_end_offset-answer_offset
                        orig_answer_text = paragraph_text[answer_offset:answer_end_offset-1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            # pdb.set_trace()  # debug-only breakpoint
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""
                example = NewsqaExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
# Example 14
def read_newsqa_examples(input_file, is_training, version_2_with_negative=True):
    """Read a NewsQA json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for story in input_data:
        story_text = story["text"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in story_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for i, qa in enumerate(story["questions"]):
            qas_id = story["storyId"] + '-' + str(i)
            question_text = qa["q"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            # if is_training:
            if version_2_with_negative:
                if ("noAnswer" in qa["consensus"] or "badQuestion" in qa["consensus"]):
                    is_impossible = True
                else:
                    is_impossible = False
            if (len(qa["consensus"]) != 2) and (not is_impossible):
                raise ValueError(
                    "For training, each question should have exactly 1 answer.", qa)
            if not is_impossible:
                answer = qa["consensus"]
                answer_offset = answer["s"]
                answer_length = answer["e"] - answer["s"]
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                orig_answer_text = story_text[answer["s"]:answer["e"]]

                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible)
            examples.append(example)

    return examples
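A toy illustration (invented consensus dicts, not real NewsQA records) of how this reader derives answerability from the "consensus" field:

def _newsqa_is_impossible(consensus):
    # a question is unanswerable if annotators flagged it or gave no span
    return "noAnswer" in consensus or "badQuestion" in consensus

assert _newsqa_is_impossible({"noAnswer": True}) is True
assert _newsqa_is_impossible({"s": 10, "e": 25}) is False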
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) > 1) and (not is_impossible):
                        # For compatibility with this model implementation, if more than one
                        # answer exists, we choose the first one as the gold answer.
                        qa["answers"] = [qa["answers"][0]]
                        # raise ValueError(
                        #   "For training, each question should have exactly 1 answer.")
                    elif (len(qa["answers"]) == 0) and (not is_impossible):
                        # In non-SQuAD datasets it may well happen that no gold answer was
                        # found for an example. In these cases we simply discard the example during training.
                        continue

                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)

                        if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                            # In some datasets the answer span runs past the end of the context; skip this edge case.
                            continue

                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]).lower()
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text)).lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            actual_text_1 = " ".join(doc_tokens[(start_position-1):end_position]).lower()
                            if actual_text_1.find(cleaned_answer_text) == -1:
                                logger.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                                continue
                            else:
                                start_position = start_position - 1
                                end_position = end_position - 1
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
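Self-contained sketch of the exact-recovery check used above (whitespace_tokenize is approximated here by str.split() on the already-cleaned answer text); examples whose answer cannot be found in the recovered word span are skipped.

def _answer_is_recoverable(doc_tokens, start_position, end_position, orig_answer_text):
    actual_text = " ".join(doc_tokens[start_position:end_position + 1]).lower()
    cleaned_answer_text = " ".join(orig_answer_text.split()).lower()
    return actual_text.find(cleaned_answer_text) != -1

# "1492," contains the answer "1492", so the example is kept; "in" does not, so it is skipped
assert _answer_is_recoverable(["in", "1492,", "Columbus"], 1, 1, "1492")
assert not _answer_is_recoverable(["in", "1492,", "Columbus"], 0, 0, "1492")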
Example #16
def create_examples(
    examples: list,
    source: str,
    is_training: bool = True,
    multi_qa_type_class: bool = False,
):
    """
    Args:
        examples(list): list of examples 
        is_training (bool): whether we want to create examples for training or eval mode
    Return:
        list of examples (each example is an instance)
    """
    sources = ['SQuAD', 'SubjQA']
    if source not in sources:
        raise ValueError('Data source must be one of {}'.format(sources))

    if not isinstance(examples, list):
        raise TypeError("Input should be a list of examples.")

    def is_whitespace(char: str):
        if char == " " or char == "\t" or char == "\r" or char == "\n" or ord(
                char) == 0x202F:
            return True
        return False

    def preproc_context(context: str):
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in context:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
        return doc_tokens, char_to_word_offset

    example_instances = []

    for example in examples:

        # TODO: figure out whether we should strip off "ANSWERNOTFOUND" from reviews in SubjQA;
        #       if not, then start and end positions should point to the second-to-last index (i.e., sequence[-2]) instead of 0 (i.e., [CLS]),
        #       since "ANSWERNOTFOUND" is the last token in each review text

        # NB: str.rstrip() strips a set of characters, not a suffix; remove the
        # trailing "ANSWERNOTFOUND" marker explicitly instead.
        context = example["context"] if source == 'SQuAD' else example[
            "review"].rsplit('ANSWERNOTFOUND', 1)[0].rstrip()
        doc_tokens, char_to_word_offset = preproc_context(context)

        if source == 'SQuAD':

            for qa in example["qas"]:

                qas_id = qa["id"]
                q_text = qa["question"]
                dataset = 'SQuAD'
                start_position = None
                end_position = None
                orig_answer_text = qa['answers'][0]['text'] if len(
                    qa['answers']) == 1 else ''
                is_impossible = qa['is_impossible']
                q_sbj = 2 if multi_qa_type_class else 0
                a_sbj = 2 if multi_qa_type_class else 0
                domain = 'wikipedia'

                # we don't need start and end positions in eval mode
                if is_training:
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )

                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]

                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.

                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))

                        if actual_text.find(cleaned_answer_text) == -1:
                            # skip example, if answer cannot be recovered from document
                            continue

                    # elif question is NOT answerable, then answer is the empty string and start and end positions are 0
                    else:
                        #uncomment line below to skip unanswerable questions
                        #continue
                        start_position = 0
                        end_position = 0
                        orig_answer_text = ""

        elif source == 'SubjQA':

            qas_id = example['qa_id']
            q_text = example['question']
            dataset = 'SubjQA'
            start_position = None
            end_position = None
            is_impossible = example['is_impossible']
            q_sbj = example['question_subj']
            a_sbj = example['ans_subj']
            domain = example['domain']

            assert len(
                example['answer']
            ) == 3, "Each answer must consist of an answer text, a start and an end index of answer span"

            if not is_impossible:
                orig_answer_text = example['answer']['answer_text']
                answer_offset = example['answer']['answer_start']
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                try:
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                # sometimes orig. answer text has more white spaces between tokens than the same char. sequence in review text,
                # thus we will get an IndexError (i.e., answer_length is too long)
                except IndexError:
                    orig_answer_text = context[
                        answer_offset:example['answer']['answer_end']]
                    answer_length = len(orig_answer_text)
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]

                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))

                if actual_text.find(cleaned_answer_text) == -1:
                    # skip example, if answer cannot be recovered from document
                    continue

            # elif question is NOT answerable, then answer is the empty string and start and end positions are 0
            else:
                #uncomment line below to skip unanswerable questions (for now)
                #continue
                start_position = 0
                end_position = 0
                orig_answer_text = ""

        example_instance = InputExample(
            qas_id=qas_id,
            q_text=q_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible,
            q_sbj=q_sbj,
            a_sbj=a_sbj,
            domain=domain,
            dataset=dataset,
        )

        example_instances.append(example_instance)

    return example_instances
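Hedged usage sketch for create_examples: the file path and the flattening of SQuAD JSON into paragraph dicts are assumptions for illustration, and the InputExample attribute names are assumed to mirror the constructor arguments above.

import json

with open("train-v2.0.json", "r", encoding="utf-8") as f:  # hypothetical path
    squad_paragraphs = [p for entry in json.load(f)["data"]
                        for p in entry["paragraphs"]]

train_instances = create_examples(squad_paragraphs, source="SQuAD", is_training=True)
print(len(train_instances), train_instances[0].qas_id)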
Example #17
def read_quac_examples(input_file, is_training=True, use_history=False, n_history=-1):
    """
    read QuAC data into a list of QA examples
    """
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    #yesno_symbols = set()
    #followup_symbols = set()
    for entry in input_data:
        para_obj = entry['paragraphs'][0]
        paragraph_id = para_obj['id']
        # process context paragraph
        paragraph_text = para_obj['context']
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            # each char is mapped to word position
            char_to_word_offset.append(len(doc_tokens) - 1)

        # process questions
        question_history_texts = []
        for qa in para_obj['qas']:
            cur_question_text = qa['question']
            question_history_texts.append(cur_question_text)
            example_id = qa['id']
            # word position
            start_position = None
            end_position = None
            yes_no_flag = None
            yes_no_ans = None
            followup = None
            orig_answer_text = None
            if is_training:
                answer = qa['answers'][0]
                orig_answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length >= len(char_to_word_offset):
                    end_position = char_to_word_offset[-1]
                else:
                    end_position = char_to_word_offset[answer_offset + answer_length]
                actual_text = " ".join(doc_tokens[start_position:(end_position+1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                    continue
                #logger.info("yesno symbol: {}, followup symbol: {}".format(qa['yesno'], qa['followup']))
                yes_no_flag = int(qa['yesno'] in ['y','n'])
                yes_no_ans = int(qa['yesno'] == 'y')
                #yes_no_flag = yesno_vocab.index(qa['yesno'])
                #yesno_symbols.add(qa['yesno'])
                followup = followup_vocab.index(qa['followup'])
                #followup_symbols.add(qa['followup'])
            questions =  []
            if use_history:
                if n_history == -1 or len(question_history_texts) <= n_history:
                    questions = question_history_texts[:]
                else:
                    questions = question_history_texts[-1*n_history:]
            else:
                questions = [question_history_texts[-1]]
            example = QuACExample(
                example_id=example_id,
                questions=questions,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                yes_no_flag=yes_no_flag,
                yes_no_ans=yes_no_ans,
                followup=followup)
            examples.append(example)
        
    #logger.info("yesno symbols: {}, followup symbols: {}".format(yesno_symbols, followup_symbols))
    return examples
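Minimal sketch (toy question strings) of the history-window selection above: the full history is kept when n_history == -1, otherwise only the last n_history turns, and only the current question when use_history is off.

def _select_questions(question_history_texts, use_history, n_history):
    if not use_history:
        return [question_history_texts[-1]]
    if n_history == -1 or len(question_history_texts) <= n_history:
        return question_history_texts[:]
    return question_history_texts[-n_history:]

turns = ["Who was he?", "Where was he born?", "When?"]
assert _select_questions(turns, use_history=False, n_history=2) == ["When?"]
assert _select_questions(turns, use_history=True, n_history=2) == ["Where was he born?", "When?"]
assert _select_questions(turns, use_history=True, n_history=-1) == turns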
Example #18
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                # Subsample the training data for quick experiments: keep roughly
                # 1 in 10 questions, capped at 50 examples in total.
                if is_training and len(examples) >= 50:
                    break
                if is_training and random.randint(1, 10) != 5:
                    continue
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) < 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have more than(including) 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
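Hedged usage sketch; the file path is an assumption. Because this variant deliberately subsamples training questions (at most 50, roughly 1 in 10 kept), it is only suitable for quick smoke tests.

train_examples = read_squad_examples("dev-v2.0.json",  # hypothetical path
                                     is_training=True,
                                     version_2_with_negative=True)
print(f"kept {len(train_examples)} examples")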
Example #19
def read_quac_examples(input_file, is_training):
    """Read a QuAC json file into a list of CQAExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    # if FLAGS.load_small_portion:
    #     input_data = input_data[:10]
    #     print('input_data:', input_data)
    #     tf.logging.warning('<<<<<<<<<< load_small_portion is on! >>>>>>>>>>')
    for entry in input_data:
        # An additional "CANNOTANSWER" has been added in QuAC data, so no need to append one.
        entry = entry['paragraphs'][0]
        paragraph_text = entry["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        ############################################################
        # convert the conversational QAs to SQuAD format, with history
        ############################################################

        questions = [(item['question'], item['id'])
                     for item in entry['qas']]  # [(question, question_id), ()]
        answers = [(item['orig_answer']['text'],
                    item['orig_answer']['answer_start'])
                   for item in entry['qas']]
        followups = [item['followup'] for item in entry['qas']]
        yesnos = [item['yesno'] for item in entry['qas']]

        qas = []
        for i, (question, answer, followup,
                yesno) in enumerate(zip(questions, answers, followups,
                                        yesnos)):
            metadata = {
                'turn': i + 1,
                'history_turns': [],
                'tok_history_answer_markers': [],
                'followup': followup,
                'yesno': yesno,
                'history_turns_text': []
            }
            # if FLAGS.use_RL:
            #     start_index = 0
            # else:
            #     start_index = 0 if i - int(FLAGS.history) < 0 else i - int(FLAGS.history)

            end_index = i
            question_with_histories = ''

            history_answer_marker = None
            # originally gated by FLAGS.use_history_answer_marker; always enabled here
            if True:
                start_index = 0  # read all histories regardless of RL; appropriate selections are made afterwards
                history_answer_marker = []
                for history_turn, (each_answer, each_question) in enumerate(
                        zip(answers[start_index:end_index],
                            questions[start_index:end_index])):

                    # [history_answer_start, history_answer_end, history_answer_text]
                    each_marker = [
                        each_answer[1], each_answer[1] + len(each_answer[0]),
                        each_answer[0]
                    ]
                    history_answer_marker.append(each_marker)
                    metadata['history_turns'].append(history_turn +
                                                     start_index + 1)
                    metadata['history_turns_text'].append(
                        (each_question[0],
                         each_answer[0]))  #[(q1, a1), (q2, a2), ...]
            else:
                # prepend historical questions and answers
                start_index = max(end_index - 6, 0)

                for each_answer in answers[start_index:end_index]:
                    question_with_histories += each_answer[0] + ' '

            # add the current question
            question_with_histories += question[0]
            qas.append({
                'id':
                question[1],
                'question':
                question_with_histories,
                'answers': [{
                    'answer_start': answer[1],
                    'text': answer[0]
                }],
                'history_answer_marker':
                history_answer_marker,
                'metadata':
                metadata
            })

        for qa in qas:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None

            # if is_training:
            # we read in the ground-truth answer both during training and prediction, because we need to compute accuracy and F1 at prediction time.
            if len(qa["answers"]) != 1:
                raise ValueError(
                    "For training, each question should have exactly 1 answer."
                )
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length -
                                               1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position +
                                                              1)])

            cleaned_answer_text = " ".join(
                tokenization_bert.whitespace_tokenize(orig_answer_text))

            if is_training and actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'",
                               actual_text, cleaned_answer_text)
                continue

            # we construct a tok_history_answer_marker to store the aggregated history answer markers for a question.
            # we also construct each_tok_history_answer_marker to store a single history answer marker.
            tok_history_answer_marker = [0] * len(doc_tokens)

            for marker_index, marker in enumerate(qa['history_answer_marker']):
                each_tok_history_answer_marker = [0] * len(doc_tokens)
                history_orig_answer_text = marker[2]
                history_answer_offset = marker[0]
                history_answer_length = len(history_orig_answer_text)
                history_start_position = char_to_word_offset[
                    history_answer_offset]
                history_end_position = char_to_word_offset[
                    history_answer_offset + history_answer_length - 1]
                history_actual_text = " ".join(
                    doc_tokens[history_start_position:(history_end_position +
                                                       1)])
                history_cleaned_answer_text = " ".join(
                    tokenization_bert.whitespace_tokenize(
                        history_orig_answer_text))
                if history_actual_text.find(history_cleaned_answer_text) != -1:
                    tok_history_answer_marker = tok_history_answer_marker[: history_start_position] + \
                                        [1] * (history_end_position - history_start_position + 1) + \
                                        tok_history_answer_marker[history_end_position + 1 :]
                    each_tok_history_answer_marker = each_tok_history_answer_marker[: history_start_position] + \
                                        [1] * (history_end_position - history_start_position + 1) + \
                                        each_tok_history_answer_marker[history_end_position + 1 :]
                    assert len(tok_history_answer_marker) == len(doc_tokens)
                    assert len(each_tok_history_answer_marker) == len(
                        doc_tokens)
                    qa['metadata']['tok_history_answer_markers'].append(
                        each_tok_history_answer_marker)

            example = CQAExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                history_answer_marker=tok_history_answer_marker,
                metadata=qa['metadata'])
            examples.append(example)

    return examples
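Toy sketch (invented sizes and span) of the history-answer marker constructed above: a 0/1 vector over doc_tokens with 1s on the word positions covered by a previous turn's answer.

def _mark_history_answer(num_doc_tokens, history_start_position, history_end_position):
    marker = [0] * num_doc_tokens
    span_len = history_end_position - history_start_position + 1
    marker[history_start_position:history_end_position + 1] = [1] * span_len
    return marker

assert _mark_history_answer(6, 2, 4) == [0, 0, 1, 1, 1, 0]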
Example #20
def read_mlqa_examples(input_file, is_training, version_2_with_negative, input_lang):
    """Read a MLQA json file into a list of MlqaExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True

            if input_lang == "zh":
                try:
                    if "jieba" not in sys.modules:
                        import jieba
                    else:
                        jieba = sys.modules["jieba"]
                except (AttributeError, ImportError):
                    logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
                    logger.error("1. pip install jieba")
                    raise
                paragraph_text = " ".join(jieba.cut(paragraph_text))

            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        # is_impossible = qa["is_impossible"]
                        raise ValueError("The MLQA dataset does not contain unanswerable questions.")
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = MlqaExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
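Sketch of the zh-specific preprocessing above: Chinese contexts are pre-segmented with jieba so that the whitespace tokenizer sees word boundaries. Requires the jieba package; the example string and its segmentation are illustrative only.

import jieba

paragraph_text = "巴黎是法国的首都"
paragraph_text = " ".join(jieba.cut(paragraph_text))
print(paragraph_text.split())  # e.g. ['巴黎', '是', '法国', '的', '首都']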
Example #21
def read_record_examples(input_file,
                         is_training,
                         version_2_with_negative=False):
    """Read a ReCoRD json file into a list of ReCoRDExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        # white space tokenization
        paragraph_text = entry["passage"]["text"].replace('\xa0', ' ')
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # load entities in passage
        passage_entities = []
        for entity in entry['passage']['entities']:
            entity_start_offset = entity['start']
            entity_end_offset = entity['end']
            # skip mislabeled entities in the ReCoRD dataset (end offset before start offset)
            if entity_end_offset < entity_start_offset:
                continue
            entity_text = paragraph_text[
                entity_start_offset:entity_end_offset + 1]
            passage_entities.append({
                'orig_text':
                entity_text,
                'start_position':
                char_to_word_offset[entity_start_offset],
                'end_position':
                char_to_word_offset[entity_end_offset]
            })

        for qa in entry["qas"]:
            qas_id = qa["id"]
            question_text = qa["query"].replace('\xa0', ' ')
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if is_training:
                if version_2_with_negative:
                    is_impossible = qa["is_impossible"]
                # if (len(qa["answers"]) != 1) and (not is_impossible):
                #     raise ValueError(
                #         "For training, each question should have exactly 1 answer."
                #     )
                if not is_impossible:
                    # just choose the first one?
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.info("Could not find answer: '%s' vs. '%s'",
                                    actual_text, cleaned_answer_text)
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

            example = ReCoRDExample(qas_id=qas_id,
                                    question_text=question_text,
                                    doc_tokens=doc_tokens,
                                    passage_entities=passage_entities,
                                    orig_answer_text=orig_answer_text,
                                    start_position=start_position,
                                    end_position=end_position,
                                    is_impossible=is_impossible)
            examples.append(example)

    return examples
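Toy sketch (invented passage and offsets) of the entity extraction above: ReCoRD entity offsets are inclusive on both ends, hence the end + 1 slice.

paragraph_text = "Barack Obama visited Berlin."
entity = {"start": 0, "end": 11}  # inclusive character offsets
entity_text = paragraph_text[entity["start"]:entity["end"] + 1]
assert entity_text == "Barack Obama"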
Example #22
def bionumqa_convert_example_to_features(example, max_seq_length, doc_stride,
                                         max_query_length, is_training):
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(
            example.doc_tokens[start_position:(end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(
            example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text,
                           cleaned_answer_text)
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.answer_text)

    spans = []

    truncated_query = tokenizer.encode(example.question_text,
                                       add_special_tokens=False,
                                       max_length=max_query_length)
    sequence_added_tokens = (
        tokenizer.max_len - tokenizer.max_len_single_sentence +
        1 if "roberta" in str(type(tokenizer))
        or "camembert" in str(type(tokenizer)) else tokenizer.max_len -
        tokenizer.max_len_single_sentence)
    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

    all_doc_nums = example.context_nums
    question_nums = example.question_nums
    span_doc_tokens = all_doc_tokens
    doc_num_indices = [
        i for i in range(len(all_doc_tokens)) if all_doc_tokens[i] == "[NUM]"
    ]

    while len(spans) * doc_stride < len(all_doc_tokens):

        encoded_dict = tokenizer.encode_plus(
            truncated_query
            if tokenizer.padding_side == "right" else span_doc_tokens,
            span_doc_tokens
            if tokenizer.padding_side == "right" else truncated_query,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            pad_to_max_length=True,
            stride=max_seq_length - doc_stride - len(truncated_query) -
            sequence_pair_added_tokens,
            truncation_strategy="only_second"
            if tokenizer.padding_side == "right" else "only_first",
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict[
                    "input_ids"][:encoded_dict["input_ids"].
                                 index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1 -
                    encoded_dict["input_ids"][::-1].index(
                        tokenizer.pad_token_id))
                non_padded_ids = encoded_dict["input_ids"][
                    last_padding_id_position + 1:]

        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(
                truncated_query
            ) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) *
                                                         doc_stride + i]

        doc_num_start = np.digitize(len(spans) * doc_stride, doc_num_indices)
        doc_num_end = np.digitize(
            len(spans) * doc_stride + paragraph_len, doc_num_indices)
        doc_nums = all_doc_nums[doc_num_start:doc_num_end]
        nums = question_nums + doc_nums

        number_mask = [0.0] * max_seq_length
        number_indice = [i for i in range(len(tokens)) if tokens[i] == '[NUM]']
        for index, num in zip(number_indice, nums):
            number_mask[index] = num

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(
            truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len
        encoded_dict["number"] = number_mask

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(
                spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (j if tokenizer.padding_side == "left" else
                     spans[doc_span_index]
                     ["truncated_query_with_special_tokens_length"] + j)
            spans[doc_span_index]["token_is_max_context"][
                index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
        # The original TF implementation also keeps the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens:] = 0
        else:
            p_mask[-len(span["tokens"]):-(len(truncated_query) +
                                          sequence_added_tokens)] = 0

        pad_token_indices = np.where(
            np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(
                span["input_ids"], already_has_special_tokens=True)).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start
                    and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            BioNumQAFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # unique_id and example_index cannot be set here; they are assigned later, after multiprocessing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
                number=span['number']))
    return features
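Self-contained sketch (toy token list) of the np.digitize bookkeeping above, which maps a doc-stride window [start, start + paragraph_len) onto the slice of [NUM] placeholder tokens it covers.

import numpy as np

all_doc_tokens = ["a", "[NUM]", "b", "c", "[NUM]", "d", "[NUM]", "e"]
doc_num_indices = [i for i, t in enumerate(all_doc_tokens) if t == "[NUM]"]  # [1, 4, 6]

span_start, paragraph_len = 2, 5  # this window covers tokens 2..6
doc_num_start = np.digitize(span_start, doc_num_indices)                 # -> 1
doc_num_end = np.digitize(span_start + paragraph_len, doc_num_indices)   # -> 3
assert doc_num_indices[doc_num_start:doc_num_end] == [4, 6]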
Example #23
def squad_convert_example_to_features(example, max_seq_length, doc_stride,
                                      max_query_length, is_training):
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        if example.question_type == 'factoid':
            start_position = example.start_position
            end_position = example.end_position

            # If the answer cannot be found in the text, then skip this example.
            actual_text = " ".join(
                example.doc_tokens[start_position:(end_position + 1)])
        else:
            actual_text = "".join([
                example.doc_sent[e] for e in example.pointing_answer
            ]).strip()
        cleaned_answer_text = " ".join(whitespace_tokenize(
            example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text,
                           cleaned_answer_text)
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []

    all_sent_positions = []
    all_tok_to_sep_idx = {}
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        if token == '|':
            all_sent_positions.append(len(all_doc_tokens))
            sub_tokens = ['[SEP]']

        for sub_token in sub_tokens:
            all_tok_to_sep_idx[len(
                all_doc_tokens)] = len(all_sent_positions) - 1
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        if example.question_type == "narrative":
            sent_start_position = example.pointing_answer[0]
            sent_end_position = example.pointing_answer[-1]
            tok_start_position = 0
            tok_end_position = 0
        else:
            # center = int((example.char_start_position*2 + len(example.answer_text))/2)
            # start_for_sent = center-50 if center >= 50 else 0
            # end_for_sent = center + 50 if center +50 < len(example.char_to_sent_offset) else len(example.char_to_sent_offset) -1
            start_for_sent = example.char_start_position
            end_for_sent = example.char_start_position + len(
                example.answer_text) - 1
            sent_start_position = example.char_to_sent_offset[start_for_sent]
            sent_end_position = example.char_to_sent_offset[end_for_sent]
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1

            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, example.answer_text)

    spans = []

    truncated_query = tokenizer.encode(example.question_text,
                                       add_special_tokens=False,
                                       max_length=max_query_length)
    sequence_added_tokens = (
        tokenizer.max_len - tokenizer.max_len_single_sentence +
        1 if "roberta" in str(type(tokenizer))
        or "camembert" in str(type(tokenizer)) else tokenizer.max_len -
        tokenizer.max_len_single_sentence)
    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):

        encoded_dict = tokenizer.encode_plus(
            truncated_query
            if tokenizer.padding_side == "right" else span_doc_tokens,
            span_doc_tokens
            if tokenizer.padding_side == "right" else truncated_query,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            pad_to_max_length=True,
            stride=max_seq_length - doc_stride - len(truncated_query) -
            sequence_pair_added_tokens,
            truncation_strategy="only_second"
            if tokenizer.padding_side == "right" else "only_first",
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict[
                    "input_ids"][:encoded_dict["input_ids"].
                                 index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1 -
                    encoded_dict["input_ids"][::-1].index(
                        tokenizer.pad_token_id))
                non_padded_ids = encoded_dict["input_ids"][
                    last_padding_id_position + 1:]

        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        token_to_orig_sent_map = {}
        token_to_cur_sent_map = {}
        cur_sent_to_orig_sent_map = {}
        for i in range(paragraph_len):
            index = len(
                truncated_query
            ) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) *
                                                         doc_stride + i]
            token_to_orig_sent_map[index] = all_tok_to_sep_idx[len(spans) *
                                                               doc_stride + i]

            token_to_cur_sent_map[index] = all_tok_to_sep_idx[
                len(spans) * doc_stride +
                i] - all_tok_to_sep_idx[len(spans) * doc_stride] + 1
        cur_sent_to_orig_sent_map = {
            token_to_cur_sent_map[e]: token_to_orig_sent_map[e]
            for e in token_to_cur_sent_map.keys() if token_to_cur_sent_map[e]
            not in cur_sent_to_orig_sent_map.keys()
        }
        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["question_mask"] = [
            1 - e for e in encoded_dict["token_type_ids"]
        ]
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["token_to_orig_sent_map"] = token_to_orig_sent_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(
            truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len
        encoded_dict["question_type"] = question_type2idx[
            example.question_type] if example.question_type else None
        encoded_dict["answer_type"] = answer_type2idx[
            example.answer_type] if example.answer_type else None
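        # sentence_mask tags each document token with its 1-based sentence index inside
        # this span; the query (and its special tokens) gets 0, and the mask is padded
        # with zeros to length 512.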
        encoded_dict["sentence_mask"] = [0] * (len(truncated_query) + 2) + [
            token_to_cur_sent_map[k] for k in token_to_cur_sent_map.keys()
        ]
        encoded_dict["sentence_mask"] += [0] * (
            512 - len(encoded_dict["sentence_mask"]))
        encoded_dict["cur_sent_to_orig_sent_map"] = cur_sent_to_orig_sent_map
        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]
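
    # For tokens that appear in several overlapping spans, record whether this span
    # gives the token its largest amount of surrounding context.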

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(
                spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (j if tokenizer.padding_side == "left" else
                     spans[doc_span_index]
                     ["truncated_query_with_special_tokens_length"] + j)
            spans[doc_span_index]["token_is_max_context"][
                index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
        # The original TF implementation also keeps the classification token (set to 0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens:] = 0
        else:
            p_mask[-len(span["tokens"]):-(len(truncated_query) +
                                          sequence_added_tokens)] = 0

        pad_token_indices = np.where(
            span["input_ids"] == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(
                span["input_ids"], already_has_special_tokens=True)).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the cls index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        t_start_position = 0
        t_end_position = 0
        s_start_position = 0
        s_end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            if span["question_type"] == 1:
                doc_start = span["start"]
                doc_end = span["start"] + span["length"] - 1
                out_of_span = False

                if not (tok_start_position >= doc_start
                        and tok_end_position <= doc_end):
                    out_of_span = True

                if out_of_span:
                    t_start_position = cls_index
                    t_end_position = cls_index
                    span_is_impossible = True
                else:
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = len(
                            truncated_query) + sequence_added_tokens

                    t_start_position = tok_start_position - doc_start + doc_offset
                    t_end_position = tok_end_position - doc_start + doc_offset

                    sent_doc_start = all_tok_to_sep_idx[span["start"]]
                    sent_doc_end = all_tok_to_sep_idx[span["start"] +
                                                      span["length"] - 1]

                    sent_start_position = max(sent_start_position, sent_doc_start)
                    sent_end_position = min(sent_end_position, sent_doc_end)

                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = 1
                    s_start_position = sent_start_position - sent_doc_start + doc_offset
                    s_end_position = sent_end_position - sent_doc_start + doc_offset
            else:
                sent_doc_start = all_tok_to_sep_idx[span["start"]]
                sent_doc_end = all_tok_to_sep_idx[span["start"] +
                                                  span["length"] - 1]
                out_of_span = False

                if not (sent_start_position >= sent_doc_start
                        and sent_end_position <= sent_doc_end):
                    out_of_span = True

                if out_of_span:
                    span_is_impossible = True
                else:
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = 1
                    s_start_position = sent_start_position - sent_doc_start + doc_offset
                    s_end_position = sent_end_position - sent_doc_start + doc_offset
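        # During training, randomly drop roughly half of the impossible (unanswerable)
        # spans to limit the number of negative examples.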
        if span_is_impossible and random.random() > 0.5 and is_training:
            continue
        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                span["question_mask"],
                span["sentence_mask"],
                span["cur_sent_to_orig_sent_map"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # unique_id and example_index are set later, after multiprocessing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                token_to_orig_sent_map=span["token_to_orig_sent_map"],
                sent_start_position=s_start_position,
                sent_end_position=s_end_position,
                tok_start_position=t_start_position,
                tok_end_position=t_end_position,
                question_type=span["question_type"],
                answer_type=span["answer_type"],
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            ))
    return features
def Get_Date_From_DataSet(input_file):
    """Read a SQuAD-style json file into a list of SquadExample."""
    examples = []
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    for entry in input_data:  # each entry is a dict with a title and its paragraphs
        for paragraph in entry["paragraphs"]:  # iterate over the paragraph dicts
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:  # collapse runs of whitespace
                if is_whitespace(c):  # whitespace: the next non-space char starts a new token
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:  # first char after whitespace opens a new token
                        doc_tokens.append(c)
                    else:  # otherwise append to the current token
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)  # word index of every character

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = qa["is_impossible"]
                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]  # char offset where the answer starts
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]  # word index of the answer start
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]  # word index of the answer end
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])  # answer reconstructed from doc_tokens
                    cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))  # answer as given in the dataset
                    # Compare the reconstructed span with the dataset answer; if the span
                    # does not contain the given answer, warn and skip this question.
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
def read_squad_examples_jb(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""

    if isinstance(input_file, str):
        with open(input_file, "r", encoding="utf-8") as reader:
            input_data = json.load(reader)["data"]
    else:
        input_data = input_file

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if _is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
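                # retriever_score is optional in the input json; fall back to 0 when it is missing.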
                try:
                    retriever_score = qa["retriever_score"]
                except KeyError:
                    retriever_score = 0
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[
                            answer_offset + answer_length - 1
                        ]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position : (end_position + 1)]
                        )
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text)
                        )
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text,
                                cleaned_answer_text,
                            )
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                examples.append(
                    SquadExampleJB(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible,
                        paragraph=paragraph_text,
                        title=entry["title"],
                        retriever_score=retriever_score,
                    )
                )
    return examples
    def convert_to_example(self,
                           question_text,
                           qas_id=None,
                           paragraph_text=None,
                           char_to_word_offset=None,
                           doc_tokens=None,
                           is_impossible=False,
                           answer=None,
                           answer_offset=None):
        """
            - qas_id: int
            - question_text: string
            - paragraph_text: string. If char_to_word_offset and doc_tokens already exist, this can be left as None
            - char_to_word_offset: list, intermediate result from preprocessing the paragraph text
            - doc_tokens: list, intermediate result from preprocessing the paragraph text
            - is_impossible: bool
            - answer: string
            - answer_offset: int, character offset of the answer within the paragraph
        """
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F:
                return True
            return False

        if char_to_word_offset is None or len(char_to_word_offset) < 1:
            if char_to_word_offset is None:
                doc_tokens = []
                char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

        start_position = None

        end_position = None
        orig_answer_text = None

        if self.is_training:
            if not is_impossible:
                orig_answer_text = answer
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    print("Could not find answer: '%s' vs. '%s'" %
                          (actual_text, cleaned_answer_text))
                    return None
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""
        else:
            is_impossible = False

        example = SquadExample(qas_id=qas_id,
                               question_text=question_text,
                               doc_tokens=doc_tokens,
                               orig_answer_text=orig_answer_text,
                               start_position=start_position,
                               end_position=end_position,
                               is_impossible=is_impossible)
        return example
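
# A minimal usage sketch for convert_to_example (hypothetical reader instance and
# values; assumes the enclosing class was constructed with is_training=True):
#
#   example = reader.convert_to_example(
#       question_text="Where is the headquarters located?",
#       qas_id="q-0001",
#       paragraph_text="The headquarters is located in Berlin.",
#       answer="Berlin",
#       answer_offset=31,
#   )
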
def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training):
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(example.doc_tokens[start_position: (end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            # logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
            return []
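
    # Map each wordpiece sub-token back to the original whitespace token it came from
    # so answer positions can be translated between the two tokenizations.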

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1

        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
        )

    spans = []

    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
    # print("truncated query: {}".format(truncated_query))
    sequence_added_tokens = (
        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
        if "roberta" in str(type(tokenizer))
        else tokenizer.max_len - tokenizer.max_len_single_sentence
    )
    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    # print("span doc tokens: {}".format(span_doc_tokens))
    while len(spans) * doc_stride < len(all_doc_tokens):

        encoded_dict = tokenizer.encode_plus(
            truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
            span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            pad_to_max_length=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (
                j
                if tokenizer.padding_side == "left"
                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
            )
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
        # The original TF implementation also keeps the classification token (set to 0) (not sure why...)
        p_mask = np.array(span["token_type_ids"])

        p_mask = np.minimum(p_mask, 1)

        if tokenizer.padding_side == "right":
            # Invert the mask so that document tokens (token_type_id 1) become 0
            p_mask = 1 - p_mask

        p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1

        # Set the CLS index to '0'
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,
                # unique_id and example_index cannot be set here; they are assigned later after multiprocessing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
            )
        )
    return features
def read_squad_examples(input_file, is_training, do_lower_case=False):
    """Read a SQuAD json file into a list of SquadExample."""

    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case)
            doc_tokens = []  # list of words: [word0, word1, ...]
            char_to_word_offset = []  # word index for each character
            temp_word = ""
            for c in paragraph_text:  # split on whitespace, similar to split()
                if _is_whitespace(c):
                    char_to_word_offset.append(len(doc_tokens) - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(len(doc_tokens))
                if do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[len(doc_tokens)]:
                    doc_tokens.append(temp_word)
                    temp_word = ""

            assert len(doc_tokens) == len(raw_doc_tokens)
            assert doc_tokens == raw_doc_tokens

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]

                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if orig_answer_text not in paragraph_text:
                        logging.warning("Could not find answer")
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[
                            answer_offset]  # start word index
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            whitespace_tokenize(orig_answer_text))
                        if do_lower_case:
                            cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text,
                                cleaned_answer_text,
                            )
                            continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=False,
                )
                examples.append(example)

    return examples
    def tokenize(self, text):
        return whitespace_tokenize(text)
def fincausal_convert_example_to_features(example: FinCausalExample,
                                          max_seq_length: int,
                                          doc_stride: int,
                                          is_training: bool) -> List[FinCausalFeatures]:
    features = []
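    # During training, make sure both the cause span and the effect span can be
    # recovered from the whitespace-tokenized document; otherwise skip the example.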
    if is_training:
        # Get start and end position
        start_cause_position = example.start_cause_position
        end_cause_position = example.end_cause_position
        start_effect_position = example.start_effect_position
        end_effect_position = example.end_effect_position

        # If the cause cannot be found in the text, then skip this example.
        actual_cause_text = " ".join(example.doc_tokens[start_cause_position: (end_cause_position + 1)])
        cleaned_cause_text = " ".join(whitespace_tokenize(_run_split_on_punc(example.cause_text)))
        if actual_cause_text.find(cleaned_cause_text) == -1:
            logger.warning("Could not find cause: '%s' vs. '%s'", actual_cause_text, cleaned_cause_text)
            return []

        # If the effect cannot be found in the text, then skip this example.
        actual_effect_text = " ".join(example.doc_tokens[start_effect_position: (end_effect_position + 1)])
        cleaned_effect_text = " ".join(whitespace_tokenize(_run_split_on_punc(example.effect_text)))
        if actual_effect_text.find(cleaned_effect_text) == -1:
            logger.warning("Could not find effect: '%s' vs. '%s'", actual_effect_text, cleaned_effect_text)
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training:
        tok_cause_start_position = orig_to_tok_index[example.start_cause_position]
        if example.end_cause_position < len(example.doc_tokens) - 1:
            tok_cause_end_position = orig_to_tok_index[example.end_cause_position + 1] - 1
        else:
            tok_cause_end_position = len(all_doc_tokens) - 1

        (tok_cause_start_position, tok_cause_end_position) = _improve_answer_span(
            all_doc_tokens, tok_cause_start_position, tok_cause_end_position, tokenizer, example.cause_text
        )

        tok_effect_start_position = orig_to_tok_index[example.start_effect_position]
        if example.end_effect_position < len(example.doc_tokens) - 1:
            tok_effect_end_position = orig_to_tok_index[example.end_effect_position + 1] - 1
        else:
            tok_effect_end_position = len(all_doc_tokens) - 1

        (tok_effect_start_position, tok_effect_end_position) = _improve_answer_span(
            all_doc_tokens, tok_effect_start_position, tok_effect_end_position, tokenizer, example.effect_text
        )
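
    # Convert the sentence-boundary offsets from original word indices to sub-token
    # indices; non-positive offsets mean the corresponding boundary is absent.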
    if example.offset_sentence_2 > 0:
        tok_sentence_2_offset = orig_to_tok_index[example.offset_sentence_2 + 1] - 1
    else:
        tok_sentence_2_offset = None
    if example.offset_sentence_3 > 0:
        tok_sentence_3_offset = orig_to_tok_index[example.offset_sentence_3 + 1] - 1
    else:
        tok_sentence_3_offset = None

    spans: List[BatchEncoding] = []

    sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):

        encoded_dict: BatchEncoding = tokenizer.encode_plus(span_doc_tokens,
                                                            max_length=max_seq_length,
                                                            return_overflowing_tokens=True,
                                                            pad_to_max_length=True,
                                                            stride=max_seq_length - doc_stride - sequence_added_tokens - 1,
                                                            truncation_strategy="only_first",
                                                            truncation=True,
                                                            return_token_type_ids=True,
                                                            )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - sequence_added_tokens,
        )
        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict.data["input_ids"][
                                 : encoded_dict.data["input_ids"].index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                        len(encoded_dict.data["input_ids"])
                        - 1
                        - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
                )
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1:]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = sequence_added_tokens + i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if len(encoded_dict.get("overflowing_tokens", [])) == 0:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index].data["paragraph_len"]):
            is_max_context = _check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            spans[doc_span_index].data["token_is_max_context"][j] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span.data["input_ids"].index(tokenizer.cls_token_id)

        p_mask = np.ones(len(span.data["token_type_ids"]))
        p_mask[np.where(np.array(span.data["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
        # Set the CLS index to '0'
        p_mask[cls_index] = 0

        span_is_impossible = False
        cause_start_position = 0
        cause_end_position = 0
        effect_start_position = 0
        effect_end_position = 0
        doc_start = span.data["start"]
        doc_end = span.data["start"] + span.data["length"] - 1
        out_of_span = False
        if tokenizer.padding_side == "left":
            doc_offset = 0
        else:
            doc_offset = sequence_added_tokens
        if tok_sentence_2_offset is not None:
            sentence_2_offset = tok_sentence_2_offset - doc_start + doc_offset
        else:
            sentence_2_offset = None
        if tok_sentence_3_offset is not None:
            sentence_3_offset = tok_sentence_3_offset - doc_start + doc_offset
        else:
            sentence_3_offset = None
        if is_training:
            # For training, if our document chunk does not contain an annotation
            # we throw it out, since there is nothing to predict.
            if not (tok_cause_start_position >= doc_start
                    and tok_cause_end_position <= doc_end
                    and tok_effect_start_position >= doc_start
                    and tok_effect_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                cause_start_position = cls_index
                cause_end_position = cls_index
                effect_start_position = cls_index
                effect_end_position = cls_index
                span_is_impossible = True
            else:
                cause_start_position = tok_cause_start_position - doc_start + doc_offset
                cause_end_position = tok_cause_end_position - doc_start + doc_offset
                effect_start_position = tok_effect_start_position - doc_start + doc_offset
                effect_end_position = tok_effect_end_position - doc_start + doc_offset

        features.append(
            FinCausalFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_orig_index=example.example_id,
                example_index=0,
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                cause_start_position=cause_start_position,
                cause_end_position=cause_end_position,
                effect_start_position=effect_start_position,
                effect_end_position=effect_end_position,
                sentence_2_offset=sentence_2_offset,
                sentence_3_offset=sentence_3_offset,
                is_impossible=span_is_impossible,
            )
        )
    return features