Example 1
    def test_squad_with_unwordpieceable_passage(self):

        tokenizer = SpacyTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = (
            "There were four major HDTV systems tested by SMPTE in the late 1970s, "
            "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
        )
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = (
            "Broca, being what today would be called a neurosurgeon, "
            "had taken an interest in the pathology of speech. He wanted "
            "to localize the difference between man and the other animals, "
            "which appeared to reside in speech. He discovered the speech "
            "center of the human brain, today called Broca's area after him. "
            "His interest was mainly in Biological anthropology, but a German "
            "philosopher specializing in psychology, Theodor Waitz, took up the "
            "theme of general and social anthropology in his six-volume work, "
            "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
            """soon translated as "The Anthropology of Primitive Peoples". """
            "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import (
            make_reading_comprehension_instance, )

        instance1 = make_reading_comprehension_instance(
            tokenizer.tokenize(question1),
            tokenizer.tokenize(passage1),
            {"bert": token_indexer},
            passage1,
        )

        instance2 = make_reading_comprehension_instance(
            tokenizer.tokenize(question2),
            tokenizer.tokenize(passage2),
            {"bert": token_indexer},
            passage2,
        )

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example 2
    def test_squad_with_unwordpieceable_passage(self):
        # pylint: disable=line-too-long
        tokenizer = WordTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                    "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = ("Broca, being what today would be called a neurosurgeon, "
                    "had taken an interest in the pathology of speech. He wanted "
                    "to localize the difference between man and the other animals, "
                    "which appeared to reside in speech. He discovered the speech "
                    "center of the human brain, today called Broca's area after him. "
                    "His interest was mainly in Biological anthropology, but a German "
                    "philosopher specializing in psychology, Theodor Waitz, took up the "
                    "theme of general and social anthropology in his six-volume work, "
                    "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                    """soon translated as "The Anthropology of Primitive Peoples". """
                    "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

        instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                        tokenizer.tokenize(passage1),
                                                        {"bert": token_indexer},
                                                        passage1)

        instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                        tokenizer.tokenize(passage2),
                                                        {"bert": token_indexer},
                                                        passage2)

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
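
Both test snippets above omit their module-level imports. A rough sketch of what they assume, following the AllenNLP 0.x-era module layout (exact paths vary by release; Example 1 uses the later SpacyTokenizer name, Example 2 the older WordTokenizer), is:

    # Approximate imports for the two tests above (AllenNLP 0.x layout; not copied from the source files).
    from allennlp.data.dataset import Batch
    from allennlp.data.token_indexers import PretrainedBertIndexer
    from allennlp.data.tokenizers import WordTokenizer  # SpacyTokenizer in later releases
    from allennlp.data.vocabulary import Vocabulary
    from allennlp.modules.token_embedders.bert_token_embedder import BertEmbedder
    from pytorch_pretrained_bert.modeling import BertConfig, BertModel
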
Example 3
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans: List[Tuple[int, int]] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(self._claim_tokenizer.tokenize(question_text),
                                                        passage_tokens,
                                                        self._token_indexers,
                                                        passage_text,
                                                        token_spans,
                                                        answer_texts)
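
The character-to-token conversion performed above (and in several variants below) can be illustrated in isolation. A minimal, self-contained sketch of the same offset bookkeeping, using illustrative values rather than the AllenNLP char_span_to_token_span helper itself:

    # Map a character span onto token indices by walking per-token character offsets.
    passage = "He discovered the speech center"
    tokens = passage.split()
    offsets, pos = [], 0
    for tok in tokens:
        start = passage.index(tok, pos)
        offsets.append((start, start + len(tok)))  # (start_char, end_char) for each token
        pos = start + len(tok)

    char_span = (18, 31)  # characters covering "speech center"
    span_start = next(i for i, (s, e) in enumerate(offsets) if e > char_span[0])
    span_end = next(i for i, (s, e) in enumerate(offsets) if e >= char_span[1])
    print(span_start, span_end)  # -> 3 4
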
Example 4
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans: List[Tuple[int, int]] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                        passage_tokens,
                                                        self._token_indexers,
                                                        passage_text,
                                                        token_spans,
                                                        answer_texts)
Example 5
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        answer: bool = None,
        passage_tokens: List[Token] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Optional[Instance]:

        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        question_tokens = self._tokenizer.tokenize(question_text)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]

        return util.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            answer,
            passage_text,
            additional_metadata,
        )
Example 6
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        char_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        passage_tokens: List[Token] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Optional[Instance]:

        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        question_tokens = self._tokenizer.tokenize(question_text)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]
        char_spans = char_spans or []
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        # The original answer is filtered out
        if char_spans and not token_spans:
            if self.skip_invalid_examples:
                return None
            else:
                token_spans.append(
                    (len(passage_tokens) - 1, len(passage_tokens) - 1))
        return util.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            token_spans,
            answer_texts,
            additional_metadata,
        )
Example 7
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            char_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            passage_tokens: List[Token] = None,
            max_passage_len: int = None,
            max_question_len: int = None,
            drop_invalid: bool = False) -> Optional[Instance]:
        """
        The passage and question are truncated to `max_passage_len` and `max_question_len`.
        Invalid examples are dropped when `drop_invalid` is true.
        """
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        question_tokens = self._tokenizer.tokenize(question_text)
        if max_passage_len is not None:
            passage_tokens = passage_tokens[:max_passage_len]
        if max_question_len is not None:
            question_tokens = question_tokens[:max_question_len]
        char_spans = char_spans or []
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        if not token_spans:
            if drop_invalid:
                return None
            else:
                token_spans.append((0, 0))
        return util.make_reading_comprehension_instance(
            question_tokens, passage_tokens, self._token_indexers,
            passage_text, token_spans, answer_texts)
Example 8
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            char_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []
        fields = {}
        # if not has_answer:
        #     question_tokens = self._tokenizer.tokenize(question_text)
        #     passage_field = TextField(passage_tokens, self._token_indexers)
        #     fields['passage'] = passage_field
        #     fields['question'] = TextField(question_tokens, self._token_indexers)
        #     metadata = {
        #         'original_passage': passage_text,
        #         'token_offsets': None,
        #         'question_tokens': [token.text for token in question_tokens],
        #         'passage_tokens': [token.text for token in passage_tokens]
        #         }
        #     fields['span_start'] = IndexField(-1, passage_field.empty_field())
        #     fields['span_end'] = IndexField(-1, passage_field.empty_field())
        #     return Instance(fields)

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(
            self._tokenizer.tokenize(question_text), passage_tokens,
            self._token_indexers, passage_text, token_spans, answer_texts)
Example 9
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            token_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            question_tokens: List[Token] = None,
            passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not question_tokens:
            question_tokens = self._tokenizer.tokenize(question_text)
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        return util.make_reading_comprehension_instance(
            question_tokens, passage_tokens, self._token_indexers,
            passage_text, token_spans, answer_texts)
Example 10
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         token_spans: List[Tuple[int, int]] = None,
                         answer_texts: List[str] = None,
                         question_tokens: List[Token] = None,
                         passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not question_tokens:
            question_tokens = self._tokenizer.tokenize(question_text)
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        return util.make_reading_comprehension_instance(question_tokens,
                                                        passage_tokens,
                                                        self._token_indexers,
                                                        passage_text,
                                                        token_spans,
                                                        answer_texts)
Example 11
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            question_id: str = None,
            passage_id: str = None,
            answer_annotations: List[Dict] = None,
            passage_tokens: List[Token] = None) -> Union[Instance, None]:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]

        answer_type, answer_texts = None, []
        if answer_annotations:
            # We only use the first annotated answer here; this does not affect training,
            # because the train set has a single annotation per question.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(
                answer_annotations[0])

        # Tokenize the answer text in order to find the matched span based on token
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            tokenized_answer_texts.append(' '.join(token.text
                                                   for token in answer_tokens))

        if self.instance_format == "squad":
            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append(
                        (len(passage_tokens) - 1, len(passage_tokens) - 1))
            return make_reading_comprehension_instance(
                question_tokens,
                passage_tokens,
                self._token_indexers,
                passage_text,
                valid_passage_spans,
                # this `answer_texts` will not be used for evaluation
                answer_texts,
                additional_metadata={
                    "original_passage": passage_text,
                    "original_question": question_text,
                    "passage_id": passage_id,
                    "question_id": question_id,
                    "valid_passage_spans": valid_passage_spans,
                    "answer_annotations": answer_annotations
                })
        elif self.instance_format == "bert":
            question_concat_passage_tokens = question_tokens + [
                Token("[SEP]")
            ] + passage_tokens
            valid_passage_spans = []
            for span in self.find_valid_spans(passage_tokens,
                                              tokenized_answer_texts):
                # This span is for `question + [SEP] + passage`.
                valid_passage_spans.append(
                    (span[0] + len(question_tokens) + 1,
                     span[1] + len(question_tokens) + 1))
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append(
                        (len(question_concat_passage_tokens) - 1,
                         len(question_concat_passage_tokens) - 1))
            answer_info = {
                "answer_texts":
                answer_texts,  # this `answer_texts` will not be used for evaluation
                "answer_passage_spans": valid_passage_spans
            }
            return self.make_bert_drop_instance(question_tokens,
                                                passage_tokens,
                                                question_concat_passage_tokens,
                                                self._token_indexers,
                                                passage_text,
                                                answer_info,
                                                additional_metadata={
                                                    "original_passage":
                                                    passage_text,
                                                    "original_question":
                                                    question_text,
                                                    "passage_id":
                                                    passage_id,
                                                    "question_id":
                                                    question_id,
                                                    "answer_annotations":
                                                    answer_annotations
                                                })
        elif self.instance_format == "drop":
            numbers_in_passage = []
            number_indices = []
            for token_index, token in enumerate(passage_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(token_index)
            # hack to guarantee minimal length of padded number
            numbers_in_passage.append(0)
            number_indices.append(-1)
            numbers_as_tokens = [
                Token(str(number)) for number in numbers_in_passage
            ]

            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            valid_question_spans = \
                self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

            target_numbers = []
            # `answer_texts` is a list of valid answers.
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions = []
            valid_counts = []
            if answer_type in ["number", "date"]:
                valid_signs_for_add_sub_expressions = \
                    self.find_valid_add_sub_expressions(numbers_in_passage, target_numbers)
            if answer_type in ["number"]:
                # Currently we only support counting answers from 0 to 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count,
                                                      target_numbers)

            type_to_answer_map = {
                "passage_span": valid_passage_spans,
                "question_span": valid_question_spans,
                "addition_subtraction": valid_signs_for_add_sub_expressions,
                "counting": valid_counts
            }

            if self.skip_when_all_empty \
                    and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
                return None

            answer_info = {
                "answer_texts":
                answer_texts,  # this `answer_texts` will not be used for evaluation
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "signs_for_add_sub_expressions":
                valid_signs_for_add_sub_expressions,
                "counts": valid_counts
            }

            return self.make_marginal_drop_instance(question_tokens,
                                                    passage_tokens,
                                                    numbers_as_tokens,
                                                    number_indices,
                                                    self._token_indexers,
                                                    passage_text,
                                                    answer_info,
                                                    additional_metadata={
                                                        "original_passage":
                                                        passage_text,
                                                        "original_question":
                                                        question_text,
                                                        "original_numbers":
                                                        numbers_in_passage,
                                                        "passage_id":
                                                        passage_id,
                                                        "question_id":
                                                        question_id,
                                                        "answer_info":
                                                        answer_info,
                                                        "answer_annotations":
                                                        answer_annotations
                                                    })
        else:
            raise ValueError(
                f"Expect the instance format to be \"drop\", \"squad\" or \"bert\", "
                f"but got {self.instance_format}")
Example 12
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            char_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx,
                            token.idx + len(token.text.replace("_", "")))
                           for token in passage_tokens]
        """
        with open("/home/kz918/bpe/eval/bidaf/debug.txt", 'w', encoding='utf-8') as f:
            f.write(question_text)
            f.write('\n')
            f.write(passage_text)
            f.write("\n")
            for x in passage_tokens:
                f.write(x.text)
                f.write(" ")
            f.write('\n')
            for x in answer_texts:
                f.write(x)
                f.write("\n")
            f.write("\n")
            for i, (start, end) in enumerate(passage_offsets):
                f.write(str(i)+": ")
                f.write(passage_text[start:end])
                f.write(" ")
                f.write(str(start)+" "+str(end))
                f.write(" "+passage_tokens[i].text)
                f.write("\n") 
            f.write("\n")
            f.write("\nanswers\n")
        """

        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            """
            with open("/home/kz918/bpe/eval/bidaf/debug.txt", 'a', encoding='utf-8') as f:
                f.write(str([x.text for x in passage_tokens[span_start:span_end+1]]))
                f.write("\n") 
            except:
                with open("/home/kz918/bpe/eval/bidaf/error.txt", 'w', encoding='utf-8') as f:
                    f.write(question_text)
                    f.write('\n')
                    f.write(passage_text)
                    f.write("\n")
                    for x in passage_tokens:
                        f.write(x.text)
                        f.write(" ")
                    f.write('\n')
                    for x in answer_texts:
                        f.write(x)
                        f.write("\n")
                    f.write("\n")
                import pdb; pdb.set_trace()
            """
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        """
        with open("/home/kz918/bpe/eval/bidaf/debug.txt", 'a', encoding='utf-8') as f:
            f.write("\n")
            f.write("\nspans\n")
            for start, end in token_spans:
                f.write(str(start)+" "+str(end)+"\n")
            f.write("\n")
        """
        #import pdb; pdb.set_trace()
        return util.make_reading_comprehension_instance(
            self._tokenizer.tokenize(question_text), passage_tokens,
            self._token_indexers, passage_text, token_spans, answer_texts)
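
Every span-based example above calls make_reading_comprehension_instance with the same positional layout (Example 5 appears to target a modified helper and passes its boolean answer instead of span arguments). A hedged summary of the signature implied by these call sites, with argument names inferred rather than copied from the AllenNLP 0.x source:

    # Signature as implied by the calls above; names and defaults are inferred, not authoritative.
    def make_reading_comprehension_instance(
            question_tokens,            # List[Token]
            passage_tokens,             # List[Token]
            token_indexers,             # Dict[str, TokenIndexer]
            passage_text,               # str, kept so answers can be recovered from metadata
            token_spans=None,           # List[Tuple[int, int]] answer spans over passage_tokens
            answer_texts=None,          # List[str]
            additional_metadata=None):
        ...

The returned Instance carries "question" and "passage" TextFields built with the given token indexers, span fields derived from token_spans when spans are supplied, and a metadata field holding the original passage text and tokenization.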