def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans_sent: List[Tuple[int, int]] = None,
                         sent_labels: List[int] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None,
                         passage_offsets: List[Tuple] = None) -> Instance:

        token_spans_sent: List[Tuple[int, int]] = []

        for char_span_sent_start, char_span_sent_end in char_spans_sent:
            (span_start_sent, span_end_sent), error = util.char_span_to_token_span(passage_offsets,
                                                                                   (char_span_sent_start,
                                                                                    char_span_sent_end))
            token_spans_sent.append((span_start_sent, span_end_sent))

        tokenized_ques = self._tokenizer.tokenize(question_text)
        tokenized_ques = [Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques]

        return make_reading_comprehension_instance(tokenized_ques,
                                                   passage_tokens,
                                                   self._token_indexers,
                                                   passage_text,
                                                   token_spans_sent,
                                                   sent_labels,
                                                   answer_texts,
                                                   passage_offsets)
Example #2
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans: List[Tuple[int, int]] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                        passage_tokens,
                                                        self._token_indexers,
                                                        passage_text,
                                                        token_spans,
                                                        answer_texts)
Example #3
    def text_to_instance(
            self,  # type: ignore
            tokenized_stand_alone_ques: List[Token],
            question_text: str,
            passage_text: str,
            sent_labels: List[int] = None,
            answer_texts: List[str] = None,
            passage_sent_tokens: List[List[Token]] = None,
            evd_possible_chains: List[List[int]] = None,
            ans_sent_idxs: List[int] = None,
            sents_span: List[Tuple[int, int]] = None,
            sents_offset: List[Tuple] = None,
            article_id: str = None) -> Instance:

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans_sent: List[Tuple[int, int]] = []
        for i, char_span_sent in enumerate(sents_span):
            char_sent_start, char_sent_end = char_span_sent[0], char_span_sent[1]
            sent_offset = sents_offset[i]
            total_wordpiece = 0

            try:
                (span_start, span_end), error = util.char_span_to_token_span(
                    sent_offset, (char_sent_start, char_sent_end))
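                # Count word pieces per token and clamp the sentence span so it
                # stays within the word-piece budget (self._word_piece_limit).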
                for j, token in enumerate(passage_sent_tokens[i]):
                    total_wordpiece += len(
                        wordpiece_tokenizer(token.text.lower()))
                    if total_wordpiece >= self._word_piece_limit:
                        break
                if span_end >= j or span_start >= j:
                    span_start = j - 1 if span_start >= j else span_start
                    span_end = j - 1 if span_end >= j else span_end
                    # print(passage_sent_tokens[i])
                    # print("span start and end:", span_start, span_end+1)
                    # print("tokens:", passage_sent_tokens[i][span_start: span_end])
                    # print('overflow:', j)
                    # print('total wordpiece:', total_wordpiece)
                    # input()
            except IndexError:
                print(sent_offset)
                print(char_sent_start, char_sent_end)
                input()
            token_spans_sent.append((span_start, span_end))
            # print(span_start, span_end+1)
            # print(passage_sent_tokens[i][span_start: span_end])
            # input()

        return make_reading_comprehension_instance(tokenized_stand_alone_ques,
                                                   passage_sent_tokens,
                                                   self._token_indexers,
                                                   passage_text,
                                                   sent_labels,
                                                   answer_texts,
                                                   evd_possible_chains,
                                                   ans_sent_idxs,
                                                   token_spans_sent,
                                                   article_id,
                                                   para_limit=self._para_limit)
Example #4
 def test_char_span_to_token_span_handles_hard_cases(self):
     # An earlier version of the code had a hard time when the answer was the last token in the
     # passage.  This tests that case, on the instance that used to fail.
     tokenizer = SpacyTokenizer()
     passage = (
         "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
         +
         'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
         +
         "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
         +
         "Z's girlfriend in the music video for the song, which would further fuel "
         +
         "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
         +
         "married without publicity. As of April 2014, the couple have sold a combined 300 "
         +
         "million records together. The couple are known for their private relationship, "
         +
         "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
         +
         'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
         +
         "she had ever endured. She returned to the studio and wrote music in order to cope "
         +
         "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
         +
         "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
     )
     start = 912
     end = 912 + len("Paris.")
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     token_span = util.char_span_to_token_span(offsets, (start, end))[0]
     assert token_span == (184, 185)
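Note that these token spans are inclusive on both ends, which is why the assertion above is `(184, 185)` (the final tokens "Paris" and "."), and why every reader in this listing slices with `span_end + 1` when pulling the answer tokens back out. A quick illustration with hard-coded values:

# Inclusive spans: both span_start and span_end point at real tokens,
# so recovering the answer requires span_end + 1 in the slice.
tokens = ["traveled", "to", "Paris", "."]
span_start, span_end = 2, 3
print(tokens[span_start:span_end + 1])  # ['Paris', '.']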
Example #5
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans: List[Tuple[int, int]] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(self._claim_tokenizer.tokenize(question_text),
                                                        passage_tokens,
                                                        self._token_indexers,
                                                        passage_text,
                                                        token_spans,
                                                        answer_texts)
Example #6
 def test_char_span_to_token_span_handles_easy_cases(self):
     # These are _inclusive_ spans, on both sides.
     tokenizer = WordTokenizer()
     passage = "On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy " +\
         "Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four " +\
         "nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her " +\
         "first performances since giving birth to Blue Ivy."
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     # "January 7, 2012"
     token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
     assert token_span == (1, 4)
     # "Lenox Hill Hospital"
     token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
     assert token_span == (22, 24)
     # "Lenox Hill Hospital in New York."
     token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
     assert token_span == (22, 28)
Example #8
 def text_to_instance(self,  # type: ignore
                      question_text: str,
                      passages_texts: List[str],
                      qid: int,
                      answer_texts: List[str] = None,
                      char_spans: List[List[Tuple[int, int]]] = None,
                      max_passage_len: int = None,
                      max_question_len: int = None,
                      drop_invalid: bool = False) -> Optional[Instance]:
     """
      We truncate the passage and question to `max_passage_len` and `max_question_len` here.
      We drop invalid examples if `drop_invalid` is True.
     """
     passages_tokens = [self._tokenizer.tokenize(passage_text) for passage_text in passages_texts]
     question_tokens = self._tokenizer.tokenize(question_text)
     if max_passage_len is not None:
         passages_tokens = [passage_tokens[:max_passage_len] for passage_tokens in passages_tokens]
     if max_question_len is not None:
         question_tokens = question_tokens[: max_question_len]
     char_spans = char_spans or []
     # We need to convert character indices in `passage_text` to token indices in
     # `passage_tokens`, as the latter is what we'll actually use for supervision.
     passages_offsets = [[(token.idx, token.idx + len(token.text)) for token in passage_tokens]
                         for passage_tokens in passages_tokens]
     token_spans = []
     for passage_id, span_in_passage in enumerate(char_spans):
         passage_offsets = passages_offsets[passage_id]
         passage_token_spans: List[Tuple[int, int]] = []
         for char_span_start, char_span_end in span_in_passage:
             if char_span_end > passage_offsets[-1][1]:
                 continue
             (span_start, span_end), error = util.char_span_to_token_span(
                 passage_offsets,
                 (char_span_start, char_span_end))
             if error:
                 logger.debug("Passage: %s", passages_texts[passage_id])
                 logger.debug("Passage tokens: %s", passages_tokens[passage_id])
                 logger.debug("Question text: %s", question_text)
                 logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                 logger.debug("Token span: (%d, %d)", span_start, span_end)
                 logger.debug("Tokens in answer: %s",
                              passages_tokens[passage_id][span_start:span_end + 1])
                 logger.debug("Answer: %s", passages_texts[passage_id][char_span_start:char_span_end])
             passage_token_spans.append((span_start, span_end))
         if not passage_token_spans:
             if drop_invalid:
                 return None
             else:
                 passage_token_spans.append((-1, -1))
         token_spans.append(passage_token_spans)
     return self.make_MSMARCO_MultiPassage_instance(question_tokens,
                                                    passages_tokens,
                                                    self._token_indexers,
                                                    passages_texts,
                                                    qid,
                                                    token_spans,
                                                    answer_texts)
Example #9
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        char_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        passage_tokens: List[Token] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Optional[Instance]:

        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        question_tokens = self._tokenizer.tokenize(question_text)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]
        char_spans = char_spans or []
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        # All annotated answer spans were filtered out (they fell beyond the truncated passage)
        if char_spans and not token_spans:
            if self.skip_invalid_examples:
                return None
            else:
                token_spans.append(
                    (len(passage_tokens) - 1, len(passage_tokens) - 1))
        return util.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            token_spans,
            answer_texts,
            additional_metadata,
        )
Example #10
    def text_to_instance(
        self,  # type: ignore
        question_text_list: List[str],
        passage_text: str,
        start_span_list: List[List[int]] = None,
        end_span_list: List[List[int]] = None,
        passage_tokens: List[Token] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = util.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start,
                                 char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s",
                                 passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s",
                                 passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [
            self._tokenizer.tokenize(q) for q in question_text_list
        ]
        # Map answer texts to "CANNOTANSWER" if more than half of them are marked as such.
        additional_metadata["answer_texts_list"] = [
            util.handle_cannot(ans_list)
            for ans_list in additional_metadata["answer_texts_list"]
        ]
        return util.make_reading_comprehension_instance_quac(
            question_list_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            answer_token_span_list,
            yesno_list,
            followup_list,
            additional_metadata,
            self._num_context_answers,
        )
Example #11
    def get_gold_token_spans(tokenizer, gold_char_spans, context):
        # Adapted from AllenNLP
        passage_tokens = tokenizer.tokenize(context)
        token_spans = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

        for char_span_start, char_span_end in gold_char_spans:
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = char_span_to_token_span(passage_offsets, (char_span_start, char_span_end))
            token_spans.append((span_start, span_end))
        return token_spans
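A possible way to exercise the helper above, assuming `get_gold_token_spans` is callable as a plain function (it takes no `self`) from a module that already imports `char_span_to_token_span`, and using the same `SpacyTokenizer` as the surrounding tests. The context string and gold span are purely illustrative.

from allennlp.data.tokenizers import SpacyTokenizer  # import path may differ across AllenNLP versions

tokenizer = SpacyTokenizer()
context = "Lenox Hill Hospital is in New York."
gold_char_spans = [(0, 19)]  # "Lenox Hill Hospital"
print(get_gold_token_spans(tokenizer, gold_char_spans, context))  # should print [(0, 2)]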
Example #12
 def text_to_instance(
         self,  # type: ignore
         question_text: str,
         passage_text: str,
         char_spans: List[Tuple[int, int]] = None,
         answer_texts: List[str] = None,
         passage_tokens: List[Token] = None,
         max_passage_len: int = None,
         max_question_len: int = None,
         drop_invalid: bool = False) -> Optional[Instance]:
     """
      We truncate the passage and question to `max_passage_len` and `max_question_len` here.
      We drop invalid examples if `drop_invalid` is True.
     """
     # pylint: disable=arguments-differ
     if not passage_tokens:
         passage_tokens = self._tokenizer.tokenize(passage_text)
     question_tokens = self._tokenizer.tokenize(question_text)
     if max_passage_len is not None:
         passage_tokens = passage_tokens[:max_passage_len]
     if max_question_len is not None:
         question_tokens = question_tokens[:max_question_len]
     char_spans = char_spans or []
     # We need to convert character indices in `passage_text` to token indices in
     # `passage_tokens`, as the latter is what we'll actually use for supervision.
     token_spans: List[Tuple[int, int]] = []
     passage_offsets = [(token.idx, token.idx + len(token.text))
                        for token in passage_tokens]
     for char_span_start, char_span_end in char_spans:
         if char_span_end > passage_offsets[-1][1]:
             continue
         (span_start, span_end), error = util.char_span_to_token_span(
             passage_offsets, (char_span_start, char_span_end))
         if error:
             logger.debug("Passage: %s", passage_text)
             logger.debug("Passage tokens: %s", passage_tokens)
             logger.debug("Question text: %s", question_text)
             logger.debug("Answer span: (%d, %d)", char_span_start,
                          char_span_end)
             logger.debug("Token span: (%d, %d)", span_start, span_end)
             logger.debug("Tokens in answer: %s",
                          passage_tokens[span_start:span_end + 1])
             logger.debug("Answer: %s",
                          passage_text[char_span_start:char_span_end])
         token_spans.append((span_start, span_end))
     if not token_spans:
         if drop_invalid:
             return None
         else:
             token_spans.append((0, 0))
     return util.make_reading_comprehension_instance(
         question_tokens, passage_tokens, self._token_indexers,
         passage_text, token_spans, answer_texts)
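The readers above disagree on what to do when no character span survives the conversion: Example #8 appends a `(-1, -1)` sentinel per passage, Example #9 falls back to the last passage token, and this one uses `(0, 0)`. Whichever sentinel is chosen has to match what the downstream model and loss expect. Below is a hedged sketch of that guard, using a hypothetical `skip_invalid` flag rather than any particular reader's argument name.

from typing import List, Optional, Tuple

def resolve_empty_spans(token_spans: List[Tuple[int, int]],
                        num_passage_tokens: int,
                        skip_invalid: bool = False) -> Optional[List[Tuple[int, int]]]:
    # Hypothetical helper mirroring the pattern above: if no character span
    # survived the conversion, either drop the example or add a sentinel span.
    if token_spans:
        return token_spans
    if skip_invalid:
        return None  # caller drops the example entirely
    last = num_passage_tokens - 1
    return [(last, last)]  # sentinel pointing at the final passage token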
Example #13
    def test_char_span_to_token_span_handles_out_of_bounds_start_end(self):
        tokenizer = SpacyTokenizer()
        passage = "This sentence is just for testing purposes"
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

        # scenario 1: negative start character span (this should really never happen)
        start = -1
        end = start + len("This")
        expected_span = (0, 0)
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 2: end character span exceeds sentence length, for whichever reason
        start = 34
        end = start + len("purposes") + 1
        expected_span = (6, 6)
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error
Example #14
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            char_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []
        fields = {}
        # if not has_answer:
        #     question_tokens = self._tokenizer.tokenize(question_text)
        #     passage_field = TextField(passage_tokens, self._token_indexers)
        #     fields['passage'] = passage_field
        #     fields['question'] = TextField(question_tokens, self._token_indexers)
        #     metadata = {
        #         'original_passage': passage_text,
        #         'token_offsets': None,
        #         'question_tokens': [token.text for token in question_tokens],
        #         'passage_tokens': [token.text for token in passage_tokens]
        #         }
        #     fields['span_start'] = IndexField(-1, passage_field.empty_field())
        #     fields['span_end'] = IndexField(-1, passage_field.empty_field())
        #     return Instance(fields)

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        return util.make_reading_comprehension_instance(
            self._tokenizer.tokenize(question_text), passage_tokens,
            self._token_indexers, passage_text, token_spans, answer_texts)
Example #15
    def test_char_span_to_token_span_handles_undertokenization(self):
        tokenizer = SpacyTokenizer()
        passage = "This sentence will have two under tokenized tokens, one#here and one at the#end"
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

        # scenario 1: under tokenized in the middle of the sentence, look for the first part of the token
        start = 52
        end = start + len("one")
        expected_span = (9, 9)  # the indices of the whole "one#here" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 2: under tokenized in the middle of the sentence, look for the second part of the token
        start = 56
        end = start + len("here")
        expected_span = (9, 9)  # the indices of the whole "one#here" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 3: under tokenized at the end of the sentence, look for the first part of the token
        start = 72
        end = start + len("the")
        expected_span = (13, 13)  # the indices of the whole "the#end" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error

        # scenario 4: under tokenized at the end of the sentence, look for the second part of the token
        # this used to cause an IndexError
        start = 76
        end = start + len("end")
        expected_span = (13, 13)  # the indices of the whole "the#end" token should be returned
        token_span, error = util.char_span_to_token_span(offsets, (start, end))
        assert token_span == expected_span
        assert error
Example #16
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            para_sent_char_spans: List[List[Tuple[int, int]]] = None,
            sent_labels: List[int] = None,
            answer_texts: List[str] = None,
            passage_para_tokens: List[List[Token]] = None,
            passage_para_offsets: List[List[Tuple]] = None,
            evd_possible_chains: List[List[int]] = None,
            ans_sent_idxs: List[int] = None,
            article_id: str = None) -> Instance:

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.

        token_spans_passage_para: List[List[Tuple[int, int]]] = []

        for para_offset, para_char_spans, para_tokens in zip(
                passage_para_offsets, para_sent_char_spans,
                passage_para_tokens):
            token_spans_para_sent: List[Tuple[int, int]] = []
            # print(para_char_spans)
            for char_span_sent_start, char_span_sent_end in zip(
                    para_char_spans[0], para_char_spans[1]):
                (span_start_sent,
                 span_end_sent), error = util.char_span_to_token_span(
                     para_offset, (char_span_sent_start, char_span_sent_end))
                # print(para_tokens[span_start_sent: span_end_sent+1])
                token_spans_para_sent.append((span_start_sent, span_end_sent))
            token_spans_passage_para.append(token_spans_para_sent)
        # print(token_spans_passage_para)
        tokenized_ques = self._tokenizer.tokenize(question_text)
        tokenized_ques = [
            Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques
        ]
        return make_reading_comprehension_instance(tokenized_ques,
                                                   passage_para_tokens,
                                                   self._token_indexers,
                                                   passage_text,
                                                   token_spans_passage_para,
                                                   sent_labels,
                                                   answer_texts,
                                                   passage_para_offsets,
                                                   evd_possible_chains,
                                                   ans_sent_idxs,
                                                   article_id,
                                                   para_limit=self._para_limit)
Example #17
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         question_id: str = None,
                         answer_text: str = None,
                         char_span_start: int = None,
                         passage_tokens: List[Token] = None,
                         answer_texts: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        question_tokens = self._tokenizer.tokenize(question_text)
        # Separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, self._token_indexers)
        fields['passage'] = passage_field
        fields['question'] = TextField(question_tokens, self._token_indexers)

        if answer_text:
            # SQuAD gives answer annotations as a character index into the paragraph, but we need a
            # token index for our models.  We convert them here.
            char_span_end = char_span_start + len(answer_text)
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start,
                                                                          char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", answer_text)

            fields['span_start'] = IndexField(span_start, passage_field)
            fields['span_end'] = IndexField(span_end, passage_field)
        metadata = {
                'original_passage': passage_text,
                'token_offsets': passage_offsets
                }
        if question_id:
            metadata['question_id'] = question_id
        if answer_texts:
            metadata['answer_texts'] = answer_texts
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)
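The `token_offsets` stored in the metadata above are what later let a model map a predicted token span back to a passage substring. A small sketch of that inverse step, with hard-coded offsets standing in for real tokenizer output:

# Map a predicted inclusive token span back to the original passage string
# using the (start_char, end_char) offsets kept in the instance metadata.
passage_text = "The album appeared on The Blueprint 2."
passage_offsets = [(0, 3), (4, 9), (10, 18), (19, 21), (22, 25), (26, 35), (36, 37), (37, 38)]
span_start, span_end = 4, 5  # predicted tokens "The" and "Blueprint"
answer = passage_text[passage_offsets[span_start][0]:passage_offsets[span_end][1]]
print(answer)  # The Blueprint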
Example #18
 def text_to_instance(self,  # type: ignore
                      question_text_list: List[str],
                      passage_text: str,
                      start_span_list: List[List[int]] = None,
                      end_span_list: List[List[int]] = None,
                      passage_tokens: List[Token] = None,
                      yesno_list: List[int] = None,
                      followup_list: List[int] = None,
                      additional_metadata: Dict[str, Any] = None) -> Instance:
     # pylint: disable=arguments-differ
     # We need to convert character indices in `passage_text` to token indices in
     # `passage_tokens`, as the latter is what we'll actually use for supervision.
     answer_token_span_list = []
     passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
     for start_list, end_list in zip(start_span_list, end_span_list):
         token_spans: List[Tuple[int, int]] = []
         for char_span_start, char_span_end in zip(start_list, end_list):
             (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                          (char_span_start, char_span_end))
             if error:
                 logger.debug("Passage: %s", passage_text)
                 logger.debug("Passage tokens: %s", passage_tokens)
                 logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                 logger.debug("Token span: (%d, %d)", span_start, span_end)
                 logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                 logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
             token_spans.append((span_start, span_end))
         answer_token_span_list.append(token_spans)
     question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
      # Map answer texts to "CANNOTANSWER" if more than half of them are marked as such.
     additional_metadata['answer_texts_list'] = [util.handle_cannot(ans_list) for ans_list \
                                                 in additional_metadata['answer_texts_list']]
     return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                          passage_tokens,
                                                          self._token_indexers,
                                                          passage_text,
                                                          answer_token_span_list,
                                                          yesno_list,
                                                          followup_list,
                                                          additional_metadata,
                                                          self._num_context_answers)
Example #19
 def test_char_span_to_token_span_handles_hard_cases(self):
     # An earlier version of the code had a hard time when the answer was the last token in the
     # passage.  This tests that case, on the instance that used to fail.
     tokenizer = WordTokenizer()
     passage = "Beyonc\u00e9 is believed to have first started a relationship with Jay Z " +\
         "after a collaboration on \"'03 Bonnie & Clyde\", which appeared on his seventh " +\
         "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay " +\
         "Z's girlfriend in the music video for the song, which would further fuel " +\
         "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were " +\
         "married without publicity. As of April 2014, the couple have sold a combined 300 " +\
         "million records together. The couple are known for their private relationship, " +\
         "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 " +\
         "suffered a miscarriage in 2010 or 2011, describing it as \"the saddest thing\" " +\
         "she had ever endured. She returned to the studio and wrote music in order to cope " +\
         "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order " +\
         "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
     start = 912
     end = 912 + len("Paris.")
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     token_span = util.char_span_to_token_span(offsets, (start, end))[0]
     assert token_span == (184, 185)
Example #20
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            char_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            passage_tokens: List[Token] = None) -> Instance:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_spans = char_spans or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx,
                            token.idx + len(token.text.replace("_", "")))
                           for token in passage_tokens]
        """
        with open("/home/kz918/bpe/eval/bidaf/debug.txt", 'w', encoding='utf-8') as f:
            f.write(question_text)
            f.write('\n')
            f.write(passage_text)
            f.write("\n")
            for x in passage_tokens:
                f.write(x.text)
                f.write(" ")
            f.write('\n')
            for x in answer_texts:
                f.write(x)
                f.write("\n")
            f.write("\n")
            for i, (start, end) in enumerate(passage_offsets):
                f.write(str(i)+": ")
                f.write(passage_text[start:end])
                f.write(" ")
                f.write(str(start)+" "+str(end))
                f.write(" "+passage_tokens[i].text)
                f.write("\n") 
            f.write("\n")
            f.write("\nanswers\n")
        """

        for char_span_start, char_span_end in char_spans:
            #try:
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            """
            with open("/home/kz918/bpe/eval/bidaf/debug.txt", 'a', encoding='utf-8') as f:
                f.write(str([x.text for x in passage_tokens[span_start:span_end+1]]))
                f.write("\n") 
            except:
                with open("/home/kz918/bpe/eval/bidaf/error.txt", 'w', encoding='utf-8') as f:
                    f.write(question_text)
                    f.write('\n')
                    f.write(passage_text)
                    f.write("\n")
                    for x in passage_tokens:
                        f.write(x.text)
                        f.write(" ")
                    f.write('\n')
                    for x in answer_texts:
                        f.write(x)
                        f.write("\n")
                    f.write("\n")
                import pdb; pdb.set_trace()
            """
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        """
        with open("/home/kz918/bpe/eval/bidaf/debug.txt", 'a', encoding='utf-8') as f:
            f.write("\n")
            f.write("\nspans\n")
            for start, end in token_spans:
                f.write(str(start)+" "+str(end)+"\n")
            f.write("\n")
        """
        #import pdb; pdb.set_trace()
        return util.make_reading_comprehension_instance(
            self._tokenizer.tokenize(question_text), passage_tokens,
            self._token_indexers, passage_text, token_spans, answer_texts)
Example #21
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            char_spans: List[Tuple[int, int]] = None,
            char_spans_sp: List[Tuple[int, int]] = None,
            char_spans_sent: List[Tuple[int, int]] = None,
            sent_labels: List[int] = None,
            answer_texts: List[str] = None,
            question_passage_tokens: List[Token] = None,
            question_passage_offsets: List[Tuple[int, int]] = None,
            article_id: str = None) -> Instance:
        # pylint: disable=arguments-differ
        # if not passage_tokens:
        #     passage_tokens = self._tokenizer.tokenize(passage_text)

        char_spans = char_spans or []
        char_spans_sp = char_spans_sp or []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        token_spans_sp: List[Tuple[int, int]] = []
        token_spans_sent: List[Tuple[int, int]] = []

        for char_span_start, char_span_end in char_spans:
            (span_start, span_end), error = util.char_span_to_token_span(
                question_passage_offsets, (char_span_start, char_span_end))
            # print(span_start, span_end)

            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", question_passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             question_passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))

        for char_span_sp_start, char_span_sp_end in char_spans_sp:
            (span_start_sp, span_end_sp), error = util.char_span_to_token_span(
                question_passage_offsets,
                (char_span_sp_start, char_span_sp_end))
            token_spans_sp.append((span_start_sp, span_end_sp))

        for char_span_sent_start, char_span_sent_end in char_spans_sent:
            (span_start_sent,
             span_end_sent), error = util.char_span_to_token_span(
                 question_passage_offsets,
                 (char_span_sent_start, char_span_sent_end))
            token_spans_sent.append((span_start_sent, span_end_sent))

        tokenized_ques = self._tokenizer.tokenize(question_text)
        tokenized_ques = [
            Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques
        ]

        return make_reading_comprehension_instance(
            tokenized_ques,
            question_passage_tokens,
            question_passage_offsets,
            self._token_indexers,
            passage_text,
            token_spans,
            token_spans_sp,
            token_spans_sent,
            sent_labels,
            answer_texts,
            additional_metadata={'_id': article_id},
            para_limit=self._para_limit)
Example #22
    def text_to_instance(
            self,  # type: ignore
            item_id: Any,
            question_text: str,
            choice_text_list: List[str],
            fact_text: str,
            answer_span: List[str],
            answer_relations: List[str],
            answer_starts: List[int] = None,
            answer_id: int = None,
            prefetched_sentences: Dict[str, List[str]] = None,
            prefetched_indices: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        fact_tokens = self._tokenizer.tokenize(fact_text)
        choices_tokens_list = [
            self._tokenizer.tokenize(x) for x in choice_text_list
        ]
        choice_kb_fields = []
        selected_tuples = []
        for choice in choice_text_list:
            kb_fields = []

            if self._use_cskg and self._use_elastic_search:
                max_sents_per_source = int(self._max_tuples / 2)
            else:
                max_sents_per_source = self._max_tuples
            selected_hits = []
            if self._use_elastic_search:
                elastic_search_hits = self.get_elasticsearch_sentences(
                    prefetched_sentences, prefetched_indices, answer_span,
                    choice, question_text, fact_text, max_sents_per_source)
                selected_hits.extend(elastic_search_hits)

            if self._use_cskg:
                cskg_sentences = self.get_cskg_sentences(
                    fact_text, answer_span, choice, max_sents_per_source)
                selected_hits.extend(cskg_sentences)
            # add a dummy entry to capture the embedding link
            if self._ignore_spans:
                fact_choice_sentence = fact_text + " || " + choice
                selected_hits.append(fact_choice_sentence)
            else:
                for answer in set(answer_span):
                    answer_choice_sentence = answer + " || " + choice
                    selected_hits.append(answer_choice_sentence)

            selected_tuples.append(selected_hits)
            for hit_text in selected_hits:
                kb_fields.append(
                    TextField(self._tokenizer.tokenize(hit_text),
                              self._token_indexers))

            choice_kb_fields.append(ListField(kb_fields))

        fields["choice_kb"] = ListField(choice_kb_fields)
        fields['fact'] = TextField(fact_tokens, self._token_indexers)

        if self._add_relation_labels:
            if answer_relations and len(answer_relations):
                relation_fields = []
                for relation in set(answer_relations):
                    relation_fields.append(
                        LabelField(relation,
                                   label_namespace="relation_labels"))
                fields["relations"] = ListField(relation_fields)
                selected_relations = self.collate_relations(answer_relations)
                fields["relation_label"] = MultiLabelField(
                    selected_relations, "relation_labels")
            else:
                fields["relations"] = ListField([
                    LabelField(-1,
                               label_namespace="relation_labels",
                               skip_indexing=True)
                ])
                fields["relation_label"] = MultiLabelField([],
                                                           "relation_labels")

        answer_fields = []
        answer_span_fields = []
        fact_offsets = [(token.idx, token.idx + len(token.text))
                        for token in fact_tokens]

        for idx, answer in enumerate(answer_span):
            answer_fields.append(
                TextField(self._tokenizer.tokenize(answer),
                          self._token_indexers))
            if answer_starts:
                if len(answer_starts) <= idx:
                    raise ValueError("Only {} answer_starts in json. "
                                     "Expected {} in {}".format(
                                         len(answer_starts), len(answer_span),
                                         item_id))
                offset = answer_starts[idx]
            else:
                # str.find returns -1 when the span is missing (str.index would raise instead).
                offset = fact_text.find(answer)
                if offset == -1:
                    raise ValueError("Span: {} not found in fact: {}".format(
                        answer, fact_text))

            tok_span, err = char_span_to_token_span(
                fact_offsets, (offset, offset + len(answer)))
            if err:
                logger.info("Could not find token spans for '{}' in '{}'."
                            "Best guess: {} in {} at {}".format(
                                answer, fact_text,
                                [offset, offset + len(answer)], fact_offsets,
                                tok_span))
            answer_span_fields.append(
                SpanField(tok_span[0], tok_span[1], fields['fact']))

        fields["answer_text"] = ListField(answer_fields)
        fields["answer_spans"] = ListField(answer_span_fields)
        fields['question'] = TextField(question_tokens, self._token_indexers)

        fields['choices_list'] = ListField(
            [TextField(x, self._token_indexers) for x in choices_tokens_list])
        if answer_id is not None:
            fields['answer_id'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "fact_text": fact_text,
            "choice_text_list": choice_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "fact_tokens": [x.text for x in fact_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
            "answer_text": answer_span,
            "answer_start": answer_starts,
            "answer_span_fields": [(x.span_start, x.span_end) for x in answer_span_fields],
            "relations": answer_relations,
            "selected_tuples": selected_tuples
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)