Example 1
    def test_char_span_to_token_span_handles_hard_cases(self):
        # An earlier version of the code had a hard time when the answer was the last token in the
        # passage.  This tests that case, on the instance that used to fail.
        tokenizer = SpacyTokenizer()
        passage = (
            "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
            'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
            "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
            "Z's girlfriend in the music video for the song, which would further fuel "
            "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
            "married without publicity. As of April 2014, the couple have sold a combined 300 "
            "million records together. The couple are known for their private relationship, "
            "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
            'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
            "she had ever endured. She returned to the studio and wrote music in order to cope "
            "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
            "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
        )
        start = 912
        end = 912 + len("Paris.")
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
        token_span = util.char_span_to_token_span(offsets, (start, end))[0]
        assert token_span == (184, 185)
Example 2
    def test_char_span_to_token_span_handles_out_of_range(self):
        offsets = [(10, 18), (20, 28)]
        with pytest.raises(ValueError):
            util.char_span_to_token_span(offsets, (1, 3))
        with pytest.raises(ValueError):
            util.char_span_to_token_span(offsets, (1, 15))
        with pytest.raises(ValueError):
            util.char_span_to_token_span(offsets, (30, 38))
        with pytest.raises(ValueError):
            util.char_span_to_token_span(offsets, (25, 38))
Example 3
    def test_char_span_to_token_span_handles_none_cases(self):
        # base case
        offsets = [(0, 8), (10, 18), (20, 28), (30, 38), (40, 48)]
        token_span, error = util.char_span_to_token_span(offsets, (10, 38))
        assert token_span == (1, 3) and not error

        # None in the middle
        offsets = [(0, 8), (10, 18), None, (30, 38), (40, 48)]
        token_span, error = util.char_span_to_token_span(offsets, (10, 38))
        assert token_span == (1, 3) and not error

        # None before
        offsets = [None, (10, 18), (20, 28), (30, 38), (40, 48)]
        token_span, error = util.char_span_to_token_span(offsets, (10, 38))
        assert token_span == (1, 3) and not error

        # None after
        offsets = [(0, 8), (10, 18), (20, 28), (30, 38), None]
        token_span, error = util.char_span_to_token_span(offsets, (10, 38))
        assert token_span == (1, 3) and not error

        # None after and we're looking for more characters
        offsets = [(0, 8), (10, 18), (20, 28), (30, 38), None]
        with pytest.raises(ValueError):
            util.char_span_to_token_span(offsets, (10, 48))

        # Starting at None
        offsets = [None, (10, 18), (20, 28), (30, 38), (40, 48)]
        token_span, error = util.char_span_to_token_span(offsets, (8, 38))
        assert token_span == (0, 3) and error
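
Taken together, the three tests above pin down the contract of char_span_to_token_span: it takes per-token (start, end) character offsets (entries may be None for tokens without a character position) and a character span, and returns a token span that is inclusive on both ends together with an error flag that is set when the character span does not line up exactly with token boundaries; spans that fall entirely outside the tokenized text raise ValueError. A minimal sketch of that call pattern, using toy offsets rather than output from a real tokenizer:

    # Sketch only: `util` is the same reading-comprehension utilities module the
    # examples on this page import (its exact path varies across AllenNLP releases).
    offsets = [(0, 8), (10, 18), (20, 28)]  # per-token (start, end) character offsets
    (token_start, token_end), error = util.char_span_to_token_span(offsets, (10, 28))
    assert (token_start, token_end) == (1, 2)  # token indices, inclusive on both ends
    assert not error  # True would mean the span did not align with token boundaries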
Example 4
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        char_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        passage_tokens: List[Token] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Optional[Instance]:

        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        question_tokens = self._tokenizer.tokenize(question_text)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]
        char_spans = char_spans or []
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        # The original answer is filtered out
        if char_spans and not token_spans:
            if self.skip_invalid_examples:
                return None
            else:
                token_spans.append(
                    (len(passage_tokens) - 1, len(passage_tokens) - 1))
        return util.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            token_spans,
            answer_texts,
            additional_metadata,
        )
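
One detail in the reader above worth spelling out: if every gold character span is filtered out (for example because passage truncation cut the answer off), the instance is either skipped or given a dummy supervision span on the last passage token. A toy restatement of that fallback with made-up offsets, not code from the library:

    # Hypothetical truncated passage of 5 tokens; the gold answer starts at
    # character 300, past the last token's end offset, so it is filtered out.
    passage_offsets = [(0, 8), (10, 18), (20, 28), (30, 38), (40, 48)]
    char_spans = [(300, 310)]
    token_spans = []
    for start, end in char_spans:
        if end > passage_offsets[-1][1]:
            continue  # answer lies beyond the truncated passage
        token_spans.append(util.char_span_to_token_span(passage_offsets, (start, end))[0])
    if char_spans and not token_spans:
        # With skip_invalid_examples=False, fall back to a span on the final token.
        token_spans.append((len(passage_offsets) - 1, len(passage_offsets) - 1))
    assert token_spans == [(4, 4)]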
Example 5
    def test_char_span_to_token_span_handles_easy_cases(self):
        # These are _inclusive_ spans, on both sides.
        tokenizer = SpacyTokenizer()
        passage = (
            "On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy "
            "Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four "
            "nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her "
            "first performances since giving birth to Blue Ivy."
        )
        tokens = tokenizer.tokenize(passage)
        offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
        # "January 7, 2012"
        token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
        assert token_span == (1, 4)
        # "Lenox Hill Hospital"
        token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
        assert token_span == (22, 24)
        # "Lenox Hill Hospital in New York."
        token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
        assert token_span == (22, 28)
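
Because the returned token spans are inclusive on both ends, the original surface string can be recovered from the character offsets of the two boundary tokens. A small sketch continuing the "Lenox Hill Hospital" case from the test above (it reuses that test's passage and offsets variables):

    # token_span == (22, 24) was returned for the character span (91, 110) above.
    start_token, end_token = 22, 24
    answer = passage[offsets[start_token][0]:offsets[end_token][1]]
    assert answer == "Lenox Hill Hospital"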
Example 6
    def text_to_instance(
        self,  # type: ignore
        question_text_list: List[str],
        passage_text: str,
        start_span_list: List[List[int]] = None,
        end_span_list: List[List[int]] = None,
        passage_tokens: List[Token] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = util.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start,
                                 char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s",
                                 passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s",
                                 passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [
            self._tokenizer.tokenize(q) for q in question_text_list
        ]
        # Map answer texts to "CANNOTANSWER" if more than half of them are marked as such.
        additional_metadata["answer_texts_list"] = [
            util.handle_cannot(ans_list)
            for ans_list in additional_metadata["answer_texts_list"]
        ]
        return util.make_reading_comprehension_instance_quac(
            question_list_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            answer_token_span_list,
            yesno_list,
            followup_list,
            additional_metadata,
            self._num_context_answers,
        )
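
The handle_cannot call above appears to apply the majority rule described in the comment: when more than half of the reference answers are the unanswerable marker, the whole list collapses to "CANNOTANSWER". A hypothetical standalone version of that rule, purely as an illustration of the comment and not the library's own implementation:

    def majority_cannot_answer(answer_texts):
        # Illustrative sketch only: collapse to ["CANNOTANSWER"] when more than
        # half of the reference answers say so; otherwise leave the list alone.
        cannot_count = sum(1 for a in answer_texts if a == "CANNOTANSWER")
        if cannot_count * 2 > len(answer_texts):
            return ["CANNOTANSWER"]
        return answer_texts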
Example 7
    def make_instances(
        self,
        qid: str,
        question: str,
        answers: List[str],
        context: str,
        first_answer_offset: Optional[int],
    ) -> Iterable[Instance]:
        # tokenize context by spaces first, and then with the wordpiece tokenizer
        # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we
        # detect whether a space comes before a word, and if so, add "a " in front of the word.
        def tokenize_slice(start: int, end: int) -> Iterable[Token]:
            text_to_tokenize = context[start:end]
            if start - 1 >= 0 and context[start - 1].isspace():
                prefix = "a "  # must end in a space, and be short so we can be sure it becomes only one token
                wordpieces = self._tokenizer.tokenize(prefix +
                                                      text_to_tokenize)
                for wordpiece in wordpieces:
                    if wordpiece.idx is not None:
                        wordpiece.idx -= len(prefix)
                return wordpieces[1:]
            else:
                return self._tokenizer.tokenize(text_to_tokenize)

        tokenized_context = []
        token_start = 0
        for i, c in enumerate(context):
            if c.isspace():
                for wordpiece in tokenize_slice(token_start, i):
                    if wordpiece.idx is not None:
                        wordpiece.idx += token_start
                    tokenized_context.append(wordpiece)
                token_start = i + 1
        for wordpiece in tokenize_slice(token_start, len(context)):
            if wordpiece.idx is not None:
                wordpiece.idx += token_start
            tokenized_context.append(wordpiece)

        if first_answer_offset is None:
            (token_answer_span_start, token_answer_span_end) = (-1, -1)
        else:
            (token_answer_span_start,
             token_answer_span_end), _ = char_span_to_token_span(
                 [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                  if t.idx is not None else None for t in tokenized_context],
                 (first_answer_offset, first_answer_offset + len(answers[0])),
             )

        # Tokenize the question
        tokenized_question = self._tokenizer.tokenize(question)
        tokenized_question = tokenized_question[:self.max_query_length]
        for token in tokenized_question:
            token.type_id = self.non_content_type_id
            token.idx = None

        # Stride over the context, making instances
        # Sequences are [CLS] question [SEP] [SEP] context [SEP], hence the - 4 for four special tokens.
        # This is technically not correct for anything but RoBERTa, but it does not affect the scores.
        space_for_context = self.length_limit - len(tokenized_question) - 4
        stride_start = 0
        while True:
            tokenized_context_window = tokenized_context[stride_start:]
            tokenized_context_window = tokenized_context_window[:space_for_context]

            window_token_answer_span = (
                token_answer_span_start - stride_start,
                token_answer_span_end - stride_start,
            )
            if any(i < 0 or i >= len(tokenized_context_window)
                   for i in window_token_answer_span):
                # The answer is not contained in the window.
                window_token_answer_span = None

            if not self.skip_invalid_examples or window_token_answer_span is not None:
                additional_metadata = {"id": qid}
                instance = self.text_to_instance(
                    question,
                    tokenized_question,
                    context,
                    tokenized_context_window,
                    answers,
                    window_token_answer_span,
                    additional_metadata,
                )
                yield instance

            stride_start += space_for_context
            if stride_start >= len(tokenized_context):
                break
            stride_start -= self.stride
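
The striding loop above advances stride_start by space_for_context tokens and then backs it up by self.stride, so consecutive windows overlap by stride tokens and the final window always reaches the end of the context. A standalone sketch of just that arithmetic (the names mirror the reader above, but nothing else from it is needed):

    def context_windows(num_context_tokens, space_for_context, stride):
        # Yields (start, end) token-index ranges for successive context windows,
        # mirroring the stride_start bookkeeping in make_instances above.
        stride_start = 0
        while True:
            yield stride_start, min(stride_start + space_for_context, num_context_tokens)
            stride_start += space_for_context
            if stride_start >= num_context_tokens:
                break
            stride_start -= stride  # overlap consecutive windows by `stride` tokens

    # e.g. 10 context tokens, room for 4 tokens per window, overlap of 2:
    assert list(context_windows(10, 4, 2)) == [(0, 4), (2, 6), (4, 8), (6, 10)]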