Example #1
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        char_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        passage_tokens: List[Token] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Optional[Instance]:

        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        question_tokens = self._tokenizer.tokenize(question_text)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]
        char_spans = char_spans or []
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        token_spans: List[Tuple[int, int]] = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for char_span_start, char_span_end in char_spans:
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = utils.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start,
                             char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        # All the gold character spans were filtered out above (e.g. they fell beyond the truncated passage)
        if char_spans and not token_spans:
            if self.skip_invalid_examples:
                return None
            else:
                token_spans.append(
                    (len(passage_tokens) - 1, len(passage_tokens) - 1))
        return utils.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            token_spans,
            answer_texts,
            additional_metadata,
        )
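The heart of Example #1 is turning character offsets in `passage_text` into token indices via each token's `idx` (character start). Below is a minimal, self-contained sketch of that idea, assuming a simple regex tokenizer that records offsets; the function names are illustrative and this is not the implementation behind `utils.char_span_to_token_span`.

import re
from typing import List, Tuple

def tokenize_with_offsets(text: str) -> List[Tuple[str, int]]:
    # Each token is (text, character start index), mimicking `Token.text` / `Token.idx`.
    return [(m.group(), m.start()) for m in re.finditer(r"\w+|[^\w\s]", text)]

def char_span_to_token_span(
    offsets: List[Tuple[int, int]], char_span: Tuple[int, int]
) -> Tuple[Tuple[int, int], bool]:
    # `offsets` holds per-token (start, end) character offsets; returns an inclusive
    # token span plus an error flag when the span does not align with token boundaries.
    start_char, end_char = char_span
    start_token = 0
    while start_token < len(offsets) and offsets[start_token][1] <= start_char:
        start_token += 1
    start_token = min(start_token, len(offsets) - 1)
    end_token = start_token
    while end_token < len(offsets) and offsets[end_token][0] < end_char:
        end_token += 1
    end_token = max(start_token, end_token - 1)
    error = offsets[start_token][0] != start_char or offsets[end_token][1] != end_char
    return (start_token, end_token), error

passage = "Hamlet was written by William Shakespeare."
tokens = tokenize_with_offsets(passage)
offsets = [(idx, idx + len(text)) for text, idx in tokens]
answer_start = passage.index("William Shakespeare")
span, error = char_span_to_token_span(
    offsets, (answer_start, answer_start + len("William Shakespeare")))
print(span, error)  # (4, 5) False -- inclusive token span covering "William Shakespeare"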
Example #2
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        token_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        question_tokens: List[Token] = None,
        passage_tokens: List[Token] = None,
    ) -> Instance:

        if not question_tokens:
            question_tokens = self._tokenizer.tokenize(question_text)
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
        return utils.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            token_spans,
            answer_texts,
        )
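For orientation, a hypothetical call of this method: `reader` is assumed to be an already-constructed instance of this dataset reader (its tokenizer and token indexers come from the reader's configuration), and the passage, answer, and token indices are illustrative only.

# Hypothetical usage -- `reader` stands for an already-configured dataset reader.
instance = reader.text_to_instance(
    question_text="Who wrote Hamlet?",
    passage_text="Hamlet is a tragedy written by William Shakespeare.",
    token_spans=[(6, 7)],  # inclusive token indices of "William Shakespeare"
    answer_texts=["William Shakespeare"],
)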
Example #3
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        question_id: str = None,
        passage_id: str = None,
        answer_annotations: List[Dict] = None,
        passage_tokens: List[Token] = None,
    ) -> Union[Instance, None]:

        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]

        answer_type: str = None
        answer_texts: List[str] = []
        if answer_annotations:
            # We currently use only the first annotated answer; this does not affect
            # training, because the train set has a single annotation per question.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(
                answer_annotations[0])

        # Tokenize the answer text in order to find the matched span based on token
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            tokenized_answer_texts.append(" ".join(token.text
                                                   for token in answer_tokens))

        if self.instance_format == "squad":
            valid_passage_spans = (
                self.find_valid_spans(passage_tokens, tokenized_answer_texts)
                if tokenized_answer_texts else [])
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append(
                        (len(passage_tokens) - 1, len(passage_tokens) - 1))
            return make_reading_comprehension_instance(
                question_tokens,
                passage_tokens,
                self._token_indexers,
                passage_text,
                valid_passage_spans,
                # this `answer_texts` will not be used for evaluation
                answer_texts,
                additional_metadata={
                    "original_passage": passage_text,
                    "original_question": question_text,
                    "passage_id": passage_id,
                    "question_id": question_id,
                    "valid_passage_spans": valid_passage_spans,
                    "answer_annotations": answer_annotations,
                },
            )
        elif self.instance_format == "bert":
            question_concat_passage_tokens = (
                question_tokens + [Token("[SEP]")] + passage_tokens)
            valid_passage_spans = []
            for span in self.find_valid_spans(passage_tokens,
                                              tokenized_answer_texts):
                # This span is for `question + [SEP] + passage`.
                valid_passage_spans.append(
                    (span[0] + len(question_tokens) + 1,
                     span[1] + len(question_tokens) + 1))
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append((
                        len(question_concat_passage_tokens) - 1,
                        len(question_concat_passage_tokens) - 1,
                    ))
            answer_info = {
                # this `answer_texts` will not be used for evaluation
                "answer_texts": answer_texts,
                "answer_passage_spans": valid_passage_spans,
            }
            return self.make_bert_drop_instance(
                question_tokens,
                passage_tokens,
                question_concat_passage_tokens,
                self._token_indexers,
                passage_text,
                answer_info,
                additional_metadata={
                    "original_passage": passage_text,
                    "original_question": question_text,
                    "passage_id": passage_id,
                    "question_id": question_id,
                    "answer_annotations": answer_annotations,
                },
            )
        elif self.instance_format == "drop":
            numbers_in_passage = []
            number_indices = []
            for token_index, token in enumerate(passage_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(token_index)
            # Hack: append a dummy number so that the padded number list is never empty
            numbers_in_passage.append(0)
            number_indices.append(-1)
            numbers_as_tokens = [
                Token(str(number)) for number in numbers_in_passage
            ]

            valid_passage_spans = (
                self.find_valid_spans(passage_tokens, tokenized_answer_texts)
                if tokenized_answer_texts else [])
            valid_question_spans = (
                self.find_valid_spans(question_tokens, tokenized_answer_texts)
                if tokenized_answer_texts else [])

            target_numbers = []
            # `answer_texts` is a list of valid answers.
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions: List[List[int]] = []
            valid_counts: List[int] = []
            if answer_type in ["number", "date"]:
                valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(
                    numbers_in_passage, target_numbers)
            if answer_type in ["number"]:
                # Currently we only support count answers from 0 to 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count,
                                                      target_numbers)

            type_to_answer_map = {
                "passage_span": valid_passage_spans,
                "question_span": valid_question_spans,
                "addition_subtraction": valid_signs_for_add_sub_expressions,
                "counting": valid_counts,
            }

            if self.skip_when_all_empty and not any(
                    type_to_answer_map[skip_type]
                    for skip_type in self.skip_when_all_empty):
                return None

            answer_info = {
                # this `answer_texts` will not be used for evaluation
                "answer_texts": answer_texts,
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
                "counts": valid_counts,
            }

            return self.make_marginal_drop_instance(
                question_tokens,
                passage_tokens,
                numbers_as_tokens,
                number_indices,
                self._token_indexers,
                passage_text,
                answer_info,
                additional_metadata={
                    "original_passage": passage_text,
                    "original_question": question_text,
                    "original_numbers": numbers_in_passage,
                    "passage_id": passage_id,
                    "question_id": question_id,
                    "answer_info": answer_info,
                    "answer_annotations": answer_annotations,
                },
            )
        else:
            raise ValueError(
                f'Expect the instance format to be "drop", "squad" or "bert", '
                f"but got {self.instance_format}")
Example #4
    def text_to_instance(
        self,  # type: ignore
        question_text: str,
        passage_text: str,
        is_impossible: bool = None,
        char_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        passage_tokens: List[Token] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Optional[Instance]:
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)

        if self.no_answer_token is not None:
            if self.passage_length_limit is not None:
                passage_tokens = passage_tokens[:self.passage_length_limit - 1]
            passage_tokens = passage_tokens + [
                Token(
                    text=self.no_answer_token,
                    idx=passage_tokens[-1].idx + len(passage_tokens[-1].text) +
                    1,  # type: ignore
                    lemma_=self.no_answer_token,
                )
            ]
        elif self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]

        question_tokens = self._tokenizer.tokenize(question_text)
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]

        if is_impossible:
            if self.no_answer_token is None:
                raise ValueError(
                    "This is a SQuAD 2.0 dataset, yet your using a SQuAD reader has 'no_answer_token' "
                    "set to `None`. "
                    "Consider specifying the 'no_answer_token' or using the 'squad2' reader instead, which "
                    f"by default uses '{SQUAD2_NO_ANSWER_TOKEN}' as the 'no_answer_token'."
                )
            answer_texts = [self.no_answer_token]
            token_spans: List[Tuple[int, int]] = [(len(passage_tokens) - 1,
                                                   len(passage_tokens) - 1)]
        else:
            char_spans = char_spans or []
            # We need to convert character indices in `passage_text` to token indices in
            # `passage_tokens`, as the latter is what we'll actually use for supervision.
            token_spans = []
            passage_offsets = [(token.idx, token.idx + len(token.text))
                               for token in passage_tokens]
            for char_span_start, char_span_end in char_spans:
                if char_span_end > passage_offsets[-1][1]:
                    continue
                (span_start, span_end), error = utils.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens (with no-answer): %s",
                                 passage_tokens)
                    logger.debug("Question text: %s", question_text)
                    logger.debug("Answer span: (%d, %d)", char_span_start,
                                 char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug(
                        "Tokens in answer: %s",
                        passage_tokens[span_start:span_end + 1],
                    )
                    logger.debug("Answer: %s",
                                 passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            # All the gold character spans were filtered out above (e.g. they fell beyond the truncated passage)
            if char_spans and not token_spans:
                if self.skip_impossible_questions:
                    return None
                else:
                    if self.no_answer_token is not None:
                        answer_texts = [self.no_answer_token]
                    token_spans.append((
                        len(passage_tokens) - 1,
                        len(passage_tokens) - 1,
                    ))
        return utils.make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            token_spans,
            answer_texts,
            additional_metadata,
        )
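To make the unanswerable path in Example #4 concrete, here is a small self-contained demonstration of how the no-answer sentinel is appended after truncation and how the gold span then points at it. Plain (text, idx) tuples stand in for `Token` objects, and the sentinel text and length limit are illustrative, not the reader's configured values.

NO_ANSWER_TOKEN = "@@<NO_ANSWER>@@"  # illustrative sentinel text
PASSAGE_LENGTH_LIMIT = 4

# (text, character start) pairs standing in for tokens with `.text` / `.idx`.
passage_tokens = [("Hamlet", 0), ("was", 7), ("written", 11), ("by", 19), ("Shakespeare", 22)]

# Truncate to limit - 1 so the sentinel still fits inside the length limit.
passage_tokens = passage_tokens[:PASSAGE_LENGTH_LIMIT - 1]
last_text, last_idx = passage_tokens[-1]
# Append the sentinel one character past the last kept token, as in the reader above.
passage_tokens.append((NO_ANSWER_TOKEN, last_idx + len(last_text) + 1))

# For an impossible question the gold span points at the sentinel (the last token).
token_spans = [(len(passage_tokens) - 1, len(passage_tokens) - 1)]
print(passage_tokens)  # [('Hamlet', 0), ('was', 7), ('written', 11), ('@@<NO_ANSWER>@@', 19)]
print(token_spans)     # [(3, 3)]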