Ejemplo n.º 1
0
    def text_to_instance(
        self,  # type: ignore
        question_text_list: List[str],
        passage_text: str,
        start_span_list: List[List[int]] = None,
        end_span_list: List[List[int]] = None,
        passage_tokens: List[Token] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:
        """Build an ``Instance`` for one passage and its list of questions.

        Character-level answer spans are converted to token-level spans over
        ``passage_tokens`` and the questions are tokenized with
        ``self._tokenizer``. Everything is then handed to
        ``util.make_reading_comprehension_instance_quac``.

        Parameters
        ----------
        question_text_list:
            Raw question strings, one per dialog turn.
        passage_text:
            The raw passage the character offsets refer to.
        start_span_list, end_span_list:
            Parallel lists; entry ``i`` holds the character start (resp. end)
            offsets of the answers to question ``i``. When either is omitted
            no answer spans are produced.
        passage_tokens:
            Tokenized passage. Each token must expose ``idx`` (character
            offset) and ``text``. When omitted, ``passage_text`` is tokenized
            with ``self._tokenizer``.
        yesno_list, followup_list:
            Passed through unchanged to the instance builder.
        additional_metadata:
            Extra metadata for the instance. It is not modified; a shallow
            copy is used. If it contains ``"answer_texts_list"``, each entry
            is passed through ``util.handle_cannot``.

        Returns
        -------
        The value returned by ``util.make_reading_comprehension_instance_quac``.
        """
        # All optional arguments default to None, so make the method usable
        # without them instead of failing with a TypeError further down.
        if passage_tokens is None:
            # Fall back to the tokenizer that is also used for the questions.
            passage_tokens = self._tokenizer.tokenize(passage_text)
        char_starts = start_span_list if start_span_list is not None else []
        char_ends = end_span_list if end_span_list is not None else []

        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        for start_list, end_list in zip(char_starts, char_ends):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = util.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end))
                if error:
                    # The span is still kept; the mismatch is only logged so
                    # that it can be inspected at debug level.
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start,
                                 char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s",
                                 passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s",
                                 passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)

        question_list_tokens = [
            self._tokenizer.tokenize(q) for q in question_text_list
        ]

        # Work on a shallow copy so the caller's dictionary is left untouched.
        metadata: Dict[str, Any] = (
            dict(additional_metadata) if additional_metadata is not None else {}
        )
        if "answer_texts_list" in metadata:
            # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
            metadata["answer_texts_list"] = [
                util.handle_cannot(ans_list)
                for ans_list in metadata["answer_texts_list"]
            ]

        return util.make_reading_comprehension_instance_quac(
            question_list_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            answer_token_span_list,
            yesno_list,
            followup_list,
            metadata,
            self._num_context_answers,
        )
Ejemplo n.º 2
0
 def text_to_instance(self,  # type: ignore
                      question_text_list: List[str],
                      passage_text: str,
                      start_span_list: List[List[int]] = None,
                      end_span_list: List[List[int]] = None,
                      passage_tokens: List[Token] = None,
                      yesno_list: List[int] = None,
                      followup_list: List[int] = None,
                      additional_metadata: Dict[str, Any] = None) -> Instance:
     """Turn a passage plus its questions into a single ``Instance``.

     Answer spans given as character offsets into ``passage_text`` are mapped
     to token offsets into ``passage_tokens``; questions are tokenized with
     ``self._tokenizer``; the result is assembled by
     ``util.make_reading_comprehension_instance_quac``.

     Parameters
     ----------
     question_text_list:
         Raw question strings.
     passage_text:
         Raw passage text that the character offsets index into.
     start_span_list, end_span_list:
         Parallel per-question lists of answer character start / end offsets.
         If either is ``None`` no answer spans are produced.
     passage_tokens:
         Passage tokens exposing ``idx`` and ``text``. If ``None``, the
         passage is tokenized with ``self._tokenizer``.
     yesno_list, followup_list:
         Forwarded unchanged to the instance builder.
     additional_metadata:
         Optional metadata. The caller's dictionary is not modified. When it
         has an ``'answer_texts_list'`` entry, every element of that entry is
         passed through ``util.handle_cannot``.

     Returns
     -------
     Whatever ``util.make_reading_comprehension_instance_quac`` returns.
     """
     # pylint: disable=arguments-differ
     # Every optional argument defaults to None; normalise them up front so the
     # defaults are actually usable rather than raising a TypeError below.
     if passage_tokens is None:
         # Same tokenizer as the one applied to the questions further down.
         passage_tokens = self._tokenizer.tokenize(passage_text)
     if start_span_list is None or end_span_list is None:
         span_pairs = []
     else:
         span_pairs = zip(start_span_list, end_span_list)

     # We need to convert character indices in `passage_text` to token indices in
     # `passage_tokens`, as the latter is what we'll actually use for supervision.
     answer_token_span_list = []
     passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
     for start_list, end_list in span_pairs:
         token_spans: List[Tuple[int, int]] = []
         for char_span_start, char_span_end in zip(start_list, end_list):
             (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                          (char_span_start, char_span_end))
             if error:
                 # Conversion problems are only logged; the span is still used.
                 logger.debug("Passage: %s", passage_text)
                 logger.debug("Passage tokens: %s", passage_tokens)
                 logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                 logger.debug("Token span: (%d, %d)", span_start, span_end)
                 logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                 logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
             token_spans.append((span_start, span_end))
         answer_token_span_list.append(token_spans)
     question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]

     # Shallow-copy the metadata so the caller's dictionary is never mutated.
     metadata: Dict[str, Any] = {} if additional_metadata is None else dict(additional_metadata)
     if 'answer_texts_list' in metadata:
         # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
         metadata['answer_texts_list'] = [util.handle_cannot(ans_list)
                                          for ans_list in metadata['answer_texts_list']]
     return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                          passage_tokens,
                                                          self._token_indexers,
                                                          passage_text,
                                                          answer_token_span_list,
                                                          yesno_list,
                                                          followup_list,
                                                          metadata,
                                                          self._num_context_answers)