def text_to_instance(
    self,  # type: ignore
    question_text_list: List[str],
    passage_text: str,
    start_span_list: List[List[int]] = None,
    end_span_list: List[List[int]] = None,
    passage_tokens: List[Token] = None,
    yesno_list: List[int] = None,
    followup_list: List[int] = None,
    additional_metadata: Dict[str, Any] = None,
) -> Instance:
    """Build one instance from a passage and the questions asked about it.

    Character-level answer spans are converted to token-level spans with
    ``util.char_span_to_token_span``; spans the helper flags as erroneous are
    logged at debug level and still kept. The result is assembled by
    ``util.make_reading_comprehension_instance_quac``.

    Parameters
    ----------
    question_text_list:
        Raw question strings; each is tokenized with ``self._tokenizer``.
    passage_text:
        Raw passage text that the character offsets index into.
    start_span_list, end_span_list:
        Parallel lists (one entry per question) of character start / end
        offsets. ``None`` is treated as "no spans".
    passage_tokens:
        Tokens of the passage; each must expose ``idx`` and ``text``. When
        ``None`` the passage is tokenized with ``self._tokenizer``.
    yesno_list, followup_list:
        Forwarded unchanged to the instance builder.
    additional_metadata:
        Extra metadata forwarded to the instance builder. If it contains
        ``"answer_texts_list"``, each entry is passed through
        ``util.handle_cannot``. The caller's dict is never modified.

    Returns
    -------
    The ``Instance`` produced by
    ``util.make_reading_comprehension_instance_quac``.
    """
    # Every optional argument defaults to None, so normalise them up front
    # instead of failing with a TypeError further down.
    if passage_tokens is None:
        # NOTE(review): assumes the tokenizer's tokens carry `idx` offsets.
        passage_tokens = self._tokenizer.tokenize(passage_text)
    if start_span_list is None:
        start_span_list = []
    if end_span_list is None:
        end_span_list = []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    answer_token_span_list = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for start_list, end_list in zip(start_span_list, end_span_list):
        token_spans: List[Tuple[int, int]] = []
        for char_span_start, char_span_end in zip(start_list, end_list):
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end)
            )
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            # The span is kept even when flagged; the flag only triggers logging.
            token_spans.append((span_start, span_end))
        answer_token_span_list.append(token_spans)

    question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]

    # Work on a shallow copy so the caller's metadata dict is left untouched.
    metadata: Dict[str, Any] = dict(additional_metadata) if additional_metadata is not None else {}
    if "answer_texts_list" in metadata:
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        metadata["answer_texts_list"] = [
            util.handle_cannot(ans_list) for ans_list in metadata["answer_texts_list"]
        ]

    return util.make_reading_comprehension_instance_quac(
        question_list_tokens,
        passage_tokens,
        self._token_indexers,
        passage_text,
        answer_token_span_list,
        yesno_list,
        followup_list,
        metadata,
        self._num_context_answers,
    )
def text_to_instance(self,  # type: ignore
                     question_text_list: List[str],
                     passage_text: str,
                     start_span_list: List[List[int]] = None,
                     end_span_list: List[List[int]] = None,
                     passage_tokens: List[Token] = None,
                     yesno_list: List[int] = None,
                     followup_list: List[int] = None,
                     additional_metadata: Dict[str, Any] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Turn a passage plus its list of questions into a single ``Instance``.

    Per-question character answer spans (``start_span_list`` / ``end_span_list``,
    parallel lists) are mapped onto token spans over ``passage_tokens`` via
    ``util.char_span_to_token_span``. When that helper reports an error the details
    are written to the debug log, and the returned span is used regardless.

    All arguments after ``passage_text`` are optional: missing span lists are
    treated as empty, a missing ``passage_tokens`` is produced by tokenizing
    ``passage_text`` with ``self._tokenizer``, and a missing ``additional_metadata``
    becomes an empty dict. If the metadata has an ``'answer_texts_list'`` entry,
    every element of it is run through ``util.handle_cannot``; this is done on a
    copy, so the dict passed in by the caller is not changed.

    Returns whatever ``util.make_reading_comprehension_instance_quac`` builds.
    """
    # The defaults are None; substitute usable values so omitting an argument
    # does not end in a TypeError when it is iterated or indexed below.
    if passage_tokens is None:
        # NOTE(review): relies on the tokenizer populating `Token.idx` - confirm.
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_starts = [] if start_span_list is None else start_span_list
    char_ends = [] if end_span_list is None else end_span_list

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    answer_token_span_list = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for start_list, end_list in zip(char_starts, char_ends):
        token_spans: List[Tuple[int, int]] = []
        for char_span_start, char_span_end in zip(start_list, end_list):
            (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                         (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        answer_token_span_list.append(token_spans)

    question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]

    # Copy before rewriting so the caller's dict is not mutated behind its back.
    metadata: Dict[str, Any] = {} if additional_metadata is None else dict(additional_metadata)
    if 'answer_texts_list' in metadata:
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        metadata['answer_texts_list'] = [util.handle_cannot(ans_list)
                                         for ans_list in metadata['answer_texts_list']]

    return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                         passage_tokens,
                                                         self._token_indexers,
                                                         passage_text,
                                                         answer_token_span_list,
                                                         yesno_list,
                                                         followup_list,
                                                         metadata,
                                                         self._num_context_answers)