def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans_sent: List[Tuple[int, int]] = None,
                     sent_labels: List[int] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None,
                     passage_offsets: List[Tuple] = None) -> Instance:
    # Convert the character-level sentence spans to token-level spans.
    token_spans_sent: List[Tuple[int, int]] = []
    for char_span_sent_start, char_span_sent_end in char_spans_sent:
        (span_start_sent, span_end_sent), error = util.char_span_to_token_span(
            passage_offsets, (char_span_sent_start, char_span_sent_end))
        token_spans_sent.append((span_start_sent, span_end_sent))
    tokenized_ques = self._tokenizer.tokenize(question_text)
    tokenized_ques = [Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques]
    return make_reading_comprehension_instance(tokenized_ques,
                                               passage_tokens,
                                               self._token_indexers,
                                               passage_text,
                                               token_spans_sent,
                                               sent_labels,
                                               answer_texts,
                                               passage_offsets)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
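# All the snippets in this file drive the same conversion, so here is a minimal,
# self-contained sketch of it (not taken from any reader above): build per-token
# character offsets, then map a character span to an inclusive token span. The
# passage and answer are made up, and the import paths assume a pre-1.0 AllenNLP
# layout; newer releases relocate both `util` and the tokenizer classes.
from allennlp.data.dataset_readers.reading_comprehension import util
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
passage = "The quick brown fox jumps over the lazy dog."
tokens = tokenizer.tokenize(passage)
offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

answer = "brown fox"
char_start = passage.index(answer)
(token_start, token_end), error = util.char_span_to_token_span(
    offsets, (char_start, char_start + len(answer)))
# `error` is True when the span did not align cleanly with token boundaries and
# the function had to guess; the returned token span is inclusive on both sides.
assert (token_start, token_end) == (2, 3)
assert not error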
def text_to_instance(self,  # type: ignore
                     tokenized_stand_alone_ques: List[Token],
                     question_text: str,
                     passage_text: str,
                     sent_labels: List[int] = None,
                     answer_texts: List[str] = None,
                     passage_sent_tokens: List[List[Token]] = None,
                     evd_possible_chains: List[List[int]] = None,
                     ans_sent_idxs: List[int] = None,
                     sents_span: List[Tuple[int, int]] = None,
                     sents_offset: List[Tuple] = None,
                     article_id: str = None) -> Instance:
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans_sent: List[Tuple[int, int]] = []
    for i, (char_sent_start, char_sent_end) in enumerate(sents_span):
        sent_offset = sents_offset[i]
        total_wordpiece = 0
        try:
            (span_start, span_end), error = util.char_span_to_token_span(
                sent_offset, (char_sent_start, char_sent_end))
            # Count word pieces token by token; if the sentence overflows the
            # budget at token j, clamp the span so it ends before j.
            for j, token in enumerate(passage_sent_tokens[i]):
                total_wordpiece += len(wordpiece_tokenizer(token.text.lower()))
                if total_wordpiece >= self._word_piece_limit:
                    break
            if span_end >= j or span_start >= j:
                span_start = j - 1 if span_start >= j else span_start
                span_end = j - 1 if span_end >= j else span_end
        except IndexError:
            logger.warning("Bad sentence offsets %s for char span (%d, %d)",
                           sent_offset, char_sent_start, char_sent_end)
        token_spans_sent.append((span_start, span_end))
    return make_reading_comprehension_instance(tokenized_stand_alone_ques,
                                               passage_sent_tokens,
                                               self._token_indexers,
                                               passage_text,
                                               sent_labels,
                                               answer_texts,
                                               evd_possible_chains,
                                               ans_sent_idxs,
                                               token_spans_sent,
                                               article_id,
                                               para_limit=self._para_limit)
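# A hedged sketch of the word-piece budget clamp used just above, pulled out as a
# standalone helper. `wordpiece_len` is a stand-in for whatever word-piece
# tokenizer the reader actually uses; this mirrors the original logic (including
# its quirk of pulling a span at the final token back by one) rather than
# defining a canonical implementation.
def clamp_span_to_wordpiece_limit(tokens, span, wordpiece_len, limit):
    total = 0
    for j, token in enumerate(tokens):
        total += wordpiece_len(token.text.lower())
        if total >= limit:
            break
    # A span reaching token j (the overflow point, or the last token when
    # nothing overflowed) is pulled back to j - 1, as in the reader above.
    start, end = span
    if start >= j:
        start = j - 1
    if end >= j:
        end = j - 1
    return start, end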
def test_char_span_to_token_span_handles_hard_cases(self):
    # An earlier version of the code had a hard time when the answer was the last token in the
    # passage.  This tests that case, on the instance that used to fail.
    tokenizer = SpacyTokenizer()
    passage = (
        "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
        + 'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
        + "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
        + "Z's girlfriend in the music video for the song, which would further fuel "
        + "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
        + "married without publicity. As of April 2014, the couple have sold a combined 300 "
        + "million records together. The couple are known for their private relationship, "
        + "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
        + 'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
        + "she had ever endured. She returned to the studio and wrote music in order to cope "
        + "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
        + "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
    )
    start = 912
    end = 912 + len("Paris.")
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    token_span = util.char_span_to_token_span(offsets, (start, end))[0]
    assert token_span == (184, 185)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    return util.make_reading_comprehension_instance(self._claim_tokenizer.tokenize(question_text),
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
def test_char_span_to_token_span_handles_easy_cases(self):
    # These are _inclusive_ spans, on both sides.
    tokenizer = WordTokenizer()
    passage = "On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy " +\
              "Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four " +\
              "nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her " +\
              "first performances since giving birth to Blue Ivy."
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    # "January 7, 2012"
    token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
    assert token_span == (1, 4)
    # "Lenox Hill Hospital"
    token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
    assert token_span == (22, 24)
    # "Lenox Hill Hospital in New York."
    token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
    assert token_span == (22, 28)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passages_texts: List[str],
                     qid: int,
                     answer_texts: List[str] = None,
                     char_spans: List[List[Tuple[int, int]]] = None,
                     max_passage_len: int = None,
                     max_question_len: int = None,
                     drop_invalid: bool = False) -> Optional[Instance]:
    """
    We truncate the passages and the question according to `max_passage_len` and
    `max_question_len` here, and drop invalid examples if `drop_invalid` is true.
    """
    passages_tokens = [self._tokenizer.tokenize(passage_text) for passage_text in passages_texts]
    question_tokens = self._tokenizer.tokenize(question_text)
    if max_passage_len is not None:
        passages_tokens = [passage_tokens[:max_passage_len] for passage_tokens in passages_tokens]
    if max_question_len is not None:
        question_tokens = question_tokens[:max_question_len]
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    passages_offsets = [[(token.idx, token.idx + len(token.text)) for token in passage_tokens]
                        for passage_tokens in passages_tokens]
    token_spans = []
    for passage_id, span_in_passage in enumerate(char_spans):
        passage_offsets = passages_offsets[passage_id]
        passage_token_spans: List[Tuple[int, int]] = []
        for char_span_start, char_span_end in span_in_passage:
            # Skip spans that fall past the (possibly truncated) passage.
            if char_span_end > passage_offsets[-1][1]:
                continue
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passages_texts[passage_id])
                logger.debug("Passage tokens: %s", passages_tokens[passage_id])
                logger.debug("Question text: %s", question_text)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s",
                             passages_tokens[passage_id][span_start:span_end + 1])
                logger.debug("Answer: %s",
                             passages_texts[passage_id][char_span_start:char_span_end])
            passage_token_spans.append((span_start, span_end))
        if not passage_token_spans:
            if drop_invalid:
                return None
            passage_token_spans.append((-1, -1))
        token_spans.append(passage_token_spans)
    return self.make_MSMARCO_MultiPassage_instance(question_tokens,
                                                   passages_tokens,
                                                   self._token_indexers,
                                                   passages_texts,
                                                   qid,
                                                   token_spans,
                                                   answer_texts)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None,
                     additional_metadata: Dict[str, Any] = None) -> Optional[Instance]:
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    question_tokens = self._tokenizer.tokenize(question_text)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[:self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[:self.question_length_limit]
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        # Skip spans that fall past the (possibly truncated) passage.
        if char_span_end > passage_offsets[-1][1]:
            continue
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    # All the original answer spans were filtered out by the truncation above.
    if char_spans and not token_spans:
        if self.skip_invalid_examples:
            return None
        token_spans.append((len(passage_tokens) - 1, len(passage_tokens) - 1))
    return util.make_reading_comprehension_instance(question_tokens,
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts,
                                                    additional_metadata)
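# The truncation pattern shared by the length-limited readers above, as a small
# hedged sketch: truncate the token list first, then use the last surviving
# token's end offset to decide which gold character spans are still
# representable. The helper name and signature are illustrative, not from any
# reader above.
def truncate_and_filter(passage_tokens, char_spans, limit):
    passage_tokens = passage_tokens[:limit]
    passage_offsets = [(t.idx, t.idx + len(t.text)) for t in passage_tokens]
    last_char = passage_offsets[-1][1]
    # A span ending beyond the truncated passage can no longer be supervised.
    kept = [(start, end) for start, end in char_spans if end <= last_char]
    return passage_tokens, kept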
def text_to_instance(self,  # type: ignore
                     question_text_list: List[str],
                     passage_text: str,
                     start_span_list: List[List[int]] = None,
                     end_span_list: List[List[int]] = None,
                     passage_tokens: List[Token] = None,
                     yesno_list: List[int] = None,
                     followup_list: List[int] = None,
                     additional_metadata: Dict[str, Any] = None) -> Instance:
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    answer_token_span_list = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for start_list, end_list in zip(start_span_list, end_span_list):
        token_spans: List[Tuple[int, int]] = []
        for char_span_start, char_span_end in zip(start_list, end_list):
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        answer_token_span_list.append(token_spans)
    question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
    # Map answer texts to "CANNOTANSWER" if more than half of them are marked as such.
    additional_metadata["answer_texts_list"] = [
        util.handle_cannot(ans_list) for ans_list in additional_metadata["answer_texts_list"]
    ]
    return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                         passage_tokens,
                                                         self._token_indexers,
                                                         passage_text,
                                                         answer_token_span_list,
                                                         yesno_list,
                                                         followup_list,
                                                         additional_metadata,
                                                         self._num_context_answers)
def get_gold_token_spans(tokenizer, gold_char_spans, context):
    # Adapted from AllenNLP.
    passage_tokens = tokenizer.tokenize(context)
    token_spans = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in gold_char_spans:
        # Skip spans that fall past the end of the passage.
        if char_span_end > passage_offsets[-1][1]:
            continue
        (span_start, span_end), error = char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        token_spans.append((span_start, span_end))
    return token_spans
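# Hypothetical usage of the `get_gold_token_spans` helper above, assuming an
# AllenNLP word-level tokenizer; the context and the character span are made up.
tokenizer = WordTokenizer()
context = "Marie Curie won the Nobel Prize in 1903 and again in 1911."
gold_char_spans = [(0, 11)]  # "Marie Curie"
print(get_gold_token_spans(tokenizer, gold_char_spans, context))  # -> [(0, 1)]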
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None,
                     max_passage_len: int = None,
                     max_question_len: int = None,
                     drop_invalid: bool = False) -> Optional[Instance]:
    """
    We truncate the passage and the question according to `max_passage_len` and
    `max_question_len` here, and drop invalid examples if `drop_invalid` is true.
    """
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    question_tokens = self._tokenizer.tokenize(question_text)
    if max_passage_len is not None:
        passage_tokens = passage_tokens[:max_passage_len]
    if max_question_len is not None:
        question_tokens = question_tokens[:max_question_len]
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        # Skip spans that fall past the (possibly truncated) passage.
        if char_span_end > passage_offsets[-1][1]:
            continue
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    if not token_spans:
        if drop_invalid:
            return None
        token_spans.append((0, 0))
    return util.make_reading_comprehension_instance(question_tokens,
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
def test_char_span_to_token_span_handles_out_of_bounds_start_end(self):
    tokenizer = SpacyTokenizer()
    passage = "This sentence is just for testing purposes"
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

    # Scenario 1: a negative start character index (this should really never happen).
    start = -1
    end = start + len("This")
    expected_span = (0, 0)
    token_span, error = util.char_span_to_token_span(offsets, (start, end))
    assert token_span == expected_span
    assert error

    # Scenario 2: the end character index exceeds the sentence length.
    start = 34
    end = start + len("purposes") + 1
    expected_span = (6, 6)
    token_span, error = util.char_span_to_token_span(offsets, (start, end))
    assert token_span == expected_span
    assert error
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
def test_char_span_to_token_span_handles_undertokenization(self):
    tokenizer = SpacyTokenizer()
    passage = "This sentence will have two under tokenized tokens, one#here and one at the#end"
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]

    # Scenario 1: under-tokenized in the middle of the sentence; look for the first part of the token.
    start = 52
    end = start + len("one")
    expected_span = (9, 9)  # the indices of the whole "one#here" token should be returned
    token_span, error = util.char_span_to_token_span(offsets, (start, end))
    assert token_span == expected_span
    assert error

    # Scenario 2: under-tokenized in the middle of the sentence; look for the second part of the token.
    start = 56
    end = start + len("here")
    expected_span = (9, 9)  # the indices of the whole "one#here" token should be returned
    token_span, error = util.char_span_to_token_span(offsets, (start, end))
    assert token_span == expected_span
    assert error

    # Scenario 3: under-tokenized at the end of the sentence; look for the first part of the token.
    start = 72
    end = start + len("the")
    expected_span = (13, 13)  # the indices of the whole "the#end" token should be returned
    token_span, error = util.char_span_to_token_span(offsets, (start, end))
    assert token_span == expected_span
    assert error

    # Scenario 4: under-tokenized at the end of the sentence; look for the second part of the token.
    # This used to cause an IndexError.
    start = 76
    end = start + len("end")
    expected_span = (13, 13)  # the indices of the whole "the#end" token should be returned
    token_span, error = util.char_span_to_token_span(offsets, (start, end))
    assert token_span == expected_span
    assert error
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     para_sent_char_spans: List[List[Tuple[int, int]]] = None,
                     sent_labels: List[int] = None,
                     answer_texts: List[str] = None,
                     passage_para_tokens: List[List[Token]] = None,
                     passage_para_offsets: List[List[Tuple]] = None,
                     evd_possible_chains: List[List[int]] = None,
                     ans_sent_idxs: List[int] = None,
                     article_id: str = None) -> Instance:
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans_passage_para: List[List[Tuple[int, int]]] = []
    for para_offset, para_char_spans in zip(passage_para_offsets, para_sent_char_spans):
        token_spans_para_sent: List[Tuple[int, int]] = []
        for char_span_sent_start, char_span_sent_end in zip(para_char_spans[0],
                                                            para_char_spans[1]):
            (span_start_sent, span_end_sent), error = util.char_span_to_token_span(
                para_offset, (char_span_sent_start, char_span_sent_end))
            token_spans_para_sent.append((span_start_sent, span_end_sent))
        token_spans_passage_para.append(token_spans_para_sent)
    tokenized_ques = self._tokenizer.tokenize(question_text)
    tokenized_ques = [Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques]
    return make_reading_comprehension_instance(tokenized_ques,
                                               passage_para_tokens,
                                               self._token_indexers,
                                               passage_text,
                                               token_spans_passage_para,
                                               sent_labels,
                                               answer_texts,
                                               passage_para_offsets,
                                               evd_possible_chains,
                                               ans_sent_idxs,
                                               article_id,
                                               para_limit=self._para_limit)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     question_id: str = None,
                     answer_text: str = None,
                     char_span_start: int = None,
                     passage_tokens: List[Token] = None,
                     answer_texts: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    question_tokens = self._tokenizer.tokenize(question_text)
    # Separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, self._token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, self._token_indexers)
    if answer_text:
        # SQuAD gives answer annotations as a character index into the paragraph, but we need a
        # token index for our models.  We convert them here.
        char_span_end = char_span_start + len(answer_text)
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", answer_text)
        fields['span_start'] = IndexField(span_start, passage_field)
        fields['span_end'] = IndexField(span_end, passage_field)
    metadata = {'original_passage': passage_text, 'token_offsets': passage_offsets}
    if question_id:
        metadata['question_id'] = question_id
    if answer_texts:
        metadata['answer_texts'] = answer_texts
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     question_text_list: List[str],
                     passage_text: str,
                     start_span_list: List[List[int]] = None,
                     end_span_list: List[List[int]] = None,
                     passage_tokens: List[Token] = None,
                     yesno_list: List[int] = None,
                     followup_list: List[int] = None,
                     additional_metadata: Dict[str, Any] = None) -> Instance:
    # pylint: disable=arguments-differ
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    answer_token_span_list = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for start_list, end_list in zip(start_span_list, end_span_list):
        token_spans: List[Tuple[int, int]] = []
        for char_span_start, char_span_end in zip(start_list, end_list):
            (span_start, span_end), error = util.char_span_to_token_span(
                passage_offsets, (char_span_start, char_span_end))
            if error:
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        answer_token_span_list.append(token_spans)
    question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
    # Map answer texts to "CANNOTANSWER" if more than half of them are marked as such.
    additional_metadata['answer_texts_list'] = [util.handle_cannot(ans_list) for ans_list
                                                in additional_metadata['answer_texts_list']]
    return util.make_reading_comprehension_instance_quac(question_list_tokens,
                                                         passage_tokens,
                                                         self._token_indexers,
                                                         passage_text,
                                                         answer_token_span_list,
                                                         yesno_list,
                                                         followup_list,
                                                         additional_metadata,
                                                         self._num_context_answers)
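# The `util.handle_cannot` call above normalizes QuAC reference answers: per the
# comment in the readers, if more than half of the references are "CANNOTANSWER"
# the list collapses to just that marker; otherwise the unanswerable references
# are filtered out. A toy illustration of that documented behavior (not
# re-verified against every AllenNLP version):
assert util.handle_cannot(["CANNOTANSWER", "CANNOTANSWER", "Paris"]) == ["CANNOTANSWER"]
assert util.handle_cannot(["Paris", "Paris", "CANNOTANSWER"]) == ["Paris", "Paris"]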
def test_char_span_to_token_span_handles_hard_cases(self):
    # An earlier version of the code had a hard time when the answer was the last token in the
    # passage.  This tests that case, on the instance that used to fail.
    tokenizer = WordTokenizer()
    passage = "Beyonc\u00e9 is believed to have first started a relationship with Jay Z " +\
              "after a collaboration on \"'03 Bonnie & Clyde\", which appeared on his seventh " +\
              "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay " +\
              "Z's girlfriend in the music video for the song, which would further fuel " +\
              "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were " +\
              "married without publicity. As of April 2014, the couple have sold a combined 300 " +\
              "million records together. The couple are known for their private relationship, " +\
              "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 " +\
              "suffered a miscarriage in 2010 or 2011, describing it as \"the saddest thing\" " +\
              "she had ever endured. She returned to the studio and wrote music in order to cope " +\
              "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order " +\
              "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
    start = 912
    end = 912 + len("Paris.")
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    token_span = util.char_span_to_token_span(offsets, (start, end))[0]
    assert token_span == (184, 185)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    # Note: underscores are stripped from the token text when computing offsets,
    # so that lengths line up with the original passage (this reader appears to
    # use BPE-style tokens with "_" markers).
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text.replace("_", "")))
                       for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     char_spans_sp: List[Tuple[int, int]] = None,
                     char_spans_sent: List[Tuple[int, int]] = None,
                     sent_labels: List[int] = None,
                     answer_texts: List[str] = None,
                     question_passage_tokens: List[Token] = None,
                     question_passage_offsets: List[Tuple[int, int]] = None,
                     article_id: str = None) -> Instance:
    # pylint: disable=arguments-differ
    char_spans = char_spans or []
    char_spans_sp = char_spans_sp or []
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    token_spans_sp: List[Tuple[int, int]] = []
    token_spans_sent: List[Tuple[int, int]] = []
    # Answer spans.
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            question_passage_offsets, (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", question_passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", question_passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))
    # Supporting-fact spans.
    for char_span_sp_start, char_span_sp_end in char_spans_sp:
        (span_start_sp, span_end_sp), error = util.char_span_to_token_span(
            question_passage_offsets, (char_span_sp_start, char_span_sp_end))
        token_spans_sp.append((span_start_sp, span_end_sp))
    # Sentence spans.
    for char_span_sent_start, char_span_sent_end in char_spans_sent:
        (span_start_sent, span_end_sent), error = util.char_span_to_token_span(
            question_passage_offsets, (char_span_sent_start, char_span_sent_end))
        token_spans_sent.append((span_start_sent, span_end_sent))
    tokenized_ques = self._tokenizer.tokenize(question_text)
    tokenized_ques = [Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques]
    return make_reading_comprehension_instance(tokenized_ques,
                                               question_passage_tokens,
                                               question_passage_offsets,
                                               self._token_indexers,
                                               passage_text,
                                               token_spans,
                                               token_spans_sp,
                                               token_spans_sent,
                                               sent_labels,
                                               answer_texts,
                                               additional_metadata={'_id': article_id},
                                               para_limit=self._para_limit)
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     fact_text: str,
                     answer_span: List[str],
                     answer_relations: List[str],
                     answer_starts: List[int] = None,
                     answer_id: int = None,
                     prefetched_sentences: Dict[str, List[str]] = None,
                     prefetched_indices: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question_text)
    fact_tokens = self._tokenizer.tokenize(fact_text)
    choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
    choice_kb_fields = []
    selected_tuples = []
    for choice in choice_text_list:
        kb_fields = []
        # Split the tuple budget between the two retrieval sources when both are enabled.
        if self._use_cskg and self._use_elastic_search:
            max_sents_per_source = int(self._max_tuples / 2)
        else:
            max_sents_per_source = self._max_tuples
        selected_hits = []
        if self._use_elastic_search:
            elastic_search_hits = self.get_elasticsearch_sentences(
                prefetched_sentences, prefetched_indices, answer_span, choice,
                question_text, fact_text, max_sents_per_source)
            selected_hits.extend(elastic_search_hits)
        if self._use_cskg:
            cskg_sentences = self.get_cskg_sentences(
                fact_text, answer_span, choice, max_sents_per_source)
            selected_hits.extend(cskg_sentences)
        # Add a dummy entry to capture the embedding link.
        if self._ignore_spans:
            selected_hits.append(fact_text + " || " + choice)
        else:
            for answer in set(answer_span):
                selected_hits.append(answer + " || " + choice)
        selected_tuples.append(selected_hits)
        for hit_text in selected_hits:
            kb_fields.append(TextField(self._tokenizer.tokenize(hit_text),
                                       self._token_indexers))
        choice_kb_fields.append(ListField(kb_fields))
    fields["choice_kb"] = ListField(choice_kb_fields)
    fields['fact'] = TextField(fact_tokens, self._token_indexers)
    if self._add_relation_labels:
        if answer_relations:
            relation_fields = []
            for relation in set(answer_relations):
                relation_fields.append(LabelField(relation, label_namespace="relation_labels"))
            fields["relations"] = ListField(relation_fields)
            selected_relations = self.collate_relations(answer_relations)
            fields["relation_label"] = MultiLabelField(selected_relations, "relation_labels")
        else:
            fields["relations"] = ListField([LabelField(-1,
                                                        label_namespace="relation_labels",
                                                        skip_indexing=True)])
            fields["relation_label"] = MultiLabelField([], "relation_labels")
    answer_fields = []
    answer_span_fields = []
    fact_offsets = [(token.idx, token.idx + len(token.text)) for token in fact_tokens]
    for idx, answer in enumerate(answer_span):
        answer_fields.append(TextField(self._tokenizer.tokenize(answer), self._token_indexers))
        if answer_starts:
            if len(answer_starts) <= idx:
                raise ValueError("Only {} answer_starts in json. "
                                 "Expected {} in {}".format(len(answer_starts),
                                                            len(answer_span), item_id))
            offset = answer_starts[idx]
        else:
            # Use `find`, which returns -1 when the span is missing (`index` would
            # raise ValueError before the check below could run).
            offset = fact_text.find(answer)
        if offset == -1:
            raise ValueError("Span: {} not found in fact: {}".format(answer, fact_text))
        tok_span, err = char_span_to_token_span(fact_offsets, (offset, offset + len(answer)))
        if err:
            logger.info("Could not find token spans for '{}' in '{}'. "
                        "Best guess: {} in {} at {}".format(answer, fact_text,
                                                            [offset, offset + len(answer)],
                                                            fact_offsets, tok_span))
        answer_span_fields.append(SpanField(tok_span[0], tok_span[1], fields['fact']))
    fields["answer_text"] = ListField(answer_fields)
    fields["answer_spans"] = ListField(answer_span_fields)
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField([TextField(x, self._token_indexers)
                                        for x in choices_tokens_list])
    if answer_id is not None:
        fields['answer_id'] = LabelField(answer_id, skip_indexing=True)
    metadata = {
        "id": item_id,
        "question_text": question_text,
        "fact_text": fact_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "fact_tokens": [x.text for x in fact_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        "answer_text": answer_span,
        "answer_start": answer_starts,
        "answer_span_fields": [(x.span_start, x.span_end) for x in answer_span_fields],
        "relations": answer_relations,
        "selected_tuples": selected_tuples,
    }
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)