def test_squad_with_unwordpieceable_passage(self):
    tokenizer = SpacyTokenizer()
    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = (
        "There were four major HDTV systems tested by SMPTE in the late 1970s, "
        "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
    )
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = (
        "Broca, being what today would be called a neurosurgeon, "
        "had taken an interest in the pathology of speech. He wanted "
        "to localize the difference between man and the other animals, "
        "which appeared to reside in speech. He discovered the speech "
        "center of the human brain, today called Broca's area after him. "
        "His interest was mainly in Biological anthropology, but a German "
        "philosopher specializing in psychology, Theodor Waitz, took up the "
        "theme of general and social anthropology in his six-volume work, "
        "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
        'soon translated as "The Anthropology of Primitive Peoples". '
        "The last two volumes were published posthumously."
    )
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import (
        make_reading_comprehension_instance,
    )

    instance1 = make_reading_comprehension_instance(
        tokenizer.tokenize(question1),
        tokenizer.tokenize(passage1),
        {"bert": token_indexer},
        passage1,
    )
    instance2 = make_reading_comprehension_instance(
        tokenizer.tokenize(question2),
        tokenizer.tokenize(passage2),
        {"bert": token_indexer},
        passage2,
    )

    vocab = Vocabulary()

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    qtokens = tensor_dict["question"]
    ptokens = tensor_dict["passage"]

    config = BertConfig(len(token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
    _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
def test_squad_with_unwordpieceable_passage(self):
    # pylint: disable=line-too-long
    tokenizer = WordTokenizer()
    token_indexer = PretrainedBertIndexer("bert-base-uncased")

    passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
    question1 = "Who released A Study of High Definition Television Systems?"

    passage2 = ("Broca, being what today would be called a neurosurgeon, "
                "had taken an interest in the pathology of speech. He wanted "
                "to localize the difference between man and the other animals, "
                "which appeared to reside in speech. He discovered the speech "
                "center of the human brain, today called Broca's area after him. "
                "His interest was mainly in Biological anthropology, but a German "
                "philosopher specializing in psychology, Theodor Waitz, took up the "
                "theme of general and social anthropology in his six-volume work, "
                "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                'soon translated as "The Anthropology of Primitive Peoples". '
                "The last two volumes were published posthumously.")
    question2 = "What did Broca discover in the human brain?"

    from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

    instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                    tokenizer.tokenize(passage1),
                                                    {"bert": token_indexer},
                                                    passage1)
    instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                    tokenizer.tokenize(passage2),
                                                    {"bert": token_indexer},
                                                    passage2)

    vocab = Vocabulary()

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    qtokens = tensor_dict["question"]
    ptokens = tensor_dict["passage"]

    config = BertConfig(len(token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
    _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                     (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))

    return util.make_reading_comprehension_instance(self._claim_tokenizer.tokenize(question_text),
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     char_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                     (char_span_start, char_span_end))
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))

    return util.make_reading_comprehension_instance(self._tokenizer.tokenize(question_text),
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
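The character-to-token span conversion these readers rely on can be illustrated with a small self-contained sketch. This is only an illustration of the idea, not the actual `util.char_span_to_token_span` implementation; the token offsets are the same `(token.idx, token.idx + len(token.text))` pairs computed above, and the helper name is made up.

from typing import List, Tuple

def char_span_to_token_span_sketch(token_offsets: List[Tuple[int, int]],
                                   char_span: Tuple[int, int]) -> Tuple[Tuple[int, int], bool]:
    """Map a (start, end) character span onto token indices, flagging inexact matches."""
    char_start, char_end = char_span
    start_index = 0
    # Advance until the first token that ends after the answer starts.
    while start_index < len(token_offsets) and token_offsets[start_index][1] <= char_start:
        start_index += 1
    start_index = min(start_index, len(token_offsets) - 1)
    end_index = start_index
    # Advance until the token that covers the end of the answer.
    while end_index < len(token_offsets) - 1 and token_offsets[end_index][1] < char_end:
        end_index += 1
    # If token boundaries don't line up exactly with the character span, report an error
    # flag so the caller can log it (as the readers above do with logger.debug).
    error = (token_offsets[start_index][0] != char_start
             or token_offsets[end_index][1] != char_end)
    return (start_index, end_index), error

# Example offsets for the tokens "Broca", ",", "being" in "Broca, being ...".
offsets = [(0, 5), (5, 6), (7, 12)]
print(char_span_to_token_span_sketch(offsets, (0, 5)))   # ((0, 0), False)
print(char_span_to_token_span_sketch(offsets, (7, 12)))  # ((2, 2), False)
print(char_span_to_token_span_sketch(offsets, (1, 5)))   # ((0, 0), True) -> misaligned span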
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    answer: bool = None,
    passage_tokens: List[Token] = None,
    additional_metadata: Dict[str, Any] = None,
) -> Optional[Instance]:
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    question_tokens = self._tokenizer.tokenize(question_text)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[:self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[:self.question_length_limit]
    # This variant passes a boolean `answer` (rather than token spans) to the instance builder.
    return util.make_reading_comprehension_instance(
        question_tokens,
        passage_tokens,
        self._token_indexers,
        answer,
        passage_text,
        additional_metadata,
    )
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    char_spans: List[Tuple[int, int]] = None,
    answer_texts: List[str] = None,
    passage_tokens: List[Token] = None,
    additional_metadata: Dict[str, Any] = None,
) -> Optional[Instance]:
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    question_tokens = self._tokenizer.tokenize(question_text)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[:self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[:self.question_length_limit]
    char_spans = char_spans or []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        if char_span_end > passage_offsets[-1][1]:
            continue
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end)
        )
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))

    # The original answer is filtered out
    if char_spans and not token_spans:
        if self.skip_invalid_examples:
            return None
        else:
            token_spans.append((len(passage_tokens) - 1, len(passage_tokens) - 1))

    return util.make_reading_comprehension_instance(
        question_tokens,
        passage_tokens,
        self._token_indexers,
        passage_text,
        token_spans,
        answer_texts,
        additional_metadata,
    )
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    char_spans: List[Tuple[int, int]] = None,
    answer_texts: List[str] = None,
    passage_tokens: List[Token] = None,
    max_passage_len: int = None,
    max_question_len: int = None,
    drop_invalid: bool = False,
) -> Optional[Instance]:
    """
    Truncate the passage and question to ``max_passage_len`` and ``max_question_len`` tokens,
    respectively. If ``drop_invalid`` is true, examples with no valid answer span are dropped.
    """
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    question_tokens = self._tokenizer.tokenize(question_text)
    if max_passage_len is not None:
        passage_tokens = passage_tokens[:max_passage_len]
    if max_question_len is not None:
        question_tokens = question_tokens[:max_question_len]
    char_spans = char_spans or []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        if char_span_end > passage_offsets[-1][1]:
            continue
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end)
        )
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))

    if not token_spans:
        if drop_invalid:
            return None
        else:
            token_spans.append((0, 0))

    return util.make_reading_comprehension_instance(
        question_tokens,
        passage_tokens,
        self._token_indexers,
        passage_text,
        token_spans,
        answer_texts,
    )
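A hedged usage sketch for this variant. `reader` is hypothetical and stands for whatever dataset reader defines the method above; the passage, question, and character offsets are made up, and `char_spans` uses the same end-exclusive convention as `passage_text[char_span_start:char_span_end]` in the logging code.

passage = "Hamlet is a tragedy written by William Shakespeare."
question = "Who wrote Hamlet?"
answer = "William Shakespeare"
start = passage.index(answer)                 # 31
char_spans = [(start, start + len(answer))]   # end-exclusive character span

instance = reader.text_to_instance(           # `reader` is an assumption for illustration
    question_text=question,
    passage_text=passage,
    char_spans=char_spans,
    answer_texts=[answer],
    max_passage_len=400,
    max_question_len=50,
    drop_invalid=True,                        # return None instead of falling back to (0, 0)
)
if instance is not None:
    print(instance.fields.keys())             # an AllenNLP Instance exposes its fields as a dict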
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    char_spans: List[Tuple[int, int]] = None,
    answer_texts: List[str] = None,
    passage_tokens: List[Token] = None,
) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end)
        )
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))

    return util.make_reading_comprehension_instance(
        self._tokenizer.tokenize(question_text),
        passage_tokens,
        self._token_indexers,
        passage_text,
        token_spans,
        answer_texts,
    )
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    token_spans: List[Tuple[int, int]] = None,
    answer_texts: List[str] = None,
    question_tokens: List[Token] = None,
    passage_tokens: List[Token] = None,
) -> Instance:
    # pylint: disable=arguments-differ
    if not question_tokens:
        question_tokens = self._tokenizer.tokenize(question_text)
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    return util.make_reading_comprehension_instance(
        question_tokens,
        passage_tokens,
        self._token_indexers,
        passage_text,
        token_spans,
        answer_texts,
    )
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     token_spans: List[Tuple[int, int]] = None,
                     answer_texts: List[str] = None,
                     question_tokens: List[Token] = None,
                     passage_tokens: List[Token] = None) -> Instance:
    # pylint: disable=arguments-differ
    if not question_tokens:
        question_tokens = self._tokenizer.tokenize(question_text)
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    return util.make_reading_comprehension_instance(question_tokens,
                                                    passage_tokens,
                                                    self._token_indexers,
                                                    passage_text,
                                                    token_spans,
                                                    answer_texts)
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    question_id: str = None,
    passage_id: str = None,
    answer_annotations: List[Dict] = None,
    passage_tokens: List[Token] = None,
) -> Union[Instance, None]:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_tokens = split_tokens_by_hyphen(passage_tokens)
    question_tokens = self._tokenizer.tokenize(question_text)
    question_tokens = split_tokens_by_hyphen(question_tokens)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[:self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[:self.question_length_limit]

    answer_type, answer_texts = None, []
    if answer_annotations:
        # Currently we only use the first annotated answer; this doesn't affect training,
        # because the train set has only one annotation per question.
        answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

    # Tokenize the answer text in order to find the matched span based on tokens.
    tokenized_answer_texts = []
    for answer_text in answer_texts:
        answer_tokens = self._tokenizer.tokenize(answer_text)
        answer_tokens = split_tokens_by_hyphen(answer_tokens)
        tokenized_answer_texts.append(" ".join(token.text for token in answer_tokens))

    if self.instance_format == "squad":
        valid_passage_spans = (
            self.find_valid_spans(passage_tokens, tokenized_answer_texts)
            if tokenized_answer_texts else []
        )
        if not valid_passage_spans:
            if "passage_span" in self.skip_when_all_empty:
                return None
            else:
                valid_passage_spans.append((len(passage_tokens) - 1, len(passage_tokens) - 1))
        return make_reading_comprehension_instance(
            question_tokens,
            passage_tokens,
            self._token_indexers,
            passage_text,
            valid_passage_spans,
            # this `answer_texts` will not be used for evaluation
            answer_texts,
            additional_metadata={
                "original_passage": passage_text,
                "original_question": question_text,
                "passage_id": passage_id,
                "question_id": question_id,
                "valid_passage_spans": valid_passage_spans,
                "answer_annotations": answer_annotations,
            },
        )
    elif self.instance_format == "bert":
        question_concat_passage_tokens = question_tokens + [Token("[SEP]")] + passage_tokens
        valid_passage_spans = []
        for span in self.find_valid_spans(passage_tokens, tokenized_answer_texts):
            # This span is for `question + [SEP] + passage`.
            valid_passage_spans.append(
                (span[0] + len(question_tokens) + 1, span[1] + len(question_tokens) + 1)
            )
        if not valid_passage_spans:
            if "passage_span" in self.skip_when_all_empty:
                return None
            else:
                valid_passage_spans.append(
                    (len(question_concat_passage_tokens) - 1, len(question_concat_passage_tokens) - 1)
                )
        answer_info = {
            "answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
            "answer_passage_spans": valid_passage_spans,
        }
        return self.make_bert_drop_instance(
            question_tokens,
            passage_tokens,
            question_concat_passage_tokens,
            self._token_indexers,
            passage_text,
            answer_info,
            additional_metadata={
                "original_passage": passage_text,
                "original_question": question_text,
                "passage_id": passage_id,
                "question_id": question_id,
                "answer_annotations": answer_annotations,
            },
        )
    elif self.instance_format == "drop":
        numbers_in_passage = []
        number_indices = []
        for token_index, token in enumerate(passage_tokens):
            number = self.convert_word_to_number(token.text)
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(token_index)
        # hack to guarantee a minimal length for the padded numbers
        numbers_in_passage.append(0)
        number_indices.append(-1)
        numbers_as_tokens = [Token(str(number)) for number in numbers_in_passage]

        valid_passage_spans = (
            self.find_valid_spans(passage_tokens, tokenized_answer_texts)
            if tokenized_answer_texts else []
        )
        valid_question_spans = (
            self.find_valid_spans(question_tokens, tokenized_answer_texts)
            if tokenized_answer_texts else []
        )

        target_numbers = []
        # `answer_texts` is a list of valid answers.
        for answer_text in answer_texts:
            number = self.convert_word_to_number(answer_text)
            if number is not None:
                target_numbers.append(number)

        valid_signs_for_add_sub_expressions = []
        valid_counts = []
        if answer_type in ["number", "date"]:
            valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(
                numbers_in_passage, target_numbers
            )
        if answer_type in ["number"]:
            # Currently we only support counting numbers 0 ~ 9
            numbers_for_count = list(range(10))
            valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

        type_to_answer_map = {
            "passage_span": valid_passage_spans,
            "question_span": valid_question_spans,
            "addition_subtraction": valid_signs_for_add_sub_expressions,
            "counting": valid_counts,
        }
        if self.skip_when_all_empty and not any(
            type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty
        ):
            return None

        answer_info = {
            "answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
            "answer_passage_spans": valid_passage_spans,
            "answer_question_spans": valid_question_spans,
            "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
            "counts": valid_counts,
        }
        return self.make_marginal_drop_instance(
            question_tokens,
            passage_tokens,
            numbers_as_tokens,
            number_indices,
            self._token_indexers,
            passage_text,
            answer_info,
            additional_metadata={
                "original_passage": passage_text,
                "original_question": question_text,
                "original_numbers": numbers_in_passage,
                "passage_id": passage_id,
                "question_id": question_id,
                "answer_info": answer_info,
                "answer_annotations": answer_annotations,
            },
        )
    else:
        raise ValueError(
            f"Expect the instance format to be \"drop\", \"squad\" or \"bert\", "
            f"but got {self.instance_format}"
        )
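The `find_valid_add_sub_expressions` helper used in the `drop` branch is not shown here. The idea behind it can be sketched standalone: enumerate sign assignments (plus, minus, or ignore) over the numbers found in the passage and keep the assignments whose signed sum equals an annotated target number. This is only an illustration of that idea, not the reader's actual helper; the function name, the limit on enumerated numbers, and the 0/1/2 sign encoding are assumptions.

import itertools
from typing import List

def find_valid_add_sub_expressions_sketch(numbers: List[float],
                                          targets: List[float],
                                          max_numbers: int = 5) -> List[List[int]]:
    """Enumerate sign assignments over `numbers` whose signed sum hits any target.

    Each result is aligned with `numbers`, using 0 for "unused", 1 for "+number",
    and 2 for "-number" (encoding assumed here for illustration).
    """
    valid: List[List[int]] = []
    # Limit the search to the first few numbers to keep the enumeration small.
    usable = numbers[:max_numbers]
    for signs in itertools.product((0, 1, -1), repeat=len(usable)):
        if not any(signs):
            continue  # skip the all-unused assignment
        total = sum(sign * number for sign, number in zip(signs, usable))
        if total in targets:
            encoded = [1 if s == 1 else 2 if s == -1 else 0 for s in signs]
            valid.append(encoded + [0] * (len(numbers) - len(usable)))
    return valid

# Example: the passage mentions 3, 7 and 12, and the annotated answer is 4 (= 7 - 3).
print(find_valid_add_sub_expressions_sketch([3, 7, 12], [4]))  # [[2, 1, 0]]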
def text_to_instance(
    self,  # type: ignore
    question_text: str,
    passage_text: str,
    char_spans: List[Tuple[int, int]] = None,
    answer_texts: List[str] = None,
    passage_tokens: List[Token] = None,
) -> Instance:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
    char_spans = char_spans or []

    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    # The end offset is computed after stripping "_" characters from the token text
    # (e.g. BPE-style markers) so that offsets line up with the original passage.
    token_spans: List[Tuple[int, int]] = []
    passage_offsets = [(token.idx, token.idx + len(token.text.replace("_", "")))
                       for token in passage_tokens]
    for char_span_start, char_span_end in char_spans:
        (span_start, span_end), error = util.char_span_to_token_span(
            passage_offsets, (char_span_start, char_span_end)
        )
        if error:
            logger.debug("Passage: %s", passage_text)
            logger.debug("Passage tokens: %s", passage_tokens)
            logger.debug("Question text: %s", question_text)
            logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
            logger.debug("Token span: (%d, %d)", span_start, span_end)
            logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
            logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
        token_spans.append((span_start, span_end))

    return util.make_reading_comprehension_instance(
        self._tokenizer.tokenize(question_text),
        passage_tokens,
        self._token_indexers,
        passage_text,
        token_spans,
        answer_texts,
    )
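This last variant computes end offsets after stripping an underscore marker from the token text, which matters when the tokenizer emits BPE or sentencepiece-style pieces whose surface form carries a marker character that does not appear in the original passage. A minimal standalone sketch of that offset computation follows; the `Piece` structure, the "_" marker, and the example segmentation are assumptions for illustration.

from typing import List, NamedTuple, Tuple

class Piece(NamedTuple):
    text: str  # surface form, possibly carrying a "_" word-boundary marker
    idx: int   # character offset of the piece in the original passage

def marker_stripped_offsets(pieces: List[Piece], marker: str = "_") -> List[Tuple[int, int]]:
    """End offsets use the marker-free length so they line up with passage characters."""
    return [(piece.idx, piece.idx + len(piece.text.replace(marker, ""))) for piece in pieces]

# "Broca's area" might be pieced as "_Bro", "ca", "'s", "_area" with offsets pointing into
# the original string; stripping "_" keeps the spans aligned with the passage text.
pieces = [Piece("_Bro", 0), Piece("ca", 3), Piece("'s", 5), Piece("_area", 8)]
print(marker_stripped_offsets(pieces))  # [(0, 3), (3, 5), (5, 7), (8, 12)]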