def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    logger.info("Reading the dataset")
    kept_count, skip_count = 0, 0
    for passage_id, passage_info in dataset.items():
        passage_text = passage_info["passage"]
        passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_tokens = split_tokens_by_hyphen(passage_tokens)
        for question_answer in passage_info["qa_pairs"]:
            question_id = question_answer["query_id"]
            question_text = question_answer["question"].strip()
            answer_annotations = []
            if "answer" in question_answer:
                answer_annotations.append(question_answer["answer"])
            if "validated_answers" in question_answer:
                answer_annotations += question_answer["validated_answers"]
            instance = self.text_to_instance(question_text,
                                             passage_text,
                                             question_id,
                                             passage_id,
                                             answer_annotations,
                                             passage_tokens)
            if instance is not None:
                kept_count += 1
                yield instance
            else:
                skip_count += 1
    logger.info(f"Skipped {skip_count} questions, kept {kept_count} questions.")
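# For reference, a minimal sketch of the DROP-style JSON layout the readers in this file
# assume. The keys are taken from the accesses in the code; the exact shape of the
# "answer" dict follows the public DROP format and may differ for other datasets:
#
# {
#     "<passage_id>": {
#         "passage": "...passage text...",
#         "qa_pairs": [
#             {
#                 "query_id": "...",
#                 "question": "How many yards was the longest field goal?",
#                 "answer": {"number": "44", "spans": [],
#                            "date": {"day": "", "month": "", "year": ""}},
#                 "validated_answers": []   # optional, dev/test only
#             }
#         ]
#     }
# }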
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    for passage_id, passage_info in tqdm(dataset.items()):
        passage_text = passage_info["passage"].strip()
        if self.wordpiece_numbers:
            word_tokens = split_tokens_by_hyphen(
                self.number_tokenizer.tokenize(passage_text))
        else:
            word_tokens = self.tokenizer.tokenize(passage_text)
        numbers_in_passage = []
        number_indices = []
        number_words = []
        number_len = []
        passage_tokens = []
        curr_index = 0
        # Get all passage numbers
        for token in word_tokens:
            number = self.word_to_num(token.text)
            wordpieces = self.tokenizer.tokenize(token.text)
            num_wordpieces = len(wordpieces)
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(curr_index)
                number_words.append(token.text)
                number_len.append(num_wordpieces)
            passage_tokens += wordpieces
            curr_index += num_wordpieces
        # Process questions from this passage
        for question_answer in passage_info["qa_pairs"]:
            question_id = question_answer["query_id"]
            question_text = question_answer["question"].strip()
            answer_annotations = []
            if "answer" in question_answer:
                answer_type = get_answer_type(question_answer['answer'])
                # `answer_type` is already a type string, so compare it directly
                # against the allowed types instead of re-applying `get_answer_type`.
                if self.answer_type is not None and answer_type not in self.answer_type:
                    continue
                answer_annotations.append(question_answer["answer"])
                if self.question_type is not None and get_question_type(
                        question_text, answer_type, question_answer['answer'],
                        self.max_count) not in self.question_type:
                    continue
            if self.use_validated and "validated_answers" in question_answer:
                answer_annotations += question_answer["validated_answers"]
            instance = self.text_to_instance(
                question_text, passage_text, passage_tokens, numbers_in_passage,
                number_words, number_indices, number_len, question_id,
                passage_id, answer_annotations)
            if instance is not None:
                yield instance
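# `get_answer_type` is referenced but not defined in this snippet. Given the DROP
# answer annotation (a dict with "number", "spans" and "date" fields), it presumably
# maps an annotation to one of the type strings used by the readers. A rough sketch
# under that assumption (the exact labels, e.g. how multi-span answers are named,
# are guesses, not the repository's actual code):
def get_answer_type_sketch(answer: dict):
    if answer.get("number", "") != "":
        return "number"
    if answer.get("spans"):
        return "single_span" if len(answer["spans"]) == 1 else "multi_span"
    if any(answer.get("date", {}).values()):
        return "date"
    return None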
def get_instance(self, passage_info):
    passage_text = passage_info["context"].strip()
    if self.wordpiece_numbers:
        word_tokens = split_tokens_by_hyphen(
            self.number_tokenizer.tokenize(passage_text))
    else:
        word_tokens = self.tokenizer.tokenize(passage_text)
    numbers_in_passage = []
    number_indices = []
    number_words = []
    number_len = []
    passage_tokens = []
    curr_index = 0
    # Get all passage numbers
    for token in word_tokens:
        number = self.word_to_num(token.text, True)
        wordpieces = self.tokenizer.tokenize(token.text)
        num_wordpieces = len(wordpieces)
        if number is not None:
            numbers_in_passage.append(number)
            number_indices.append(curr_index)
            number_words.append(token.text)
            number_len.append(num_wordpieces)
        passage_tokens += wordpieces
        curr_index += num_wordpieces
    # Process questions from this passage
    for question_answer in passage_info["qa_pairs"]:
        question_id = question_answer["qid"]
        question_text = question_answer["question"].strip()
        dataset = question_answer["dataset"]
        if len(question_answer["answers"]) != 0 and question_answer["answers"][0][0] != "":
            answer_annotations = question_answer["answers"]
        else:
            answer_annotations = None
        instance = self.text_to_instance(question_text, passage_text,
                                         passage_tokens, numbers_in_passage,
                                         number_words, number_indices,
                                         number_len, question_id,
                                         answer_annotations, dataset)
        yield instance
def extract_passage_numbers(passage):
    word_tokens = split_tokens_by_hyphen(number_tokenizer.tokenize(passage))
    numbers_in_passage = []
    number_indices = []
    number_words = []
    number_len = []
    passage_tokens = []
    curr_index = 0
    # Get all passage numbers
    for token in word_tokens:
        number = _get_number_from_word(token.text)
        wordpieces = tokenizer.tokenize(token.text)
        num_wordpieces = len(wordpieces)
        if number is not None:
            numbers_in_passage.append(number)
            number_indices.append(token.idx)
            number_words.append(token.text)
            number_len.append(num_wordpieces)
        passage_tokens += wordpieces
        curr_index += num_wordpieces
    return numbers_in_passage, number_indices
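# The readers above rely on a word-to-number conversion (`_get_number_from_word`,
# `self.word_to_num`) whose implementation is not shown. A minimal, self-contained
# sketch of what such a helper could look like; the name and the coverage (plain
# numerals, comma-grouped numerals, a handful of number words) are assumptions, not
# the repository's actual code:
_SMALL_NUMBER_WORDS = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
}


def get_number_from_word_sketch(word: str):
    """Return a numeric value for `word`, or None if it does not look like a number."""
    text = word.lower().strip().strip(".,")
    if text in _SMALL_NUMBER_WORDS:
        return _SMALL_NUMBER_WORDS[text]
    try:
        value = float(text.replace(",", ""))  # handles "1,234" and "3.5"
    except ValueError:
        return None
    return int(value) if value.is_integer() else value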
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    instances = []
    failed_passages = []
    for passage_id, passage_info in tqdm(dataset.items()):
        passage_text = passage_info["passage"].strip()
        if self.wordpiece_numbers:
            word_tokens = split_tokens_by_hyphen(
                self.number_tokenizer.tokenize(passage_text))
        else:
            word_tokens = self.tokenizer.tokenize(passage_text)
        try:
            passage_spans = [] if self.extract_spans is False \
                else self.span_extractor.extract_passage_spans(
                    word_tokens, self.spans_labels, self.span_max_length)
        except Exception:
            # Avoid a bare `except:` so keyboard interrupts and system exits still propagate.
            failed_passages.append(passage_id)
            continue
        numbers_in_passage = []
        number_indices = []
        number_words = []
        number_len = []
        passage_tokens = []
        curr_index = 0
        # Get all passage numbers
        for token in word_tokens:
            number = self.word_to_num(token.text)
            wordpieces = self.tokenizer.tokenize(token.text)
            num_wordpieces = len(wordpieces)
            if self.extract_spans and num_wordpieces > 1:
                passage_spans = list(
                    map(
                        lambda span: adapt_span_by_wordpieces(
                            span, curr_index, num_wordpieces),
                        passage_spans))
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(curr_index)
                number_words.append(token.text)
                number_len.append(num_wordpieces)
            passage_tokens += wordpieces
            curr_index += num_wordpieces
        # Process questions from this passage
        for question_answer in passage_info["qa_pairs"]:
            question_id = question_answer["query_id"]
            question_text = question_answer["question"].strip()
            answer_annotations = []
            if "answer" in question_answer:
                answer_type = get_answer_type(question_answer['answer'])
                if self.answer_type is not None and answer_type not in self.answer_type:
                    continue
                if self.question_type is not None and get_question_type(
                        question_text, answer_type, question_answer['answer'],
                        self.max_count) not in self.question_type:
                    continue
                answer_annotations.append(question_answer["answer"])
            if self.use_validated and "validated_answers" in question_answer:
                answer_annotations += question_answer["validated_answers"]
            instance = self.text_to_instance(
                question_text, passage_text, passage_tokens, passage_spans,
                numbers_in_passage, number_words, number_indices, number_len,
                question_id, passage_id, answer_annotations)
            if instance is not None:
                instances.append(instance)
    if self.extract_spans:
        with open(file_path.split('/')[-1].split('.')[0] +
                  "_failed_passages.json", 'w+') as outfile:
            json.dump(failed_passages, outfile)
    return instances
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    with open(file_path, encoding="utf8") as dataset_file:
        dataset = json.load(dataset_file)
    if self.standardize_texts and self._is_training:
        dataset = standardize_dataset(dataset)
    instances_count = 0
    for passage_id, passage_info in tqdm(dataset.items()):
        passage_text = passage_info["passage"].strip()
        if self.wordpiece_numbers:
            # In this case we actually first use a basic `WordTokenizer`, where each token is
            # additionally split on any hyphen it contains.
            word_tokens = split_tokens_by_hyphen(self.number_tokenizer.tokenize(passage_text))
        else:
            word_tokens = self.tokenizer.tokenize(passage_text)
        # Auxiliary variables for handling numbers from the passage
        numbers_in_passage = []
        number_indices = []
        number_words = []
        number_len = []
        passage_tokens = []
        curr_index = 0
        # Get all passage numbers
        for token in word_tokens:
            # Wordpiece tokenization is done here.
            # In addition, every token recognized as a number is stored for arithmetic processing.
            number = self.word_to_num(token.text, self.improve_number_extraction)
            wordpieces = self.tokenizer.tokenize(token.text)
            num_wordpieces = len(wordpieces)
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(curr_index)
                number_words.append(token.text)
                number_len.append(num_wordpieces)
            passage_tokens += wordpieces
            curr_index += num_wordpieces
        passage_tokens = fill_token_indices(passage_tokens, passage_text, self._uncased,
                                            self.basic_tokenizer, word_tokens)
        # Process questions from this passage
        for qa_pair in passage_info["qa_pairs"]:
            if 0 < self.max_instances <= instances_count:
                return
            question_id = qa_pair["query_id"]
            question_text = qa_pair["question"].strip()
            answer_annotations: List[Dict] = list()
            specific_answer_type = None
            if 'answer' in qa_pair and qa_pair['answer']:
                answer = qa_pair['answer']
                specific_answer_type = get_answer_type(answer)
                if specific_answer_type not in self.answer_types:
                    continue
                answer_annotations.append(answer)
            if self.use_validated and "validated_answers" in qa_pair and qa_pair["validated_answers"]:
                answer_annotations += qa_pair["validated_answers"]
            instance = self.text_to_instance(question_text, passage_text, passage_tokens,
                                             numbers_in_passage, number_words, number_indices,
                                             number_len, question_id, passage_id,
                                             answer_annotations, specific_answer_type)
            if instance is not None:
                instances_count += 1
                yield instance
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     question_id: str = None,
                     passage_id: str = None,
                     answer_annotations: List[Dict] = None,
                     passage_tokens: List[Token] = None) -> Union[Instance, None]:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_tokens = split_tokens_by_hyphen(passage_tokens)
    question_tokens = self._tokenizer.tokenize(question_text)
    question_tokens = split_tokens_by_hyphen(question_tokens)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[: self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[: self.question_length_limit]

    answer_type: str = None
    answer_texts: List[str] = []
    if answer_annotations:
        # Currently we only use the first annotated answer here, but actually this doesn't affect
        # the training, because we only have one annotation for the train set.
        answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

    # Tokenize the answer text in order to find the matched span based on token
    tokenized_answer_texts = []
    for answer_text in answer_texts:
        answer_tokens = self._tokenizer.tokenize(answer_text)
        answer_tokens = split_tokens_by_hyphen(answer_tokens)
        tokenized_answer_texts.append(' '.join(token.text for token in answer_tokens))

    if self.instance_format == "squad":
        valid_passage_spans = \
            self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
        if not valid_passage_spans:
            if "passage_span" in self.skip_when_all_empty:
                return None
            else:
                valid_passage_spans.append((len(passage_tokens) - 1, len(passage_tokens) - 1))
        return make_reading_comprehension_instance(question_tokens,
                                                   passage_tokens,
                                                   self._token_indexers,
                                                   passage_text,
                                                   valid_passage_spans,
                                                   # this `answer_texts` will not be used for evaluation
                                                   answer_texts,
                                                   additional_metadata={
                                                       "original_passage": passage_text,
                                                       "original_question": question_text,
                                                       "passage_id": passage_id,
                                                       "question_id": question_id,
                                                       "valid_passage_spans": valid_passage_spans,
                                                       "answer_annotations": answer_annotations})
    elif self.instance_format == "bert":
        question_concat_passage_tokens = question_tokens + [Token("[SEP]")] + passage_tokens
        valid_passage_spans = []
        for span in self.find_valid_spans(passage_tokens, tokenized_answer_texts):
            # This span is for `question + [SEP] + passage`.
            valid_passage_spans.append((span[0] + len(question_tokens) + 1,
                                        span[1] + len(question_tokens) + 1))
        if not valid_passage_spans:
            if "passage_span" in self.skip_when_all_empty:
                return None
            else:
                valid_passage_spans.append((len(question_concat_passage_tokens) - 1,
                                            len(question_concat_passage_tokens) - 1))
        answer_info = {"answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
                       "answer_passage_spans": valid_passage_spans}
        return self.make_bert_drop_instance(question_tokens,
                                            passage_tokens,
                                            question_concat_passage_tokens,
                                            self._token_indexers,
                                            passage_text,
                                            answer_info,
                                            additional_metadata={
                                                "original_passage": passage_text,
                                                "original_question": question_text,
                                                "passage_id": passage_id,
                                                "question_id": question_id,
                                                "answer_annotations": answer_annotations})
    elif self.instance_format == "drop":
        numbers_in_passage = []
        number_indices = []
        for token_index, token in enumerate(passage_tokens):
            number = self.convert_word_to_number(token.text)
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(token_index)
        # hack to guarantee minimal length of padded number
        numbers_in_passage.append(0)
        number_indices.append(-1)
        numbers_as_tokens = [Token(str(number)) for number in numbers_in_passage]

        valid_passage_spans = \
            self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
        valid_question_spans = \
            self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

        target_numbers = []
        # `answer_texts` is a list of valid answers.
        for answer_text in answer_texts:
            number = self.convert_word_to_number(answer_text)
            if number is not None:
                target_numbers.append(number)
        valid_signs_for_add_sub_expressions: List[List[int]] = []
        valid_counts: List[int] = []
        if answer_type in ["number", "date"]:
            valid_signs_for_add_sub_expressions = \
                self.find_valid_add_sub_expressions(numbers_in_passage, target_numbers)
        if answer_type in ["number"]:
            # Currently we only support count number 0 ~ 9
            numbers_for_count = list(range(10))
            valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

        type_to_answer_map = {"passage_span": valid_passage_spans,
                              "question_span": valid_question_spans,
                              "addition_subtraction": valid_signs_for_add_sub_expressions,
                              "counting": valid_counts}

        if self.skip_when_all_empty \
                and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
            return None

        answer_info = {"answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
                       "answer_passage_spans": valid_passage_spans,
                       "answer_question_spans": valid_question_spans,
                       "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
                       "counts": valid_counts}

        return self.make_marginal_drop_instance(question_tokens,
                                                passage_tokens,
                                                numbers_as_tokens,
                                                number_indices,
                                                self._token_indexers,
                                                passage_text,
                                                answer_info,
                                                additional_metadata={
                                                    "original_passage": passage_text,
                                                    "original_question": question_text,
                                                    "original_numbers": numbers_in_passage,
                                                    "passage_id": passage_id,
                                                    "question_id": question_id,
                                                    "answer_info": answer_info,
                                                    "answer_annotations": answer_annotations})
    else:
        raise ValueError(f"Expect the instance format to be \"drop\", \"squad\" or \"bert\", "
                         f"but got {self.instance_format}")
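# `find_valid_add_sub_expressions` is called above but not shown. A rough sketch of the
# usual NAQANet-style labeling it produces — one label per passage number, with 0 = unused,
# 1 = added, 2 = subtracted — assuming the search is limited to a small number of operands
# (the function name, default limit, and exact search order here are assumptions):
from itertools import combinations, product
from typing import List


def find_valid_add_sub_expressions_sketch(numbers: List[float],
                                          targets: List[float],
                                          max_numbers_to_consider: int = 2) -> List[List[int]]:
    """Enumerate small signed combinations of passage numbers whose sum equals a target."""
    valid_expressions = []
    for n_operands in range(2, max_numbers_to_consider + 1):
        for indices in combinations(range(len(numbers)), n_operands):
            for signs in product((1, -1), repeat=n_operands):
                total = sum(sign * numbers[i] for sign, i in zip(signs, indices))
                if total in targets:
                    labels = [0] * len(numbers)
                    for sign, i in zip(signs, indices):
                        labels[i] = 1 if sign > 0 else 2
                    valid_expressions.append(labels)
    return valid_expressions


# Example: find_valid_add_sub_expressions_sketch([3, 10, 7], [3]) includes [0, 1, 2],
# i.e. 10 - 7 = 3 with the first number unused.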
def text_to_instance(self,  # type: ignore
                     question_text: str,
                     passage_text: str,
                     question_id: str = None,
                     passage_id: str = None,
                     answer_annotations: List[Dict] = None,
                     passage_tokens: List[Token] = None) -> Union[Instance, None]:
    # pylint: disable=arguments-differ
    if not passage_tokens:
        # [w1, w2, w3, ..., wn]
        passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_tokens = split_tokens_by_hyphen(passage_tokens)
        passage_tokens = self.word_tokenizer(passage_tokens)
    question_tokens = self._tokenizer.tokenize(question_text)
    question_tokens = split_tokens_by_hyphen(question_tokens)
    question_tokens = self.word_tokenizer(question_tokens)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[: self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[: self.question_length_limit]
    passage_question_tokens = [Token("[CLS]")] + passage_tokens + [Token("[SEP]")] \
        + question_tokens + [Token("[SEP]")] + self.implicit_tokens

    answer_type: str = None
    answer_texts: List[str] = []
    if answer_annotations:
        # Currently we only use the first annotated answer here, but actually this doesn't affect
        # the training, because we only have one annotation for the train set.
        answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

    # Tokenize the answer text in order to find the matched span based on token
    tokenized_answer_texts = []
    for answer_text in answer_texts:
        answer_tokens = self._tokenizer.tokenize(answer_text)
        answer_tokens = split_tokens_by_hyphen(answer_tokens)
        answer_tokens = self.word_tokenizer(answer_tokens)
        tokenized_answer_texts.append(' '.join([token.text for token in answer_tokens]))

    if self.instance_format == "force_add":
        numbers_in_passage_question = []
        number_indices = []
        for token_index, token in enumerate(passage_question_tokens):
            number = self.convert_word_to_number(token.text)
            if number is not None:
                numbers_in_passage_question.append(number)
                number_indices.append(token_index)
        # hack to guarantee minimal length of padded number
        numbers_in_passage_question.append(0)
        number_indices.append(-1)
        numbers_as_tokens = [Token(str(number)) for number in numbers_in_passage_question]

        valid_passage_question_spans = \
            self.find_valid_spans(passage_question_tokens, tokenized_answer_texts) \
            if tokenized_answer_texts else []

        valid_signs_for_add_sub_expressions: List[List[int]] = []
        valid_counts: List[int] = []
        # arithmetic answer
        if answer_type in ["number", "date"]:
            target_numbers = []
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(
                numbers_in_passage_question, target_numbers)
        # count answer
        if answer_type in ["number"]:
            # Currently we only support count number 0 ~ 9
            numbers_for_count = list(range(10))
            valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

        type_to_answer_map = {"spans": valid_passage_question_spans,
                              "addition_subtraction": valid_signs_for_add_sub_expressions,
                              "counting": valid_counts}

        if self.skip_when_all_empty \
                and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
            return None

        answer_info = {"answer_texts": answer_texts,  # this `answer_texts` will not be used for evaluation
                       "answer_spans": valid_passage_question_spans,
                       "signs_for_add_sub_expressions": valid_signs_for_add_sub_expressions,
                       "counts": valid_counts}

        return self.make_marginal_bert_drop_instance(passage_question_tokens,
                                                     self.implicit_tokens,
                                                     numbers_as_tokens,
                                                     number_indices,
                                                     self._token_indexers,
                                                     passage_text,
                                                     answer_info,
                                                     additional_metadata={
                                                         "original_passage": passage_text,
                                                         "original_question": question_text,
                                                         "original_numbers": numbers_in_passage_question,
                                                         "passage_id": passage_id,
                                                         "question_id": question_id,
                                                         "answer_info": answer_info,
                                                         "answer_annotations": answer_annotations})
    else:
        raise ValueError(f"Expect the instance format to be \"bert_drop\", "
                         f"but got {self.instance_format}")
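# `find_valid_counts` is likewise not shown. In NAQANet-style readers it typically just
# selects which of the candidate count values (0-9 above) appear among the target numbers;
# a one-line sketch under that assumption (the name is hypothetical):
def find_valid_counts_sketch(count_candidates, target_numbers):
    return [count for count in count_candidates if count in target_numbers]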
def dataset_iterator(self, single_file_path_cached, dataset):
    with open(single_file_path_cached, 'r') as jsonl:
        for passage_info in jsonl:
            passage_info = json.loads(passage_info)
            passage_text = passage_info["passage"].strip()
            if self.wordpiece_numbers:
                word_tokens = split_tokens_by_hyphen(
                    self.number_tokenizer.tokenize(passage_text))
            else:
                word_tokens = self.tokenizer.tokenize(passage_text)
            numbers_in_passage = []
            number_indices = []
            number_words = []
            number_len = []
            passage_tokens = []
            curr_index = 0
            # Get all passage numbers
            for token in word_tokens:
                number = self.word_to_num(token.text, True)
                wordpieces = self.tokenizer.tokenize(token.text)
                num_wordpieces = len(wordpieces)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(curr_index)
                    number_words.append(token.text)
                    number_len.append(num_wordpieces)
                passage_tokens += wordpieces
                curr_index += num_wordpieces
            # Process questions from this passage
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                if question_answer["answer"] != "impossible":
                    spans = question_answer["answer"]["spans"]
                    spans = [span.strip() for span in spans if span.strip()]
                    question_answer["answer"]["spans"] = spans
                    answer_annotations = []
                    if "answer" in question_answer:
                        answer_annotations.append(question_answer["answer"])
                    if self.use_validated and "validated_answers" in question_answer:
                        answer_annotations += question_answer["validated_answers"]
                else:
                    answer_annotations = None
                instance = self.text_to_instance(question_text, passage_text,
                                                 passage_tokens, numbers_in_passage,
                                                 number_words, number_indices,
                                                 number_len, question_id,
                                                 answer_annotations, dataset)
                if instance is not None:
                    self.numInstances += 1
                    self.dataset_numbers[dataset] = self.dataset_numbers.get(dataset, 0) + 1
                    yield instance
    print('Dataset Numbers after finishing %s: ' % dataset, self.dataset_numbers)
def process_numbers(self,  # type: ignore
                    question_text: str,
                    passage_text: str,
                    question_id: str = None,
                    passage_id: str = None,
                    answer_annotations: List[Dict] = None,
                    passage_tokens: List[Token] = None) -> None:
    '''Extract all numbers from the passage and append them to `self.all_numbers`;
    note that this is currently done multiple times for each paragraph.'''
    # pylint: disable=arguments-differ
    if not passage_tokens:
        passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_tokens = split_tokens_by_hyphen(passage_tokens)
    question_tokens = self._tokenizer.tokenize(question_text)
    question_tokens = split_tokens_by_hyphen(question_tokens)
    if self.passage_length_limit is not None:
        passage_tokens = passage_tokens[: self.passage_length_limit]
    if self.question_length_limit is not None:
        question_tokens = question_tokens[: self.question_length_limit]

    answer_type: str = None
    answer_texts: List[str] = []
    if answer_annotations:
        # Currently we only use the first annotated answer here, but actually this doesn't affect
        # the training, because we only have one annotation for the train set.
        answer_type, answer_texts = self.extract_answer_info_from_annotation(answer_annotations[0])

    # Tokenize the answer text in order to find the matched span based on token
    tokenized_answer_texts = []
    for answer_text in answer_texts:
        answer_tokens = self._tokenizer.tokenize(answer_text)
        answer_tokens = split_tokens_by_hyphen(answer_tokens)
        tokenized_answer_texts.append(' '.join(token.text for token in answer_tokens))

    if self.instance_format == "drop":
        numbers_in_passage = []
        number_indices = []
        for token_index, token in enumerate(passage_tokens):
            number = self.convert_word_to_number(token.text)
            if number is not None:
                numbers_in_passage.append(number)
                number_indices.append(token_index)
        # hack to guarantee minimal length of padded number
        number_indices.append(-1)
        numbers_in_passage.append(0)

        valid_passage_spans = \
            self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
        valid_question_spans = \
            self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

        target_numbers = []
        # `answer_texts` is a list of valid answers.
        for answer_text in answer_texts:
            number = self.convert_word_to_number(answer_text)
            if number is not None:
                target_numbers.append(number)
        valid_signs_for_add_sub_expressions: List[List[int]] = []
        valid_counts: List[int] = []
        # if answer_type in ["number", "date"]:
        if answer_type in ["number"]:
            valid_signs_for_add_sub_expressions = \
                self.find_valid_add_sub_expressions(numbers_in_passage, target_numbers)
        if answer_type in ["number"]:
            # Currently we only support count number 0 ~ 9
            numbers_for_count = list(range(10))
            valid_counts = self.find_valid_counts(numbers_for_count, target_numbers)

        type_to_answer_map = {"passage_span": valid_passage_spans,
                              "question_span": valid_question_spans,
                              "addition_subtraction": valid_signs_for_add_sub_expressions,
                              "counting": valid_counts}

        if self.skip_when_all_empty \
                and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
            return None
        else:
            self.all_numbers = np.concatenate((self.all_numbers, np.array(numbers_in_passage)))
            num_count = len(numbers_in_passage)
            if num_count in self.num_numbers:
                self.num_numbers[num_count] += 1
            else:
                self.num_numbers[num_count] = 1
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)
    if "train" in file_path:
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)
        logger.info("Finding mean and sd")
        for passage_id, passage_info in dataset.items():
            passage_text = passage_info["passage"]
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
            for question_answer in passage_info["qa_pairs"]:
                question_id = question_answer["query_id"]
                question_text = question_answer["question"].strip()
                answer_annotations = []
                if "answer" in question_answer:
                    answer_annotations.append(question_answer["answer"])
                if "validated_answers" in question_answer:
                    answer_annotations += question_answer["validated_answers"]
                # UNCOMMENT TO FIND NUMBER DISTRIBUTION
                # self.process_numbers(question_text,
                #                      passage_text,
                #                      question_id,
                #                      passage_id,
                #                      answer_annotations,
                #                      passage_tokens)
        # if self.all_numbers:
        #     self.mean = self.all_numbers.mean()
        # else:
        #     self.mean = 0
        # if len(self.all_numbers) > 1:
        #     self.sd = self.all_numbers.std()
        # else:
        #     self.sd = 1
        # logger.info("Mean of all numbers = %f \n", self.mean)
        # logger.info("Sd of all numbers = %f \n", self.sd)
        # logger.info("Number distribution %s", self.num_numbers)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    logger.info("Reading the dataset")
    kept_count, skip_count = 0, 0
    for passage_id, passage_info in dataset.items():
        passage_text = passage_info["passage"]
        passage_tokens = self._tokenizer.tokenize(passage_text)
        passage_tokens = split_tokens_by_hyphen(passage_tokens)
        for question_answer in passage_info["qa_pairs"]:
            question_id = question_answer["query_id"]
            question_text = question_answer["question"].strip()
            answer_annotations = []
            if "answer" in question_answer:
                answer_annotations.append(question_answer["answer"])
            if "validated_answers" in question_answer:
                answer_annotations += question_answer["validated_answers"]
            instance = self.text_to_instance(question_text,
                                             passage_text,
                                             question_id,
                                             passage_id,
                                             answer_annotations,
                                             passage_tokens)
            if instance is not None:
                kept_count += 1
                yield instance
            else:
                skip_count += 1
    logger.info(f"Skipped {skip_count} questions, kept {kept_count} questions.")
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    passage_count = 0
    count_questions_prefixes = ["how many field goal", "how many touchdown", "how many pass",
                                "how many times", "how many interception", "how many win",
                                "how many of the"]
    for passage_id, passage_info in tqdm(dataset.items()):
        # passage_count += 1
        # if passage_count > 30:
        #     break
        passage_text = passage_info["passage"].strip()
        passages_sentences = self.sentence_tokenizer.split_sentences(passage_text)
        numbers_in_passage = []
        number_indices = []
        number_words = []
        number_len = []
        passage_tokens = []
        passage_sentence_tokens = []
        sentence_indices = []
        curr_index = 0
        for sentence_idx, sentence in enumerate(passages_sentences):
            if self.wordpiece_numbers:
                word_tokens = split_tokens_by_hyphen(self.number_tokenizer.tokenize(sentence))
            else:
                word_tokens = self.tokenizer.tokenize(sentence)
            sentence_tokens = []
            # Get all passage numbers
            for token in word_tokens:
                number = self.word_to_num(token.text)
                wordpieces = self.tokenizer.tokenize(token.text)
                num_wordpieces = len(wordpieces)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(curr_index)
                    number_words.append(token.text)
                    number_len.append(num_wordpieces)
                passage_tokens += wordpieces
                sentence_tokens += wordpieces
                sentence_indices += [sentence_idx] * num_wordpieces
                curr_index += num_wordpieces
            passage_sentence_tokens.append(sentence_tokens)
        # Process questions from this passage
        for question_answer in passage_info["qa_pairs"]:
            question_id = question_answer["query_id"]
            question_text = question_answer["question"].strip()
            if not any(question_text.lower().startswith(prefix) for prefix in count_questions_prefixes):
                continue
            answer_annotations = []
            if "answer" in question_answer:
                if self.answer_type is not None and \
                        get_answer_type(question_answer['answer']) not in self.answer_type:
                    continue
                answer_annotations.append(question_answer["answer"])
            if self.use_validated and "validated_answers" in question_answer:
                answer_annotations += question_answer["validated_answers"]
            instance = self.text_to_instance(question_text, passage_text, passage_tokens,
                                             passage_sentence_tokens, numbers_in_passage,
                                             number_words, number_indices, number_len,
                                             sentence_indices, question_id, passage_id,
                                             answer_annotations)
            if instance is not None:
                yield instance