def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    # Characters treated as token separators (0x202F is a narrow no-break space).
    separator_chars = {" ", "\t", "\r", "\n", "\u202f"}

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            context = paragraph["context"]
            # Split the context into whitespace-delimited tokens while
            # recording, for every character, the index of the token it
            # falls in (whitespace maps to the preceding token's index,
            # or -1 before the first token).
            doc_tokens = []
            char_to_word_offset = []
            inside_token = False
            for ch in context:
                if ch in separator_chars:
                    inside_token = False
                else:
                    if inside_token:
                        doc_tokens[-1] += ch
                    else:
                        doc_tokens.append(ch)
                    inside_token = True
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False

                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if len(qa["answers"]) != 1 and not is_impossible:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if is_impossible:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""
                    else:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[
                            answer_offset + len(orig_answer_text) - 1
                        ]
                        # Only keep answers whose text can be exactly
                        # recovered from the tokenized document.  Failures
                        # are usually odd Unicode, so the example is simply
                        # skipped — not every training example is preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:end_position + 1]
                        )
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text)
                        )
                        if cleaned_answer_text not in actual_text:
                            logger.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text,
                                cleaned_answer_text,
                            )
                            continue

                examples.append(
                    SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible,
                    )
                )
    return examples
def _read(self, file_path: str):
    """Read a SQuAD-style json file and yield one Instance per question."""
    # `file_path` may be a URL; resolve it to a locally cached copy first.
    file_path = cached_path(file_path, cache_dir="data/squad")
    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)["data"]
    logger.info("Reading the dataset")
    # NOTE(review): only the first article and its first paragraph are read
    # (the [:1] slices) — looks like a debugging truncation; confirm intent.
    for article in dataset[:1]:
        for paragraph_json in article["paragraphs"][:1]:
            paragraph_text = paragraph_json["context"]
            # Whitespace tokenization, building a char -> word-index map
            # (0x202F is a narrow no-break space).
            paragraph_words = []
            char_to_word_offset = []
            in_word = False
            for ch in paragraph_text:
                if ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F:
                    in_word = False
                else:
                    if in_word:
                        paragraph_words[-1] += ch
                    else:
                        paragraph_words.append(ch)
                    in_word = True
                char_to_word_offset.append(len(paragraph_words) - 1)

            # Compute the answer span for every question.
            for question_answer in paragraph_json["qas"]:
                question_id = question_answer["id"]
                question_text = question_answer["question"].strip().replace("\n", "")
                # Every listed answer is the same, so the first one is used.
                answer = question_answer["answers"][0]
                answer_text = answer["text"]
                answer_offset = answer["answer_start"]
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[
                    answer_offset + len(answer_text) - 1
                ]
                # Only keep answers whose text can be exactly recovered from
                # the document; failures are usually odd Unicode, so the
                # example is simply skipped.
                actual_text = " ".join(
                    paragraph_words[start_position:end_position + 1]
                )
                cleaned_answer_text = " ".join(whitespace_tokenize(answer_text))
                if cleaned_answer_text not in actual_text:
                    logger.warning(
                        "Could not find answer: '%s' vs. '%s'",
                        actual_text,
                        cleaned_answer_text,
                    )
                    continue
                # Convert to an Instance; text_to_instance may reject it.
                instance = self.text_to_instance(
                    question_text,
                    paragraph_text,
                    paragraph_words,
                    answer_text,
                    start_position,
                    end_position,
                    {"id": question_id},
                )
                if instance is not None:
                    yield instance
def tokenize(self, text):
    """Lower-case *text*, then split it on whitespace."""
    lowered = text.lower()
    return whitespace_tokenize(lowered)
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a ReCoRD jsonl file into a list of RecordExample.

    Each line of *input_file* is one json record with a passage and its
    questions.  In training mode, the first answer mention whose text can
    be exactly recovered from the whitespace-tokenized passage is used;
    questions with no recoverable answer are skipped with a warning.

    Bug fixes vs. the previous version: the recoverability check referenced
    a never-assigned ``actual_text`` (NameError), joined a still-``None``
    ``orig_answer_text``, and sliced with ``end_position = None``; the
    computed position lists were never copied into the scalar fields, so
    every RecordExample carried ``None`` positions.  The file handle also
    leaked on exceptions (bare open/close); a context manager fixes that.

    Note: ``version_2_with_negative`` is unused here (ReCoRD has no
    unanswerable questions); it is kept for signature compatibility with
    the SQuAD reader.
    """
    def is_whitespace(c):
        # 0x202F is a narrow no-break space.
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    examples = []
    with open(input_file, "r", encoding="utf-8") as reader:
        for line in reader:
            raw = json.loads(line)
            paragraph_text = raw["passage"]["text"]

            # Whitespace-tokenize the passage, mapping every character to
            # the index of the token it belongs to.
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in raw["qas"]:
                qas_id = qa["id"]
                question_text = qa["query"]
                start_position = None
                end_position = None
                orig_answer_text = None

                if is_training:
                    # ReCoRD lists every mention of the answer entity; use
                    # the first mention whose text is exactly recoverable
                    # from the tokenized passage (weird Unicode can make
                    # recovery fail for individual mentions).
                    for answer in qa["answers"]:
                        candidate_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        candidate_start = char_to_word_offset[answer_offset]
                        candidate_end = char_to_word_offset[
                            answer_offset + len(candidate_text) - 1
                        ]
                        actual_text = " ".join(
                            doc_tokens[candidate_start:candidate_end + 1]
                        )
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(candidate_text)
                        )
                        if actual_text.find(cleaned_answer_text) != -1:
                            orig_answer_text = candidate_text
                            start_position = candidate_start
                            end_position = candidate_end
                            break
                        logger.warning(
                            "Could not find answer: '%s' vs. '%s'",
                            actual_text,
                            cleaned_answer_text,
                        )
                    if orig_answer_text is None:
                        # No recoverable answer mention; skip this question.
                        continue

                examples.append(
                    RecordExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                    )
                )
    return examples
def create_samples_squad(entry):
    """Read one SQuAD json entry (an article dict) into a list of Samples.

    Training mode is auto-detected from the presence of the
    ``is_impossible`` key on the first question of the first paragraph.

    Cleanup vs. the previous version: the unused ``is_whitespace`` helper
    (tokenization here is a plain ``split(" ")``) and the never-read
    ``num_examples`` counter are removed; behavior is unchanged.

    NOTE(review): splitting on a single space differs from the char-class
    whitespace handling of the other readers — tabs and newlines remain
    inside tokens here.  Confirm this matches downstream feature conversion.
    """
    try:
        _ = entry["paragraphs"][0]["qas"][0]["is_impossible"]
        is_training = True
    except KeyError:
        is_training = False

    examples = []
    for paragraph in entry["paragraphs"]:
        paragraph_text = paragraph["context"]
        # Map every character position to the index of its space-split token
        # (each token contributes len(token)+1 positions: its characters plus
        # the following separator space).
        char_to_word_offset = []
        doc_tokens = paragraph_text.split(" ")
        for i, t in enumerate(doc_tokens):
            char_to_word_offset.extend([i] * (len(t) + 1))
        # Cut off the offset added for the non-existent trailing separator.
        char_to_word_offset = char_to_word_offset[:-1]

        for qa in paragraph["qas"]:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if is_training:
                is_impossible = qa["is_impossible"]
                # TODO: dev sets may list multiple valid answers; for now
                # only the first answer is used.
                if not is_impossible:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[
                        answer_offset + answer_length - 1
                    ]
                    # Only keep answers whose text can be exactly recovered
                    # from the document; failures are usually odd Unicode,
                    # so the example is skipped — not every training example
                    # is guaranteed to be preserved.
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)]
                    )
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text)
                    )
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning(
                            "Could not find answer: '%s' vs. '%s'",
                            actual_text,
                            cleaned_answer_text,
                        )
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

            clear_text = {
                "qas_id": qas_id,
                "question_text": question_text,
                "doc_tokens": doc_tokens,
                "orig_answer_text": orig_answer_text,
                "start_position": start_position,
                "end_position": end_position,
                "is_impossible": is_impossible,
                "is_training": is_training,
            }
            examples.append(
                Sample(id=None, clear_text=clear_text, features=None, tokenized=None)
            )
    return examples