def _convert_example_to_record(self, example, max_seq_length, tokenizer): """_convert_example_to_record""" tokens = tokenization.whitespace_tokenize(example.text_a) labels = tokenization.whitespace_tokenize(example.label) tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) if len(tokens) > max_seq_length - 2: tokens = tokens[0:(max_seq_length - 2)] labels = labels[0:(max_seq_length - 2)] tokens = ["[CLS]"] + tokens + ["[SEP]"] token_ids = tokenizer.convert_tokens_to_ids(tokens) position_ids = list(range(len(token_ids))) text_type_ids = [0] * len(token_ids) no_entity_id = len(self.label_map) - 1 label_ids = [no_entity_id ] + [self.label_map[label] for label in labels] + [no_entity_id] Record = namedtuple( 'Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) record = Record(token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids, label_ids=label_ids) return record
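# Hedged sketch (not from the source) of the record layout produced by
# _convert_example_to_record: the labels for the added [CLS]/[SEP] tokens are
# padded with the last id in the label map ("no entity"), and the segment and
# position ids simply mirror the token count. All ids below are made up.
from collections import namedtuple

label_map = {"B-PER": 0, "I-PER": 1, "O": 2}   # hypothetical map; "O" comes last
no_entity_id = len(label_map) - 1               # -> 2
tokens = ["[CLS]", "Jim", "Henson", "[SEP]"]
labels = ["B-PER", "I-PER"]
token_ids = [101, 2990, 27227, 102]             # illustrative vocabulary ids

Record = namedtuple("Record", ["token_ids", "text_type_ids", "position_ids", "label_ids"])
record = Record(token_ids=token_ids,
                text_type_ids=[0] * len(token_ids),
                position_ids=list(range(len(token_ids))),
                label_ids=[no_entity_id] + [label_map[l] for l in labels] + [no_entity_id])
assert len(record.label_ids) == len(record.token_ids)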
def read_squad_example(entry, is_training): examples = [] for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
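# Hedged illustration (helper name is invented, only the basic whitespace set is
# handled) of the doc_tokens / char_to_word_offset construction used above:
# every character is mapped to the index of the whitespace-delimited word it
# belongs to, so character-level answer_start offsets can be translated into
# word-level start/end positions.
def _words_and_offsets(text):
    doc_tokens, char_to_word_offset = [], []
    prev_is_whitespace = True
    for c in text:
        if c in " \t\r\n":
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset

tokens, offsets = _words_and_offsets("Norway is in Europe")
assert tokens == ["Norway", "is", "in", "Europe"]
assert offsets[0] == 0 and offsets[13] == 3   # char 13 ('E') falls in word 3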
def tokenize(self, text): text = [convert_to_unicode(a) for a in text] text2 = [] for token in text: text2 += self._clean_text(token) split_tokens = [] for token in text2: if self.do_lower_case: token = token.lower() token = self._run_strip_accents(token) split_tokens.append(token) output_tokens = whitespace_tokenize(" ".join(split_tokens)) return output_tokens
def read_squad_examples(id, paragraph, question, tokenizer): def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False def is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def clean_text(text): """Performs invalid character removal and whitespace cleanup on text.""" output = [] for char in text: cp = ord(char) if cp == 0 or cp == 0xfffd or is_control(char): continue if is_whitespace(char): output.append(" ") else: output.append(char) return "".join(output) examples = [] paragraph_text = " ".join(tokenization.whitespace_tokenize(clean_text(paragraph))) doc_tokens = tokenizer.basic_tokenizer.tokenize(paragraph_text) qas_id = id question_text = question start_position = None end_position = None orig_answer_text = None example = SquadExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def can_find(text, offset, length, tokens, char_to_word_offset): start_position = char_to_word_offset[offset] end_position = char_to_word_offset[offset + length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join(tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(text)) if actual_text.find(cleaned_answer_text) == -1: tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return None, None return start_position, end_position
def detect_span(_answers, context, doc_tokens, char_to_word_offset): orig_answer_texts = [] start_positions = [] end_positions = [] switches = [] answers = [] for answer in _answers: answers += find_span_from_text(context, doc_tokens, answer['text']) for answer in answers: orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) switch = 0 if 'word_start' in answer and 'word_end' in answer: start_position = answer['word_start'] end_position = answer['word_end'] else: start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]).replace( ' ##', '').replace('##', '') cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.replace(' ', '').find(cleaned_answer_text.replace(' ', '')) == -1: print("Could not find answer: '%s' vs. '%s'" % (actual_text, cleaned_answer_text)) orig_answer_texts.append(orig_answer_text) start_positions.append(start_position) end_positions.append(end_position) switches.append(switch) return orig_answer_texts, switches, start_positions, end_positions
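# Hedged, standalone illustration of the WordPiece clean-up used in detect_span:
# when doc_tokens already contain subword pieces, the ' ##' joins are collapsed
# before comparing against the whitespace-tokenized answer text.
doc_tokens = ["the", "em", "##bed", "##ding", "layer"]   # assumed subword tokens
actual_text = " ".join(doc_tokens[1:4]).replace(" ##", "").replace("##", "")
assert actual_text == "embedding"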
def read_record_examples(input_file, is_training): """Read a ReCoRD json file into a list of ReCoRDExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: paragraph_text = entry["passage"]["text"].replace('\xa0', ' ') doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) # load entities in passage passage_entities = [] for entity in entry['passage']['entities']: entity_start_offset = entity['start'] entity_end_offset = entity['end'] if entity_end_offset < entity_start_offset: # some error labeled entities in record dataset continue entity_text = paragraph_text[entity_start_offset: entity_end_offset + 1] passage_entities.append({'orig_text': entity_text, 'start_position': char_to_word_offset[entity_start_offset], 'end_position': char_to_word_offset[entity_end_offset]}) for qa in entry["qas"]: qas_id = qa["id"] question_text = qa["query"].replace('\xa0', ' ') start_position = None end_position = None orig_answer_text = None if is_training: # if len(qa["answers"]) != 1: # raise ValueError( # "For training, each question should have exactly 1 answer.") answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = ReCoRDExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, passage_entities=passage_entities, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
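# Note on the entity slicing above: ReCoRD marks entity spans with inclusive
# start/end character offsets, hence paragraph_text[start:end + 1]. A tiny
# hedged example with made-up offsets:
paragraph_text = "Barack Obama visited Paris."
entity = {"start": 0, "end": 11}   # inclusive span covering "Barack Obama"
assert paragraph_text[entity["start"]:entity["end"] + 1] == "Barack Obama"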
def read_squad_examples(input_file, return_answers, context_only=False, question_only=False, draft=False, draft_num_examples=12, append_title=False): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] examples = [] ans_cnt = 0 no_ans_cnt = 0 # Only word-based tokenization is performed (whitespace based) for doc_idx, entry in enumerate(input_data): title = entry['title'][0] if type(entry['title']) == list else entry['title'] assert type(title) == str for par_idx, paragraph in enumerate(entry["paragraphs"]): # Do not load context for question only if not question_only: paragraph_text = paragraph["context"] title_offset = 0 if append_title: title_str = '[ ' + ' '.join(title.split('_')) + ' ] ' title_offset += len(title_str) paragraph_text = title_str + paragraph_text # Note that we use the term 'word' for whitespace based words, and 'token' for subtokens (for BERT input) doc_words, char_to_word_offset = context_to_words_and_offset(paragraph_text) # 1) Context only ends here if context_only: metadata = {} if "pubmed_id" in entry: entry_keys = [ "pubmed_id", "sha", "title_original", "title_entities", "journal", "authors", "article_idx" ] para_keys = ["context_entities"] for entry_key in entry_keys: if entry_key in entry: metadata[entry_key] = entry[entry_key] for para_key in para_keys: if para_key in paragraph: metadata[para_key] = paragraph[para_key] # metadata["pubmed_id"] = (metadata["pubmed_id"] if not pd.isnull(metadata["pubmed_id"]) # else 'NaN') example = SquadExample( doc_words=doc_words, title=title, doc_idx=doc_idx, par_idx=par_idx, metadata=metadata) examples.append(example) if draft and len(examples) == draft_num_examples: return examples continue # 2) Question only or 3) context/question pair else: for qa in paragraph["qas"]: qas_id = str(qa["id"]) question_text = qa["question"] # Noisy question skipping if len(question_text.split(' ')) == 1: logger.info('Skipping a single word question: {}'.format(question_text)) continue if "I couldn't could up with another question." in question_text: logger.info('Skipping a strange question: {}'.format(question_text)) continue start_position = None end_position = None orig_answer_text = None # For pre-processing that should return answers together if return_answers: assert type(qa["answers"]) == dict or type(qa["answers"]) == list, type(qa["answers"]) if type(qa["answers"]) == dict: qa["answers"] = [qa["answers"]] # No answers if len(qa["answers"]) == 0: orig_answer_text = "" start_position = -1 # Word-level no-answer => -1 end_position = -1 no_ans_cnt += 1 # Answer exists else: answer = qa["answers"][0] ans_cnt += 1 orig_answer_text = answer["text"] answer_offset = answer["answer_start"] + title_offset answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the context actual_text = " ".join(doc_words[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) # word based tokenization if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue # Question only ends here if question_only: example = SquadExample( qas_id=qas_id, question_text=question_text) # Context/question pair ends here else: example = SquadExample( qas_id=qas_id, question_text=question_text, paragraph_text=paragraph_text, doc_words=doc_words, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, title=title, doc_idx=doc_idx, par_idx=par_idx) examples.append(example) if draft and len(examples) == draft_num_examples: return examples # Testing for shuffled draft (should comment out above 'draft' if-else statements) if draft: random.shuffle(examples) logger.info(str(len(examples)) + ' were collected before draft for shuffling') return examples[:draft_num_examples] logger.info('Answer/no-answer stat: %d vs %d'%(ans_cnt, no_ans_cnt)) return examples
def read_nq_entry(entry, is_training): """Converts a NQ entry into a list of NqExamples.""" def is_whitespace(c): return c in " \t\r\n" or ord(c) == 0x202F examples = [] contexts_id = entry["id"] contexts = entry["contexts"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in contexts: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) questions = [] for i, question in enumerate(entry["questions"]): qas_id = "{}".format(contexts_id) question_text = question["input_text"] start_position = None end_position = None answer = None if is_training: answer_dict = entry["answers"][i] answer = make_nq_answer(contexts, answer_dict) # For now, only handle extractive, yes, and no. if answer is None or answer.offset is None: continue start_position = char_to_word_offset[answer.offset] end_position = char_to_word_offset[answer.offset + len(answer.text) - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(answer.text)) if actual_text.find(cleaned_answer_text) == -1: tf.compat.v1.logging.warning( "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue questions.append(question_text) example = NqExample(example_id=int(contexts_id), qas_id=qas_id, questions=questions[:], doc_tokens=doc_tokens, doc_tokens_map=entry.get("contexts_map", None), answer=answer, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" with tf.gfile.Open(input_file, "r") as reader: input_data = json.load(reader)["data"] examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case=squad_params.do_lower_case) doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True k = 0 temp_word = "" for c in paragraph_text: if tokenization._is_whitespace(c): char_to_word_offset.append(k - 1) continue else: temp_word += c char_to_word_offset.append(k) if squad_params.do_lower_case: temp_word = temp_word.lower() if temp_word == raw_doc_tokens[k]: doc_tokens.append(temp_word) temp_word = "" k += 1 assert k == len(raw_doc_tokens) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: answer = qa["answers"][0] orig_answer_text = answer["text"] if orig_answer_text not in paragraph_text: tf.logging.warning("Could not find answer") else: answer_offset = paragraph_text.index(orig_answer_text) answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = "".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = "".join( tokenization.whitespace_tokenize(orig_answer_text)) if squad_params.do_lower_case: cleaned_answer_text = cleaned_answer_text.lower() if actual_text.find(cleaned_answer_text) == -1: tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) tf.logging.info("**********read_squad_examples complete!**********") return examples
def _create_examples(self, data_split, dataset_type='v1.1'): input_file = os.path.join( self.data_dir, '{}-{}.json'.format(data_split, dataset_type)) with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord( c) == 0x202F or ord(c) == 160: return True return False doc_count = 0 examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) doc_count += 1 doc_id = '{}-{}'.format(data_split, doc_count) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False # note: use the first answer for all dataset splits. # if len(qa["answers"]) != 1: # raise ValueError( # "For training, each question should have exactly 1 answer.") if dataset_type == 'v2.0': is_impossible = qa['is_impossible'] if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] orig_answer_texts = [a['text'] for a in qa["answers"]] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: print("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = '' orig_answer_texts = [] example = InputAnswerExtractionExample( did=doc_id, qid=qas_id, query_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, orig_answer_texts=orig_answer_texts, is_impossible=is_impossible) examples.append(example) if self.candidate_filepath[data_split]: return mc_converter(self.candidate_filepath[data_split], examples) else: return examples
def read_squad_examples(input_file, is_training, version_2_with_negative=False): """Read a SQuAD json file into a list of SquadExample.""" with io.open(input_file, "r", encoding="utf8") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] # doc_tokens = [] # char_to_word_offset = [] # prev_is_whitespace = True # for c in paragraph_text: # if is_whitespace(c): # prev_is_whitespace = True # else: # if prev_is_whitespace: # doc_tokens.append(c) # else: # doc_tokens[-1] += c # prev_is_whitespace = False # char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_pos = None end_pos = None orig_answer_text = None is_impossible = False if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): raise ValueError( "For training, each question should have exactly 1 answer." ) if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) doc_tokens = [ paragraph_text[:answer_offset], paragraph_text[answer_offset:answer_offset + answer_length], paragraph_text[answer_offset + answer_length:] ] start_pos = 1 end_pos = 1 # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. #actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) actual_text = " ".join(doc_tokens[start_pos:(end_pos + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: print("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_pos = -1 end_pos = -1 orig_answer_text = "" else: doc_tokens = tokenization.tokenize_chinese_chars( paragraph_text) example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_pos, end_position=end_pos, is_impossible=is_impossible) examples.append(example) return examples
def read_squad_examples(input_file, is_training, version_2_with_negative): """Read a SQuAD json file into a generator of SquadExample.""" with tf.gfile.Open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): raise ValueError( "For training, each question should have exactly 1 answer." ) if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: tf.logging.warning( "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text, ) continue else: start_position = -1 end_position = -1 orig_answer_text = "" yield SquadExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible, )
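# Usage sketch for the generator variant above; the file name is a placeholder
# and the only assumption is that it follows the SQuAD v2.0 json layout. Because
# the function yields SquadExample objects lazily, callers can stream the file
# without materializing every example.
examples_iter = read_squad_examples("train-v2.0.json", is_training=True,
                                    version_2_with_negative=True)
for n, example in enumerate(examples_iter):
    if n >= 5:   # peek at the first few examples only
        break
    print(example.qas_id, example.start_position, example.end_position)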
def estimate_runtime_examples(data_path, sample_rate, tokenizer, max_seq_length, doc_stride, max_query_length, remove_impossible_questions=True, filter_invalid_spans=True): """Count runtime examples, which may differ from the number of raw samples due to the sliding-window operation. This is useful to get correct warmup steps for training.""" assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0" print("loading data with json parser...") with open(data_path, "r") as reader: data = json.load(reader)["data"] num_raw_examples = 0 for entry in data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] for qa in paragraph["qas"]: num_raw_examples += 1 print("num raw examples:{}".format(num_raw_examples)) def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False sampled_examples = [] for entry in data: for paragraph in entry["paragraphs"]: doc_tokens = None for qa in paragraph["qas"]: if random.random() > sample_rate and sample_rate < 1.0: continue if doc_tokens is None: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) assert len( qa["answers"] ) == 1, "For training, each question should have exactly 1 answer." qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False if ('is_impossible' in qa) and (qa["is_impossible"]): if remove_impossible_questions or filter_invalid_spans: continue else: start_position = -1 end_position = -1 orig_answer_text = "" is_impossible = True else: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # remove corrupt samples actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: print("Could not find answer: '%s' vs. '%s'" % (actual_text, cleaned_answer_text)) continue example = MRQAExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) sampled_examples.append(example) runtime_sample_rate = len(sampled_examples) / float(num_raw_examples) # print("DEBUG-> runtime sampled examples: {}, sample rate: {}.".format(len(sampled_examples), runtime_sample_rate)) runtime_samp_cnt = 0 for example in sampled_examples: query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) tok_start_position = None tok_end_position = None tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 if filter_invalid_spans and not (tok_start_position >= doc_start and tok_end_position <= doc_end): continue runtime_samp_cnt += 1 return int(runtime_samp_cnt / runtime_sample_rate)
def read_quac_examples(input_file, is_training): """Read a QuAC json file into a list of CQAExample.""" with tf.gfile.Open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] if FLAGS.load_small_portion: input_data = input_data[:10] # print('input_data:', input_data) tf.logging.warning('<<<<<<<<<< load_small_portion is on! >>>>>>>>>>') for entry in input_data: # An additional "CANNOTANSWER" has been added in QuAC data, so no need to append one. entry = entry['paragraphs'][0] paragraph_text = entry["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) ############################################################ # convert the convasational QAs to squad format, with history ############################################################ questions = [(item['question'], item['id']) for item in entry['qas']] # [(question, question_id), ()] answers = [(item['orig_answer']['text'], item['orig_answer']['answer_start']) for item in entry['qas']] followups = [item['followup'] for item in entry['qas']] yesnos = [item['yesno'] for item in entry['qas']] qas = [] for i, (question, answer, followup, yesno) in enumerate(zip(questions, answers, followups, yesnos)): metadata = {'turn': i + 1, 'history_turns': [], 'tok_history_answer_markers':[], 'followup': followup, 'yesno': yesno, 'history_turns_text': []} # if FLAGS.use_RL: # start_index = 0 # else: # start_index = 0 if i - int(FLAGS.history) < 0 else i - int(FLAGS.history) end_index = i question_with_histories = '' history_answer_marker = None if FLAGS.use_history_answer_marker: start_index = 0 # we read all the histories no matter we use RL or not. we will make approporiate selections afterwards history_answer_marker = [] for history_turn, (each_answer, each_question) in enumerate( zip(answers[start_index: end_index], questions[start_index: end_index])): # [history_answer_start, history_answer_end, history_answer_text] each_marker = [each_answer[1], each_answer[1] + len(each_answer[0]), each_answer[0]] history_answer_marker.append(each_marker) metadata['history_turns'].append(history_turn + start_index + 1) metadata['history_turns_text'].append((each_question[0], each_answer[0])) #[(q1, a1), (q2, a2), ...] else: # prepend historical questions and answers start_index = max(end_index - FLAGS.history, 0) if FLAGS.only_history_answer: for each_answer in answers[start_index: end_index]: question_with_histories += each_answer[0] + ' ' else: for each_question, each_answer in zip(questions[start_index: end_index], answers[start_index: end_index]): question_with_histories += each_question[0] + ' ' + each_answer[0] + ' ' # add the current question question_with_histories += question[0] qas.append({'id': question[1], 'question': question_with_histories, 'answers': [{'answer_start': answer[1], 'text': answer[0]}], 'history_answer_marker': history_answer_marker, 'metadata': metadata}) for qa in qas: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None # if is_training: # we read in the groundtruth answer bothing druing training and predicting, because we need to compute acc and f1 at predicting time. 
if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer.") answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if is_training and actual_text.find(cleaned_answer_text) == -1: tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue # we construct a tok_history_answer_marker to store the aggregated history answer markers for a question. # we also construct each_tok_history_answer_marker to store a single history answer marker. tok_history_answer_marker = [0] * len(doc_tokens) if FLAGS.use_history_answer_marker: for marker_index, marker in enumerate(qa['history_answer_marker']): each_tok_history_answer_marker = [0] * len(doc_tokens) history_orig_answer_text = marker[2] history_answer_offset = marker[0] history_answer_length = len(history_orig_answer_text) history_start_position = char_to_word_offset[history_answer_offset] history_end_position = char_to_word_offset[history_answer_offset + history_answer_length - 1] history_actual_text = " ".join(doc_tokens[history_start_position:(history_end_position + 1)]) history_cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(history_orig_answer_text)) if history_actual_text.find(history_cleaned_answer_text) != -1: tok_history_answer_marker = tok_history_answer_marker[: history_start_position] + \ [1] * (history_end_position - history_start_position + 1) + \ tok_history_answer_marker[history_end_position + 1 :] each_tok_history_answer_marker = each_tok_history_answer_marker[: history_start_position] + \ [1] * (history_end_position - history_start_position + 1) + \ each_tok_history_answer_marker[history_end_position + 1 :] assert len(tok_history_answer_marker) == len(doc_tokens) assert len(each_tok_history_answer_marker) == len(doc_tokens) qa['metadata']['tok_history_answer_markers'].append(each_tok_history_answer_marker) else: tf.logging.warning("Could not find history answer: '%s' vs. '%s'", history_actual_text, history_cleaned_answer_text) example = CQAExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, history_answer_marker=tok_history_answer_marker, metadata=qa['metadata']) examples.append(example) # print(example) return examples
def read_squad_examples(input_file, is_training, do_lower_case): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case=do_lower_case) doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True k = 0 temp_word = "" for c in paragraph_text: if tokenization._is_whitespace(c): char_to_word_offset.append(k - 1) continue else: temp_word += c char_to_word_offset.append(k) if do_lower_case is True: temp_word = temp_word.lower() if temp_word == raw_doc_tokens[k]: doc_tokens.append(temp_word) temp_word = "" k += 1 try: assert k == len(raw_doc_tokens) except AssertionError: print(len(raw_doc_tokens), len(doc_tokens)) for i in range(min(len(doc_tokens), len(raw_doc_tokens))): if raw_doc_tokens[i] != doc_tokens[i]: print(raw_doc_tokens[i - 3:i + 3], doc_tokens[i - 3:i + 3]) break print(''.join(doc_tokens[500:])) print("----") print(''.join(raw_doc_tokens[500:])) raise AssertionError for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] is_impossible = False start_position = None end_position = None orig_answer_text = None if is_training: is_impossible = len(qa['answers']) == 0 if len(qa["answers"]) > 1: pass #raise ValueError( # "For training, each question should have less than 1 answer.") if len(qa['answers']) == 0: orig_answer_text = "" start_position = end_position = 0 # use_cls else: answer = qa["answers"][0] orig_answer_text = answer["text"] if orig_answer_text not in paragraph_text: logger.warning("Could not find answer") continue answer_offset = paragraph_text.index(orig_answer_text) #answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = "".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = "".join( tokenization.whitespace_tokenize(orig_answer_text)) if do_lower_case: cleaned_answer_text = cleaned_answer_text.lower() if actual_text.find(cleaned_answer_text) == -1: logger.warning( "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) examples.append(example) return examples
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" with tf.gfile.Open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False if is_training: if FLAGS.version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): raise ValueError( "For training, each question should have exactly 1 answer.") if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = "" example = SquadExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) examples.append(example) return examples
def read_classifier_examples(input_file, labels, is_training): """Read a SQuAD json file into a list of SquadExample.""" with tf.gfile.Open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False #if (len(qa["answers"]) != 1): if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: #tf.logging.warning("Could not find answer: '%s' vs. '%s'", #actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = "" label = labels[qas_id] example = run_classifier.InputExample(guid=qas_id, text_a=question_text, text_b=orig_answer_text, label=label) examples.append(example) return examples
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" # with tf.gfile.Open(input_file, "r") as reader: # input_data = json.load(reader)["data"] file_names = os.listdir(input_file) file_names = [a for a in file_names if a.endswith('.json')] dataset = [] for file_name in file_names: data_file = os.path.join(input_file, file_name) with tf.io.gfile.GFile(data_file,'r') as f: dataset_json = json.load(f) dataset.extend(dataset_json['data']) def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in dataset: paragraph_text = entry["context"] doc_tokens = [] # c # context에서 white space 기준으로 토큰화 char_to_word_offset = [] # 한 토큰 당 character 수 저장 prev_is_whitespace = True for c in paragraph_text: # context 한 글자씩 for loop if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: # 이전 글자가 white space 이면 doc_tokens.append(c) # doc_tokens에 append else: doc_tokens[-1] += c # 마지막에 append prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in entry["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: answer = qa["answer"] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: tf.logging.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = "" example = SquadExample( qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position start_position = example.start_position end_position = example.end_position # If the answer cannot be found in the text, then skip this example. actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) return [] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text ) spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) paragraph_len = min( len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens, ) if tokenizer.pad_token_id in encoded_dict["input_ids"]: if tokenizer.padding_side == "right": non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: last_padding_id_position = ( len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) ) non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] else: non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens encoded_dict["token_to_orig_map"] = token_to_orig_map encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens encoded_dict["token_is_max_context"] = {} encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len spans.append(encoded_dict) if "overflowing_tokens" not in encoded_dict: break span_doc_tokens = encoded_dict["overflowing_tokens"] for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) index = ( j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer) # Original TF implem also keeps the classification token (set to 0) p_mask = np.ones_like(span["token_type_ids"]) if tokenizer.padding_side == "right": p_mask[len(truncated_query) + sequence_added_tokens :] = 0 else: p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id) special_token_indices = np.asarray( tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) ).nonzero() p_mask[pad_token_indices] = 1 p_mask[special_token_indices] = 1 # Set the cls index to 0: the CLS index can be used for impossible answers p_mask[cls_index] = 0 span_is_impossible = example.is_impossible start_position = 0 end_position = 0 if is_training and not span_is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. doc_start = span["start"] doc_end = span["start"] + span["length"] - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = cls_index end_position = cls_index span_is_impossible = True else: if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset features.append( SquadFeatures( span["input_ids"], span["attention_mask"], span["token_type_ids"], cls_index, p_mask.tolist(), example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span["paragraph_len"], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], token_to_orig_map=span["token_to_orig_map"], start_position=start_position, end_position=end_position, is_impossible=span_is_impossible, qas_id=example.qas_id, ) ) return features
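# Hedged sketch of the out-of-span rule applied per window above: if the gold
# token span does not fall entirely inside the window, the window is trained
# as "impossible" and both positions point at the [CLS] index. The helper name
# and all numbers are invented for illustration.
def position_in_span(tok_start, tok_end, doc_start, doc_length, doc_offset, cls_index):
    doc_end = doc_start + doc_length - 1
    if not (tok_start >= doc_start and tok_end <= doc_end):
        return cls_index, cls_index            # answer lies outside this window
    return tok_start - doc_start + doc_offset, tok_end - doc_start + doc_offset

assert position_in_span(50, 55, 0, 40, 12, 0) == (0, 0)       # out of span
assert position_in_span(50, 55, 40, 40, 12, 0) == (22, 27)    # inside: 50 - 40 + 12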
def read_squad_examples(input_data, tokenizer): """ https://github.com/eva-n27/BERT-for-Chinese-Question-Answering/blob/master/run_squad.py Read a SQuAD json file into a list of SquadExample. 这个函数将input_data[i]["paragraphs"]["context"]变成一个list,词的list 然后遍历"qas",对于每一个qa,提取 { qas_id: qa['id'], question_text: qa["question"], orig_answer_text: answer["text"], start_position: start_position, end_position: end_position } """ import unicodedata def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False def is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def clean_text(text): """Performs invalid character removal and whitespace cleanup on text.""" output = [] for char in text: cp = ord(char) if cp == 0 or cp == 0xfffd or is_control(char): continue if is_whitespace(char): output.append(" ") else: output.append(char) return "".join(output) examples = [] tf.logging.info("*** reading squad examples ***") for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = " ".join( tokenization.whitespace_tokenize( clean_text(paragraph["context"]))) for qa in paragraph["qas"]: doc_tokens = tokenizer.basic_tokenizer.tokenize(paragraph_text) qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read_squad_examples(input_file, tokenizer, is_training): """ Read a SQuAD json file into a list of SquadExample. 这个函数将input_data[i]["paragraphs"]["context"]变成一个list,词的list 然后遍历"qas",对于每一个qa,提取 { qas_id: qa['id'], question_text: qa["question"], orig_answer_text: answer["text"], start_position: start_position, end_position: end_position } """ import unicodedata with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False def is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def clean_text(text): """Performs invalid character removal and whitespace cleanup on text.""" output = [] for char in text: cp = ord(char) if cp == 0 or cp == 0xfffd or is_control(char): continue if is_whitespace(char): output.append(" ") else: output.append(char) return "".join(output) examples = [] for entry in tqdm(input_data): for paragraph in entry["paragraphs"]: paragraph_text = " ".join( tokenization.whitespace_tokenize( clean_text(paragraph["context"]))) for qa in paragraph["qas"]: doc_tokens = tokenizer.basic_tokenizer.tokenize(paragraph_text) qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] if len(orig_answer_text) == 0: continue cleaned_answer_text = "".join( tokenizer.basic_tokenizer.tokenize(orig_answer_text)) ori_start_position = "".join(doc_tokens).find( cleaned_answer_text) if ori_start_position == -1: print("Could not find answer: '%s' vs. '%s'", ''.join(doc_tokens), cleaned_answer_text) continue ori_end_position = ori_start_position + len( cleaned_answer_text) - 1 char_to_word_offset = {} start = 0 for idx, token in enumerate(doc_tokens): for _ in token: char_to_word_offset[start] = idx start += 1 start_position = char_to_word_offset[ori_start_position] try: end_position = char_to_word_offset[ori_end_position] except KeyError: continue # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = "".join( doc_tokens[start_position:(end_position + 1)]) if actual_text != cleaned_answer_text: # print("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue orig_answer_text = cleaned_answer_text example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
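# Assumed-minimal illustration of the char_to_word_offset dictionary built in
# the function above: each character position in the concatenated doc_tokens
# maps back to its token index, so a substring match over the joined text can
# be turned into token-level start/end positions. The tokens are made up.
doc_tokens = ["北京", "是", "中国", "的", "首都"]   # pretend basic-tokenizer output
char_to_word_offset, start = {}, 0
for idx, token in enumerate(doc_tokens):
    for _ in token:
        char_to_word_offset[start] = idx
        start += 1

answer = "中国"
ori_start = "".join(doc_tokens).find(answer)        # character offset 3
ori_end = ori_start + len(answer) - 1
assert (char_to_word_offset[ori_start], char_to_word_offset[ori_end]) == (2, 2)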
def read_squad_examples(input_file, is_training, context_only=False, question_only=False,
                        draft=False, draft_num_examples=12, tokenizer=None):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for doc_idx, entry in enumerate(input_data):
        title = entry['title']
        for pid, paragraph in enumerate(entry["paragraphs"]):
            if not question_only:
                paragraph_text = paragraph["context"]
                doc_tokens, char_to_word_offset = context_to_tokens_and_offset(
                    paragraph_text, tokenizer=tokenizer)
            if context_only:
                example = SquadExample(doc_tokens=doc_tokens,
                                       title=title,
                                       doc_idx=doc_idx,
                                       pid=pid)
                examples.append(example)
                if draft and len(examples) == draft_num_examples:
                    return examples
                continue
            else:
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    if is_training:
                        if False:  # len(qa["answers"]) > 1:
                            raise ValueError(
                                "For training, each question should have exactly 1 answer.")
                        elif len(qa["answers"]) == 0:
                            orig_answer_text = ""
                            start_position = -1
                            end_position = -1
                        else:
                            answer = qa["answers"][0]
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset + answer_length - 1]
                            # Only add answers where the text can be exactly recovered from the
                            # document. If this CAN'T happen it's likely due to weird Unicode
                            # stuff so we will just skip the example.
                            #
                            # Note that this means for training mode, every example is NOT
                            # guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                logger.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                                continue
                    if question_only:
                        example = SquadExample(qas_id=qas_id,
                                               question_text=question_text)
                    else:
                        example = SquadExample(qas_id=qas_id,
                                               question_text=question_text,
                                               doc_tokens=doc_tokens,
                                               orig_answer_text=orig_answer_text,
                                               start_position=start_position,
                                               end_position=end_position,
                                               title=title,
                                               pid=pid)
                    examples.append(example)
                    if draft and len(examples) == draft_num_examples:
                        return examples
    return examples
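# --- Illustration (not from the original code) ------------------------------
# Every training-time reader here ends with the same sanity check: rebuild the
# answer text from the annotated token span and make sure the whitespace-
# normalized gold answer can be found in it, otherwise skip the example. A
# self-contained sketch of that check with made-up data; str.split() stands in
# for tokenization.whitespace_tokenize.
doc_tokens = ["The", "Eiffel", "Tower", "is", "in", "Paris", "."]
orig_answer_text = "Eiffel  Tower"   # note the doubled space
start_position, end_position = 1, 2

actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(orig_answer_text.split())

if actual_text.find(cleaned_answer_text) == -1:
    print("Could not find answer: '%s' vs. '%s'" % (actual_text, cleaned_answer_text))
else:
    print("answer span verified: '%s'" % actual_text)   # -> answer span verified: 'Eiffel Tower'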
def read_doqa_examples(input_file, is_training):
    """Read a DoQA json file into a list of DOQAExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    if FLAGS.load_small_portion:
        input_data = input_data[:10]
        # print('input_data:', input_data)
        tf.logging.warning('<<<<<<<<<< load_small_portion is on! >>>>>>>>>>')
    for entry in input_data:
        # An additional "CANNOTANSWER" has been added in DoQA data, so no need to append one.
        entry = entry['paragraphs'][0]
        paragraph_text = entry["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        ############################################################
        # convert the conversational QAs to SQuAD format, with history
        ############################################################
        questions = [(item['question'], item['id']) for item in entry['qas']]  # [(question, question_id), ...]
        answers = [(item['orig_answer']['text'], item['orig_answer']['answer_start'])
                   for item in entry['qas']]
        followups = [item['followup'] for item in entry['qas']]
        yesnos = [item['yesno'] for item in entry['qas']]

        qas = []
        for i, (question, answer, followup, yesno) in enumerate(
                zip(questions, answers, followups, yesnos)):
            metadata = {
                'turn': i + 1,
                'history_turns': [],
                'tok_history_answer_markers': [],
                'followup': followup,
                'yesno': yesno,
                'history_turns_text': []
            }
            # if FLAGS.use_RL:
            #     start_index = 0
            # else:
            #     start_index = 0 if i - int(FLAGS.history) < 0 else i - int(FLAGS.history)
            end_index = i
            question_with_histories = ''
            history_answer_marker = None

            # We read all the histories whether we use RL or not; appropriate
            # selections are made afterwards.
            start_index = 0
            history_answer_marker = []
            for history_turn, (each_answer, each_question) in enumerate(
                    zip(answers[start_index:end_index], questions[start_index:end_index])):
                # [history_answer_start, history_answer_end, history_answer_text]
                each_marker = [each_answer[1],
                               each_answer[1] + len(each_answer[0]),
                               each_answer[0]]
                history_answer_marker.append(each_marker)
                metadata['history_turns'].append(history_turn + start_index + 1)
                metadata['history_turns_text'].append(
                    (each_question[0], each_answer[0]))  # [(q1, a1), (q2, a2), ...]

            # add the current question
            question_with_histories += question[0]
            qas.append({
                'id': question[1],
                'question': question_with_histories,
                'answers': [{'answer_start': answer[1], 'text': answer[0]}],
                'history_answer_marker': history_answer_marker,
                'metadata': metadata
            })

        for qa in qas:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            # if is_training:
            # We read in the ground-truth answer both during training and predicting,
            # because we need to compute acc and f1 at predicting time.
            if len(qa["answers"]) != 1:
                raise ValueError(
                    "For training, each question should have exactly 1 answer.")
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(orig_answer_text))
            if is_training and actual_text.find(cleaned_answer_text) == -1:
                tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                continue

            # We construct a tok_history_answer_marker to store the aggregated history
            # answer markers for a question, and each_tok_history_answer_marker to
            # store a single history answer marker.
            tok_history_answer_marker = [0] * len(doc_tokens)
            for marker_index, marker in enumerate(qa['history_answer_marker']):
                each_tok_history_answer_marker = [0] * len(doc_tokens)
                history_orig_answer_text = marker[2]
                history_answer_offset = marker[0]
                history_answer_length = len(history_orig_answer_text)
                history_start_position = char_to_word_offset[history_answer_offset]
                history_end_position = char_to_word_offset[
                    history_answer_offset + history_answer_length - 1]
                history_actual_text = " ".join(
                    doc_tokens[history_start_position:(history_end_position + 1)])
                history_cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(history_orig_answer_text))
                if history_actual_text.find(history_cleaned_answer_text) != -1:
                    tok_history_answer_marker = tok_history_answer_marker[:history_start_position] + \
                        [1] * (history_end_position - history_start_position + 1) + \
                        tok_history_answer_marker[history_end_position + 1:]
                    each_tok_history_answer_marker = each_tok_history_answer_marker[:history_start_position] + \
                        [1] * (history_end_position - history_start_position + 1) + \
                        each_tok_history_answer_marker[history_end_position + 1:]
                    assert len(tok_history_answer_marker) == len(doc_tokens)
                    assert len(each_tok_history_answer_marker) == len(doc_tokens)
                    qa['metadata']['tok_history_answer_markers'].append(
                        each_tok_history_answer_marker)
                else:
                    tf.logging.warning("Could not find history answer: '%s' vs. '%s'",
                                       history_actual_text, history_cleaned_answer_text)

            example = DOQAExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                history_answer_marker=tok_history_answer_marker,
                metadata=qa['metadata'])
            examples.append(example)
            # print(example)
    return examples
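# --- Illustration (not from the original code) ------------------------------
# The DoQA reader keeps two kinds of token-level history markers: one vector
# per history turn (stored in metadata['tok_history_answer_markers']) and one
# aggregated vector over all turns (tok_history_answer_marker). A minimal
# sketch of that aggregation with invented token spans, skipping the
# character-offset lookup used above.
num_doc_tokens = 8
history_spans = [(1, 2), (5, 5)]   # (start_token, end_token) per history turn

tok_history_answer_marker = [0] * num_doc_tokens
per_turn_markers = []
for start, end in history_spans:
    each_marker = [0] * num_doc_tokens
    each_marker[start:end + 1] = [1] * (end - start + 1)
    per_turn_markers.append(each_marker)
    # OR the turn marker into the aggregated marker.
    tok_history_answer_marker = [a | b for a, b in zip(tok_history_answer_marker, each_marker)]

print(tok_history_answer_marker)   # -> [0, 1, 1, 0, 0, 1, 0, 0]
print(per_turn_markers[1])         # -> [0, 0, 0, 0, 0, 1, 0, 0]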
def read_docvqa_examples(input_file, is_training, skip_match_answers=True):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as reader:
        input_data = json.load(reader)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    count_match = 0
    count_nomatch = 0
    examples = []
    for paragraph in input_data:
        image_id = paragraph["image_id"]
        paragraph_text = paragraph["context"]
        boxes = paragraph["boxes"]
        doc_tokens = paragraph["context"]
        for qa in paragraph["qas"]:
            if not qa["answer"]:
                continue
            qas_id = qa["qid"]
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            answer = qa["answer"][0]
            orig_answer_text = answer["text"]
            if is_training:
                if not is_impossible:
                    answer = qa["answer"][0]
                    orig_answer_text = answer["text"]
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    start_position = qa["answer"][0]["answer_start"]
                    end_position = qa["answer"][0]["answer_end"]
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                    if not skip_match_answers:
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                            count_nomatch += 1
                            continue
                    count_match += 1
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
            example = DocvqaExample(qas_id=qas_id,
                                    question_text=question_text,
                                    doc_tokens=doc_tokens,
                                    orig_answer_text=orig_answer_text,
                                    start_position=start_position,
                                    end_position=end_position,
                                    is_impossible=is_impossible,
                                    boxes=boxes)
            examples.append(example)
    return examples
def read_kg_examples(input_file, is_training):
    """Read a knowledge graph json file into a list of KGCExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]  # [Memo] not entirely sure, but this extracts the data

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for data in entry["data"]:
            # Convert the string into a sequence of words.
            nlr_text = data["nlr"]
            kgr = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in nlr_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        kgr.append(c)
                    else:
                        kgr[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(kgr) - 1)

            # Convert the input data into KGCExample instances. The QA pairs are
            # assumed to live on the same record as "nlr" (the original looped over
            # an undefined `paragraph`).
            for qa in data["qas"]:
                kg_id = qa["id"]
                nlr = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if FLAGS.version_2_with_negative:
                        # SQuAD-2.0-style data: some questions are unanswerable
                        # (is_impossible), e.g. when there is no answer for the question.
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(kgr[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                # Convert the data gathered so far into a KGCExample instance.
                example = KGCExample(
                    kg_id=kg_id,
                    nlr=nlr,
                    kgr=kgr,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples