# Reader-class method; assumes module-level imports of `json`, `collections.namedtuple`,
# and the project-local `tokenization` module.
def _read_json(self, input_file, is_training):
    """Read a SQuAD-style JSON file into a list of Example records."""
    examples = []
    with open(input_file, "r") as f:
        input_data = json.load(f)["data"]

    # Lightweight record describing one question/context pair.
    Example = namedtuple('Example', [
        'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
        'start_position', 'end_position'
    ])

    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_pos = None
                end_pos = None
                orig_answer_text = None
                if is_training:
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    # Split the context into [before answer, answer, after answer],
                    # so the answer span is always the token at index 1.
                    doc_tokens = [
                        paragraph_text[:answer_offset],
                        paragraph_text[answer_offset:answer_offset + answer_length],
                        paragraph_text[answer_offset + answer_length:]
                    ]
                    start_pos = 1
                    end_pos = 1
                    actual_text = " ".join(doc_tokens[start_pos:(end_pos + 1)])
                    if actual_text.find(orig_answer_text) == -1:
                        print("Could not find answer: '%s' vs. '%s'" %
                              (actual_text, orig_answer_text))
                        continue
                else:
                    # No gold answer at inference time; tokenize the whole context
                    # character by character (Chinese text).
                    doc_tokens = tokenization.tokenize_chinese_chars(paragraph_text)

                example = Example(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_pos,
                    end_position=end_pos)
                examples.append(example)
    return examples
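# A minimal, self-contained sketch of the JSON layout that _read_json above expects.
# The field names mirror the lookups in the parser; the concrete context, question,
# and the "tiny_squad.json" output path are illustrative assumptions, not project files.
import json

_TINY_SQUAD = {
    "data": [{
        "paragraphs": [{
            "context": "Beijing is the capital of China.",
            "qas": [{
                "id": "demo-0",
                "question": "What is the capital of China?",
                # answer_start is a character offset into "context"
                "answers": [{"text": "Beijing", "answer_start": 0}],
            }],
        }],
    }],
}

if __name__ == "__main__":
    # Write a one-example file that _read_json(..., is_training=True) can parse.
    with open("tiny_squad.json", "w") as f:
        json.dump(_TINY_SQUAD, f, ensure_ascii=False)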
# Assumes module-level `io`, `json`, the project-local `tokenization` module,
# and the SquadExample class are available.
def read_squad_examples(input_file, is_training, version_2_with_negative=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with io.open(input_file, "r", encoding="utf8") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Original English whitespace tokenization, kept commented out because
            # Chinese contexts are handled per character below.
            # doc_tokens = []
            # char_to_word_offset = []
            # prev_is_whitespace = True
            # for c in paragraph_text:
            #     if is_whitespace(c):
            #         prev_is_whitespace = True
            #     else:
            #         if prev_is_whitespace:
            #             doc_tokens.append(c)
            #         else:
            #             doc_tokens[-1] += c
            #         prev_is_whitespace = False
            #     char_to_word_offset.append(len(doc_tokens) - 1)
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_pos = None
                end_pos = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        # Split the context into [before answer, answer, after answer],
                        # so the answer span is always the token at index 1.
                        doc_tokens = [
                            paragraph_text[:answer_offset],
                            paragraph_text[answer_offset:answer_offset + answer_length],
                            paragraph_text[answer_offset + answer_length:]
                        ]
                        start_pos = 1
                        end_pos = 1
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_pos:(end_pos + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            print("Could not find answer: '%s' vs. '%s'" %
                                  (actual_text, cleaned_answer_text))
                            continue
                    else:
                        # Unanswerable question: mark the span as absent. The context is
                        # still character-tokenized so doc_tokens is defined here too
                        # (otherwise it would be stale or undefined for this example).
                        start_pos = -1
                        end_pos = -1
                        orig_answer_text = ""
                        doc_tokens = tokenization.tokenize_chinese_chars(paragraph_text)
                else:
                    doc_tokens = tokenization.tokenize_chinese_chars(paragraph_text)

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_pos,
                    end_position=end_pos,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
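# A minimal usage sketch for read_squad_examples. The path below is illustrative
# (e.g. the tiny file written above); it assumes the project-local `tokenization`
# module and the SquadExample class are importable in this module.
#
#     train_examples = read_squad_examples("tiny_squad.json", is_training=True)
#     print(len(train_examples))
#     print(train_examples[0].question_text, "->", train_examples[0].orig_answer_text)
#
# With is_training=False the gold answers are ignored and doc_tokens holds the
# character-tokenized context instead of the three answer-anchored slices.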
def abc(test_examples=None, init_check=''):
    # Relies on module-level objects set up by the surrounding script
    # (`args`, `exe`, `startup_prog`, `test_pyreader`, `test_prog`,
    # `test_graph_vars`, `evaluate`, `init_checkpoint`, `init_pretraining_params`).
    s_t = str(time.localtime())  # timestamp string, used to build unique example ids
    if init_check != '':
        args.init_checkpoint = init_check
        args.ernie_config_path = 'config/ernie_config1.json'
    if not test_examples:  # default changed from a mutable [] to None; check is equivalent
        # Passing examples in directly used to raise exceptions and could not be used;
        # the abandoned code is kept below for reference and is never executed
        # (`ii`, `que`, `para` were variables of that abandoned version and are undefined here).
        while True:
            break
            Example = namedtuple('Example', [
                'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
                'start_position', 'end_position'
            ])
            example = Example(
                qas_id=s_t + str(ii),
                question_text=que,
                doc_tokens=tokenization.tokenize_chinese_chars(para),
                orig_answer_text=None,
                start_position=None,
                end_position=None)
            test_examples.append(example)
            break

    reader = task_reader.MRCReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    test_pyreader.decorate_tensor_provider(
        reader.data_generator(
            args.test_set,
            batch_size=args.batch_size,
            epoch=1,
            dev_count=1,
            shuffle=False,
            phase="test"))
    print(reader.get_examples("test"))
    mrc_result = evaluate(
        exe,
        test_prog,
        test_pyreader,
        test_graph_vars,
        "test",
        examples=reader.get_examples("test"),
        features=reader.get_features("test"),
        # examples=test_examples,
        # features=reader._convert_example_to_feature(
        #     examples=test_examples, max_seq_length=512,
        #     tokenizer=tokenization.FullTokenizer(
        #         vocab_file='config/vocab.txt', do_lower_case=True),
        #     is_training=False),
        args=args)
    print('abc:mrc:return', len(mrc_result[1]))
    return mrc_result
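# A usage sketch for abc (illustrative only). The checkpoint path is an assumption,
# and abc depends on module-level objects (`args`, `exe`, `startup_prog`,
# `test_pyreader`, `test_prog`, `test_graph_vars`, `evaluate`) being initialized
# by the surrounding PaddlePaddle training/inference script before it is called.
#
#     mrc_result = abc(init_check="checkpoints/step_10000")
#     print(len(mrc_result[1]))  # same count that abc itself logs before returning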