def origin_events_process4test():
    """Turn raw test JSON lines into placeholder event records.

    Input lines look like:
    {"text": "国际组织IEEE限制华为后,北大教授宣布退出IEEE编委会",
     "id": "17388c65ef9d77aff12d8cc1f645e41c"}
    Each one becomes a record with empty trigger/event-type fields and a
    single empty argument, written (shuffled) to <save_dir>/test.json.
    Paths come from sys.argv[2] (input) and sys.argv[3] (output dir).
    """
    origin_events_path = sys.argv[2]
    save_dir = sys.argv[3]
    if not origin_events_path or not save_dir:
        raise Exception("set origin_events_path and save_dir first")
    lines = utils.read_by_lines(origin_events_path)
    output = []
    for raw in lines:
        record = json.loads(raw)
        # Placeholder event: one empty argument so downstream consumers
        # always see a non-empty "arguments" list.
        placeholder = {
            "trigger": "",
            "trigger_start_index": 0,
            "class": "",
            "event_type": "",
            "arguments": [{
                "argument_start_index": 0,
                "role": "",
                "argument": "",
                "alias": [],
            }],
            "event_id": u"{}_{}".format(record["id"], "no_event"),
            "text": record["text"],
            "id": record["id"],
        }
        output.append(json.dumps(placeholder, ensure_ascii=False))
    random.shuffle(output)  # randomize sample order
    print(u"include sentences {}, events {}, test datas {}".format(
        len(lines), len(output), len(output)))
    utils.write_by_lines(u"{}/test.json".format(save_dir), output)
def test_data_2_eval(test_file_path, save_path):
    """Regroup flat per-event test records into eval-format records.

    Every input line is one event; events sharing the same text are merged
    under a single record {"id": md5(text), "text": ..., "event_list": [...]}
    and the merged records are written to save_path, one JSON per line.
    """
    if not test_file_path or not save_path:
        raise Exception("must set test_data_path and save_path")
    datas = utils.read_by_lines(test_file_path)
    all_events = {}
    for raw in datas:
        record = json.loads(raw)
        sentence = record["text"]
        # md5 of the sentence is the grouping key / output id
        key = utils.cal_md5(sentence.encode("utf-8"))
        if key not in all_events:
            all_events[key] = {
                u"id": key,
                u"text": sentence,
                u"event_list": []
            }
        all_events[key][u"event_list"].append({
            "trigger": record["trigger"],
            "trigger_start_index": record["trigger_start_index"],
            "event_type": record["event_type"],
            "class": record["class"],
            "arguments": record["arguments"],
        })
    outputs = [json.dumps(v, ensure_ascii=False) for v in all_events.values()]
    utils.write_by_lines(save_path, outputs)
    print(u"test data 2 eval data, inputs {} outputs {}".format(
        len(datas), len(outputs)))
def predict_trigger(trigger_dict):
    """Predict event triggers and write them to args.trigger_pred_save_path.

    trigger_dict bundles everything built by the caller: parsed args, the
    paddle programs / reader / graph vars, and optionally an already
    initialized executor under key 'exe'.  Any stale prediction file is
    removed before predicting.
    """
    log = trigger_dict['log']
    args = trigger_dict['args']
    # The following entries are looked up but not used directly below; the
    # lookups double as a sanity check that the caller supplied them.
    labels_map = trigger_dict['labels_map']
    ernie_config = trigger_dict['ernie_config']
    place = trigger_dict['place']
    dev_count = trigger_dict['dev_count']
    reader = trigger_dict['reader']
    startup_prog = trigger_dict['startup_prog']
    test_prog = trigger_dict['test_prog']
    test_pyreader = trigger_dict['test_pyreader']
    graph_vars = trigger_dict['graph_vars']
    nccl2_num_trainers = trigger_dict['nccl2_num_trainers']
    nccl2_trainer_id = trigger_dict['nccl2_trainer_id']
    # Executor may be handed in by the caller; None means "build one here".
    exe = trigger_dict.get('exe')

    trigger_pred_save_path = args.trigger_pred_save_path
    if os.path.exists(trigger_pred_save_path):
        # remove stale predictions so the new run starts from a clean file
        print('delete old file : %s' % trigger_pred_save_path)
        os.remove(trigger_pred_save_path)

    if args.do_test:
        # TODO 2020-11-24
        # TODO 2021-01-20: force a fresh executor for a test run.
        # BUGFIX: the original code did `del exe` here, making the
        # `if not exe:` below raise NameError whenever do_test was set;
        # resetting to None keeps the intent without the crash.
        exe = None

    if not exe:
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        if args.do_val or args.do_test:
            if not args.init_checkpoint:
                raise ValueError("args 'init_checkpoint' should be set if "
                                 "only doing validation or testing!")
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)

    log.error(
        "********************** trigger predict begin **********************"
    )
    test_ret = predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                               graph_vars, 1, 'final')
    utils.write_by_lines(args.trigger_pred_save_path, test_ret)
    del exe
def schema_role_process():
    """Build a BIO label file for argument roles from the event schema.

    Reads the schema at sys.argv[2], collects every distinct role name,
    numbers its B-/I- tags consecutively plus a trailing O tag, and writes
    "<label>\t<id>" lines to sys.argv[3].
    """
    schema_path = sys.argv[2]
    save_path = sys.argv[3]
    if not schema_path or not save_path:
        raise Exception("set schema_path and save_path first")
    roles = set()
    for line in utils.read_by_lines(schema_path):
        for role in json.loads(line)["role_list"]:
            roles.add(role["role"])
    outputs = []
    # each role consumes two consecutive ids: B-<role>, then I-<role>
    for pos, role_name in enumerate(roles):
        outputs.append(u"B-{}\t{}".format(role_name, 2 * pos))
        outputs.append(u"I-{}\t{}".format(role_name, 2 * pos + 1))
    outputs.append(u"O\t{}".format(2 * len(roles)))
    print(u"include roles {},create label {}".format(len(roles), len(outputs)))
    utils.write_by_lines(save_path, outputs)
def origin_events_process():
    """Flatten annotated sentences into one record per event and split them.

    Each input line holds a sentence with an "event_list"; every event is
    augmented with event_id/text/id and serialized on its own line.  After
    shuffling, 80% goes to train.json and the remaining 20% is written to
    BOTH dev.json and test.json.  Paths come from sys.argv[2]/sys.argv[3].
    """
    origin_events_path = sys.argv[2]
    save_dir = sys.argv[3]
    if not origin_events_path or not save_dir:
        raise Exception("set origin_events_path and save_dir first")
    lines = utils.read_by_lines(origin_events_path)
    output = []
    for raw in lines:
        record = json.loads(raw)
        for event in record["event_list"]:
            event["event_id"] = u"{}_{}".format(record["id"],
                                                event["trigger"])
            event["text"] = record["text"]
            event["id"] = record["id"]
            output.append(json.dumps(event, ensure_ascii=False))
    random.shuffle(output)  # randomize before splitting
    # 80/20 split; the 20% tail doubles as dev and test
    split_at = int(len(output) * 0.8)
    train_data, test_data = output[:split_at], output[split_at:]
    print(
        u"include sentences {}, events {}, train datas {}, dev datas {}, test datas {}"
        .format(len(lines), len(output), len(train_data), len(test_data),
                len(test_data)))
    utils.write_by_lines(u"{}/train.json".format(save_dir), train_data)
    utils.write_by_lines(u"{}/dev.json".format(save_dir), test_data)
    utils.write_by_lines(u"{}/test.json".format(save_dir), test_data)
def schema_event_type_process():
    """Build a BIO label file for event types from the event schema.

    Reads the schema at sys.argv[2], collects every distinct event type,
    numbers its B-/I- tags consecutively plus a trailing O tag, and writes
    "<label>\t<id>" lines to sys.argv[3].
    """
    schema_path = sys.argv[2]
    save_path = sys.argv[3]
    if not schema_path or not save_path:
        raise Exception("set schema_path and save_path first")
    event_types = set()
    for line in utils.read_by_lines(schema_path):
        event_types.add(json.loads(line)["event_type"])
    labels = []
    # two consecutive ids per event type: B-<type>, then I-<type>
    for pos, etype in enumerate(event_types):
        labels.append(u"B-{}\t{}".format(etype, pos * 2))
        labels.append(u"I-{}\t{}".format(etype, pos * 2 + 1))
    labels.append(u"O\t{}".format(len(event_types) * 2))
    print(u"include event type {}, create label {}".format(
        len(event_types), len(labels)))
    utils.write_by_lines(save_path, labels)
def predict_data_2_eval(pred_trigger_path, pred_role_path, schema_path,
                        save_path):
    """pred_process_with_golden_type

    Merge trigger predictions, role predictions and the event schema into
    eval-format records ({"id", "text", "event_list"}) and write them to
    save_path, one JSON per line.  Role predictions are joined to a
    trigger record via "event_id", and only roles the schema allows for
    the predicted event type are kept as arguments.
    """
    # pred_trigger_path = sys.argv[2]
    # pred_role_path = sys.argv[3]
    # schema_path = sys.argv[4]
    # save_path = sys.argv[5]
    if not pred_trigger_path or not pred_role_path or not schema_path or not save_path:
        raise Exception(
            "must set pred_trigger_path and pred_role_path and schema_path and save_path"
        )
    print(u"predict data 2 eval data start")
    # event_id -> trigger prediction record
    trigger_data_list = utils.read_by_lines(pred_trigger_path)
    trigger_datas = {}
    for d in trigger_data_list:
        d_json = json.loads(d)
        trigger_datas[d_json["event_id"]] = d_json
    print(u"load trigger predict datas {} from {}".format(
        len(trigger_datas), pred_trigger_path))
    # event_id -> role prediction record
    role_data_list = utils.read_by_lines(pred_role_path)
    role_datas = {}
    for d in role_data_list:
        d_json = json.loads(d)
        role_datas[d_json["event_id"]] = d_json
    print(u"load role predict datas {} from {}".format(len(role_datas),
                                                       pred_role_path))
    # event_type -> list of role names the schema allows for that type
    schema_data_list = utils.read_by_lines(schema_path)
    schema_datas = {}
    for d in schema_data_list:
        d_json = json.loads(d)
        schema_datas[d_json["event_type"]] = [
            r["role"] for r in d_json["role_list"]
        ]
    print(u"load schema datas {} from {}".format(len(schema_data_list),
                                                 schema_path))
    all_events = {}  # md5(text) -> merged eval record
    for t_json in trigger_datas.values():
        text = t_json["sentence"]
        _id = utils.cal_md5(text.encode("utf-8"))
        # keep only the first trigger seen for each event type
        exist_event_type = set()
        for tri_info in t_json["trigger_ret"]:
            event_type = tri_info["event_type"]
            if event_type in exist_event_type:
                continue
            trigger = tri_info["text"]
            role_type_set = set(schema_datas[event_type])
            # NOTE(review): raises KeyError if a trigger's event_id has no
            # matching role prediction — presumably guaranteed upstream;
            # confirm against the producer of pred_role_path.
            r_json = role_datas[t_json["event_id"]]
            arguments = []
            for p_r in r_json["roles_ret"]:
                role_type = p_r["role_type"]
                # drop predicted roles the schema does not allow
                if role_type in role_type_set:
                    arguments.append({
                        u"role": role_type,
                        u"argument": p_r["text"]
                    })
            # only emit an event when it has at least one argument
            if len(arguments) > 0:
                event = {
                    u"trigger": trigger,
                    u"event_type": event_type,
                    u"arguments": arguments
                }
                if _id not in all_events:
                    all_events[_id] = {
                        u"id": _id,
                        u"text": text,
                        u"event_list": []
                    }
                all_events[_id][u"event_list"].append(event)
            exist_event_type.add(event_type)
    outputs = [json.dumps(x, ensure_ascii=False) for x in all_events.values()]
    utils.write_by_lines(save_path, outputs)
    print(u"predict data 2 eval data is finished, outputs {}".format(
        len(outputs)))
def main(args):
    """main

    Train/evaluate/predict entry point for the role sequence-labeling task.
    Builds the reader and paddle programs, optionally trains with periodic
    checkpointing and validation, then runs final dev evaluation and/or
    test prediction.  Relies on module-level globals (labels_map,
    ernie_config, dev_count, place, and the create_model/optimization/
    init_* / *_wrapper helpers) defined elsewhere in this file.
    """
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    # at least one phase must be requested
    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # in_tokens=True means batch_size counts tokens, not examples,
        # so derive the per-device step count accordingly
        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        # build the training program and its optimizer
        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        # separate inference program sharing parameters via startup_prog
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)
        test_prog = test_prog.clone(for_test=True)

    # single-trainer setup (no NCCL2 distributed training here)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # parameter initialization: checkpoint wins over pretraining params
    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # NOTE(review): message renders as "set ifonly doing" — the
            # adjacent literals are missing a separating space
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # shares parameters with the training executor when training
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()

        # training loop: runs until the pyreader raises EOFException
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    # non-logging step: run without fetching metrics
                    train_exe.run(fetch_list=[])
                else:
                    # logging step: fetch counters and compute P/R/F1
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]
                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        print(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        u"【train】epoch: {}, step: {}, loss: {:.6f}, "
                        "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s"
                        .format(current_epoch, steps, float(loss), float(f1),
                                float(precision), float(recall),
                                args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
                    # evaluate test set
                    # NOTE(review): uses the same reader/pyreader as the dev
                    # branch — presumably dev and test share data here
                    if args.do_test:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
            except fluid.core.EOFException:
                # data exhausted: save the final model and stop training
                save_path = os.path.join(args.checkpoints, "final_model")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog,
                                                 test_pyreader, graph_vars, 1,
                                                 'final')
        print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format(
            float(precision), float(recall), float(f1)))

    # final prediction on test set
    # NOTE(review): predict_wrapper is called here without a leading `args`
    # argument, unlike the call in predict_trigger — confirm which
    # signature predict_wrapper actually has.
    if args.do_test:
        test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)