def init_data_loader(self, config, query_map_file):
    vocab_file_path = os.path.join(config.get("bert_pretrained_model_path"),
                                   config.get("vocab_file"))
    slot_file = os.path.join(event_config.get("slot_list_root_path"),
                             event_config.get("bert_slot_complete_file_name_role"))
    schema_file = os.path.join(event_config.get("data_dir"),
                               event_config.get("event_schema"))
    # query_map_file = os.path.join(event_config.get(
    #     "slot_list_root_path"), event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file,
                                      schema_file, query_map_file)
    return data_loader
def gen_type_classification_data():
    """
    generate event type classification data for index_type_fold_data_{}
    """
    # bert vocab file path
    # e.g. chinese_roberta_wwm_ext_L-12_H-1024_A-12_large/vocab.txt
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"),
                                   event_config.get("vocab_file"))
    # bert config file path
    bert_config_file = os.path.join(event_config.get("bert_pretrained_model_path"),
                                    event_config.get("bert_config_path"))
    # event type list file path
    # e.g. slot_pattern/vocab_all_event_type_label_map.txt
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    data_loader = EventTypeClassificationPrepare(vocab_file_path, 512,
                                                 event_type_file)
    # train file, e.g. data/train.json
    train_file = os.path.join(event_config.get("data_dir"),
                              event_config.get("event_data_file_train"))
    # eval file, e.g. data/dev.json
    eval_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_eval"))
    data_loader.k_fold_split_data(train_file, eval_file, True)
import os

import numpy as np

from data_processing.event_prepare_data import EventTypeClassificationPrepare, EventRolePrepareMRC
from configs.event_config import event_config

if __name__ == "__main__":
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"),
                                   event_config.get("vocab_file"))
    # bert_config_file = os.path.join(event_config.get("bert_pretrained_model_path"),
    #                                 event_config.get("bert_config_path"))
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    # Event type classification data can be generated instead via:
    # data_loader = EventTypeClassificationPrepare(vocab_file_path, 512, event_type_file)
    # followed by data_loader._read_json_file(train_file, eval_file, is_train=True).
    slot_file = os.path.join(event_config.get("slot_list_root_path"),
                             event_config.get("bert_slot_complete_file_name_role"))
    schema_file = os.path.join(event_config.get("data_dir"),
                               event_config.get("event_schema"))
    query_map_file = os.path.join(event_config.get("slot_list_root_path"),
                                  event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file,
                                      schema_file, query_map_file)
    train_file = os.path.join(event_config.get("data_dir"),
                              event_config.get("event_data_file_train"))
    eval_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_eval"))
    # Alternative in-memory loading paths, each returning
    # data_list, label_start_list, label_end_list, query_len_list, token_type_id_list:
    #   data_loader._read_json_file(train_file, eval_file, True)
    #   data_loader._read_json_file(eval_file, None, False)
    #   data_loader._merge_ee_and_re_datas(train_file, eval_file,
    #       "relation_extraction/data/train_data.json",
    #       "relation_extraction/data/dev_data.json")
    data_loader.k_fold_split_data(train_file, eval_file, True)
    # Sanity check on the dumped fold data:
    # train_query_lens = np.load("data/fold_data_{}/query_lens_train.npy".format(0), allow_pickle=True)
    # print(train_query_lens[0])
    # re_train_file = "relation_extraction/data/train_data.json"
def run_event_verify_role_mrc(args):
    """
    Second-stage (intensive-reading) module of the retro-reader: jointly trains
    two tasks, role extraction and answerability (whether the query can be answered).
    :param args:
    :return:
    """
    model_base_dir = event_config.get(args.model_checkpoint_dir).format(args.fold_index)
    pb_model_dir = event_config.get(args.model_pb_dir).format(args.fold_index)
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"),
                                   event_config.get("vocab_file"))
    bert_config_file = os.path.join(event_config.get("bert_pretrained_model_path"),
                                    event_config.get("bert_config_path"))
    slot_file = os.path.join(event_config.get("slot_list_root_path"),
                             event_config.get("bert_slot_complete_file_name_role"))
    schema_file = os.path.join(event_config.get("data_dir"),
                               event_config.get("event_schema"))
    query_map_file = os.path.join(event_config.get("slot_list_root_path"),
                                  event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file,
                                      schema_file, query_map_file)
    # Alternative: build the datasets on the fly with data_loader._read_json_file /
    # data_loader._merge_ee_and_re_datas instead of the cached .npy fold files below.
    train_has_answer_label_list = []
    dev_has_answer_label_list = []
    train_datas = np.load(
        "data/verify_neg_fold_data_{}/token_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    # train_has_answer_label_list = np.load(
    #     "data/verify_neg_fold_data_{}/has_answer_train.npy".format(args.fold_index),
    #     allow_pickle=True)
    train_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_datas = np.load(
        "data/verify_neg_fold_data_{}/token_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    # dev_has_answer_label_list = np.load(
    #     "data/verify_neg_fold_data_{}/has_answer_dev.npy".format(args.fold_index),
    #     allow_pickle=True)
    dev_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    train_query_lens = np.load(
        "data/verify_neg_fold_data_{}/query_lens_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_query_lens = np.load(
        "data/verify_neg_fold_data_{}/query_lens_dev.npy".format(args.fold_index),
        allow_pickle=True)
    train_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_dev.npy".format(args.fold_index),
        allow_pickle=True)
    train_end_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_end_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_end_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_end_dev.npy".format(args.fold_index),
        allow_pickle=True)
    # A sample is answerable iff at least one start label is set.
    train_samples_nums = len(train_datas)
    for i in range(train_samples_nums):
        if sum(train_start_labels[i]) == 0:
            train_has_answer_label_list.append(0)
        else:
            train_has_answer_label_list.append(1)
    train_has_answer_label_list = np.array(train_has_answer_label_list).reshape(
        (train_samples_nums, 1))
    dev_samples_nums = len(dev_datas)
    for i in range(dev_samples_nums):
        if sum(dev_start_labels[i]) == 0:
            dev_has_answer_label_list.append(0)
        else:
            dev_has_answer_label_list.append(1)
    dev_has_answer_label_list = np.array(dev_has_answer_label_list).reshape(
        (dev_samples_nums, 1))
    if train_samples_nums % args.train_batch_size != 0:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size) + 1
    else:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size)
    # each_epoch_steps = int(data_loader.train_samples_nums / args.train_batch_size) + 1
    logger.info('*****train_set sample nums:{}'.format(train_samples_nums))
    logger.info('*****dev_set sample nums:{}'.format(dev_samples_nums))
    logger.info('*****train each epoch steps:{}'.format(each_epoch_steps))
    train_steps_nums = each_epoch_steps * args.epochs
    # train_steps_nums = each_epoch_steps * args.epochs // hvd.size()
    logger.info('*****train_total_steps:{}'.format(train_steps_nums))
    decay_steps = args.decay_epoch * each_epoch_steps
    logger.info('*****train decay steps:{}'.format(decay_steps))
    # dropout_prob is the dropout probability
    params = {
        "dropout_prob": args.dropout_prob,
        "num_labels": 2,
        "rnn_size": args.rnn_units,
        "num_layers": args.num_layers,
        "hidden_units": args.hidden_units,
        "decay_steps": decay_steps,
        "train_steps": train_steps_nums,
        "num_warmup_steps": int(train_steps_nums * 0.1)
    }
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.gpu_nums)
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(
        model_dir=model_base_dir,
        save_summary_steps=each_epoch_steps,
        save_checkpoints_steps=each_epoch_steps,
        session_config=config_tf,
        keep_checkpoint_max=3,
        # train_distribute=dist_strategy
    )
    bert_init_checkpoints = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_init_checkpoints"))
    # init_checkpoints = "output/model/merge_usingtype_roberta_traindev_event_role_bert_mrc_model_desmodified_lowercase/checkpoint/model.ckpt-1218868"
    model_fn = event_verify_mrc_model_fn_builder(bert_config_file,
                                                 bert_init_checkpoints, args)
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)
    if args.do_train:
        train_input_fn = lambda: event_input_verfify_mrc_fn(
            train_datas, train_start_labels, train_end_labels,
            train_token_type_id_list, train_query_lens,
            train_has_answer_label_list,
            is_training=True, is_testing=False, args=args)
        eval_input_fn = lambda: event_input_verfify_mrc_fn(
            dev_datas, dev_start_labels, dev_end_labels,
            dev_token_type_id_list, dev_query_lens,
            dev_has_answer_label_list,
            is_training=False, is_testing=False, args=args)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps_nums)
        exporter = tf.estimator.BestExporter(
            exports_to_keep=1,
            serving_input_receiver_fn=bert_mrc_serving_input_receiver_fn)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          exporters=[exporter],
                                          throttle_secs=0)
        # for _ in range(args.epochs):
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        # "bert_ce_model_pb"
        estimator.export_saved_model(pb_model_dir,
                                     bert_mrc_serving_input_receiver_fn)
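# Illustration of how the has-answer labels above are derived: a (query, text)
# pair is labelled answerable exactly when its start-label sequence contains at
# least one positive position. This is a minimal numpy-only sketch; the helper
# name is hypothetical and not part of the original code, and it assumes the
# module-level `import numpy as np` used throughout this file.
def _has_answer_labels(start_label_seqs):
    """Map per-token start labels to a column vector of 0/1 answerability labels."""
    labels = [0 if sum(seq) == 0 else 1 for seq in start_label_seqs]
    return np.array(labels).reshape((len(labels), 1))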
def run_event_binclassification(args):
    """
    The EAV module of the retro-reader, i.e. the first-pass (sketchy-reading)
    module, which predicts whether a query is answerable.
    :param args:
    :return:
    """
    model_base_dir = event_config.get(args.model_checkpoint_dir).format(args.fold_index)
    pb_model_dir = event_config.get(args.model_pb_dir).format(args.fold_index)
    print(model_base_dir)
    print(pb_model_dir)
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"),
                                   event_config.get("vocab_file"))
    bert_config_file = os.path.join(event_config.get("bert_pretrained_model_path"),
                                    event_config.get("bert_config_path"))
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    # Alternative: build the datasets on the fly with
    # EventTypeClassificationPrepare(vocab_file_path, 512, event_type_file) and
    # data_loader._read_json_file(train_file, eval_file, is_train=True)
    # instead of the cached .npy fold files below.
    train_data_list = np.load(
        "data/verify_neg_fold_data_{}/token_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    # train_label_list = np.load(
    #     "data/verify_neg_fold_data_{}/has_answer_train.npy".format(args.fold_index),
    #     allow_pickle=True)
    train_label_list = []
    train_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_dev.npy".format(args.fold_index),
        allow_pickle=True)
    train_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_data_list = np.load(
        "data/verify_neg_fold_data_{}/token_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    # dev_label_list = np.load(
    #     "data/verify_neg_fold_data_{}/has_answer_dev.npy".format(args.fold_index),
    #     allow_pickle=True)
    dev_label_list = []
    dev_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    # dev_datas, dev_token_type_ids, dev_labels = data_loader._read_json_file(eval_file)
    # A sample is answerable iff at least one start label is set.
    train_samples_nums = len(train_data_list)
    for i in range(train_samples_nums):
        if sum(train_start_labels[i]) == 0:
            train_label_list.append(0)
        else:
            train_label_list.append(1)
    train_label_list = np.array(train_label_list).reshape((train_samples_nums, 1))
    dev_samples_nums = len(dev_data_list)
    for i in range(dev_samples_nums):
        if sum(dev_start_labels[i]) == 0:
            dev_label_list.append(0)
        else:
            dev_label_list.append(1)
    dev_label_list = np.array(dev_label_list).reshape((dev_samples_nums, 1))
    if train_samples_nums % args.train_batch_size != 0:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size) + 1
    else:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size)
    # each_epoch_steps = int(data_loader.train_samples_nums / args.train_batch_size) + 1
    logger.info('*****train_set sample nums:{}'.format(train_samples_nums))
    logger.info('*****train each epoch steps:{}'.format(each_epoch_steps))
    train_steps_nums = each_epoch_steps * args.epochs
    # train_steps_nums = each_epoch_steps * args.epochs // hvd.size()
    logger.info('*****train_total_steps:{}'.format(train_steps_nums))
    decay_steps = args.decay_epoch * each_epoch_steps
    logger.info('*****train decay steps:{}'.format(decay_steps))
    # dropout_prob is the dropout probability
    params = {
        "dropout_prob": args.dropout_prob,
        "num_labels": 1,
        "rnn_size": args.rnn_units,
        "num_layers": args.num_layers,
        "hidden_units": args.hidden_units,
        "decay_steps": decay_steps,
        "class_weight": 1
    }
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.gpu_nums)
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    # "bert_ce_model_dir"
    # mirrored_strategy = tf.distribute.MirroredStrategy()
    # config_tf.gpu_options.visible_device_list = str(hvd.local_rank())
    # checkpoint_path = os.path.join(bert_config.get(args.model_checkpoint_dir), str(hvd.rank()))
    run_config = tf.estimator.RunConfig(
        model_dir=model_base_dir,
        save_summary_steps=train_steps_nums + 10,
        save_checkpoints_steps=each_epoch_steps,
        session_config=config_tf,
        keep_checkpoint_max=1,
        # train_distribute=dist_strategy
    )
    bert_init_checkpoints = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_init_checkpoints"))
    model_fn = bert_binaryclassification_model_fn_builder(bert_config_file,
                                                          bert_init_checkpoints,
                                                          args)
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)
    if args.do_train:
        # Alternative input fns: data_loader.create_dataset, or
        # event_class_input_bert_fn with label_map_len=data_loader.labels_map_len.
        train_input_fn = lambda: event_binclass_input_bert_fn(
            train_data_list,
            token_type_ids=train_token_type_id_list,
            label_map_len=1,
            is_training=True,
            is_testing=False,
            args=args,
            input_Ys=train_label_list)
        eval_input_fn = lambda: event_binclass_input_bert_fn(
            dev_data_list,
            token_type_ids=dev_token_type_id_list,
            label_map_len=1,
            is_training=False,
            is_testing=False,
            args=args,
            input_Ys=dev_label_list)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps_nums)
        exporter = tf.estimator.BestExporter(
            exports_to_keep=1,
            serving_input_receiver_fn=bert_event_bin_serving_input_receiver_fn)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          throttle_secs=0,
                                          exporters=[exporter])
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        # "bert_ce_model_pb"
        estimator.export_saved_model(pb_model_dir,
                                     bert_event_bin_serving_input_receiver_fn)
def run_event_classification(args):
    """
    Event type classification: a multi-label binary classification task,
    borrowing the column-prediction approach from NL2SQL.
    :param args:
    :return:
    """
    model_base_dir = event_config.get(args.model_checkpoint_dir).format(args.fold_index)
    pb_model_dir = event_config.get(args.model_pb_dir).format(args.fold_index)
    # print(model_base_dir)
    # print(pb_model_dir)
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"),
                                   event_config.get("vocab_file"))
    bert_config_file = os.path.join(event_config.get("bert_pretrained_model_path"),
                                    event_config.get("bert_config_path"))
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    data_loader = EventTypeClassificationPrepare(vocab_file_path, 512, event_type_file)
    # Alternative: data_loader._read_json_file(train_file, eval_file, is_train=True)
    # instead of the cached .npy fold files below.
    train_data_list = np.load(
        "data/index_type_fold_data_{}/token_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    train_label_list = np.load(
        "data/index_type_fold_data_{}/labels_train.npy".format(args.fold_index),
        allow_pickle=True)
    train_token_type_id_list = np.load(
        "data/index_type_fold_data_{}/token_type_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    train_type_index_ids_list = np.load(
        "data/index_type_fold_data_{}/type_index_in_token_ids_train.npy".format(args.fold_index),
        allow_pickle=True)
    dev_data_list = np.load(
        "data/index_type_fold_data_{}/token_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    dev_label_list = np.load(
        "data/index_type_fold_data_{}/labels_dev.npy".format(args.fold_index),
        allow_pickle=True)
    dev_token_type_id_list = np.load(
        "data/index_type_fold_data_{}/token_type_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    dev_type_index_ids_list = np.load(
        "data/index_type_fold_data_{}/type_index_in_token_ids_dev.npy".format(args.fold_index),
        allow_pickle=True)
    # Inverse-frequency class weights over the 65 event types.
    train_labels = np.array(train_label_list)
    # print(train_labels.shape)
    a = np.sum(train_labels, axis=0)
    a = [max(a) / ele for ele in a]
    class_weight = np.array(a)
    class_weight = np.reshape(class_weight, (1, 65))
    # print(class_weight)
    # dev_datas, dev_token_type_ids, dev_labels = data_loader._read_json_file(eval_file)
    train_samples_nums = len(train_data_list)
    dev_samples_nums = len(dev_data_list)
    if train_samples_nums % args.train_batch_size != 0:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size) + 1
    else:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size)
    # each_epoch_steps = int(data_loader.train_samples_nums / args.train_batch_size) + 1
    logger.info('*****train_set sample nums:{}'.format(train_samples_nums))
    logger.info('*****train each epoch steps:{}'.format(each_epoch_steps))
    train_steps_nums = each_epoch_steps * args.epochs
    # train_steps_nums = each_epoch_steps * args.epochs // hvd.size()
    logger.info('*****train_total_steps:{}'.format(train_steps_nums))
    decay_steps = args.decay_epoch * each_epoch_steps
    logger.info('*****train decay steps:{}'.format(decay_steps))
    # dropout_prob is the dropout probability
    params = {
        "dropout_prob": args.dropout_prob,
        "num_labels": data_loader.labels_map_len,
        "rnn_size": args.rnn_units,
        "num_layers": args.num_layers,
        "hidden_units": args.hidden_units,
        "decay_steps": decay_steps,
        "class_weight": class_weight
    }
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.gpu_nums)
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(
        model_dir=model_base_dir,
        save_summary_steps=train_steps_nums + 10,
        save_checkpoints_steps=each_epoch_steps,
        session_config=config_tf,
        keep_checkpoint_max=1,
        # train_distribute=dist_strategy
    )
    bert_init_checkpoints = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_init_checkpoints"))
    model_fn = bert_classification_model_fn_builder(bert_config_file,
                                                    bert_init_checkpoints, args)
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)
    if args.do_train:
        train_input_fn = lambda: event_index_class_input_bert_fn(
            train_data_list,
            token_type_ids=train_token_type_id_list,
            type_index_ids_list=train_type_index_ids_list,
            label_map_len=data_loader.labels_map_len,
            is_training=True,
            is_testing=False,
            args=args,
            input_Ys=train_label_list)
        # Alternative: event_class_input_bert_fn without type_index_ids_list.
        eval_input_fn = lambda: event_index_class_input_bert_fn(
            dev_data_list,
            token_type_ids=dev_token_type_id_list,
            type_index_ids_list=dev_type_index_ids_list,
            label_map_len=data_loader.labels_map_len,
            is_training=False,
            is_testing=False,
            args=args,
            input_Ys=dev_label_list)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps_nums)
        exporter = tf.estimator.BestExporter(
            exports_to_keep=1,
            serving_input_receiver_fn=bert_event_type_serving_input_receiver_fn)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          throttle_secs=0,
                                          exporters=[exporter])
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        # "bert_ce_model_pb"
        estimator.export_saved_model(pb_model_dir,
                                     bert_event_type_serving_input_receiver_fn)
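# The class weights above follow an inverse-frequency scheme: each of the 65
# event types is weighted by max(count) / count, so rare types contribute more
# to the loss. A minimal sketch of that computation; the helper name is
# hypothetical and, like the original code, it assumes every type occurs at
# least once in the training labels.
def _inverse_freq_class_weights(multi_hot_labels):
    """multi_hot_labels: array-like of shape (num_samples, num_types) with 0/1 entries."""
    counts = np.sum(np.array(multi_hot_labels), axis=0)
    weights = np.max(counts) / counts
    return weights.reshape((1, -1))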
def parse_kfold_verify(args):
    """
    Retro-reader style inference: ensembles the external (EAV) and internal (IAV)
    answerable verifiers before decoding role spans, and writes the submission file.
    """
    if args.gpus is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    # test_file = "data/test1.json"
    # Path of the test dataset json file
    test_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_test"))
    # Path of the text multi-label classification saved model
    class_type_model_path = event_config.get(args.event_type_model_path)
    event_schema_file = os.path.join(event_config.get("data_dir"),
                                     event_config.get("event_schema"))
    event_schema_dict = parse_event_schema(event_schema_file)
    # multi-label event type classifier
    fp_type = fastPredictTypeClassification(class_type_model_path, event_config)
    # parse json file to get id and text
    id_list, text_list = fp_type.parse_test_json(test_file)
    kfold_type_result_list = []  # per-fold predictions over the 65 type probabilities
    event_type_result_list = []  # resulting event type names per sample
    for k in range(1):
        predict_fn = fp_type.load_models_kfold(class_type_model_path.format(k))
        cur_fold_event_type_probs = fp_type.predict_for_all_prob(predict_fn, text_list)
        kfold_type_result_list.append(cur_fold_event_type_probs)
    for i in range(len(text_list)):
        cur_sample_event_type_buffer = [ele[i] for ele in kfold_type_result_list]
        cur_sample_event_type_prob = np.array(cur_sample_event_type_buffer).reshape((-1, 65))
        avg_result = np.mean(cur_sample_event_type_prob, axis=0)
        event_label_ids = np.argwhere(avg_result > 0.5)
        event_cur_type_strs = [
            fp_type.data_loader.id2labels_map.get(ele[0]) for ele in event_label_ids
        ]
        event_type_result_list.append(event_cur_type_strs)
    # Paths of the answerable-verification models that predict whether a query is answerable.
    # First stage (sketchy reading): the external front verifier (EAV).
    external_av_model_path = "output/model/final_verify_cls_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    # verify_av_model_path_old = event_config.get(args.event_verfifyav_model_path)
    verify_av_model_path_old = "output/model/verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    # Second stage (intensive reading): the internal front verifier (IAV).
    internal_av_model_path = "output/model/final_verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    fp_cls_old = fastPredictCls(external_av_model_path, event_config,
                                "data/slot_pattern/slot_descrip_old")
    # fp_cls_new = fastPredictCls(cls_model_path, event_config, "data/slot_pattern/slot_descrip")
    fp_answerable_verifier = fastPredictCls(external_av_model_path, event_config,
                                            "data/slot_pattern/slot_descrip")
    kfold_eav_hasa_result = []
    kfold_start_result = []
    kfold_end_result = []
    kfold_hasa_result = []
    for k in range(1):
        # predict_fn_cls_new = fp_answerable_verifier.load_models_kfold(external_av_model_path.format(k))
        # sketchy-reading (first-pass) predict fn
        predict_fn_ex_av = fp_answerable_verifier.load_models_kfold(
            external_av_model_path.format(k))
        # predict_fn_av = fp_cls_new.load_models_kfold(verify_av_model_path_new.format(k))
        # intensive-reading (second-pass) predict fn
        predict_fn_in_av = fp_answerable_verifier.load_models_kfold(
            internal_av_model_path.format(k))
        cur_fold_eav_probs_result = {}
        cur_fold_av_start_probs_result = {}
        cur_fold_av_end_probs_result = {}
        cur_fold_av_has_answer_probs_result = {}
        for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                                   text_list):
            if event_type_res is None or len(event_type_res) == 0:
                # submit_result.append({"id": sample_id, "event_list": []})
                cur_fold_eav_probs_result.update({sample_id: []})
                continue
            for cur_event_type in event_type_res:
                cur_event_type = cur_event_type.strip()
                if cur_event_type is None or cur_event_type == "":
                    continue
                corresponding_role_type_list = event_schema_dict.get(cur_event_type)
                cur_event_type_answerable_probs_result = []
                cur_event_av_start_probs_result = []
                cur_event_av_end_probs_result = []
                cur_event_av_hasanswer_probs_result = []
                for cur_role_type in corresponding_role_type_list:
                    has_answer_probs = None
                    label_prob = None
                    start_probs = None
                    end_probs = None
                    cur_query_word = fp_answerable_verifier.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    query_token_ids, query_token_len, token_type_ids_, token_mapping_new = \
                        fp_answerable_verifier.data_loader.trans_single_data_for_test(
                            text, cur_query_word, 512)
                    #############################################################################
                    # External answerable verification: predict the answerable probability.
                    eav_probs = fp_answerable_verifier.predict_single_sample_prob(
                        predict_fn_ex_av, query_token_ids, query_token_len,
                        token_type_ids_)
                    #############################################################################
                    # Internal answerable verification: predict start/end labels and the
                    # answerable probability.
                    role_start_ids, role_end_ids, role_start_probs, role_end_probs, iav_probs = \
                        fp_cls_old.predict_single_sample_av_prob(
                            predict_fn_in_av, query_token_ids, query_token_len,
                            token_type_ids_)
                    cur_event_type_answerable_probs_result.append(eav_probs)
                    cur_event_av_hasanswer_probs_result.append(iav_probs)
                    cur_event_av_start_probs_result.append(role_start_probs)
                    cur_event_av_end_probs_result.append(role_end_probs)
                cur_fold_eav_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_type_answerable_probs_result
                })
                cur_fold_av_start_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_av_start_probs_result
                })
                cur_fold_av_end_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_av_end_probs_result
                })
                cur_fold_av_has_answer_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_av_hasanswer_probs_result
                })
        kfold_eav_hasa_result.append(cur_fold_eav_probs_result)
        kfold_start_result.append(cur_fold_av_start_probs_result)
        kfold_end_result.append(cur_fold_av_end_probs_result)
        kfold_hasa_result.append(cur_fold_av_has_answer_probs_result)
    submit_result = []
    for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                               text_list):
        event_list = []
        if event_type_res is None or len(event_type_res) == 0:
            submit_result.append({"id": sample_id, "event_list": []})
            continue
        for cur_event_type in event_type_res:
            cur_event_type = cur_event_type.strip()
            if cur_event_type is None or cur_event_type == "":
                continue
            corresponding_role_type_list = event_schema_dict.get(cur_event_type)
            find_key = sample_id + "-" + cur_event_type
            fold_cls_probs_cur_sample = [ele.get(find_key) for ele in kfold_eav_hasa_result]
            fold_start_probs_cur_sample = [ele.get(find_key) for ele in kfold_start_result]
            fold_end_probs_cur_sample = [ele.get(find_key) for ele in kfold_end_result]
            fold_has_probs_cur_sample = [ele.get(find_key) for ele in kfold_hasa_result]
            for index, cur_role_type in enumerate(corresponding_role_type_list):
                cur_eav_fold_probs = [probs[index] for probs in fold_cls_probs_cur_sample]
                cur_iav_hasa_fold_probs = [probs[index] for probs in fold_has_probs_cur_sample]
                cur_eav_fold_probs = np.array(cur_eav_fold_probs).reshape((-1, 1))
                cls_avg_result = np.mean(cur_eav_fold_probs, axis=0)
                cur_iav_hasa_fold_probs = np.array(cur_iav_hasa_fold_probs).reshape((-1, 1))
                has_avg_result = np.mean(cur_iav_hasa_fold_probs, axis=0)
                ######
                # Fuse the two verifiers: EAV * 0.5 + IAV * 0.5
                final_probs_hasa = 0.5 * cls_avg_result + 0.5 * has_avg_result
                if final_probs_hasa > 0.4:
                    cur_query_word = fp_answerable_verifier.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = \
                        fp_answerable_verifier.data_loader.trans_single_data_for_test(
                            text, cur_query_word, 512)
                    token_len = len(token_ids)
                    cur_start_fold_probs = [probs[index] for probs in fold_start_probs_cur_sample]
                    cur_end_fold_probs = [probs[index] for probs in fold_end_probs_cur_sample]
                    cur_start_fold_probs = np.array(cur_start_fold_probs).reshape((-1, token_len, 2))
                    cur_end_fold_probs = np.array(cur_end_fold_probs).reshape((-1, token_len, 2))
                    start_avg_result = np.mean(cur_start_fold_probs, axis=0)
                    end_avg_result = np.mean(cur_end_fold_probs, axis=0)
                    text_start_probs = start_avg_result[query_len:-1, 1]
                    text_end_probs = end_avg_result[query_len:-1, 1]
                    pos_start_probs = text_start_probs
                    pos_end_probs = text_end_probs
                    start_ids = (pos_start_probs > 0.4).astype(int)
                    end_ids = (pos_end_probs > 0.4).astype(int)
                    token_mapping = token_mapping[1:-1]
                    entity_list, span_start_end_tuple_list = \
                        fp_answerable_verifier.extract_entity_from_start_end_ids(
                            text=text, start_ids=start_ids, end_ids=end_ids,
                            token_mapping=token_mapping)
                    for entity in entity_list:
                        if len(entity) > 1:
                            event_list.append({
                                "event_type": cur_event_type,
                                "arguments": [{
                                    "role": cur_role_type,
                                    "argument": entity
                                }]
                            })
        submit_result.append({"id": sample_id, "event_list": event_list})
    with codecs.open(args.submit_result, 'w', 'utf-8') as fw:
        for dict_result in submit_result:
            write_str = json.dumps(dict_result, ensure_ascii=False)
            fw.write(write_str)
            fw.write("\n")
    print("finish")
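# A minimal, self-contained sketch of the fusion rule used above: the external
# verifier (EAV) and internal verifier (IAV) answerable probabilities are
# averaged with equal weights, and a role query is only decoded into spans when
# the fused score clears the 0.4 threshold. The 0.5/0.5 weights and the 0.4
# threshold come from parse_kfold_verify; the helper name itself is hypothetical
# and not part of the original code.
def _fused_answerable(eav_prob, iav_prob, weight=0.5, threshold=0.4):
    """Return (fused_score, is_answerable) for one role query."""
    fused = weight * eav_prob + (1.0 - weight) * iav_prob
    return fused, fused > threshold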
def parse_kfold_verfify(args):
    """
    6-fold inference on data/test2.json that ensembles the old and new query
    patterns and verifier models, averaging their answerable and span
    probabilities before decoding role arguments.
    """
    # test_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_test"))
    test_file = "data/test2.json"
    class_type_model_path = event_config.get(args.event_type_model_path)
    event_schema_file = os.path.join(event_config.get("data_dir"),
                                     event_config.get("event_schema"))
    event_schema_dict = parse_event_schema(event_schema_file)
    fp_type = fastPredictTypeClassification(class_type_model_path, event_config)
    id_list, text_list = fp_type.parse_test_json(test_file)
    kfold_type_result_list = []
    event_type_result_list = []
    # The 6-fold event type prediction below is commented out; its averaged
    # results (threshold 0.5 over 65 types) were dumped once to
    # test2_kfold_new_final_event_type.txt and are re-read from that file instead.
    # for k in range(6):
    #     predict_fn = fp_type.load_models_kfold(class_type_model_path.format(k))
    #     cur_fold_event_type_probs = fp_type.predict_for_all_prob(predict_fn, text_list)
    #     kfold_type_result_list.append(cur_fold_event_type_probs)
    # for i in range(len(text_list)):
    #     cur_sample_event_type_buffer = [ele[i] for ele in kfold_type_result_list]
    #     cur_sample_event_type_prob = np.array(cur_sample_event_type_buffer).reshape((6, 65))
    #     avg_result = np.mean(cur_sample_event_type_prob, axis=0)
    #     event_label_ids = np.argwhere(avg_result > 0.5)
    #     event_cur_type_strs = [fp_type.data_loader.id2labels_map.get(ele[0])
    #                            for ele in event_label_ids]
    #     event_type_result_list.append(event_cur_type_strs)
    # with codecs.open("test2_kfold_new_final_event_type.txt", 'w', 'utf-8') as fw:
    #     for event_type_result in event_type_result_list:
    #         write_line = ",".join(event_type_result)
    #         fw.write(write_line)
    #         fw.write("\n")
    event_type_result_list = []
    with codecs.open("test2_kfold_new_final_event_type.txt", 'r', 'utf-8') as fr:
        for line in fr:
            line = line.strip("\n")
            event_list_cur = line.split(",")
            event_type_result_list.append(event_list_cur)
    # cls_model_path = event_config.get(args.event_cls_model_path)
    cls_model_path = "output/model/verify_cls_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    cls_model_path_new = "output/model/final_verify_cls_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    # verify_av_model_path_old = event_config.get(args.event_verfifyav_model_path)
    verify_av_model_path_old = "output/model/verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    verify_av_model_path_new = "output/model/final_verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    fp_cls_old = fastPredictCls(cls_model_path, event_config,
                                "data/slot_pattern/slot_descrip_old")
    fp_cls_new = fastPredictCls(cls_model_path, event_config,
                                "data/slot_pattern/slot_descrip")
    kfold_cls_result = []
    kfold_start_result = []
    kfold_end_result = []
    kfold_hasa_result = []
    for k in range(6):
        predict_fn = fp_cls_old.load_models_kfold(cls_model_path.format(k))
        predict_fn_cls_new = fp_cls_new.load_models_kfold(cls_model_path_new.format(k))
        predict_fn_av = fp_cls_new.load_models_kfold(verify_av_model_path_new.format(k))
        predict_fn_av_old = fp_cls_old.load_models_kfold(verify_av_model_path_old.format(k))
        cur_fold_cls_probs_result = {}
        cur_fold_av_start_probs_result = {}
        cur_fold_av_end_probs_result = {}
        cur_fold_av_has_answer_probs_result = {}
        for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                                   text_list):
            if event_type_res is None or len(event_type_res) == 0:
                # submit_result.append({"id": sample_id, "event_list": []})
                cur_fold_cls_probs_result.update({sample_id: []})
                continue
            for cur_event_type in event_type_res:
                cur_event_type = cur_event_type.strip()
                if cur_event_type is None or cur_event_type == "":
                    continue
                corresponding_role_type_list = event_schema_dict.get(cur_event_type)
                cur_event_type_cls_probs_result = []
                cur_event_av_start_probs_result = []
                cur_event_av_end_probs_result = []
                cur_event_av_hasanswer_probs_result = []
                for cur_role_type in corresponding_role_type_list:
                    # old query pattern / old models
                    cur_query_word_old = fp_cls_old.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = \
                        fp_cls_old.data_loader.trans_single_data_for_test(
                            text, cur_query_word_old, 512)
                    label_prob = fp_cls_old.predict_single_sample_prob(
                        predict_fn, token_ids, query_len, token_type_ids)
                    start_ids, end_ids, start_probs, end_probs, has_answer_probs = \
                        fp_cls_old.predict_single_sample_av_prob(
                            predict_fn_av_old, token_ids, query_len, token_type_ids)
                    # cur_event_av_start_probs_result.append(start_probs)
                    # cur_event_av_end_probs_result.append(end_probs)
                    # new query pattern / new models
                    cur_query_word_new = fp_cls_new.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids_new, query_len_new, token_type_ids_new, token_mapping_new = \
                        fp_cls_new.data_loader.trans_single_data_for_test(
                            text, cur_query_word_new, 512)
                    label_prob_new = fp_cls_new.predict_single_sample_prob(
                        predict_fn_cls_new, token_ids_new, query_len_new,
                        token_type_ids_new)
                    start_ids_new, end_ids_new, start_probs_new, end_probs_new, has_answer_probs_new = \
                        fp_cls_old.predict_single_sample_av_prob(
                            predict_fn_av, token_ids_new, query_len_new,
                            token_type_ids_new)
                    cur_event_av_hasanswer_probs_result.append(
                        (has_answer_probs, has_answer_probs_new))
                    cur_event_type_cls_probs_result.append(
                        (label_prob, label_prob_new))
                    cur_event_av_start_probs_result.append(
                        (start_probs, start_probs_new))
                    cur_event_av_end_probs_result.append(
                        (end_probs, end_probs_new))
                cur_fold_cls_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_type_cls_probs_result
                })
                cur_fold_av_start_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_av_start_probs_result
                })
                cur_fold_av_end_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_av_end_probs_result
                })
                cur_fold_av_has_answer_probs_result.update({
                    sample_id + "-" + cur_event_type: cur_event_av_hasanswer_probs_result
                })
        kfold_cls_result.append(cur_fold_cls_probs_result)
        kfold_start_result.append(cur_fold_av_start_probs_result)
        kfold_end_result.append(cur_fold_av_end_probs_result)
        kfold_hasa_result.append(cur_fold_av_has_answer_probs_result)
    submit_result = []
    for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                               text_list):
        event_list = []
        if event_type_res is None or len(event_type_res) == 0:
            submit_result.append({"id": sample_id, "event_list": []})
            continue
        for cur_event_type in event_type_res:
            cur_event_type = cur_event_type.strip()
            if cur_event_type is None or cur_event_type == "":
                continue
            corresponding_role_type_list = event_schema_dict.get(cur_event_type)
            find_key = sample_id + "-" + cur_event_type
            fold_cls_probs_cur_sample = [ele.get(find_key) for ele in kfold_cls_result]
            fold_start_probs_cur_sample = [ele.get(find_key) for ele in kfold_start_result]
            fold_end_probs_cur_sample = [ele.get(find_key) for ele in kfold_end_result]
            fold_has_probs_cur_sample = [ele.get(find_key) for ele in kfold_hasa_result]
            for index, cur_role_type in enumerate(corresponding_role_type_list):
                cur_cls_fold_probs = [probs[index] for probs in fold_cls_probs_cur_sample]
                cur_cls_fold_probs_old = []
                cur_cls_fold_probs_new = []
                cur_hasa_fold_probs = [probs[index] for probs in fold_has_probs_cur_sample]
                cur_hasa_fold_probs_old = []
                cur_hasa_fold_probs_new = []
                for k in range(len(cur_cls_fold_probs)):
                    cur_cls_fold_probs_old.append(cur_cls_fold_probs[k][0])
                    cur_cls_fold_probs_new.append(cur_cls_fold_probs[k][1])
                    cur_hasa_fold_probs_old.append(cur_hasa_fold_probs[k][0])
                    cur_hasa_fold_probs_new.append(cur_hasa_fold_probs[k][1])
                cur_cls_fold_probs_old = np.array(cur_cls_fold_probs_old).reshape((6, 1))
                cls_avg_result_old = np.mean(cur_cls_fold_probs_old, axis=0)
                cur_cls_fold_probs_new = np.array(cur_cls_fold_probs_new).reshape((6, 1))
                cls_avg_result_new = np.mean(cur_cls_fold_probs_new, axis=0)
                cur_hasa_fold_probs_old = np.array(cur_hasa_fold_probs_old).reshape((6, 1))
                has_avg_result_old = np.mean(cur_hasa_fold_probs_old, axis=0)
                cur_hasa_fold_probs_new = np.array(cur_hasa_fold_probs_new).reshape((6, 1))
                has_avg_result_new = np.mean(cur_hasa_fold_probs_new, axis=0)
                # cur_hasa_fold_probs = np.array(cur_hasa_fold_probs).reshape((6, 1))
                # has_avg_result = np.mean(cur_hasa_fold_probs, axis=0)
                final_probs_hasa = 0.5 * (cls_avg_result_old + cls_avg_result_new) / 2 + \
                    0.5 * (has_avg_result_old + has_avg_result_new) / 2
                if final_probs_hasa > 0.4:
                    cur_query_word = fp_cls_new.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = \
                        fp_cls_new.data_loader.trans_single_data_for_test(
                            text, cur_query_word, 512)
                    cur_query_word_old = fp_cls_old.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids_old, query_len_old, token_type_ids_old, token_mapping_old = \
                        fp_cls_old.data_loader.trans_single_data_for_test(
                            text, cur_query_word_old, 512)
                    token_len = len(token_ids)
                    token_len_old = len(token_ids_old)
                    cur_start_fold_probs = [probs[index] for probs in fold_start_probs_cur_sample]
                    cur_end_fold_probs = [probs[index] for probs in fold_end_probs_cur_sample]
                    cur_start_fold_probs_old = []
                    cur_start_fold_probs_new = []
                    cur_end_fold_probs_old = []
                    cur_end_fold_probs_new = []
                    for k in range(len(cur_start_fold_probs)):
                        cur_start_fold_probs_old.append(cur_start_fold_probs[k][0])
                        cur_start_fold_probs_new.append(cur_start_fold_probs[k][1])
                        cur_end_fold_probs_old.append(cur_end_fold_probs[k][0])
                        cur_end_fold_probs_new.append(cur_end_fold_probs[k][1])
                    cur_start_fold_probs_old = np.array(cur_start_fold_probs_old).reshape((6, token_len_old, 2))
                    cur_end_fold_probs_old = np.array(cur_end_fold_probs_old).reshape((6, token_len_old, 2))
                    start_avg_result_old = np.mean(cur_start_fold_probs_old, axis=0)
                    end_avg_result_old = np.mean(cur_end_fold_probs_old, axis=0)
                    pos_start_probs_old = start_avg_result_old[:, 1]
                    pos_end_probs_old = end_avg_result_old[:, 1]
                    text_start_probs_old = pos_start_probs_old[query_len_old:-1]
                    text_end_probs_old = pos_end_probs_old[query_len_old:-1]
                    cur_start_fold_probs_new = np.array(cur_start_fold_probs_new).reshape((6, token_len, 2))
                    cur_end_fold_probs_new = np.array(cur_end_fold_probs_new).reshape((6, token_len, 2))
                    start_avg_result_new = np.mean(cur_start_fold_probs_new, axis=0)
                    end_avg_result_new = np.mean(cur_end_fold_probs_new, axis=0)
                    pos_start_probs_new = start_avg_result_new[:, 1]
                    pos_end_probs_new = end_avg_result_new[:, 1]
                    text_start_probs_new = pos_start_probs_new[query_len:-1]
                    text_end_probs_new = pos_end_probs_new[query_len:-1]
                    pos_start_probs = (text_start_probs_old + text_start_probs_new) / 2
                    pos_end_probs = (text_end_probs_old + text_end_probs_new) / 2
                    start_ids = (pos_start_probs > 0.4).astype(int)
                    # end_ids = np.argmax(end_avg_result, axis=-1)
                    end_ids = (pos_end_probs > 0.4).astype(int)
                    token_mapping = token_mapping[1:-1]
                    # start_ids = start_ids[query_len:-1]
                    # end_ids = end_ids[query_len:-1]
                    entity_list, span_start_end_tuple_list = \
                        fp_cls_old.extract_entity_from_start_end_ids(
                            text=text, start_ids=start_ids, end_ids=end_ids,
                            token_mapping=token_mapping)
                    # if len(entity_list) == 0:
                    #     score_has_answer = 0.0
                    # else:
                    #     span_score = [text_start_probs[ele[0]] + text_end_probs[ele[1]]
                    #                   for ele in span_start_end_tuple_list]
                    #     score_has_answer = max(span_score)
                    # score_no_answer = 0.5 * (max(pos_start_probs[0:query_len]) +
                    #                          max(pos_end_probs[0:query_len])) + 0.5 * final_probs_hasa
                    # diff_score = score_has_answer - score_no_answer
                    for entity in entity_list:
                        if len(entity) > 1:
                            event_list.append({
                                "event_type": cur_event_type,
                                "arguments": [{
                                    "role": cur_role_type,
                                    "argument": entity
                                }]
                            })
        submit_result.append({"id": sample_id, "event_list": event_list})
    # Alternative decoding strategies (not used here): averaging per-fold 3-class
    # tag probabilities and decoding with extract_entity_span_from_muliclass, or
    # thresholding a single fp_role_mrc model's start probabilities at 0.4 before
    # fp_role_mrc.extract_entity_from_start_end_ids.
    with codecs.open(args.submit_result, 'w', 'utf-8') as fw:
        for dict_result in submit_result:
            write_str = json.dumps(dict_result, ensure_ascii=False)
            fw.write(write_str)
            fw.write("\n")
def parse_kfold(args):
    test_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_test"))
    class_type_model_path = event_config.get(args.event_type_model_path)
    event_schema_file = os.path.join(event_config.get("data_dir"),
                                     event_config.get("event_schema"))
    event_schema_dict = parse_event_schema(event_schema_file)
    fp_type = fastPredictTypeClassification(class_type_model_path, event_config)
    id_list, text_list = fp_type.parse_test_json(test_file)
    kfold_type_result_list = []
    event_type_result_list = []
    for k in range(6):
        predict_fn = fp_type.load_models_kfold(class_type_model_path.format(k))
        cur_fold_event_type_probs = fp_type.predict_for_all_prob(predict_fn, text_list)
        kfold_type_result_list.append(cur_fold_event_type_probs)
    for i in range(len(text_list)):
        cur_sample_event_type_buffer = [ele[i] for ele in kfold_type_result_list]
        cur_sample_event_type_prob = np.array(cur_sample_event_type_buffer).reshape((6, 65))
        avg_result = np.mean(cur_sample_event_type_prob, axis=0)
        event_label_ids = np.argwhere(avg_result > 0.45)
        event_cur_type_strs = [
            fp_type.data_loader.id2labels_map.get(ele[0]) for ele in event_label_ids
        ]
        event_type_result_list.append(event_cur_type_strs)
    # event_type_result_list = fp_type.predict_for_all((text_list))
    # event_type_result_list = []
    # with codecs.open("new_final_event_type.txt", 'r', 'utf-8') as fr:
    #     for line in fr:
    #         line = line.strip("\n")
    #         event_list_cur = line.split(",")
    #         event_type_result_list.append(event_list_cur)
    role_model_path = event_config.get(args.model_role_pb_dir)
    role_model_path_use_best = "output/model/re_lr_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/checkpoint/export/best_exporter"
    fp_role_mrc = fastPredictMRC(role_model_path, event_config, "role")
    id_list, text_list = fp_role_mrc.parse_test_json(test_file)
    submit_result = []
    # index = 0
    kfold_result = []
    for k in range(1):
        # if k in [0, 3, 5]:
        predict_fn = fp_role_mrc.load_models(role_model_path.format(k))
        # else:
        #     predict_fn = fp_role_mrc.load_models(role_model_path_use_best.format(k))
        cur_fold_probs_result = {}
        for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                                   text_list):
            if event_type_res is None or len(event_type_res) == 0:
                # submit_result.append({"id": sample_id, "event_list": []})
                cur_fold_probs_result.update({sample_id: []})
                continue
            for cur_event_type in event_type_res:
                cur_event_type = cur_event_type.strip()
                if cur_event_type is None or cur_event_type == "":
                    continue
                corresponding_role_type_list = event_schema_dict.get(cur_event_type)
                event_type_probs_result = []
                for cur_role_type in corresponding_role_type_list:
                    cur_query_word = fp_role_mrc.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = \
                        fp_role_mrc.data_loader.trans_single_data_for_test(
                            text, cur_query_word, 512)
                    pred_ids, pred_probs = fp_role_mrc.predict_single_sample(
                        predict_fn, token_ids, query_len, token_type_ids)
                    event_type_probs_result.append(pred_probs)
                cur_fold_probs_result.update({
                    sample_id + "-" + cur_event_type: event_type_probs_result
                })
        kfold_result.append(cur_fold_probs_result)
    for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                               text_list):
        event_list = []
        if event_type_res is None or len(event_type_res) == 0:
            submit_result.append({"id": sample_id, "event_list": []})
            continue
        for cur_event_type in event_type_res:
            cur_event_type = cur_event_type.strip()
            if cur_event_type is None or cur_event_type == "":
                continue
            corresponding_role_type_list = event_schema_dict.get(cur_event_type)
            find_key = sample_id + "-" + cur_event_type
            fold_probs_cur_sample = [ele.get(find_key) for ele in kfold_result]
            for index, cur_role_type in enumerate(corresponding_role_type_list):
                cur_query_word = fp_role_mrc.data_loader.gen_query_for_each_sample(
                    cur_event_type, cur_role_type)
                token_ids, query_len, token_type_ids, token_mapping = \
                    fp_role_mrc.data_loader.trans_single_data_for_test(
                        text, cur_query_word, 512)
                cur_role_fold_probs = [probs[index] for probs in fold_probs_cur_sample]
                # cur_role_fold_probs_array = np.vstack(cur_role_fold_probs)
                token_len = len(token_ids)
                cur_role_fold_probs_array = np.array(cur_role_fold_probs).reshape((1, token_len, 3))
                avg_result = np.mean(cur_role_fold_probs_array, axis=0)
                pred_ids = np.argmax(avg_result, axis=-1)
                token_mapping = token_mapping[1:-1]
                pred_ids = pred_ids[query_len:-1]
                entity_list = extract_entity_span_from_muliclass(text, pred_ids,
                                                                 token_mapping)
                for entity in entity_list:
                    event_list.append({
                        "event_type": cur_event_type,
                        "arguments": [{
                            "role": cur_role_type,
                            "argument": entity
                        }]
                    })
        submit_result.append({"id": sample_id, "event_list": event_list})
    with codecs.open(args.submit_result, 'w', 'utf-8') as fw:
        for dict_result in submit_result:
            write_str = json.dumps(dict_result, ensure_ascii=False)
            fw.write(write_str)
            fw.write("\n")
def gen_role_class_data():
    """
    generate role mrc data for verify_neg_fold_data_{}
    """
    # bert vocab file path
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"),
                                   event_config.get("vocab_file"))
    # event role slot list file path
    # slot_pattern/vocab_all_slot_label_noBI_map.txt
    slot_file = os.path.join(event_config.get("slot_list_root_path"),
                             event_config.get("bert_slot_complete_file_name_role"))
    # schema file path
    schema_file = os.path.join(event_config.get("data_dir"),
                               event_config.get("event_schema"))
    # query map file path
    # data/slot_descrip
    query_file = os.path.join(event_config.get("slot_list_root_path"),
                              event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file,
                                      schema_file, query_file)
    train_file = os.path.join(event_config.get("data_dir"),
                              event_config.get("event_data_file_train"))
    eval_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_eval"))
    data_loader.k_fold_split_data(train_file, eval_file, True)