def train(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the previous run's output when retraining; no clean-up is done when predicting.
    # args.clean and args.do_train default to True, so a second training run
    # removes the files produced by the first one.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    # check that the output dir exists (defaults to root_path + 'output')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)

    # build the tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(
        # number of physical CPUs; limits CPU usage to this many cores
        # (logical CPUs = cores per CPU * hyperthreads per core * physical CPUs)
        device_count={"CPU": 2},
        log_device_placement=False,  # True logs device placement, False does not
        inter_op_parallelism_threads=6,
        intra_op_parallelism_threads=6,
        allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=500,      # save a TensorBoard summary every 500 steps
        save_checkpoints_steps=500,  # save a checkpoint every 500 steps
        session_config=session_config)

    eval_examples = None
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train:
        # load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    # tf.logging.info("get labels")
    label_list = processor.get_labels()

    # model_fn_builder returns a function that defines the model plus its training and
    # evaluation behavior, and uses the init_checkpoint hook to initialize this model's
    # parameters from the pretrained BERT weights.
    # This is the newer TF pattern: define the model in a model_fn, then let the
    # Estimator API drive training, prediction and evaluation.
    tf.logging.info('def model_fn_builder')
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}
    tf.logging.info('def estimator')
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)

    # training/evaluation run when the corresponding flags are set
    if args.do_train:
        # 1. convert the data to tf_record format
        tf.logging.info('convert data into train tf_record')
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples, label_list,
                                                     args.max_seq_length, tokenizer,
                                                     train_file, args.output_dir)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        # 2. read the record data and assemble batches
        tf.logging.info('read train record and convert to batch')
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)

        # train and eval together
        tf.logging.info('call early stopping hook')
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        # estimator.train(input_fn=lambda: my_input_fn(TRAIN_DATA), steps=300)
        # # validate after training, feeding in our test data
        # test_result = estimator.evaluate(input_fn=lambda: my_input_fn(TEST_DATA))
        # # print the evaluation result

        t0 = time.time()
        tf.logging.info('start time and train')
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=[early_stopping_hook])  # commented out by default
        tt1 = time.time()
        tf.logging.info('train spent time:{}s'.format(tt1 - t0))

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        # print dev-set information
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

        tf.logging.info('convert data to eval tf_record')
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer,
                                                     eval_file, args.output_dir)
        tf.logging.info('read eval record, convert to batch')
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # added by me
        t1 = time.time()
        eval_loss = estimator.evaluate(input_fn=eval_input_fn)
        t2 = time.time()
        tf.logging.info('eval_loss=\n{}'.format(eval_loss))
        tf.logging.info('eval spent time:{}s'.format(t2 - t1))

        # tf.logging.info('call train_spec')
        # train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps,
        #                                     hooks=[early_stopping_hook])
        # tf.logging.info('call eval_spec')
        # eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        # tf.logging.info('call tf.estimator.train_and_evaluate')
        # works both single-machine and distributed
        # tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # do_predict defaults to True
    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 args.max_seq_length, tokenizer,
                                                 predict_file, args.output_dir,
                                                 mode="test")

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        tf.logging.info('call predict_input_fn')
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        # prediction also runs in batches
        tf.logging.info('start predict, estimator.predict')
        result = estimator.predict(input_fn=predict_input_fn)
        # the predictions are written here: three columns per line -
        # token, gold label, predicted label
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info('predict_line.text:\n{}'.format(predict_line.text))
                    tf.logging.info('predict_line.label:\n{}'.format(predict_line.label))
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info('e:\n{}'.format(e))
                        tf.logging.info('predict_line.text:\n{}'.format(predict_line.text))
                        tf.logging.info('predict_line.label:\n{}'.format(predict_line.label))
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        # open the file and write the results
        tf.logging.info('save predicted result: label_test.txt')
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        tf.logging.info('import conlleval to eval and get eval_result')
        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        tf.logging.info('eval_result:\n{}'.format(''.join(eval_result)))

        # write the results to a file
        tf.logging.info('save predict_score.txt')
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'), 'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

    # filter model
    if args.filter_adam_var:
        adam_filter(args.output_dir)
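# `adam_filter` is imported from elsewhere in this repo and not shown in this
# section. The sketch below is a plausible reading of what it does, assuming
# its purpose is the usual one for this helper: strip the Adam slot variables
# (the adam_m / adam_v moment accumulators, which roughly triple checkpoint
# size and are useless for inference) from the latest checkpoint. The name and
# details are illustrative, not the repo's actual code.
def adam_filter_sketch(model_path):
    last_name = tf.train.latest_checkpoint(model_path)
    with tf.Session(graph=tf.Graph()) as sess:
        imported_meta = tf.train.import_meta_graph(last_name + '.meta')
        imported_meta.restore(sess, last_name)
        # keep every variable except the Adam moment accumulators
        need_vars = [var for var in tf.global_variables()
                     if 'adam_v' not in var.name and 'adam_m' not in var.name]
        saver = tf.train.Saver(need_vars)
        saver.save(sess, os.path.join(model_path, 'model.ckpt'))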
def train(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the previous run's output when retraining; no clean-up when predicting.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    # check output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(model_dir=args.output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)
        # print dev-set information
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.batch_size)

    label_list = processor.get_labels()
    # model_fn_builder returns a function that defines the model plus its training and
    # evaluation behavior, and initializes this model's parameters from the pretrained
    # BERT weights via init_checkpoint.
    # The newer TF pattern: define the model in a model_fn, then let the Estimator API
    # drive training, prediction and evaluation.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)

    if args.do_train and args.do_eval:
        # 1. convert the data to tf_record format
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples, label_list,
                                                     args.max_seq_length, tokenizer,
                                                     train_file, args.output_dir)
        # 2. read the record data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer,
                                                     eval_file, args.output_dir)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        # early stop hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 args.max_seq_length, tokenizer,
                                                 predict_file, args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))
        # write the results to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'), 'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

    # filter model
    if args.filter_adam_var:
        adam_filter(args.output_dir)
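# Minimal usage sketch for the `train` variants above, assuming the repo's
# `get_args_parser` helper (imported by `zjb_eval` further down) supplies the
# flags `train` reads: do_train / do_eval / do_predict, data_dir, output_dir,
# vocab_file, bert_config_file, init_checkpoint, batch_size, and so on.
def run_train_sketch():
    from bert_base.train.train_helper import get_args_parser
    args = get_args_parser()
    train(args)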
def train(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the previous run's output when retraining; no clean-up when predicting.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    # check output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    if not os.path.exists(os.path.join(args.output_dir, 'eval')):
        os.mkdir(os.path.join(args.output_dir, 'eval'))

    processor = processors[args.ner](args.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,  # model save path
        keep_checkpoint_max=10,  # maximum number of checkpoints to keep
        save_summary_steps=args.save_summary_steps,  # save a summary every this many steps
        save_checkpoints_steps=args.save_checkpoints_steps,  # save a checkpoint every this many steps
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # load the training data
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d" % len(train_examples))
        logger.info("  Batch size = %d" % args.batch_size)
        logger.info("  Num steps = %d" % num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)
        # print dev-set information
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d" % len(eval_examples))
        logger.info("  Batch size = %d" % args.batch_size)

    if not os.path.exists(os.path.join(args.output_dir, 'label_list.pkl')):
        label_list = processor.get_labels(labels=args.label_list)
    else:
        # pickle files must be opened in binary mode
        with open(os.path.join(args.output_dir, 'label_list.pkl'), 'rb') as p:
            label_list = pickle.load(p)

    # model_fn_builder returns a function that defines the model plus its training and
    # evaluation behavior, and initializes this model's parameters from the pretrained
    # BERT weights via init_checkpoint.
    # The newer TF pattern: define the model in a model_fn, then let the Estimator API
    # drive training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,  # label_list has no pad:0, so add 1
        init_checkpoint=args.init_checkpoint,
        learning_rate=args.learning_rate,
        num_train_steps=num_train_steps,
        # warmup steps: the learning rate stays small at first; while
        # global_step < num_warmup_steps,
        # learning_rate = global_step / num_warmup_steps * init_learning_rate
        num_warmup_steps=num_warmup_steps,
        args=args)

    params = {'batch_size': args.batch_size}
    estimator = tf.estimator.Estimator(
        model_fn,  # the model_fn covers three modes: train, eval and predict
        # the save path can be set here or in config; setting either is enough,
        # and if both are set they must match
        model_dir=args.output_dir,
        params=params,
        config=run_config)

    if args.do_train and args.do_eval:
        # ckpt_file = tf.train.latest_checkpoint(args.output_dir)
        # print('loading model {} to train'.format(ckpt_file))

        # 1. convert the data to tf_record format
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples, label_list,
                                                     args.max_seq_length, tokenizer,
                                                     train_file, args.output_dir)
        # 2. read the record data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer,
                                                     eval_file, args.output_dir)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together
        # early stop hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_increase_hook(
            estimator=estimator,
            metric_name='f1',
            max_steps_without_increase=args.max_steps_without_decrease,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        # steps: number of eval iterations; if None, evaluate on the whole dataset.
        # Evaluation runs once per saved checkpoint, at least 120 seconds apart.
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          steps=None,
                                          throttle_secs=120)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        print('id2label:{}'.format(id2label))

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 args.max_seq_length, tokenizer,
                                                 predict_file, args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d" % len(predict_examples))
        logger.info("  Batch size = %d" % args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq + 2:  # skip predictions for padding
                        break
                    if idx == 0:  # skip the prediction for [CLS]
                        idx += 1
                        continue
                    if idx == len_seq + 1:  # skip the prediction for [SEP]
                        idx += 1
                        continue
                    curr_labels = id2label[id]
                    # any label predicted as [CLS] or [SEP] is forced to O
                    if curr_labels in ['[CLS]', '[SEP]']:
                        # if idx == 1:
                        #     if id2label[prediction[idx + 1]][0] in ['B', 'O']:
                        curr_labels = 'O'
                        #     else:
                        #         curr_labels = 'B' + id2label[prediction[idx + 1]][1:]
                        # else:
                        #     if id2label[prediction[idx - 1]] == 'O':
                        #         curr_labels = 'O'
                        #     else:
                        #         curr_labels = 'I' + id2label[prediction[idx - 1]][1:]
                    try:
                        line += line_token[idx - 1] + ' ' + label_token[idx - 1] + \
                                ' ' + iobes_iob([curr_labels])[0] + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))

        # the following block can be commented out if not targeting the daGuan competition
        tmp_file = open('dg_result.txt', 'w', encoding='utf-8')
        with open(output_predict_file) as f:
            lines = f.readlines()
            for line in lines:
                if line == '\n':
                    tmp_file.write('\n')
                    continue
                lis_line = line.strip().split()
                tmp_file.write(lis_line[0] + '\t' + lis_line[-1] + '\n')
        tmp_file.close()
        tf_metrics.recover_reduce_sentence_length('dg_result.txt',
                                                  'dg_NERdata/test_raw.txt',
                                                  'dg_rc_result.txt')
        tf_metrics.BIO2line_file('dg_rc_result.txt', 'dg_NERdata/result_file.txt')
        f1score = tf_metrics.get_f1score(result_file='dg_NERdata/result_file.txt',
                                         target_file='dg_NERdata/train_v_8.txt')
        print('dg_f1score: {}'.format(f1score))
        # end of the daGuan-specific block

        # write the results to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'), 'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
            # can be commented out if not targeting the daGuan competition
            fd.write('dg_f1score: {}\n'.format(f1score))
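# `iobes_iob` is imported from elsewhere; this `train` variant uses it to
# convert the model's IOBES tags back to IOB before scoring with conlleval.
# Below is a sketch of the standard conversion it presumably performs
# (S-X -> B-X, E-X -> I-X, everything else unchanged); the exact behavior of
# the repo's function is an assumption.
def iobes_iob_sketch(tags):
    new_tags = []
    for tag in tags:
        prefix = tag.split('-')[0]
        if prefix == 'S':
            new_tags.append(tag.replace('S-', 'B-'))  # single-token entity
        elif prefix == 'E':
            new_tags.append(tag.replace('E-', 'I-'))  # last token of an entity
        elif prefix in ('B', 'I', 'O'):
            new_tags.append(tag)  # already valid IOB
        else:
            raise ValueError('Invalid IOBES tag: {}'.format(tag))
    return new_tags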
def zjb_eval(output_dir, data_dir, max_seq_length, vocab_file, batch_size,
             learning_rate, init_checkpoint, bert_config_file, num_train_epochs):
    tf.logging.set_verbosity(tf.logging.INFO)
    from bert_base.train.train_helper import get_args_parser
    args = get_args_parser()

    token_path = os.path.join(output_dir, "token_test.txt")
    if os.path.exists(token_path):
        os.remove(token_path)

    with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.do_lower_case)
    processors = {"ner": NerProcessor}
    processor = processors[args.ner](output_dir)
    label_list = processor.get_labels()

    predict_examples = processor.get_test_examples(data_dir)
    predict_file = os.path.join(output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples, label_list,
                                             max_seq_length, tokenizer,
                                             predict_file, output_dir,
                                             mode="test")

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", batch_size)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    # session_config.gpu_options.per_process_gpu_memory_fraction = 0.8  # use 80% of GPU memory

    run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    train_examples = processor.get_train_examples(data_dir)
    num_train_steps = int(len(train_examples) * 1.0 / batch_size * num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is too small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)

    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(output_dir, "label_test.txt")

    def result_to_pair(writer):
        for predict_line, prediction in zip(predict_examples, result):
            idx = 0
            line = ''
            line_token = str(predict_line.text).split(' ')
            label_token = str(predict_line.label).split(' ')
            len_seq = len(label_token)
            if len(line_token) != len(label_token):
                tf.logging.info(predict_line.text)
                tf.logging.info(predict_line.label)
                break
            for id in prediction:
                if idx >= len_seq:
                    break
                if id == 0:
                    continue
                curr_labels = id2label[id]
                if curr_labels in ['[CLS]', '[SEP]']:
                    continue
                try:
                    line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                except Exception as e:
                    tf.logging.info(e)
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    line = ''
                    break
                idx += 1
            writer.write(line + '\n')

    with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
        result_to_pair(writer)

    from bert_base.train import conlleval
    eval_result = conlleval.return_report(output_predict_file)
    print(''.join(eval_result))
    # write the results to a file
    with codecs.open(os.path.join(output_dir, 'predict_score.txt'), 'a',
                     encoding='utf-8') as fd:
        fd.write(''.join(eval_result))

    # filter model
    if args.filter_adam_var:
        adam_filter(output_dir)
def main(_):
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.device_map
    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only delete the previous run's output when retraining; no clean-up when predicting.
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print("please remove the files of output dir and data.conf")
                exit(-1)

    # check output dir exists
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)

    history_max_steps = load_global_step_from_checkpoint_dir(FLAGS.output_dir)
    processor = processors[FLAGS.task_name](FLAGS.data_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    session_config = tf.compat.v1.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True,
    )

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_checkpoints_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config,
    )

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train and FLAGS.do_eval:
        # load the training data
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = history_max_steps + int(
            len(train_examples) * 1.0 / FLAGS.batch_size * FLAGS.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError("training data is too small...")
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        # print dev-set information
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)

    label_list = processor.get_labels()
    # model_fn_builder returns a function that defines the model plus its training and
    # evaluation behavior, and initializes this model's parameters from the pretrained
    # BERT weights via init_checkpoint.
    # The newer TF pattern: define the model in a model_fn, then let the Estimator API
    # drive training, prediction and evaluation.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        args=FLAGS,
    )

    params = {"batch_size": FLAGS.batch_size}
    estimator = tf.estimator.Estimator(
        model_fn,
        params=params,
        config=run_config,
        # warm_start_from=run_config.model_dir,
    )

    if FLAGS.do_train and FLAGS.do_eval:
        # 1. convert the data to tf_record format
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        # 2. read the record data and assemble batches
        train_input_fn = get_tf_record_data(train_file, train_examples, label_list,
                                            tokenizer)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        eval_input_fn = get_tf_record_data(eval_file, eval_examples, label_list,
                                           tokenizer)
        train(estimator, num_train_steps, train_input_fn, eval_input_fn)

    if FLAGS.do_predict:
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(FLAGS.output_dir, "label2id.pkl"), "rb") as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        predict_input_fn = get_tf_record_data(predict_file, predict_examples,
                                              label_list, tokenizer, False, False)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ""
                line_token = predict_line.words
                label_token = predict_line.labels
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ["[CLS]", "[SEP]"]:
                        continue
                    try:
                        line += (line_token[idx] + " " + label_token[idx] + " " +
                                 curr_labels + "\n")
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ""
                        break
                    idx += 1
                writer.write(line + "\n")

        with codecs.open(output_predict_file, "w", encoding="utf-8") as writer:
            result_to_pair(writer)

        eval_result = conlleval.return_report(output_predict_file)
        print("".join(eval_result))
        # write the results to a file
        with codecs.open(os.path.join(FLAGS.output_dir, "predict_score.txt"), "a",
                         encoding="utf-8") as fd:
            fd.write("".join(eval_result))

        adam_filter(FLAGS.output_dir)
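# Neither `get_tf_record_data` nor the four-argument `train` helper called by
# `main` above is defined in this section. A plausible sketch of the former,
# assuming it simply wraps the two helpers the other variants call explicitly,
# with the trailing booleans taken to be is_training / drop_remainder:
def get_tf_record_data_sketch(record_file, examples, label_list, tokenizer,
                              is_training=True, drop_remainder=True):
    if not os.path.exists(record_file):
        filed_based_convert_examples_to_features(examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 record_file, FLAGS.output_dir)
    return file_based_input_fn_builder(input_file=record_file,
                                       seq_length=FLAGS.max_seq_length,
                                       is_training=is_training,
                                       drop_remainder=drop_remainder)


# `load_global_step_from_checkpoint_dir` likewise comes from elsewhere; it is
# plausibly the stock helper that reads global_step out of the latest
# checkpoint so training can resume where it stopped (0 if none exists yet):
def load_global_step_sketch(checkpoint_dir):
    try:
        checkpoint_reader = tf.train.NewCheckpointReader(
            tf.train.latest_checkpoint(checkpoint_dir))
        return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
    except Exception:  # no checkpoint yet
        return 0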
def main():
    '''Prepare and check files.'''
    # validate that do_lower_case is consistent with the checkpoint
    tokenization.validate_case_matches_checkpoint(arg_dic['do_lower_case'],
                                                  arg_dic['init_checkpoint'])

    if not arg_dic['do_train'] and not arg_dic['do_eval'] and not arg_dic['do_predict']:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    # load the BERT config
    bert_config = modeling.BertConfig.from_json_file(arg_dic['bert_config_file'])
    if arg_dic['max_seq_length'] > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (arg_dic['max_seq_length'], bert_config.max_position_embeddings))

    '''Estimator config.'''
    processors = {"ner": SelfProcessor}
    processor = processors[arg_dic["ner"]]()
    tokenizer = tokenization.FullTokenizer(vocab_file=arg_dic["vocab_file"],
                                           do_lower_case=arg_dic["do_lower_case"])

    '''
    tf.Session options:
    log_device_placement: log which device each op is placed on
    intra_op_parallelism_threads: threads used for parallelism inside a single op,
        e.g. a matrix multiplication; 0 means let TF pick the optimal number
    inter_op_parallelism_threads: threads used to run independent ops in parallel,
        e.g. c = a + b and d = e + f can run concurrently; 0 means let TF pick
    allow_soft_placement: if the requested device is unavailable, automatically
        fall back to an available GPU or CPU
    '''
    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    '''
    Estimator config:
    model_dir: where model parameters, graph, etc. are stored
    save_summary_steps: save a summary every this many steps
    save_checkpoints_steps: save a checkpoint every this many steps
    '''
    run_config = tf.estimator.RunConfig(
        model_dir=arg_dic["ckpt_dir"],
        save_summary_steps=arg_dic["save_summary_steps"],
        save_checkpoints_steps=arg_dic["save_checkpoints_steps"],
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    '''Load the data and model settings for train and eval.'''
    if arg_dic["do_train"] and arg_dic["do_eval"]:
        # train
        train_examples = processor.get_train_examples(arg_dic["data_dir"])
        num_train_steps = int(len(train_examples) * 1.0 /
                              arg_dic["train_batch_size"] *
                              arg_dic["num_train_epochs"])
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * arg_dic["warmup_proportion"])
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", arg_dic["train_batch_size"])
        logger.info("  Num steps = %d", num_train_steps)

        # eval
        eval_examples = processor.get_dev_examples(arg_dic["data_dir"])
        # print dev-set information
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", arg_dic["train_batch_size"])

    label_list = processor.get_labels(arg_dic["data_dir"] + "label.txt")

    '''Model of the Estimator.'''
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=arg_dic["init_checkpoint"],
                                learning_rate=arg_dic["learning_rate"],
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    params = {'batch_size': arg_dic["train_batch_size"]}
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)

    '''Train with the Estimator.'''
    if arg_dic["do_train"] and arg_dic["do_eval"]:
        '''data input_fn'''
        # 1. convert the data to tf_record format
        train_file = os.path.join(arg_dic["tfrecord_dir"], "train.tf_record")
        # generate train.tf_record if it does not exist
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples, label_list,
                                                     arg_dic["max_seq_length"],
                                                     tokenizer, train_file,
                                                     arg_dic["tfrecord_dir"])
        # 2. read the record data and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=arg_dic["max_seq_length"],
            is_training=True,
            drop_remainder=True)

        # 1. eval
        eval_file = os.path.join(arg_dic["tfrecord_dir"], "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples, label_list,
                                                     arg_dic["max_seq_length"],
                                                     tokenizer, eval_file,
                                                     arg_dic["tfrecord_dir"])
        # 2. eval read
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=arg_dic["max_seq_length"],
            is_training=False,
            drop_remainder=False)

        '''estimator train'''
        '''
        max_steps_without_decrease: if the metric has not decreased after this many
            steps, training stops
        eval_dir: defaults to estimator.eval_dir(), where the evaluation summary
            files are stored
        run_every_secs: how often should_stop_fn is invoked
        '''
        early_stopping_hook = tf.estimator.experimental.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=arg_dic["save_checkpoints_steps"])

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        '''
        throttle_secs: minimum number of seconds before evaluating again; if no new
            checkpoint has been produced, no evaluation runs, so this is a lower bound.
        '''
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          throttle_secs=arg_dic["eval_model_steps"])
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if arg_dic["do_predict"]:
        token_path = os.path.join(arg_dic["output_dir"], "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(arg_dic["tfrecord_dir"], 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(arg_dic["data_dir"])
        predict_file = os.path.join(arg_dic["output_dir"], "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 arg_dic["max_seq_length"], tokenizer,
                                                 predict_file, arg_dic["output_dir"],
                                                 mode="test")

        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d", len(predict_examples))
        logger.info("  Batch size = %d", arg_dic["train_batch_size"])

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=arg_dic["max_seq_length"],
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(arg_dic["output_dir"], "label_test.txt")

        def result_to_pair(writer):
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer)

        from bert_base.train import conlleval
        # evaluate the predictions
        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))
        # write the results to a file
        with codecs.open(os.path.join(arg_dic["output_dir"], 'predict_score.txt'),
                         'a', encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
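# For reference: every variant above writes `label_test.txt` in the
# whitespace-separated three-column layout that conlleval.return_report
# expects - token, gold label, predicted label - with a blank line between
# sentences. An illustrative (made-up) fragment:
#
#   海 B-LOC B-LOC
#   钓 I-LOC I-LOC
#   比 O O
#   赛 O O
#
# conlleval then computes per-entity precision, recall and F1 from these pairs.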