bert_config=bert_config, is_training=False, input_ids=input_ids_p, input_mask=input_mask_p, segment_ids=None, labels=None, num_labels=num_labels, use_one_hot_embeddings=False, dropout_rate=1.0, ) saver = tf.train.Saver() saver.restore(sess, tf.train.latest_checkpoint(model_dir)) tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join( bert_dir, "vocab.txt"), do_lower_case=args.do_lower_case) ckpt = tf.train.get_checkpoint_state(model_dir) ckpt_path = ckpt.model_checkpoint_path def read_model_param_and_value(): reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path) param_dict = reader.get_variable_to_shape_map() for key, val in param_dict.items(): try: # if "crf_loss" in key or "project" in key: # print(key) # , reader.get_tensor(key)) if "bert/encoder/Reshape_13" in key:
def zjb_eval(output_dir, data_dir, max_seq_length, vocab_file, batch_size,
             learning_rate, init_checkpoint, bert_config_file,
             num_train_epochs):
    """Predict on the test split and score the result with conlleval.

    The test examples from ``data_dir`` are converted to
    ``predict.tf_record``, run through an estimator whose ``model_dir`` is
    ``output_dir``, and written to ``label_test.txt`` as
    ``token gold predicted`` lines. The conlleval report is printed and
    appended to ``predict_score.txt``.

    Settings not passed explicitly (``do_lower_case``, ``ner``,
    ``warmup_proportion``, ``filter_adam_var``) are read from the
    command-line parser.

    Args:
        output_dir: Directory holding ``label2id.pkl`` and the checkpoints;
            all outputs are written here.
        data_dir: Directory the processor reads train/test examples from.
        max_seq_length: Sequence length used for feature conversion.
        vocab_file: Vocabulary file for the tokenizer.
        batch_size: Batch size used for step computation and the estimator.
        learning_rate: Passed through to ``model_fn_builder``.
        init_checkpoint: Passed through to ``model_fn_builder``.
        bert_config_file: JSON file describing the BERT configuration.
        num_train_epochs: Used to derive the number of training steps.

    Raises:
        AttributeError: If the training set yields fewer than one step.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    from bert_base.train.train_helper import get_args_parser
    args = get_args_parser()

    # Start from a clean token dump for this run.
    token_path = os.path.join(output_dir, "token_test.txt")
    if os.path.exists(token_path):
        os.remove(token_path)

    # NOTE(review): pickle.load can execute arbitrary code; output_dir must
    # come from a trusted training run.
    with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
    id2label = {value: key for key, value in label2id.items()}

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.do_lower_case)
    processors = {"ner": NerProcessor}
    processor = processors[args.ner](output_dir)
    label_list = processor.get_labels()

    predict_examples = processor.get_test_examples(data_dir)
    predict_file = os.path.join(output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples, label_list,
                                             max_seq_length, tokenizer,
                                             predict_file, output_dir,
                                             mode="test")

    tf.logging.info("***** Running prediction*****")
    tf.logging.info(" Num examples = %d", len(predict_examples))
    tf.logging.info(" Batch size = %d", batch_size)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    # model_fn_builder needs the step counts even though only predict runs.
    train_examples = processor.get_train_examples(data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / batch_size * num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is so small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    # Use the caller's batch size, consistent with the step computation and
    # the log line above (previously the CLI default was used here).
    params = {'batch_size': batch_size}
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(output_dir, "label_test.txt")

    def result_to_pair(writer):
        """Write one ``token gold predicted`` line per predicted token."""
        for predict_line, prediction in zip(predict_examples, result):
            idx = 0
            line = ''
            line_token = str(predict_line.text).split(' ')
            label_token = str(predict_line.label).split(' ')
            len_seq = len(label_token)
            if len(line_token) != len(label_token):
                tf.logging.info(predict_line.text)
                tf.logging.info(predict_line.label)
                # NOTE(review): this stops processing ALL remaining
                # examples, not only the malformed one - confirm intended.
                break
            for label_id in prediction:
                if idx >= len_seq:
                    break
                if label_id == 0:
                    # Id 0 is skipped without advancing the token index.
                    continue
                curr_labels = id2label[label_id]
                if curr_labels in ['[CLS]', '[SEP]']:
                    continue
                try:
                    line += (line_token[idx] + ' ' + label_token[idx] + ' ' +
                             curr_labels + '\n')
                except Exception as e:
                    tf.logging.info(e)
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    line = ''
                    break
                idx += 1
            # Blank line separates sentences in the output file.
            writer.write(line + '\n')

    with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
        result_to_pair(writer)

    from bert_base.train import conlleval
    eval_result = conlleval.return_report(output_predict_file)
    print(''.join(eval_result))

    # Append the report to the score file.
    with codecs.open(os.path.join(output_dir, 'predict_score.txt'),
                     'a',
                     encoding='utf-8') as fd:
        fd.write(''.join(eval_result))

    # Optionally filter the saved model.
    if args.filter_adam_var:
        adam_filter(output_dir)
def train(args):
    """Run the BERT NER pipeline: optional train+eval, then optional predict.

    Training and evaluation run together when both ``args.do_train`` and
    ``args.do_eval`` are set. Prediction runs when ``args.do_predict`` is
    set and writes ``label_test.txt`` plus a conlleval report appended to
    ``predict_score.txt`` in ``args.output_dir``.

    Raises:
        ValueError: If ``args.max_seq_length`` exceeds the BERT config's
            ``max_position_embeddings``.
        AttributeError: If the training data yields fewer than one step.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Previous outputs are wiped only when re-training, never for predict.
    if args.clean and args.do_train and os.path.exists(args.output_dir):

        def _remove_files(root):
            """Recursively delete files under root (directories remain)."""
            for entry in os.listdir(root):
                entry_path = os.path.join(root, entry)
                if os.path.isdir(entry_path):
                    _remove_files(entry_path)
                else:
                    os.remove(entry_path)

        try:
            _remove_files(args.output_dir)
        except Exception as err:
            print(err)
            print('pleace remove the files of output dir and data.conf')
            exit(-1)

    # Make sure the output directory exists.
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    processor = processors[args.ner](args.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig(model_dir=args.output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # Load the training data and derive the step schedule.
        train_examples = processor.get_train_examples(args.data_dir)
        steps_as_float = (len(train_examples) * 1.0 / args.batch_size *
                          args.num_train_epochs)
        num_train_steps = int(steps_as_float)
        if num_train_steps < 1:
            raise AttributeError('training data is so small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.batch_size)
        logger.info(" Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)

        # Report the size of the dev set.
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.batch_size)

    label_list = processor.get_labels()

    # model_fn defines the model together with its train/eval/predict
    # behaviour; the Estimator drives those phases.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)
    estimator = tf.estimator.Estimator(
        model_fn,
        params={'batch_size': args.batch_size},
        config=run_config)

    if args.do_train and args.do_eval:
        # 1. Serialise the examples to TFRecord files (reused if present).
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)

        # 2. Build the batched input pipelines.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # Train and evaluate together, with an early-stopping hook on loss.
        early_stopping_hook = (
            tf.estimator.experimental.stop_if_no_decrease_hook(
                estimator=estimator,
                metric_name='loss',
                max_steps_without_decrease=num_train_steps,
                eval_dir=None,
                min_steps=0,
                run_every_secs=None,
                run_every_steps=args.save_checkpoints_steps))

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        label2id_path = os.path.join(args.output_dir, 'label2id.pkl')
        with codecs.open(label2id_path, 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction*****")
        logger.info(" Num examples = %d", len(predict_examples))
        logger.info(" Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        with codecs.open(output_predict_file, 'w',
                         encoding='utf-8') as writer:
            # One "token gold predicted" line per token, blank line between
            # sentences.
            for example, predicted_ids in zip(predict_examples, result):
                tokens = str(example.text).split(' ')
                gold_labels = str(example.label).split(' ')
                seq_len = len(gold_labels)
                if len(tokens) != len(gold_labels):
                    logger.info(example.text)
                    logger.info(example.label)
                    break

                pos = 0
                sentence = ''
                for label_id in predicted_ids:
                    if pos >= seq_len:
                        break
                    if label_id == 0:
                        continue
                    predicted = id2label[label_id]
                    if predicted in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        sentence += (tokens[pos] + ' ' + gold_labels[pos] +
                                     ' ' + predicted + '\n')
                    except Exception as err:
                        logger.info(err)
                        logger.info(example.text)
                        logger.info(example.label)
                        sentence = ''
                        break
                    pos += 1
                writer.write(sentence + '\n')

        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        report_text = ''.join(eval_result)
        print(report_text)

        # Append the report to the score file.
        score_path = os.path.join(args.output_dir, 'predict_score.txt')
        with codecs.open(score_path, 'a', encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

    # Optionally filter the saved model.
    if args.filter_adam_var:
        adam_filter(args.output_dir)
def train(args):
    """Train, evaluate and/or predict with a TPUEstimator-based classifier.

    Each phase is gated by ``args.do_train``, ``args.do_eval`` and
    ``args.do_predict``. Evaluation metrics are written to
    ``eval_results.txt`` and per-class probabilities to
    ``test_results.tsv`` under ``args.output_dir``.

    Raises:
        ValueError: If no phase is enabled, or ``args.max_seq_length``
            exceeds the BERT config's ``max_position_embeddings``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "RASA": RasaProcessor,
    }

    tokenization.validate_case_matches_checkpoint(args.do_lower_case,
                                                  args.init_checkpoint)

    if not (args.do_train or args.do_eval or args.do_predict):
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(args.output_dir)

    processor = processors[args.ner]()
    label_list = processor.get_labels(args.data_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    tpu_cluster_resolver = None
    if args.use_tpu and args.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            args.tpu_name, zone=args.tpu_zone, project=args.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=args.iterations_per_loop,
        num_shards=args.num_tpu_cores,
        per_host_input_for_training=is_per_host)
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=args.master,
        model_dir=args.output_dir,
        save_checkpoints_steps=args.save_checkpoints_steps,
        tpu_config=tpu_config)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.batch_size * args.num_train_epochs)
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=args.use_tpu,
                                use_one_hot_embeddings=args.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=args.use_tpu,
                                            model_fn=model_fn,
                                            config=run_config,
                                            train_batch_size=args.batch_size,
                                            eval_batch_size=args.batch_size,
                                            predict_batch_size=args.batch_size)

    if args.do_train:
        train_file = os.path.join(args.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                args.max_seq_length,
                                                tokenizer, train_file,
                                                args.output_dir)
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", args.batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if args.use_tpu:
            # TPU needs a fixed batch size, so the example count is padded
            # up to a multiple of the batch size with placeholder examples.
            while len(eval_examples) % args.batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                args.max_seq_length,
                                                tokenizer, eval_file,
                                                args.output_dir)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info(" Batch size = %d", args.batch_size)

        # None tells the estimator to run through the entire set; on TPU
        # the number of steps has to be given explicitly.
        eval_steps = None
        if args.use_tpu:
            assert len(eval_examples) % args.batch_size == 0
            eval_steps = int(len(eval_examples) // args.batch_size)

        if args.use_tpu:
            eval_drop_remainder = True
        else:
            eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for metric_name in sorted(result.keys()):
                metric_value = str(result[metric_name])
                tf.logging.info(" %s = %s", metric_name, metric_value)
                writer.write("%s = %s\n" % (metric_name, metric_value))

    if args.do_predict:
        predict_examples = processor.get_test_examples(args.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if args.use_tpu:
            # Same fixed-batch padding as for evaluation.
            while len(predict_examples) % args.batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                args.max_seq_length,
                                                tokenizer, predict_file,
                                                args.output_dir)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info(" Batch size = %d", args.batch_size)

        if args.use_tpu:
            predict_drop_remainder = True
        else:
            predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        print("result: {}".format(result))

        output_predict_file = os.path.join(args.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for position, prediction in enumerate(result):
                probabilities = prediction["probabilities"]
                # Everything past the real examples is padding.
                if position >= num_actual_predict_examples:
                    break
                columns = [str(class_probability)
                           for class_probability in probabilities]
                writer.write("\t".join(columns) + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def train(args):
    """Train/evaluate and/or predict with the BERT NER estimator.

    Training and evaluation run together when both ``args.do_train`` and
    ``args.do_eval`` are set, with early stopping on the ``f1`` metric.
    When ``args.do_predict`` is set, predictions are written to
    ``label_test.txt``, scored with conlleval, post-processed for the
    daGuan competition files, and the scores are appended to
    ``predict_score.txt``.

    Raises:
        ValueError: If ``args.max_seq_length`` exceeds the BERT config's
            ``max_position_embeddings``.
        AttributeError: If the training data yields fewer than one step.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Previous outputs are deleted only when re-training; prediction keeps
    # them.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                """Recursively delete files under path (dirs remain)."""
                for name in os.listdir(path):
                    c_path = os.path.join(path, name)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('pleace remove the files of output dir and data.conf')
                exit(-1)

    # Make sure the output directory and its eval sub-directory exist.
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    eval_dir = os.path.join(args.output_dir, 'eval')
    if not os.path.exists(eval_dir):
        os.mkdir(eval_dir)

    processor = processors[args.ner](args.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,  # where checkpoints are saved
        keep_checkpoint_max=10,  # maximum number of checkpoints kept
        save_summary_steps=args.save_summary_steps,
        save_checkpoints_steps=args.save_checkpoints_steps,
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # Load the training data and derive the step schedule.
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size *
            args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is so small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d" % len(train_examples))
        logger.info(" Batch size = %d" % args.batch_size)
        logger.info(" Num steps = %d" % num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)

        # Report the size of the dev set.
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d" % len(eval_examples))
        logger.info(" Batch size = %d" % args.batch_size)

    label_list_path = os.path.join(args.output_dir, 'label_list.pkl')
    if not os.path.exists(label_list_path):
        label_list = processor.get_labels(labels=args.label_list)
    else:
        # Pickle data is bytes, so the file must be opened in binary mode.
        # NOTE(review): pickle.load can execute arbitrary code; output_dir
        # must come from a trusted run.
        with open(label_list_path, 'rb') as p:
            label_list = pickle.load(p)

    # model_fn defines the model together with its train/eval/predict
    # behaviour; the Estimator drives those phases.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        # label_list has no pad entry for id 0, hence the +1.
        num_labels=len(label_list) + 1,
        init_checkpoint=args.init_checkpoint,
        learning_rate=args.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        args=args)

    params = {'batch_size': args.batch_size}

    # model_dir is given both here and in run_config; the two must agree.
    estimator = tf.estimator.Estimator(model_fn,
                                       model_dir=args.output_dir,
                                       params=params,
                                       config=run_config)

    if args.do_train and args.do_eval:
        # 1. Serialise the examples to TFRecord files (reused if present).
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)

        # 2. Build the batched input pipelines.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # Train and evaluate together; stop early when f1 stops improving.
        early_stopping_hook = tf.contrib.estimator.stop_if_no_increase_hook(
            estimator=estimator,
            metric_name='f1',
            max_steps_without_increase=args.max_steps_without_decrease,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        # steps=None evaluates on the whole dev set; evaluations are at
        # least throttle_secs apart.
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          steps=None,
                                          throttle_secs=120)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
        print('id2label:{}'.format(id2label))

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test")

        logger.info("***** Running prediction*****")
        logger.info(" Num examples = %d" % len(predict_examples))
        logger.info(" Batch size = %d" % args.batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            """Write one ``token gold predicted`` line per real token."""
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    logger.info(predict_line.text)
                    logger.info(predict_line.label)
                    # NOTE(review): this stops processing ALL remaining
                    # examples, not only the malformed one - confirm.
                    break
                for label_id in prediction:
                    if idx >= len_seq + 2:
                        # Everything after [SEP] is padding.
                        break
                    if idx == 0 or idx == len_seq + 1:
                        # Skip the [CLS] (first) and [SEP] (last) slots.
                        idx += 1
                        continue
                    curr_labels = id2label[label_id]
                    # A [CLS]/[SEP] prediction on a real token is mapped
                    # to 'O'.
                    if curr_labels in ['[CLS]', '[SEP]']:
                        curr_labels = 'O'
                    try:
                        line += (line_token[idx - 1] + ' ' +
                                 label_token[idx - 1] + ' ' +
                                 iobes_iob([curr_labels])[0] + '\n')
                    except Exception as e:
                        logger.info(e)
                        logger.info(predict_line.text)
                        logger.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                # Blank line separates sentences in the output file.
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w',
                         encoding='utf-8') as writer:
            result_to_pair(writer)

        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))

        # --- daGuan competition post-processing (remove if not needed) ---
        # Keep only the token and predicted-label columns. The prediction
        # file was written as UTF-8, so it is read back as UTF-8 too.
        with open('dg_result.txt', 'w', encoding='utf-8') as tmp_file, \
                open(output_predict_file, encoding='utf-8') as f:
            for line in f:
                if line == '\n':
                    tmp_file.write('\n')
                    continue
                lis_line = line.strip().split()
                tmp_file.write(lis_line[0] + '\t' + lis_line[-1] + '\n')

        tf_metrics.recover_reduce_sentence_length('dg_result.txt',
                                                  'dg_NERdata/test_raw.txt',
                                                  'dg_rc_result.txt')
        tf_metrics.BIO2line_file('dg_rc_result.txt',
                                 'dg_NERdata/result_file.txt')
        f1score = tf_metrics.get_f1score(
            result_file='dg_NERdata/result_file.txt',
            target_file='dg_NERdata/train_v_8.txt')
        print('df_f1score: {}'.format(f1score))
        # --- end of daGuan competition post-processing ---

        # Append the scores to the score file.
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))
            # daGuan-specific; remove together with the section above.
            fd.write('dg_f1score: {}\n'.format(f1score))
def train(FLAGS):
    """Train and evaluate the BERT NER estimator.

    Loads the train and dev examples, serialises them to TFRecord files in
    ``FLAGS.output_dir`` (existing files are reused), builds the estimator
    and runs ``tf.estimator.train_and_evaluate`` with an early-stopping
    hook on the ``loss`` metric. Training only happens when both
    ``FLAGS.do_train`` and ``FLAGS.do_dev`` are set.

    Raises:
        ValueError: If ``FLAGS.max_seq_length`` exceeds the BERT config's
            ``max_position_embeddings``.
        AttributeError: If the training data yields fewer than one step.
    """
    print(FLAGS.bert_config_file)
    processors = {"ner": NerProcessor}
    # Load the BERT configuration with BERT's own helper.
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Make sure the output directory exists.
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)

    processor = processors['ner'](FLAGS.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    session_config = tf.ConfigProto(
        log_device_placement=False,  # do not log device placement
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True)  # allow fallback to another device
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config)

    if not (FLAGS.do_train and FLAGS.do_dev):
        # Without both flags no examples are loaded, so there is nothing
        # to convert or train on.
        logger.info("do_train/do_dev not both set; skipping training")
        return

    # Load the training data and derive the step schedule.
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / FLAGS.train_batch_size *
        FLAGS.num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is so small...')
    # Number of warm-up steps, as a proportion of the total.
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    logger.info("***** Running training *****")
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)

    label_list = processor.get_labels()

    # 1. Serialise the examples to TFRecord files. The train file is
    #    generated when it is missing, matching the eval file below; the
    #    check used to be inverted, so a fresh run never produced it.
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    if not os.path.exists(train_file):
        filed_based_convert_examples_to_features(train_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, train_file,
                                                 FLAGS.output_dir)

    # 2. Build the batched input pipelines.
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    if not os.path.exists(eval_file):
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file,
                                                 FLAGS.output_dir)

    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # model_fn defines the model together with its train/eval behaviour;
    # the Estimator drives those phases.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                FLAGS=FLAGS)
    params = {'batch_size': FLAGS.train_batch_size}
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
        estimator=estimator,
        metric_name='loss',  # metric being monitored
        # Stop when the metric has not decreased for this many steps.
        max_steps_without_decrease=num_train_steps,
        eval_dir=None,  # None falls back to the estimator's eval dir
        min_steps=0,  # never stop before this many steps
        run_every_secs=None,
        # How often (in steps) the stop condition is checked.
        run_every_steps=FLAGS.save_checkpoints_steps)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[early_stopping_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train(self):
    """Build the data pipelines and model, then train for max_epoch epochs.

    With ``ARGS.bert`` set, BERT-tokenised batches are used; otherwise the
    plain ``DataBatch`` pipeline is used and its vocabulary/tag map is
    dumped to ``data/data_map.pkl``. Variables are restored from
    ``self.checkpoint_dir`` when a checkpoint exists, otherwise
    initialised. After every epoch the model is evaluated on the "ORG" and
    "PER" tags and saved to ``self.checkpoint_path``.
    """
    if ARGS.bert:
        from bert_data_utils import BertDataUtils
        tokenizer = tokenization.FullTokenizer(vocab_file=ARGS.vocab_dir, )
        self.train_data = BertDataUtils(tokenizer, batch_size=1)
        self.dev_data = BertDataUtils(tokenizer, batch_size=20)
        self.dev_batch = self.dev_data.iteration()
    else:
        from data_utils import DataBatch
        self.train_data = DataBatch(data_type='train', batch_size=1)

        data = {
            "batch_size": self.train_data.batch_size,
            "input_size": self.train_data.input_size,
            "vocab": self.train_data.vocab,
            "tag_map": self.train_data.tag_map,
        }
        # Context manager closes the file even if dumping fails.
        with open("data/data_map.pkl", "wb") as f:
            cPickle.dump(data, f)

        self.vocab = self.train_data.vocab
        self.input_size = len(self.vocab.values()) + 1
        self.dev_data = DataBatch(data_type='dev', batch_size=300)
        self.dev_batch = self.dev_data.iteration()

    self.nums_tags = len(self.train_data.tag_map.keys())
    self.tag_map = self.train_data.tag_map
    self.train_length = len(self.train_data.data)

    print("-" * 50)
    print("train data:\t", self.train_length)
    print("nums of tags:\t", self.nums_tags)

    self.__creat_model()
    with tf.Session() as sess:
        with tf.device("/gpu:0"):
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(
                    ckpt.model_checkpoint_path):
                print("restore model")
                self.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(tf.global_variables_initializer())

            tvars = tf.trainable_variables()
            (assignment_map, initialized_variable_names) = \
                modeling.get_assignment_map_from_checkpoint(
                    tvars, ARGS.init_checkpoint)
            # NOTE(review): this runs after the initializer/restore above;
            # confirm the pretrained weights actually take effect here.
            tf.train.init_from_checkpoint(ARGS.init_checkpoint,
                                          assignment_map)
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                # print() does not interpolate logging-style arguments, so
                # the values are formatted explicitly.
                print(" name = %s, shape = %s%s" %
                      (var.name, var.shape, init_string))

            for i in range(self.max_epoch):
                print("-" * 50)
                print("epoch {}".format(i))

                steps = 0
                for batch in self.train_data.get_batch():
                    steps += 1
                    if ARGS.bert:
                        _, loss, _, acc, _ = self.bert_step(sess, batch)
                    else:
                        _, loss, _, acc, _ = self.step(sess, batch)
                    # Progress is reported on every step.
                    print("[->] step {}/{}\tloss {:.2f}\tacc {:.2f}".format(
                        steps, len(self.train_data.batch_data), loss, acc))

                if ARGS.bert:
                    self.bert_evaluate(sess, "ORG")
                    self.bert_evaluate(sess, "PER")
                else:
                    self.evaluate(sess, "ORG")
                    self.evaluate(sess, "PER")
                self.saver.save(sess, self.checkpoint_path)
# pred_ids = tf.identity(pred_ids, 'pred_ids') # probabilities = tf.identity(probabilities, 'pred_prob') saver = tf.train.Saver() # (total_loss, logits, trans, pred_ids) = create_model( # bert_config=bert_config, is_training=False, input_ids=input_ids_p, input_mask=input_mask_p, segment_ids=None, # labels=None, num_labels=num_labels, use_one_hot_embeddings=False, dropout_rate=1.0) saver = tf.train.Saver() print("model_dir: ", model_dir) saver.restore(sess, tf.train.latest_checkpoint(model_dir)) tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join( bert_dir, 'vocab.txt'), do_lower_case=DO_LOWER_CASE) class InputFeatures(object): """A single set of features of data.""" def __init__( self, input_ids, input_mask, segment_ids, ): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids # self.label_ids = label_ids
tag_ids = tag_ids + (max_length - len(tag_ids)) * [0] inputs_ids = inputs_ids + (max_length - len(inputs_ids)) * [0] segment_ids = segment_ids + (max_length - len(segment_ids)) * [0] input_mask = input_mask + (max_length - len(input_mask)) * [0] assert len(tag_ids) == len(inputs_ids) == len(segment_ids) == len( input_mask) padded_data.append( [ntokens, tag_ids, inputs_ids, segment_ids, input_mask]) return padded_data def iteration(self): idx = 0 while True: yield self.batch_data[idx] idx += 1 if idx > len(self.batch_data) - 1: idx = 0 def get_batch(self): for data in self.batch_data: yield data if __name__ == "__main__": from bert_base.bert import tokenization tokenizer = tokenization.FullTokenizer(vocab_file="data/vocab.txt", ) bert_data_util = BertDataUtils(tokenizer) bert_data_util.load_data() bert_data_util.prepare_batch() import pdb pdb.set_trace()
def main(_):
    """Fine-tune, evaluate and/or run prediction with a BERT estimator.

    The stages that run are selected by ``FLAGS.do_train``,
    ``FLAGS.do_eval`` and ``FLAGS.do_predict``.  Examples are converted to
    TFRecord files under ``FLAGS.output_dir``; evaluation metrics go to
    ``eval_results.txt`` and predictions to ``test_results.txt`` in the same
    directory.

    Raises:
        ValueError: if none of the three stages is enabled, or if
            ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings``.
    """
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    processor = ThuProcessor()

    # Build the tokenizer.
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Estimator run configuration.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=5,
        log_step_count_steps=500,
        session_config=tf.ConfigProto(log_device_placement=True))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # get_labels() must be called after get_train_examples or other examples.
    # NOTE(review): when do_train is off no examples have been loaded yet at
    # this point -- confirm the processor still returns the right labels.
    label_list = processor.get_labels()
    # The label string is passed as a lazy %-argument; the previous call had
    # no placeholder, which made logging report a formatting error instead
    # of printing the labels.
    logger.info('************ label_list=%s', ' '.join(label_list))

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    # `params` is a dict whose keys are the parameter names used inside
    # model_fn and whose values are the corresponding data.
    params = {
        'batch_size': FLAGS.train_batch_size,
    }

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=params,
    )

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file, 'train')
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", FLAGS.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            num_label=len(label_list),
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file, 'eval')

        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            num_label=len(label_list),
            is_training=False,
            drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn)

        # Metrics are both logged and persisted, one "key = value" per line.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file,
                                                'test')

        logger.info("***** Running prediction*****")
        logger.info(" Num examples = %d", len(predict_examples))
        logger.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            num_label=len(label_list),
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)

        # One tab-separated line of values per predicted example.
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.txt")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            logger.info("***** Predict results *****")
            for prediction in result:
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)
def main(_):
    """Run BERT over ``FLAGS.input_file`` and dump per-token layer outputs.

    For every example yielded by the estimator one JSON object is written
    (one per line) to ``FLAGS.output_file``.  Each object carries the
    example's unique id and, for each token, the values of every layer
    listed in ``FLAGS.layers``, rounded to 6 decimals.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # FLAGS.layers is a comma-separated list of layer indexes.
    layer_indexes = list(map(int, FLAGS.layers.split(",")))

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_config = tf.contrib.tpu.TPUConfig(
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    run_config = tf.contrib.tpu.RunConfig(master=FLAGS.master,
                                          tpu_config=tpu_config)

    examples = read_examples(FLAGS.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=FLAGS.max_seq_length,
                                            tokenizer=tokenizer)

    # Lookup used to map each prediction back to its tokenised input.
    unique_id_to_feature = {
        feature.unique_id: feature for feature in features
    }

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(features=features,
                                seq_length=FLAGS.max_seq_length)

    utf8_writer = codecs.getwriter("utf-8")
    with utf8_writer(tf.gfile.Open(FLAGS.output_file, "w")) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            example_id = int(result["unique_id"])
            example = unique_id_to_feature[example_id]

            token_records = []
            for position, token in enumerate(example.tokens):
                per_layer = []
                for slot, layer_index in enumerate(layer_indexes):
                    activations = result["layer_output_%d" % slot]
                    token_slice = activations[position:(position + 1)]
                    per_layer.append(
                        collections.OrderedDict([
                            ("index", layer_index),
                            ("values",
                             [round(float(x), 6) for x in token_slice.flat]),
                        ]))
                token_records.append(
                    collections.OrderedDict([
                        ("token", token),
                        ("layers", per_layer),
                    ]))

            # Key spelling ("linex_index") is part of the output format.
            record = collections.OrderedDict()
            record["linex_index"] = example_id
            record["features"] = token_records
            writer.write(json.dumps(record) + "\n")
def train(FLAGS):
    """Train and evaluate the BERT NER model with ``train_and_evaluate``.

    Training and dev examples are converted to TFRecord files under
    ``FLAGS.output_dir`` (conversion is skipped when the file already
    exists) and fed to a ``tf.estimator.Estimator`` built from
    ``model_fn_builder``.

    Args:
        FLAGS: parsed configuration object; the attributes read here are
            ``bert_config_file``, ``max_seq_length``, ``output_dir``,
            ``vocab_file``, ``do_lower_case``, ``save_summary_steps``,
            ``save_checkpoints_steps``, ``do_train``, ``do_dev``,
            ``data_dir``, ``train_batch_size``, ``num_train_epochs``,
            ``warmup_proportion``, ``init_checkpoint`` and
            ``learning_rate``.

    Raises:
        ValueError: if ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings``.
        AttributeError: if the training set yields fewer than one step.
    """
    print(FLAGS.bert_config_file)
    processors = {'ner': NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # max_seq_length must not exceed the configured max_position_embeddings.
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        # The trailing space after "model" keeps the two literals from
        # running together in the rendered message.
        raise ValueError(
            'Cannot use sequence length %d because the BERT model '
            'was not trained up to sequence length %d' %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Make sure the output directory exists (including missing parents).
    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    processor = processors['ner'](FLAGS.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # tf.ConfigProto -- TensorFlow session configuration:
    #   log_device_placement: when True, log which device each op runs on.
    #   inter_op_parallelism_threads / intra_op_parallelism_threads:
    #       thread-pool sizes; 0 lets the system pick a value.
    #   allow_soft_placement: allow ops to fall back to another device
    #       (e.g. the CPU when no GPU kernel or no GPU is available).
    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)

    # tf.estimator.RunConfig -- estimator run configuration:
    #   model_dir: where checkpoints/summaries are written.
    #   save_summary_steps: summary interval in steps.
    #   save_checkpoints_steps: checkpoint interval in steps.
    #   session_config: the session configuration built above.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_summary_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    # NOTE(review): every step below needs the training examples and step
    # counts computed here, so the whole pipeline sits under this guard --
    # confirm that matches the intended behaviour when either flag is off.
    if FLAGS.do_train and FLAGS.do_dev:
        # Load the training data.
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(1.0 * len(train_examples) /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is so small...')
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        logger.info('***** Running training *****')

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)

        label_list = processor.get_labels()

        # 1. Convert the training data to a TFRecord file (once).
        train_file = os.path.join(FLAGS.output_dir, 'train.tf_record')
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(
                examples=train_examples,
                label_list=label_list,
                max_seq_length=FLAGS.max_seq_length,
                tokenizer=tokenizer,
                output_file=train_file,
                output_dir=FLAGS.output_dir)

        # 2. Read the training TFRecord file back as batches.
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        # 1. Convert the dev data to a TFRecord file (once).
        eval_file = os.path.join(FLAGS.output_dir, 'eval.tf_record')
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(
                examples=eval_examples,
                label_list=label_list,
                max_seq_length=FLAGS.max_seq_length,
                tokenizer=tokenizer,
                output_file=eval_file,
                output_dir=FLAGS.output_dir)

        # 2. Read the dev TFRecord file back as batches.
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # model_fn defines the model plus its train/eval behaviour; the
        # Estimator API then drives training, evaluation and prediction.
        # One extra label slot is reserved on top of label_list.
        model_fn = model_fn_builder(bert_config=bert_config,
                                    num_labels=len(label_list) + 1,
                                    init_checkpoint=FLAGS.init_checkpoint,
                                    learning_rate=FLAGS.learning_rate,
                                    num_train_steps=num_train_steps,
                                    num_warmup_steps=num_warmup_steps,
                                    FLAGS=FLAGS)

        params = {'batch_size': FLAGS.train_batch_size}
        estimator = tf.estimator.Estimator(model_fn,
                                           params=params,
                                           config=run_config)

        # Early stopping on the eval loss, to guard against over-fitting.
        # NOTE(review): the patience equals the whole training budget, so
        # this presumably never fires before max_steps -- confirm intent.
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=FLAGS.save_checkpoints_steps)

        # Train and evaluate.
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def main(_):
    """Train/evaluate and/or predict with the BERT NER estimator.

    * With ``FLAGS.do_train`` and ``FLAGS.do_eval`` both set, training and
      dev data are loaded and handed to ``train``.
    * With ``FLAGS.do_predict`` set, test examples are predicted, written
      to ``label_test.txt`` as "token gold predicted" lines, scored with
      ``conlleval`` and the report appended to ``predict_score.txt``.

    Raises:
        ValueError: if no stage is enabled, or if ``FLAGS.max_seq_length``
            exceeds the model's ``max_position_embeddings``.
        AttributeError: if the computed number of training steps is < 1.
        SystemExit: if cleaning the previous output directory fails.
    """
    import sys  # local import: only needed for the clean-up failure exit

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.device_map

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Only wipe the previous run's files when re-training; prediction never
    # cleans the output directory.
    if FLAGS.clean and FLAGS.do_train:
        if os.path.exists(FLAGS.output_dir):

            def del_file(path):
                """Recursively remove files under *path*; keep directories."""
                for entry in os.listdir(path):
                    c_path = os.path.join(path, entry)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(FLAGS.output_dir)
            except Exception as e:
                print(e)
                print("pleace remove the files of output dir and data.conf")
                # sys.exit instead of the `site`-provided exit(), which is
                # not guaranteed to exist in every interpreter set-up.
                sys.exit(-1)

    # Make sure the output directory exists (including missing parents).
    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)

    history_max_steps = load_global_step_from_checkpoint_dir(FLAGS.output_dir)

    processor = processors[FLAGS.task_name](FLAGS.data_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    session_config = tf.compat.v1.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True,
    )

    # Summaries are written at the same interval as checkpoints.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=FLAGS.save_checkpoints_steps,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config,
    )

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train and FLAGS.do_eval:
        # Load the training data; the step budget is added on top of the
        # global step recovered from the checkpoint directory.
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = history_max_steps + int(
            len(train_examples) * 1.0 / FLAGS.batch_size *
            FLAGS.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError("training data is so small...")
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)

        # Log the dev-set information.
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", len(eval_examples))
        tf.logging.info(" Batch size = %d", FLAGS.batch_size)

    label_list = processor.get_labels()

    # model_fn defines the model plus its train/eval behaviour; the
    # Estimator API then drives training, evaluation and prediction.
    # One extra label slot is reserved on top of label_list.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        args=FLAGS,
    )

    params = {"batch_size": FLAGS.batch_size}

    estimator = tf.estimator.Estimator(
        model_fn,
        params=params,
        config=run_config,
    )

    if FLAGS.do_train and FLAGS.do_eval:
        # Build the TFRecord-backed input functions, then train.
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        train_input_fn = get_tf_record_data(train_file, train_examples,
                                            label_list, tokenizer)

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        eval_input_fn = get_tf_record_data(eval_file, eval_examples,
                                           label_list, tokenizer)

        train(estimator, num_train_steps, train_input_fn, eval_input_fn)

    if FLAGS.do_predict:
        # Drop the token dump left behind by a previous prediction run.
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        # NOTE: pickle.load executes arbitrary code from the file; this is
        # only safe because label2id.pkl is produced by this project.
        with codecs.open(os.path.join(FLAGS.output_dir, "label2id.pkl"),
                         "rb") as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        predict_input_fn = get_tf_record_data(predict_file, predict_examples,
                                              label_list, tokenizer, False,
                                              False)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.batch_size)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")

        def result_to_pair(writer):
            """Write "token gold predicted" lines for every test example."""
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ""
                line_token = predict_line.words
                label_token = predict_line.labels
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    # NOTE(review): tokens are read from .words/.labels but
                    # the diagnostics log .text/.label -- confirm the
                    # example class exposes both sets of attributes.
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    break
                for label_id in prediction:
                    if idx >= len_seq:
                        break
                    # Id 0 and the [CLS]/[SEP] markers carry no token.
                    if label_id == 0:
                        continue
                    curr_labels = id2label[label_id]
                    if curr_labels in ["[CLS]", "[SEP]"]:
                        continue
                    try:
                        line += (line_token[idx] + " " + label_token[idx] +
                                 " " + curr_labels + "\n")
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ""
                        break
                    idx += 1
                writer.write(line + "\n")

        with codecs.open(output_predict_file, "w",
                         encoding="utf-8") as writer:
            result_to_pair(writer)

        eval_result = conlleval.return_report(output_predict_file)
        print("".join(eval_result))

        # Append the evaluation report to the score file.
        with codecs.open(os.path.join(FLAGS.output_dir, "predict_score.txt"),
                         "a",
                         encoding="utf-8") as fd:
            fd.write("".join(eval_result))

    adam_filter(FLAGS.output_dir)
def train(args):
    """Train, evaluate and/or predict with the BERT NER estimator.

    * With ``args.do_train`` and ``args.do_eval`` both set, training and dev
      data are converted to TFRecord files (skipped when they already
      exist), the estimator is trained, then evaluated once; both phases
      are timed and logged.
    * With ``args.do_predict`` set, test examples are predicted, written to
      ``label_test.txt`` as "token gold predicted" lines, scored with
      ``conlleval`` and the report appended to ``predict_score.txt``.
    * With ``args.filter_adam_var`` set, ``adam_filter`` is run on the
      output directory at the end.

    Args:
        args: parsed configuration object (device map, BERT paths, sequence
            length, batch size, stage switches, ...).

    Raises:
        ValueError: if ``args.max_seq_length`` exceeds the model's
            ``max_position_embeddings``.
        AttributeError: if the training set yields fewer than one step.
        SystemExit: if cleaning the previous output directory fails.
    """
    import sys  # local import: only needed for the clean-up failure exit

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"ner": NerProcessor}
    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only wipe the previous run's files when re-training; prediction never
    # cleans.  Per the original author's note args.clean and args.do_train
    # default to true, so a second training run deletes the first run's
    # files.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):

            def del_file(path):
                """Recursively remove files under *path*; keep directories."""
                for entry in os.listdir(path):
                    c_path = os.path.join(path, entry)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('pleace remove the files of output dir and data.conf')
                # sys.exit instead of the `site`-provided exit(), which is
                # not guaranteed to exist in every interpreter set-up.
                sys.exit(-1)

    # Make sure the output directory exists (including missing parents).
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = processors[args.ner](args.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(
        device_count={"CPU": 2},  # limit to num_cpu_core CPU usage
        log_device_placement=False,  # True would log op device placement
        inter_op_parallelism_threads=6,
        intra_op_parallelism_threads=6,
        allow_soft_placement=True)

    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=500,  # summary interval, in steps
        save_checkpoints_steps=500,  # checkpoint interval, in steps
        session_config=session_config)

    train_examples = None
    eval_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if args.do_train and args.do_eval:
        # Load the training data.
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size *
            args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is so small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", args.batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)

        eval_examples = processor.get_dev_examples(args.data_dir)

        # Log the dev-set information.
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", len(eval_examples))
        tf.logging.info(" Batch size = %d", args.batch_size)

    tf.logging.info("get labels")
    label_list = processor.get_labels()

    # model_fn defines the model plus its train/eval behaviour; the
    # Estimator API then drives training, evaluation and prediction.
    # One extra label slot is reserved on top of label_list.
    tf.logging.info('def model_fn_builer')
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}

    tf.logging.info('def estimator')
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    # Training and evaluation only run when both switches are on.
    if args.do_train and args.do_eval:
        # 1. Convert the training data to a TFRecord file (once).
        tf.logging.info('convert data into train tf_record ')
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(train_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, train_file,
                                                     args.output_dir)

        # 2. Read the TFRecord file back as batches.
        tf.logging.info('read train record and convert to batch')
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)

        tf.logging.info('convert data to eval tf_record 数据')
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(eval_examples,
                                                     label_list,
                                                     args.max_seq_length,
                                                     tokenizer, eval_file,
                                                     args.output_dir)

        tf.logging.info('read eval record ,convert to batch')
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # Early stopping on the eval loss.
        # NOTE(review): the patience equals the whole training budget, so
        # this presumably never fires before max_steps -- confirm intent.
        tf.logging.info('call early stopping hook')
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)

        # Train first, then evaluate once on the dev set, timing both
        # phases (done explicitly here rather than via
        # tf.estimator.train_and_evaluate).
        t0 = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=[early_stopping_hook])
        t1 = time.time()
        tf.logging.info('train spent time:{}s'.format(t1 - t0))

        eval_loss = estimator.evaluate(input_fn=eval_input_fn)
        t2 = time.time()
        tf.logging.info('eval_loss=\n{}'.format(eval_loss))
        tf.logging.info('eval spent time:{}s'.format(t2 - t1))

    if args.do_predict:
        # Drop the token dump left behind by a previous prediction run.
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        # NOTE: pickle.load executes arbitrary code from the file; this is
        # only safe because label2id.pkl is produced by this project.
        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'),
                         'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        # NOTE(review): mode is "test.txt" here while the clean-up above
        # targets "token_test.txt" -- verify the converter treats this
        # value the same as "test".
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 args.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 args.output_dir,
                                                 mode="test.txt")

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", args.batch_size)

        predict_drop_remainder = False
        tf.logging.info('call pred predict_input_fn')
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        tf.logging.info('start predict,estimator.predict')
        result = estimator.predict(input_fn=predict_input_fn)

        # Output file has three columns: token, gold label, predicted label.
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer):
            """Write "token gold predicted" lines for every test example."""
            for predict_line, prediction in zip(predict_examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info('predict_line.text:\n{}'.format(
                        predict_line.text))
                    tf.logging.info('predict_line.label:\n{}'.format(
                        predict_line.label))
                    break
                for label_id in prediction:
                    if idx >= len_seq:
                        break
                    # Id 0 and the [CLS]/[SEP] markers carry no token.
                    if label_id == 0:
                        continue
                    curr_labels = id2label[label_id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + label_token[
                            idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info('e:\n{}'.format(e))
                        tf.logging.info('predict_line.text:\n{}'.format(
                            predict_line.text))
                        tf.logging.info('predict_line.label:\n{}'.format(
                            predict_line.label))
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        # Write the paired results to disk.
        tf.logging.info('save predicted result :label_test.txt')
        with codecs.open(output_predict_file, 'w',
                         encoding='utf-8') as writer:
            result_to_pair(writer)

        tf.logging.info('import conlleval to eval and get eval_result ')
        from bert_base.train import conlleval
        eval_result = conlleval.return_report(output_predict_file)
        tf.logging.info('eval_result:\n{}'.format(''.join(eval_result)))

        # Append the evaluation report to the score file.
        tf.logging.info('save predict_score.txt')
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a',
                         encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

    # Optionally filter the saved model.
    if args.filter_adam_var:
        adam_filter(args.output_dir)