def create_tokenizer_from_hub_module(bert_model_hub, do_lower=True):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_model_hub)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.compat.v1.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
    return tokenization.FullTokenizer(
        vocab_file=vocab_file,
        do_lower_case=(do_lower_case and do_lower))
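# Usage sketch (the TF-Hub URL below is illustrative, not taken from this file;
# any BERT module that exposes a "tokenization_info" signature should work):
#
#   tokenizer = create_tokenizer_from_hub_module(
#       "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
#   tokens = tokenizer.tokenize("This is a sample sentence.")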
    try:
        del_file(args.output_dir)
    except Exception as e:
        print(e)
        print('please remove the files of the output dir and data.conf')
        exit(-1)

    # check that the output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # create the RC data processor object
    processor = processors[args.rc](args.output_dir)
    label_list = processor.get_labels(labels=args.label_list)
    print(len(label_list))

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    # inter/intra_op_parallelism_threads=0 lets TF pick sensible defaults
    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True)

    if args.do_train and args.do_eval:
        train_and_eval(args=args, processor=processor, tokenizer=tokenizer,
                       bert_config=bert_config, sess_config=session_config,
                       label_list=label_list)

    # if args.filter_adam_var:
    #     adam_filter(os.path.join(args.output_dir, 'model'))

    if args.do_predict:
        # the source is cut off after the first three kwargs; the remaining
        # arguments are completed here by analogy with train_and_eval above
        predict(args=args, processor=processor, tokenizer=tokenizer,
                bert_config=bert_config, sess_config=session_config,
                label_list=label_list)
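# Design note: del_file (defined inside the full main() below) clears the
# output directory by recursing manually over its entries; a more compact
# standard-library equivalent, as a sketch (not what this repo does), would be:
#
#     import shutil
#     shutil.rmtree(args.output_dir, ignore_errors=True)
#     os.mkdir(args.output_dir)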
def main(args):
    """Main entry point."""
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"RC": RCProcessor}

    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only clean up the previous run's artifacts when retraining; skip the
    # cleanup when predicting.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of the output dir and data.conf')
                exit(-1)

    # check that the output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # create the RC data processor object
    processor = processors[args.rc](args.output_dir)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True)

    # Estimator configuration (worth a closer look)
    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=args.save_summary_steps,
        save_checkpoints_steps=args.save_checkpoints_steps,
        session_config=session_config)

    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    num_train_steps = None
    num_warmup_steps = None

    # training and evaluation
    if args.do_train and args.do_eval:
        # load the training data
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        # TODO: how can these log messages also be written to a file?
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

    label_list = processor.get_labels(labels=args.label_list)

    # model_fn_builder returns a function that defines the model plus its
    # training and evaluation logic, and uses an init hook to load the
    # pretrained BERT weights into this model's parameters.
    # This is TF's Estimator pattern: define a model_fn, then let the
    # Estimator API drive training, prediction, and evaluation.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=args.init_checkpoint,
                                learning_rate=args.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    params = {'batch_size': args.batch_size}
    estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config)

    if args.do_train and args.do_eval:
        # 1. Convert the examples to TFRecord files
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, train_file)

        # 2. Read the records back and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, eval_file)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together, with an early-stopping hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=num_train_steps,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # prediction
    if args.do_predict:
        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(
            predict_examples, label_list, args.max_seq_length, tokenizer, predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "prediction_test.txt")

        def result_to_pair(writer, data_file, result):
            # One input line per prediction: write "<line>\t<predicted label>".
            with open(data_file, 'r') as f:
                for line, prediction in zip(f, result):
                    line = line.strip()
                    label = label_list[np.argmax(prediction['probabilities'])]
                    writer.write(line + '\t' + str(label) + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer, os.path.join(args.data_dir, 'test.txt'), result)

        # At predict time we also need the dev-set predictions, since they are
        # post-processed and then evaluated online.
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        predict_input_fn_d = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result_d = estimator.predict(input_fn=predict_input_fn_d)
        output_d_file = os.path.join(args.output_dir, "prediction_dev.txt")
        with codecs.open(output_d_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer, os.path.join(args.data_dir, 'dev.txt'), result_d)

    # filter the Adam variables out of the final checkpoint
    if args.filter_adam_var:
        adam_filter(args.output_dir)
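# A worked example of the step arithmetic used in main() above, extracted as a
# standalone sketch (the helper name and the numbers are illustrative, not part
# of this repo):
def _estimate_steps(num_examples, batch_size, num_epochs, warmup_proportion):
    num_train_steps = int(num_examples * 1.0 / batch_size * num_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)
    return num_train_steps, num_warmup_steps

# e.g. 10,000 examples, batch_size=32, 3 epochs, warmup_proportion=0.1:
# _estimate_steps(10000, 32, 3, 0.1) -> (937, 93)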
def main(_):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.compat.v1.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If a TPU is not available, this will fall back to a normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped.
            # So we pad with fake examples which are ignored later on. These do
            # NOT count towards the metric (all tf.metrics support a
            # per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d (%d actual, %d padding)",
                                  len(eval_examples), num_actual_eval_examples,
                                  len(eval_examples) - num_actual_eval_examples)
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.compat.v1.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which are
            # ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d (%d actual, %d padding)",
                                  len(predict_examples), num_actual_predict_examples,
                                  len(predict_examples) - num_actual_predict_examples)
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.compat.v1.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.compat.v1.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
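# The TPU padding loop above, extracted as a standalone sketch for clarity (the
# helper name is hypothetical, not part of run_classifier.py); the padded
# PaddingInputExample instances get a per-instance weight of 0.0 in the
# metrics, so the padding never affects results:
def _pad_to_batch_multiple(examples, batch_size):
    while len(examples) % batch_size != 0:
        examples.append(PaddingInputExample())
    return examples

# e.g. with 103 eval examples and eval_batch_size=32, this pads to 128
# (25 fake examples that are later ignored).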
def main(args):
    """Main entry point."""
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "ner": NerProcessor
    }

    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # Only clean up the previous run's artifacts when retraining; skip the
    # cleanup when predicting.
    if args.clean and args.do_train:
        if os.path.exists(args.output_dir):
            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of the output dir and data.conf')
                exit(-1)

    # check that the output dir exists
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # create the NER data processor object
    processor = processors[args.ner](args.output_dir)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    session_config = tf.ConfigProto(
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True)

    # Estimator configuration (worth a closer look)
    run_config = tf.estimator.RunConfig(
        model_dir=args.output_dir,
        save_summary_steps=500,
        save_checkpoints_steps=500,
        session_config=session_config)

    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    num_train_steps = None
    num_warmup_steps = None

    # training and evaluation
    if args.do_train and args.do_eval:
        # load the training data
        num_train_steps = int(
            len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
        if num_train_steps < 1:
            raise AttributeError('training data is too small...')
        num_warmup_steps = int(num_train_steps * args.warmup_proportion)

        # TODO: how can these log messages also be written to a file?
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

    label_list = processor.get_labels()

    # model_fn_builder returns a function that defines the model plus its
    # training and evaluation logic, and uses an init hook to load the
    # pretrained BERT weights into this model's parameters.
    # This is TF's Estimator pattern: define a model_fn, then let the
    # Estimator API drive training, prediction, and evaluation.
    # num_labels is len(label_list) + 1 because label id 0 is reserved:
    # the prediction loop below skips id 0 as padding.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=args.init_checkpoint,
        learning_rate=args.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        args=args)

    params = {
        'batch_size': args.batch_size
    }

    estimator = tf.estimator.Estimator(
        model_fn, params=params, config=run_config)

    if args.do_train and args.do_eval:
        # 1. Convert the examples to TFRecord files
        train_file = os.path.join(args.output_dir, "train.tf_record")
        if not os.path.exists(train_file):
            filed_based_convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer,
                train_file, args.output_dir)

        # 2. Read the records back and assemble batches
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=True,
            drop_remainder=True)
        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        if not os.path.exists(eval_file):
            filed_based_convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                eval_file, args.output_dir)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # train and eval together, with an early-stopping hook
        early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name='loss',
            max_steps_without_decrease=1000,
            eval_dir=None,
            min_steps=0,
            run_every_secs=None,
            run_every_steps=args.save_checkpoints_steps)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=num_train_steps,
                                            hooks=[early_stopping_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # prediction
    if args.do_predict:
        token_path = os.path.join(args.output_dir, "token_test.txt")
        if os.path.exists(token_path):
            os.remove(token_path)

        with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(args.data_dir)
        predict_file = os.path.join(args.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(
            predict_examples, label_list, args.max_seq_length, tokenizer,
            predict_file, args.output_dir, mode="test")

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")

        def result_to_pair(writer, examples, result):
            # Write one "<token> <gold label> <predicted label>" line per
            # token, with a blank line between sentences (CoNLL eval format).
            for predict_line, prediction in zip(examples, result):
                idx = 0
                line = ''
                line_token = str(predict_line.text).split(' ')
                label_token = str(predict_line.label).split(' ')
                len_seq = len(label_token)
                if len(line_token) != len(label_token):
                    tf.logging.info(predict_line.text)
                    tf.logging.info(predict_line.label)
                    break
                for id in prediction:
                    if idx >= len_seq:
                        break
                    if id == 0:
                        continue
                    curr_labels = id2label[id]
                    if curr_labels in ['[CLS]', '[SEP]']:
                        continue
                    try:
                        line += line_token[idx] + ' ' + \
                            label_token[idx] + ' ' + curr_labels + '\n'
                    except Exception as e:
                        tf.logging.info(e)
                        tf.logging.info(predict_line.text)
                        tf.logging.info(predict_line.label)
                        line = ''
                        break
                    idx += 1
                writer.write(line + '\n')

        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer, predict_examples, result)

        eval_result = conlleval.return_report(output_predict_file)
        print(''.join(eval_result))
        # write the evaluation scores to a file
        with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'),
                         'a', encoding='utf-8') as fd:
            fd.write(''.join(eval_result))

        # At predict time we also need the train- and dev-set predictions,
        # because the next step uses them to generate negative samples.
        train_file = os.path.join(args.output_dir, "train.tf_record")
        eval_file = os.path.join(args.output_dir, "eval.tf_record")
        predict_input_fn_t = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)
        predict_input_fn_d = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=args.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result_t = estimator.predict(input_fn=predict_input_fn_t)
        output_t_file = os.path.join(args.output_dir, "label_train.txt")
        result_d = estimator.predict(input_fn=predict_input_fn_d)
        output_d_file = os.path.join(args.output_dir, "label_dev.txt")
        with codecs.open(output_t_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer, train_examples, result_t)
        with codecs.open(output_d_file, 'w', encoding='utf-8') as writer:
            result_to_pair(writer, eval_examples, result_d)
        eval_result_t = conlleval.return_report(output_t_file)
        print(''.join(eval_result_t))
        eval_result_d = conlleval.return_report(output_d_file)
        print(''.join(eval_result_d))

    # filter the Adam variables out of the final checkpoint
    if args.filter_adam_var:
        adam_filter(args.output_dir)
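# adam_filter is defined elsewhere in this repo. As a rough sketch, under the
# assumption that it strips the Adam optimizer's slot variables (adam_m/adam_v)
# from the latest checkpoint to shrink it for serving, it would look roughly
# like the following (kept commented out, since it is an assumption and not
# this repo's actual implementation):
#
#     last_ckpt = tf.train.latest_checkpoint(model_path)
#     with tf.Session() as sess:
#         saver = tf.train.import_meta_graph(last_ckpt + '.meta')
#         saver.restore(sess, last_ckpt)
#         keep = [v for v in tf.global_variables()
#                 if 'adam_v' not in v.name and 'adam_m' not in v.name]
#         tf.train.Saver(keep).save(sess, os.path.join(model_path, 'model.ckpt'))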