def run_training(
    train_fn,
    model_fn,
    model_dir: str,
    num_gpus: int = 1,
    gpu_mem_fraction: float = 0.96,
    log_step: int = 100,
    summary_step: int = 100,
    save_checkpoint_step: int = 1000,
    max_steps: int = 10000,
    eval_step: int = 10,
    eval_throttle: int = 120,
    train_batch_size: int = 128,
    train_hooks=None,
    eval_fn=None,
):
    """Train ``model_fn`` with a ``tf.estimator.Estimator``, optionally evaluating.

    Args:
        train_fn: Input function passed to the estimator for training.
        model_fn: Model function passed to ``tf.estimator.Estimator``.
        model_dir: Directory handed to ``RunConfig`` as ``model_dir``.
        num_gpus: When greater than 1, a ``MirroredStrategy`` over that many
            GPUs is used for both training and evaluation.
        gpu_mem_fraction: Value for ``per_process_gpu_memory_fraction`` in the
            session's ``tf.GPUOptions``.
        log_step: Forwarded as ``log_step_count_steps``.
        summary_step: Forwarded as ``save_summary_steps``.
        save_checkpoint_step: Forwarded as ``save_checkpoints_steps``.
        max_steps: Forwarded as ``max_steps`` for training.
        eval_step: Forwarded as ``steps`` of the ``EvalSpec``.
        eval_throttle: Forwarded as ``throttle_secs`` of the ``EvalSpec``.
        train_batch_size: Accepted for signature compatibility; this variant
            does not read it.
        train_hooks: Hooks forwarded to training.
        eval_fn: Optional evaluation input function. When truthy,
            ``tf.estimator.train_and_evaluate`` is used; otherwise only
            ``estimator.train`` runs.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # This variant has no TPU path and no `use_tpu` parameter. The former
    # `and not use_tpu` guard referenced an undefined name and raised
    # NameError as soon as num_gpus > 1, so only the GPU count is checked.
    if num_gpus > 1:
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=num_gpus,
            auto_shard_dataset=True,
            cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                     num_packs=num_gpus),
        )
    else:
        dist_strategy = None

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=gpu_mem_fraction)
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=gpu_options)
    run_config = RunConfig(
        train_distribute=dist_strategy,
        eval_distribute=dist_strategy,
        log_step_count_steps=log_step,
        model_dir=model_dir,
        save_checkpoints_steps=save_checkpoint_step,
        save_summary_steps=summary_step,
        session_config=config,
    )

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       params={},
                                       config=run_config)

    if eval_fn:
        train_spec = tf.estimator.TrainSpec(input_fn=train_fn,
                                            max_steps=max_steps,
                                            hooks=train_hooks)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_fn,
                                          steps=eval_step,
                                          throttle_secs=eval_throttle)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    else:
        estimator.train(input_fn=train_fn,
                        max_steps=max_steps,
                        hooks=train_hooks)
def main(_):
    """Train the model on multiple GPUs using a mirrored distribution strategy.

    Reads its configuration from ``FLAGS``: creates ``FLAGS.model_dir`` when it
    does not exist, builds the training input function for the per-device
    batch size, and runs ``estimator.train`` for ``FLAGS.train_steps`` steps.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Get corpus info
    FLAGS.n_token = data_utils.VOCAB_SIZE
    tf.logging.info('n_token {}'.format(FLAGS.n_token))

    model_dir = FLAGS.model_dir
    if not tf.gfile.Exists(model_dir):
        tf.gfile.MakeDirs(model_dir)

    per_core_batch = per_device_batch_size(FLAGS.train_batch_size,
                                           FLAGS.num_gpu_cores)
    tf.logging.info('size of batch {}'.format(per_core_batch))

    input_fn, record_info = get_input_fn('train', per_core_batch)
    tf.logging.info('num of batches {}'.format(record_info['num_batch']))

    # The cache function is only logged here; it is not handed to the
    # estimator (params carries 'cache': None).
    cache_fn = get_cache_fn(FLAGS.mem_len, per_core_batch)
    tf.logging.info(cache_fn)

    tf.logging.info('Use normal RunConfig')
    tf.logging.info(FLAGS.num_gpu_cores)
    all_reduce = AllReduceCrossDeviceOps('nccl',
                                         num_packs=FLAGS.num_gpu_cores)
    strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.num_gpu_cores,
        auto_shard_dataset=True,
        cross_device_ops=all_reduce,
    )

    config = RunConfig(
        train_distribute=strategy,
        eval_distribute=strategy,
        log_step_count_steps=10,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=FLAGS.save_steps,
        save_summary_steps=None,
    )

    network_fn = get_model_fn()
    tf.logging.info('Use normal Estimator')
    estimator_params = {'batch_size': per_core_batch, 'cache': None}
    trainer = Estimator(model_fn=network_fn,
                        params=estimator_params,
                        config=config)

    tf.logging.info('***** Running training *****')
    tf.logging.info(' Batch size = %d', FLAGS.train_batch_size)
    trainer.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def run_training(
    train_fn,
    model_fn,
    model_dir: str,
    num_gpus: int = 1,
    log_step: int = 100,
    summary_step: int = 100,
    save_checkpoint_step: int = 1000,
    max_steps: int = 10000,
    eval_step: int = 10,
    eval_throttle: int = 120,
    train_hooks=None,
    eval_fn=None,
):
    """Train ``model_fn`` with a ``tf.estimator.Estimator``, optionally evaluating.

    Args:
        train_fn: Input function passed to the estimator for training.
        model_fn: Model function passed to ``tf.estimator.Estimator``.
        model_dir: Directory handed to ``RunConfig`` as ``model_dir``.
        num_gpus: When greater than 1, a ``MirroredStrategy`` over that many
            GPUs is used for both training and evaluation.
        log_step: Forwarded as ``log_step_count_steps``.
        summary_step: Forwarded as ``save_summary_steps``.
        save_checkpoint_step: Forwarded as ``save_checkpoints_steps``.
        max_steps: Forwarded as ``max_steps`` for training.
        eval_step: Forwarded as ``steps`` of the ``EvalSpec``.
        eval_throttle: Forwarded as ``throttle_secs`` of the ``EvalSpec``.
        train_hooks: Hooks forwarded to training.
        eval_fn: Optional evaluation input function. When truthy,
            ``tf.estimator.train_and_evaluate`` is used; otherwise only
            ``estimator.train`` runs.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # This variant has no TPU path and no `use_tpu` parameter. The former
    # `and not use_tpu` guard referenced an undefined name and raised
    # NameError as soon as num_gpus > 1, so only the GPU count is checked.
    if num_gpus > 1:
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=num_gpus,
            auto_shard_dataset=True,
            cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                     num_packs=num_gpus),
        )
    else:
        dist_strategy = None

    run_config = RunConfig(
        train_distribute=dist_strategy,
        eval_distribute=dist_strategy,
        log_step_count_steps=log_step,
        model_dir=model_dir,
        save_checkpoints_steps=save_checkpoint_step,
        save_summary_steps=summary_step,
    )

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       params={},
                                       config=run_config)

    if eval_fn:
        train_spec = tf.estimator.TrainSpec(input_fn=train_fn,
                                            max_steps=max_steps,
                                            hooks=train_hooks)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_fn,
                                          steps=eval_step,
                                          throttle_secs=eval_throttle)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    else:
        estimator.train(input_fn=train_fn,
                        max_steps=max_steps,
                        hooks=train_hooks)
def main(_):
    """Fine-tune, evaluate and/or run prediction for a BERT classifier.

    Behaviour is driven entirely by ``FLAGS``. At least one of ``do_train``,
    ``do_eval`` or ``do_predict`` must be set, otherwise ``ValueError`` is
    raised. ``ValueError`` is also raised when ``FLAGS.max_seq_length``
    exceeds the model's ``max_position_embeddings`` or when the task name is
    not a key of the local ``processors`` table.

    Files written under ``FLAGS.output_dir`` (depending on the flags):
    ``train.tf_record``, ``eval.tf_record``, ``eval_results.txt``,
    ``eval_results.json``, ``predict.tf_record`` and ``test_results.tsv``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Maps the lower-cased task name to the processor class for that task.
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "qqp": QqpProcessor,
        'chnsenticorp': ChnsenticorpProcessor,
        'gt': GTProcessor,
        'tcl': TCLProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # The resolver stays None unless both use_tpu and tpu_name are set.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Both run configs are always built; which one is used is decided by
    # `is_multi_gpu` further down.
    # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.num_gpu_cores,
        cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=FLAGS.num_gpu_cores),
        # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'),
    )
    log_every_n_steps = 8
    dist_run_config = RunConfig(
        train_distribute=dist_strategy,
        eval_distribute=dist_strategy,
        log_step_count_steps=log_every_n_steps,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    tpu_run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # These stay None when not training; model_fn_builder receives them as-is.
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    init_checkpoint = FLAGS.init_checkpoint
    # Multi-GPU means use_gpu is set and at least two GPU cores are requested.
    is_multi_gpu = FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        use_gpu=FLAGS.use_gpu,
        num_gpu_cores=FLAGS.num_gpu_cores,
        fp16=FLAGS.use_fp16,
        weight_list = FLAGS.weight_list)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if is_multi_gpu:
        estimator = Estimator(
            model_fn=model_fn,
            params={},
            config=dist_run_config)
    else:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=tpu_run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

        # TF Serving
        if FLAGS.save_for_serving:
            serving_dir = os.path.join(FLAGS.output_dir, 'serving')
            save_for_serving(estimator, serving_dir, FLAGS.max_seq_length, not is_multi_gpu)

        # Find the latest checkpoint
        # NOTE(review): this scan is assumed to sit inside the do_train branch
        # (the source layout was flattened) -- confirm against the original.
        # Parses the step number out of names starting with 'model.ckpt-' and
        # keeps the largest; max_idx stays 0 when no such file is present.
        max_idx = 0
        for filename in os.listdir(FLAGS.output_dir):
            if filename.startswith('model.ckpt-'):
                max_idx = max(int(filename.split('.')[1].split('-')[1]), max_idx)
        init_checkpoint = os.path.join(FLAGS.output_dir, f'model.ckpt-{max_idx}')
        tf.logging.info(f'Current checkpoint: {init_checkpoint}')

    if FLAGS.do_eval:
        # The model function is rebuilt so evaluation uses the current value
        # of init_checkpoint.
        model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=init_checkpoint,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu,
            use_gpu=FLAGS.use_gpu,
            num_gpu_cores=FLAGS.num_gpu_cores,
            fp16=FLAGS.use_fp16,
            weight_list = FLAGS.weight_list)

        # Evaluation always goes through a TPUEstimator with tpu_run_config,
        # regardless of is_multi_gpu.
        eval_estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=tpu_run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            batch_size=FLAGS.eval_batch_size)

        result = eval_estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Plain-text dump: one "key = value" line per metric, keys sorted.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # dump result as json file (easy parsing for other tasks)
        # NOTE(review): the encoder class is assumed to be defined after the
        # text dump, at the do_eval level -- confirm against the original.
        class ExtEncoder(json.JSONEncoder):
            # Converts numpy integers, floats and arrays to built-in types and
            # defers everything else to the base encoder.
            def default(self, obj):
                if isinstance(obj, np.integer):
                    return int(obj)
                if isinstance(obj, np.floating):
                    return float(obj)
                if isinstance(obj, np.ndarray):
                    return obj.tolist()
                else:
                    return super(ExtEncoder, self).default(obj)

        output_eval_file2 = os.path.join(FLAGS.output_dir, "eval_results.json")
        with tf.gfile.GFile(output_eval_file2, "w") as writer:
            json.dump(result, writer, indent=4, cls=ExtEncoder)

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            batch_size=FLAGS.predict_batch_size)

        # Prediction uses `estimator` (built before training), not
        # `eval_estimator`.
        result = estimator.predict(input_fn=predict_input_fn)

        # One tab-separated line of class probabilities per real example; the
        # loop stops once the padding examples are reached.
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def run_training(
    train_fn,
    model_fn,
    model_dir: str,
    num_gpus: int = 1,
    gpu_mem_fraction: float = 0.95,
    log_step: int = 100,
    summary_step: int = 100,
    save_checkpoint_step: int = 1000,
    max_steps: int = 10000,
    eval_step: int = 10,
    eval_throttle: int = 120,
    use_tpu: bool = False,
    tpu_name: str = None,
    tpu_zone: str = None,
    gcp_project: str = None,
    iterations_per_loop: int = 100,
    num_tpu_cores: int = 8,
    train_batch_size: int = 128,
    train_hooks=None,
    eval_fn=None,
):
    """Run estimator training on a TPU, on mirrored GPUs, or on one device.

    With ``use_tpu`` set, a ``TPUEstimator`` is built from the TPU arguments
    and ``eval_fn`` is discarded, so only training runs. Otherwise a regular
    ``tf.estimator.Estimator`` is built; it uses a ``MirroredStrategy`` when
    ``num_gpus > 1`` and caps per-process GPU memory at ``gpu_mem_fraction``.
    When ``eval_fn`` is truthy, ``tf.estimator.train_and_evaluate`` is used
    with ``eval_step`` / ``eval_throttle``; otherwise ``estimator.train``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if use_tpu:
        resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu_name, zone=tpu_zone, project=gcp_project)
        per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            cluster=resolver,
            master=None,
            model_dir=model_dir,
            save_checkpoints_steps=save_checkpoint_step,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=iterations_per_loop,
                num_shards=num_tpu_cores,
                per_host_input_for_training=per_host,
            ),
        )
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=train_batch_size,
            eval_batch_size=None,
        )
        # Evaluation is switched off on the TPU path.
        eval_fn = None
    else:
        # A distribution strategy is only needed for more than one GPU.
        strategy = None
        if num_gpus > 1:
            strategy = tf.contrib.distribute.MirroredStrategy(
                num_gpus=num_gpus,
                auto_shard_dataset=True,
                cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                         num_packs=num_gpus),
            )
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=gpu_mem_fraction),
        )
        run_config = RunConfig(
            train_distribute=strategy,
            eval_distribute=strategy,
            log_step_count_steps=log_step,
            model_dir=model_dir,
            save_checkpoints_steps=save_checkpoint_step,
            save_summary_steps=summary_step,
            session_config=session_config,
        )
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           params={},
                                           config=run_config)

    # Train-only path.
    if not eval_fn:
        estimator.train(input_fn=train_fn,
                        max_steps=max_steps,
                        hooks=train_hooks)
        return

    training = tf.estimator.TrainSpec(input_fn=train_fn,
                                      max_steps=max_steps,
                                      hooks=train_hooks)
    evaluation = tf.estimator.EvalSpec(input_fn=eval_fn,
                                       steps=eval_step,
                                       throttle_secs=eval_throttle)
    tf.estimator.train_and_evaluate(estimator, training, evaluation)
def main(_):
    """Pre-train and/or evaluate BERT on multiple GPUs via MirroredStrategy.

    Configuration comes from ``FLAGS``. Raises ``ValueError`` when neither
    ``do_train`` nor ``do_eval`` is set. Input files are collected by globbing
    each comma-separated pattern in ``FLAGS.input_file``; evaluation metrics
    are written to ``eval_results.txt`` under ``FLAGS.output_dir``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not (FLAGS.do_train or FLAGS.do_eval):
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Expand every comma-separated pattern into concrete file paths.
    input_files = [
        path
        for pattern in FLAGS.input_file.split(",")
        for path in tf.gfile.Glob(pattern)
    ]

    tf.logging.info("*** Input Files ***")
    for path in input_files:
        tf.logging.info(" %s" % path)

    # The resolver and per-host setting are still created, but nothing below
    # reads them: the RunConfig used here is the non-TPU one.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.n_gpus,
        cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                 num_packs=FLAGS.n_gpus),
    )
    # IF ERROR COULD TRY
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(
    #     devices=["device:GPU:%d" % i for i in range(FLAGS.n_gpus)],
    #     cross_tower_ops=tf.distribute.HierarchicalCopyAllReduce())

    config = RunConfig(
        train_distribute=strategy,
        eval_distribute=strategy,
        log_step_count_steps=8,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    )

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
    )

    estimator = Estimator(model_fn=model_fn, params={}, config=config)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        training_inputs = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
        )
        estimator.train(input_fn=training_inputs,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # Evaluation reads the same input files as training.
        evaluation_inputs = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
        )
        metrics = estimator.evaluate(input_fn=evaluation_inputs,
                                     steps=FLAGS.max_eval_steps)

        results_path = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(results_path, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for name in sorted(metrics.keys()):
                value_text = str(metrics[name])
                tf.logging.info(" %s = %s", name, value_text)
                writer.write("%s = %s\n" % (name, value_text))
def main(_):
    """Fine-tune, evaluate, predict with and export a BERT classifier.

    Behaviour is driven entirely by ``FLAGS``. At least one of ``do_train``,
    ``do_eval`` or ``do_predict`` must be set, otherwise ``ValueError`` is
    raised. ``ValueError`` is also raised when ``FLAGS.max_seq_length``
    exceeds the model's ``max_position_embeddings`` or when the task name is
    not a key of the local ``processors`` table.

    In the ``do_predict`` branch the estimator is also exported as a
    SavedModel to ``FLAGS.export_model_dir`` before the predictions are
    written to ``test_results.tsv`` under ``FLAGS.output_dir``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    print("<<<<< TEST print")
    tf.logging.info("<<<<< TEST")

    # Maps the lower-cased task name to the processor class for that task.
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        'chnsenticorp': ChnsenticorpProcessor,
        'gt': GTProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # The resolver stays None unless both use_tpu and tpu_name are set.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Both run configs are always built; which one is used is decided by
    # `is_multi_gpu` further down.
    # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.num_gpu_cores,
        cross_device_ops=AllReduceCrossDeviceOps(
            'nccl', num_packs=FLAGS.num_gpu_cores),
        # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'),
    )
    log_every_n_steps = 8
    dist_run_config = RunConfig(
        train_distribute=dist_strategy,
        eval_distribute=dist_strategy,
        log_step_count_steps=log_every_n_steps,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    tpu_run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # These stay None when not training; model_fn_builder receives them as-is.
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    init_checkpoint = FLAGS.init_checkpoint
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                weight_list=FLAGS.weight_list,
                                use_gpu=FLAGS.use_gpu,
                                num_gpu_cores=FLAGS.num_gpu_cores)

    # Multi-GPU means use_gpu is set and at least two GPU cores are requested.
    # Otherwise TPUEstimator is used; if TPU is not available, this will fall
    # back to normal Estimator on CPU or GPU.
    is_multi_gpu = FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2
    if is_multi_gpu:
        # Fixed: this branch used to assign to the misspelled name
        # `stimator`, leaving `estimator` undefined on the multi-GPU path.
        estimator = Estimator(model_fn=model_fn,
                              params={},
                              config=dist_run_config)
    else:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=tpu_run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            batch_size=FLAGS.eval_batch_size)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Plain-text dump: one "key = value" line per metric, keys sorted.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            batch_size=FLAGS.predict_batch_size)

        # The predictions are consumed by the write loop after the export.
        result = estimator.predict(input_fn=predict_input_fn)

        # ========================= EXPORT MODEL =========================
        def serving_input_receiver_fn():
            """Build a receiver taking one row of ``max_seq_length`` input ids.

            The mask, segment ids and a dummy label are derived from the
            single ``input_ids`` placeholder; id 0 is masked out.
            """
            receiver_tensors = {
                "input_ids":
                tf.placeholder(dtype=tf.int64,
                               shape=[1, FLAGS.max_seq_length])
            }
            features = {
                "input_ids":
                receiver_tensors['input_ids'],
                "input_mask":
                1 - tf.cast(tf.equal(receiver_tensors['input_ids'], 0),
                            dtype=tf.int64),
                "segment_ids":
                tf.zeros(dtype=tf.int64, shape=[1, FLAGS.max_seq_length]),
                'label_ids':
                tf.zeros(dtype=tf.int64, shape=[1, 1])
            }
            return tf.estimator.export.ServingInputReceiver(
                features, receiver_tensors)

        estimator._export_to_tpu = False
        estimator.export_savedmodel(FLAGS.export_model_dir,
                                    serving_input_receiver_fn)
        # ========================= EXPORT MODEL =========================

        # One tab-separated line of class probabilities per real example; the
        # loop stops once the padding examples are reached.
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def main(_):
    """Pre-train and/or evaluate BERT with ``tf.distribute.MirroredStrategy``.

    Configuration comes from ``FLAGS``. Raises ``ValueError`` when neither
    ``do_train`` nor ``do_eval`` is set. Every entry of ``FLAGS.input_dir`` is
    used as an input file; evaluation metrics are written to
    ``eval_results.txt`` under ``FLAGS.output_dir``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_file in os.listdir(FLAGS.input_dir):
        print(input_file)
        # Fixed: plain string concatenation produced a wrong path whenever
        # input_dir lacked a trailing separator; os.path.join gives the same
        # result when the separator is already there.
        file_name = os.path.join(FLAGS.input_dir, input_file)
        input_files.extend(tf.gfile.Glob(file_name))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info(" %s" % input_file)

    # This entry point uses a plain Estimator with a mirrored strategy; it
    # builds no TPU run config.
    mirrored_strategy = tf.distribute.MirroredStrategy(
        cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=1))
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        log_step_count_steps=5,
        save_summary_steps=5,
        train_distribute=mirrored_strategy,
        eval_distribute=mirrored_strategy)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       model_dir=FLAGS.output_dir)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # Evaluation reads the same input files as training.
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        # Plain-text dump: one "key = value" line per metric, keys sorted.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def predict():
    """Run a trained classifier over the test data and write a CSV of results.

    Loads the model configuration, BERT config, vocabulary and latest
    checkpoint from ``args.trained_model_dir``, runs prediction over the data
    at ``args.test_data`` and writes the input frame, extended with one
    ``probability_<k>`` and one ``prediction_<k>`` column per output, to the
    path returned by ``common.generate_path(args.output_dir)``.

    Raises:
        ValueError: If no checkpoint state is found in
            ``args.trained_model_dir``, or if the configured sequence length
            exceeds the BERT model's ``max_position_embeddings``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args.test_data = common.parse_path(args.test_data)
    model_config = common.loadJsonConfig(
        os.path.join(args.trained_model_dir, "model_config.json"))
    df = dataprocess.load_data(args.test_data)
    test_column_names = args.predict_column_names.split(' ')

    # get_checkpoint_state returns None when the directory has no checkpoint
    # state; fail with a clear message instead of an AttributeError.
    ckpt = tf.train.get_checkpoint_state(args.trained_model_dir)
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise ValueError(
            "No checkpoint found in trained_model_dir: %s" %
            args.trained_model_dir)
    checkpoint_file = ckpt.model_checkpoint_path

    tokenization.validate_case_matches_checkpoint(
        model_config['do_lower_case'], checkpoint_file)

    bert_config_file = os.path.join(args.trained_model_dir,
                                    "bert_config.json")
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    if model_config['max_seq_length'] > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (model_config['max_seq_length'],
             bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(args.output_dir)

    processor = common.toxicCommentProcessor()
    vocab_file = os.path.join(args.trained_model_dir, "vocab.txt")
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=model_config['do_lower_case'])

    tpu_cluster_resolver = None
    if args.use_tpu and args.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            args.tpu_name, zone=args.tpu_zone, project=args.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # When running on GPU without an explicit core count, use every local GPU.
    if args.use_gpu and args.num_gpu_cores is None:
        num_gpu_cores = len([
            x for x in device_lib.list_local_devices()
            if x.device_type == 'GPU'
        ])
    else:
        num_gpu_cores = args.num_gpu_cores

    # The mirrored (multi-GPU) path is taken only with at least two GPUs;
    # everything else goes through the TPU RunConfig/Estimator pair.
    use_mirrored = args.use_gpu and int(num_gpu_cores) >= 2

    if use_mirrored:
        tf.logging.info("Use normal RunConfig, GPU number: %s", num_gpu_cores)
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=num_gpu_cores,
            cross_device_ops=AllReduceCrossDeviceOps(
                'nccl', num_packs=num_gpu_cores))
        log_every_n_steps = 8
        run_config = RunConfig(train_distribute=dist_strategy,
                               eval_distribute=dist_strategy,
                               log_step_count_steps=log_every_n_steps)
    else:
        tf.logging.info("Use TPURunConfig")
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=args.master,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=args.iterations_per_loop,
                num_shards=args.num_tpu_cores,
                per_host_input_for_training=is_per_host))

    model = common.get_model(model_config['layer_name'])
    label_num = model_config['num_labels']

    # No optimisation happens at prediction time, so the training
    # hyper-parameters are passed as None.
    model_fn = common.model_fn_builder(
        bert_config=bert_config,
        is_training_bert=False,
        num_labels=label_num,
        init_checkpoint=checkpoint_file,
        learning_rate=None,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=args.use_tpu,
        use_one_hot_embeddings=args.use_tpu,
        use_gpu=args.use_gpu,
        num_gpu_cores=num_gpu_cores,
        fp16=args.use_fp16,
        model=model)

    if use_mirrored:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    else:
        tf.logging.info("Use TPUEstimator")
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=args.use_tpu,
            model_fn=model_fn,
            config=run_config,
            predict_batch_size=args.predict_batch_size)

    predict_examples = processor.get_test_examples(df, test_column_names,
                                                   label_num)
    num_actual_predict_examples = len(predict_examples)
    if args.use_tpu:
        # TPU requires a fixed batch size for all batches, therefore the
        # number of examples must be a multiple of the batch size, or else
        # examples will get dropped. So we pad with fake examples which are
        # ignored later on.
        while len(predict_examples) % args.predict_batch_size != 0:
            predict_examples.append(common.PaddingInputExample())

    predict_file = os.path.join(args.output_dir, "predict.tf_record")
    # NOTE: an existing predict.tf_record in output_dir is reused as-is and
    # is not regenerated from the current test data.
    if not os.path.isfile(predict_file):
        common.file_based_convert_examples_to_features(
            predict_examples, model_config['max_seq_length'], tokenizer,
            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info(" Batch size = %d", args.predict_batch_size)

    predict_input_fn = common.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=model_config['max_seq_length'],
        label_length=label_num,
        is_training=False,
        drop_remainder=bool(args.use_tpu),
        batch_size=args.predict_batch_size)

    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = common.generate_path(args.output_dir)
    tf.logging.info("***** Predict results *****")

    predict_res = []
    for i, prediction in enumerate(result):
        # Stop at the first padding example; only real examples are kept.
        if i >= num_actual_predict_examples:
            break
        probabilities = prediction["probabilities"]
        # Threshold at 0.5: below -> 0.0, otherwise -> 1.0.
        predictions = np.where(probabilities < 0.5, 0.0, 1.0)
        # Each row is the probabilities followed by the hard predictions.
        predict_res.append(np.r_[probabilities, predictions])

    # Every row holds two values per output; an empty result gives no columns
    # instead of an IndexError.
    num_outputs = len(predict_res[0]) // 2 if predict_res else 0
    output_columns = (
        ["probability_" + str(k + 1) for k in range(num_outputs)] +
        ["prediction_" + str(k + 1) for k in range(num_outputs)])

    out_df = pd.DataFrame(columns=output_columns, data=predict_res)
    print(out_df.head(3))
    # Append the prediction columns to the original input frame.
    out_df = pd.concat([df, out_df], axis=1)
    print(out_df.head(3))
    out_df.to_csv(output_predict_file, index=False)
def main(_):
    """Pre-train and/or continuously evaluate ALBERT with MirroredStrategy.

    Input files are the glob expansions of the comma-separated patterns in
    ``FLAGS.input_file``. With ``FLAGS.do_eval`` set, checkpoints are
    evaluated in a loop until the evaluated global step reaches
    ``FLAGS.num_train_steps``; each result is appended to
    ``eval_results.txt`` in ``FLAGS.output_dir``.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_eval`` is set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            'At least one of `do_train` or `do_eval` must be True.')

    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info('*** Input Files ***')
    for input_file in input_files:
        tf.logging.info(' %s', input_file)

    tf.logging.info('Use normal RunConfig')
    tf.logging.info(FLAGS.num_gpu_cores)
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.num_gpu_cores,
        auto_shard_dataset=True,
        cross_device_ops=AllReduceCrossDeviceOps(
            'nccl', num_packs=FLAGS.num_gpu_cores),
    )
    log_every_n_steps = 10
    run_config = RunConfig(
        train_distribute=dist_strategy,
        eval_distribute=dist_strategy,
        log_step_count_steps=log_every_n_steps,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        save_summary_steps=None,
    )

    model_fn = model_fn_builder(
        albert_config=albert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        optimizer=FLAGS.optimizer,
        poly_power=FLAGS.poly_power,
        start_warmup_step=FLAGS.start_warmup_step,
    )

    tf.logging.info('Use normal Estimator')
    estimator = Estimator(model_fn=model_fn, params={}, config=run_config)

    if FLAGS.do_train:
        tf.logging.info('***** Running training *****')
        tf.logging.info(' Batch size = %d', FLAGS.train_batch_size)
        # The input function receives the per-device share of the batch size.
        train_input_fn = input_fn_builder_gpu(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            batch_size=per_device_batch_size(
                FLAGS.train_batch_size, FLAGS.num_gpu_cores),
        )
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info('***** Running evaluation *****')
        tf.logging.info(' Batch size = %d', FLAGS.eval_batch_size)
        global_step = -1
        output_eval_file = os.path.join(FLAGS.output_dir, 'eval_results.txt')

        # The context manager guarantees the results file is flushed and
        # closed, including when evaluation raises.
        with tf.gfile.GFile(output_eval_file, 'w') as writer:
            tf.gfile.MakeDirs(FLAGS.export_dir)

            eval_input_fn = input_fn_builder(
                input_files=input_files,
                max_seq_length=FLAGS.max_seq_length,
                max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                is_training=False,
            )

            # Keep evaluating until a checkpoint at (or beyond) the final
            # training step has been evaluated.
            while global_step < FLAGS.num_train_steps:
                if estimator.latest_checkpoint() is None:
                    tf.logging.info('No checkpoint found yet. Sleeping.')
                    time.sleep(1)
                else:
                    result = estimator.evaluate(
                        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
                    global_step = result['global_step']
                    tf.logging.info('***** Eval results *****')
                    for key in sorted(result.keys()):
                        tf.logging.info(' %s = %s', key, str(result[key]))
                        writer.write('%s = %s\n' % (key, str(result[key])))
def main(_):
    """Fine-tune, evaluate and/or predict with ALBERT on a GLUE-style task.

    The phases are selected with ``FLAGS.do_train``, ``FLAGS.do_eval`` and
    ``FLAGS.do_predict``:

    * train: converts the training examples to a TFRecord file (cached in
      ``FLAGS.cached_dir`` or ``FLAGS.output_dir``) and trains for
      ``FLAGS.train_step`` steps while recording per-batch timings.
    * eval: evaluates every numbered checkpoint found in
      ``FLAGS.output_dir`` until the global step reaches
      ``FLAGS.train_step``, copies the best-scoring one to
      ``model.ckpt-best`` and writes a report to ``eval_results.txt``.
    * predict: runs ``model.ckpt-best`` over the test set and writes
      ``test_results.tsv`` and ``submit_results.tsv``.

    Raises:
        ValueError: If no phase is selected, the task name is unknown, or
            ``FLAGS.max_seq_length`` exceeds the model's position embeddings.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": classifier_utils.ColaProcessor,
        "mnli": classifier_utils.MnliProcessor,
        "mrpc": classifier_utils.MrpcProcessor,
        "sts-b": classifier_utils.StsbProcessor,
        "qqp": classifier_utils.QqpProcessor,
        "sst-2": classifier_utils.Sst2Processor,
        "qnli": classifier_utils.QnliProcessor,
        "rte": classifier_utils.RteProcessor,
        "wnli": classifier_utils.WnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)
    # A single flag overrides both dropout probabilities of the config.
    albert_config.hidden_dropout_prob = FLAGS.albert_dropout_prob
    albert_config.attention_probs_dropout_prob = FLAGS.albert_dropout_prob

    if FLAGS.max_seq_length > albert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the ALBERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, albert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](
        use_spm=bool(FLAGS.spm_model_file),
        do_lower_case=FLAGS.do_lower_case)
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file)

    # Multiple GPUs are only used with the 'mirror' strategy.
    num_gpus = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1
    if num_gpus > 1 and FLAGS.strategy_type == "mirror":
        # Expose exactly the first `num_gpus` devices to TensorFlow.
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(i) for i in range(num_gpus))
        # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
        strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=num_gpus,
            cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                     num_packs=num_gpus),
        )
        using_customized_optimizer = True
        tf.logging.info('Use MirroredStrategy with %d devices.',
                        strategy.num_replicas_in_sync)
    else:
        strategy = tf.distribute.OneDeviceStrategy("GPU:0")
        using_customized_optimizer = False
        tf.logging.info('Single device mode.')

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        train_distribute=strategy,
        # NOTE(review): the original author annotated this argument with
        # "get error during evaluation" - confirm it is safe to keep.
        eval_distribute=strategy,
    )

    train_examples = None
    total_time = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)

    model_fn = classifier_utils.model_fn_builder(
        albert_config=albert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.train_step,
        num_warmup_steps=FLAGS.warmup_step,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        task_name=task_name,
        customized=using_customized_optimizer,
        optimizer=FLAGS.optimizer,
        discard_classifier_weights=FLAGS.discard_classifier_weights)

    # The TPUEstimator is only used when a TPU is explicitly requested.
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tf.logging.info("Use TPUEstimator")
        estimator = contrib_tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
    else:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(
            model_fn=model_fn,
            params={},
            config=run_config,
        )

    if FLAGS.do_train:
        cached_dir = FLAGS.cached_dir or FLAGS.output_dir
        train_file = os.path.join(cached_dir, task_name + "_train.tf_record")
        # An existing record file is reused instead of being regenerated.
        if not tf.gfile.Exists(train_file):
            classifier_utils.file_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file, task_name)
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(
            f" Batch size = {FLAGS.train_batch_size} * {num_gpus}")
        tf.logging.info(" Num steps = %d", FLAGS.train_step)
        train_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size)

        # The hook's `times` are summed here for the total training time and
        # averaged later for the throughput line of the eval report.
        time_hist = TimeHistory()
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.train_step,
                        hooks=[time_hist])
        total_time = sum(time_hist.times)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        # NOTE(review): the returned features are never used below; the call
        # is kept in case the conversion has side effects - confirm whether
        # it can be dropped.
        eval_features = classifier_utils.convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            task_name)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or
            # else examples will get dropped. So we pad with fake examples
            # which are ignored later on. These do NOT count towards the
            # metric (all tf.metrics support a per-instance weight, and these
            # get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        cached_dir = FLAGS.cached_dir or FLAGS.output_dir
        eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record")
        if not tf.gfile.Exists(eval_file):
            classifier_utils.file_based_convert_examples_to_features(
                eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
                eval_file, task_name)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu),
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.eval_batch_size)

        checkpoint_exts = ["meta", "data-00000-of-00001", "index"]

        def _find_valid_cands(curr_step):
            """Return the .index files of checkpoints newer than curr_step."""
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        def _remove_checkpoint(ckpt_prefix):
            """Delete the meta/data/index files of one checkpoint."""
            for ext in checkpoint_exts:
                src_ckpt = ckpt_prefix + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")

        # Metric used to decide which checkpoint is "best" for the task.
        if task_name == "sts-b":
            key_name = "pearson"
        elif task_name == "cola":
            key_name = "matthew_corr"
        else:
            key_name = "eval_accuracy"

        # Resume from a previously saved best checkpoint when one exists.
        if tf.gfile.Exists(checkpoint_path + ".index"):
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=checkpoint_path)
            best_perf = result[key_name]
            global_step = result["global_step"]
        else:
            global_step = -1
            best_perf = -1
            checkpoint_path = None

        # The context manager closes the report even if evaluation raises.
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            writer.write("===== Hyperparameters =====\n")
            writer.write("Training batch size: {}\n".format(
                FLAGS.train_batch_size))
            writer.write("Max sequence length: {}\n".format(
                FLAGS.max_seq_length))
            writer.write("Learning rate: {}\n".format(FLAGS.learning_rate))
            writer.write("Num of GPU cores: {}\n".format(num_gpus))
            if FLAGS.do_train:
                avg_time_per_batch = np.mean(time_hist.times)
                writer.write("Total time: {}\n".format(total_time))
                writer.write("Speed: {}\n".format(
                    FLAGS.train_batch_size * num_gpus / avg_time_per_batch))
            if FLAGS.train_step and FLAGS.warmup_step:
                writer.write("Training steps: {}\n".format(FLAGS.train_step))
                writer.write("Warmup steps: {}\n".format(FLAGS.warmup_step))

            while global_step < FLAGS.train_step:
                # Map global step -> checkpoint prefix for every numbered
                # checkpoint currently in the output directory.
                steps_and_files = {}
                filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
                for filename in filenames:
                    if filename.endswith(".index"):
                        ckpt_name = filename[:-6]
                        cur_filename = os.path.join(FLAGS.output_dir,
                                                    ckpt_name)
                        if cur_filename.split("-")[-1] == "best":
                            continue
                        gstep = int(cur_filename.split("-")[-1])
                        if gstep not in steps_and_files:
                            tf.logging.info(
                                "Add {} to eval list.".format(cur_filename))
                            steps_and_files[gstep] = cur_filename
                tf.logging.info("found {} files.".format(
                    len(steps_and_files)))

                if not steps_and_files:
                    tf.logging.info(
                        "found 0 file, global step: {}. Sleeping.".format(
                            global_step))
                    time.sleep(1)
                else:
                    for checkpoint in sorted(steps_and_files.items()):
                        step, checkpoint_path = checkpoint
                        if global_step >= step:
                            # Already evaluated: drop it when more than one
                            # newer checkpoint exists, then move on.
                            if len(_find_valid_cands(step)) > 1:
                                _remove_checkpoint(checkpoint_path)
                            continue

                        result = estimator.evaluate(
                            input_fn=eval_input_fn,
                            steps=eval_steps,
                            checkpoint_path=checkpoint_path)
                        global_step = result["global_step"]
                        tf.logging.info("***** Eval results *****")
                        tf.logging.info(f"num_gpu_cores = {num_gpus}")
                        writer.write("===== Evuations =====\n")
                        for key in sorted(result.keys()):
                            tf.logging.info(" %s = %s", key,
                                            str(result[key]))
                            writer.write("%s = %s\n" %
                                         (key, str(result[key])))
                        writer.write("best = {}\n".format(best_perf))

                        if result[key_name] > best_perf:
                            best_perf = result[key_name]
                            # Copy the new best checkpoint to "<name>-best".
                            for ext in checkpoint_exts:
                                src_ckpt = checkpoint_path + ".{}".format(ext)
                                tgt_ckpt = checkpoint_path.rsplit(
                                    "-", 1)[0] + "-best.{}".format(ext)
                                tf.logging.info("saving {} to {}".format(
                                    src_ckpt, tgt_ckpt))
                                tf.gfile.Copy(src_ckpt, tgt_ckpt,
                                              overwrite=True)
                                writer.write("saved {} to {}\n".format(
                                    src_ckpt, tgt_ckpt))

                        if len(_find_valid_cands(global_step)) > 1:
                            _remove_checkpoint(checkpoint_path)
                        writer.write("=" * 50 + "\n")

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or
            # else examples will get dropped. So we pad with fake examples
            # which are ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        classifier_utils.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file, task_name)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = classifier_utils.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu),
            task_name=task_name,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size)

        # Prediction always uses the best checkpoint selected during eval.
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        result = estimator.predict(input_fn=predict_input_fn,
                                   checkpoint_path=checkpoint_path)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir,
                                          "submit_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\
                tf.gfile.GFile(output_submit_file, "w") as sub_writer:
            sub_writer.write("index" + "\t" + "prediction\n")
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, (example, prediction)) in\
                    enumerate(zip(predict_examples, result)):
                probabilities = prediction["probabilities"]
                # Padding examples come last; stop at the first one.
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                pred_writer.write(output_line)

                # sts-b writes the raw prediction; every other task maps the
                # predicted index to its label.
                if task_name != "sts-b":
                    actual_label = label_list[int(prediction["predictions"])]
                else:
                    actual_label = str(prediction["predictions"])
                sub_writer.write(example.guid + "\t" + actual_label + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
def train():
    """Fine-tune a BERT-based multi-label classifier on the training data.

    Loads the data at ``args.train_data``, copies the BERT config and
    vocabulary into ``args.output_dir`` together with a ``model_config.json``
    describing the run, builds the Estimator (mirrored multi-GPU when at
    least two GPUs are used, TPUEstimator otherwise) and trains for
    ``len(train_examples) / train_batch_size * num_train_epochs`` steps.

    Raises:
        ValueError: If ``args.max_seq_length`` exceeds the BERT model's
            ``max_position_embeddings``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    args.train_data = common.parse_path(args.train_data)
    args.added_layer_config = common.parse_path(args.added_layer_config)
    df = dataprocess.load_data(args.train_data)
    train_column_names = args.train_column_names.split(' ')
    label_column_names = args.label_column_names.split(' ')
    label_len = len(label_column_names)

    bert_model_dir = args.bert_dir
    # Default to the checkpoint shipped with the BERT model directory.
    if not args.init_checkpoint_file:
        args.init_checkpoint_file = os.path.join(bert_model_dir,
                                                 "bert_model.ckpt")

    tokenization.validate_case_matches_checkpoint(args.do_lower_case,
                                                  args.init_checkpoint_file)

    bert_config_file = os.path.join(bert_model_dir, "bert_config.json")
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(args.output_dir)

    processor = common.toxicCommentProcessor()
    vocab_file = os.path.join(bert_model_dir, "vocab.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.do_lower_case)

    added_layer = common.loadJsonConfig(args.added_layer_config)
    model = common.get_model(added_layer['layer_name'])

    # Copy the BERT config and vocabulary next to the trained model.
    shutil.copyfile(bert_config_file,
                    os.path.join(args.output_dir, "bert_config.json"))
    shutil.copyfile(vocab_file, os.path.join(args.output_dir, "vocab.txt"))

    # Persist the settings of this run alongside the model.
    model_config = {
        "do_lower_case": args.do_lower_case,
        "max_seq_length": args.max_seq_length,
        "num_labels": len(label_column_names),
        "layer_name": added_layer['layer_name']
    }
    # Use a context manager so the file is flushed and closed right away.
    with open(os.path.join(args.output_dir, "model_config.json"),
              "w") as config_file:
        json.dump(model_config, config_file)

    tpu_cluster_resolver = None
    if args.use_tpu and args.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            args.tpu_name, zone=args.tpu_zone, project=args.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Use all local GPUs unless a smaller number was requested explicitly.
    num_gpu_cores = len(
        [x for x in device_lib.list_local_devices()
         if x.device_type == 'GPU'])
    if (args.num_gpu_cores is not None
            and args.num_gpu_cores < num_gpu_cores):
        num_gpu_cores = args.num_gpu_cores

    # The mirrored (multi-GPU) path is taken only with at least two GPUs.
    use_mirrored = args.use_gpu and int(num_gpu_cores) >= 2

    if use_mirrored:
        tf.logging.info("Use normal RunConfig, GPU number: %d", num_gpu_cores)
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=num_gpu_cores,
            cross_device_ops=AllReduceCrossDeviceOps(
                'nccl', num_packs=num_gpu_cores))
        log_every_n_steps = 8
        run_config = RunConfig(
            train_distribute=dist_strategy,
            eval_distribute=dist_strategy,
            log_step_count_steps=log_every_n_steps,
            model_dir=args.output_dir,
            save_checkpoints_steps=args.save_checkpoints_steps)
    else:
        tf.logging.info("Use TPURunConfig")
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=args.master,
            model_dir=args.output_dir,
            save_checkpoints_steps=args.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=args.iterations_per_loop,
                num_shards=args.num_tpu_cores,
                per_host_input_for_training=is_per_host))

    train_examples = processor.get_train_examples(df, train_column_names,
                                                  label_column_names)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    model_fn = common.model_fn_builder(
        bert_config=bert_config,
        is_training_bert=args.is_training_bert,
        num_labels=len(label_column_names),
        init_checkpoint=args.init_checkpoint_file,
        learning_rate=args.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=args.use_tpu,
        use_one_hot_embeddings=args.use_tpu,
        use_gpu=args.use_gpu,
        num_gpu_cores=num_gpu_cores,
        fp16=args.use_fp16,
        model=model)

    if use_mirrored:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    else:
        tf.logging.info("Use TPUEstimator")
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=args.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=args.train_batch_size)

    train_file = os.path.join(args.output_dir, "train.tf_record")
    # NOTE: an existing train.tf_record in output_dir is reused as-is and is
    # not regenerated from the current training data.
    if not os.path.isfile(train_file):
        common.file_based_convert_examples_to_features(
            train_examples, args.max_seq_length, tokenizer, train_file)

    tf.logging.info("***** Running training *****")
    tf.logging.info(" Num examples = %d", len(train_examples))
    tf.logging.info(" Batch size = %d", args.train_batch_size)
    tf.logging.info(" Num steps = %d", num_train_steps)

    train_input_fn = common.file_based_input_fn_builder(
        input_file=train_file,
        seq_length=args.max_seq_length,
        label_length=label_len,
        is_training=True,
        drop_remainder=True,
        batch_size=args.train_batch_size)

    # Log the tensor named "loss/Mean:0" on every iteration.
    tensors_to_log = {"train loss": "loss/Mean:0"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=1)
    estimator.train(input_fn=train_input_fn,
                    hooks=[logging_hook],
                    max_steps=num_train_steps)
def main(_):
    """Train, evaluate and/or predict with a BERT sequence-labelling model.

    The task (``named_entity``, ``punct`` or ``norm``) is selected with
    ``FLAGS.task_name``; the phases with ``FLAGS.do_train``,
    ``FLAGS.do_eval`` and ``FLAGS.do_predict``. Evaluation metrics go to
    ``eval_results.txt`` and predicted label sequences to
    ``test_results.tsv``, both in ``FLAGS.output_dir``. After training the
    model is exported for serving when ``FLAGS.save_for_serving`` is set.

    Raises:
        ValueError: If no phase is selected, the task name is unknown,
            ``FLAGS.max_seq_length`` exceeds the model's position
            embeddings, or prediction is requested together with
            ``FLAGS.use_tpu``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "named_entity": NamedEntityProcessor,
        "punct": PunctProcessor,
        "norm": NormProcessor
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # The mirrored (multi-GPU) path is taken only with at least two GPUs;
    # everything else goes through the TPU RunConfig/Estimator pair.
    use_mirrored = FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2

    if use_mirrored:
        tf.logging.info("Use normal RunConfig")
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=FLAGS.num_gpu_cores,
            cross_device_ops=AllReduceCrossDeviceOps(
                'nccl', num_packs=FLAGS.num_gpu_cores),
        )
        log_every_n_steps = 8
        run_config = RunConfig(
            train_distribute=dist_strategy,
            eval_distribute=dist_strategy,
            log_step_count_steps=log_every_n_steps,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps)
    else:
        tf.logging.info("Use TPURunConfig")
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    init_checkpoint = FLAGS.init_checkpoint
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                use_gpu=FLAGS.use_gpu,
                                num_gpu_cores=FLAGS.num_gpu_cores,
                                fp16=FLAGS.use_fp16)

    if use_mirrored:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    else:
        tf.logging.info("Use TPUEstimator")
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", len(eval_examples))
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = len(eval_examples) // FLAGS.eval_batch_size

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu),
            batch_size=FLAGS.eval_batch_size)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        # Prediction on TPU is rejected; fail fast, before the test set is
        # converted to features.
        if FLAGS.use_tpu:
            raise ValueError("Prediction in TPU not supported")

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        # use_tpu is known to be falsy here, so no remainder is dropped.
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            batch_size=FLAGS.predict_batch_size)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            for item in result:
                seq_len = item['seq_len']
                # Skip position 0 and keep the next `seq_len` predictions.
                predictions = item['predictions'][1:seq_len + 1]
                labels = [label_list[pred] for pred in predictions]
                writer.write(
                    tokenization.printable_text(' '.join(labels)) + '\n')

    if FLAGS.do_train and FLAGS.save_for_serving:
        serving_dir = os.path.join(FLAGS.output_dir, 'serving')
        # The TPUEstimator is in use exactly when the mirrored path is not.
        is_tpu_estimator = not use_mirrored
        save_for_serving(estimator, serving_dir, FLAGS.max_seq_length,
                         is_tpu_estimator)
def main(_):
    """Fine-tune ALBERT on SQuAD v2.0 and/or evaluate its checkpoints.

    The distribution strategy, run config, model function and estimator are
    all built from ``FLAGS``.

    * ``FLAGS.do_train``: writes the training features to
      ``FLAGS.train_feature_file`` (unless that file already exists) and
      trains the estimator up to ``num_train_steps``.
    * ``FLAGS.do_predict``: evaluates every numbered checkpoint found in
      ``FLAGS.output_dir``, copies the checkpoint with the best ``f1`` to
      ``model.ckpt-best`` and writes all results to ``eval_results.txt``.

    Args:
        _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)
    validate_flags_or_throw(albert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file)

    # multiple gpus
    num_gpus = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1
    if num_gpus > 1 and FLAGS.strategy_type == "mirror":
        # Expose exactly the first `num_gpus` devices to TensorFlow.
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(i) for i in range(num_gpus))
        # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
        strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=num_gpus,
            cross_device_ops=AllReduceCrossDeviceOps('nccl',
                                                     num_packs=num_gpus),
        )
        using_customized_optimizer = True
        tf.logging.info('Use MirroredStrategy with %d devices.',
                        strategy.num_replicas_in_sync)
    else:
        strategy = tf.distribute.OneDeviceStrategy("GPU:0")
        using_customized_optimizer = False
        tf.logging.info('Single device mode.')

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.do_train:
        # Never run more iterations per loop than the checkpoint interval.
        iterations_per_loop = int(
            min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
    else:
        iterations_per_loop = FLAGS.iterations_per_loop
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        train_distribute=strategy,
        eval_distribute=strategy,  # get error during evaluation
    )

    # The training examples are read even when only predicting, because
    # `num_train_steps` bounds the checkpoint-evaluation loop below.
    num_warmup_steps = None
    train_examples = squad_utils.read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    if FLAGS.do_train:
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

    model_fn = squad_utils.v2_model_fn_builder(
        albert_config=albert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        max_seq_length=FLAGS.max_seq_length,
        start_n_top=FLAGS.start_n_top,
        end_n_top=FLAGS.end_n_top,
        dropout_prob=FLAGS.dropout_prob,
        customized=using_customized_optimizer,
        optimizer=FLAGS.optimizer)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tf.logging.info("Use TPUEstimator")
        estimator = contrib_tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
    else:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(
            model_fn=model_fn,
            params={},
            config=run_config,
        )

    if FLAGS.do_train:
        # We write to a temporary file to avoid storing very large constant
        # tensors in memory.
        if not tf.gfile.Exists(FLAGS.train_feature_file):
            train_writer = squad_utils.FeatureWriter(
                filename=os.path.join(FLAGS.train_feature_file),
                is_training=True)
            squad_utils.convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=True,
                output_fn=train_writer.process_feature,
                do_lower_case=FLAGS.do_lower_case)
            train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num orig examples = %d", len(train_examples))
        tf.logging.info(
            f" Batch size = {FLAGS.train_batch_size} * {num_gpus}")
        tf.logging.info(" Num steps = %d", num_train_steps)
        # The raw examples are no longer needed once the features exist.
        del train_examples

        train_input_fn = squad_utils.input_fn_builder(
            input_file=FLAGS.train_feature_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.train_batch_size,
            is_v2=True)
        # NOTE(review): `time_hist` is never handed to `estimator.train`
        # (e.g. via `hooks=`), so it presumably records no timings — confirm
        # how TimeHistory is meant to be attached.
        time_hist = TimeHistory()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        total_time = sum(time_hist.times)

    if FLAGS.do_predict:
        with tf.gfile.Open(FLAGS.predict_file) as predict_file:
            prediction_json = json.load(predict_file)["data"]
        eval_examples = squad_utils.read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)

        if (tf.gfile.Exists(FLAGS.predict_feature_file)
                and tf.gfile.Exists(FLAGS.predict_feature_left_file)):
            tf.logging.info("Loading eval features from {}".format(
                FLAGS.predict_feature_left_file))
            # NOTE: pickle must only be used on trusted files; this cache is
            # the one written by the `else` branch below.
            with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
                eval_features = pickle.load(fin)
        else:
            eval_writer = squad_utils.FeatureWriter(
                filename=FLAGS.predict_feature_file, is_training=False)
            eval_features = []

            def append_feature(feature):
                # Keep the feature in memory and also write it to disk.
                eval_features.append(feature)
                eval_writer.process_feature(feature)

            squad_utils.convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=FLAGS.max_seq_length,
                doc_stride=FLAGS.doc_stride,
                max_query_length=FLAGS.max_query_length,
                is_training=False,
                output_fn=append_feature,
                do_lower_case=FLAGS.do_lower_case)
            eval_writer.close()

            with tf.gfile.Open(FLAGS.predict_feature_left_file,
                               "wb") as fout:
                pickle.dump(eval_features, fout)

        tf.logging.info("***** Running predictions *****")
        tf.logging.info(" Num orig examples = %d", len(eval_examples))
        tf.logging.info(" Num split examples = %d", len(eval_features))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = squad_utils.input_fn_builder(
            input_file=FLAGS.predict_feature_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False,
            use_tpu=FLAGS.use_tpu,
            bsz=FLAGS.predict_batch_size,
            is_v2=True)

        def get_result(checkpoint):
            """Evaluate the checkpoint on SQuAD v2.0.

            Returns a ``(metrics, global_step)`` tuple, where ``metrics`` is
            whatever ``squad_utils.evaluate_v2`` returns and ``global_step``
            is read from the checkpoint itself.
            """
            # If running eval on the TPU, you will need to specify the number
            # of steps.
            reader = tf.train.NewCheckpointReader(checkpoint)
            global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
            all_results = []
            for result in estimator.predict(predict_input_fn,
                                            yield_single_examples=True,
                                            checkpoint_path=checkpoint):
                if len(all_results) % 1000 == 0:
                    tf.logging.info("Processing example: %d" %
                                    (len(all_results)))
                unique_id = int(result["unique_ids"])
                start_top_log_probs = [
                    float(x) for x in result["start_top_log_probs"].flat
                ]
                start_top_index = [
                    int(x) for x in result["start_top_index"].flat
                ]
                end_top_log_probs = [
                    float(x) for x in result["end_top_log_probs"].flat
                ]
                end_top_index = [int(x) for x in result["end_top_index"].flat]
                cls_logits = float(result["cls_logits"].flat[0])
                all_results.append(
                    squad_utils.RawResultV2(
                        unique_id=unique_id,
                        start_top_log_probs=start_top_log_probs,
                        start_top_index=start_top_index,
                        end_top_log_probs=end_top_log_probs,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits))

            output_prediction_file = os.path.join(FLAGS.output_dir,
                                                  "predictions.json")
            output_nbest_file = os.path.join(FLAGS.output_dir,
                                             "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                                     "null_odds.json")

            result_dict = {}
            cls_dict = {}
            squad_utils.accumulate_predictions_v2(
                result_dict, cls_dict, eval_examples, eval_features,
                all_results, FLAGS.n_best_size, FLAGS.max_answer_length,
                FLAGS.start_n_top, FLAGS.end_n_top)

            return squad_utils.evaluate_v2(
                result_dict, cls_dict, prediction_json, eval_examples,
                eval_features, all_results, FLAGS.n_best_size,
                FLAGS.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file), int(global_step)

        def _find_valid_cands(curr_step):
            """List `.index` files of numbered checkpoints after `curr_step`."""
            candidates = []
            for filename in tf.gfile.ListDirectory(FLAGS.output_dir):
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        # File suffixes that together make up one checkpoint on disk.
        ckpt_exts = ["meta", "data-00000-of-00001", "index"]

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "f1"

        # The context manager guarantees the results file is flushed and
        # closed even if an evaluation step raises.
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            writer.write("===== Hyperparameters =====\n")
            writer.write("Training batch size: {}\n".format(
                FLAGS.train_batch_size))
            writer.write("Max sequence length: {}\n".format(
                FLAGS.max_seq_length))
            writer.write("Learning rate: {}\n".format(FLAGS.learning_rate))
            writer.write("Num of GPU cores: {}\n".format(num_gpus))
            # Timing data only exists when training ran in this process.
            if FLAGS.do_train:
                avg_time_per_batch = np.mean(time_hist.times)
                writer.write("Total time: {}\n".format(total_time))
                writer.write("Speed: {}\n".format(
                    FLAGS.train_batch_size * num_gpus / avg_time_per_batch))
            if num_train_steps and num_warmup_steps:
                writer.write("Training steps: {}\n".format(num_train_steps))
                writer.write("Warmup steps: {}\n".format(num_warmup_steps))

            # Resume from a previously saved best checkpoint, if any.
            if tf.gfile.Exists(checkpoint_path + ".index"):
                result = get_result(checkpoint_path)
                best_perf = result[0][key_name]
                global_step = result[1]
            else:
                global_step = -1
                best_perf = -1
                checkpoint_path = None

            # Keep polling the output directory until a checkpoint at (or
            # beyond) the final training step has been evaluated.
            while global_step < num_train_steps:
                steps_and_files = {}
                for filename in tf.gfile.ListDirectory(FLAGS.output_dir):
                    if filename.endswith(".index"):
                        ckpt_name = filename[:-6]
                        cur_filename = os.path.join(FLAGS.output_dir,
                                                    ckpt_name)
                        if cur_filename.split("-")[-1] == "best":
                            continue
                        gstep = int(cur_filename.split("-")[-1])
                        if gstep not in steps_and_files:
                            tf.logging.info(
                                "Add {} to eval list.".format(cur_filename))
                            steps_and_files[gstep] = cur_filename
                tf.logging.info("found {} files.".format(
                    len(steps_and_files)))
                if not steps_and_files:
                    tf.logging.info(
                        "found 0 file, global step: {}. Sleeping.".format(
                            global_step))
                    time.sleep(1)
                else:
                    for step, checkpoint_path in sorted(
                            steps_and_files.items()):
                        if global_step >= step:
                            # Already evaluated: delete it once more than one
                            # newer checkpoint exists.
                            if len(_find_valid_cands(step)) > 1:
                                for ext in ckpt_exts:
                                    src_ckpt = checkpoint_path + ".{}".format(
                                        ext)
                                    tf.logging.info(
                                        "removing {}".format(src_ckpt))
                                    tf.gfile.Remove(src_ckpt)
                            continue
                        result, global_step = get_result(checkpoint_path)
                        tf.logging.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            tf.logging.info(" %s = %s", key,
                                            str(result[key]))
                            writer.write("%s = %s\n" %
                                         (key, str(result[key])))
                        if result[key_name] > best_perf:
                            best_perf = result[key_name]
                            # Snapshot the new best checkpoint as "-best".
                            for ext in ckpt_exts:
                                src_ckpt = checkpoint_path + ".{}".format(ext)
                                tgt_ckpt = checkpoint_path.rsplit(
                                    "-", 1)[0] + "-best.{}".format(ext)
                                tf.logging.info("saving {} to {}".format(
                                    src_ckpt, tgt_ckpt))
                                tf.gfile.Copy(src_ckpt, tgt_ckpt,
                                              overwrite=True)
                                writer.write("saved {} to {}\n".format(
                                    src_ckpt, tgt_ckpt))
                        writer.write("best {} = {}\n".format(
                            key_name, best_perf))
                        tf.logging.info(" best {} = {}\n".format(
                            key_name, best_perf))

                        if len(_find_valid_cands(global_step)) > 2:
                            for ext in ckpt_exts:
                                src_ckpt = checkpoint_path + ".{}".format(ext)
                                tf.logging.info(
                                    "removing {}".format(src_ckpt))
                                tf.gfile.Remove(src_ckpt)
                        writer.write("=" * 50 + "\n")

            # Report the metrics of the best checkpoint once more at the end.
            checkpoint_path = os.path.join(FLAGS.output_dir,
                                           "model.ckpt-best")
            result, global_step = get_result(checkpoint_path)
            tf.logging.info("***** Final Eval results *****")
            tf.logging.info(f"num_gpu_cores = {num_gpus}")
            writer.write("===== Evuations =====\n")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write(
                "best perf happened at step: {}".format(global_step))
def main(_):
    """Train and/or evaluate the BERT pre-training model on multiple GPUs.

    * ``FLAGS.do_train``: trains on the files matched by ``FLAGS.input_file``.
    * ``FLAGS.do_eval``: evaluates the eval, test and small-test files in
      turn and writes one ``*_results.txt`` file per split to
      ``FLAGS.output_dir``.
    * ``FLAGS.do_test``: runs prediction on ``FLAGS.small_eval_input_file``
      and writes the ``masked_pre`` / ``masked_tar`` pairs to
      ``small_id_eval_results.txt``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_eval`` is set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not (FLAGS.do_train or FLAGS.do_eval):
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    token_word2id, token_vocab_size = read_vocab(FLAGS.token_vocab_file)

    # Expand every comma-separated glob pattern into concrete file names.
    input_files = [
        matched
        for pattern in FLAGS.input_file.split(",")
        for matched in tf.gfile.Glob(pattern)
    ]

    tf.logging.info("*** Input Files ***")
    for path in input_files:
        tf.logging.info(" %s" % path)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    gpu_count = FLAGS.n_gpus
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=gpu_count,
        cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=gpu_count),
    )
    run_config = RunConfig(
        train_distribute=dist_strategy,
        eval_distribute=dist_strategy,
        log_step_count_steps=8,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        word2id=token_word2id)

    # A plain Estimator is used here; no `params` are passed.
    estimator = Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        # NOTE(review): the test and small-test passes are treated as part of
        # the `do_eval` branch — confirm against the intended nesting.
        # Each row: (start banner, flag holding the input file,
        #            results banner, results file name).
        eval_passes = (
            ("***** Running evaluation *****", "eval_input_file",
             "***** Eval results *****", "eval_results.txt"),
            ("***** Running test *****", "test_input_file",
             "***** Test results *****", "test_results.txt"),
            ("***** Running Small test *****", "small_test_input_file",
             "***** Small Test results *****", "small_test_results.txt"),
        )
        for start_banner, flag_name, done_banner, out_name in eval_passes:
            tf.logging.info(start_banner)
            tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
            split_input_fn = input_fn_builder(
                input_files=[getattr(FLAGS, flag_name)],
                max_seq_length=FLAGS.max_seq_length,
                max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                is_training=False)
            metrics = estimator.evaluate(input_fn=split_input_fn,
                                         steps=FLAGS.max_eval_steps)
            results_path = os.path.join(FLAGS.output_dir, out_name)
            with tf.gfile.GFile(results_path, "w") as writer:
                tf.logging.info(done_banner)
                for name in sorted(metrics.keys()):
                    value_text = str(metrics[name])
                    tf.logging.info(" %s = %s", name, value_text)
                    writer.write("%s = %s\n" % (name, value_text))

    if FLAGS.do_test:
        tf.logging.info("***** Running test *****")
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        test_input_fn = input_fn_builder(
            input_files=[FLAGS.small_eval_input_file],
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        predictions = estimator.predict(input_fn=test_input_fn)
        tf.logging.info("***** Test results *****")
        predictions_path = os.path.join(FLAGS.output_dir,
                                        "small_id_eval_results.txt")
        with tf.gfile.GFile(predictions_path, "w") as writer:
            # One line per prediction: "<masked_pre> <masked_tar>".
            for _, row in tqdm(enumerate(predictions)):
                line = ' '.join(
                    (str(row['masked_pre']), str(row['masked_tar'])))
                writer.write(line + '\n')
def main(_):
    """Run BERT pre-training and/or evaluation on multi-GPU or TPU/CPU.

    When ``FLAGS.use_gpu`` is set and ``FLAGS.num_gpu_cores`` is at least 2,
    a ``MirroredStrategy`` with a plain ``Estimator`` and the GPU input
    builder is used; otherwise the TPU run config, ``TPUEstimator`` and the
    default input builder are used.

    Evaluation metrics are written to ``eval_results.txt`` inside
    ``FLAGS.output_dir``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_eval`` is set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            'At least one of `do_train` or `do_eval` must be True.')

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    # `input_file` is a comma-separated list of glob patterns.
    input_files = []
    for input_pattern in FLAGS.input_file.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info('*** Input Files ***')
    for input_file in input_files:
        tf.logging.info(' %s' % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Single source of truth for the multi-GPU decision used throughout.
    use_multi_gpu = bool(FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2)

    if use_multi_gpu:
        tf.logging.info('Use normal RunConfig')
        tf.logging.info(FLAGS.num_gpu_cores)
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=FLAGS.num_gpu_cores,
            auto_shard_dataset=True,
            cross_device_ops=AllReduceCrossDeviceOps(
                'nccl', num_packs=FLAGS.num_gpu_cores),
        )
        log_every_n_steps = 10
        run_config = RunConfig(
            train_distribute=dist_strategy,
            eval_distribute=dist_strategy,
            log_step_count_steps=log_every_n_steps,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        )
    else:
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host,
            ),
        )

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if use_multi_gpu:
        tf.logging.info('Use normal Estimator')
        estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    else:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
        )

    if FLAGS.do_train:
        tf.logging.info('***** Running training *****')
        tf.logging.info(' Batch size = %d', FLAGS.train_batch_size)
        if use_multi_gpu:
            train_input_fn = input_fn_builder_gpu(
                input_files=input_files,
                max_seq_length=FLAGS.max_seq_length,
                max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                is_training=True,
                batch_size=per_device_batch_size(FLAGS.train_batch_size,
                                                 FLAGS.num_gpu_cores),
            )
        else:
            train_input_fn = input_fn_builder(
                input_files=input_files,
                max_seq_length=FLAGS.max_seq_length,
                max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                is_training=True,
            )
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.logging.info('***** Running evaluation *****')
        tf.logging.info(' Batch size = %d', FLAGS.eval_batch_size)
        if use_multi_gpu:
            # This must be bound to `eval_input_fn`: it was previously
            # assigned to `train_input_fn`, leaving `eval_input_fn` unbound
            # on the multi-GPU path.
            eval_input_fn = input_fn_builder_gpu(
                input_files=input_files,
                max_seq_length=FLAGS.max_seq_length,
                max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                is_training=False,
                batch_size=FLAGS.eval_batch_size,
            )
        else:
            eval_input_fn = input_fn_builder(
                input_files=input_files,
                max_seq_length=FLAGS.max_seq_length,
                max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                is_training=False,
            )

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, 'eval_results.txt')
        with tf.gfile.GFile(output_eval_file, 'w') as writer:
            tf.logging.info('***** Eval results *****')
            for key in sorted(result.keys()):
                tf.logging.info(' %s = %s', key, str(result[key]))
                writer.write('%s = %s\n' % (key, str(result[key])))
def main(_):
    """Train, evaluate, predict with and optionally export a BERT classifier.

    Training, evaluation and prediction data are read from TFRecord files
    (``train*.tfrecord``, ``eval*.tfrecord``, ``predict*.tfrecord``) in
    ``FLAGS.data_dir``; the example counts come from the JSON files next to
    them (``train.json``, ``eval.json``, ``predict.json``).

    When ``FLAGS.use_gpu`` is set and ``FLAGS.num_gpu_cores`` is at least 2,
    a ``MirroredStrategy`` with a plain ``Estimator`` is used; otherwise a
    ``TPUEstimator`` is used.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If none of ``do_train``, ``do_eval`` and ``do_predict``
            is set, or if ``max_seq_length`` exceeds the model's
            ``max_position_embeddings``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # NOTE(review): `session_config` is built but never passed to either run
    # config below, so these GPU options presumably have no effect — confirm
    # whether it should be supplied as `session_config=`.
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.7
    session_config.gpu_options.allow_growth = True

    # Single source of truth for the multi-GPU decision used throughout.
    use_multi_gpu = bool(FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2)

    if use_multi_gpu:
        tf.logging.info("Use normal RunConfig")
        # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=FLAGS.num_gpu_cores,
            cross_device_ops=AllReduceCrossDeviceOps(
                'nccl', num_packs=FLAGS.num_gpu_cores),
        )
        log_every_n_steps = 8
        run_config = RunConfig(
            train_distribute=dist_strategy,
            eval_distribute=dist_strategy,
            log_step_count_steps=log_every_n_steps,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps)
    else:
        tf.logging.info("Use TPURunConfig")
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        # The number of training examples is stored next to the TFRecords.
        train_meta = os.path.join(FLAGS.data_dir, "train.json")
        with open(train_meta, 'r') as f:
            d = json.load(f)
        num_train_example = d['num_train_example']
        num_train_steps = int(num_train_example / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    init_checkpoint = FLAGS.init_checkpoint
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=125,
        init_checkpoint=init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        use_gpu=FLAGS.use_gpu,
        num_gpu_cores=FLAGS.num_gpu_cores,
        fp16=FLAGS.use_fp16)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if use_multi_gpu:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    else:
        tf.logging.info("Use TPUEstimator")
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.data_dir, "train*.tfrecord")
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_file = os.path.join(FLAGS.data_dir, "eval*.tfrecord")
        eval_meta = os.path.join(FLAGS.data_dir, "eval.json")
        with open(eval_meta, 'r') as f:
            d = json.load(f)
        num_eval_examples = d['num_eval_examples']

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", num_eval_examples)
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        # `steps=None` tells the estimator to run through the entire set; a
        # flag value of 0 selects that behaviour. A leftover
        # `eval_steps = None` that followed the flag lookup has been removed
        # so the flag is actually honoured.
        eval_steps = FLAGS.eval_steps or None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(num_eval_examples / FLAGS.eval_batch_size)

        eval_drop_remainder = bool(FLAGS.use_tpu)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            batch_size=FLAGS.eval_batch_size)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        pred_meta = os.path.join(FLAGS.data_dir, "predict.json")
        predict_file = os.path.join(FLAGS.data_dir, "predict*.tfrecord")
        with open(pred_meta, 'r') as f:
            d = json.load(f)
        num_pred_examples = d['num_pred_examples']

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", num_pred_examples)
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = bool(FLAGS.use_tpu)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            batch_size=FLAGS.predict_batch_size)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            # NOTE(review): iterating `prediction` directly assumes each
            # yielded item is a sequence of per-class values rather than a
            # dict — confirm against the model_fn's predictions.
            for prediction in result:
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)

    if FLAGS.do_train and FLAGS.save_for_serving:
        serving_dir = os.path.join(FLAGS.output_dir, 'serving')
        # A TPUEstimator is in use whenever the multi-GPU path is not taken.
        is_tpu_estimator = not use_multi_gpu
        save_for_serving(estimator, serving_dir, FLAGS.max_seq_length,
                         is_tpu_estimator)