def main(_): tf.logging.set_verbosity(tf.logging.INFO) # Get corpus info FLAGS.n_token = data_utils.VOCAB_SIZE tf.logging.info('n_token {}'.format(FLAGS.n_token)) if not tf.gfile.Exists(FLAGS.model_dir): tf.gfile.MakeDirs(FLAGS.model_dir) bsz_per_core = FLAGS.train_batch_size train_input_fn, train_record_info_dict = get_input_fn( 'train', bsz_per_core) tf.logging.info('num of batches {}'.format( train_record_info_dict['num_batch'])) train_cache_fn = get_cache_fn(FLAGS.mem_len, bsz_per_core) tf.logging.info(train_cache_fn) log_every_n_steps = 10 run_config = RunConfig( log_step_count_steps=log_every_n_steps, model_dir=FLAGS.model_dir, save_checkpoints_steps=FLAGS.save_steps, save_summary_steps=None, ) model_fn = get_model_fn() tf.logging.info('Use normal Estimator') estimator = Estimator( model_fn=model_fn, params={ 'batch_size': bsz_per_core, 'cache': None }, config=run_config, ) tf.logging.info('***** Running evaluation *****') tf.logging.info(' Batch size = %d', FLAGS.train_batch_size) estimator.evaluate(input_fn=train_input_fn, steps=100)
def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir): """Train or evaluate the model. Args: is_training: whether to train or evaluate the model. In training mode, quantization will be simulated where the quantize_and_dequantize_v2 are placed. use_trt: if true, use TRT INT8 mode for evaluation, which will perform real quantization. Otherwise use native TensorFlow which will perform simulated quantization. Ignored if is_training is True. batch_size: batch size. num_epochs: how many epochs to train. Ignored if is_training is False. model_dir: where to save or load checkpoint. Returns: The Estimator evaluation result. """ # Get dataset train_data, test_data = mnist.load_data() def _PreprocessFn(x, y): x = math_ops.cast(x, dtypes.float32) x = array_ops.expand_dims(x, axis=2) x = 2.0 * (x / 255.0) - 1.0 y = math_ops.cast(y, dtypes.int32) return x, y def _EvalInputFn(): mnist_x, mnist_y = test_data dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) dataset = dataset.apply( data.experimental.map_and_batch( map_func=_PreprocessFn, batch_size=batch_size, num_parallel_calls=8)) dataset = dataset.repeat(count=1) iterator = dataset.make_one_shot_iterator() features, labels = iterator.get_next() return features, labels def _TrainInputFn(): mnist_x, mnist_y = train_data dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) dataset = dataset.shuffle(2 * len(mnist_x)) dataset = dataset.apply( data.experimental.map_and_batch( map_func=_PreprocessFn, batch_size=batch_size, num_parallel_calls=8)) dataset = dataset.repeat(count=num_epochs) iterator = dataset.make_one_shot_iterator() features, labels = iterator.get_next() return features, labels def _ModelFn(features, labels, mode): if is_training: logits_out = self._BuildGraph(features) else: graph_def = self._GetGraphDef(use_trt, batch_size, model_dir) logits_out = importer.import_graph_def( graph_def, input_map={INPUT_NODE_NAME: features}, return_elements=[OUTPUT_NODE_NAME + ':0'], name='')[0] loss = losses.sparse_softmax_cross_entropy( labels=labels, logits=logits_out) summary.scalar('loss', loss) classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out') accuracy = metrics.accuracy( labels=labels, predictions=classes_out, name='acc_op') summary.scalar('accuracy', accuracy[1]) if mode == ModeKeys.EVAL: return EstimatorSpec( mode, loss=loss, eval_metric_ops={'accuracy': accuracy}) elif mode == ModeKeys.TRAIN: optimizer = AdamOptimizer(learning_rate=1e-2) train_op = optimizer.minimize(loss, global_step=get_global_step()) return EstimatorSpec(mode, loss=loss, train_op=train_op) config_proto = config_pb2.ConfigProto() config_proto.gpu_options.allow_growth = True estimator = Estimator( model_fn=_ModelFn, model_dir=model_dir if is_training else None, config=RunConfig(session_config=config_proto)) if is_training: estimator.train(_TrainInputFn) results = estimator.evaluate(_EvalInputFn) logging.info('accuracy: %s', str(results['accuracy'])) return results
def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir): """Train or evaluate the model. Args: is_training: whether to train or evaluate the model. In training mode, quantization will be simulated where the quantize_and_dequantize_v2 are placed. use_trt: if true, use TRT INT8 mode for evaluation, which will perform real quantization. Otherwise use native TensorFlow which will perform simulated quantization. Ignored if is_training is True. batch_size: batch size. num_epochs: how many epochs to train. Ignored if is_training is False. model_dir: where to save or load checkpoint. Returns: The Estimator evaluation result. """ # Get dataset train_data, test_data = mnist.load_data() def _PreprocessFn(x, y): x = math_ops.cast(x, dtypes.float32) x = array_ops.expand_dims(x, axis=2) x = 2.0 * (x / 255.0) - 1.0 y = math_ops.cast(y, dtypes.int32) return x, y def _EvalInputFn(): mnist_x, mnist_y = test_data dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) dataset = dataset.apply( data.experimental.map_and_batch(map_func=_PreprocessFn, batch_size=batch_size, num_parallel_calls=8)) dataset = dataset.repeat(count=1) iterator = dataset.make_one_shot_iterator() features, labels = iterator.get_next() return features, labels def _TrainInputFn(): mnist_x, mnist_y = train_data dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) dataset = dataset.shuffle(2 * len(mnist_x)) dataset = dataset.apply( data.experimental.map_and_batch(map_func=_PreprocessFn, batch_size=batch_size, num_parallel_calls=8)) dataset = dataset.repeat(count=num_epochs) iterator = dataset.make_one_shot_iterator() features, labels = iterator.get_next() return features, labels def _ModelFn(features, labels, mode): if is_training: logits_out = self._BuildGraph(features) else: graph_def = self._GetGraphDef(use_trt, batch_size, model_dir) logits_out = importer.import_graph_def( graph_def, input_map={INPUT_NODE_NAME: features}, return_elements=[OUTPUT_NODE_NAME + ':0'], name='')[0] loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits_out) summary.scalar('loss', loss) classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out') accuracy = metrics.accuracy(labels=labels, predictions=classes_out, name='acc_op') summary.scalar('accuracy', accuracy[1]) if mode == ModeKeys.EVAL: return EstimatorSpec(mode, loss=loss, eval_metric_ops={'accuracy': accuracy}) elif mode == ModeKeys.TRAIN: optimizer = AdamOptimizer(learning_rate=1e-2) train_op = optimizer.minimize(loss, global_step=get_global_step()) return EstimatorSpec(mode, loss=loss, train_op=train_op) config_proto = config_pb2.ConfigProto() config_proto.gpu_options.allow_growth = True estimator = Estimator(model_fn=_ModelFn, model_dir=model_dir if is_training else None, config=RunConfig(session_config=config_proto)) if is_training: estimator.train(_TrainInputFn) results = estimator.evaluate(_EvalInputFn) logging.info('accuracy: %s', str(results['accuracy'])) return results
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( 'At least one of `do_train` or `do_eval` must be True.') bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(','): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info('*** Input Files ***') for input_file in input_files: tf.logging.info(' %s' % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info('Use normal RunConfig') tf.logging.info(FLAGS.num_gpu_cores) dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.num_gpu_cores, auto_shard_dataset=True, cross_device_ops=AllReduceCrossDeviceOps( 'nccl', num_packs=FLAGS.num_gpu_cores), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 10 run_config = RunConfig( train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, ) else: run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host, ), ) model_fn = model_fn_builder( bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, ) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info('Use normal Estimator') estimator = Estimator(model_fn=model_fn, params={}, config=run_config) else: estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, ) if FLAGS.do_train: tf.logging.info('***** Running training *****') tf.logging.info(' Batch size = %d', FLAGS.train_batch_size) if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: train_input_fn = input_fn_builder_gpu( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, batch_size=per_device_batch_size(FLAGS.train_batch_size, FLAGS.num_gpu_cores), ) else: train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, ) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval: tf.logging.info('***** Running evaluation *****') tf.logging.info(' Batch size = %d', FLAGS.eval_batch_size) if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: train_input_fn = input_fn_builder_gpu( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, batch_size=FLAGS.eval_batch_size, ) else: eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, ) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, 'eval_results.txt') with tf.gfile.GFile(output_eval_file, 'w') as writer: tf.logging.info('***** Eval results *****') for key in sorted(result.keys()): tf.logging.info(' %s = %s', key, str(result[key])) writer.write('%s = %s\n' % (key, str(result[key])))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.n_gpus, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=FLAGS.n_gpus), ) ''' IF ERROR COULD TRY dist_strategy = tf.contrib.distribute.MirroredStrategy( devices=["device:GPU:%d" % i for i in range(FLAGS.n_gpus)], cross_tower_ops=tf.distribute.HierarchicalCopyAllReduce()) ''' log_every_n_steps = 8 run_config = RunConfig(train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = Estimator(model_fn=model_fn, params={}, config=run_config) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, "qqp": QqpProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-4225061263 dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.num_gpu_cores, cross_device_ops=AllReduceCrossDeviceOps( 'nccl', num_packs=FLAGS.num_gpu_cores), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 8 run_config = RunConfig( train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) else: run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2 and FLAGS.do_train: # init_checkpoint = None # else: # init_checkpoint = FLAGS.init_checkpoint init_checkpoint = FLAGS.init_checkpoint model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: estimator = Estimator(model_fn=model_fn, params={}, config=run_config) else: estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, batch_size=FLAGS.train_batch_size) estimator.train( input_fn=train_input_fn, max_steps=num_train_steps, ) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, batch_size=FLAGS.eval_batch_size) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, batch_size=FLAGS.predict_batch_size) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples if FLAGS.do_train and FLAGS.save_for_serving: serving_dir = os.path.join(FLAGS.output_dir, 'serving') save_for_serving(estimator, serving_dir)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) token_word2id, token_vocab_size = read_vocab(FLAGS.token_vocab_file) #input_files = [FLAGS.input_file] input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) #tf.logging.info("*** Input Files ***") #tf.logging.info(" %s" % input_files[0]) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.n_gpus, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=FLAGS.n_gpus), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 8 run_config = RunConfig(train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, word2id=token_word2id) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. # estimator = Estimator( # model_fn=model_fn, # params={}, # config=run_config) estimator = Estimator( model_fn=model_fn, config=run_config, ) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_input_files = [FLAGS.eval_input_file] eval_input_fn = input_fn_builder( input_files=eval_input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) tf.logging.info("***** Running test *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) test_input_files = [FLAGS.test_input_file] eval_input_fn = input_fn_builder( input_files=test_input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "test_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Test results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) tf.logging.info("***** Running Small test *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) small_test_input_files = [FLAGS.small_test_input_file] eval_input_fn = input_fn_builder( input_files=small_test_input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "small_test_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Small Test results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_test: tf.logging.info("***** Running test *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) test_input_files = [FLAGS.small_eval_input_file] test_input_fn = input_fn_builder( input_files=test_input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.predict(input_fn=test_input_fn) tf.logging.info("***** Test results *****") output_eval_file = os.path.join(FLAGS.output_dir, "small_id_eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: for i, p in tqdm(enumerate(result)): writer.write( str(p['masked_pre']) + ' ' + str(p['masked_tar']) + '\n')
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( 'At least one of `do_train` or `do_eval` must be True.' ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file ) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(','): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info('*** Input Files ***') for input_file in input_files: tf.logging.info(' %s' % input_file) tf.logging.info('Use normal RunConfig') tf.logging.info(FLAGS.num_gpu_cores) dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus = FLAGS.num_gpu_cores, auto_shard_dataset = True, cross_device_ops = AllReduceCrossDeviceOps( 'nccl', num_packs = FLAGS.num_gpu_cores ), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 10 run_config = RunConfig( train_distribute = dist_strategy, eval_distribute = dist_strategy, log_step_count_steps = log_every_n_steps, model_dir = FLAGS.output_dir, save_checkpoints_steps = FLAGS.save_checkpoints_steps, save_summary_steps = None, ) model_fn = model_fn_builder( albert_config = albert_config, init_checkpoint = FLAGS.init_checkpoint, learning_rate = FLAGS.learning_rate, num_train_steps = FLAGS.num_train_steps, num_warmup_steps = FLAGS.num_warmup_steps, use_tpu = FLAGS.use_tpu, use_one_hot_embeddings = FLAGS.use_tpu, optimizer = FLAGS.optimizer, poly_power = FLAGS.poly_power, start_warmup_step = FLAGS.start_warmup_step, ) tf.logging.info('Use normal Estimator') estimator = Estimator(model_fn = model_fn, params = {}, config = run_config) if FLAGS.do_train: tf.logging.info('***** Running training *****') tf.logging.info(' Batch size = %d', FLAGS.train_batch_size) train_input_fn = input_fn_builder_gpu( input_files = input_files, max_seq_length = FLAGS.max_seq_length, max_predictions_per_seq = FLAGS.max_predictions_per_seq, is_training = True, batch_size = per_device_batch_size( FLAGS.train_batch_size, FLAGS.num_gpu_cores ), ) estimator.train( input_fn = train_input_fn, max_steps = FLAGS.num_train_steps ) if FLAGS.do_eval: tf.logging.info('***** Running evaluation *****') tf.logging.info(' Batch size = %d', FLAGS.eval_batch_size) global_step = -1 output_eval_file = os.path.join(FLAGS.output_dir, 'eval_results.txt') writer = tf.gfile.GFile(output_eval_file, 'w') tf.gfile.MakeDirs(FLAGS.export_dir) eval_input_fn = input_fn_builder( input_files = input_files, max_seq_length = FLAGS.max_seq_length, max_predictions_per_seq = FLAGS.max_predictions_per_seq, is_training = False, ) while global_step < FLAGS.num_train_steps: if estimator.latest_checkpoint() is None: tf.logging.info('No checkpoint found yet. Sleeping.') time.sleep(1) else: result = estimator.evaluate( input_fn = eval_input_fn, steps = FLAGS.max_eval_steps ) global_step = result['global_step'] tf.logging.info('***** Eval results *****') for key in sorted(result.keys()): tf.logging.info(' %s = %s', key, str(result[key])) writer.write('%s = %s\n' % (key, str(result[key])))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": classifier_utils.ColaProcessor, "mnli": classifier_utils.MnliProcessor, "mrpc": classifier_utils.MrpcProcessor, # "xnli": XnliProcessor, "sts-b": classifier_utils.StsbProcessor, "qqp": classifier_utils.QqpProcessor, "sst-2": classifier_utils.Sst2Processor, "qnli": classifier_utils.QnliProcessor, "rte": classifier_utils.RteProcessor, "wnli": classifier_utils.WnliProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) albert_config.hidden_dropout_prob = FLAGS.albert_dropout_prob albert_config.attention_probs_dropout_prob = FLAGS.albert_dropout_prob if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) # multiple gpus NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1 using_customized_optimizer = None if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror": os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(i) for i in list(range(NUM_GPUS))]) # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=NUM_GPUS, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS), ) using_customized_optimizer = True tf.logging.info('Use MirroredStrategy with %d devices.', strategy.num_replicas_in_sync) else: strategy = tf.distribute.OneDeviceStrategy("GPU:0") using_customized_optimizer = False tf.logging.info('Single device mode.') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), train_distribute=strategy, eval_distribute=strategy, #get error during evaluation ) train_examples = None total_time = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) model_fn = classifier_utils.model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.train_step, num_warmup_steps=FLAGS.warmup_step, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, task_name=task_name, customized=using_customized_optimizer, optimizer=FLAGS.optimizer, discard_classifier_weights=FLAGS.discard_classifier_weights) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_tpu and FLAGS.tpu_name: tf.logging.info("Use TPUEstimator") estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) else: tf.logging.info("Use normal Estimator") estimator = Estimator( model_fn=model_fn, params={}, config=run_config, ) if FLAGS.do_train: cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir train_file = os.path.join(cached_dir, task_name + "_train.tf_record") if not tf.gfile.Exists(train_file): classifier_utils.file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, task_name) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info( f" Batch size = {FLAGS.train_batch_size} * {NUM_GPUS}") tf.logging.info(" Num steps = %d", FLAGS.train_step) train_input_fn = classifier_utils.file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size) time_hist = TimeHistory() estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step, hooks=[time_hist]) total_time = sum(time_hist.times) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_features = classifier_utils.convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, task_name) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record") if not tf.gfile.Exists(eval_file): classifier_utils.file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, task_name) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = classifier_utils.file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.eval_batch_size) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") if task_name == "sts-b": key_name = "pearson" elif task_name == "cola": key_name = "matthew_corr" else: key_name = "eval_accuracy" if tf.gfile.Exists(checkpoint_path + ".index"): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: global_step = -1 best_perf = -1 checkpoint_path = None writer = tf.gfile.GFile(output_eval_file, "w") writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format( FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) writer.write("Num of GPU cores: {}\n".format(NUM_GPUS)) if FLAGS.do_train: avg_time_per_batch = np.mean(time_hist.times) writer.write("Total time: {}\n".format(total_time)) writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS / avg_time_per_batch)) if FLAGS.train_step and FLAGS.warmup_step: writer.write("Training steps: {}\n".format(FLAGS.train_step)) writer.write("Warmup steps: {}\n".format(FLAGS.warmup_step)) while global_step < FLAGS.train_step: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. Sleeping.".format( global_step)) time.sleep(1) else: for checkpoint in sorted(steps_and_files.items()): step, checkpoint_path = checkpoint if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") tf.logging.info(f"num_gpu_cores = {NUM_GPUS}") writer.write("===== Evuations =====\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close() if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") classifier_utils.file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, task_name) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = classifier_utils.file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.predict(input_fn=predict_input_fn, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: sub_writer.write("index" + "\t" + "prediction\n") num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) if task_name != "sts-b": actual_label = label_list[int(prediction["predictions"])] else: actual_label = str(prediction["predictions"]) sub_writer.write(example.guid + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "named_entity": NamedEntityProcessor, "punct": PunctProcessor, "norm": NormProcessor } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info("Use normal RunConfig") dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.num_gpu_cores, cross_device_ops=AllReduceCrossDeviceOps( 'nccl', num_packs=FLAGS.num_gpu_cores), ) log_every_n_steps = 8 run_config = RunConfig( train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) else: tf.logging.info("Use TPURunConfig") run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) init_checkpoint = FLAGS.init_checkpoint model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, use_gpu=FLAGS.use_gpu, num_gpu_cores=FLAGS.num_gpu_cores, fp16=FLAGS.use_fp16) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info("Use normal Estimator") estimator = Estimator(model_fn=model_fn, params={}, config=run_config) else: tf.logging.info("Use TPUEstimator") estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, batch_size=FLAGS.train_batch_size) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", len(eval_examples)) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: # Eval will be slightly WRONG on the TPU because it will truncate # the last batch. eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, batch_size=FLAGS.eval_batch_size) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) if FLAGS.use_tpu: # Warning: According to tpu_estimator.py Prediction on TPU is an # experimental feature and hence not supported here raise ValueError("Prediction in TPU not supported") predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, batch_size=FLAGS.predict_batch_size) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: tf.logging.info("***** Predict results *****") for item in result: predictions = item['predictions'] seq_len = item['seq_len'] predictions = predictions[1:seq_len + 1] labels = [] for pred in predictions: labels.append(label_list[pred]) writer.write( tokenization.printable_text(' '.join(labels)) + '\n') if FLAGS.do_train and FLAGS.save_for_serving: serving_dir = os.path.join(FLAGS.output_dir, 'serving') is_tpu_estimator = not FLAGS.use_gpu or int(FLAGS.num_gpu_cores) < 2 save_for_serving(estimator, serving_dir, FLAGS.max_seq_length, is_tpu_estimator)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 session_config = tf.ConfigProto(log_device_placement=True) session_config.gpu_options.per_process_gpu_memory_fraction = 0.7 session_config.gpu_options.allow_growth = True if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info("Use normal RunConfig") # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.num_gpu_cores, cross_device_ops=AllReduceCrossDeviceOps( 'nccl', num_packs=FLAGS.num_gpu_cores), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 8 run_config = RunConfig( train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) else: tf.logging.info("Use TPURunConfig") run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_meta = os.path.join(FLAGS.data_dir, "train.json") with open(train_meta, 'r') as f: d = json.load(f) num_train_example = d['num_train_example'] num_train_steps = int(num_train_example / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) init_checkpoint = FLAGS.init_checkpoint model_fn = model_fn_builder(bert_config=bert_config, num_labels=125, init_checkpoint=init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, use_gpu=FLAGS.use_gpu, num_gpu_cores=FLAGS.num_gpu_cores, fp16=FLAGS.use_fp16) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info("Use normal Estimator") estimator = Estimator(model_fn=model_fn, params={}, config=run_config) else: tf.logging.info("Use TPUEstimator") estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.data_dir, "train*.tfrecord") train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, batch_size=FLAGS.train_batch_size) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_file = os.path.join(FLAGS.data_dir, "eval*.tfrecord") eval_meta = os.path.join(FLAGS.data_dir, "eval.json") with open(eval_meta, 'r') as f: d = json.load(f) num_eval_examples = d['num_eval_examples'] tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", num_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = FLAGS.eval_steps if eval_steps == 0: eval_steps = None eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: # Eval will be slightly WRONG on the TPU because it will truncate # the last batch.1 eval_steps = int(num_eval_examples / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, batch_size=FLAGS.eval_batch_size) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: pred_meta = os.path.join(FLAGS.data_dir, "predict.json") predict_file = os.path.join(FLAGS.data_dir, "predict*.tfrecord") with open(pred_meta, 'r') as f: d = json.load(f) num_pred_examples = d['num_pred_examples'] tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", num_pred_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, batch_size=FLAGS.predict_batch_size) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: tf.logging.info("***** Predict results *****") for prediction in result: output_line = "\t".join( str(class_probability) for class_probability in prediction) + "\n" writer.write(output_line) if FLAGS.do_train and FLAGS.save_for_serving: serving_dir = os.path.join(FLAGS.output_dir, 'serving') is_tpu_estimator = not FLAGS.use_gpu or int(FLAGS.num_gpu_cores) < 2 save_for_serving(estimator, serving_dir, FLAGS.max_seq_length, is_tpu_estimator)