def train_fn(hparams, num_workers):
  """Copy of train function from estimator.py."""
  # TODO: Merge improvements into the original.
  # pylint: disable=protected-access
  hparams.tgt_sos_id, hparams.tgt_eos_id = nmt_estimator._get_tgt_sos_eos_id(
      hparams)
  model_fn = nmt_estimator.make_model_fn(hparams)

  def print_log():
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=0)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=hparams.num_examples_per_epoch)

  if hparams.use_tpu_low_level_api:
    runner = create_train_runner(hparams, num_workers)
    mlperf_log.gnmt_print(key=mlperf_log.RUN_START)
    input_fn = DistributedPipeline(hparams, num_workers)
    runner.initialize(input_fn, {})
    runner.build_model(model_fn, {})
    print_log()
    runner.train(0, hparams.num_train_steps)
    return 0.0

  # cluster = tf.contrib.cluster_resolver.TPUClusterResolver(hparams.tpu_name)
  # cluster_spec = cluster.cluster_spec()
  # print('cluster_spec: %s' % cluster_spec)
  # num_workers = cluster_spec.num_tasks('tpu_worker')
  # print('num_workers: %s' % num_workers)

  pipeline = DistributedPipeline(hparams, num_workers)
  print_log()

  if hparams.use_tpu:
    run_config = nmt_estimator._get_tpu_run_config(hparams, True)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        config=run_config,
        use_tpu=hparams.use_tpu,
        train_batch_size=hparams.batch_size,
        eval_batch_size=hparams.batch_size,
        predict_batch_size=hparams.infer_batch_size)
  else:
    raise ValueError("Distributed input pipeline only supported on TPUs.")

  hooks = [pipeline]
  if hparams.use_async_checkpoint:
    hooks.append(
        async_checkpoint.AsyncCheckpointSaverHook(
            checkpoint_dir=hparams.out_dir,
            save_steps=int(
                hparams.num_examples_per_epoch / hparams.batch_size)))

  estimator.train(
      input_fn=pipeline, max_steps=hparams.num_train_steps, hooks=hooks)
  # Return value is not used
  return 0.0
def train_fn(hparams):
  """Train function."""
  hparams.tgt_sos_id, hparams.tgt_eos_id = _get_tgt_sos_eos_id(hparams)
  model_fn = make_model_fn(hparams)

  def print_log():
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=0)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=hparams.num_examples_per_epoch)

  if hparams.use_tpu_low_level_api:
    runner = create_train_runner_and_build_graph(hparams, model_fn)
    print_log()
    runner.train(0, hparams.num_train_steps)
    return 0.0

  input_fn = make_input_fn(hparams, tf.contrib.learn.ModeKeys.TRAIN)
  print_log()

  if hparams.use_tpu:
    run_config = _get_tpu_run_config(hparams, True)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        config=run_config,
        use_tpu=hparams.use_tpu,
        train_batch_size=hparams.batch_size,
        eval_batch_size=hparams.batch_size,
        predict_batch_size=hparams.infer_batch_size)
  else:
    distribution_strategy = get_distribution_strategy(hparams.num_gpus)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=hparams.out_dir,
        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))

  hooks = []
  if hparams.use_async_checkpoint:
    hooks.append(
        async_checkpoint.AsyncCheckpointSaverHook(
            checkpoint_dir=hparams.out_dir,
            save_steps=int(
                hparams.num_examples_per_epoch / hparams.batch_size)))

  estimator.train(
      input_fn=input_fn, max_steps=hparams.num_train_steps, hooks=hooks)
  # Return value is not used
  return 0.0
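# The GPU fallback above hands the Estimator a distribution strategy built by
# get_distribution_strategy(hparams.num_gpus). A minimal, hypothetical sketch
# of what such a helper could look like for this TF 1.x code path follows; it
# is an illustrative assumption, not this module's actual implementation.
#
# import tensorflow as tf  # TF 1.x
#
# def get_distribution_strategy_sketch(num_gpus):
#   if num_gpus <= 1:
#     return None  # default single-device training
#   # MirroredStrategy replicates the model across local GPUs and keeps the
#   # variables in sync with all-reduce.
#   return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)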
def main(argv):
  del argv  # Unused.
  global SUCCESS

  print(FLAGS.model_dir)
  if FLAGS.model_dir:
    print(FLAGS.model_dir)
  else:
    print(FLAGS.training_file_pattern)
    raise Exception('No model dir')

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  run_config, params = construct_run_config(FLAGS.iterations_per_loop)

  if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once':
    if params['train_with_low_level_api']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      trunner = train_low_level_runner.TrainLowLevelRunner(
          iterations=FLAGS.iterations_per_loop)
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      mlperf_log.ssd_print(key=mlperf_log.RUN_START)
      trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)
    else:
      mlperf_log.ssd_print(key=mlperf_log.RUN_START)

  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if params['eval_with_low_level_api']:
      params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
      erunner = eval_low_level_runner.EvalLowLevelRunner(
          eval_steps=int(FLAGS.eval_samples / FLAGS.eval_batch_size))
      input_fn = dataloader.SSDInputReader(
          FLAGS.validation_file_pattern,
          is_training=False,
          use_fake_data=FLAGS.use_fake_data)
      erunner.initialize(input_fn, params)
      erunner.build_model(ssd_model.ssd_model_fn, params)

  # TPU Estimator
  if FLAGS.mode == 'train':
    if params['train_with_low_level_api']:
      train_steps = int(
          (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
          FLAGS.train_batch_size)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
      trunner.train(train_steps)
      trunner.shutdown()
    else:
      if FLAGS.device == 'gpu':
        params['dataset_num_shards'] = 1
        params['dataset_index'] = 0
        train_params = dict(params)
        train_params['batch_size'] = FLAGS.train_batch_size
        train_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=train_params)
      else:
        train_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
      tf.logging.info(params)

      mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
      hooks = []
      if FLAGS.use_async_checkpoint:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      train_estimator.train(
          input_fn=dataloader.SSDInputReader(
              FLAGS.training_file_pattern,
              params['transpose_input'],
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size),
          hooks=hooks)

      if FLAGS.eval_after_training:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)
        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))
        eval_results = coco_metric.compute_map(predictions,
                                               FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'train_and_eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    current_step = 0
    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    threads = []
    for eval_step in ssd_constants.EVAL_STEPS:
      # Compute the actual eval steps based on the actual train_batch_size.
      steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                  FLAGS.train_batch_size)
      current_epoch = current_step // params['steps_per_epoch']
      # TODO(wangtao): figure out how to log for each epoch.
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=current_epoch)
      tf.logging.info('Starting training cycle for %d steps.' % steps)

      if params['train_with_low_level_api']:
        trunner.train(steps)
      else:
        run_config, params = construct_run_config(steps)
        if FLAGS.device == 'gpu':
          train_params = dict(params)
          train_params['batch_size'] = FLAGS.train_batch_size
          train_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=train_params)
        else:
          train_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              config=run_config,
              params=params)
        tf.logging.info(params)
        train_estimator.train(
            input_fn=dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
            steps=steps)

      if SUCCESS:
        break

      current_step = current_step + steps
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting evaluation cycle at step %d.' % current_step)
      mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
      # Run evaluation at the given step.
      if params['eval_with_low_level_api']:
        predictions = list(erunner.predict())
      else:
        if FLAGS.device == 'gpu':
          eval_params = dict(params)
          eval_params['batch_size'] = FLAGS.eval_batch_size
          eval_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=eval_params)
        else:
          eval_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              predict_batch_size=FLAGS.eval_batch_size,
              config=run_config,
              params=params)
        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

      t = threading.Thread(
          target=coco_eval,
          args=(predictions, current_epoch, current_step, summary_writer))
      threads.append(t)
      t.start()

    trunner.shutdown()
    for t in threads:
      t.join()

    # success is a string right now as boolean is not JSON serializable.
    if not SUCCESS:
      mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'false'})
      mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    summary_writer.close()

  elif FLAGS.mode == 'eval':
    if not params['eval_with_low_level_api']:
      if FLAGS.device == 'gpu':
        eval_params = dict(params)
        eval_params['batch_size'] = FLAGS.eval_batch_size
        eval_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=eval_params)
      else:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
    eval_epochs = [
        steps * ssd_constants.DEFAULT_BATCH_SIZE / FLAGS.train_batch_size //
        params['steps_per_epoch'] for steps in eval_steps
    ]
    # For 8x8 slices and above.
    if FLAGS.train_batch_size >= 4096:
      eval_epochs = [i * 2 for i in eval_epochs]
    tf.logging.info('Eval epochs: %s' % eval_epochs)

    # Run evaluation when there's a new checkpoint
    threads = []
    count = 1
    for ckpt in next_checkpoint(FLAGS.model_dir):
      print("current count is {}\n".format(count))
      count += 1
      if SUCCESS:
        break

      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('current step: %s' % current_step)
      tf.logging.info('current epoch: %s' % current_epoch)
      if not params[
          'eval_every_checkpoint'] and current_epoch not in eval_epochs:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))

        t = threading.Thread(
            target=coco_eval,
            args=(predictions, current_epoch, current_step, summary_writer))
        threads.append(t)
        t.start()

        # Terminate eval job when final checkpoint is reached
        total_step = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case,
        # the checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    for t in threads:
      t.join()
    if not SUCCESS:
      mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'false'})
      mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    summary_writer.close()

  elif FLAGS.mode == 'eval_once':
    if not params['eval_with_low_level_api']:
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=ssd_model.ssd_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    # Run evaluation when there's a new checkpoint
    for ckpt in next_checkpoint(FLAGS.model_dir):
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      print('current epoch: %s' % current_epoch)
      if FLAGS.eval_epoch < current_epoch:
        break
      if FLAGS.eval_epoch > current_epoch:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        coco_eval(predictions, current_epoch, current_step, summary_writer)

        # Terminate eval job when final checkpoint is reached
        total_step = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        if current_step >= total_step:
          if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
          print('Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case,
        # the checkpoint file could have been deleted already.
        print('Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    print('%d ending' % FLAGS.eval_epoch)
    summary_writer.close()
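# The loops above repeatedly convert between global steps and epochs with
# integer arithmetic: current_epoch = current_step // steps_per_epoch, and the
# total step budget is num_epochs * num_examples_per_epoch / train_batch_size.
# A standalone sketch of that arithmetic; the example numbers below are
# illustrative assumptions, not the benchmark's actual configuration.
def _example_step_epoch_math(num_epochs=64,
                             num_examples_per_epoch=120000,
                             train_batch_size=1024):
  steps_per_epoch = num_examples_per_epoch // train_batch_size
  total_steps = int((num_epochs * num_examples_per_epoch) / train_batch_size)
  # After training for three epochs' worth of steps, the epoch index is
  # recovered by integer division.
  current_step = 3 * steps_per_epoch
  current_epoch = current_step // steps_per_epoch
  return steps_per_epoch, total_steps, current_epoch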
def main(unused_argv):
  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  tpu = FLAGS.tpu or FLAGS.master
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      tpu if (tpu or FLAGS.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if FLAGS.use_train_runner:
    if FLAGS.mode == 'in_memory_eval':
      low_level_runner = train_and_eval_runner.TrainAndEvalRunner(
          iterations=FLAGS.iterations_per_loop,
          train_steps=FLAGS.train_steps,
          eval_steps=int(
              math.ceil(FLAGS.num_eval_images / FLAGS.eval_batch_size)))
    else:
      trunner = train_runner.TrainRunner(
          iterations=FLAGS.iterations_per_loop, train_steps=FLAGS.train_steps)

  if FLAGS.mode != 'eval':
    mlp_log.mlperf_print('init_start', None)

  if FLAGS.use_async_checkpointing or FLAGS.mode == 'in_memory_eval':
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

  mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)

  if not FLAGS.use_train_runner:
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        save_summary_steps=0,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            tpu_job_name=FLAGS.tpu_job_name,
            # num_shards=FLAGS.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=False)

  assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
      'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)
  use_bfloat16 = FLAGS.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            num_cores=FLAGS.num_prefetch_threads,
            prefetch_depth_auto_tune=FLAGS.prefetch_depth_auto_tune,
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  if FLAGS.use_train_runner and FLAGS.mode == 'train':
    params = {'batch_size': FLAGS.train_batch_size}
    trunner.initialize(imagenet_train.input_fn, resnet_model_fn, params)

  if FLAGS.use_train_runner and FLAGS.mode == 'in_memory_eval':
    params = {'batch_size': FLAGS.train_batch_size}
    low_level_runner.initialize(imagenet_train.input_fn,
                                imagenet_eval.input_fn, resnet_model_fn,
                                params)

  if FLAGS.mode != 'eval':
    mlp_log.mlperf_print('init_stop', None)
    mlp_log.mlperf_print('run_start', None)

  mlp_log.mlperf_print('num_train_examples', FLAGS.num_train_images)
  mlp_log.mlperf_print('num_eval_examples', FLAGS.num_eval_images)

  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  eval_steps = int(math.ceil(FLAGS.num_eval_images / FLAGS.eval_batch_size))

  if FLAGS.mode == 'eval':
    params = {'batch_size': FLAGS.eval_batch_size}
    if FLAGS.use_eval_runner:
      erunner = eval_runner.EvalRunner(
          input_fn=imagenet_eval.input_fn,
          model_fn=resnet_model_fn,
          params=params,
          num_steps=eval_steps)

    success = False
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      current_step = int(os.path.basename(ckpt).split('-')[1])
      try:
        start_timestamp = time.time()  # This time will include compilation time
        if FLAGS.use_eval_runner:
          eval_results = erunner.eval(
              num_steps=eval_steps, checkpoint_path=ckpt)
        else:
          eval_results = resnet_classifier.evaluate(
              input_fn=imagenet_eval.input_fn,
              steps=eval_steps,
              checkpoint_path=ckpt)
        mlp_log.mlperf_print(
            'eval_accuracy',
            float(eval_results['top_1_accuracy']),
            metadata={'epoch_num': max(current_step // steps_per_epoch - 1, 0)})
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        if eval_results['top_1_accuracy'] >= FLAGS.stop_threshold:
          success = True
          mlp_log.mlperf_print('run_stop', None,
                               metadata={'status': 'success'})
          mlp_log.mlperf_print('run_final', None)
          break

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)

    if not success:
      mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})
      mlp_log.mlperf_print('run_final', None)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    if FLAGS.mode == 'train':
      if FLAGS.use_train_runner:
        trunner.train()
      else:
        hooks = []
        if FLAGS.use_async_checkpointing:
          hooks.append(
              async_checkpoint.AsyncCheckpointSaverHook(
                  checkpoint_dir=FLAGS.model_dir,
                  save_steps=max(100, FLAGS.iterations_per_loop)))
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn,
            max_steps=FLAGS.train_steps,
            hooks=hooks)
    elif FLAGS.mode == 'in_memory_eval':
      if FLAGS.use_train_runner:
        low_level_runner.train_and_eval(
            enable_tracing=FLAGS.enable_auto_tracing)
        low_level_runner.shutdown()
      else:
        steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
        hooks = []
        hooks.append(
            in_memory_eval.TPUInMemoryEvalHook(
                resnet_classifier,
                imagenet_eval.input_fn,
                steps_per_epoch,
                stop_threshold=FLAGS.stop_threshold,
                steps=FLAGS.num_eval_images // FLAGS.eval_batch_size,
                every_n_iter=steps_per_epoch * 4))
        if FLAGS.use_async_checkpointing:
          hooks.append(
              async_checkpoint.AsyncCheckpointSaverHook(
                  checkpoint_dir=FLAGS.model_dir,
                  save_steps=max(100, steps_per_epoch * 4)))
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn,
            max_steps=FLAGS.train_steps,
            hooks=hooks)
    else:
      current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
      steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
      tf.logging.info(
          'Training for %d steps (%.2f epochs in total). Current'
          ' step %d.', FLAGS.train_steps,
          FLAGS.train_steps / steps_per_epoch, current_step)

      start_timestamp = time.time()  # This time will include compilation time

      assert FLAGS.mode == 'train_and_eval'
      success = False
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        mlp_log.mlperf_print(
            'eval_accuracy',
            float(eval_results['top_1_accuracy']),
            metadata={'epoch_num': max(current_step // steps_per_epoch - 1, 0)})
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        if eval_results['top_1_accuracy'] >= FLAGS.stop_threshold:
          success = True
          mlp_log.mlperf_print('run_stop', None,
                               metadata={'status': 'success'})
          mlp_log.mlperf_print('run_final', None)
          break

      elapsed_time = int(time.time() - start_timestamp)
      if not success:
        mlp_log.mlperf_print('run_stop', None, metadata={'status': 'abort'})
        mlp_log.mlperf_print('run_final', None)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #   https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      resnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)

  if FLAGS.use_train_runner and FLAGS.mode == 'train':
    trunner.shutdown()
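# The eval loops in these drivers recover the global step from a checkpoint
# path via int(os.path.basename(ckpt).split('-')[1]), relying on the standard
# "model.ckpt-<step>" prefix that tf.train.Saver writes. A small, runnable
# sketch of that parsing; the path used below is made up for illustration.
import os

def _step_from_checkpoint_path(ckpt_path):
  # '/tmp/model_dir/model.ckpt-12000' -> 12000
  return int(os.path.basename(ckpt_path).split('-')[1])

assert _step_from_checkpoint_path('/tmp/model_dir/model.ckpt-12000') == 12000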
def main(argv):
  del argv  # Unused.
  global SUCCESS

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  run_config, params = construct_run_config(FLAGS.iterations_per_loop)
  mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
  mlp_log.mlperf_print('opt_base_learning_rate', params['base_learning_rate'])
  mlp_log.mlperf_print('opt_weight_decay', params['weight_decay'])
  mlp_log.mlperf_print(
      'model_bn_span', FLAGS.train_batch_size // FLAGS.num_shards *
      params['distributed_group_size'])

  if FLAGS.mode in ('eval', 'eval_once'):
    coco_gt = coco_metric.create_coco(
        FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])

  if FLAGS.mode == 'train_and_eval' and params[
      'in_memory_eval'] and FLAGS.train_batch_size != FLAGS.eval_batch_size:
    raise RuntimeError(
        'train batch size should be equal to eval batch size for in memory '
        'eval.')

  if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once' and not params[
      'in_memory_eval']:
    if params['train_with_low_level_api'] and not params['in_memory_eval']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      input_partition_dims = FLAGS.input_partition_dims
      if input_partition_dims is not None and params['transpose_input']:
        if params['batch_size'] > 8:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 3, 0]
          ]
        else:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 0, 3]
          ]
      trunner = train_low_level_runner.TrainLowLevelRunner(
          input_partition_dims=[input_partition_dims, None]
          if FLAGS.input_partition_dims else None,
          num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
          if FLAGS.input_partition_dims else 1,
          iterations=FLAGS.iterations_per_loop,
      )
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)

  if params[
      'eval_with_low_level_api'] and FLAGS.mode != 'train' and not params[
          'in_memory_eval']:
    params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
    eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
    if params['distributed_eval']:
      erunner = dist_eval_low_level_runner.DistEvalLowLevelRunner(
          eval_steps=eval_steps)
    else:
      erunner = eval_low_level_runner.EvalLowLevelRunner(
          eval_steps=eval_steps)
    input_fn = dataloader.SSDInputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        use_fake_data=FLAGS.use_fake_data,
        distributed_eval=params['distributed_eval'],
        count=eval_steps * FLAGS.eval_batch_size)
    erunner.initialize(input_fn, params)
    erunner.build_model(ssd_model.ssd_model_fn, params)

  # TPU Estimator
  if FLAGS.mode == 'train':
    if params['train_with_low_level_api']:
      train_steps = int(
          (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
          FLAGS.train_batch_size)
      trunner.train(train_steps)
      trunner.shutdown()
    else:
      if FLAGS.device == 'gpu':
        train_params = dict(params)
        train_params['batch_size'] = FLAGS.train_batch_size
        train_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=train_params)
      else:
        train_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
      tf.logging.info(params)

      hooks = []
      if FLAGS.use_async_checkpoint:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      train_estimator.train(
          input_fn=dataloader.SSDInputReader(
              FLAGS.training_file_pattern,
              params['transpose_input'],
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size),
          hooks=hooks)

      if FLAGS.eval_after_training:
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict())
        else:
          eval_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              predict_batch_size=FLAGS.eval_batch_size,
              config=run_config,
              params=params)
          predictions = list(
              eval_estimator.predict(
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        eval_results = coco_metric.compute_map(
            predictions,
            coco_gt,
            use_cpp_extension=params['use_cocoeval_cc'],
            nms_on_tpu=params['nms_on_tpu'])
        tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'train_and_eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    if params['in_memory_eval']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
      input_partition_dims = FLAGS.input_partition_dims
      if input_partition_dims is not None and params['transpose_input']:
        if params['batch_size'] > 8:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 3, 0]
          ]
        else:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 0, 3]
          ]
      runner = train_and_eval_low_level_runner.TrainAndEvalLowLevelRunner(
          iterations=FLAGS.iterations_per_loop,
          eval_steps=eval_steps,
          input_partition_dims=input_partition_dims
          if FLAGS.input_partition_dims else None,
          num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
          if FLAGS.input_partition_dims else 1,
      )
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      # Init for eval.
      eval_input_fn = dataloader.SSDInputReader(
          FLAGS.validation_file_pattern,
          is_training=False,
          use_fake_data=FLAGS.use_fake_data,
          distributed_eval=True,
          count=eval_steps * FLAGS.eval_batch_size)
      runner.initialize(input_fn, eval_input_fn, ssd_model.ssd_model_fn,
                        params)
      train_steps = int(
          (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
          FLAGS.train_batch_size)
      runner.train_and_eval(train_steps)
      runner.shutdown()
      return

    current_step = 0
    threads = []
    for eval_step in ssd_constants.EVAL_STEPS:
      # Compute the actual eval steps based on the actual train_batch_size.
      steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                  FLAGS.train_batch_size)
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting training cycle for %d steps.' % steps)

      if params['train_with_low_level_api']:
        trunner.train(steps, current_step)
      else:
        run_config, params = construct_run_config(steps)
        if FLAGS.device == 'gpu':
          train_params = dict(params)
          train_params['batch_size'] = FLAGS.train_batch_size
          train_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=train_params)
        else:
          train_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              config=run_config,
              params=params)
        tf.logging.info(params)
        train_estimator.train(
            input_fn=dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
            steps=steps)

      if SUCCESS:
        break

      current_step = current_step + steps
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting evaluation cycle at step %d.' % current_step)
      # Run evaluation at the given step.
      if params['eval_with_low_level_api']:
        # TODO(b/123313070): Fix convergence discrepancy
        # for train and distributed eval on POD with low level API.
        predictions = list(erunner.predict())
      else:
        if FLAGS.device == 'gpu':
          eval_params = dict(params)
          eval_params['batch_size'] = FLAGS.eval_batch_size
          eval_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=eval_params)
        else:
          eval_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              predict_batch_size=FLAGS.eval_batch_size,
              config=run_config,
              params=params)
        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

      t = threading.Thread(
          target=coco_eval,
          args=(predictions, current_epoch, current_step, summary_writer,
                coco_gt, params['use_cocoeval_cc'], params['nms_on_tpu']))
      threads.append(t)
      t.start()

    if params['train_with_low_level_api']:
      trunner.shutdown()
    for t in threads:
      t.join()
    summary_writer.close()

  elif FLAGS.mode == 'eval':
    if not params['eval_with_low_level_api']:
      if FLAGS.device == 'gpu':
        eval_params = dict(params)
        eval_params['batch_size'] = FLAGS.eval_batch_size
        eval_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=eval_params)
      else:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
    eval_epochs = [
        steps * ssd_constants.DEFAULT_BATCH_SIZE / FLAGS.train_batch_size //
        params['steps_per_epoch'] for steps in eval_steps
    ]
    # For 8x8 slices and above.
    if FLAGS.train_batch_size >= 4096:
      eval_epochs = [i * 2 for i in eval_epochs]
    tf.logging.info('Eval epochs: %s' % eval_epochs)

    # Run evaluation when there's a new checkpoint
    threads = []
    for ckpt in next_checkpoint(FLAGS.model_dir):
      if SUCCESS:
        break

      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('current epoch: %s' % current_epoch)
      if not params[
          'eval_every_checkpoint'] and current_epoch not in eval_epochs:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))

        t = threading.Thread(
            target=coco_eval,
            args=(predictions, current_epoch, current_step, summary_writer,
                  coco_gt))
        threads.append(t)
        t.start()

        # Terminate eval job when final checkpoint is reached
        total_step = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case,
        # the checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    for t in threads:
      t.join()
    summary_writer.close()

  elif FLAGS.mode == 'eval_once':
    if not params['eval_with_low_level_api']:
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=ssd_model.ssd_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    # Run evaluation when there's a new checkpoint
    for ckpt in next_checkpoint(FLAGS.model_dir):
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      print('current epoch: %s' % current_epoch)
      if FLAGS.eval_epoch < current_epoch:
        break
      if FLAGS.eval_epoch > current_epoch:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        coco_eval(predictions, current_step, summary_writer, coco_gt,
                  params['use_cocoeval_cc'], params['nms_on_tpu'])

        # Terminate eval job when final checkpoint is reached
        total_step = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        if current_step >= total_step:
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case,
        # the checkpoint file could have been deleted already.
        print('Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    print('%d ending' % FLAGS.eval_epoch)
    summary_writer.close()
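# The 'eval' branch above maps the cumulative ssd_constants.EVAL_STEPS
# schedule, which is defined at a reference batch size
# (ssd_constants.DEFAULT_BATCH_SIZE), onto epoch indices for the actual
# train_batch_size. A standalone sketch of that conversion; the schedule,
# batch sizes, and steps_per_epoch below are illustrative assumptions, not
# the real constants.
import numpy as np

def _example_eval_epochs(eval_step_schedule=(40000, 10000, 10000),
                         default_batch_size=32,
                         train_batch_size=1024,
                         steps_per_epoch=114):
  # Cumulative step milestones at the reference batch size.
  eval_steps = np.cumsum(eval_step_schedule).tolist()
  # Rescale to the actual batch size, then convert steps to whole epochs.
  return [
      steps * default_batch_size / train_batch_size // steps_per_epoch
      for steps in eval_steps
  ]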