def main(argv):
  del argv  # Unused.

  if FLAGS.start_profiler_server:
    # Starts the profiler server. It will perform profiling when it receives
    # a profiling request.
    profiler.start_profiler_server(FLAGS.profiler_port_number)

  if FLAGS.use_tpu:
    if FLAGS.distribution_strategy is None:
      tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
          FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
      tpu_grpc_url = tpu_cluster_resolver.get_master()
      tf.Session.reset(tpu_grpc_url)
    else:
      raise RuntimeError(
          'Distribution strategy must be None when --use_tpu is True.')
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode not in ['train', 'eval', 'train_and_eval']:
    raise ValueError('Unrecognized --mode: %s' % FLAGS.mode)

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')
  if FLAGS.mode == 'train_and_eval':
    if FLAGS.distribution_strategy is not None:
      raise RuntimeError('You must use --distribution_strategy=None for '
                         'train_and_eval.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  config_file = FLAGS.config_file
  hparams.num_epochs = FLAGS.num_epochs
  if config_file and tf.gfile.Exists(config_file):
    # Load params from file.
    with tf.gfile.Open(config_file, 'r') as f:
      values_map = json.load(f)
      hparams.override_from_dict(values_map)
  hparams.parse(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` mean the same thing;
  # following the API, both terms are used here.
  if FLAGS.use_spatial_partition:
    # Checks that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on
    # `partition_dims`, the `image_size` and the `max_level` in hparams, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536 and `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    image_size = hparams.get('image_size')
    for level in range(hparams.get('min_level'), hparams.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None

    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if FLAGS.auto_mixed_precision and FLAGS.distribution_strategy:
    config_proto.graph_options.rewrite_options.auto_mixed_precision = (
        rewriter_config_pb2.RewriterConfig.ON)

  if FLAGS.distribution_strategy is None:
    # Uses TPUEstimator.
    params = dict(
        hparams.values(),
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )
    tpu_config = contrib_tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=contrib_tpu.InputPipelineConfig
        .PER_HOST_V2)

    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
    )
  else:
    if FLAGS.num_gpus < 0:
      raise ValueError('`num_gpus` cannot be negative.')

    def _per_device_batch_size(batch_size, num_gpus):
      """Calculates the per-GPU batch size for the Estimator.

      Args:
        batch_size: Global batch size to be divided among devices.
        num_gpus: How many GPUs are used per worker.

      Returns:
        Batch size per device.

      Raises:
        ValueError: if batch_size is not divisible by the number of devices.
      """
      if num_gpus <= 1:
        return batch_size

      remainder = batch_size % num_gpus
      if remainder:
        raise ValueError(
            'Batch size must be a multiple of the number of GPUs per worker.')
      return int(batch_size / num_gpus)

    # Uses Estimator.
    params = dict(
        hparams.values(),
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
        use_bfloat16=False,
        auto_mixed_precision=FLAGS.auto_mixed_precision,
        dataset_max_intra_op_parallelism=FLAGS.dataset_max_intra_op_parallelism,
        dataset_private_threadpool_size=FLAGS.dataset_private_threadpool_size,
    )

    if FLAGS.distribution_strategy == 'mirrored':
      params['batch_size'] = _per_device_batch_size(FLAGS.train_batch_size,
                                                    FLAGS.num_gpus)

      if FLAGS.num_gpus == 0:
        devices = ['device:CPU:0']
      else:
        devices = ['device:GPU:{}'.format(i) for i in range(FLAGS.num_gpus)]

      if FLAGS.all_reduce_alg:
        dist_strat = tf.distribute.MirroredStrategy(
            devices=devices,
            cross_device_ops=contrib_distribute.AllReduceCrossDeviceOps(
                FLAGS.all_reduce_alg, num_packs=2))
      else:
        dist_strat = tf.distribute.MirroredStrategy(devices=devices)

      run_config = tf.estimator.RunConfig(
          session_config=config_proto,
          train_distribute=dist_strat,
          eval_distribute=dist_strat)
    elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
      local_device_protos = device_lib.list_local_devices()
      params['batch_size'] = _per_device_batch_size(
          FLAGS.train_batch_size,
          sum([1 for d in local_device_protos if d.device_type == 'GPU']))

      if FLAGS.worker_hosts is None:
        tf_config_json = json.loads(os.environ.get('TF_CONFIG', '{}'))
        # Replaces master with chief.
        if tf_config_json:
          if 'master' in tf_config_json['cluster']:
            tf_config_json['cluster']['chief'] = tf_config_json['cluster'].pop(
                'master')
          if tf_config_json['task']['type'] == 'master':
            tf_config_json['task']['type'] = 'chief'
          os.environ['TF_CONFIG'] = json.dumps(tf_config_json)

        tf_config_json = json.loads(os.environ['TF_CONFIG'])
        worker_hosts = tf_config_json['cluster']['worker']
        worker_hosts.extend(tf_config_json['cluster'].get('chief', []))
      else:
        # Sets the TF_CONFIG environment variable.
        worker_hosts = FLAGS.worker_hosts.split(',')
        os.environ['TF_CONFIG'] = json.dumps({
            'cluster': {
                'worker': worker_hosts
            },
            'task': {
                'type': 'worker',
                'index': FLAGS.task_index
            }
        })

      dist_strat = tf.distribute.experimental.MultiWorkerMirroredStrategy(
          communication=_COLLECTIVE_COMMUNICATION_OPTIONS[FLAGS.all_reduce_alg])
      run_config = tf.estimator.RunConfig(
          session_config=config_proto, train_distribute=dist_strat)
    else:
      raise ValueError('Unrecognized distribution strategy.')

  if FLAGS.mode == 'train':
    if FLAGS.model_dir is not None:
      if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)
      with tf.gfile.Open(os.path.join(FLAGS.model_dir, 'hparams.json'),
                         'w') as f:
        json.dump(hparams.values(), f, sort_keys=True, indent=2)
    tf.logging.info(params)
    if FLAGS.distribution_strategy is None:
      total_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        FLAGS.train_batch_size)
      train_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          max_steps=total_steps)

      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      if FLAGS.eval_after_training:
        if FLAGS.val_json_file is None:
          raise RuntimeError('You must specify --val_json_file for evaluation.')

        eval_results = evaluation.evaluate(
            eval_estimator,
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            num_eval_samples=FLAGS.eval_samples,
            eval_batch_size=FLAGS.eval_batch_size,
            validation_json_file=FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)
        output_dir = os.path.join(FLAGS.model_dir, 'train_eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        evaluation.write_summary(eval_results, summary_writer, total_steps)
    else:
      train_estimator = tf.estimator.Estimator(
          model_fn=retinanet_model.est_retinanet_model_fn,
          model_dir=FLAGS.model_dir,
          config=run_config,
          params=params)
      if FLAGS.distribution_strategy == 'mirrored':
        total_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size)
        tf.logging.info('Starting `MirroredStrategy` training...')
        train_estimator.train(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
            max_steps=total_steps)
      elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
        total_steps = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            (len(worker_hosts) * FLAGS.train_batch_size))
        train_spec = tf.estimator.TrainSpec(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
            max_steps=total_steps)
        # Dummy EvalSpec: eval is not supported with multi_worker_mirrored
        # (see the 'eval' branch below), but train_and_evaluate requires one.
        eval_spec = tf.estimator.EvalSpec(input_fn=tf.data.Dataset)
        tf.logging.info('Starting `MultiWorkerMirroredStrategy` training...')
        tf.estimator.train_and_evaluate(train_estimator, train_spec, eval_spec)
      else:
        raise ValueError('Unrecognized distribution strategy.')

  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    # Also, disable use_bfloat16 for eval on CPU/GPU.
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')
    eval_params = dict(
        params,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )
    if FLAGS.distribution_strategy is None:
      # Uses TPUEstimator.
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
    else:
      # Uses Estimator.
      if FLAGS.distribution_strategy == 'multi_worker_mirrored':
        raise ValueError(
            '--distribution_strategy=multi_worker_mirrored is not supported '
            'for eval.')
      elif FLAGS.distribution_strategy == 'mirrored':
        eval_estimator = tf.estimator.Estimator(
            model_fn=retinanet_model.est_retinanet_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=params)
      else:
        raise ValueError('Unrecognized distribution strategy.')

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    # Run evaluation whenever there's a new checkpoint.
    for ckpt in contrib_training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = evaluation.evaluate(
            eval_estimator,
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            num_eval_samples=FLAGS.eval_samples,
            eval_batch_size=FLAGS.eval_batch_size,
            validation_json_file=FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        evaluation.write_summary(eval_results, summary_writer, current_step)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

  elif FLAGS.mode == 'train_and_eval':
    if FLAGS.distribution_strategy is not None:
      raise ValueError(
          'Distribution strategy is not implemented for --mode=train_and_eval.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

    output_dir = os.path.join(FLAGS.model_dir, 'train_and_eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    num_cycles = int(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                     FLAGS.num_steps_per_eval)
    for cycle in range(num_cycles):
      tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
      train_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          steps=FLAGS.num_steps_per_eval)

      tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = evaluation.evaluate(
          eval_estimator,
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          num_eval_samples=FLAGS.eval_samples,
          eval_batch_size=FLAGS.eval_batch_size,
          validation_json_file=FLAGS.val_json_file)
      tf.logging.info('Evaluation results: %s' % eval_results)
      current_step = int(cycle * FLAGS.num_steps_per_eval)
      evaluation.write_summary(eval_results, summary_writer, current_step)

  else:
    tf.logging.info('Mode not found.')

  if FLAGS.model_dir:
    tf.logging.info('Exporting saved model.')
    eval_params = dict(
        params,
        use_tpu=True,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = contrib_tpu.TPUEstimator(
        model_fn=retinanet_model.tpu_retinanet_model_fn,
        use_tpu=True,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.inference_batch_size,
        config=run_config,
        params=eval_params)

    export_path = eval_estimator.export_saved_model(
        export_dir_base=FLAGS.model_dir,
        serving_input_receiver_fn=build_serving_input_fn(
            hparams.image_size, FLAGS.inference_batch_size))
    if FLAGS.add_warmup_requests:
      inference_warmup.write_warmup_requests(
          export_path,
          FLAGS.model_name,
          hparams.image_size,
          batch_sizes=[FLAGS.inference_batch_size])
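
# Illustrative sketch (not from the original script). The spatial-partition
# comments in main() above describe when a feature level's label tensors can
# be split across cores. A minimal, self-contained version of that
# feasibility rule, assuming the same flag semantics (`input_partition_dims`
# splits [batch, height, width, channels]); the example values below are
# hypothetical, not real flag defaults.
def _sketch_partitionable_levels(image_size=1536,
                                 min_level=3,
                                 max_level=9,
                                 input_partition_dims=(1, 4, 2, 1)):
  partitionable = {}
  for level in range(min_level, max_level + 1):
    spatial_dim = image_size // (2**level)
    # A level is partitionable iff its spatial dim is a multiple of every
    # partition dim (the same test _can_partition performs via np.where).
    partitionable[level] = all(
        spatial_dim % d == 0 for d in input_partition_dims)
  return partitionable

# With these example values, levels 8 and 9 come out non-partitionable
# (6 % 4 != 0, 3 % 4 != 0), matching the worked example in the comment.
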
def main(argv):
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode == 'eval':
    if FLAGS.valid_data_dir is None:
      raise RuntimeError('You must specify --valid_data_dir for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  run_config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator
  if FLAGS.mode == 'train':
    train_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          skip_crowd=False,
          resnet_checkpoint=None,
          is_training_bn=False,
          use_bfloat16=False,
      )
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=1,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_steps)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        skip_crowd=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                            is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)

  else:
    tf.logging.info('Mode not found.')
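
# Illustrative sketch (not from the original script). The revisions above all
# size the training run the same way: num_epochs passes over
# num_examples_per_epoch samples at the global train_batch_size, so
# max_steps = epochs * examples / batch. A worked example with hypothetical
# numbers (roughly a COCO-sized epoch), not real flag defaults:
def _sketch_total_steps(num_epochs=15,
                        num_examples_per_epoch=120000,
                        train_batch_size=64):
  return int((num_epochs * num_examples_per_epoch) / train_batch_size)

# _sketch_total_steps() == 28125; the eval loops compare the step parsed out
# of each checkpoint name against this budget to decide when to stop waiting
# for new checkpoints.
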
def main(argv):
  del argv  # Unused.

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError('You must specify either --master or --tpu_name.')

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                      '--tpu_name and using --master.')
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)
  tf.Session.reset(tpu_grpc_url)
  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator
  if FLAGS.mode == 'train':
    estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_50_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    estimator.train(
        input_fn=dataloader.InputReader(FLAGS.train_data_dir,
                                        is_training=True),
        steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                  FLAGS.train_batch_size))

  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )
    estimator_eval = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_50_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = estimator_eval.evaluate(
            input_fn=dataloader.InputReader(FLAGS.valid_data_dir,
                                            is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)

  else:
    tf.logging.info('Mode not found.')
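
# Illustrative sketch (not from the original script). Every eval loop in this
# file terminates by parsing the global step out of the checkpoint basename;
# TF 1.x checkpoints are named like 'model.ckpt-<global_step>'. A standalone
# version of that one-liner (assumes `os` is imported at module top, as the
# surrounding code does):
def _sketch_step_from_checkpoint(ckpt_path):
  # e.g. '/tmp/model_dir/model.ckpt-28125' -> 28125
  return int(os.path.basename(ckpt_path).split('-')[1])
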
def main(argv):
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` mean the same thing;
  # following the API, both terms are used here.
  if FLAGS.use_spatial_partition:
    # Checks that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on
    # `partition_dims`, the `image_size` and the `max_level` in hparams, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536 and `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    image_size = hparams.get('image_size')
    for level in range(hparams.get('min_level'), hparams.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None

    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      hparams.values(),
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  tpu_config = tf.contrib.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      .PER_HOST_V2)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
  )

  # TPU Estimator
  if FLAGS.mode == 'train':
    tf.logging.info(params)
    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    # Run evaluation after training finishes.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)
    if FLAGS.eval_after_training:
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      tf.logging.info('Eval results: %s' % eval_results)
    if FLAGS.model_dir:
      eval_estimator.export_saved_model(
          export_dir_base=FLAGS.model_dir,
          serving_input_receiver_fn=lambda: serving_input_fn(
              hparams.image_size))

  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    # Also, disable use_bfloat16 for eval on CPU/GPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.contrib.training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                            is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

        eval_estimator.export_saved_model(
            export_dir_base=FLAGS.model_dir,
            serving_input_receiver_fn=lambda: serving_input_fn(
                hparams.image_size))

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)

  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.num_epochs):
      tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
      train_estimator = tf.contrib.tpu.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=True),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = tf.contrib.tpu.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      tf.logging.info('Evaluation results: %s' % eval_results)

    eval_estimator.export_saved_model(
        export_dir_base=FLAGS.model_dir,
        serving_input_receiver_fn=lambda: serving_input_fn(hparams.image_size))

  else:
    tf.logging.info('Mode not found.')
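
# Illustrative sketch (not from the original script). The export calls above
# pass `lambda: serving_input_fn(hparams.image_size)`; serving_input_fn itself
# is defined elsewhere in the repo. A minimal receiver fn of that shape for a
# fixed-size image model; the float32 dtype and the 'images' tensor name are
# assumptions here, not the repo's actual serving signature.
def _sketch_serving_input_fn(image_size):
  images = tf.placeholder(
      dtype=tf.float32,
      shape=[None, image_size, image_size, 3],
      name='images')
  return tf.estimator.export.ServingInputReceiver(
      features={'images': images}, receiver_tensors={'images': images})
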
def main(argv):
  del argv  # Unused.

  # if FLAGS.use_tpu:
  #   tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
  #       FLAGS.tpu,
  #       zone=FLAGS.tpu_zone,
  #       project=FLAGS.gcp_project)
  #   tpu_grpc_url = tpu_cluster_resolver.get_master()
  #   tf.Session.reset(tpu_grpc_url)
  # else:
  #   tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    # if FLAGS.val_json_file is None:
    #   raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      # num_shards=FLAGS.num_cores,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      # val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  # if FLAGS.use_xla and not FLAGS.use_tpu:
  #   config_proto.graph_options.optimizer_options.global_jit_level = (
  #       tf.OptimizerOptions.ON_1)
  # tpu_config = tf.contrib.tpu.TPUConfig(
  #     FLAGS.iterations_per_loop,
  #     num_shards=FLAGS.num_cores,
  #     per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
  #     .PER_HOST_V2)
  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop)
  # run_config = tf.contrib.tpu.RunConfig(
  #     cluster=tpu_cluster_resolver,
  #     evaluation_master=FLAGS.eval_master,
  #     model_dir=FLAGS.model_dir,
  #     log_step_count_steps=FLAGS.iterations_per_loop,
  #     session_config=config_proto,
  #     tpu_config=tpu_config,
  # )
  model_fn = retinanet_model.retinanet_model_fn

  # TPU Estimator
  if FLAGS.mode == 'train':
    tf.logging.info(params)
    train_estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=FLAGS.model_dir,
        config=run_config,
        params=params)
    # train_estimator = tf.contrib.tpu.TPUEstimator(
    #     model_fn=model_fn,
    #     use_tpu=FLAGS.use_tpu,
    #     train_batch_size=FLAGS.train_batch_size,
    #     config=run_config,
    #     params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          # use_tpu=False,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = tf.estimator.Estimator(
          model_fn=retinanet_model.retinanet_model_fn,
          # train_batch_size=FLAGS.train_batch_size,
          # eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      # eval_estimator = tf.contrib.tpu.TPUEstimator(
      #     model_fn=retinanet_model.retinanet_model_fn,
      #     use_tpu=False,
      #     train_batch_size=FLAGS.train_batch_size,
      #     eval_batch_size=FLAGS.eval_batch_size,
      #     config=run_config,
      #     params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        # use_tpu=False,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )
    eval_estimator = tf.estimator.Estimator(
        model_fn=retinanet_model.retinanet_model_fn,
        # train_batch_size=FLAGS.train_batch_size,
        # eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    # for ckpt in checkpoints_iterator(
    #     FLAGS.model_dir,
    #     min_interval_secs=FLAGS.min_eval_interval,
    #     timeout=FLAGS.eval_timeout,
    #     timeout_fn=terminate_eval):
    #   tf.logging.info('Starting to evaluate.')
    #   try:
    #     eval_results = eval_estimator.evaluate(
    #         input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
    #                                         is_training=False),
    #         steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
    #     tf.logging.info('Eval results: %s' % eval_results)
    #     # Terminate eval job when final checkpoint is reached.
    #     current_step = int(os.path.basename(ckpt).split('-')[1])
    #     total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
    #                      FLAGS.train_batch_size)
    #     if current_step >= total_step:
    #       tf.logging.info('Evaluation finished after training step %d' %
    #                       current_step)
    #       break
    #   except tf.errors.NotFoundError:
    #     # Since the coordinator is on a different job than the TPU worker,
    #     # sometimes the TPU worker does not finish initializing until long
    #     # after the CPU job tells it to start evaluating. In this case, the
    #     # checkpoint file could have been deleted already.
    #     tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint'
    #                     % ckpt)

  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.num_epochs):
      tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
      train_estimator = tf.estimator.Estimator(
          model_fn=retinanet_model.retinanet_model_fn,
          # train_batch_size=FLAGS.train_batch_size,
          # eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)
      # train_estimator = tf.contrib.tpu.TPUEstimator(
      #     model_fn=retinanet_model.retinanet_model_fn,
      #     use_tpu=FLAGS.use_tpu,
      #     train_batch_size=FLAGS.train_batch_size,
      #     config=run_config,
      #     params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=True),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = tf.estimator.Estimator(
          model_fn=retinanet_model.retinanet_model_fn,
          # train_batch_size=FLAGS.train_batch_size,
          # eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      # eval_estimator = tf.contrib.tpu.TPUEstimator(
      #     model_fn=retinanet_model.retinanet_model_fn,
      #     use_tpu=False,
      #     train_batch_size=FLAGS.train_batch_size,
      #     eval_batch_size=FLAGS.eval_batch_size,
      #     config=run_config,
      #     params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      tf.logging.info('Evaluation results: %s' % eval_results)

  else:
    tf.logging.info('Mode not found.')
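
# Illustrative sketch (not from the original script). The
# multi_worker_mirrored branch of the first revision rewrites a legacy
# 'master' entry in TF_CONFIG to the 'chief' task that Estimator-era
# MultiWorkerMirroredStrategy expects. The same transformation as a pure
# function over the parsed JSON, for clarity:
def _sketch_normalize_tf_config(tf_config):
  cluster = tf_config.get('cluster', {})
  if 'master' in cluster:
    cluster['chief'] = cluster.pop('master')
  if tf_config.get('task', {}).get('type') == 'master':
    tf_config['task']['type'] = 'chief'
  return tf_config

# Example:
#   _sketch_normalize_tf_config(
#       {'cluster': {'master': ['host0:2222'], 'worker': ['host1:2222']},
#        'task': {'type': 'master', 'index': 0}})
#   -> {'cluster': {'chief': ['host0:2222'], 'worker': ['host1:2222']},
#       'task': {'type': 'chief', 'index': 0}}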