def main(argv):
  del argv  # Unused.

  if FLAGS.start_profiler_server:
    # Starts the profiler server. It will perform profiling when it receives a
    # profiling request.
    profiler.start_profiler_server(FLAGS.profiler_port_number)

  if FLAGS.use_tpu:
    if FLAGS.distribution_strategy is None:
      tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
          FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
      tpu_grpc_url = tpu_cluster_resolver.get_master()
      tf.Session.reset(tpu_grpc_url)
    else:
      raise RuntimeError(
          'Distribution strategy must be None when --use_tpu is True.')
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode not in ['train', 'eval', 'train_and_eval']:
    raise ValueError('Unrecognized --mode: %s' % FLAGS.mode)

  # Check data path.
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')
  if FLAGS.mode == 'train_and_eval':
    if FLAGS.distribution_strategy is not None:
      raise RuntimeError('You must use --distribution_strategy=None for '
                         'train_and_eval.')

  # Parse hparams.
  hparams = retinanet_model.default_hparams()
  config_file = FLAGS.config_file
  hparams.num_epochs = FLAGS.num_epochs
  if config_file and tf.gfile.Exists(config_file):
    # Load params from file.
    with tf.gfile.Open(config_file, 'r') as f:
      values_map = json.load(f)
      hparams.override_from_dict(values_map)
  hparams.parse(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` have the same meaning;
  # following the API, both terms are used below.
  if FLAGS.use_spatial_partition:
    # Checks that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be the product of the '
                         'array elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The input partition logic: we partition only the partitionable tensors.
    # Spatial partitioning requires that a to-be-partitioned tensor have a
    # dimension that is a multiple of `partition_dims`. Depending on
    # `partition_dims` and the `image_size` and `max_level` in hparams, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], the
    # image size is 1536, and `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partitionable, and
    # the highest partitionable level is 7. (A standalone sketch of this rule
    # follows this function.)
    image_size = hparams.get('image_size')
    for level in range(hparams.get('min_level'), hparams.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if FLAGS.auto_mixed_precision and FLAGS.distribution_strategy:
    config_proto.graph_options.rewrite_options.auto_mixed_precision = (
        rewriter_config_pb2.RewriterConfig.ON)

  if FLAGS.distribution_strategy is None:
    # Uses TPUEstimator.
    params = dict(
        hparams.values(),
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )
    tpu_config = contrib_tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=contrib_tpu.InputPipelineConfig.PER_HOST_V2)
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
    )
  else:
    if FLAGS.num_gpus < 0:
      raise ValueError('`num_gpus` cannot be negative.')

    def _per_device_batch_size(batch_size, num_gpus):
      """Calculates the per-device batch size for the Estimator.

      Args:
        batch_size: Global batch size to be divided among devices.
        num_gpus: How many GPUs are used per worker.

      Returns:
        Batch size per device.

      Raises:
        ValueError: If batch_size is not divisible by the number of devices.
      """
      if num_gpus <= 1:
        return batch_size
      remainder = batch_size % num_gpus
      if remainder:
        raise ValueError(
            'Batch size must be a multiple of the number of GPUs per worker.')
      return int(batch_size / num_gpus)

    # Uses Estimator.
    params = dict(
        hparams.values(),
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
        use_bfloat16=False,
        auto_mixed_precision=FLAGS.auto_mixed_precision,
        dataset_max_intra_op_parallelism=FLAGS.dataset_max_intra_op_parallelism,
        dataset_private_threadpool_size=FLAGS.dataset_private_threadpool_size,
    )
    if FLAGS.distribution_strategy == 'mirrored':
      params['batch_size'] = _per_device_batch_size(FLAGS.train_batch_size,
                                                    FLAGS.num_gpus)
      if FLAGS.num_gpus == 0:
        devices = ['device:CPU:0']
      else:
        devices = ['device:GPU:{}'.format(i) for i in range(FLAGS.num_gpus)]
      if FLAGS.all_reduce_alg:
        dist_strat = tf.distribute.MirroredStrategy(
            devices=devices,
            cross_device_ops=contrib_distribute.AllReduceCrossDeviceOps(
                FLAGS.all_reduce_alg, num_packs=2))
      else:
        dist_strat = tf.distribute.MirroredStrategy(devices=devices)
      run_config = tf.estimator.RunConfig(
          session_config=config_proto,
          train_distribute=dist_strat,
          eval_distribute=dist_strat)
    elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
      local_device_protos = device_lib.list_local_devices()
      params['batch_size'] = _per_device_batch_size(
          FLAGS.train_batch_size,
          sum([1 for d in local_device_protos if d.device_type == 'GPU']))
      if FLAGS.worker_hosts is None:
        tf_config_json = json.loads(os.environ.get('TF_CONFIG', '{}'))
        # Replaces master with chief.
        if tf_config_json:
          if 'master' in tf_config_json['cluster']:
            tf_config_json['cluster']['chief'] = (
                tf_config_json['cluster'].pop('master'))
          if tf_config_json['task']['type'] == 'master':
            tf_config_json['task']['type'] = 'chief'
          os.environ['TF_CONFIG'] = json.dumps(tf_config_json)
        tf_config_json = json.loads(os.environ['TF_CONFIG'])
        worker_hosts = tf_config_json['cluster']['worker']
        worker_hosts.extend(tf_config_json['cluster'].get('chief', []))
      else:
        # Sets the TF_CONFIG environment variable from --worker_hosts.
        worker_hosts = FLAGS.worker_hosts.split(',')
        os.environ['TF_CONFIG'] = json.dumps({
            'cluster': {
                'worker': worker_hosts
            },
            'task': {
                'type': 'worker',
                'index': FLAGS.task_index
            }
        })
      dist_strat = tf.distribute.experimental.MultiWorkerMirroredStrategy(
          communication=_COLLECTIVE_COMMUNICATION_OPTIONS[FLAGS.all_reduce_alg])
      run_config = tf.estimator.RunConfig(
          session_config=config_proto, train_distribute=dist_strat)
    else:
      raise ValueError('Unrecognized distribution strategy.')

  if FLAGS.mode == 'train':
    if FLAGS.model_dir is not None:
      if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)
      with tf.gfile.Open(
          os.path.join(FLAGS.model_dir, 'hparams.json'), 'w') as f:
        json.dump(hparams.values(), f, sort_keys=True, indent=2)
    tf.logging.info(params)
    if FLAGS.distribution_strategy is None:
      total_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        FLAGS.train_batch_size)
      train_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          max_steps=total_steps)

      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      if FLAGS.eval_after_training:
        if FLAGS.val_json_file is None:
          raise RuntimeError('You must specify --val_json_file for evaluation.')
        eval_results = evaluation.evaluate(
            eval_estimator,
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            num_eval_samples=FLAGS.eval_samples,
            eval_batch_size=FLAGS.eval_batch_size,
            validation_json_file=FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)
        output_dir = os.path.join(FLAGS.model_dir, 'train_eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        evaluation.write_summary(eval_results, summary_writer, total_steps)
    else:
      train_estimator = tf.estimator.Estimator(
          model_fn=retinanet_model.est_retinanet_model_fn,
          model_dir=FLAGS.model_dir,
          config=run_config,
          params=params)
      if FLAGS.distribution_strategy == 'mirrored':
        total_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size)
        tf.logging.info('Starting `MirroredStrategy` training...')
        train_estimator.train(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
            max_steps=total_steps)
      elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
        total_steps = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            (len(worker_hosts) * FLAGS.train_batch_size))
        train_spec = tf.estimator.TrainSpec(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
            max_steps=total_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=tf.data.Dataset)
        tf.logging.info('Starting `MultiWorkerMirroredStrategy` training...')
        tf.estimator.train_and_evaluate(train_estimator, train_spec, eval_spec)
      else:
        raise ValueError('Unrecognized distribution strategy.')
  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input
    # pipeline and don't run on the TPU.
    # Also, disable use_bfloat16 for eval on CPU/GPU.
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')
    eval_params = dict(
        params,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )
    if FLAGS.distribution_strategy is None:
      # Uses TPUEstimator.
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
    else:
      # Uses Estimator.
      if FLAGS.distribution_strategy == 'multi_worker_mirrored':
        raise ValueError(
            '--distribution_strategy=multi_worker_mirrored is not supported '
            'for eval.')
      elif FLAGS.distribution_strategy == 'mirrored':
        eval_estimator = tf.estimator.Estimator(
            model_fn=retinanet_model.est_retinanet_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=params)
      else:
        raise ValueError('Unrecognized distribution strategy.')

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    # Run evaluation whenever there's a new checkpoint.
    for ckpt in contrib_training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = evaluation.evaluate(
            eval_estimator,
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            num_eval_samples=FLAGS.eval_samples,
            eval_batch_size=FLAGS.eval_batch_size,
            validation_json_file=FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)
        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        evaluation.write_summary(eval_results, summary_writer, current_step)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  elif FLAGS.mode == 'train_and_eval':
    if FLAGS.distribution_strategy is not None:
      raise ValueError(
          'Distribution strategy is not implemented for --mode=train_and_eval.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

    output_dir = os.path.join(FLAGS.model_dir, 'train_and_eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    num_cycles = int(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                     FLAGS.num_steps_per_eval)
    for cycle in range(num_cycles):
      tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
      train_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          steps=FLAGS.num_steps_per_eval)
      tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = evaluation.evaluate(
          eval_estimator,
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          num_eval_samples=FLAGS.eval_samples,
          eval_batch_size=FLAGS.eval_batch_size,
          validation_json_file=FLAGS.val_json_file)
      tf.logging.info('Evaluation results: %s' % eval_results)
      current_step = int(cycle * FLAGS.num_steps_per_eval)
      evaluation.write_summary(eval_results, summary_writer, current_step)
  else:
    tf.logging.info('Mode not found.')

  if FLAGS.model_dir:
    tf.logging.info('Exporting saved model.')
    eval_params = dict(
        params,
        use_tpu=True,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = contrib_tpu.TPUEstimator(
        model_fn=retinanet_model.tpu_retinanet_model_fn,
        use_tpu=True,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.inference_batch_size,
        config=run_config,
        params=eval_params)
    export_path = eval_estimator.export_saved_model(
        export_dir_base=FLAGS.model_dir,
        serving_input_receiver_fn=build_serving_input_fn(
            hparams.image_size, FLAGS.inference_batch_size))
    if FLAGS.add_warmup_requests:
      inference_warmup.write_warmup_requests(
          export_path,
          FLAGS.model_name,
          hparams.image_size,
          batch_sizes=[FLAGS.inference_batch_size])
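
# A minimal, standalone sketch (not part of the original script) of the
# partitionability rule described in the spatial-partitioning comment above.
# It reproduces the documented example: with partition_dims=[1, 4, 2, 1],
# image_size=1536, and max_level=9, levels 8 and 9 come out non-partitionable
# because their spatial dim of 6 is not a multiple of 4.
import numpy as np


def can_partition(spatial_dim, partition_dims):
  """True iff `spatial_dim` is a multiple of every entry in `partition_dims`."""
  return bool(np.all(spatial_dim % np.array(partition_dims) == 0))


if __name__ == '__main__':
  partition_dims = [1, 4, 2, 1]
  image_size = 1536
  for level in range(3, 10):
    dim = image_size // (2**level)
    print('level %d: spatial dim %4d, partitionable=%s' %
          (level, dim, can_partition(dim, partition_dims)))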

def main(argv):
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path.
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams.
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` have the same meaning;
  # following the API, both terms are used below.
  if FLAGS.use_spatial_partition:
    # Checks that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be the product of the '
                         'array elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The input partition logic: we partition only the partitionable tensors.
    # Spatial partitioning requires that a to-be-partitioned tensor have a
    # dimension that is a multiple of `partition_dims`. Depending on
    # `partition_dims` and the `image_size` and `max_level` in hparams, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], the
    # image size is 1536, and `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partitionable, and
    # the highest partitionable level is 7.
    image_size = hparams.get('image_size')
    for level in range(hparams.get('min_level'), hparams.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      hparams.values(),
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  tpu_config = tf.contrib.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      .PER_HOST_V2)
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
  )

  # TPU Estimator.
  if FLAGS.mode == 'train':
    tf.logging.info(params)
    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern, is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    # Run evaluation after training finishes.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)
    if FLAGS.eval_after_training:
      if FLAGS.val_json_file is None:
        raise RuntimeError('You must specify --val_json_file for evaluation.')
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      tf.logging.info('Eval results: %s' % eval_results)
    if FLAGS.model_dir:
      eval_estimator.export_saved_model(
          export_dir_base=FLAGS.model_dir,
          serving_input_receiver_fn=lambda: serving_input_fn(hparams.image_size))
  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input
    # pipeline and don't run on the TPU.
    # Also, disable use_bfloat16 for eval on CPU/GPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.contrib.training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)
        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
        eval_estimator.export_saved_model(
            export_dir_base=FLAGS.model_dir,
            serving_input_receiver_fn=lambda: serving_input_fn(
                hparams.image_size))
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.num_epochs):
      tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
      train_estimator = tf.contrib.tpu.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))
      tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = tf.contrib.tpu.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      tf.logging.info('Evaluation results: %s' % eval_results)
      eval_estimator.export_saved_model(
          export_dir_base=FLAGS.model_dir,
          serving_input_receiver_fn=lambda: serving_input_fn(hparams.image_size))
  else:
    tf.logging.info('Mode not found.')
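
# A self-contained sketch (names and numbers illustrative) of the checkpoint
# bookkeeping used by the eval loops above: TensorFlow checkpoint paths end in
# '-<global_step>' (e.g. 'model.ckpt-12345'), which is why the loops take the
# basename, split on '-', and read index 1; total_step repeats the training
# arithmetic num_epochs * num_examples_per_epoch / train_batch_size.
import os


def step_from_checkpoint(ckpt_path):
  """Extracts the global step from a path like '/dir/model.ckpt-123'."""
  return int(os.path.basename(ckpt_path).split('-')[1])


if __name__ == '__main__':
  num_epochs, num_examples_per_epoch, train_batch_size = 15, 120000, 64
  total_step = int((num_epochs * num_examples_per_epoch) / train_batch_size)
  ckpt = '/tmp/model_dir/model.ckpt-28125'
  current_step = step_from_checkpoint(ckpt)
  print('step %d of %d, finished=%s' %
        (current_step, total_step, current_step >= total_step))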

def main(_):
  if FLAGS.strategy == 'tpu':
    tf.disable_eager_execution()
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    # Always enable auto mixed precision graph rewrite.
    os.environ['TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE'] = '1'
    tpu_cluster_resolver = None

  # Check data path.
  if FLAGS.mode in ('train', 'train_and_eval'):
    if FLAGS.training_file_pattern is None:
      raise RuntimeError('Must specify --training_file_pattern for train.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('Must specify --validation_file_pattern for eval.')

  # Parse and override hparams.
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` have the same meaning;
  # following the API, both terms are used below.
  if FLAGS.use_spatial_partition:
    # Checks that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be the product of the '
                         'array elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
        'image_masks': None,
    }
    # The input partition logic: we partition only the partitionable tensors.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      strategy=FLAGS.strategy,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      profile=FLAGS.profile,
      mode=FLAGS.mode)
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.strategy != 'tpu':
    if FLAGS.use_xla:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
    config_proto.gpu_options.allow_growth = True

  model_dir = FLAGS.model_dir
  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  if FLAGS.eval_samples:
    eval_steps = int((FLAGS.eval_samples + FLAGS.eval_batch_size - 1) //
                     FLAGS.eval_batch_size)
  else:
    eval_steps = None
  total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
  train_steps = total_examples // FLAGS.train_batch_size
  logging.info(params)

  if not tf.io.gfile.exists(model_dir):
    tf.io.gfile.makedirs(model_dir)

  config_file = os.path.join(model_dir, 'config.yaml')
  if not tf.io.gfile.exists(config_file):
    tf.io.gfile.GFile(config_file, 'w').write(str(config))

  train_input_fn = dataloader.InputReader(
      FLAGS.training_file_pattern,
      is_training=True,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)
  eval_input_fn = dataloader.InputReader(
      FLAGS.validation_file_pattern,
      is_training=False,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)

  if FLAGS.strategy == 'tpu':
    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
        .PER_HOST_V2)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )
    # TPUEstimator can do both train and eval.
    train_est = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=params)
    eval_est = train_est
  else:
    strategy = None
    if FLAGS.strategy == 'gpus':
      strategy = tf.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        train_distribute=strategy,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    def get_estimator(global_batch_size):
      params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1)
      params['batch_size'] = global_batch_size // params['num_shards']
      return tf.estimator.Estimator(
          model_fn=model_fn_instance, config=run_config, params=params)

    # Train and eval need different estimators due to different batch sizes.
    train_est = get_estimator(FLAGS.train_batch_size)
    eval_est = get_estimator(FLAGS.eval_batch_size)

  # Start the train/eval flow.
  if FLAGS.mode == 'train':
    train_est.train(input_fn=train_input_fn, max_steps=train_steps)
    if FLAGS.eval_after_training:
      eval_est.evaluate(input_fn=eval_input_fn, steps=eval_steps)
  elif FLAGS.mode == 'eval':
    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout):

      logging.info('Starting to evaluate.')
      try:
        eval_results = eval_est.evaluate(eval_input_fn, steps=eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        if current_step >= train_steps:
          logging.info('Eval finished step %d/%d', current_step, train_steps)
          break
      except tf.errors.NotFoundError:
        # The checkpoint could have been deleted by the time eval finished;
        # we simply skip such cases.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)
  elif FLAGS.mode == 'train_and_eval':
    ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
    try:
      step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = (step * FLAGS.train_batch_size //
                       FLAGS.num_examples_per_epoch)
      logging.info('found ckpt at step %d (epoch %d)', step, current_epoch)
    except (IndexError, TypeError):
      logging.info('Folder %s has no ckpt with valid step.', FLAGS.model_dir)
      current_epoch = 0

    def run_train_and_eval(e):
      print('\n =====> Starting training, epoch: %d.' % e)
      train_est.train(
          input_fn=train_input_fn,
          max_steps=e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
      print('\n =====> Starting evaluation, epoch: %d.' % e)
      eval_results = eval_est.evaluate(input_fn=eval_input_fn, steps=eval_steps)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    epochs_per_cycle = 1  # A higher number has less graph construction overhead.
    for e in range(current_epoch + 1, config.num_epochs + 1, epochs_per_cycle):
      if FLAGS.run_epoch_in_child_process:
        p = multiprocessing.Process(target=run_train_and_eval, args=(e,))
        p.start()
        p.join()
        if p.exitcode != 0:
          return p.exitcode
      else:
        run_train_and_eval(e)
  else:
    logging.info('Invalid mode: %s', FLAGS.mode)
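
# A hedged sketch of the per-level feature-size bookkeeping that the
# `utils.get_feat_sizes` call above is assumed to provide: one
# {'height', 'width'} entry per level 0..max_level, with each level roughly
# halving the previous one (ceiling division, so odd sizes still shrink
# correctly). This illustrates the shape logic, not the library's actual code.
def get_feat_sizes_sketch(image_size, max_level):
  """Returns assumed per-level feature sizes for levels 0..max_level."""
  height, width = image_size
  feat_sizes = [{'height': height, 'width': width}]
  for _ in range(1, max_level + 1):
    height = (height - 1) // 2 + 1
    width = (width - 1) // 2 + 1
    feat_sizes.append({'height': height, 'width': width})
  return feat_sizes


if __name__ == '__main__':
  for level, size in enumerate(get_feat_sizes_sketch((640, 640), 7)):
    print('level %d: %dx%d' % (level, size['height'], size['width']))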

def main(argv):
  del argv  # Unused.

  # Configure parameters.
  config = mask_rcnn_params.default_config()
  config = params_io.override_hparams(config, FLAGS.config)

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path.
  if (FLAGS.mode in ('train', 'train_and_eval') and
      not config.training_file_pattern):
    raise RuntimeError('You must specify `training_file_pattern` for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if not config.validation_file_pattern:
      raise RuntimeError('You must specify `validation_file_pattern` '
                         'for evaluation.')
    if not config.val_json_file:
      raise RuntimeError('You must specify `val_json_file` for evaluation.')

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # Note: In the code below, TPUEstimator uses both `shard` and `replica`
  # (with the same meaning).
  if FLAGS.input_partition_dims:
    labels_partition_dims = {
        'gt_boxes': None,
        'gt_classes': None,
        'cropped_gt_masks': None,
    }
    # TODO(b/119617317): The input partition logic. We partition only the
    # partitionable tensors. Spatial partitioning requires that a
    # to-be-partitioned tensor have a dimension that is a multiple of
    # `partition_dims`. Depending on `partition_dims` and the `image_size`
    # and `max_level` in config, some high-level anchor labels (i.e.,
    # `cls_targets` and `box_targets`) cannot be partitioned. For example,
    # when `partition_dims` is [1, 4, 2, 1], the image size is 1536, and
    # `max_level` is 9, `cls_targets_8` has a shape of [batch_size, 6, 6, 9],
    # which cannot be partitioned (6 % 4 != 0). In this case, the level-8 and
    # level-9 target tensors are not partitionable, and the highest
    # partitionable level is 7.
    image_size = config.image_size
    for level in range(config.min_level, config.max_level + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['score_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['score_targets_%d' % level] = None

    num_cores_per_replica = np.prod(FLAGS.input_partition_dims)
    features_partition_dims = {
        'images': FLAGS.input_partition_dims,
        'source_ids': None,
        'image_info': None,
    }
    input_partition_dims = [features_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.values(),
      num_shards=num_shards,
      use_tpu=FLAGS.use_tpu,
      mode=FLAGS.mode,
      # The following are used by the host_call function.
      model_dir=FLAGS.model_dir,
      iterations_per_loop=FLAGS.iterations_per_loop,
      transpose_input=FLAGS.transpose_input)
  tpu_config = tf.contrib.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      .PER_HOST_V2)
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      tpu_config=tpu_config,
  )

  if FLAGS.mode == 'train':
    if FLAGS.model_dir:
      save_config(config, FLAGS.model_dir)
    tf.logging.info(params)
    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            config.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_fake_data=FLAGS.use_fake_data),
        max_steps=config.total_steps)

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params_dict = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
          transpose_input=False,
      )
      eval_estimator = tf.contrib.tpu.TPUEstimator(
          model_fn=mask_rcnn_model.mask_rcnn_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=config.train_batch_size,
          eval_batch_size=config.eval_batch_size,
          predict_batch_size=config.eval_batch_size,
          config=run_config,
          params=eval_params_dict)
      output_dir = os.path.join(FLAGS.model_dir, 'eval')
      tf.gfile.MakeDirs(output_dir)
      # The summary writer writes out eval metrics.
      summary_writer = tf.summary.FileWriter(output_dir)
      eval_results = evaluation(eval_estimator, config)
      write_summary(eval_results, summary_writer, config.total_steps)
      summary_writer.close()
  elif FLAGS.mode == 'eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # The summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)
    eval_params_dict = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
        transpose_input=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        predict_batch_size=config.eval_batch_size,
        config=run_config,
        params=eval_params_dict)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.contrib.training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      # Terminate eval job when final checkpoint is reached.
      current_step = int(os.path.basename(ckpt).split('-')[1])
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = evaluation(eval_estimator, config)
        write_summary(eval_results, summary_writer, current_step)
        if current_step >= config.total_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
    summary_writer.close()

    # Export saved model.
    eval_estimator.export_saved_model(
        export_dir_base=FLAGS.model_dir,
        serving_input_receiver_fn=functools.partial(
            dataloader.serving_input_fn,
            batch_size=config.eval_batch_size,
            image_size=config.image_size))
  elif FLAGS.mode == 'train_and_eval':
    if FLAGS.model_dir:
      save_config(config, FLAGS.model_dir)
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        config=run_config,
        params=params)
    eval_params_dict = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        predict_batch_size=config.eval_batch_size,
        config=run_config,
        params=eval_params_dict)

    # Training is split into num_steps_per_eval-sized cycles with an eval
    # after each; see the cycle-arithmetic sketch after this function.
    num_cycles = int(config.total_steps / config.num_steps_per_eval)
    for cycle in range(num_cycles):
      tf.logging.info('Start training cycle %d.' % cycle)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              config.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN),
          steps=config.num_steps_per_eval)
      tf.logging.info('Start evaluation cycle %d.' % cycle)
      eval_results = evaluation(eval_estimator, config)
      current_step = int(cycle * config.num_steps_per_eval)
      write_summary(eval_results, summary_writer, current_step)

    tf.logging.info('Starting training cycle %d.' % num_cycles)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            config.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=config.total_steps)
    eval_results = evaluation(eval_estimator, config)
    write_summary(eval_results, summary_writer, config.total_steps)
    summary_writer.close()

    # Export saved model.
    eval_estimator.export_saved_model(
        export_dir_base=FLAGS.model_dir,
        serving_input_receiver_fn=functools.partial(
            dataloader.serving_input_fn,
            batch_size=config.eval_batch_size,
            image_size=config.image_size))
  else:
    tf.logging.info('Mode not found.')
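
# A minimal sketch (illustrative numbers) of the train/eval cycle arithmetic
# in the train_and_eval branch above: training is chopped into
# num_steps_per_eval-sized cycles with an eval after each, and a final train()
# call tops the model up to total_steps when the step budget is not an exact
# multiple of the cycle length.
def eval_step_boundaries(total_steps, num_steps_per_eval):
  """Returns the global steps at which eval summaries are written above."""
  num_cycles = total_steps // num_steps_per_eval
  boundaries = [cycle * num_steps_per_eval for cycle in range(num_cycles)]
  boundaries.append(total_steps)  # Final eval after the last train() call.
  return boundaries


if __name__ == '__main__':
  print(eval_step_boundaries(total_steps=22500, num_steps_per_eval=10000))
  # -> [0, 10000, 22500]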

def main(argv):
  del argv  # Unused.

  # Check flag values.
  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError('You must specify either --master or --tpu_name.')

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                      '--tpu_name and using --master.')
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)

  if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode == 'eval':
    if FLAGS.valid_data_dir is None:
      raise RuntimeError('You must specify --valid_data_dir for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams.
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator.
  if FLAGS.mode == 'train':
    train_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_50_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern, is_training=True),
        steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                  FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          skip_crowd=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=retinanet_model.retinanet_50_model_fn,
          use_tpu=False,
          eval_batch_size=1,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_steps)
      tf.logging.info('Eval results: %s' % eval_results)
  elif FLAGS.mode == 'eval':
    # Eval only runs on a CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input
    # pipeline and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        skip_crowd=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )
    eval_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_50_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)
        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  else:
    tf.logging.info('Mode not found.')
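
# The train() calls in the scripts above mix `steps` and `max_steps`; the
# difference matters when resuming from a checkpoint. This standalone sketch
# spells out the tf.estimator semantics: `steps` trains for N additional steps
# from the restored global step, while `max_steps` trains until the global
# step reaches N and is a no-op if it is already there.
def remaining_steps(global_step, steps=None, max_steps=None):
  """Returns how many steps a hypothetical train() call would actually run."""
  if steps is not None:
    return steps  # Relative to the restored global step.
  return max(0, max_steps - global_step)  # Absolute target.


if __name__ == '__main__':
  print(remaining_steps(global_step=5000, steps=1000))      # -> 1000
  print(remaining_steps(global_step=5000, max_steps=6000))  # -> 1000
  print(remaining_steps(global_step=7000, max_steps=6000))  # -> 0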

def get_dataset(cfg, file_pattern, is_training):
  """Returns a tf.data.Dataset."""
  return dataloader.InputReader(
      cfg, is_training, FLAGS.use_tfrecord, FLAGS.mixed_precision)(
          file_pattern,
          cfg.TRAIN.BATCH_SIZE if is_training else cfg.TEST.BATCH_SIZE)
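
# A hypothetical usage sketch for get_dataset above. `cfg` is assumed to be a
# config object exposing TRAIN.BATCH_SIZE / TEST.BATCH_SIZE, as used in the
# function body, and the file pattern is made up:
#
#   train_ds = get_dataset(cfg, 'coco/train-*.tfrecord', is_training=True)
#   images, labels = next(iter(train_ds))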

def export(self,
           output_dir: Text = None,
           tensorrt: Text = None,
           tflite: Text = None,
           file_pattern: Text = None,
           num_calibration_steps: int = 2000):
  """Exports a saved model, a frozen graph, and optionally a TFLite/TensorRT model.

  Args:
    output_dir: the output folder for the saved model.
    tensorrt: If not None, must be one of {'FP32', 'FP16', 'INT8'}.
    tflite: Type for post-training quantization.
    file_pattern: Glob for tfrecords, e.g. coco/val-*.tfrecord.
    num_calibration_steps: Number of post-training quantization calibration
      steps to run.
  """
  export_model, input_spec = self._get_model_and_spec(tflite)
  image_size = utils.parse_image_size(self.params['image_size'])
  if output_dir:
    tf.saved_model.save(
        export_model,
        output_dir,
        signatures=export_model.__call__.get_concrete_function(input_spec))
    logging.info('Model saved at %s', output_dir)

    # Also save the frozen graph (.pb) file.
    graphdef = self.freeze(
        export_model.__call__.get_concrete_function(input_spec))
    proto_path = tf.io.write_graph(
        graphdef, output_dir, self.model_name + '_frozen.pb', as_text=False)
    logging.info('Frozen graph saved at %s', proto_path)

  if tflite:
    shape = (self.batch_size, *image_size, 3)
    input_spec = tf.TensorSpec(
        shape=shape, dtype=input_spec.dtype, name=input_spec.name)
    # from_saved_model supports advanced converter features like op fusing.
    converter = tf.lite.TFLiteConverter.from_saved_model(output_dir)
    if tflite == 'FP32':
      converter.optimizations = [tf.lite.Optimize.DEFAULT]
      converter.target_spec.supported_types = [tf.float32]
    elif tflite == 'FP16':
      converter.optimizations = [tf.lite.Optimize.DEFAULT]
      converter.target_spec.supported_types = [tf.float16]
    elif tflite == 'INT8':
      # Enables MLIR-based post-training quantization.
      converter.experimental_new_quantizer = True
      if file_pattern:
        config = hparams_config.get_efficientdet_config(self.model_name)
        config.override(self.params)
        ds = dataloader.InputReader(
            file_pattern,
            is_training=False,
            max_instances_per_image=config.max_instances_per_image)(
                config, batch_size=self.batch_size)

        def representative_dataset_gen():
          for image, _ in ds.take(num_calibration_steps):
            yield [image]
      else:  # Used for debugging; can be removed later.
        logging.warn('Use a real representative dataset instead of fake ones.')
        num_calibration_steps = 10

        def representative_dataset_gen():  # Rewrite this for real data.
          for _ in range(num_calibration_steps):
            yield [tf.ones(shape, dtype=input_spec.dtype)]

      converter.representative_dataset = representative_dataset_gen
      converter.optimizations = [tf.lite.Optimize.DEFAULT]
      converter.inference_input_type = tf.uint8
      # TFLite's custom NMS op isn't supported by post-training quantization,
      # so we add TFLITE_BUILTINS as well.
      supported_ops = [
          tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
          tf.lite.OpsSet.TFLITE_BUILTINS
      ]
      converter.target_spec.supported_ops = supported_ops
    else:
      raise ValueError(f'Invalid tflite {tflite}: must be FP32, FP16, INT8.')

    tflite_path = os.path.join(output_dir, tflite.lower() + '.tflite')
    tflite_model = converter.convert()
    tf.io.gfile.GFile(tflite_path, 'wb').write(tflite_model)
    logging.info('TFLite is saved at %s', tflite_path)

  if tensorrt:
    trt_path = os.path.join(output_dir, 'tensorrt_' + tensorrt.lower())
    conversion_params = tf.experimental.tensorrt.ConversionParams(
        max_workspace_size_bytes=(2 << 20),
        maximum_cached_engines=1,
        precision_mode=tensorrt.upper())
    converter = tf.experimental.tensorrt.Converter(
        output_dir, conversion_params=conversion_params)
    converter.convert()
    converter.save(trt_path)
    logging.info('TensorRT model is saved at %s', trt_path)
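
# A self-contained sketch of the fallback INT8 calibration path used in
# export() above when no file_pattern is given: TFLiteConverter only needs a
# callable that yields lists of input tensors, so an all-ones generator is
# enough to exercise the conversion (shapes and step count are illustrative).
import tensorflow as tf


def make_fake_representative_dataset(shape, num_steps=10, dtype=tf.float32):
  """Returns a generator yielding `num_steps` all-ones calibration batches."""

  def representative_dataset_gen():
    for _ in range(num_steps):
      yield [tf.ones(shape, dtype=dtype)]

  return representative_dataset_gen


if __name__ == '__main__':
  gen = make_fake_representative_dataset((1, 512, 512, 3))
  print(sum(1 for _ in gen()))  # -> 10 calibration batches.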

def main(argv):
  del argv  # Unused.

  # TODO(b/132208296): remove this workaround that uses control flow v2.
  control_flow_util.ENABLE_CONTROL_FLOW_V2 = True

  tpu = FLAGS.tpu or FLAGS.master
  tpu_cluster_resolver = runner_utils.create_tpu_cluster_resolver(
      FLAGS.use_tpu, tpu, FLAGS.tpu_zone, FLAGS.gcp_project)
  if tpu_cluster_resolver:
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)

  # Check data path.
  run_train = FLAGS.mode in ('train', 'train_and_eval')
  if run_train and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  run_eval = FLAGS.mode in ('eval', 'train_and_eval') or (
      FLAGS.mode == 'train' and FLAGS.eval_after_training)
  if run_eval:
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams.
  hparams = mask_rcnn_params.default_hparams()
  hparams.parse(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # Note: In the code below, TPUEstimator uses both `shard` and `replica`
  # (with the same meaning).
  # Note that spatial partitioning is part of the model-parallelism
  # optimization. See core_assignment_utils.py for more details about model
  # parallelism.
  if FLAGS.input_partition_dims:
    labels_partition_dims = {
        'gt_boxes': None,
        'gt_classes': None,
        'cropped_gt_masks': None,
    }
    for level in range(hparams.get('min_level'), hparams.get('max_level') + 1):
      labels_partition_dims['box_targets_%d' % level] = None
      labels_partition_dims['score_targets_%d' % level] = None
    num_cores_per_replica = int(np.prod(FLAGS.input_partition_dims))
    image_partition_dims = [
        FLAGS.input_partition_dims[i] for i in [1, 0, 2]
    ] if hparams.get('transpose_input') else FLAGS.input_partition_dims
    features_partition_dims = {
        'images': image_partition_dims,
        'source_ids': None,
        'image_info': None,
    }
    input_partition_dims = [features_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      hparams.values(),
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      model_dir=FLAGS.model_dir)
  tpu_config = tf.contrib.tpu.TPUConfig(
      params['iterations_per_loop'],
      num_shards=num_shards,
      num_cores_per_replica=params['num_cores_per_replica'],
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      .PER_HOST_V2,
      tpu_job_name=FLAGS.tpu_job_name,
  )
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=params['iterations_per_loop'],
      tpu_config=tpu_config,
      save_checkpoints_steps=params['iterations_per_loop'],
  )

  train_replicas_per_worker = (
      params['cores_per_worker'] // params['num_cores_per_replica']
  ) if params['num_cores_per_replica'] else params['cores_per_worker']
  train_params = dict(
      params,
      replicas_per_worker=train_replicas_per_worker,
  )
  eval_params = dict(
      params,
      input_rand_hflip=False,
      resnet_checkpoint=None,
      is_training_bn=False,
  )

  # MLPerf logging.
  mlp_log.mlperf_print(key='init_start', value=None)
  mlp_log.mlperf_print(key='global_batch_size',
                       value=params['train_batch_size'])

  runner = None
  if run_train and run_eval:
    if params['train_use_tpu_estimator'] or params['eval_use_tpu_estimator']:
      raise RuntimeError('train_and_eval runner does not support TPUEstimator.')
    dist_eval_params = dict(
        eval_params,
        replicas_per_worker=train_replicas_per_worker,
    )
    runner = mask_rcnn_runner.TrainEvalRunner(
        model_fn=mask_rcnn_model.MaskRcnnModelFn(),
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_fake_data=FLAGS.use_fake_data),
        eval_input_fn=dataloader.InputReader(
            FLAGS.validation_file_pattern,
            mode=tf.estimator.ModeKeys.PREDICT,
            distributed_eval=True),
        eval_metric=coco_metric.EvaluationMetric(
            FLAGS.val_json_file, use_cpp_extension=True),
        train_params=train_params,
        eval_params=dist_eval_params,
        run_config=run_config)
  elif run_train:
    # Check low-level train runner compatibility.
    if not params['train_use_tpu_estimator']:
      if FLAGS.mode == 'train_and_eval':
        raise RuntimeError('Low-level train runner does not support mode '
                           'train_and_eval yet.')
    train_params = dict(
        params,
        replicas_per_worker=train_replicas_per_worker,
    )
    runner = mask_rcnn_runner.TrainRunner(
        model_fn=mask_rcnn_model.MaskRcnnModelFn(),
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_fake_data=FLAGS.use_fake_data),
        params=train_params,
        run_config=run_config,
        use_tpu_estimator=train_params['train_use_tpu_estimator'])
  else:
    sidecar_eval_params = dict(
        eval_params,
        # Sidecar eval only uses one worker and no spatial partitioning.
        replicas_per_worker=FLAGS.num_cores,
    )
    runner = mask_rcnn_runner.EvalRunner(
        mask_rcnn_model.MaskRcnnModelFn(),
        dataloader.InputReader(
            FLAGS.validation_file_pattern, mode=tf.estimator.ModeKeys.PREDICT),
        coco_metric.EvaluationMetric(
            FLAGS.val_json_file, use_cpp_extension=True),
        sidecar_eval_params,
        run_config,
        use_tpu_estimator=sidecar_eval_params['eval_use_tpu_estimator'])

  if FLAGS.mode == 'train':
    runner.train()
  elif FLAGS.mode == 'eval':

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    run_success = False
    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.contrib.training.checkpoints_iterator(
        params['model_dir'],
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = runner.evaluate(ckpt)
        current_step, _ = runner.get_step_and_epoch_number(ckpt)
        if (eval_results['AP'] >= mask_rcnn_params.BOX_EVAL_TARGET and
            eval_results['mask_AP'] >= mask_rcnn_params.MASK_EVAL_TARGET):
          mlp_log.mlperf_print(key='run_stop', metadata={'status': 'success'})
          run_success = True
          break
        if int(current_step) >= params['total_steps']:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
    if not run_success:
      mlp_log.mlperf_print(key='run_stop', metadata={'status': 'aborted'})
  elif FLAGS.mode == 'train_and_eval':
    runner.train_and_eval()
  else:
    tf.logging.info('Mode not found.')
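
# A small sketch of the MLPerf stopping rule implemented by the eval loop
# above: the run succeeds once both box AP and mask AP reach their targets,
# and is marked aborted if the step budget runs out first. The thresholds are
# illustrative stand-ins for mask_rcnn_params.BOX_EVAL_TARGET and
# mask_rcnn_params.MASK_EVAL_TARGET.
def run_status(eval_results, current_step, total_steps,
               box_target=0.377, mask_target=0.339):
  """Returns 'success', 'aborted', or None (keep evaluating)."""
  if (eval_results['AP'] >= box_target and
      eval_results['mask_AP'] >= mask_target):
    return 'success'
  if current_step >= total_steps:
    return 'aborted'
  return None


if __name__ == '__main__':
  print(run_status({'AP': 0.380, 'mask_AP': 0.341}, 20000, 22500))  # success
  print(run_status({'AP': 0.350, 'mask_AP': 0.320}, 22500, 22500))  # aborted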
def main(_): if FLAGS.strategy == 'tpu': tf.disable_eager_execution() tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) tpu_grpc_url = tpu_cluster_resolver.get_master() tf.Session.reset(tpu_grpc_url) else: tpu_cluster_resolver = None # Check data path if FLAGS.mode in ('train', 'train_and_eval'): if FLAGS.training_file_pattern is None: raise RuntimeError('Must specify --training_file_pattern for train.') if FLAGS.mode in ('eval', 'train_and_eval'): if FLAGS.validation_file_pattern is None: raise RuntimeError('Must specify --validation_file_pattern for eval.') # Parse and override hparams config = hparams_config.get_detection_config(FLAGS.model_name) config.override(FLAGS.hparams) if FLAGS.num_epochs: # NOTE: remove this flag after updating all docs. config.num_epochs = FLAGS.num_epochs # Parse image size in case it is in string format. config.image_size = utils.parse_image_size(config.image_size) # The following is for spatial partitioning. `features` has one tensor while # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input # partition is performed on `features` and all partitionable tensors of # `labels`, see the partition logic below. # In the TPUEstimator context, `shard` and `replica` mean the same thing; # following the API, both terms are used here. if FLAGS.use_spatial_partition: # Checks input_partition_dims agrees with num_cores_per_replica. if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims): raise RuntimeError('--num_cores_per_replica must equal the product of ' 'the elements in --input_partition_dims.') labels_partition_dims = { 'mean_num_positives': None, 'source_ids': None, 'groundtruth_data': None, 'image_scales': None, 'image_masks': None, } # The Input Partition Logic: We partition only the partition-able tensors.
feat_sizes = utils.get_feat_sizes( config.get('image_size'), config.get('max_level')) for level in range(config.get('min_level'), config.get('max_level') + 1): def _can_partition(spatial_dim): partitionable_index = np.where( spatial_dim % np.array(FLAGS.input_partition_dims) == 0) return len(partitionable_index[0]) == len(FLAGS.input_partition_dims) spatial_dim = feat_sizes[level] if _can_partition(spatial_dim['height']) and _can_partition( spatial_dim['width']): labels_partition_dims['box_targets_%d' % level] = FLAGS.input_partition_dims labels_partition_dims['cls_targets_%d' % level] = FLAGS.input_partition_dims else: labels_partition_dims['box_targets_%d' % level] = None labels_partition_dims['cls_targets_%d' % level] = None num_cores_per_replica = FLAGS.num_cores_per_replica input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims] num_shards = FLAGS.num_cores // num_cores_per_replica else: num_cores_per_replica = None input_partition_dims = None num_shards = FLAGS.num_cores params = dict( config.as_dict(), model_name=FLAGS.model_name, iterations_per_loop=FLAGS.iterations_per_loop, model_dir=FLAGS.model_dir, num_shards=num_shards, num_examples_per_epoch=FLAGS.num_examples_per_epoch, strategy=FLAGS.strategy, backbone_ckpt=FLAGS.backbone_ckpt, ckpt=FLAGS.ckpt, val_json_file=FLAGS.val_json_file, testdev_dir=FLAGS.testdev_dir, mode=FLAGS.mode) config_proto = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False) if FLAGS.strategy != 'tpu': if FLAGS.use_xla: config_proto.graph_options.optimizer_options.global_jit_level = ( tf.OptimizerOptions.ON_1) config_proto.gpu_options.allow_growth = True model_dir = FLAGS.model_dir strategy = None if FLAGS.strategy == 'tpu': tpu_config = tf.estimator.tpu.TPUConfig( FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1, num_cores_per_replica=num_cores_per_replica, input_partition_dims=input_partition_dims, per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig .PER_HOST_V2) run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=model_dir, log_step_count_steps=FLAGS.iterations_per_loop, session_config=config_proto, tpu_config=tpu_config, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tf_random_seed=FLAGS.tf_random_seed, ) else: if FLAGS.strategy == 'gpus': strategy = tf.distribute.MirroredStrategy() run_config = tf.estimator.RunConfig( model_dir=model_dir, train_distribute=strategy, log_step_count_steps=FLAGS.iterations_per_loop, session_config=config_proto, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tf_random_seed=FLAGS.tf_random_seed, ) model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name) max_instances_per_image = config.max_instances_per_image eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size) total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch) train_steps = total_examples // FLAGS.train_batch_size logging.info(params) train_input_fn = dataloader.InputReader( FLAGS.training_file_pattern, is_training=True, use_fake_data=FLAGS.use_fake_data, max_instances_per_image=max_instances_per_image) eval_input_fn = dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False, use_fake_data=FLAGS.use_fake_data, max_instances_per_image=max_instances_per_image) if FLAGS.strategy == 'tpu': estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=params) else: params['batch_size'] = ( FLAGS.train_batch_size 
// getattr(strategy, 'num_replicas_in_sync', 1)) params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1) estimator = tf.estimator.Estimator( model_fn=model_fn_instance, config=run_config, params=params) # Start train/eval flow. if FLAGS.mode == 'train': estimator.train(input_fn=train_input_fn, max_steps=train_steps) if FLAGS.eval_after_training: estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) elif FLAGS.mode == 'eval': # Run evaluation when there's a new checkpoint for ckpt in tf.train.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval, timeout=FLAGS.eval_timeout): logging.info('Starting to evaluate.') try: eval_results = estimator.evaluate(eval_input_fn, steps=eval_steps) # Terminate eval job when final checkpoint is reached. try: current_step = int(os.path.basename(ckpt).split('-')[1]) except IndexError: logging.info('%s has no global step info: stop!', ckpt) break utils.archive_ckpt(eval_results, eval_results['AP'], ckpt) if current_step >= train_steps: logging.info('Eval finished step %d/%d', current_step, train_steps) break except tf.errors.NotFoundError: # The checkpoint might have been deleted by the time eval finished; # we simply skip such cases. logging.info('Checkpoint %s no longer exists, skipping.', ckpt) elif FLAGS.mode == 'train_and_eval': train_spec = tf.estimator.TrainSpec( input_fn=train_input_fn, max_steps=train_steps) eval_spec = tf.estimator.EvalSpec( input_fn=eval_input_fn, steps=eval_steps, throttle_secs=600) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) else: logging.info('Invalid mode: %s', FLAGS.mode)
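When the GPU/CPU path divides the global batch across replicas (the `getattr(strategy, 'num_replicas_in_sync', 1)` division above), the split must be exact or per-replica input shapes disagree. A minimal sketch of that guard, mirroring the `_per_device_batch_size` helper used elsewhere in this codebase:

def per_replica_batch_size(global_batch_size, num_replicas):
  # The global batch must split evenly across replicas, otherwise each
  # replica would see a different shape.
  if num_replicas <= 1:
    return global_batch_size
  if global_batch_size % num_replicas:
    raise ValueError('Batch size %d is not divisible by %d replicas.' %
                     (global_batch_size, num_replicas))
  return global_batch_size // num_replicas

assert per_replica_batch_size(64, 8) == 8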
def main(_): config = hparams_config.get_efficientdet_config(FLAGS.model_name) config.override(FLAGS.hparams) config.batch_size = FLAGS.batch_size config.val_json_file = FLAGS.val_json_file config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS base_height, base_width = utils.parse_image_size(config['image_size']) if FLAGS.strategy == 'tpu': tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) tf.config.experimental_connect_to_cluster(tpu_cluster_resolver) tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver) ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver) logging.info('All devices: %s', tf.config.list_logical_devices('TPU')) elif FLAGS.strategy == 'gpus': ds_strategy = tf.distribute.MirroredStrategy() logging.info('All devices: %s', tf.config.list_physical_devices('GPU')) else: if tf.config.list_physical_devices('GPU'): ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0') else: ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0') # in format (height, width, flip) augmentations = [] if FLAGS.enable_tta: for size_offset in (0, 128, 256): for flip in (False, True): augmentations.append((base_height + size_offset, base_width + size_offset, flip)) else: augmentations.append((base_height, base_width, False)) all_detections = [] all_labels = [] with ds_strategy.scope(): # Network model = efficientdet_keras.EfficientDetNet(config=config) model.build((config.batch_size, base_height, base_width, 3)) model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir)) first_loop = True for height, width, flip in augmentations: config.image_size = (height, width) # dataset ds = dataloader.InputReader( FLAGS.val_file_pattern, is_training=False, use_fake_data=False, max_instances_per_image=config.max_instances_per_image)(config) if FLAGS.eval_samples: ds = ds.take(FLAGS.eval_samples // config.batch_size) # create the function once per augmentation, since it closes over the # value of config, which gets updated with the new image size @tf.function def f(images, labels): cls_outputs, box_outputs = model(images, training=False) return postprocess.generate_detections(config, cls_outputs, box_outputs, labels['image_scales'], labels['source_ids'], flip) # inference for images, labels in ds: if flip: images = tf.image.flip_left_right(images) detections = f(images, labels) all_detections.append(detections) if first_loop: all_labels.append(labels) first_loop = False # collect the giant list of detections into a map from image id to # detections detections_per_source = dict() for batch in all_detections: for d in batch: img_id = d[0][0] if img_id.numpy() in detections_per_source: detections_per_source[img_id.numpy()] = tf.concat( [d, detections_per_source[img_id.numpy()]], 0) else: detections_per_source[img_id.numpy()] = d # collect the groundtruth per image id groundtruth_per_source = dict() for batch in all_labels: for img_id, groundtruth in zip(batch['source_ids'], batch['groundtruth_data']): groundtruth_per_source[img_id.numpy()] = groundtruth # calculate the AP scores for all the images label_map = label_util.get_label_map(config.label_map) evaluator = coco_metric.EvaluationMetric(filename=config.val_json_file, label_map=label_map) for img_id, d in detections_per_source.items(): if FLAGS.enable_tta: d = wbf.ensemble_detections(config, d, len(augmentations)) evaluator.update_state( tf.stack([groundtruth_per_source[img_id]]).numpy(),
postprocess.transform_detections(tf.stack([d])).numpy()) # compute the final eval results. if evaluator: metrics = evaluator.result() metric_dict = {} for i, name in enumerate(evaluator.metric_names): metric_dict[name] = metrics[i] if label_map: for i, cid in enumerate(sorted(label_map.keys())): name = 'AP_/%s' % label_map[cid] metric_dict[name] = metrics[i + len(evaluator.metric_names)] print(metric_dict)
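The TTA loop above keys detections by `source_ids` so that detections from every flip/scale pass over the same image can later be ensembled by `wbf.ensemble_detections`. A minimal numpy sketch of that grouping, assuming each batch is an array whose first column is the image id (as in the `d[0][0]` lookup above):

import numpy as np

def merge_by_image_id(batches):
  """Groups detection rows by image id (first column), as the TTA loop does."""
  per_source = {}
  for batch in batches:
    for img_id in np.unique(batch[:, 0]):
      rows = batch[batch[:, 0] == img_id]
      if img_id in per_source:
        per_source[img_id] = np.concatenate([per_source[img_id], rows], 0)
      else:
        per_source[img_id] = rows
  return per_source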
def main(_): config = hparams_config.get_efficientdet_config(FLAGS.model_name) config.override(FLAGS.hparams) config.batch_size = FLAGS.batch_size config.val_json_file = FLAGS.val_json_file config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS base_height, base_width = utils.parse_image_size(config['image_size']) # Network model = efficientdet_keras.EfficientDetNet(config=config) model.build((config.batch_size, base_height, base_width, 3)) model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir)) @tf.function def f(imgs, labels, flip): cls_outputs, box_outputs = model(imgs, training=False) return postprocess.generate_detections(config, cls_outputs, box_outputs, labels['image_scales'], labels['source_ids'], flip) # in format (height, width, flip) augmentations = [] if FLAGS.enable_tta: for size_offset in (0, 128, 256): for flip in (False, True): augmentations.append((base_height + size_offset, base_width + size_offset, flip)) else: augmentations.append((base_height, base_width, False)) evaluator = None detections_per_source = dict() for height, width, flip in augmentations: config.image_size = (height, width) # dataset ds = dataloader.InputReader( FLAGS.val_file_pattern, is_training=False, use_fake_data=False, max_instances_per_image=config.max_instances_per_image)(config) # compute stats for all batches. total_steps = FLAGS.eval_samples // FLAGS.batch_size progress = tf.keras.utils.Progbar(total_steps) for i, (images, labels) in enumerate(ds): progress.update(i, values=None) if i >= total_steps: break if flip: images = tf.image.flip_left_right(images) detections = f(images, labels, flip) for img_id, d in zip(labels['source_ids'], detections): if img_id.numpy() in detections_per_source: detections_per_source[img_id.numpy()] = tf.concat( [d, detections_per_source[img_id.numpy()]], 0) else: detections_per_source[img_id.numpy()] = d evaluator = coco_metric.EvaluationMetric( filename=config.val_json_file) for d in detections_per_source.values(): if FLAGS.enable_tta: d = wbf.ensemble_detections(config, d, len(augmentations)) evaluator.update_state( labels['groundtruth_data'].numpy(), postprocess.transform_detections(tf.stack([d])).numpy()) # compute the final eval results. if evaluator: metrics = evaluator.result() metric_dict = {} for i, name in enumerate(evaluator.metric_names): metric_dict[name] = metrics[i] label_map = label_util.get_label_map(config.label_map) if label_map: for i, cid in enumerate(sorted(label_map.keys())): name = 'AP_/%s' % label_map[cid] metric_dict[name] = metrics[i + len(evaluator.metric_names)] print(metric_dict)
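The `i + len(evaluator.metric_names)` offset works because `evaluator.result()` lays out the standard COCO metrics first, followed by one AP per class in sorted class-id order. A minimal sketch of that unpacking, assuming exactly this layout:

def unpack_metrics(metrics, metric_names, label_map=None):
  # metrics = [standard COCO metrics..., per-class AP in sorted class-id order],
  # hence the len(metric_names) offset for the per-class entries.
  metric_dict = {name: metrics[i] for i, name in enumerate(metric_names)}
  if label_map:
    for i, cid in enumerate(sorted(label_map)):
      metric_dict['AP_/%s' % label_map[cid]] = metrics[len(metric_names) + i]
  return metric_dict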
def main(argv): del argv # Unused. # If given an EfficientDet ckpt, don't use the default backbone ckpt. if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None: print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format( FLAGS.ckpt, FLAGS.backbone_ckpt)) FLAGS.backbone_ckpt = None if FLAGS.use_horovod is not None: if FLAGS.dump_all_ranks: FLAGS.model_dir += "/worker_" + str(hvd.rank()) if 'HOROVOD_CYCLE_TIME' not in os.environ: os.environ['HOROVOD_CYCLE_TIME'] = '0.5' if 'HABANA_HCCL_COMM_API' not in os.environ: os.environ['HABANA_HCCL_COMM_API'] = '0' hvd_init() if not FLAGS.no_hpu: from habana_frameworks.tensorflow import load_habana_module load_habana_module() if FLAGS.use_horovod: assert (horovod_enabled()) set_env(use_amp=FLAGS.use_amp) # deterministic setting if FLAGS.sbs_test or FLAGS.deterministic: set_deterministic() # Check data path if FLAGS.mode in ( 'train', 'train_and_eval') and FLAGS.training_file_pattern is None: raise RuntimeError( 'You must specify --training_file_pattern for training.') if FLAGS.mode in ('eval', 'train_and_eval'): if FLAGS.validation_file_pattern is None: raise RuntimeError('You must specify --validation_file_pattern ' 'for evaluation.') if not FLAGS.val_json_file and not FLAGS.testdev_dir: raise RuntimeError( 'You must specify --val_json_file or --testdev_dir for evaluation.' ) # Parse and override hparams config = hparams_config.get_detection_config(FLAGS.model_name) config.override(FLAGS.hparams) # The following is for spatial partitioning. `features` has one tensor while # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input # partition is performed on `features` and all partitionable tensors of # `labels`, see the partition logic below. # In the TPUEstimator context, `shard` and `replica` mean the same thing; # following the API, both terms are used here. if FLAGS.use_spatial_partition: # Checks input_partition_dims agrees with num_cores_per_replica. if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims): raise RuntimeError( '--num_cores_per_replica must equal the product of the ' 'elements in --input_partition_dims.') labels_partition_dims = { 'mean_num_positives': None, 'source_ids': None, 'groundtruth_data': None, 'image_scales': None, } # The Input Partition Logic: We partition only the partition-able tensors. # Spatial partition requires that the to-be-partitioned tensors must have a # dimension that is a multiple of `partition_dims`. Depending on the # `partition_dims` and the `image_size` and the `max_level` in config, some # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this # case, the level-8 and level-9 target tensors are not partition-able, and # the highest partition-able level is 7.
image_size = config.get('image_size') for level in range(config.get('min_level'), config.get('max_level') + 1): def _can_partition(spatial_dim): partitionable_index = np.where( spatial_dim % np.array(FLAGS.input_partition_dims) == 0) return len(partitionable_index[0]) == len( FLAGS.input_partition_dims) spatial_dim = image_size // (2**level) if _can_partition(spatial_dim): labels_partition_dims['box_targets_%d' % level] = FLAGS.input_partition_dims labels_partition_dims['cls_targets_%d' % level] = FLAGS.input_partition_dims else: labels_partition_dims['box_targets_%d' % level] = None labels_partition_dims['cls_targets_%d' % level] = None num_cores_per_replica = FLAGS.num_cores_per_replica input_partition_dims = [ FLAGS.input_partition_dims, labels_partition_dims ] num_shards = FLAGS.num_cores // num_cores_per_replica else: num_cores_per_replica = None input_partition_dims = None num_shards = FLAGS.num_cores if horovod_enabled(): num_shards = hvd.size() else: num_shards = 1 params = build_estimator_params('train', config, num_shards) # Disable input data scaling/flip manipulations for the SBS test. if FLAGS.sbs_test: sbs_params = dict(input_rand_hflip=False, train_scale_min=1, train_scale_max=1, dropout_rate=0.0) params.update(sbs_params) tf_random_seed = 0 if FLAGS.deterministic else None run_config = build_estimator_config('train', config, num_shards, num_cores_per_replica, input_partition_dims) write_hparams_v1(FLAGS.model_dir, { 'batch_size': FLAGS.train_batch_size, **FLAGS.flag_values_dict() }) model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name) # TPU Estimator logging.info(params) if FLAGS.mode == 'train': train_estimator = HorovodEstimator(model_fn=model_fn_instance, model_dir=FLAGS.model_dir, config=run_config, params=params) # For deterministic input, pass is_training=False so the dataloader does # not manipulate the input data. is_training = not FLAGS.deterministic use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic input_fn = dataloader.InputReader(FLAGS.training_file_pattern, is_training=is_training, params=params, use_fake_data=use_fake_data, is_deterministic=FLAGS.deterministic) max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) / (FLAGS.train_batch_size * num_shards)) + 1 # For the SBS test, train under the SBS dump callbacks. if FLAGS.sbs_test: from TensorFlow.common.debug import dump_callback SBS_TEST_CONFIG = os.path.join( os.environ['TF_TESTS_ROOT'], "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json" ) with dump_callback(SBS_TEST_CONFIG): train_estimator.train(input_fn=input_fn, max_steps=max_steps) else: if FLAGS.ckpt is not None: train_estimator.train(input_fn=input_fn, steps=max_steps) else: train_estimator.train(input_fn=input_fn, max_steps=max_steps) elif FLAGS.mode == 'eval': eval_params = build_estimator_params('eval', config, num_shards) eval_config = build_estimator_config('eval', config, num_shards, num_cores_per_replica, input_partition_dims) # Eval only runs on the CPU or GPU host with batch_size = 1. # Override the default options: disable randomization in the input pipeline # and don't run on the TPU. # Also, disable use_bfloat16 for eval on CPU/GPU.
eval_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=False, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=eval_config, params=eval_params) def terminate_eval(): logging.info('Terminating eval after %d seconds of no checkpoints', FLAGS.eval_timeout) return True # Run evaluation when there's a new checkpoint for ckpt in tf.train.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval, timeout=FLAGS.eval_timeout, timeout_fn=terminate_eval): logging.info('Starting to evaluate.') try: eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False), steps=FLAGS.eval_samples // FLAGS.eval_batch_size) logging.info('Eval results: %s', eval_results) # Terminate eval job when final checkpoint is reached. try: current_step = int(os.path.basename(ckpt).split('-')[1]) except IndexError: logging.info('%s has no global step info: stop!', ckpt) break write_summary(eval_results, ckpt, current_step) utils.archive_ckpt(eval_results, eval_results['AP'], ckpt) total_step = int( (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size) if current_step >= total_step: logging.info('Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. logging.info( 'Checkpoint %s no longer exists, skipping checkpoint', ckpt) elif FLAGS.mode == 'train_and_eval': train_params = build_estimator_params('train', config, num_shards) train_config = build_estimator_config('train', config, num_shards, num_cores_per_replica, input_partition_dims) train_estimator = HorovodEstimator(model_fn=model_fn_instance, model_dir=FLAGS.model_dir, config=train_config, params=train_params) eval_estimator = None for cycle in range(FLAGS.num_epochs): logging.info('Starting training cycle, epoch: %d.', cycle) train_estimator.train( input_fn=dataloader.InputReader( FLAGS.training_file_pattern, is_training=True, use_fake_data=FLAGS.use_fake_data), max_steps=(cycle + 1) * int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size)) # synchronization point for all ranks if horovod_enabled(): hvd.allreduce(tf.constant(0)) logging.info('Starting evaluation cycle, epoch: %d.', cycle) # Run evaluation after every epoch. 
if eval_estimator is None: eval_params = build_estimator_params('eval', config, num_shards) eval_config = build_estimator_config('eval', config, num_shards, num_cores_per_replica, input_partition_dims) eval_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=False, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=eval_config, params=eval_params) if is_rank0(): eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False), steps=FLAGS.eval_samples // FLAGS.eval_batch_size) checkpoint_path = Path(FLAGS.model_dir) last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path), latest_filename=None) current_step = int(os.path.basename(last_ckpt).split('-')[1]) write_summary(eval_results, FLAGS.model_dir, current_step) logging.info('Evaluation results: %s', eval_results) ckpt = tf.train.latest_checkpoint(FLAGS.model_dir) utils.archive_ckpt(eval_results, eval_results['AP'], ckpt) else: logging.info('Mode not found.')
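The `max_steps` arithmetic in the train branch above treats `FLAGS.train_batch_size` as the per-worker batch, so under Horovod the global batch is `train_batch_size * num_shards`. A minimal sketch of that computation; the example numbers below are illustrative, not values from this config:

def max_training_steps(num_epochs, examples_per_epoch, per_worker_batch,
                       num_workers):
  # Each step consumes per_worker_batch * num_workers examples globally,
  # mirroring the max_steps computation in the train branch above.
  global_batch = per_worker_batch * num_workers
  return int(num_epochs * examples_per_epoch / global_batch) + 1

# Illustrative numbers only: 300 epochs, ~117k examples, 8 workers of batch 8.
assert max_training_steps(300, 117120, 8, 8) == 549001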
def main(argv): del argv # Unused. tpu_cluster_resolver = create_tpu_cluster_resolver() if tpu_cluster_resolver: tpu_grpc_url = tpu_cluster_resolver.get_master() tf.Session.reset(tpu_grpc_url) # Check data path if FLAGS.mode in ( 'train', 'train_and_eval') and FLAGS.training_file_pattern is None: raise RuntimeError( 'You must specify --training_file_pattern for training.') if FLAGS.mode in ('eval', 'train_and_eval'): if FLAGS.validation_file_pattern is None: raise RuntimeError('You must specify --validation_file_pattern ' 'for evaluation.') if FLAGS.val_json_file is None: raise RuntimeError( 'You must specify --val_json_file for evaluation.') # Parse hparams hparams = mask_rcnn_model.default_hparams() hparams.parse(FLAGS.hparams) params = dict( hparams.values(), num_shards=FLAGS.num_cores, num_examples_per_epoch=FLAGS.num_examples_per_epoch, use_tpu=FLAGS.use_tpu, resnet_checkpoint=FLAGS.resnet_checkpoint, val_json_file=FLAGS.val_json_file, mode=FLAGS.mode, # The following are used by the host_call function. model_dir=FLAGS.model_dir, iterations_per_loop=FLAGS.iterations_per_loop, dynamic_input_shapes=FLAGS.dynamic_input_shapes, transpose_input=FLAGS.transpose_input) tpu_config = tf.contrib.tpu.TPUConfig( FLAGS.iterations_per_loop, num_shards=FLAGS.num_cores, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig. PER_HOST_V2) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, log_step_count_steps=FLAGS.iterations_per_loop, tpu_config=tpu_config, ) if FLAGS.mode != 'eval': mlperf_log.maskrcnn_print(key=mlperf_log.RUN_START) mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_LOOP) mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_EPOCH, value=0) if FLAGS.mode == 'train': max_steps = int( (FLAGS.num_epochs * float(FLAGS.num_examples_per_epoch)) / float(FLAGS.train_batch_size)) if params['dynamic_input_shapes']: train_with_dynamic_shapes(params, max_steps, FLAGS.iterations_per_loop) else: tf.logging.info(params) train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=mask_rcnn_model.mask_rcnn_model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, config=run_config, params=params) train_estimator.train(input_fn=dataloader.InputReader( FLAGS.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN), max_steps=max_steps) if FLAGS.eval_after_training: # Run evaluation after training finishes. eval_params = dict( params, use_tpu=FLAGS.use_tpu, input_rand_hflip=False, resnet_checkpoint=None, is_training_bn=False, dynamic_input_shapes=False, transpose_input=False, ) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=mask_rcnn_model.mask_rcnn_model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) output_dir = os.path.join(FLAGS.model_dir, 'eval') tf.gfile.MakeDirs(output_dir) # Summary writer writes out eval metrics. 
summary_writer = tf.summary.FileWriter(output_dir) eval_results = evaluation(eval_estimator, FLAGS.num_epochs, params['val_json_file']) write_summary(eval_results, summary_writer, max_steps) if (eval_results['AP'] >= BOX_EVAL_TARGET and eval_results['mask_AP'] >= MASK_EVAL_TARGET): mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'true'}) else: mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'false'}) summary_writer.close() mlperf_log.maskrcnn_print(key=mlperf_log.RUN_FINAL) elif FLAGS.mode == 'eval': output_dir = os.path.join(FLAGS.model_dir, 'eval') tf.gfile.MakeDirs(output_dir) # Summary writer writes out eval metrics. summary_writer = tf.summary.FileWriter(output_dir) eval_params = dict( params, use_tpu=FLAGS.use_tpu, input_rand_hflip=False, resnet_checkpoint=None, is_training_bn=False, transpose_input=False, ) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=mask_rcnn_model.mask_rcnn_model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) def terminate_eval(): tf.logging.info( 'Terminating eval after %d seconds of no checkpoints' % FLAGS.eval_timeout) return True run_success = False # Run evaluation when there's a new checkpoint for ckpt in tf.contrib.training.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval, timeout=FLAGS.eval_timeout, timeout_fn=terminate_eval): # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) tf.logging.info('Starting to evaluate.') try: current_epoch = current_step / (float( FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size) eval_results = evaluation(eval_estimator, current_epoch, params['val_json_file']) write_summary(eval_results, summary_writer, current_step) if (eval_results['AP'] >= BOX_EVAL_TARGET and eval_results['mask_AP'] >= MASK_EVAL_TARGET): mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'true'}) run_success = True break total_step = int( (FLAGS.num_epochs * float(FLAGS.num_examples_per_epoch)) / float(FLAGS.train_batch_size)) if current_step >= total_step: tf.logging.info( 'Evaluation finished after training step %d' % current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. 
tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint' % ckpt) if not run_success: mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'false'}) mlperf_log.maskrcnn_print(key=mlperf_log.RUN_FINAL) summary_writer.close() elif FLAGS.mode == 'train_and_eval': output_dir = os.path.join(FLAGS.model_dir, 'eval') tf.gfile.MakeDirs(output_dir) summary_writer = tf.summary.FileWriter(output_dir) train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=mask_rcnn_model.mask_rcnn_model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, config=run_config, params=params) eval_params = dict(params, use_tpu=FLAGS.use_tpu, input_rand_hflip=False, resnet_checkpoint=None, is_training_bn=False, dynamic_input_shapes=False) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=mask_rcnn_model.mask_rcnn_model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) run_success = False steps_per_epoch = int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size) for cycle in range(int(math.floor(FLAGS.num_epochs))): tf.logging.info('Starting training cycle, epoch: %d.' % cycle) mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_EPOCH, value=cycle) if params['dynamic_input_shapes']: tf.logging.info( 'Use dynamic input shapes training for %d steps. Train ' 'to %d steps', steps_per_epoch, (cycle + 1) * steps_per_epoch) train_with_dynamic_shapes(params, (cycle + 1) * steps_per_epoch, FLAGS.iterations_per_loop) else: train_estimator.train(input_fn=dataloader.InputReader( FLAGS.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN), steps=steps_per_epoch) tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle) # Run evaluation after every epoch. eval_results = evaluation(eval_estimator, cycle, params['val_json_file']) current_step = (cycle + 1) * steps_per_epoch write_summary(eval_results, summary_writer, current_step) if (eval_results['AP'] >= BOX_EVAL_TARGET and eval_results['mask_AP'] >= MASK_EVAL_TARGET): mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'true'}) run_success = True break if not run_success: current_epoch = int(math.floor(FLAGS.num_epochs)) max_steps = int( (FLAGS.num_epochs * float(FLAGS.num_examples_per_epoch)) / float(FLAGS.train_batch_size)) # Final epoch. tf.logging.info('Starting training cycle, epoch: %d.' % current_epoch) mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_EPOCH, value=current_epoch) if params['dynamic_input_shapes']: remaining_steps = max_steps - int( current_epoch * steps_per_epoch) if remaining_steps > 0: tf.logging.info( 'Use dynamic input shapes training for %d steps. ' 'Train to %d steps', remaining_steps, max_steps) train_with_dynamic_shapes(params, max_steps, remaining_steps) else: train_estimator.train(input_fn=dataloader.InputReader( FLAGS.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN), max_steps=max_steps) eval_results = evaluation(eval_estimator, current_epoch, params['val_json_file']) write_summary(eval_results, summary_writer, max_steps) if (eval_results['AP'] >= BOX_EVAL_TARGET and eval_results['mask_AP'] >= MASK_EVAL_TARGET): mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'true'}) else: mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP, value={'success': 'false'}) mlperf_log.maskrcnn_print(key=mlperf_log.RUN_FINAL) summary_writer.close() else: tf.logging.info('Mode not found.')
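The run above is declared successful only when both the box AP and the mask AP reach their targets in the same evaluation. A minimal sketch of that check; the threshold values below are illustrative stand-ins for the BOX_EVAL_TARGET and MASK_EVAL_TARGET constants, not values confirmed by this source:

def run_status(eval_results, box_target, mask_target):
  # Both detection and instance-segmentation APs must meet their targets;
  # otherwise training continues to the next epoch.
  done = (eval_results['AP'] >= box_target and
          eval_results['mask_AP'] >= mask_target)
  return {'success': 'true' if done else 'false'}

assert run_status({'AP': 0.38, 'mask_AP': 0.35}, 0.377, 0.339) == {
    'success': 'true'}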
def train_with_dynamic_shapes(params, max_steps, iterations_per_loop): """Train with dynamic input shapes.""" params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_cores params['global_batch_size'] = FLAGS.train_batch_size tf.logging.info(params) tpu_cluster_resolver = create_tpu_cluster_resolver() tpu_strategy = tf.contrib.distribute.TPUStrategy(tpu_cluster_resolver, steps_per_run=1, num_cores=FLAGS.num_cores) session_config = tf.ConfigProto(allow_soft_placement=True) tpu_strategy.configure(session_config) sess = tf.Session(tpu_cluster_resolver.get_master(), config=session_config) # Call tpu.initialize_system() before everything! sess.run(tpu.initialize_system()) input_fn = dataloader.InputReader(FLAGS.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN) host_dataset = input_fn(params) multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator( host_dataset, devices=['/device:TPU:{}'.format(x) for x in range(FLAGS.num_cores)], prefetch_buffer_size=2) inputs_flattener = utils.InputsFlattener() per_host_sharded_inputs = [] captured_scaffold_fn = utils.CapturedObject() def single_step_fn(): """Function for a single TPU step.""" all_input_data = multi_device_iterator.get_next() for core in range(FLAGS.num_cores): features_shape, features, labels = all_input_data[core] flattened_inputs = (inputs_flattener.flatten_features_and_labels( features, labels)) per_host_sharded_inputs.append(flattened_inputs) if params['transpose_input']: is_height_short_side = tf.less(features_shape[0], features_shape[1]) else: is_height_short_side = tf.less(features_shape[1], features_shape[2]) def height_short_side_model_fn(*args): """Model function for input images with height on the short side.""" features, labels = inputs_flattener.unflatten_features_and_labels( args) features, labels = _set_feature_and_label_shapes( features, labels, params) spec = mask_rcnn_model.mask_rcnn_model_fn( features, labels, tf.estimator.ModeKeys.TRAIN, params) captured_scaffold_fn.capture(spec.scaffold_fn) return spec.train_op def height_long_side_model_fn(*args): """Model function for input images with height on the long side.""" features, labels = inputs_flattener.unflatten_features_and_labels( args) # Create a new params which has the reversed dynamic image shape. new_params = copy.deepcopy(params) new_params['dynamic_image_size'] = new_params[ 'dynamic_image_size'][::-1] features, labels = _set_feature_and_label_shapes( features, labels, new_params) spec = mask_rcnn_model.mask_rcnn_model_fn( features, labels, tf.estimator.ModeKeys.TRAIN, new_params) captured_scaffold_fn.capture(spec.scaffold_fn) return spec.train_op rewrite_computation = tf.cond( is_height_short_side, lambda: tpu.replicate(height_short_side_model_fn, per_host_sharded_inputs), # pylint: disable=line-too-long lambda: tpu.replicate(height_long_side_model_fn, per_host_sharded_inputs) # pylint: disable=line-too-long ) return rewrite_computation def multiple_steps_fn(): """Function for multiple TPU steps in a host training loop.""" return utils.wrap_computation_in_while_loop(single_step_fn, n=iterations_per_loop, parallel_iterations=1) with tpu_strategy.scope(): # NOTE: `tpu_strategy.extended.call_for_each_replica` is not supported # in TF 1.12, use `tpu_strategy.call_for_each_tower` in that version.
computation = tpu_strategy.extended.call_for_each_replica(multiple_steps_fn) # pylint: disable=line-too-long saver = tf.train.Saver() latest_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) if latest_checkpoint: saver.restore(sess, latest_checkpoint) else: captured_scaffold_fn.get()() sess.run(tf.global_variables_initializer()) sess.run(multi_device_iterator.initializer) current_step = sess.run(tf.train.get_global_step()) # Save a 0-step checkpoint. if current_step == 0: saver.save(sess, FLAGS.model_dir + '/model', global_step=current_step) for iter_steps in range(current_step, max_steps, iterations_per_loop): tf.logging.info('Dynamic shape training steps: %d', iter_steps) _ = sess.run(computation) # Save checkpoints. saver.save(sess, FLAGS.model_dir + '/model', global_step=iter_steps + iterations_per_loop) sess.run(tpu.shutdown_system()) sess.close()
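`utils.wrap_computation_in_while_loop` above amortizes the host-to-TPU session round-trip by running `iterations_per_loop` steps inside the graph, so one `sess.run` covers a whole host loop. A minimal sketch of that idea under TF1 graph mode; this is an analogue under stated assumptions, not the util's actual implementation:

import tensorflow.compat.v1 as tf

def wrap_in_while_loop(step_fn, n):
  """Runs step_fn n times inside the graph (host training loop sketch)."""
  def cond(i):
    return tf.less(i, n)

  def body(i):
    # The control dependency forces the step's ops to run each iteration.
    with tf.control_dependencies([step_fn()]):
      return [i + 1]

  return tf.while_loop(cond, body, [tf.constant(0)], parallel_iterations=1)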
def main(_): config = hparams_config.get_efficientdet_config(FLAGS.model_name) config.override(FLAGS.hparams) config.batch_size = FLAGS.batch_size config.val_json_file = FLAGS.val_json_file config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS base_height, base_width = utils.parse_image_size(config['image_size']) # Network model = efficientdet_keras.EfficientDetNet(config=config) model.build((config.batch_size, base_height, base_width, 3)) model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir)) # in format (height, width, flip) augmentations = [] if FLAGS.enable_tta: for size_offset in (0, 128, 256): for flip in (False, True): augmentations.append((base_height + size_offset, base_width + size_offset, flip)) else: augmentations.append((base_height, base_width, False)) detections_per_source = dict() for height, width, flip in augmentations: config.image_size = (height, width) # dataset ds = dataloader.InputReader( FLAGS.val_file_pattern, is_training=False, use_fake_data=False, max_instances_per_image=config.max_instances_per_image)( config) # compute stats for all batches. for images, labels in ds: if flip: images = tf.image.flip_left_right(images) cls_outputs, box_outputs = model(images, training=False) detections = postprocess.generate_detections(config, cls_outputs, box_outputs, labels['image_scales'], labels['source_ids'], flip) for img_id, d in zip(labels['source_ids'], detections): if img_id.numpy() in detections_per_source: detections_per_source[img_id.numpy()] = tf.concat([d, detections_per_source[img_id.numpy()]], 0) else: detections_per_source[img_id.numpy()] = d evaluator = coco_metric.EvaluationMetric(filename=config.val_json_file) for d in detections_per_source.values(): if FLAGS.enable_tta: d = wbf.ensemble_detections(config, d, len(augmentations)) evaluator.update_state( labels['groundtruth_data'].numpy(), postprocess.transform_detections(tf.stack([d])).numpy()) # compute the final eval results. metric_values = evaluator.result() metric_dict = {} for i, metric_value in enumerate(metric_values): metric_dict[evaluator.metric_names[i]] = metric_value print(metric_dict)
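For the flipped TTA passes above, the `flip` argument tells `postprocess.generate_detections` to mirror boxes back to the original image coordinates. A minimal numpy sketch of what that unflip has to do, assuming `[y_min, x_min, y_max, x_max]` boxes in pixel units; the helper name is illustrative:

import numpy as np

def unflip_boxes_lr(boxes, image_width):
  # After a left-right flip: x_min' = W - x_max, x_max' = W - x_min;
  # the y coordinates are unchanged.
  y_min, x_min, y_max, x_max = np.split(boxes, 4, axis=1)
  return np.concatenate(
      [y_min, image_width - x_max, y_max, image_width - x_min], axis=1)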
def main(_): config = hparams_config.get_efficientdet_config(FLAGS.model_name) config.override(FLAGS.hparams) config.val_json_file = FLAGS.val_json_file config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS config.drop_remainder = False # eval all examples w/o drop. config.image_size = utils.parse_image_size(config['image_size']) if config.strategy == 'tpu': tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) tf.config.experimental_connect_to_cluster(tpu_cluster_resolver) tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver) ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver) logging.info('All devices: %s', tf.config.list_logical_devices('TPU')) elif config.strategy == 'gpus': ds_strategy = tf.distribute.MirroredStrategy() logging.info('All devices: %s', tf.config.list_physical_devices('GPU')) else: if tf.config.list_physical_devices('GPU'): ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0') else: ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0') with ds_strategy.scope(): # Network model = efficientdet_keras.EfficientDetNet(config=config) model.build((1, *config.image_size, 3)) model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir)) @tf.function def model_fn(images, labels): cls_outputs, box_outputs = model(images, training=False) return postprocess.generate_detections(config, cls_outputs, box_outputs, labels['image_scales'], labels['source_ids']) # Evaluator for AP calculation. label_map = label_util.get_label_map(config.label_map) evaluator = coco_metric.EvaluationMetric(filename=config.val_json_file, label_map=label_map) @tf.function def eval_update(gt, pred): tf.numpy_function(evaluator.update_state, [gt, postprocess.transform_detections(pred)], []) # dataset batch_size = FLAGS.batch_size # global batch size. ds = dataloader.InputReader( FLAGS.val_file_pattern, is_training=False, max_instances_per_image=config.max_instances_per_image)( config, batch_size=batch_size) if FLAGS.eval_samples: ds = ds.take((FLAGS.eval_samples + batch_size - 1) // batch_size) ds = ds_strategy.experimental_distribute_dataset(ds) # evaluate all images. eval_samples = FLAGS.eval_samples or 5000 pbar = tf.keras.utils.Progbar( (eval_samples + batch_size - 1) // batch_size) for i, (images, labels) in enumerate(ds): detections = ds_strategy.run(model_fn, (images, labels)) ds_strategy.run(eval_update, (labels['groundtruth_data'], detections)) pbar.update(i) # compute the final eval results. metrics = evaluator.result() metric_dict = {} for i, name in enumerate(evaluator.metric_names): metric_dict[name] = metrics[i] if label_map: for i, cid in enumerate(sorted(label_map.keys())): name = 'AP_/%s' % label_map[cid] metric_dict[name] = metrics[i + len(evaluator.metric_names)] print(FLAGS.model_name, metric_dict)
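Because `drop_remainder` is disabled above, the dataset yields a final partial batch, and `ds.take` uses ceiling division to keep it. A small sketch of that batch count:

def num_eval_batches(eval_samples, global_batch_size):
  # Ceiling division: the final partial batch still counts, which is why
  # drop_remainder is disabled for eval above.
  return (eval_samples + global_batch_size - 1) // global_batch_size

assert num_eval_batches(5000, 64) == 79  # 78 full batches plus one partial.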
def main(_): if FLAGS.strategy == 'horovod': import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top logging.info('Using Horovod with multiple GPUs.') hvd.init() os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank()) import tensorflow.compat.v1 as tf # pylint: disable=g-import-not-at-top tf.enable_v2_tensorshape() tf.disable_eager_execution() if FLAGS.strategy == 'tpu': tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) tpu_grpc_url = tpu_cluster_resolver.get_master() tf.Session.reset(tpu_grpc_url) else: tpu_cluster_resolver = None # Check data path if FLAGS.mode in ( 'train', 'train_and_eval') and FLAGS.training_file_pattern is None: raise RuntimeError( 'You must specify --training_file_pattern for training.') if FLAGS.mode in ('eval', 'train_and_eval'): if FLAGS.validation_file_pattern is None: raise RuntimeError('You must specify --validation_file_pattern ' 'for evaluation.') # Parse and override hparams config = hparams_config.get_detection_config(FLAGS.model_name) config.override(FLAGS.hparams) if FLAGS.num_epochs: # NOTE: remove this flag after updating all docs. config.num_epochs = FLAGS.num_epochs # Parse image size in case it is in string format. config.image_size = utils.parse_image_size(config.image_size) # The following is for spatial partitioning. `features` has one tensor while # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input # partition is performed on `features` and all partitionable tensors of # `labels`, see the partition logic below. # In the TPUEstimator context, `shard` and `replica` mean the same thing; # following the API, both terms are used here. if FLAGS.use_spatial_partition: # Checks input_partition_dims agrees with num_cores_per_replica. if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims): raise RuntimeError( '--num_cores_per_replica must equal the product of the ' 'elements in --input_partition_dims.') labels_partition_dims = { 'mean_num_positives': None, 'source_ids': None, 'groundtruth_data': None, 'image_scales': None, } # The Input Partition Logic: We partition only the partition-able tensors. # Spatial partition requires that the to-be-partitioned tensors must have a # dimension that is a multiple of `partition_dims`. Depending on the # `partition_dims` and the `image_size` and the `max_level` in config, some # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this # case, the level-8 and level-9 target tensors are not partition-able, and # the highest partition-able level is 7.
feat_sizes = utils.get_feat_sizes(config.get('image_size'), config.get('max_level')) for level in range(config.get('min_level'), config.get('max_level') + 1): def _can_partition(spatial_dim): partitionable_index = np.where( spatial_dim % np.array(FLAGS.input_partition_dims) == 0) return len(partitionable_index[0]) == len( FLAGS.input_partition_dims) spatial_dim = feat_sizes[level] if _can_partition(spatial_dim['height']) and _can_partition( spatial_dim['width']): labels_partition_dims['box_targets_%d' % level] = FLAGS.input_partition_dims labels_partition_dims['cls_targets_%d' % level] = FLAGS.input_partition_dims else: labels_partition_dims['box_targets_%d' % level] = None labels_partition_dims['cls_targets_%d' % level] = None num_cores_per_replica = FLAGS.num_cores_per_replica input_partition_dims = [ FLAGS.input_partition_dims, labels_partition_dims ] num_shards = FLAGS.num_cores // num_cores_per_replica else: num_cores_per_replica = None input_partition_dims = None num_shards = FLAGS.num_cores params = dict(config.as_dict(), model_name=FLAGS.model_name, iterations_per_loop=FLAGS.iterations_per_loop, model_dir=FLAGS.model_dir, num_shards=num_shards, num_examples_per_epoch=FLAGS.num_examples_per_epoch, strategy=FLAGS.strategy, backbone_ckpt=FLAGS.backbone_ckpt, ckpt=FLAGS.ckpt, val_json_file=FLAGS.val_json_file, testdev_dir=FLAGS.testdev_dir, mode=FLAGS.mode) config_proto = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) if FLAGS.use_xla and FLAGS.strategy != 'tpu': config_proto.graph_options.optimizer_options.global_jit_level = ( tf.OptimizerOptions.ON_1) config_proto.gpu_options.allow_growth = True tpu_config = tf.estimator.tpu.TPUConfig( FLAGS.iterations_per_loop, num_shards=num_shards, num_cores_per_replica=num_cores_per_replica, input_partition_dims=input_partition_dims, per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig. PER_HOST_V2) if FLAGS.strategy == 'horovod': model_dir = FLAGS.model_dir if hvd.rank() == 0 else None else: model_dir = FLAGS.model_dir run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, evaluation_master=FLAGS.eval_master, model_dir=model_dir, log_step_count_steps=FLAGS.iterations_per_loop, session_config=config_proto, tpu_config=tpu_config, tf_random_seed=FLAGS.tf_random_seed, ) model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name) max_instances_per_image = config.max_instances_per_image use_tpu = (FLAGS.strategy == 'tpu') # TPU Estimator logging.info(params) if FLAGS.mode == 'train': train_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=use_tpu, train_batch_size=FLAGS.train_batch_size, config=run_config, params=params) train_estimator.train( input_fn=dataloader.InputReader( FLAGS.training_file_pattern, is_training=True, use_fake_data=FLAGS.use_fake_data, max_instances_per_image=max_instances_per_image), max_steps=int((config.num_epochs * FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size)) if FLAGS.eval_after_training: # Run evaluation after training finishes. 
eval_params = dict( params, strategy=FLAGS.strategy, input_rand_hflip=False, is_training_bn=False, mixed_precision=None, ) eval_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False, max_instances_per_image=max_instances_per_image), steps=FLAGS.eval_samples // FLAGS.eval_batch_size, name=FLAGS.eval_name) logging.info('Eval results: %s', eval_results) ckpt = tf.train.latest_checkpoint(FLAGS.model_dir) utils.archive_ckpt(eval_results, eval_results['AP'], ckpt) elif FLAGS.mode == 'eval': # Override the default options: disable randomization in the input pipeline # and don't run on the TPU. eval_params = dict( params, strategy=FLAGS.strategy, input_rand_hflip=False, is_training_bn=False, mixed_precision=None, ) eval_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) def terminate_eval(): logging.info('Terminating eval after %d seconds of no checkpoints', FLAGS.eval_timeout) return True # Run evaluation when there's a new checkpoint for ckpt in tf.train.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval, timeout=FLAGS.eval_timeout, timeout_fn=terminate_eval): logging.info('Starting to evaluate.') try: eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False, max_instances_per_image=max_instances_per_image), steps=FLAGS.eval_samples // FLAGS.eval_batch_size, name=FLAGS.eval_name) logging.info('Eval results: %s', eval_results) # Terminate eval job when final checkpoint is reached. try: current_step = int(os.path.basename(ckpt).split('-')[1]) except IndexError: logging.info('%s has no global step info: stop!', ckpt) break utils.archive_ckpt(eval_results, eval_results['AP'], ckpt) total_step = int( (config.num_epochs * FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size) if current_step >= total_step: logging.info('Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. logging.info( 'Checkpoint %s no longer exists, skipping checkpoint', ckpt) elif FLAGS.mode == 'train_and_eval': for cycle in range(config.num_epochs): logging.info('Starting training cycle, epoch: %d.', cycle) train_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=use_tpu, train_batch_size=FLAGS.train_batch_size, config=run_config, params=params) train_estimator.train(input_fn=dataloader.InputReader( FLAGS.training_file_pattern, is_training=True, use_fake_data=FLAGS.use_fake_data, max_instances_per_image=max_instances_per_image), steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size)) logging.info('Starting evaluation cycle, epoch: %d.', cycle) # Run evaluation after every epoch. 
eval_params = dict( params, strategy=FLAGS.strategy, input_rand_hflip=False, is_training_bn=False, ) eval_estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn_instance, use_tpu=use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False, max_instances_per_image=max_instances_per_image), steps=FLAGS.eval_samples // FLAGS.eval_batch_size, name=FLAGS.eval_name) logging.info('Evaluation results: %s', eval_results) ckpt = tf.train.latest_checkpoint(FLAGS.model_dir) utils.archive_ckpt(eval_results, eval_results['AP'], ckpt) else: logging.info('Mode not found.')
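Note the two step conventions used across these training loops: this script trains each cycle with `steps=` (relative to the current global step), while the Mask R-CNN variants above use `max_steps=` (an absolute target). A small sketch of the Estimator semantics:

def next_target_step(current_step, steps=None, max_steps=None):
  # Estimator.train semantics: steps=N runs N more steps from the current
  # global step; max_steps=N runs until the global step reaches N (a no-op
  # if it is already there).
  if steps is not None:
    return current_step + steps
  return max(current_step, max_steps)

assert next_target_step(100, steps=50) == 150
assert next_target_step(100, max_steps=120) == 120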
def main(argv): del argv # Unused. if FLAGS.use_tpu: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) tpu_grpc_url = tpu_cluster_resolver.get_master() tf.Session.reset(tpu_grpc_url) else: tpu_cluster_resolver = None # Check data path if FLAGS.mode in ( 'train', 'train_and_eval') and FLAGS.training_file_pattern is None: raise RuntimeError( 'You must specify --training_file_pattern for training.') if FLAGS.mode in ('eval', 'train_and_eval'): if FLAGS.validation_file_pattern is None: raise RuntimeError('You must specify --validation_file_pattern ' 'for evaluation.') if FLAGS.val_json_file is None: raise RuntimeError( 'You must specify --val_json_file for evaluation.') # Parse hparams hparams = retinanet_model.default_hparams() hparams.parse(FLAGS.hparams) params = dict( hparams.values(), num_shards=FLAGS.num_cores, num_examples_per_epoch=FLAGS.num_examples_per_epoch, use_tpu=FLAGS.use_tpu, resnet_checkpoint=FLAGS.resnet_checkpoint, val_json_file=FLAGS.val_json_file, mode=FLAGS.mode, ) config_proto = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) if FLAGS.use_xla and not FLAGS.use_tpu: config_proto.graph_options.optimizer_options.global_jit_level = ( tf.OptimizerOptions.ON_1) tpu_config = tf.contrib.tpu.TPUConfig( FLAGS.iterations_per_loop, num_shards=FLAGS.num_cores, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig. PER_HOST_V2) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, evaluation_master=FLAGS.eval_master, model_dir=FLAGS.model_dir, log_step_count_steps=FLAGS.iterations_per_loop, session_config=config_proto, tpu_config=tpu_config, ) model_fn = retinanet_model.retinanet_model_fn # TPU Estimator if FLAGS.mode == 'train': tf.logging.info(params) train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, config=run_config, params=params) train_estimator.train( input_fn=dataloader.InputReader(FLAGS.training_file_pattern, is_training=True), max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size)) if FLAGS.eval_after_training: # Run evaluation after training finishes. eval_params = dict( params, use_tpu=False, input_rand_hflip=False, resnet_checkpoint=None, is_training_bn=False, use_bfloat16=False, ) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=retinanet_model.retinanet_model_fn, use_tpu=False, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader(FLAGS.validation_file_pattern, is_training=False), steps=FLAGS.eval_samples // FLAGS.eval_batch_size) tf.logging.info('Eval results: %s' % eval_results) elif FLAGS.mode == 'eval': # Eval only runs on CPU or GPU host with batch_size = 1. # Override the default options: disable randomization in the input pipeline # and don't run on the TPU. # Also, disable use_bfloat16 for eval on CPU/GPU. 
eval_params = dict( params, use_tpu=False, input_rand_hflip=False, resnet_checkpoint=None, is_training_bn=False, use_bfloat16=False, ) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=retinanet_model.retinanet_model_fn, use_tpu=False, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) def terminate_eval(): tf.logging.info( 'Terminating eval after %d seconds of no checkpoints' % FLAGS.eval_timeout) return True # Run evaluation when there's a new checkpoint for ckpt in tf.contrib.training.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval, timeout=FLAGS.eval_timeout, timeout_fn=terminate_eval): tf.logging.info('Starting to evaluate.') try: eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader( FLAGS.validation_file_pattern, is_training=False), steps=FLAGS.eval_samples // FLAGS.eval_batch_size) tf.logging.info('Eval results: %s' % eval_results) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) total_step = int( (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size) if current_step >= total_step: tf.logging.info( 'Evaluation finished after training step %d' % current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint' % ckpt) elif FLAGS.mode == 'train_and_eval': for cycle in range(FLAGS.num_epochs): tf.logging.info('Starting training cycle, epoch: %d.' % cycle) train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=retinanet_model.retinanet_model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, config=run_config, params=params) train_estimator.train(input_fn=dataloader.InputReader( FLAGS.training_file_pattern, is_training=True), steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size)) tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle) # Run evaluation after every epoch. eval_params = dict( params, use_tpu=False, input_rand_hflip=False, resnet_checkpoint=None, is_training_bn=False, ) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=retinanet_model.retinanet_model_fn, use_tpu=False, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, config=run_config, params=eval_params) eval_results = eval_estimator.evaluate( input_fn=dataloader.InputReader(FLAGS.validation_file_pattern, is_training=False), steps=FLAGS.eval_samples // FLAGS.eval_batch_size) tf.logging.info('Evaluation results: %s' % eval_results) else: tf.logging.info('Mode not found.')
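Several of the eval loops above recover the global step by parsing the checkpoint filename. A minimal sketch of that parse; taking the segment after the last '-' is slightly more robust than index 1 when the directory name itself contains dashes:

import os

def global_step_from_ckpt(ckpt_path):
  # Checkpoints are named like /model_dir/model.ckpt-12345; the digits after
  # the last '-' are the global step.
  return int(os.path.basename(ckpt_path).split('-')[-1])

assert global_step_from_ckpt('/tmp/model_dir/model.ckpt-12345') == 12345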