def main(argv):
  del argv  # Unused.

  # Configure parameters.
  config = mask_rcnn_params.default_config()
  config = params_io.override_hparams(config, FLAGS.config)

  # Check data path.
  train_input_fn = None
  eval_input_fn = None
  if (FLAGS.mode in ('train', 'train_and_eval') and
      not config.training_file_pattern):
    raise RuntimeError('You must specify `training_file_pattern` for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if not config.validation_file_pattern:
      raise RuntimeError('You must specify `validation_file_pattern` '
                         'for evaluation.')
    if not config.val_json_file:
      raise RuntimeError('You must specify `val_json_file` for evaluation.')

  if FLAGS.mode in ('train', 'train_and_eval'):
    train_input_fn = dataloader.InputReader(
        config.training_file_pattern,
        mode=tf.estimator.ModeKeys.TRAIN,
        use_fake_data=FLAGS.use_fake_data,
        use_instance_mask=config.include_mask)
  if FLAGS.mode in ('eval', 'train_and_eval'):
    eval_input_fn = dataloader.InputReader(
        config.validation_file_pattern,
        mode=tf.estimator.ModeKeys.PREDICT,
        num_examples=config.eval_samples,
        use_instance_mask=config.include_mask)

  run(config, train_input_fn=train_input_fn, eval_input_fn=eval_input_fn)
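
# A usage sketch (an assumed invocation, not from the original file):
# FLAGS.config is an HParams-style override string applied by
# params_io.override_hparams above; the flag values below are illustrative.
#
#   --mode=train_and_eval \
#   --config='training_file_pattern=/data/train-*,include_mask=true'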

def main(_):
  config = mask_rcnn_params.default_config()
  config = params_io.override_hparams(config, FLAGS.config)
  config.is_training_bn = False
  config.train_batch_size = FLAGS.batch_size
  config.eval_batch_size = FLAGS.batch_size

  model_params = dict(
      config.values(),
      use_tpu=FLAGS.use_tpu,
      mode=tf.estimator.ModeKeys.PREDICT,
      transpose_input=False)

  print(' - Setting up TPUEstimator...')
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=serving.serving_model_fn_builder(
          FLAGS.output_source_id,
          FLAGS.output_image_info,
          FLAGS.output_box_features,
          FLAGS.output_normalized_coordinates),
      model_dir=FLAGS.model_dir,
      config=tpu_config.RunConfig(
          tpu_config=tpu_config.TPUConfig(
              iterations_per_loop=FLAGS.iterations_per_loop),
          master='local',
          evaluation_master='local'),
      params=model_params,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      export_to_tpu=FLAGS.use_tpu,
      export_to_cpu=True)

  print(' - Exporting the model...')
  input_type = FLAGS.input_type
  export_path = estimator.export_saved_model(
      export_dir_base=FLAGS.export_dir,
      serving_input_receiver_fn=functools.partial(
          serving.serving_input_fn,
          batch_size=FLAGS.batch_size,
          desired_image_size=config.image_size,
          padding_stride=(2**config.max_level),
          input_type=input_type,
          input_name=FLAGS.input_name),
      checkpoint_path=FLAGS.checkpoint_path)

  if FLAGS.add_warmup_requests and input_type == 'image_bytes':
    inference_warmup.write_warmup_requests(
        export_path,
        FLAGS.model_name,
        config.image_size,
        batch_sizes=[FLAGS.batch_size],
        image_format='JPEG',
        input_signature=FLAGS.input_name)
  print(' - Done! path: %s' % export_path)
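
# Illustrative only, not part of the original export script: a minimal sketch
# of querying the exported SavedModel with the TF 1.x predictor API. It
# assumes `input_type` was 'image_bytes' and that the feed key matches
# FLAGS.input_name.
def _example_predict(export_path, image_path, input_name='input'):
  predictor = tf.contrib.predictor.from_saved_model(export_path)
  with tf.gfile.GFile(image_path, 'rb') as f:
    image_bytes = f.read()
  # The serving graph was exported with a fixed batch size, so feed a batch.
  return predictor({input_name: [image_bytes]})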

def main(argv):
  del argv  # Unused.

  # Configure parameters.
  config = mask_rcnn_params.default_config()
  config = params_io.override_hparams(config, FLAGS.config)

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path.
  if (FLAGS.mode in ('train', 'train_and_eval') and
      not config.training_file_pattern):
    raise RuntimeError('You must specify `training_file_pattern` for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if not config.validation_file_pattern:
      raise RuntimeError('You must specify `validation_file_pattern` '
                         'for evaluation.')
    if not config.val_json_file:
      raise RuntimeError('You must specify `val_json_file` for evaluation.')

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and on all partitionable tensors of
  # `labels`; see the partition logic below.
  # Note: in the code below, TPUEstimator uses `shard` and `replica` with the
  # same meaning.
  if FLAGS.input_partition_dims:
    labels_partition_dims = {
        'source_ids': None,
        'groundtruth_data': None,
        'image_info': None,
        'cropped_gt_masks': None,
    }
    # TODO(b/119617317): The Input Partition Logic. We partition only the
    # partitionable tensors. Spatial partitioning requires that each
    # to-be-partitioned tensor have spatial dimensions that are multiples of
    # `partition_dims`. Depending on `partition_dims` and on `image_size` and
    # `max_level` in config, some high-level anchor labels (i.e.,
    # `cls_targets` and `box_targets`) cannot be partitioned. For example,
    # when `partition_dims` is [1, 4, 2, 1], the image size is 1536, and
    # `max_level` is 9, `cls_targets_8` has a shape of [batch_size, 6, 6, 9],
    # which cannot be partitioned (6 % 4 != 0). In this case, the level-8 and
    # level-9 target tensors are not partitionable, and the highest
    # partitionable level is 7.
    image_size = config.image_size
    for level in range(config.min_level, config.max_level + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims[
            'box_targets_%d' % level] = FLAGS.input_partition_dims
        labels_partition_dims[
            'score_targets_%d' % level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['score_targets_%d' % level] = None
    num_cores_per_replica = np.prod(FLAGS.input_partition_dims)
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores
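
  # Worked example for the partition logic above (a sketch; the values match
  # the TODO comment, with `min_level=2` assumed): with image_size=1536 and
  # input_partition_dims=[1, 4, 2, 1],
  #   level 7: 1536 // 2**7 = 12, and 12 % d == 0 for every d, so the level-7
  #            targets get partitioned;
  #   level 8: 1536 // 2**8 = 6, and 6 % 4 != 0, so the level-8 (and level-9)
  #            targets stay None;
  # and num_cores_per_replica = np.prod([1, 4, 2, 1]) = 8, giving
  # num_shards = FLAGS.num_cores // 8.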

  params = dict(
      config.values(),
      num_shards=num_shards,
      use_tpu=FLAGS.use_tpu,
      mode=FLAGS.mode,
      # The following are used by the host_call function.
      model_dir=FLAGS.model_dir,
      iterations_per_loop=FLAGS.iterations_per_loop,
      transpose_input=FLAGS.transpose_input)
  tpu_config = tf.contrib.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      .PER_HOST_V2)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      tpu_config=tpu_config,
  )

  if FLAGS.mode == 'train':
    if FLAGS.model_dir:
      save_config(config, FLAGS.model_dir)
    tf.logging.info(params)
    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            config.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_fake_data=FLAGS.use_fake_data),
        max_steps=config.total_steps)

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params_dict = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
          transpose_input=False,
      )
      eval_estimator = tf.contrib.tpu.TPUEstimator(
          model_fn=mask_rcnn_model.mask_rcnn_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=config.train_batch_size,
          eval_batch_size=config.eval_batch_size,
          predict_batch_size=config.eval_batch_size,
          config=run_config,
          params=eval_params_dict)
      output_dir = os.path.join(FLAGS.model_dir, 'eval')
      tf.gfile.MakeDirs(output_dir)
      # The summary writer writes out eval metrics.
      summary_writer = tf.summary.FileWriter(output_dir)
      eval_results = evaluation(eval_estimator, config)
      write_summary(eval_results, summary_writer, config.total_steps)
      summary_writer.close()

  elif FLAGS.mode == 'eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # The summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)
    eval_params_dict = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
        transpose_input=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        predict_batch_size=config.eval_batch_size,
        config=run_config,
        params=eval_params_dict)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.contrib.training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      # Terminate the eval job when the final checkpoint is reached.
      current_step = int(os.path.basename(ckpt).split('-')[1])
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = evaluation(eval_estimator, config)
        write_summary(eval_results, summary_writer, current_step)
        if current_step >= config.total_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
    summary_writer.close()

  elif FLAGS.mode == 'train_and_eval':
    if FLAGS.model_dir:
      save_config(config, FLAGS.model_dir)
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    train_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        config=run_config,
        params=params)
    eval_params_dict = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
    )
    eval_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=mask_rcnn_model.mask_rcnn_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        predict_batch_size=config.eval_batch_size,
        config=run_config,
        params=eval_params_dict)

    num_cycles = int(config.total_steps / config.num_steps_per_eval)
    for cycle in range(num_cycles):
      tf.logging.info('Start training cycle %d.' % cycle)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              config.training_file_pattern,
              mode=tf.estimator.ModeKeys.TRAIN),
          steps=config.num_steps_per_eval)
      tf.logging.info('Start evaluation cycle %d.' % cycle)
      eval_results = evaluation(eval_estimator, config)
      current_step = int(cycle * config.num_steps_per_eval)
      write_summary(eval_results, summary_writer, current_step)

    tf.logging.info('Starting training cycle %d.' % num_cycles)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            config.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=config.total_steps)

    # Edited: export the trained model for serving.
    def serving_input_fn():
      feature_placeholders = {
          'image_bytes': tf.placeholder(tf.string, shape=())
      }
      # Decoding the raw bytes into a dense uint8 tensor is an assumption
      # here; the model may expect further preprocessing (resizing,
      # normalization) before inference.
      image = tf.image.decode_image(
          feature_placeholders['image_bytes'], channels=3)
      features = {'image': tf.expand_dims(image, 0)}
      return tf.estimator.export.ServingInputReceiver(
          features, feature_placeholders)

    train_estimator.export_savedmodel(
        export_dir_base=os.path.join(output_dir, 'export/exporter'),
        serving_input_receiver_fn=serving_input_fn)

    eval_results = evaluation(eval_estimator, config)
    write_summary(eval_results, summary_writer, config.total_steps)
    summary_writer.close()
  else:
    tf.logging.info('Mode not found.')