Ejemplo n.º 1
0
def main(argv):
  """Validate data-path flags and launch training and/or evaluation."""
  del argv  # Unused.

  # Build the run configuration: library defaults overridden by --config.
  config = params_io.override_hparams(
      mask_rcnn_params.default_config(), FLAGS.config)

  wants_train = FLAGS.mode in ('train', 'train_and_eval')
  wants_eval = FLAGS.mode in ('eval', 'train_and_eval')

  # Fail fast on missing data paths before constructing any input readers.
  if wants_train and not config.training_file_pattern:
    raise RuntimeError('You must specify `training_file_pattern` for training.')
  if wants_eval:
    if not config.validation_file_pattern:
      raise RuntimeError('You must specify `validation_file_pattern` '
                         'for evaluation.')
    if not config.val_json_file:
      raise RuntimeError('You must specify `val_json_file` for evaluation.')

  train_input_fn = None
  if wants_train:
    train_input_fn = dataloader.InputReader(
        config.training_file_pattern,
        mode=tf.estimator.ModeKeys.TRAIN,
        use_fake_data=FLAGS.use_fake_data,
        use_instance_mask=config.include_mask)

  eval_input_fn = None
  if wants_eval:
    # Evaluation runs the model in PREDICT mode over a fixed sample count.
    eval_input_fn = dataloader.InputReader(
        config.validation_file_pattern,
        mode=tf.estimator.ModeKeys.PREDICT,
        num_examples=config.eval_samples,
        use_instance_mask=config.include_mask)

  run(config, train_input_fn=train_input_fn, eval_input_fn=eval_input_fn)
Ejemplo n.º 2
0
def main(_):
  """Export a Mask R-CNN checkpoint as a SavedModel for serving."""
  # Resolve the serving configuration: defaults, then --config overrides,
  # then inference-specific settings (frozen batch norm, serving batch size).
  config = params_io.override_hparams(
      mask_rcnn_params.default_config(), FLAGS.config)
  config.is_training_bn = False
  config.train_batch_size = FLAGS.batch_size
  config.eval_batch_size = FLAGS.batch_size

  model_params = dict(
      config.values(),
      use_tpu=FLAGS.use_tpu,
      mode=tf.estimator.ModeKeys.PREDICT,
      transpose_input=False)

  print(' - Setting up TPUEstimator...')
  serving_model_fn = serving.serving_model_fn_builder(
      FLAGS.output_source_id,
      FLAGS.output_image_info,
      FLAGS.output_box_features,
      FLAGS.output_normalized_coordinates)
  run_config = tpu_config.RunConfig(
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop),
      master='local',
      evaluation_master='local')
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=serving_model_fn,
      model_dir=FLAGS.model_dir,
      config=run_config,
      params=model_params,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      export_to_tpu=FLAGS.use_tpu,
      export_to_cpu=True)

  print(' - Exporting the model...')
  input_type = FLAGS.input_type
  # The receiver pads images to a multiple of the coarsest feature stride.
  receiver_fn = functools.partial(
      serving.serving_input_fn,
      batch_size=FLAGS.batch_size,
      desired_image_size=config.image_size,
      padding_stride=(2 ** config.max_level),
      input_type=input_type,
      input_name=FLAGS.input_name)
  export_path = estimator.export_saved_model(
      export_dir_base=FLAGS.export_dir,
      serving_input_receiver_fn=receiver_fn,
      checkpoint_path=FLAGS.checkpoint_path)

  # Warm-up requests only apply to the JPEG-bytes input signature.
  if FLAGS.add_warmup_requests and input_type == 'image_bytes':
    inference_warmup.write_warmup_requests(
        export_path,
        FLAGS.model_name,
        config.image_size,
        batch_sizes=[FLAGS.batch_size],
        image_format='JPEG',
        input_signature=FLAGS.input_name)
  print(' - Done! path: %s' % export_path)
def main(argv):
    """Train and/or evaluate Mask R-CNN on TPU, driven by FLAGS.mode.

    Modes:
      'train'          — train to `config.total_steps`, optionally eval once.
      'eval'           — poll `FLAGS.model_dir` for checkpoints and evaluate.
      'train_and_eval' — alternate train/eval cycles, then export a
                         SavedModel.
    """
    del argv  # Unused.

    # Configure parameters: defaults overridden by the --config flag.
    config = mask_rcnn_params.default_config()
    config = params_io.override_hparams(config, FLAGS.config)

    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        # Reset the TPU session so stale state from a prior run is cleared.
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path: fail fast before building any estimators.
    if (FLAGS.mode in ('train', 'train_and_eval')
            and not config.training_file_pattern):
        raise RuntimeError(
            'You must specify `training_file_pattern` for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if not config.validation_file_pattern:
            raise RuntimeError('You must specify `validation_file_pattern` '
                               'for evaluation.')
        if not config.val_json_file:
            raise RuntimeError(
                'You must specify `val_json_file` for evaluation.')

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # Note: In the below code, TPUEstimator uses both `shard` and `replica` (with
    # the same meaning).
    if FLAGS.input_partition_dims:
        # These label tensors are never spatially partitioned.
        labels_partition_dims = {
            'source_ids': None,
            'groundtruth_data': None,
            'image_info': None,
            'cropped_gt_masks': None,
        }
        # TODO(b/119617317): The Input Partition Logic. We partition only the
        # partition-able tensors. Spatial partition requires that the
        # to-be-partitioned tensors must have a dimension that is a multiple of
        # `partition_dims`. Depending on the `partition_dims` and the `image_size`
        # and the `max_level` in config, some high-level anchor labels (i.e.,
        # `cls_targets` and `box_targets`) cannot be partitioned. For example, when
        # `partition_dims` is [1, 4, 2, 1], image size is 1536, `max_level` is 9,
        # `cls_targets_8` has a shape of [batch_size, 6, 6, 9], which cannot be
        # partitioned (6 % 4 != 0). In this case, the level-8 and level-9 target
        # tensors are not partition-able, and the highest partition-able level is 7.
        image_size = config.image_size
        for level in range(config.min_level, config.max_level + 1):

            def _can_partition(spatial_dim):
                # Partitionable iff spatial_dim is divisible by every entry of
                # FLAGS.input_partition_dims.
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['score_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['score_targets_%d' % level] = None
        num_cores_per_replica = np.prod(FLAGS.input_partition_dims)
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores
    params = dict(
        config.values(),
        num_shards=num_shards,
        use_tpu=FLAGS.use_tpu,
        mode=FLAGS.mode,
        # The following are used by the host_call function.
        model_dir=FLAGS.model_dir,
        iterations_per_loop=FLAGS.iterations_per_loop,
        transpose_input=FLAGS.transpose_input)

    # NOTE(review): this local name shadows the `tpu_config` module imported
    # elsewhere in this file; within this function only the local is visible.
    tpu_config = tf.contrib.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
        PER_HOST_V2)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        tpu_config=tpu_config,
    )

    if FLAGS.mode == 'train':
        if FLAGS.model_dir:
            save_config(config, FLAGS.model_dir)

        tf.logging.info(params)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(input_fn=dataloader.InputReader(
            config.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_fake_data=FLAGS.use_fake_data),
                              max_steps=config.total_steps)

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params_dict = dict(
                params,
                use_tpu=FLAGS.use_tpu,
                input_rand_hflip=False,
                is_training_bn=False,
                transpose_input=False,
            )

            eval_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=mask_rcnn_model.mask_rcnn_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=config.train_batch_size,
                eval_batch_size=config.eval_batch_size,
                predict_batch_size=config.eval_batch_size,
                config=run_config,
                params=eval_params_dict)

            output_dir = os.path.join(FLAGS.model_dir, 'eval')
            tf.gfile.MakeDirs(output_dir)
            # Summary writer writes out eval metrics.
            summary_writer = tf.summary.FileWriter(output_dir)
            eval_results = evaluation(eval_estimator, config)
            write_summary(eval_results, summary_writer, config.total_steps)

            summary_writer.close()

    elif FLAGS.mode == 'eval':
        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_params_dict = dict(
            params,
            use_tpu=FLAGS.use_tpu,
            input_rand_hflip=False,
            is_training_bn=False,
            transpose_input=False,
        )

        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.eval_batch_size,
            config=run_config,
            params=eval_params_dict)

        def terminate_eval():
            # Timeout callback for checkpoints_iterator: stop polling.
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):
            # Terminate eval job when final checkpoint is reached. Checkpoint
            # basenames look like `model.ckpt-<step>`.
            current_step = int(os.path.basename(ckpt).split('-')[1])

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = evaluation(eval_estimator, config)
                write_summary(eval_results, summary_writer, current_step)

                if current_step >= config.total_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
        summary_writer.close()

    elif FLAGS.mode == 'train_and_eval':
        if FLAGS.model_dir:
            save_config(config, FLAGS.model_dir)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            config=run_config,
            params=params)
        eval_params_dict = dict(
            params,
            use_tpu=FLAGS.use_tpu,
            input_rand_hflip=False,
            is_training_bn=False,
        )
        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.eval_batch_size,
            config=run_config,
            params=eval_params_dict)

        # Alternate train/eval cycles, evaluating every num_steps_per_eval.
        num_cycles = int(config.total_steps / config.num_steps_per_eval)
        for cycle in range(num_cycles):
            tf.logging.info('Start training cycle %d.' % cycle)
            train_estimator.train(input_fn=dataloader.InputReader(
                config.training_file_pattern,
                mode=tf.estimator.ModeKeys.TRAIN),
                                  steps=config.num_steps_per_eval)

            tf.logging.info('Start evaluation cycle %d.' % cycle)
            eval_results = evaluation(eval_estimator, config)

            current_step = int(cycle * config.num_steps_per_eval)
            write_summary(eval_results, summary_writer, current_step)

        # Final partial cycle up to total_steps.
        tf.logging.info('Starting training cycle %d.' % num_cycles)
        train_estimator.train(input_fn=dataloader.InputReader(
            config.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN),
                              max_steps=config.total_steps)

        #edited - save the model

        def serving_input_fn():
            """Serving receiver: a scalar string placeholder of image bytes."""
            feature_placeholders = {
                'image_bytes': tf.placeholder(tf.string, shape=())
            }

            # BUG FIX: tf.squeeze returns a single tensor; the original
            # `image, _ = tf.squeeze(...)` tried to unpack that tensor into
            # two names and would raise during export.
            image = tf.squeeze(feature_placeholders['image_bytes'])

            # NOTE(review): the bytes are forwarded undecoded as `image` —
            # presumably the model graph decodes them; confirm against the
            # model_fn's expected input format.
            features = {'image': tf.expand_dims(image, 0)}
            return tf.estimator.export.ServingInputReceiver(
                features, feature_placeholders)

        train_estimator.export_savedmodel(
            export_dir_base=os.path.join(output_dir, 'export/exporter'),
            serving_input_receiver_fn=serving_input_fn)

        # One last evaluation at the final step count.
        eval_results = evaluation(eval_estimator, config)
        write_summary(eval_results, summary_writer, config.total_steps)
        summary_writer.close()
    else:
        tf.logging.info('Mode not found.')