Example #1
0
    def test_resample_feature_adder_compile(self):
        """Checks that ResampleFeatureAdder builds inside a Keras model.

        The layer is wrapped in a functional model with six inputs whose
        spatial sizes go from 512 down to 16, run on all-ones tensors, and
        its outputs are validated with `utils.verify_feats_size` before
        asserting that exactly five feature maps come out.
        """
        config = hparams_config.get_efficientdet_config("efficientdet-d0")
        feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
        tf2.random.set_seed(SEED)
        inputs = [
            tf2.keras.Input(shape=[512, 512, 3]),
            tf2.keras.Input(shape=[256, 256, 16]),
            tf2.keras.Input(shape=[128, 128, 24]),
            tf2.keras.Input(shape=[64, 64, 40]),
            tf2.keras.Input(shape=[32, 32, 112]),
            tf2.keras.Input(shape=[16, 16, 320])
        ]
        outputs = efficientdet_arch_keras.ResampleFeatureAdder(config)(inputs)
        model = tf2.keras.Model(inputs=inputs, outputs=outputs)

        # One batch of size 1, mirroring the symbolic inputs above.
        examples = [[
            tf2.ones([1, 512, 512, 3]),
            tf2.ones([1, 256, 256, 16]),
            tf2.ones([1, 128, 128, 24]),
            tf2.ones([1, 64, 64, 40]),
            tf2.ones([1, 32, 32, 112]),
            tf2.ones([1, 16, 16, 320])
        ]]

        preds = model(examples)

        try:
            utils.verify_feats_size(preds,
                                    feat_sizes=feat_sizes,
                                    min_level=config.min_level,
                                    max_level=config.max_level,
                                    data_format=config.data_format)
        except ValueError as err:
            # Report a size mismatch as a test failure, not a test error.
            self.fail(repr(err))
        self.assertEqual(len(preds), 5, "P3-P7")
Example #2
0
  def __init__(self, min_level, max_level, num_scales, aspect_ratios,
               anchor_scale, image_size):
    """Sets up multiscale RetinaNet anchors.

    Args:
      min_level: integer, minimum level of the output feature pyramid.
      max_level: integer, maximum level of the output feature pyramid.
      num_scales: integer, number of intermediate scales added on each
        level. For instance, num_scales=2 adds two additional anchor scales
        [2^0, 2^0.5] on each level.
      aspect_ratios: list of tuples giving the aspect ratio anchors added on
        each level. For instance, aspect_ratios =
        [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
      anchor_scale: float, scale of the base anchor size relative to the
        feature stride 2^level.
      image_size: integer, or tuple of integers, giving the input image size.
    """
    # A bare integer is stored as a pair with the same value twice.
    self.image_size = ((image_size, image_size)
                       if isinstance(image_size, int) else image_size)
    self.anchor_scale = anchor_scale
    self.aspect_ratios = aspect_ratios
    self.num_scales = num_scales
    self.max_level = max_level
    self.min_level = min_level

    # Feature sizes are derived from the argument exactly as it was passed.
    self.feat_sizes = utils.get_feat_sizes(image_size, max_level)

    # Derived state is generated last, once all plain attributes are set.
    self.config = self._generate_configs()
    self.boxes = self._generate_boxes()
Example #3
0
    def test_variables(self):
        """Keras and legacy BiFPN layers must create the same variable names."""
        config = hparams_config.get_efficientdet_config()
        feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
        feat_shapes = (
            [1, 64, 64, 40],
            [1, 32, 32, 112],
            [1, 16, 16, 320],
            [1, 8, 8, 64],
            [1, 4, 4, 64],
        )

        def variable_names_for(build_fn):
            # Each implementation is built in its own fresh graph so the
            # collected global variables belong to that implementation only.
            with tf.Graph().as_default():
                feats = [tf.random.uniform(shape) for shape in feat_shapes]
                build_fn(feats, feat_sizes, config)
                return [var.name for var in tf.global_variables()]

        vars1 = variable_names_for(efficientdet_arch_keras.build_bifpn_layer)
        vars2 = variable_names_for(legacy_arch.build_bifpn_layer)

        self.assertEqual(vars1, vars2)
  def __init__(self, min_level: int, max_level: int, image_size: int,
               fpn_weight_method: str, apply_bn_for_resampling: bool,
               is_training_bn: bool, conv_after_downsample: bool,
               use_native_resize_op: bool, data_format: str, pooling_type: str,
               fpn_num_filters: int, conv_bn_act_pattern: bool, act_type: str,
               separable_conv: bool, use_tpu: bool, fpn_name: str, **kwargs):
    """Records the BiFPN hyper-parameters on the layer.

    Every named argument is stored unchanged as an attribute of the same
    name. `feat_sizes` is computed from `image_size` and `max_level`, and
    `fpn_config` starts out as None. Remaining keyword arguments are
    forwarded to the parent class initializer.
    """
    # Pyramid geometry.
    self.min_level, self.max_level = min_level, max_level
    self.image_size = image_size
    self.feat_sizes = utils.get_feat_sizes(image_size, max_level)

    # FPN topology; the concrete config is not resolved here.
    self.fpn_name = fpn_name
    self.fpn_config = None
    self.fpn_weight_method = fpn_weight_method
    self.fpn_num_filters = fpn_num_filters

    # Resampling options.
    self.apply_bn_for_resampling = apply_bn_for_resampling
    self.conv_after_downsample = conv_after_downsample
    self.use_native_resize_op = use_native_resize_op
    self.pooling_type = pooling_type

    # Convolution / normalization / activation options.
    self.is_training_bn = is_training_bn
    self.conv_bn_act_pattern = conv_bn_act_pattern
    self.separable_conv = separable_conv
    self.act_type = act_type

    # Runtime layout and device options.
    self.data_format = data_format
    self.use_tpu = use_tpu

    super(BiFPNLayer, self).__init__(**kwargs)
Example #5
0
  def __init__(self, min_level, max_level, num_scales, aspect_ratios,
               anchor_scale, image_size):
    """Sets up multiscale anchors.

    Args:
      min_level: integer, minimum level of the output feature pyramid.
      max_level: integer, maximum level of the output feature pyramid.
      num_scales: integer, number of intermediate scales added on each
        level. For instance, num_scales=2 adds two additional anchor scales
        [2^0, 2^0.5] on each level.
      aspect_ratios: list of aspect ratios for the anchors added on each
        level. For instance, aspect_ratios = [1.0, 2.0, 0.5] adds three
        anchors on each level.
      anchor_scale: float, scale of the base anchor size relative to the
        feature stride 2^level; or a list/tuple with one value per level.
      image_size: integer, or tuple of integers, giving the input image size.
    """
    self.min_level, self.max_level = min_level, max_level
    self.num_scales = num_scales
    self.aspect_ratios = aspect_ratios

    num_levels = max_level - min_level + 1
    if not isinstance(anchor_scale, (list, tuple)):
      # A scalar scale is replicated for every level.
      self.anchor_scales = [anchor_scale] * num_levels
    else:
      assert len(anchor_scale) == num_levels
      self.anchor_scales = anchor_scale

    self.image_size = utils.parse_image_size(image_size)
    # Feature sizes are derived from the argument as passed, not the parsed one.
    self.feat_sizes = utils.get_feat_sizes(image_size, max_level)
    self.config = self._generate_configs()
    self.boxes = self._generate_boxes()
Example #6
0
def build_feature_network(features, config):
    """Assemble the FPN inputs and run the repeated BiFPN cells.

    Args:
      features: mapping from level to feature tensor; it must contain
        `config.min_level`.
      config: a dict-like config, including all parameters.

    Returns:
      The result of the last `build_bifpn_layer` call, indexable by level.

    Raises:
      ValueError: if `config.min_level` is not a key of `features`.
    """
    feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
    if config.min_level not in features.keys():
        raise ValueError(
            'features.keys ({}) should include min_level ({})'.format(
                features.keys(), config.min_level))

    # Keyword arguments shared by both size checks below.
    size_check = dict(feat_sizes=feat_sizes,
                      min_level=config.min_level,
                      max_level=config.max_level)

    # Collect one feature map per level; levels missing from `features` are
    # produced by downsampling the previously collected map.
    feats = []
    for level in range(config.min_level, config.max_level + 1):
        if level in features.keys():
            feats.append(features[level])
            continue
        prev = feats[-1]
        feats.append(
            resample_feature_map(
                prev,
                name='p%d' % level,
                target_height=(prev.shape[1] - 1) // 2 + 1,
                target_width=(prev.shape[2] - 1) // 2 + 1,
                target_num_channels=config.fpn_num_filters,
                apply_bn=config.apply_bn_for_resampling,
                is_training=config.is_training_bn,
                conv_after_downsample=config.conv_after_downsample,
                use_native_resize_op=config.use_native_resize_op,
                pooling_type=config.pooling_type,
                use_tpu=config.use_tpu))

    _verify_feats_size(feats, **size_check)

    with tf.variable_scope('fpn_cells'):
        for rep in range(config.fpn_cell_repeats):
            with tf.variable_scope('cell_{}'.format(rep)):
                logging.info('building cell %d', rep)
                new_feats = build_bifpn_layer(feats, feat_sizes, config)

                # Re-pack the cell output as a list for the next repeat.
                feats = []
                for level in range(config.min_level, config.max_level + 1):
                    feats.append(new_feats[level])

                _verify_feats_size(feats, **size_check)

    return new_feats
Example #7
0
    def test_fnode_compile(self):
        """Checks that the first FNode of the FPN config builds in Keras.

        The node is stacked on top of ResampleFeatureAdder inside a
        functional model; the test expects six outputs and checks that the
        last one has the height and width configured for the node's level.
        """
        config = hparams_config.get_efficientdet_config("efficientdet-d0")
        fpn_config = legacy_arch.get_fpn_config(config.fpn_name,
                                                config.min_level,
                                                config.max_level,
                                                config.fpn_weight_method)
        feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
        node_idx = 0
        fnode_cfg = fpn_config.nodes[node_idx]
        level_size = feat_sizes[fnode_cfg['feat_level']]

        shapes = [
            [512, 512, 3],
            [256, 256, 16],
            [128, 128, 24],
            [64, 64, 40],
            [32, 32, 112],
            [16, 16, 320],
        ]
        # A single batch-of-one example list, wrapped in an outer list.
        examples = [[tf2.ones([1] + shape) for shape in shapes]]
        inputs = [tf2.keras.Input(shape=shape) for shape in shapes]

        x = efficientdet_arch_keras.ResampleFeatureAdder(config)(inputs)
        fnode = efficientdet_arch_keras.FNode(
            level_size['height'],
            level_size['width'],
            fnode_cfg['inputs_offsets'],
            config.fpn_num_filters,
            config.apply_bn_for_resampling,
            config.is_training_bn,
            config.conv_after_downsample,
            config.conv_bn_act_pattern,
            config.separable_conv,
            config.act_type,
            strategy=config.strategy,
            weight_method=fpn_config.weight_method,
            data_format=config.data_format,
            name='fnode{}'.format(node_idx))
        outputs = fnode(x)
        model = tf2.keras.Model(inputs=inputs, outputs=outputs)
        preds = model(examples)

        self.assertEqual(
            len(preds),
            6,
            msg=
            "Expected that FNode will add one more node (P6') to initial 5 (P3 - P7)"
        )
        new_node = preds[5]
        self.assertEqual(level_size['height'], new_node.shape[1])
        self.assertEqual(level_size['width'], new_node.shape[2])
def build_feature_network(features, config):
    """Run the input features through the resampler and the FPN cells.

    Args:
      features: input passed to `ResampleFeatureAdder`.
      config: a dict-like config, including all parameters.

    Returns:
      The output of `FPNCells` applied to the resampled features.
    """
    sizes = utils.get_feat_sizes(config.image_size, config.max_level)
    resampled = ResampleFeatureAdder(config)(features)
    fpn_cells = FPNCells(sizes, config)
    return fpn_cells(resampled)
Example #9
0
def build_feature_network(features, config):
    """Assemble the FPN inputs and run them through the FPN cells.

    Args:
      features: mapping from level to feature tensor; it must contain
        `config.min_level`.
      config: a dict-like config, including all parameters.

    Returns:
      The output of `FPNCells` applied to the per-level feature list.

    Raises:
      ValueError: if `config.min_level` is not a key of `features`.
    """
    feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
    if config.min_level not in features.keys():
        raise ValueError(
            'features.keys ({}) should include min_level ({})'.format(
                features.keys(), config.min_level))

    # Collect one feature map per level; levels missing from `features` are
    # produced by downsampling the previously collected map.
    feats = []
    for level in range(config.min_level, config.max_level + 1):
        if level in features.keys():
            feats.append(features[level])
            continue

        # Axes holding height and width depend on the data layout.
        if config.data_format == 'channels_first':
            h_id, w_id = 2, 3
        else:
            h_id, w_id = 1, 2

        prev = feats[-1]
        resample = ResampleFeatureMap(
            target_height=(prev.shape[h_id] - 1) // 2 + 1,
            target_width=(prev.shape[w_id] - 1) // 2 + 1,
            target_num_channels=config.fpn_num_filters,
            apply_bn=config.apply_bn_for_resampling,
            is_training=config.is_training_bn,
            conv_after_downsample=config.conv_after_downsample,
            use_native_resize_op=config.use_native_resize_op,
            pooling_type=config.pooling_type,
            strategy=config.strategy,
            data_format=config.data_format,
            name='resample_p{}'.format(level),
        )
        feats.append(resample(prev))

    utils.verify_feats_size(feats,
                            feat_sizes=feat_sizes,
                            min_level=config.min_level,
                            max_level=config.max_level,
                            data_format=config.data_format)

    return FPNCells(feat_sizes, config)(feats)
    def test_get_feat_sizes(self):
        """Checks get_feat_sizes for an integer and a (height, width) size."""
        # Integer image size: height and width are equal in every entry.
        square = utils.get_feat_sizes(640, 2)
        expected_square = [{
            'height': side,
            'width': side
        } for side in (640, 320, 160)]
        self.assertEqual(square, expected_square)

        # Tuple image size: height and width are tracked independently.
        rectangular = utils.get_feat_sizes((640, 300), 2)
        expected_rectangular = [{
            'height': height,
            'width': width,
        } for height, width in ((640, 300), (320, 150), (160, 75))]
        self.assertEqual(rectangular, expected_rectangular)
Example #11
0
    def __init__(self, config, name='fpn_cells'):
        """Creates `config.fpn_cell_repeats` FPN cells.

        Args:
          config: a config object providing `image_size`, `max_level`,
            `min_level`, `fpn_config`, `fpn_name`, `fpn_weight_method` and
            `fpn_cell_repeats`.
          name: name forwarded to the parent class initializer.
        """
        super().__init__(name=name)
        self.config = config
        self.feat_sizes = utils.get_feat_sizes(config.image_size,
                                               config.max_level)

        # Use an explicitly configured FPN config when it is truthy;
        # otherwise look one up by name.
        fpn_config = config.fpn_config
        if not fpn_config:
            fpn_config = fpn_configs.get_fpn_config(config.fpn_name,
                                                    config.min_level,
                                                    config.max_level,
                                                    config.fpn_weight_method)
        self.fpn_config = fpn_config

        repeats = self.config.fpn_cell_repeats
        self.cells = [
            FPNCell(self.feat_sizes, self.config, name='cell_%d' % rep)
            for rep in range(repeats)
        ]
Example #12
0
def build_feature_network(features, config):
    """Assemble the FPN inputs and run the repeated BiFPN layers.

    Args:
      features: mapping from level to feature tensor, e.g.
        {0: image, 1: endpoints["reduction_1"], ...}; it must contain
        `config.min_level`.
      config: a dict-like config, including all parameters.

    Returns:
      The result of the last `build_bifpn_layer` call, e.g.
      {3: P3", 4: P4", 5: P5", 6: P6", 7: P7"}.

    Raises:
      ValueError: if `config.min_level` is not a key of `features`.
    """
    feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
    if config.min_level not in features.keys():
        raise ValueError(
            'features.keys ({}) should include min_level ({})'.format(
                features.keys(), config.min_level))

    # Collect one feature map per level; levels missing from `features` are
    # produced by downsampling the previously collected map.
    feats = []
    for level in range(config.min_level, config.max_level + 1):
        if level in features.keys():
            feats.append(features[level])
            continue
        prev = feats[-1]
        feats.append(
            resample_feature_map(
                prev,
                name='p%d' % level,
                target_height=(prev.shape[1] - 1) // 2 + 1,
                target_width=(prev.shape[2] - 1) // 2 + 1,
                target_num_channels=config.fpn_num_filters,
                is_training=config.is_training_bn))

    for rep in range(config.fpn_cell_repeats):
        logging.info('building cell %d', rep)
        new_feats = build_bifpn_layer(feats, feat_sizes, config)

        # Re-pack the layer output as a list for the next repeat.
        feats = []
        for level in range(config.min_level, config.max_level + 1):
            feats.append(new_feats[level])

    return new_feats
Example #13
0
def build_feature_network(feats, config):
    """Pad the input feature list to all levels and run the FPN cells.

    Note that `feats` is extended in place: each missing coarser level is
    appended to the list that the caller passed in.

    Args:
      feats: a non-empty list of input tensors starting from min_level.
      config: a dict-like config, including all parameters.

    Returns:
      The output of `FPNCells` applied to the completed feature list.

    Raises:
      ValueError: if `feats` is empty.
    """
    feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
    if not feats:
        raise ValueError('FPN input features cannot be empty.')

    num_levels = config.max_level - config.min_level + 1
    while len(feats) < num_levels:
        level = config.min_level + len(feats)

        # Axes holding height and width depend on the data layout.
        if config.data_format == 'channels_first':
            h_id, w_id = 2, 3
        else:
            h_id, w_id = 1, 2

        # Add a coarser level by downsampling the last feature map.
        prev = feats[-1]
        resample = ResampleFeatureMap(
            target_height=(prev.shape[h_id] - 1) // 2 + 1,
            target_width=(prev.shape[w_id] - 1) // 2 + 1,
            target_num_channels=config.fpn_num_filters,
            apply_bn=config.apply_bn_for_resampling,
            is_training=config.is_training_bn,
            conv_after_downsample=config.conv_after_downsample,
            strategy=config.strategy,
            data_format=config.data_format,
            name='resample_p{}'.format(level),
        )
        feats.append(resample(prev))

    utils.verify_feats_size(feats,
                            feat_sizes=feat_sizes,
                            min_level=config.min_level,
                            max_level=config.max_level,
                            data_format=config.data_format)

    return FPNCells(feat_sizes, config)(feats)
Example #14
0
    def test_fpncells_compile(self):
        """Checks that FPNCells builds in a Keras model and yields 5 outputs."""
        config = hparams_config.get_efficientdet_config("efficientdet-d0")
        feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
        shapes = [
            [64, 64, 40],
            [32, 32, 112],
            [16, 16, 320],
            [8, 8, 64],
            [4, 4, 64],
        ]
        inputs = [tf2.keras.Input(shape=shape) for shape in shapes]

        fpn_cells = efficientdet_arch_keras.FPNCells(
            feat_sizes, config, name='cell_{}'.format(0))
        outputs = fpn_cells(inputs)
        model = tf2.keras.Model(inputs=inputs, outputs=outputs)

        # Concrete all-ones tensors with a leading batch dimension of 1.
        examples = [tf2.ones([1] + shape) for shape in shapes]
        preds = model(examples)
        self.assertEqual(len(preds), 5)
Example #15
0
def main(_):
    """Run training and/or evaluation as selected by the command-line FLAGS.

    The function validates the data-path flags, builds the detection config
    (applying `--hparams` overrides), optionally derives the spatial
    partitioning layout, creates the estimator (TPUEstimator for the 'tpu'
    strategy, plain Estimator otherwise) and then dispatches on
    `FLAGS.mode` ('train', 'eval' or 'train_and_eval').

    Args:
      _: unused positional argument.

    Raises:
      RuntimeError: if the file pattern required by the selected mode is
        missing, or if spatial partitioning is enabled and
        `--num_cores_per_replica` differs from the product of
        `--input_partition_dims`.
    """
    if FLAGS.strategy == 'tpu':
        tf.disable_eager_execution()
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in ('train', 'train_and_eval'):
        if FLAGS.training_file_pattern is None:
            raise RuntimeError(
                'Must specify --training_file_pattern for train.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError(
                'Must specify --validation_file_pattern for eval.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; following the API, both terms are used here.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
            'image_masks': None,
        }

        def _can_partition(spatial_dim):
            # True when `spatial_dim` is divisible by every partition dim.
            partitionable_index = np.where(
                spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
            return len(partitionable_index[0]) == len(
                FLAGS.input_partition_dims)

        # The Input Partition Logic: We partition only the partition-able tensors.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):
            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    model_dir = FLAGS.model_dir
    strategy = None
    if FLAGS.strategy == 'tpu':
        # This branch only runs for the 'tpu' strategy, so the configured
        # iterations_per_loop is passed through unconditionally.
        tpu_config = tf.estimator.tpu.TPUConfig(
            FLAGS.iterations_per_loop,
            num_cores_per_replica=num_cores_per_replica,
            input_partition_dims=input_partition_dims,
            per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
            PER_HOST_V2)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=model_dir,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            tpu_config=tpu_config,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tf_random_seed=FLAGS.tf_random_seed,
        )
    else:
        if FLAGS.strategy == 'gpus':
            strategy = tf.distribute.MirroredStrategy()
        run_config = tf.estimator.RunConfig(
            model_dir=model_dir,
            train_distribute=strategy,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tf_random_seed=FLAGS.tf_random_seed,
        )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
    total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
    train_steps = total_examples // FLAGS.train_batch_size
    logging.info(params)
    # Persist the effective config next to the checkpoints.
    with tf.io.gfile.GFile(os.path.join(model_dir, 'config.yaml'), 'w') as f:
        f.write(str(config))

    train_input_fn = dataloader.InputReader(
        FLAGS.training_file_pattern,
        is_training=True,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image)
    eval_input_fn = dataloader.InputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image)

    if FLAGS.strategy == 'tpu':
        estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)
    else:
        # Without a distribution strategy the replica count falls back to 1.
        params['batch_size'] = (FLAGS.train_batch_size //
                                getattr(strategy, 'num_replicas_in_sync', 1))
        params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1)
        estimator = tf.estimator.Estimator(model_fn=model_fn_instance,
                                           config=run_config,
                                           params=params)

    # start train/eval flow.
    if FLAGS.mode == 'train':
        estimator.train(input_fn=train_input_fn, max_steps=train_steps)
        if FLAGS.eval_after_training:
            estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):

            logging.info('Starting to evaluate.')
            try:
                eval_results = estimator.evaluate(eval_input_fn,
                                                  steps=eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                if current_step >= train_steps:
                    logging.info('Eval finished step %d/%d', current_step,
                                 train_steps)
                    break

            except tf.errors.NotFoundError:
                # The checkpoint may already have been deleted by the time
                # evaluation finished. We simply skip such a case.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          steps=eval_steps,
                                          throttle_secs=600)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    else:
        logging.info('Invalid mode: %s', FLAGS.mode)
Example #16
0
def main(_):
    """Run training and/or evaluation as selected by `FLAGS.mode`.

    Builds the estimator run configuration from command-line flags and the
    detection config for `FLAGS.model_name`, then dispatches on `FLAGS.mode`:

    * 'train': train up to the total step count, optionally followed by one
      evaluation when `FLAGS.eval_after_training` is set.
    * 'eval': evaluate each new checkpoint found in `FLAGS.model_dir` until a
      checkpoint at or past the final training step has been evaluated, or
      the checkpoint iterator times out.
    * 'train_and_eval': alternate one epoch of training with one evaluation,
      starting after the epoch of the latest checkpoint in `FLAGS.model_dir`.

    Any other mode is only logged.

    Args:
      _: unused positional argument.

    Raises:
      RuntimeError: if a required file pattern flag is missing, or if
        `--num_cores_per_replica` does not equal the product of
        `--input_partition_dims` when spatial partitioning is enabled.
    """

    if FLAGS.strategy == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        logging.info('Use horovod with multi gpus')
        hvd.init()
        # Expose only the GPU matching this process's local horovod rank.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
    # NOTE(review): presumably imported here (not at module top) so that
    # CUDA_VISIBLE_DEVICES is set before TensorFlow is loaded -- confirm.
    import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
    tf.enable_v2_tensorshape()
    tf.disable_eager_execution()

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; following the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }

        def _can_partition(spatial_dim):
            """Return True if `spatial_dim` is divisible by every partition dim."""
            return bool(
                np.all(spatial_dim % np.array(FLAGS.input_partition_dims) == 0))

        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):
            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    # Model params: the full config plus the run-specific flags.
    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    # First positional argument is the iterations-per-loop value; it is forced
    # to 1 when not running with the 'tpu' strategy.
    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)

    # With horovod only rank 0 is given the model directory.
    if FLAGS.strategy == 'horovod':
        model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    else:
        model_dir = FLAGS.model_dir

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
    use_tpu = (FLAGS.strategy == 'tpu')
    logging.info(params)

    def _train(steps):
        """Build a train estimator and train until global step `steps`."""
        train_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            is_training=True,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=max_instances_per_image),
                              max_steps=steps)

    def _eval(steps):
        """Build an eval estimator, run `steps` eval steps, return the results."""
        # Same params as training, with horizontal flipping and batch-norm
        # training switched off.
        eval_params = dict(
            params,
            strategy=FLAGS.strategy,
            input_rand_hflip=False,
            is_training_bn=False,
        )
        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)
        eval_results = eval_estimator.evaluate(input_fn=dataloader.InputReader(
            FLAGS.validation_file_pattern,
            is_training=False,
            max_instances_per_image=max_instances_per_image),
                                               steps=steps,
                                               name=FLAGS.eval_name)
        logging.info('Evaluation results: %s', eval_results)
        return eval_results

    # start train/eval flow.
    if FLAGS.mode == 'train':
        total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
        _train(total_examples // FLAGS.train_batch_size)
        if FLAGS.eval_after_training:
            _eval(eval_steps)

    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):

            logging.info('Starting to evaluate.')
            try:
                eval_results = _eval(eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except (IndexError, ValueError):
                    # No '-<step>' suffix, or the suffix is not an integer.
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (config.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

    elif FLAGS.mode == 'train_and_eval':
        # Work out which epoch to resume from using the latest checkpoint's
        # global step; `ckpt` is None (-> TypeError) when no checkpoint exists.
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        try:
            step = int(os.path.basename(ckpt).split("-")[1])
            current_epoch = (step * FLAGS.train_batch_size //
                             FLAGS.num_examples_per_epoch)
            logging.info('found ckpt at step %d (epoch %d)', step,
                         current_epoch)
        except (IndexError, TypeError, ValueError):
            logging.info('Folder %s has no ckpt with valid step.',
                         FLAGS.model_dir)
            current_epoch = 0

        epochs_per_cycle = 1  # higher number has less graph construction overhead.
        for e in range(current_epoch + 1, config.num_epochs + 1,
                       epochs_per_cycle):
            print('-----------------------------------------------------\n'
                  '=====> Starting training, epoch: %d.' % e)
            _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
            print('-----------------------------------------------------\n'
                  '=====> Starting evaluation, epoch: %d.' % e)
            eval_results = _eval(eval_steps)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Invalid mode: %s', FLAGS.mode)
Example #17
0
    def __init__(self, model_name=None, config=None, name=''):
        """Build the backbone, the feature network and the prediction heads.

        Args:
          model_name: name handed to `hparams_config.get_efficientdet_config`
            when `config` is falsy.
          config: optional config object; used as-is when truthy.
          name: name forwarded to the parent constructor.
        """
        super().__init__(name=name)

        if not config:
            config = hparams_config.get_efficientdet_config(model_name)
        self.config = config

        # Backbone (only built for efficientnet backbone names).
        backbone_name = config.backbone_name
        is_training_bn = config.is_training_bn
        if 'efficientnet' in backbone_name:
            batch_norm_cls = utils.batch_norm_class(is_training_bn,
                                                    config.strategy)
            relu_fn = functools.partial(utils.activation_fn,
                                        act_type=config.act_type)
            override_params = {'batch_norm': batch_norm_cls, 'relu_fn': relu_fn}
            if 'b0' in backbone_name:
                override_params['survival_prob'] = 0.0
            if config.backbone_config is not None:
                block_decoder = efficientnet_builder.BlockDecoder()
                override_params['blocks_args'] = block_decoder.encode(
                    config.backbone_config.blocks)
            override_params['data_format'] = config.data_format
            self.backbone = backbone_factory.get_model(
                backbone_name, override_params=override_params)

        # Feature network: one extra resampling layer per level from 6 up to
        # max_level, each adding a coarser level by downsampling the last
        # feature map.
        feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
        self.resample_layers = [
            ResampleFeatureMap(
                target_height=feat_sizes[level]['height'],
                target_width=feat_sizes[level]['width'],
                target_num_channels=config.fpn_num_filters,
                apply_bn=config.apply_bn_for_resampling,
                is_training_bn=config.is_training_bn,
                conv_after_downsample=config.conv_after_downsample,
                strategy=config.strategy,
                data_format=config.data_format,
                name='resample_p%d' % level,
            ) for level in range(6, config.max_level + 1)
        ]
        self.fpn_cells = FPNCells(config)

        # Class/box/segmentation prediction heads, as listed in config.heads.
        num_anchors = len(config.aspect_ratios) * config.num_scales
        num_filters = config.fpn_num_filters
        for head in config.heads:
            if head == 'object_detection':
                # Arguments shared by the class and box networks.
                detection_kwargs = dict(
                    num_anchors=num_anchors,
                    num_filters=num_filters,
                    min_level=config.min_level,
                    max_level=config.max_level,
                    is_training_bn=config.is_training_bn,
                    act_type=config.act_type,
                    repeats=config.box_class_repeats,
                    separable_conv=config.separable_conv,
                    survival_prob=config.survival_prob,
                    strategy=config.strategy,
                    data_format=config.data_format,
                )
                self.class_net = ClassNet(num_classes=config.num_classes,
                                          **detection_kwargs)
                self.box_net = BoxNet(**detection_kwargs)
            elif head == 'segmentation':
                self.seg_head = SegmentationHead(
                    num_classes=config.seg_num_classes,
                    num_filters=num_filters,
                    min_level=config.min_level,
                    max_level=config.max_level,
                    is_training_bn=config.is_training_bn,
                    act_type=config.act_type,
                    strategy=config.strategy,
                    data_format=config.data_format)
Example #18
0
def main(argv):
    """Run training and/or evaluation as selected by `FLAGS.mode`.

    Builds the TPUEstimator run configuration from command-line flags and the
    detection config for `FLAGS.model_name`, then dispatches on `FLAGS.mode`:

    * 'train': train up to the total step count; when
      `FLAGS.eval_after_training` is set, evaluate once and archive the
      latest checkpoint.
    * 'eval': evaluate each new checkpoint found in `FLAGS.model_dir` until a
      checkpoint at or past the final training step has been evaluated, or
      no checkpoint appears within `FLAGS.eval_timeout`.
    * 'train_and_eval': for each epoch, train one epoch's worth of steps,
      evaluate, and archive the latest checkpoint.

    Any other mode is only logged.

    Args:
      argv: unused.

    Raises:
      RuntimeError: if a required file pattern flag is missing, or if
        `--num_cores_per_replica` does not equal the product of
        `--input_partition_dims` when spatial partitioning is enabled.
    """
    del argv  # Unused.

    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; following the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }

        def _can_partition(spatial_dim):
            """Return True if `spatial_dim` is divisible by every partition dim."""
            return bool(
                np.all(spatial_dim % np.array(FLAGS.input_partition_dims) == 0))

        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):
            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    # Model params: the full config plus the run-specific flags.
    params = dict(
        config.as_dict(),
        model_name=FLAGS.model_name,
        iterations_per_loop=FLAGS.iterations_per_loop,
        model_dir=FLAGS.model_dir,
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        backbone_ckpt=FLAGS.backbone_ckpt,
        ckpt=FLAGS.ckpt,
        val_json_file=FLAGS.val_json_file,
        testdev_dir=FLAGS.testdev_dir,
        mode=FLAGS.mode,
    )
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.use_xla and not FLAGS.use_tpu:
        config_proto.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)

    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)
    if FLAGS.mode == 'train':
        train_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                            is_training=True,
                                            use_fake_data=FLAGS.use_fake_data),
            max_steps=int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size))

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                use_tpu=FLAGS.use_tpu,
                input_rand_hflip=False,
                is_training_bn=False,
                precision=None,
            )
            eval_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            logging.info('Eval results: %s', eval_results)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    elif FLAGS.mode == 'eval':
        # Override the training options for evaluation: disable random
        # horizontal flipping and batch-norm training, and reset `precision`.
        # The device and batch size still follow FLAGS.use_tpu and
        # FLAGS.eval_batch_size.
        eval_params = dict(
            params,
            use_tpu=FLAGS.use_tpu,
            input_rand_hflip=False,
            is_training_bn=False,
            precision=None,
        )

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            """Log the timeout and return True, passed as `timeout_fn` below."""
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except (IndexError, ValueError):
                    # No '-<step>' suffix, or the suffix is not an integer.
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (config.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(config.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)
            train_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            # `steps` (not `max_steps`): train one epoch's worth of additional
            # steps in every cycle.
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern,
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
                                  steps=int(FLAGS.num_examples_per_epoch /
                                            FLAGS.train_batch_size))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.
            # NOTE(review): unlike the other modes, `precision` is not reset
            # here -- confirm whether that is intentional.
            eval_params = dict(
                params,
                use_tpu=FLAGS.use_tpu,
                input_rand_hflip=False,
                is_training_bn=False,
            )

            eval_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            logging.info('Evaluation results: %s', eval_results)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Mode not found.')
Example #19
0
def build_feature_network(features, config):
  """Assemble per-level input features and run them through the FPN layer.

  Args:
   features: mapping indexed by level; must contain `config.min_level`.
   config: a dict-like config, including all parameters.

  Returns:
    The result of `build_bifpn_layer`, which is indexed by level for every
    level from `config.min_level` to `config.max_level`.

  Raises:
    ValueError: if `config.min_level` is not among the keys of `features`.
  """
  feat_sizes = utils.get_feat_sizes(config.image_size, config.max_level)
  feats = []
  if config.min_level not in features.keys():
    raise ValueError('features.keys ({}) should include min_level ({})'.format(
        features.keys(), config.min_level))

  # Collect one feature map per level; levels absent from `features` are
  # produced by downsampling the previously collected (finer) level.
  for level in range(config.min_level, config.max_level + 1):
    if level in features.keys():
      feats.append(features[level])
      continue

    if config.data_format == 'channels_first':
      h_id, w_id = 2, 3
    else:
      h_id, w_id = 1, 2
    previous = feats[-1]
    downsampled = resample_feature_map(
        previous,
        name='p%d' % level,
        target_height=(previous.shape[h_id] - 1) // 2 + 1,
        target_width=(previous.shape[w_id] - 1) // 2 + 1,
        target_num_channels=config.fpn_num_filters,
        apply_bn=config.apply_bn_for_resampling,
        is_training=config.is_training_bn,
        conv_after_downsample=config.conv_after_downsample,
        use_native_resize_op=config.use_native_resize_op,
        pooling_type=config.pooling_type,
        use_tpu=config.use_tpu,
        data_format=config.data_format)
    feats.append(downsampled)

  _verify_feats_size(
      feats,
      feat_sizes=feat_sizes,
      min_level=config.min_level,
      max_level=config.max_level,
      data_format=config.data_format)

  with tf.variable_scope('fpn_cells'):
    nodes = []
    node_ids = count(5)

    # One list per level key '3' .. '7', passed to build_fpn / connect_fpn.
    ends = {str(level_key): [] for level_key in range(3, 8)}

    for _ in range(config.fpn_cell_repeats):
      build_fpn(nodes, ends, node_ids)
    connect_fpn(nodes, ends)

    fpn_config = hparams_config.Config()
    fpn_config.nodes = nodes
    fpn_config.weight_method = 'fastattn'

    new_feats = build_bifpn_layer(feats, feat_sizes, config, fpn_config)

    # Re-collect the outputs per level only to validate their sizes.
    feats = [
        new_feats[level]
        for level in range(config.min_level, config.max_level + 1)
    ]

    _verify_feats_size(
        feats,
        feat_sizes=feat_sizes,
        min_level=config.min_level,
        max_level=config.max_level,
        data_format=config.data_format)

  return new_feats