def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    Raises:
      ValueError if conv_defs is not provided or from_layer does not meet the
        size requirement.
    """

        if not self._conv_defs:
            raise ValueError('Must provide backbone conv defs.')

        if len(self._from_layer) != 2:
            raise ValueError('SSD input feature names are not provided.')

        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        feature_map_layout = {
            'from_layer':
            [self._from_layer[0], self._from_layer[1], '', '', '', ''],
            'layer_depth': [-1, -1, 512, 256, 256, 128],
            'use_depthwise':
            self._use_depthwise,
            'use_explicit_padding':
            self._use_explicit_padding,
        }

        with tf.variable_scope(self._scope_name,
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
                slim.arg_scope(
                    [mobilenet.depth_multiplier], min_depth=self._min_depth):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = mobilenet_v3.mobilenet_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        conv_defs=self._conv_defs,
                        final_endpoint=self._from_layer[1],
                        depth_multiplier=self._depth_multiplier,
                        use_explicit_padding=self._use_explicit_padding,
                        scope=scope)
                with slim.arg_scope(self._conv_hyperparams_fn()):
                    feature_maps = feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features)

        return list(feature_maps.values())
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        feature_map_layout = {
            'from_layer':
            ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
            'layer_depth': [-1, -1, 512, 256, 256, 128],
            'use_depthwise': self._use_depthwise,
            'use_explicit_padding': self._use_explicit_padding,
        }

        if self._num_layers == 7:
            feature_map_layout['from_layer'] += ['']
            feature_map_layout['layer_depth'] += [64]

        with tf.variable_scope('MobilenetV3',
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
                slim.arg_scope(
                    [mobilenet.depth_multiplier], min_depth=self._min_depth):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = mobilenet_v3.mobilenet_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='layer_19',
                        depth_multiplier=self._depth_multiplier,
                        use_explicit_padding=self._use_explicit_padding,
                        squeeze_excitation=self._squeeze_excitation,
                        scope=scope)
                with slim.arg_scope(self._conv_hyperparams_fn()):
                    feature_maps = feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features)

        return feature_maps.values()
Esempio n. 3
0
def _mobilenet_v3(net,
                  depth_multiplier,
                  output_stride,
                  conv_defs=None,
                  divisible_by=None,
                  reuse=None,
                  scope=None,
                  final_endpoint=None):
    """Auxiliary function to build mobilenet v3.

  Args:
    net: Input tensor of shape [batch_size, height, width, channels].
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous convolution
      if necessary to prevent the network from reducing the spatial resolution
      of the activation maps. Allowed values are 8 (accurate fully convolutional
      mode), 16 (fast fully convolutional mode), 32 (classification mode).
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    divisible_by: None (use default setting) or an integer that ensures all
      layers # channels will be divisible by this number. Used in MobileNet.
    reuse: Reuse model variables.
    scope: Optional variable scope.
    final_endpoint: The endpoint to construct the network up to.

  Returns:
    net: The output tensor.
    end_points: A set of activations for external use.

  Raises:
    ValueError: If conv_defs or final_endpoint is not specified.
  """
    del divisible_by
    with tf.variable_scope(scope, 'MobilenetV3', [net], reuse=reuse) as scope:
        if conv_defs is None:
            raise ValueError('conv_defs must be specified for mobilenet v3.')
        if final_endpoint is None:
            raise ValueError(
                'Final endpoint must be specified for mobilenet v3.')
        net, end_points = mobilenet_v3.mobilenet_base(
            net,
            depth_multiplier=depth_multiplier,
            conv_defs=conv_defs,
            output_stride=output_stride,
            final_endpoint=final_endpoint,
            scope=scope)

        return net, end_points
    def _extract_proposal_features(self, preprocessed_inputs, scope):
        """Extracts first stage RPN features.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name.

    Returns:
      rpn_feature_map: A tensor with shape [batch, height, width, depth]
      activations: A dictionary mapping feature extractor tensor names to
        tensors

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33.
      ValueError: If the created network is missing the required activation.
    """

        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        with tf.variable_scope('MobilenetV3',
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
                slim.arg_scope(
                    [mobilenet.depth_multiplier], min_depth=self._min_depth):
                _, activations = mobilenet_v3.mobilenet_base(
                    preprocessed_inputs,
                    conv_defs=mobilenet_v3.V3_LARGE_DETECTION,
                    final_endpoint='layer_17',
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)

        return activations['layer_17'], activations