Example #1
 def test_die_on_tensor_shape_with_rank_three(self):
   tensor_shape = tf.TensorShape(dims=[32, 299, 384])
   with self.assertRaises(ValueError):
     static_shape.get_batch_size(tensor_shape)
   with self.assertRaises(ValueError):
     static_shape.get_height(tensor_shape)
   with self.assertRaises(ValueError):
     static_shape.get_width(tensor_shape)
   with self.assertRaises(ValueError):
     static_shape.get_depth(tensor_shape)
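The static_shape helpers used throughout these examples are not shown. From the test above (and Example #10 further down), they each read one dimension of a rank-4 [batch_size, height, width, channels] TensorShape and raise ValueError for any other rank. A minimal sketch with that behaviour, not necessarily the library's actual implementation:

import tensorflow as tf

def _get_dim(tensor_shape, index):
  """Returns one dimension of a rank-4 TensorShape; raises ValueError otherwise."""
  tensor_shape.assert_has_rank(4)
  return tensor_shape.as_list()[index]

def get_batch_size(tensor_shape):
  return _get_dim(tensor_shape, 0)

def get_height(tensor_shape):
  return _get_dim(tensor_shape, 1)

def get_width(tensor_shape):
  return _get_dim(tensor_shape, 2)

def get_depth(tensor_shape):
  return _get_dim(tensor_shape, 3)

# get_depth(tf.TensorShape([32, 299, 384, 3])) == 3, matching Example #10;
# each getter raises ValueError for the rank-3 shape in the test above.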
Example #2
  def build(self, input_shapes):
    """Creates the variables of the layer."""
    if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]):
      raise ValueError('This box predictor was constructed with %d heads, '
                       'but there are %d inputs.' %
                       (len(self._prediction_heads[BOX_ENCODINGS]),
                        len(input_shapes)))
    for stack_index, input_shape in enumerate(input_shapes):
      net = []

      # Add additional conv layers before the class predictor.
      features_depth = static_shape.get_depth(input_shape)
      depth = max(min(features_depth, self._max_depth), self._min_depth)
      tf.logging.info(
          'depth of additional conv before box predictor: {}'.format(depth))

      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net.append(keras.Conv2D(depth, [1, 1],
                                  name='SharedConvolutions_%d/Conv2d_%d_1x1_%d'
                                  % (stack_index, i, depth),
                                  padding='SAME',
                                  **self._conv_hyperparams.params()))
          net.append(self._conv_hyperparams.build_batch_norm(
              training=(self._is_training and not self._freeze_batchnorm),
              name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm'
              % (stack_index, i, depth)))
          net.append(self._conv_hyperparams.build_activation_layer(
              name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation'
              % (stack_index, i, depth),
          ))
      # Until certain bugs are fixed in checkpointable lists,
      # this net must be appended only once it's been filled with layers
      self._shared_nets.append(net)
    self.built = True
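The line depth = max(min(features_depth, self._max_depth), self._min_depth), which recurs in most of the predictors below, simply clamps the feature depth into the [min_depth, max_depth] range before any extra 1x1 conv layers are added. A standalone illustration with made-up numbers:

def clamp_depth(features_depth, min_depth, max_depth):
  """Clamps the extra-conv depth into [min_depth, max_depth]."""
  return max(min(features_depth, max_depth), min_depth)

assert clamp_depth(1024, min_depth=0, max_depth=512) == 512   # capped at max_depth
assert clamp_depth(64, min_depth=128, max_depth=512) == 128   # raised to min_depth
assert clamp_depth(256, min_depth=0, max_depth=512) == 256    # already in range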
Example #3
def pad_to_multiple(tensor, multiple):
  """Returns the tensor zero padded to the specified multiple.

  Appends 0s to the end of the first and second dimension (height and width) of
  the tensor until both dimensions are a multiple of the input argument
  'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input
  multiple of 4, PadToMultiple will append 0s so that the resulting tensor will
  be of shape [1, 4, 8, 1].

  Args:
    tensor: rank 4 float32 tensor, where
            tensor -> [batch_size, height, width, channels].
    multiple: the multiple to pad to.

  Returns:
    padded_tensor: the tensor zero padded to the specified multiple.
  """
  tensor_shape = tensor.get_shape()
  batch_size = static_shape.get_batch_size(tensor_shape)
  tensor_height = static_shape.get_height(tensor_shape)
  tensor_width = static_shape.get_width(tensor_shape)
  tensor_depth = static_shape.get_depth(tensor_shape)

  if batch_size is None:
    batch_size = tf.shape(tensor)[0]

  if tensor_height is None:
    tensor_height = tf.shape(tensor)[1]
    padded_tensor_height = tf.to_int32(
        tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_height = int(
        math.ceil(float(tensor_height) / multiple) * multiple)

  if tensor_width is None:
    tensor_width = tf.shape(tensor)[2]
    padded_tensor_width = tf.to_int32(
        tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_width = int(
        math.ceil(float(tensor_width) / multiple) * multiple)

  if tensor_depth is None:
    tensor_depth = tf.shape(tensor)[3]

  # Use tf.concat instead of tf.pad to preserve static shape
  if padded_tensor_height != tensor_height:
    height_pad = tf.zeros([
        batch_size, padded_tensor_height - tensor_height, tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, height_pad], 1)
  if padded_tensor_width != tensor_width:
    width_pad = tf.zeros([
        batch_size, padded_tensor_height, padded_tensor_width - tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, width_pad], 2)

  return tensor
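A short usage sketch of pad_to_multiple, reproducing the shapes from its docstring. It assumes a TF 1.x session environment (the function itself uses tf.to_int32 and tf.ceil) and the math/static_shape imports of the original module:

import numpy as np
import tensorflow as tf

# Docstring example: [1, 3, 5, 1] padded to a multiple of 4 becomes [1, 4, 8, 1].
image = tf.constant(np.ones((1, 3, 5, 1), dtype=np.float32))
padded = pad_to_multiple(image, multiple=4)

with tf.Session() as sess:
  print(sess.run(padded).shape)  # (1, 4, 8, 1); the appended rows and columns are zeros.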
Example #5
  def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    features_depth = static_shape.get_depth(image_features.get_shape())
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor.
      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net = slim.conv2d(
              net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')
        if self._use_dropout:
          net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
        class_predictions_with_background = slim.conv2d(
            net, num_predictions_per_location * num_class_slots,
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')
        if self._apply_sigmoid_to_scores:
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

    batch_size = static_shape.get_batch_size(image_features.get_shape())
    if batch_size is None:
      features_height = static_shape.get_height(image_features.get_shape())
      features_width = static_shape.get_width(image_features.get_shape())
      flattened_predictions_size = (features_height * features_width *
                                    num_predictions_per_location)
      box_encodings = tf.reshape(
          box_encodings,
          [-1, flattened_predictions_size, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [-1, flattened_predictions_size, num_class_slots])
    else:
      box_encodings = tf.reshape(
          box_encodings, [batch_size, -1, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background, [batch_size, -1, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
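The final reshapes rely on num_anchors = feat_height * feat_width * num_predictions_per_location, as stated in the docstring, so the conv output can be reshaped losslessly. A quick check with hypothetical sizes:

# Hypothetical sizes, purely for illustration.
feat_height, feat_width = 19, 19
num_predictions_per_location = 6
box_code_size = 4

num_anchors = feat_height * feat_width * num_predictions_per_location  # 2166
# The BoxEncodingPredictor output [batch, 19, 19, 6 * 4] and the reshaped
# [batch, num_anchors, 1, box_code_size] hold the same number of values per image:
assert feat_height * feat_width * (num_predictions_per_location * box_code_size) == \
    num_anchors * 1 * box_code_size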
Example #6
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net_cls = image_features
        net_reg = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    if self._use_depthwise_before_predictor:
                        net_cls = slim.separable_conv2d(
                            net_cls,
                            None, [3, 3],
                            depth_multiplier=1,
                            padding='SAME',
                            reuse=tf.AUTO_REUSE,
                            scope='Conv2d_cls_%d_3x3_%d_depthwise' %
                            (i, depth))
                        net_cls = slim.conv2d(net_cls,
                                              depth, [1, 1],
                                              padding='SAME',
                                              reuse=tf.AUTO_REUSE,
                                              scope='Conv2d_cls_%d_1x1_%d' %
                                              (i, depth))
                        net_reg = slim.separable_conv2d(
                            net_reg,
                            None, [3, 3],
                            depth_multiplier=1,
                            padding='SAME',
                            reuse=tf.AUTO_REUSE,
                            scope='Conv2d_reg_%d_3x3_%d_depthwise' %
                            (i, depth))
                        net_reg = slim.conv2d(net_reg,
                                              depth, [1, 1],
                                              padding='SAME',
                                              reuse=tf.AUTO_REUSE,
                                              scope='Conv2d_reg_%d_1x1_%d' %
                                              (i, depth))
                    else:
                        net_cls = slim.conv2d(net_cls,
                                              depth, [3, 3],
                                              padding='SAME',
                                              reuse=tf.AUTO_REUSE,
                                              scope='Conv2d_cls_%d_3x3_%d' %
                                              (i, depth))
                        net_reg = slim.conv2d(net_reg,
                                              depth, [3, 3],
                                              padding='SAME',
                                              reuse=tf.AUTO_REUSE,
                                              scope='Conv2d_reg_%d_3x3_%d' %
                                              (i, depth))

            with slim.arg_scope([slim.conv2d],
                                activation_fn=None,
                                normalizer_fn=None,
                                normalizer_params=None):
                box_encodings = slim.conv2d(
                    net_reg,
                    num_predictions_per_location * self._box_code_size,
                    [self._kernel_size, self._kernel_size],
                    reuse=tf.AUTO_REUSE,
                    scope='BoxEncodingPredictor')
                if self._use_dropout:
                    net_cls = slim.dropout(net_cls,
                                           keep_prob=self._dropout_keep_prob)
                class_predictions_with_background = slim.conv2d(
                    net_cls,
                    num_predictions_per_location * num_class_slots,
                    [self._kernel_size, self._kernel_size],
                    reuse=tf.AUTO_REUSE,
                    scope='ClassPredictor')
                if self._apply_sigmoid_to_scores:
                    class_predictions_with_background = tf.sigmoid(
                        class_predictions_with_background)

        combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
            image_features)
        box_encodings = tf.reshape(
            box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, self._box_code_size
            ]))
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                num_class_slots
            ]))
        return {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
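This and several later examples reshape their outputs via shape_utils.combined_static_and_dynamic_shape, which is not shown here. A minimal sketch of a helper with that behaviour (static dimensions where known, tf.shape slices where not); the real utility may differ:

import tensorflow as tf

def combined_static_and_dynamic_shape(tensor):
  """Returns a list mixing static dims (Python ints) and dynamic dims (scalar tensors)."""
  static_shape = tensor.shape.as_list()
  dynamic_shape = tf.shape(tensor)
  return [dim if dim is not None else dynamic_shape[index]
          for index, dim in enumerate(static_shape)]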
Example #7
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
        features_height = static_shape.get_height(image_features.get_shape())
        features_width = static_shape.get_width(image_features.get_shape())
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))

            with tf.variable_scope('gradientconv2d'):
                box_encodings = gradient_conv2d411(
                    inputs=net,
                    inputs_hwc=[
                        features_height, features_width, features_depth
                    ],
                    num_outputs=num_predictions_per_location *
                    self._box_code_size,
                    scope='BoxEncodingPredictor')
                if self._use_dropout:
                    net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
                class_predictions_with_background = gradient_conv2d411(
                    inputs=net,
                    inputs_hwc=[
                        features_height, features_width, features_depth
                    ],
                    num_outputs=num_predictions_per_location * num_class_slots,
                    scope='ClassPredictor')
                if self._apply_sigmoid_to_scores:
                    class_predictions_with_background = tf.sigmoid(
                        class_predictions_with_background)

        combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
            image_features)
        box_encodings = tf.reshape(
            box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, self._box_code_size
            ]))
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                num_class_slots
            ]))
        return {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
Example #8
    def _predict(self, image_features, num_predictions_per_location_list):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map.

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.
    """
        box_encodings_list = []
        class_predictions_list = []
        # TODO(rathodv): Come up with a better way to generate scope names
        # in box predictor once we have time to retrain all models in the zoo.
        # The following lines create scope names to be backwards compatible with the
        # existing checkpoints.
        box_predictor_scopes = [_NoopVariableScope()]
        if len(image_features) > 1:
            box_predictor_scopes = [
                tf.variable_scope('BoxPredictor_{}'.format(i))
                for i in range(len(image_features))
            ]

        for (image_feature, num_predictions_per_location,
             box_predictor_scope) in zip(image_features,
                                         num_predictions_per_location_list,
                                         box_predictor_scopes):
            with box_predictor_scope:
                # Add a slot for the background class.
                num_class_slots = self.num_classes + 1
                net = image_feature
                with slim.arg_scope(self._conv_hyperparams_fn()), \
                     slim.arg_scope([slim.dropout], is_training=self._is_training):
                    # Add additional conv layers before the class predictor.
                    features_depth = static_shape.get_depth(
                        image_feature.get_shape())
                    depth = max(min(features_depth, self._max_depth),
                                self._min_depth)
                    tf.logging.info(
                        'depth of additional conv before box predictor: {}'.
                        format(depth))
                    if depth > 0 and self._num_layers_before_predictor > 0:
                        for i in range(self._num_layers_before_predictor):
                            net = slim.conv2d(net,
                                              depth, [1, 1],
                                              scope='Conv2d_%d_1x1_%d' %
                                              (i, depth))
                    with slim.arg_scope([slim.conv2d],
                                        activation_fn=None,
                                        normalizer_fn=None,
                                        normalizer_params=None):
                        if self._use_depthwise:
                            box_encodings = slim.separable_conv2d(
                                net,
                                None, [self._kernel_size, self._kernel_size],
                                padding='SAME',
                                depth_multiplier=1,
                                stride=1,
                                rate=1,
                                scope='BoxEncodingPredictor_depthwise')
                            box_encodings = slim.conv2d(
                                box_encodings,
                                num_predictions_per_location *
                                self._box_code_size, [1, 1],
                                scope='BoxEncodingPredictor')
                        else:
                            box_encodings = slim.conv2d(
                                net,
                                num_predictions_per_location *
                                self._box_code_size,
                                [self._kernel_size, self._kernel_size],
                                scope='BoxEncodingPredictor')
                        if self._use_dropout:
                            net = slim.dropout(
                                net, keep_prob=self._dropout_keep_prob)
                        if self._use_depthwise:
                            class_predictions_with_background = slim.separable_conv2d(
                                net,
                                None, [self._kernel_size, self._kernel_size],
                                padding='SAME',
                                depth_multiplier=1,
                                stride=1,
                                rate=1,
                                scope='ClassPredictor_depthwise')
                            class_predictions_with_background = slim.conv2d(
                                class_predictions_with_background,
                                num_predictions_per_location * num_class_slots,
                                [1, 1],
                                scope='ClassPredictor')
                        else:
                            class_predictions_with_background = slim.conv2d(
                                net,
                                num_predictions_per_location * num_class_slots,
                                [self._kernel_size, self._kernel_size],
                                scope='ClassPredictor',
                                biases_initializer=tf.constant_initializer(
                                    self._class_prediction_bias_init))
                        if self._apply_sigmoid_to_scores:
                            class_predictions_with_background = tf.sigmoid(
                                class_predictions_with_background)

                combined_feature_map_shape = (
                    shape_utils.combined_static_and_dynamic_shape(
                        image_feature))
                box_encodings = tf.reshape(
                    box_encodings,
                    tf.stack([
                        combined_feature_map_shape[0],
                        combined_feature_map_shape[1] *
                        combined_feature_map_shape[2] *
                        num_predictions_per_location, 1, self._box_code_size
                    ]))
                box_encodings_list.append(box_encodings)
                class_predictions_with_background = tf.reshape(
                    class_predictions_with_background,
                    tf.stack([
                        combined_feature_map_shape[0],
                        combined_feature_map_shape[1] *
                        combined_feature_map_shape[2] *
                        num_predictions_per_location, num_class_slots
                    ]))
                class_predictions_list.append(
                    class_predictions_with_background)
        return {
            BOX_ENCODINGS: box_encodings_list,
            CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list
        }
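Examples #8, #11, #12 and #14 take box_predictor_scope from _NoopVariableScope() when there is only a single feature map, so it must behave as a do-nothing context manager. A minimal sketch consistent with that usage (the real class may differ):

class _NoopVariableScope(object):
  """A context manager that pushes no variable scope at all."""

  def __enter__(self):
    return None

  def __exit__(self, exc_type, exc_value, traceback):
    return False  # do not suppress exceptions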
Example #9
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_features

        end_points_collection = self._scope.name + '_end_points'
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training), \
             slim.arg_scope([slim.conv2d], trainable=self._is_training,
                            outputs_collections=end_points_collection):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))
            with slim.arg_scope([slim.conv2d],
                                activation_fn=None,
                                normalizer_fn=None,
                                normalizer_params=None):
                box_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * self._box_code_size,
                    [self._kernel_size, self._kernel_size],
                    scope='BoxEncodingPredictor')
                if self._use_dropout:
                    net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
                class_predictions_with_background = slim.conv2d(
                    net,
                    num_predictions_per_location * num_class_slots,
                    [self._kernel_size, self._kernel_size],
                    scope='ClassPredictor')
                if self._apply_sigmoid_to_scores:
                    class_predictions_with_background = tf.sigmoid(
                        class_predictions_with_background)

        combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
            image_features)
        box_encodings = tf.reshape(
            box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, self._box_code_size
            ]))
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                num_class_slots
            ]))

        # TODO: If TF's version is updated, just use clear_collection argument for
        # convert_collection_to_dict (current: 1.2.1).
        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)
        framework_ops.get_default_graph().clear_collection(
            end_points_collection)

        return {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
Example #10
 def test_return_correct_depth(self):
   tensor_shape = tf.TensorShape(dims=[32, 299, 384, 3])
   self.assertEqual(3, static_shape.get_depth(tensor_shape))
Example #11
  def _predict(self, image_features, num_predictions_per_location_list):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map.

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.
    """
    box_encodings_list = []
    class_predictions_list = []
    # TODO(rathodv): Come up with a better way to generate scope names
    # in box predictor once we have time to retrain all models in the zoo.
    # The following lines create scope names to be backwards compatible with the
    # existing checkpoints.
    box_predictor_scopes = [_NoopVariableScope()]
    if len(image_features) > 1:
      box_predictor_scopes = [
          tf.variable_scope('BoxPredictor_{}'.format(i))
          for i in range(len(image_features))
      ]

    for (image_feature,
         num_predictions_per_location, box_predictor_scope) in zip(
             image_features, num_predictions_per_location_list,
             box_predictor_scopes):
      with box_predictor_scope:
        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_feature
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
          # Add additional conv layers before the class predictor.
          features_depth = static_shape.get_depth(image_feature.get_shape())
          depth = max(min(features_depth, self._max_depth), self._min_depth)
          tf.logging.info('depth of additional conv before box predictor: {}'.
                          format(depth))
          if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
              net = slim.conv2d(
                  net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
          with slim.arg_scope([slim.conv2d], activation_fn=None,
                              normalizer_fn=None, normalizer_params=None):
            if self._use_depthwise:
              box_encodings = slim.separable_conv2d(
                  net, None, [self._kernel_size, self._kernel_size],
                  padding='SAME', depth_multiplier=1, stride=1,
                  rate=1, scope='BoxEncodingPredictor_depthwise')
              box_encodings = slim.conv2d(
                  box_encodings,
                  num_predictions_per_location * self._box_code_size, [1, 1],
                  scope='BoxEncodingPredictor')
            else:
              box_encodings = slim.conv2d(
                  net, num_predictions_per_location * self._box_code_size,
                  [self._kernel_size, self._kernel_size],
                  scope='BoxEncodingPredictor')
            if self._use_dropout:
              net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            if self._use_depthwise:
              class_predictions_with_background = slim.separable_conv2d(
                  net, None, [self._kernel_size, self._kernel_size],
                  padding='SAME', depth_multiplier=1, stride=1,
                  rate=1, scope='ClassPredictor_depthwise')
              class_predictions_with_background = slim.conv2d(
                  class_predictions_with_background,
                  num_predictions_per_location * num_class_slots,
                  [1, 1], scope='ClassPredictor')
            else:
              class_predictions_with_background = slim.conv2d(
                  net, num_predictions_per_location * num_class_slots,
                  [self._kernel_size, self._kernel_size],
                  scope='ClassPredictor',
                  biases_initializer=tf.constant_initializer(
                      self._class_prediction_bias_init))
            if self._apply_sigmoid_to_scores:
              class_predictions_with_background = tf.sigmoid(
                  class_predictions_with_background)

        combined_feature_map_shape = (shape_utils.
                                      combined_static_and_dynamic_shape(
                                          image_feature))
        box_encodings = tf.reshape(
            box_encodings, tf.stack([combined_feature_map_shape[0],
                                     combined_feature_map_shape[1] *
                                     combined_feature_map_shape[2] *
                                     num_predictions_per_location,
                                     1, self._box_code_size]))
        box_encodings_list.append(box_encodings)
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([combined_feature_map_shape[0],
                      combined_feature_map_shape[1] *
                      combined_feature_map_shape[2] *
                      num_predictions_per_location,
                      num_class_slots]))
        class_predictions_list.append(class_predictions_with_background)
    return {
        BOX_ENCODINGS: box_encodings_list,
        CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list
    }
Example #12
    def _predict_class(self,
                       image_features,
                       activation_fn=None,
                       with_background=False,
                       scope=None):
        """Computes encoded object classes (without background).

    Flattens image_features and applies fully connected ops (with no
    non-linearity) to predict class predictions.  In this
    setting, anchors are not spatially arranged in any way and are assumed to
    have been folded into the batch dimension.  Thus we output 1 for the
    anchors dimension.

    Args:
      image_features: A float tensor of shape [batch_size, height, width, channels]
      containing features for a batch of images.

    Returns:
      A dictionary containing the following tensors.
        class_predictions: A float tensor of shape
          [batch_size, 1, num_classes] representing the class
          predictions for the proposals.
      If predict_masks is True the dictionary also contains:
        instance_masks: A float tensor of shape
          [batch_size, 1, num_classes, image_height, image_width]
      If predict_keypoints is True the dictionary also contains:
        keypoints: [batch_size, 1, num_keypoints, 2]

    Raises:
      ValueError: if num_predictions_per_location is not 1.
    """
        num_classes = self.num_classes
        if with_background:
            num_classes += 1
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        roi_features = image_features
        if self._spatial_average:
            roi_features = tf.reduce_mean(roi_features, [1, 2],
                                          keep_dims=True,
                                          name='AvgPool')

        # net = slim.flatten(roi_features)
        n_batch = roi_features.get_shape().as_list()[0]
        if n_batch is None:
            is_empty = tf.equal(tf.size(roi_features), 0)
            h = roi_features.get_shape().as_list()[1]
            w = roi_features.get_shape().as_list()[2]
            c = roi_features.get_shape().as_list()[3]
            net = tf.cond(is_empty,
                          lambda: tf.zeros([0, h * w * c], tf.float32),
                          lambda: slim.flatten(roi_features))
        else:
            net = slim.flatten(roi_features)

        end_points_collection = self._scope.name + '_end_points'
        with slim.arg_scope(self._fc_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training), \
             slim.arg_scope([slim.fully_connected], trainable=self._is_training,
                            outputs_collections=end_points_collection):
            # Add additional fc layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.fully_connected(net,
                                               depth,
                                               scope='FC_%d_%d' % (i, depth))
                    if self._use_dropout:
                        net = slim.dropout(net,
                                           keep_prob=self._dropout_keep_prob)
            class_predictions = slim.fully_connected(
                net,
                num_classes,
                activation_fn=activation_fn,
                scope='ClassPredictor')
        class_predictions = tf.reshape(class_predictions, [-1, 1, num_classes])

        if with_background:
            predictions_dict = {
                CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions
            }
        else:
            predictions_dict = {CLASS_PREDICTIONS: class_predictions}

        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)
        framework_ops.get_default_graph().clear_collection(
            end_points_collection)
        return predictions_dict
Example #13
    def _predict(self,
                 image_features,
                 num_predictions_per_location_list,
                 audio_features=None):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map.

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.
    """
        predictions = {
            BOX_ENCODINGS: [],
            CLASS_PREDICTIONS_WITH_BACKGROUND: [],
        }
        for head_name in self._other_heads.keys():
            predictions[head_name] = []
        # TODO(rathodv): Come up with a better way to generate scope names
        # in box predictor once we have time to retrain all models in the zoo.
        # The following lines create scope names to be backwards compatible with the
        # existing checkpoints.
        box_predictor_scopes = [_NoopVariableScope()]
        if len(image_features) > 1:
            box_predictor_scopes = [
                tf.variable_scope('BoxPredictor_{}'.format(i))
                for i in range(len(image_features))
            ]

        if audio_features is not None:
            for (image_feature, num_predictions_per_location,
                 box_predictor_scope) in zip(
                     image_features, num_predictions_per_location_list,
                     box_predictor_scopes):
                net = image_feature
                audio_feature = audio_features['fc5']
                with box_predictor_scope:
                    with slim.arg_scope(self._conv_hyperparams_fn()):
                        with slim.arg_scope([slim.dropout],
                                            is_training=self._is_training):
                            # Add additional conv layers before the class predictor.
                            features_depth = static_shape.get_depth(
                                image_feature.get_shape())
                            depth = max(min(features_depth, self._max_depth),
                                        self._min_depth)
                            tf.logging.info(
                                'depth of additional conv before box predictor: {}'
                                .format(depth))
                            if depth > 0 and self._num_layers_before_predictor > 0:
                                for i in range(
                                        self._num_layers_before_predictor):
                                    net = slim.conv2d(
                                        net,
                                        depth, [1, 1],
                                        reuse=tf.AUTO_REUSE,
                                        scope='Conv2d_%d_1x1_%d' % (i, depth))

                            # M: video + audio
                            shape = net.get_shape()
                            print("before fusion: feature_map",
                                  net.get_shape())
                            print("before fusion: audio_feature",
                                  audio_feature.get_shape())
                            # for test
                            #inputs = np.random.rand(4,1,1,256).astype(np.float32)
                            #audio_feature = [tf.placeholder_with_default(v, v.shape) for v in inputs]
                            extended_audio = tf.tile(
                                audio_feature, [1, shape[1], shape[2], 1])
                            f_net = tf.concat([net, extended_audio], 3)
                            print("after fusion: new feature_map",
                                  f_net.get_shape())

                            # do 1 x 1 convolution
                            # check out the code in the old model
                            #f_net = slim.conv2d(f_net, shape[3], [1, 1], scope='fusion')

                            sorted_keys = sorted(self._other_heads.keys())
                            sorted_keys.append(BOX_ENCODINGS)
                            sorted_keys.append(
                                CLASS_PREDICTIONS_WITH_BACKGROUND)
                            for head_name in sorted_keys:
                                if head_name == BOX_ENCODINGS:
                                    head_obj = self._box_prediction_head
                                    prediction = head_obj.predict(
                                        features=net,
                                        num_predictions_per_location=
                                        num_predictions_per_location)
                                elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                                    head_obj = self._class_prediction_head
                                    prediction = head_obj.predict(
                                        features=f_net,
                                        num_predictions_per_location=
                                        num_predictions_per_location)
                                else:
                                    head_obj = self._other_heads[head_name]
                                    prediction = head_obj.predict(
                                        features=net,
                                        num_predictions_per_location=
                                        num_predictions_per_location)

                                predictions[head_name].append(prediction)

        else:
            for (image_feature, num_predictions_per_location,
                 box_predictor_scope) in zip(
                     image_features, num_predictions_per_location_list,
                     box_predictor_scopes):
                net = image_feature
                with box_predictor_scope:
                    with slim.arg_scope(self._conv_hyperparams_fn()):
                        with slim.arg_scope([slim.dropout],
                                            is_training=self._is_training):
                            # Add additional conv layers before the class predictor.
                            features_depth = static_shape.get_depth(
                                image_feature.get_shape())
                            depth = max(min(features_depth, self._max_depth),
                                        self._min_depth)
                            tf.logging.info(
                                'depth of additional conv before box predictor: {}'
                                .format(depth))
                            if depth > 0 and self._num_layers_before_predictor > 0:
                                for i in range(
                                        self._num_layers_before_predictor):
                                    net = slim.conv2d(
                                        net,
                                        depth, [1, 1],
                                        reuse=tf.AUTO_REUSE,
                                        scope='Conv2d_%d_1x1_%d' % (i, depth))

                            sorted_keys = sorted(self._other_heads.keys())
                            sorted_keys.append(BOX_ENCODINGS)
                            sorted_keys.append(
                                CLASS_PREDICTIONS_WITH_BACKGROUND)
                            for head_name in sorted_keys:
                                if head_name == BOX_ENCODINGS:
                                    head_obj = self._box_prediction_head
                                elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                                    head_obj = self._class_prediction_head
                                else:
                                    head_obj = self._other_heads[head_name]

                                prediction = head_obj.predict(
                                    features=net,
                                    num_predictions_per_location=
                                    num_predictions_per_location)
                                predictions[head_name].append(prediction)

        return predictions
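The audio/visual fusion above tiles a [batch, 1, 1, audio_channels] audio embedding over every spatial location of the image feature map and concatenates along the channel axis. A standalone sketch of that step with made-up shapes:

import tensorflow as tf

# Hypothetical shapes: a 19x19x512 feature map and a 1x1x256 audio embedding.
image_feature = tf.ones([2, 19, 19, 512])
audio_feature = tf.ones([2, 1, 1, 256])

shape = image_feature.get_shape().as_list()
extended_audio = tf.tile(audio_feature, [1, shape[1], shape[2], 1])  # -> [2, 19, 19, 256]
fused = tf.concat([image_feature, extended_audio], 3)                # -> [2, 19, 19, 768]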
Example #14
  def _predict(self, image_features, num_predictions_per_location_list):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map.

    Returns:
      A dictionary containing:
        box_encodings: A list of float tensors of shape
          [batch_size, num_anchors_i, q, code_size] representing the location of
          the objects, where q is 1 or the number of classes. Each entry in the
          list corresponds to a feature map in the input `image_features` list.
        class_predictions_with_background: A list of float tensors of shape
          [batch_size, num_anchors_i, num_classes + 1] representing the class
          predictions for the proposals. Each entry in the list corresponds to a
          feature map in the input `image_features` list.
        (optional) Predictions from other heads.
    """
    predictions = {
        BOX_ENCODINGS: [],
        CLASS_PREDICTIONS_WITH_BACKGROUND: [],
    }
    for head_name in self._other_heads.keys():
      predictions[head_name] = []
    # TODO(rathodv): Come up with a better way to generate scope names
    # in box predictor once we have time to retrain all models in the zoo.
    # The following lines create scope names to be backwards compatible with the
    # existing checkpoints.
    box_predictor_scopes = [_NoopVariableScope()]
    if len(image_features) > 1:
      box_predictor_scopes = [
          tf.variable_scope('BoxPredictor_{}'.format(i))
          for i in range(len(image_features))
      ]
    for (image_feature,
         num_predictions_per_location, box_predictor_scope) in zip(
             image_features, num_predictions_per_location_list,
             box_predictor_scopes):
      net = image_feature
      with box_predictor_scope:
        with slim.arg_scope(self._conv_hyperparams_fn()):
          with slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the class predictor.
            features_depth = static_shape.get_depth(image_feature.get_shape())
            depth = max(min(features_depth, self._max_depth), self._min_depth)
            tf.logging.info('depth of additional conv before box predictor: {}'.
                            format(depth))
            if depth > 0 and self._num_layers_before_predictor > 0:
              for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net,
                    depth, [1, 1],
                    reuse=tf.AUTO_REUSE,
                    scope='Conv2d_%d_1x1_%d' % (i, depth))
            sorted_keys = sorted(self._other_heads.keys())
            sorted_keys.append(BOX_ENCODINGS)
            sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND)
            for head_name in sorted_keys:
              if head_name == BOX_ENCODINGS:
                head_obj = self._box_prediction_head
              elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                head_obj = self._class_prediction_head
              else:
                head_obj = self._other_heads[head_name]
              prediction = head_obj.predict(
                  features=net,
                  num_predictions_per_location=num_predictions_per_location)
              predictions[head_name].append(prediction)
    return predictions
Example #15
    def _predict(self, image_features, audio_features,
                 num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """

        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the class predictor.
            features_depth = static_shape.get_depth(image_features.get_shape())
            depth = max(min(features_depth, self._max_depth), self._min_depth)
            tf.logging.info(
                'depth of additional conv before box predictor: {}'.format(
                    depth))
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))
            with slim.arg_scope([slim.conv2d],
                                activation_fn=None,
                                normalizer_fn=None,
                                normalizer_params=None):
                box_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * self._box_code_size,
                    [self._kernel_size, self._kernel_size],
                    scope='BoxEncodingPredictor')
                if self._use_dropout:
                    net = slim.dropout(net, keep_prob=self._dropout_keep_prob)

                # fusing with audio features
                print("before fusion: feature_map", net.get_shape())
                shape = net.get_shape()
                extended_audio = tf.tile(audio_features,
                                         [1, shape[1], shape[2], 1])
                net = tf.concat([net, extended_audio], 3)
                print("after fusion: new feature_map", net.get_shape())

                # do 1 x 1 convolution
                net = slim.conv2d(net, shape[3], [1, 1], scope='fusion')

                class_predictions_with_background = slim.conv2d(
                    net,
                    num_predictions_per_location * num_class_slots,
                    [self._kernel_size, self._kernel_size],
                    scope='ClassPredictor',
                    biases_initializer=tf.constant_initializer(
                        self._class_prediction_bias_init))
                if self._apply_sigmoid_to_scores:
                    class_predictions_with_background = tf.sigmoid(
                        class_predictions_with_background)

        print("num_predictions_per_location", num_predictions_per_location)
        print("class_predictions_with_background",
              class_predictions_with_background.get_shape())

        combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
            image_features)
        box_encodings = tf.reshape(
            box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, self._box_code_size
            ]))
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                num_class_slots
            ]))
        return {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
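A minimal NumPy sketch of the tile-and-concatenate fusion used above, with assumed toy sizes: the audio vector (spatially 1 x 1) is broadcast across the feature map's grid and concatenated on the channel axis; in the real code the result is then projected back to the original depth by the 1x1 'fusion' convolution.

import numpy as np

batch, height, width, depth, audio_depth = 2, 5, 5, 32, 8  # assumed sizes
feature_map = np.zeros((batch, height, width, depth), dtype=np.float32)
audio_features = np.ones((batch, 1, 1, audio_depth), dtype=np.float32)
# Tile the 1x1 audio vector over the spatial grid, then concatenate on channels.
extended_audio = np.tile(audio_features, (1, height, width, 1))
fused = np.concatenate([feature_map, extended_audio], axis=3)
print(fused.shape)  # (2, 5, 5, 40); the 1x1 'fusion' conv maps 40 channels back to 32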
Example #16
0
  def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    features_depth = static_shape.get_depth(image_features.get_shape())
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor.
      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net = slim.conv2d(
              net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')
        if self._use_dropout:
          net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
        class_predictions_with_background = slim.conv2d(
            net, num_predictions_per_location * num_class_slots,
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')
        if self._apply_sigmoid_to_scores:
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

    combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
        image_features)
    box_encodings = tf.reshape(
        box_encodings, tf.stack([combined_feature_map_shape[0],
                                 combined_feature_map_shape[1] *
                                 combined_feature_map_shape[2] *
                                 num_predictions_per_location,
                                 1, self._box_code_size]))
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        tf.stack([combined_feature_map_shape[0],
                  combined_feature_map_shape[1] *
                  combined_feature_map_shape[2] *
                  num_predictions_per_location,
                  num_class_slots]))
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
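The final reshape is just the docstring's num_anchors = feat_height * feat_width * num_predictions_per_location arithmetic. A short NumPy sketch with assumed sizes:

import numpy as np

batch, feat_height, feat_width = 2, 4, 6            # assumed feature map size
num_predictions_per_location, box_code_size = 3, 4  # assumed predictor settings
conv_output = np.zeros(
    (batch, feat_height, feat_width,
     num_predictions_per_location * box_code_size), dtype=np.float32)
# Flatten the spatial grid and the per-location predictions into one anchor axis.
box_encodings = conv_output.reshape(
    (batch, feat_height * feat_width * num_predictions_per_location,
     1, box_code_size))
print(box_encodings.shape)  # (2, 72, 1, 4), i.e. 72 = 4 * 6 * 3 anchors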
Example #17
0
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        score_predictions: A float tensor of shape [batch_size, num_anchors, 1]
          representing the score predictions for the proposals.
    """
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        num_class_slots = 1
        net = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))
            with slim.arg_scope([slim.conv2d],
                                activation_fn=None,
                                normalizer_fn=None,
                                normalizer_params=None):
                box_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * self._box_code_size,
                    [self._kernel_size, self._kernel_size],
                    activation_fn=tf.nn.sigmoid,
                    scope='BoxEncodingPredictor')
                angle_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * 1,
                    [self._kernel_size, self._kernel_size],
                    scope='AngleEncodingPredictor')
                score_predictions = slim.conv2d(
                    net,
                    num_predictions_per_location * 1,
                    [self._kernel_size, self._kernel_size],
                    scope='ScorePredictor')
                #score_predictions = tf.sigmoid(score_predictions)

        batch_size = static_shape.get_batch_size(image_features.get_shape())
        if batch_size is None:
            features_height = static_shape.get_height(
                image_features.get_shape())
            features_width = static_shape.get_width(image_features.get_shape())
            flattened_predictions_size = (features_height * features_width *
                                          num_predictions_per_location)
            box_encodings = tf.reshape(
                box_encodings,
                [-1, flattened_predictions_size, 1, self._box_code_size])
            angle_encodings = tf.reshape(
                angle_encodings, [-1, flattened_predictions_size, 1, 1])
            score_predictions = tf.reshape(score_predictions,
                                           [-1, flattened_predictions_size, 1])
        else:
            box_encodings = tf.reshape(
                box_encodings, [batch_size, -1, 1, self._box_code_size])
            angle_encodings = tf.reshape(angle_encodings,
                                         [batch_size, -1, 1, 1])
            score_predictions = tf.reshape(score_predictions,
                                           [batch_size, -1, 1])
        return {
            BOX_ENCODINGS: box_encodings,
            ANGLE_ENCODINGS: angle_encodings,
            SCORE_PREDICTIONS: score_predictions
        }
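The two reshape branches above exist only because the batch size may or may not be known statically: with an unknown batch the anchor dimension is spelled out and -1 stands in for the batch; with a known batch it is the other way around. A NumPy sketch with assumed sizes showing both branches produce the same shape:

import numpy as np

feat_height, feat_width = 3, 3                       # assumed feature map size
num_predictions_per_location, box_code_size = 2, 4   # assumed predictor settings
flattened_predictions_size = (
    feat_height * feat_width * num_predictions_per_location)  # 18
raw = np.zeros((5, feat_height, feat_width,
                num_predictions_per_location * box_code_size), dtype=np.float32)
# Unknown static batch: spell out the anchor dimension, infer the batch with -1.
a = raw.reshape((-1, flattened_predictions_size, 1, box_code_size))
# Known static batch: spell out the batch, infer the anchor dimension with -1.
b = raw.reshape((5, -1, 1, box_code_size))
print(a.shape, b.shape)  # both (5, 18, 1, 4)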
Example #18
0
  def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    features_depth = static_shape.get_depth(image_features.get_shape())  # static channel depth of the input feature map
    depth = max(min(features_depth, self._max_depth), self._min_depth)  # clamp into [min_depth, max_depth]; 0 disables the extra conv layers

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1  # one extra slot for the background class
    net = image_features  # start from the feature extractor's output
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor.
      if depth > 0 and self._num_layers_before_predictor > 0:  # both must be positive for the extra layers to be added
        for i in range(self._num_layers_before_predictor):
          net = slim.conv2d(  # each additional layer is a 1x1 convolution at the clamped depth
              net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(  # args: input net, number of filters, kernel size
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')  # output depth is num_predictions_per_location * box_code_size

        if self._use_dropout:  # dropout is applied only before the class predictor
          net = slim.dropout(net, keep_prob=self._dropout_keep_prob)

        class_predictions_with_background = slim.conv2d(
            net, num_predictions_per_location * num_class_slots,  # same spatial grid, num_predictions_per_location * num_class_slots channels
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')

        if self._apply_sigmoid_to_scores:  # optionally apply an elementwise sigmoid to the class scores
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

    batch_size = static_shape.get_batch_size(image_features.get_shape())
    if batch_size is None:  # batch size unknown at graph-construction time
      features_height = static_shape.get_height(image_features.get_shape())
      features_width = static_shape.get_width(image_features.get_shape())
      flattened_predictions_size = (features_height * features_width *
                                    num_predictions_per_location)
      box_encodings = tf.reshape(  # flatten the spatial grid into the anchor dimension
          box_encodings,
          [-1, flattened_predictions_size, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [-1, flattened_predictions_size, num_class_slots])
    else:
      box_encodings = tf.reshape(
          box_encodings, [batch_size, -1, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background, [batch_size, -1, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
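As the comments above note, the extra 1x1 conv stack is controlled by clamping the feature depth into [min_depth, max_depth]; a clamped value of 0 skips the stack entirely. A tiny sketch of that clamp with assumed numbers:

def clamp_depth(features_depth, min_depth, max_depth):
  # Same expression as in the predictor: clip features_depth into [min_depth, max_depth].
  return max(min(features_depth, max_depth), min_depth)

print(clamp_depth(512, min_depth=0, max_depth=0))     # 0   -> no additional conv layers
print(clamp_depth(512, min_depth=16, max_depth=128))  # 128 -> 1x1 convs at depth 128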
Example #19
0
    def _predict(self,
                 image_features,
                 num_predictions_per_location,
                 boxes_normalized=None):
        """Computes encoded object locations and corresponding confidences.

    Flattens image_features and applies fully connected ops (with no
    non-linearity) to predict box encodings and class predictions.  In this
    setting, anchors are not spatially arranged in any way and are assumed to
    have been folded into the batch dimension.  Thus we output 1 for the
    anchors dimension.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.
        Currently, this must be set to 1, or an error will be raised.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape
          [batch_size, 1, num_classes, code_size] representing the
          location of the objects.
        class_predictions_with_background: A float tensor of shape
          [batch_size, 1, num_classes + 1] representing the class
          predictions for the proposals.
      If predict_masks is True the dictionary also contains:
        instance_masks: A float tensor of shape
          [batch_size, 1, num_classes, image_height, image_width]
      If predict_keypoints is True the dictionary also contains:
        keypoints: [batch_size, 1, num_keypoints, 2]

    Raises:
      ValueError: if num_predictions_per_location is not 1.
    """
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)
        if num_predictions_per_location != 1:
            raise ValueError(
                'Currently FullyConnectedBoxPredictor only supports '
                'predicting a single box per class per location.')
        roi_features = image_features
        if self._spatial_average:
            roi_features = tf.reduce_mean(roi_features, [1, 2],
                                          keep_dims=True,
                                          name='AvgPool')

        net = slim.flatten(roi_features)
        end_points_collection = self._scope.name + '_end_points'
        with slim.arg_scope(self._fc_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training), \
             slim.arg_scope([slim.fully_connected], trainable=self._is_training,
                            outputs_collections=end_points_collection):
            # Add additional fc layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.fully_connected(net,
                                               depth,
                                               scope='FC_%d_%d' % (i, depth))
                    if self._use_dropout:
                        net = slim.dropout(net,
                                           keep_prob=self._dropout_keep_prob)
            box_encodings = slim.fully_connected(
                net,
                self._num_classes * self._box_code_size,
                activation_fn=None,
                weights_initializer=self._box_initializer,
                scope='BoxEncodingPredictor')
            class_predictions_with_background = slim.fully_connected(
                net,
                self._num_classes + 1,
                activation_fn=None,
                scope='ClassPredictor')
        box_encodings = tf.reshape(
            box_encodings, [-1, 1, self._num_classes, self._box_code_size])
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background, [-1, 1, self._num_classes + 1])

        predictions_dict = {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }

        if self._predict_instance_masks:
            with slim.arg_scope(self._conv_hyperparams):
                upsampled_features = slim.conv2d_transpose(
                    image_features,
                    num_outputs=self._mask_prediction_conv_depth,
                    kernel_size=[2, 2],
                    stride=2,
                    trainable=self._is_training)
                mask_predictions = slim.conv2d(upsampled_features,
                                               num_outputs=self.num_classes,
                                               activation_fn=None,
                                               kernel_size=[1, 1],
                                               trainable=self._is_training)
                instance_masks = tf.expand_dims(tf.transpose(mask_predictions,
                                                             perm=[0, 3, 1,
                                                                   2]),
                                                axis=1,
                                                name='MaskPredictor')
            predictions_dict[MASK_PREDICTIONS] = instance_masks

        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)
        framework_ops.get_default_graph().clear_collection(
            end_points_collection)
        return predictions_dict
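A minimal NumPy sketch (random weights and assumed sizes, for illustration only) of the fully connected path above: ROI features are flattened, a dense layer with num_classes * box_code_size outputs predicts the encodings, and the result is reshaped to [batch, 1, num_classes, code_size] because each ROI contributes exactly one anchor.

import numpy as np

batch, roi_height, roi_width, depth = 4, 7, 7, 256   # assumed ROI feature size
num_classes, box_code_size = 3, 4                    # assumed predictor settings
roi_features = np.random.rand(batch, roi_height, roi_width, depth).astype(np.float32)
flat = roi_features.reshape((batch, -1))             # slim.flatten equivalent
weights = np.random.rand(flat.shape[1],
                         num_classes * box_code_size).astype(np.float32)
box_encodings = flat.dot(weights)                    # fully connected, no activation
box_encodings = box_encodings.reshape((-1, 1, num_classes, box_code_size))
print(box_encodings.shape)  # (4, 1, 3, 4)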