Example 1
  def test_die_on_tensor_shape_with_rank_three(self):
    tensor_shape = tf.TensorShape(dims=[32, 299, 384])
    # Each getter needs its own assertRaises block: inside a single block,
    # statements after the first raise would never execute.
    with self.assertRaises(ValueError):
      static_shape.get_batch_size(tensor_shape)
    with self.assertRaises(ValueError):
      static_shape.get_height(tensor_shape)
    with self.assertRaises(ValueError):
      static_shape.get_width(tensor_shape)
    with self.assertRaises(ValueError):
      static_shape.get_depth(tensor_shape)
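For context: the static_shape helpers exercised throughout these examples are thin getters over a rank-4 tf.TensorShape. A minimal sketch of what they look like, assuming the conventions of the TF Object Detection API's object_detection/utils/static_shape.py (the shared _get_dim helper name is mine; each getter asserts rank 4, which is why the rank-3 shape above raises ValueError):

import tensorflow as tf


def _get_dim(tensor_shape, index):
  # Raises ValueError unless tensor_shape has rank 4 (TF1-style API).
  tensor_shape.assert_has_rank(rank=4)
  return tensor_shape[index].value  # static size, or None if unknown


def get_batch_size(tensor_shape):
  return _get_dim(tensor_shape, 0)


def get_height(tensor_shape):
  return _get_dim(tensor_shape, 1)


def get_width(tensor_shape):
  return _get_dim(tensor_shape, 2)


def get_depth(tensor_shape):
  return _get_dim(tensor_shape, 3)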
Example 2
def pad_to_multiple(tensor, multiple):
  """Returns the tensor zero padded to the specified multiple.

  Appends 0s to the end of the height and width dimensions (axes 1 and 2) of
  the tensor until both are a multiple of the input argument 'multiple'. E.g.
  given an input tensor of shape [1, 3, 5, 1] and a multiple of 4,
  pad_to_multiple appends 0s so that the resulting tensor has shape
  [1, 4, 8, 1].

  Args:
    tensor: rank 4 float32 tensor, where
            tensor -> [batch_size, height, width, channels].
    multiple: the multiple to pad to.

  Returns:
    padded_tensor: the tensor zero padded to the specified multiple.
  """
  tensor_shape = tensor.get_shape()
  batch_size = static_shape.get_batch_size(tensor_shape)
  tensor_height = static_shape.get_height(tensor_shape)
  tensor_width = static_shape.get_width(tensor_shape)
  tensor_depth = static_shape.get_depth(tensor_shape)

  if batch_size is None:
    batch_size = tf.shape(tensor)[0]

  if tensor_height is None:
    tensor_height = tf.shape(tensor)[1]
    padded_tensor_height = tf.to_int32(
        tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_height = int(
        math.ceil(float(tensor_height) / multiple) * multiple)

  if tensor_width is None:
    tensor_width = tf.shape(tensor)[2]
    padded_tensor_width = tf.to_int32(
        tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_width = int(
        math.ceil(float(tensor_width) / multiple) * multiple)

  if tensor_depth is None:
    tensor_depth = tf.shape(tensor)[3]

  # Use tf.concat instead of tf.pad to preserve static shape
  if padded_tensor_height != tensor_height:
    height_pad = tf.zeros([
        batch_size, padded_tensor_height - tensor_height, tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, height_pad], 1)
  if padded_tensor_width != tensor_width:
    width_pad = tf.zeros([
        batch_size, padded_tensor_height, padded_tensor_width - tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, width_pad], 2)

  return tensor
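To make the docstring's example concrete, a short usage sketch (TF1 graph mode; assumes pad_to_multiple and its imports as defined above):

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=[1, 3, 5, 1])
padded = pad_to_multiple(images, multiple=4)
# Static shape is preserved because the pads are built with tf.concat:
print(padded.get_shape().as_list())  # [1, 4, 8, 1]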
Example 3
def check_min_image_dim(min_dim, image_tensor):
    """Checks that the image width/height are greater than some number.

    This function is used to check that the width and height of an image are above
    a certain value. If the image shape is static, this function will perform the
    check at graph construction time. Otherwise, if the image shape varies, an
    Assertion control dependency will be added to the graph.

    Args:
      min_dim: The minimum number of pixels along the width and height of the
        image.
      image_tensor: The image tensor to check size for.

    Returns:
      If `image_tensor` has dynamic size, return `image_tensor` with an Assert
      control dependency. Otherwise returns `image_tensor`.

    Raises:
      ValueError: if `image_tensor`'s width or height is smaller than `min_dim`.
    """
    image_shape = image_tensor.get_shape()
    image_height = static_shape.get_height(image_shape)
    image_width = static_shape.get_width(image_shape)
    if image_height is None or image_width is None:
        shape_assert = tf.Assert(
            tf.logical_and(
                tf.greater_equal(tf.shape(image_tensor)[1], min_dim),
                tf.greater_equal(tf.shape(image_tensor)[2], min_dim)),
            [
                'image size must be >= {} in both height and width.'.format(
                    min_dim)
            ])
        with tf.control_dependencies([shape_assert]):
            return tf.identity(image_tensor)

    if image_height < min_dim or image_width < min_dim:
        raise ValueError(
            'image size must be >= %d in both height and width; image dim = %d,%d'
            % (min_dim, image_height, image_width))

    return image_tensor
Example 4
def check_min_image_dim(min_dim, image_tensor):
    """Checks that the image width/height are greater than some number"""
    image_shape = image_tensor.get_shape()
    image_height = static_shape.get_height(image_shape)
    image_width = static_shape.get_width(image_shape)
    if image_height is None or image_width is None:
        shape_assert = tf.Assert(
            tf.logical_and(
                tf.greater_equal(tf.shape(image_tensor)[1], min_dim),
                tf.greater_equal(tf.shape(image_tensor)[2], min_dim)),
            [
                'image size must be >= {} in both height and width.'.format(
                    min_dim)
            ])
        with tf.control_dependencies([shape_assert]):
            return tf.identity(image_tensor)

    if image_height < min_dim or image_width < min_dim:
        raise ValueError(
            'image size must be >= %d in both height and width; image dim = %d,%d'
            % (min_dim, image_height, image_width))

    return image_tensor
Example 5
def check_min_image_dim(min_dim, image_tensor):
  """Checks that the image width/height are greater than some number.

  This function is used to check that the width and height of an image are above
  a certain value. If the image shape is static, this function will perform the
  check at graph construction time. Otherwise, if the image shape varies, an
  Assertion control dependency will be added to the graph.

  Args:
    min_dim: The minimum number of pixels along the width and height of the
             image.
    image_tensor: The image tensor to check size for.

  Returns:
    If `image_tensor` has dynamic size, return `image_tensor` with an Assert
    control dependency. Otherwise returns `image_tensor`.

  Raises:
    ValueError: if `image_tensor`'s width or height is smaller than `min_dim`.
  """
  image_shape = image_tensor.get_shape()
  image_height = static_shape.get_height(image_shape)
  image_width = static_shape.get_width(image_shape)
  if image_height is None or image_width is None:
    shape_assert = tf.Assert(
        tf.logical_and(tf.greater_equal(tf.shape(image_tensor)[1], min_dim),
                       tf.greater_equal(tf.shape(image_tensor)[2], min_dim)),
        ['image size must be >= {} in both height and width.'.format(min_dim)])
    with tf.control_dependencies([shape_assert]):
      return tf.identity(image_tensor)

  if image_height < min_dim or image_width < min_dim:
    raise ValueError(
        'image size must be >= %d in both height and width; image dim = %d,%d' %
        (min_dim, image_height, image_width))

  return image_tensor
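A short usage sketch of the behavior the docstring describes, assuming TF1 placeholders: a fully static shape fails at graph construction time, while a dynamic shape gets a runtime Assert instead.

import tensorflow as tf

static_image = tf.placeholder(tf.float32, shape=[1, 16, 16, 3])
try:
  check_min_image_dim(33, static_image)  # static shape: fails at graph build
except ValueError as e:
  print(e)

dynamic_image = tf.placeholder(tf.float32, shape=[1, None, None, 3])
guarded = check_min_image_dim(33, dynamic_image)  # Assert runs at session time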
Example 6
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

        Args:
          image_features: A float tensor of shape [batch_size, height, width,
            channels] containing features for a batch of images.
          num_predictions_per_location: an integer representing the number of box
            predictions to be made per spatial location in the feature map.

        Returns:
          A dictionary containing the following tensors.
            box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
              code_size] representing the location of the objects, where
              num_anchors = feat_height * feat_width * num_predictions_per_location
            class_predictions_with_background: A float tensor of shape
              [batch_size, num_anchors, num_classes + 1] representing the class
              predictions for the proposals.
        """
        features_height = static_shape.get_height(image_features.get_shape())
        features_width = static_shape.get_width(image_features.get_shape())
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))

            with tf.variable_scope('gradientconv2d'):
                box_encodings = gradient_conv2d411(
                    inputs=net,
                    inputs_hwc=[
                        features_height, features_width, features_depth
                    ],
                    num_outputs=num_predictions_per_location *
                    self._box_code_size,
                    scope='BoxEncodingPredictor')
                if self._use_dropout:
                    net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
                class_predictions_with_background = gradient_conv2d411(
                    inputs=net,
                    inputs_hwc=[
                        features_height, features_width, features_depth
                    ],
                    num_outputs=num_predictions_per_location * num_class_slots,
                    scope='ClassPredictor')
                if self._apply_sigmoid_to_scores:
                    class_predictions_with_background = tf.sigmoid(
                        class_predictions_with_background)

        combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
            image_features)
        box_encodings = tf.reshape(
            box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, self._box_code_size
            ]))
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                num_class_slots
            ]))
        return {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
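The _predict above leans on shape_utils.combined_static_and_dynamic_shape, which is not shown in this example. A minimal sketch of such a helper, assuming the TF Object Detection API convention of using static dimensions where known and tf.shape slices elsewhere:

import tensorflow as tf


def combined_static_and_dynamic_shape(tensor):
  # One entry per dimension: a Python int when the size is static,
  # otherwise the corresponding slice of the dynamic tf.shape tensor.
  static_shape = tensor.shape.as_list()
  dynamic_shape = tf.shape(tensor)
  return [dim if dim is not None else dynamic_shape[i]
          for i, dim in enumerate(static_shape)]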
Example 7
  def test_return_correct_width(self):
    tensor_shape = tf.TensorShape(dims=[32, 299, 384, 3])
    self.assertEqual(384, static_shape.get_width(tensor_shape))
Example 8
  def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    features_depth = static_shape.get_depth(image_features.get_shape())  # channel depth of the input feature map
    depth = max(min(features_depth, self._max_depth), self._min_depth)  # channel depth for the extra conv layers, clipped to [_min_depth, _max_depth]

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1  # background gets its own class slot
    net = image_features  # start from the feature map produced by the feature extractor
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor.
      if depth > 0 and self._num_layers_before_predictor > 0:  # optionally insert extra layers ahead of the predictor heads
        for i in range(self._num_layers_before_predictor):
          net = slim.conv2d(  # each extra layer is a 1x1 convolution with `depth` output channels
              net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(  # conv head; output depth is num_predictions_per_location * box_code_size
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')

        if self._use_dropout:  # dropout is applied only to the class-prediction branch
          net = slim.dropout(net, keep_prob=self._dropout_keep_prob)

        class_predictions_with_background = slim.conv2d(  # conv head; output depth is num_predictions_per_location * num_class_slots
            net, num_predictions_per_location * num_class_slots,
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')

        if self._apply_sigmoid_to_scores:  # optionally squash class scores element-wise
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

    batch_size = static_shape.get_batch_size(image_features.get_shape())  # static batch size, or None if unknown
    if batch_size is None:  # batch size unknown: fall back to shape-agnostic reshapes
      features_height = static_shape.get_height(image_features.get_shape())
      features_width = static_shape.get_width(image_features.get_shape())
      flattened_predictions_size = (features_height * features_width *
                                    num_predictions_per_location)
      box_encodings = tf.reshape(  # -> [batch_size, num_anchors, 1, box_code_size]
          box_encodings,
          [-1, flattened_predictions_size, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [-1, flattened_predictions_size, num_class_slots])
    else:
      box_encodings = tf.reshape(
          box_encodings, [batch_size, -1, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background, [batch_size, -1, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
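The reshapes above flatten the spatial grid into anchors, i.e. num_anchors = feat_height * feat_width * num_predictions_per_location. A quick worked example with assumed sizes:

# All sizes below are assumptions for illustration only.
feat_height, feat_width = 19, 19
num_predictions_per_location = 6
box_code_size = 4

num_anchors = feat_height * feat_width * num_predictions_per_location  # 2166
# box_encodings:                      [batch_size, 2166, 1, 4]
# class_predictions_with_background:  [batch_size, 2166, num_classes + 1]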
Example 9
  def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    features_depth = static_shape.get_depth(image_features.get_shape())
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor.
      if depth > 0 and self._num_layers_before_predictor > 0:
        for i in range(self._num_layers_before_predictor):
          net = slim.conv2d(
              net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')
        if self._use_dropout:
          net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
        class_predictions_with_background = slim.conv2d(
            net, num_predictions_per_location * num_class_slots,
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')
        if self._apply_sigmoid_to_scores:
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

    batch_size = static_shape.get_batch_size(image_features.get_shape())
    if batch_size is None:
      features_height = static_shape.get_height(image_features.get_shape())
      features_width = static_shape.get_width(image_features.get_shape())
      flattened_predictions_size = (features_height * features_width *
                                    num_predictions_per_location)
      box_encodings = tf.reshape(
          box_encodings,
          [-1, flattened_predictions_size, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [-1, flattened_predictions_size, num_class_slots])
    else:
      box_encodings = tf.reshape(
          box_encodings, [batch_size, -1, 1, self._box_code_size])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background, [batch_size, -1, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
Example 10
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

        Args:
          image_features: A float tensor of shape [batch_size, height, width,
            channels] containing features for a batch of images.
          num_predictions_per_location: an integer representing the number of box
            predictions to be made per spatial location in the feature map.

        Returns:
          A dictionary containing the following tensors.
            box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
              code_size] representing the location of the objects, where
              num_anchors = feat_height * feat_width * num_predictions_per_location
            score_predictions: A float tensor of shape [batch_size, num_anchors, 1]
              representing the score predictions for the proposals.
        """
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        num_class_slots = 1
        net = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))
            with slim.arg_scope([slim.conv2d],
                                activation_fn=None,
                                normalizer_fn=None,
                                normalizer_params=None):
                box_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * self._box_code_size,
                    [self._kernel_size, self._kernel_size],
                    activation_fn=tf.nn.sigmoid,
                    scope='BoxEncodingPredictor')
                angle_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * 1,
                    [self._kernel_size, self._kernel_size],
                    scope='AngleEncodingPredictor')
                score_predictions = slim.conv2d(
                    net,
                    num_predictions_per_location * 1,
                    [self._kernel_size, self._kernel_size],
                    scope='ScorePredictor')
                #score_predictions = tf.sigmoid(score_predictions)

        batch_size = static_shape.get_batch_size(image_features.get_shape())
        if batch_size is None:
            features_height = static_shape.get_height(
                image_features.get_shape())
            features_width = static_shape.get_width(image_features.get_shape())
            flattened_predictions_size = (features_height * features_width *
                                          num_predictions_per_location)
            box_encodings = tf.reshape(
                box_encodings,
                [-1, flattened_predictions_size, 1, self._box_code_size])
            angle_encodings = tf.reshape(
                angle_encodings, [-1, flattened_predictions_size, 1, 1])
            score_predictions = tf.reshape(score_predictions,
                                           [-1, flattened_predictions_size, 1])
        else:
            box_encodings = tf.reshape(
                box_encodings, [batch_size, -1, 1, self._box_code_size])
            angle_encodings = tf.reshape(angle_encodings,
                                         [batch_size, -1, 1, 1])
            score_predictions = tf.reshape(score_predictions,
                                           [batch_size, -1, 1])
        return {
            BOX_ENCODINGS: box_encodings,
            ANGLE_ENCODINGS: angle_encodings,
            SCORE_PREDICTIONS: score_predictions
        }
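For reference, the three heads of this rotated-box variant come back with the following shapes after the reshapes (summarized from the code above, not from running it):

# BOX_ENCODINGS:     [batch_size, num_anchors, 1, box_code_size]
# ANGLE_ENCODINGS:   [batch_size, num_anchors, 1, 1]
# SCORE_PREDICTIONS: [batch_size, num_anchors, 1]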