def test_die_on_tensor_shape_with_rank_three(self):
    """Checks that every static_shape getter rejects a rank-3 shape.

    Each getter runs inside its own `assertRaises` context. When all four
    calls share a single context, the first ValueError leaves the block and
    the remaining getters are never executed, so a regression in any of them
    would go unnoticed.
    """
    tensor_shape = tf.TensorShape(dims=[32, 299, 384])
    getters = (
        static_shape.get_batch_size,
        static_shape.get_height,
        static_shape.get_width,
        static_shape.get_depth,
    )
    for getter in getters:
        with self.assertRaises(ValueError):
            getter(tensor_shape)
def _size_and_padded_size(tensor, axis, static_size, multiple):
    """Returns (size, padded_size) of one axis, kept static when possible.

    Args:
      tensor: the tensor whose axis is inspected.
      axis: index of the axis inside `tensor`.
      static_size: statically known size of that axis, or None if unknown.
      multiple: the multiple to round the size up to.

    Returns:
      A pair (size, padded_size). Both are Python ints when `static_size` is
      known; otherwise both are int32 tensors evaluated at run time.
    """
    if static_size is None:
        size = tf.shape(tensor)[axis]
        padded_size = tf.to_int32(
            tf.ceil(tf.to_float(size) / tf.to_float(multiple))) * multiple
        return size, padded_size
    padded_size = int(math.ceil(float(static_size) / multiple) * multiple)
    return static_size, padded_size


def pad_to_multiple(tensor, multiple):
    """Returns the tensor zero padded to the specified multiple.

    Appends 0s to the end of the first and second dimension (height and width)
    of the tensor until both dimensions are a multiple of the input argument
    'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input
    multiple of 4, PadToMultiple will append 0s so that the resulting tensor
    will be of shape [1, 4, 8, 1].

    Args:
      tensor: rank 4 tensor, where
              tensor -> [batch_size, height, width, channels]. The padding is
              created with the tensor's own dtype, so float32 inputs behave
              exactly as before and other dtypes can be concatenated too.
      multiple: the multiple to pad to.

    Returns:
      padded_tensor: the tensor zero padded to the specified multiple.
    """
    tensor_shape = tensor.get_shape()
    batch_size = static_shape.get_batch_size(tensor_shape)
    tensor_height = static_shape.get_height(tensor_shape)
    tensor_width = static_shape.get_width(tensor_shape)
    tensor_depth = static_shape.get_depth(tensor_shape)

    # Any dimension that is not known statically is read from the run-time
    # shape instead.
    if batch_size is None:
        batch_size = tf.shape(tensor)[0]

    tensor_height, padded_tensor_height = _size_and_padded_size(
        tensor, 1, tensor_height, multiple)
    tensor_width, padded_tensor_width = _size_and_padded_size(
        tensor, 2, tensor_width, multiple)

    if tensor_depth is None:
        tensor_depth = tf.shape(tensor)[3]

    # Use tf.concat instead of tf.pad to preserve static shape
    # NOTE(review): for run-time sizes the comparisons below are between
    # tensor objects rather than values; presumably the concat is then always
    # added (with a possibly zero-sized pad) - confirm for the TF version used.
    if padded_tensor_height != tensor_height:
        height_pad = tf.zeros(
            [
                batch_size,
                padded_tensor_height - tensor_height,
                tensor_width,
                tensor_depth,
            ],
            dtype=tensor.dtype)
        tensor = tf.concat([tensor, height_pad], 1)
    if padded_tensor_width != tensor_width:
        # The height is already padded here, so the width pad must span the
        # padded height.
        width_pad = tf.zeros(
            [
                batch_size,
                padded_tensor_height,
                padded_tensor_width - tensor_width,
                tensor_depth,
            ],
            dtype=tensor.dtype)
        tensor = tf.concat([tensor, width_pad], 2)

    return tensor
def pad_to_multiple(tensor, multiple):
    """Returns the tensor zero padded to the specified multiple.

    Appends 0s to the end of the first and second dimension (height and width)
    of the tensor until both dimensions are a multiple of the input argument
    'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input
    multiple of 4, PadToMultiple will append 0s so that the resulting tensor
    will be of shape [1, 4, 8, 1].

    Args:
      tensor: rank 4 tensor, where
              tensor -> [batch_size, height, width, channels]. Zero pads are
              built with `tensor.dtype`; a float32 input therefore yields the
              same result as before, while other dtypes no longer clash with
              a float32 pad inside tf.concat.
      multiple: the multiple to pad to.

    Returns:
      padded_tensor: the tensor zero padded to the specified multiple.
    """
    tensor_shape = tensor.get_shape()
    batch_size = static_shape.get_batch_size(tensor_shape)
    tensor_height = static_shape.get_height(tensor_shape)
    tensor_width = static_shape.get_width(tensor_shape)
    tensor_depth = static_shape.get_depth(tensor_shape)

    if batch_size is None:
        batch_size = tf.shape(tensor)[0]

    # Height: round up with Python arithmetic when the size is static, and
    # with graph ops when it is only known at run time.
    if tensor_height is None:
        tensor_height = tf.shape(tensor)[1]
        padded_tensor_height = tf.to_int32(
            tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))
        ) * multiple
    else:
        padded_tensor_height = int(
            math.ceil(float(tensor_height) / multiple) * multiple)

    # Width: same rounding as for the height.
    if tensor_width is None:
        tensor_width = tf.shape(tensor)[2]
        padded_tensor_width = tf.to_int32(
            tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))
        ) * multiple
    else:
        padded_tensor_width = int(
            math.ceil(float(tensor_width) / multiple) * multiple)

    if tensor_depth is None:
        tensor_depth = tf.shape(tensor)[3]

    pad_dtype = tensor.dtype

    # Use tf.concat instead of tf.pad to preserve static shape
    if padded_tensor_height != tensor_height:
        height_pad_shape = [
            batch_size, padded_tensor_height - tensor_height, tensor_width,
            tensor_depth
        ]
        tensor = tf.concat(
            [tensor, tf.zeros(height_pad_shape, dtype=pad_dtype)], 1)
    if padded_tensor_width != tensor_width:
        # The tensor already has the padded height at this point.
        width_pad_shape = [
            batch_size, padded_tensor_height,
            padded_tensor_width - tensor_width, tensor_depth
        ]
        tensor = tf.concat(
            [tensor, tf.zeros(width_pad_shape, dtype=pad_dtype)], 2)

    return tensor
def check_min_image_dim(min_dim, image_tensor):
    """Verifies that an image is at least `min_dim` pixels high and wide.

    With a statically known height and width the comparison happens right
    away, while the graph is being built. If either of them is unknown, the
    comparison is deferred to run time through an Assert op that the returned
    tensor depends on.

    Args:
      min_dim: The minimum number of pixels along the width and height of the
               image.
      image_tensor: The image tensor to check size for.

    Returns:
      `image_tensor` itself when its size is static; otherwise an identity of
      `image_tensor` that carries a control dependency on the Assert op.

    Raises:
      ValueError: if the static width or height of `image_tensor` is smaller
                  than `min_dim`.
    """
    shape = image_tensor.get_shape()
    height = static_shape.get_height(shape)
    width = static_shape.get_width(shape)

    # Static case: decide immediately, no graph ops needed.
    if height is not None and width is not None:
        if height < min_dim or width < min_dim:
            raise ValueError(
                'image size must be >= %d in both height and width; image dim = %d,%d' %
                (min_dim, height, width))
        return image_tensor

    # Dynamic case: attach the check to the graph.
    height_ok = tf.greater_equal(tf.shape(image_tensor)[1], min_dim)
    width_ok = tf.greater_equal(tf.shape(image_tensor)[2], min_dim)
    shape_assert = tf.Assert(
        tf.logical_and(height_ok, width_ok),
        ['image size must be >= {} in both height and width.'.format(min_dim)])
    with tf.control_dependencies([shape_assert]):
        return tf.identity(image_tensor)
def check_min_image_dim(min_dim, image_tensor):
    """Checks that the image height and width are both >= `min_dim`.

    Args:
      min_dim: smallest accepted size, applied to the height and to the width.
      image_tensor: image tensor whose dimensions 1 and 2 are checked.

    Returns:
      `image_tensor` unchanged when its height and width are statically known
      and large enough. When either is unknown, an identity of `image_tensor`
      guarded by a run-time Assert.

    Raises:
      ValueError: if the statically known height or width is below `min_dim`.
    """
    static_dims = image_tensor.get_shape()
    static_height = static_shape.get_height(static_dims)
    static_width = static_shape.get_width(static_dims)

    size_is_dynamic = static_height is None or static_width is None
    if size_is_dynamic:
        # The size is only available at run time, so the check has to be
        # part of the graph.
        tall_enough = tf.greater_equal(tf.shape(image_tensor)[1], min_dim)
        wide_enough = tf.greater_equal(tf.shape(image_tensor)[2], min_dim)
        big_enough = tf.logical_and(tall_enough, wide_enough)
        failure_data = [
            'image size must be >= {} in both height and width.'.format(min_dim)
        ]
        shape_assert = tf.Assert(big_enough, failure_data)
        with tf.control_dependencies([shape_assert]):
            return tf.identity(image_tensor)

    too_small = static_height < min_dim or static_width < min_dim
    if too_small:
        raise ValueError(
            'image size must be >= %d in both height and width; image dim = %d,%d' %
            (min_dim, static_height, static_width))

    return image_tensor
def check_min_image_dim(min_dim, image_tensor):
    """Ensures the image is no smaller than `min_dim` along height and width.

    A static image shape is validated while the graph is constructed. A shape
    with an unknown height or width is validated when the graph runs, by
    making the returned tensor depend on an Assert op.

    Args:
      min_dim: The minimum number of pixels along the width and height of the
               image.
      image_tensor: The image tensor to check size for.

    Returns:
      If `image_tensor` has dynamic size, `image_tensor` wrapped in an
      identity op with an Assert control dependency. Otherwise
      `image_tensor` as passed in.

    Raises:
      ValueError: if the width or height of `image_tensor` is smaller than
                  `min_dim`.
    """
    image_shape = image_tensor.get_shape()
    rows = static_shape.get_height(image_shape)
    cols = static_shape.get_width(image_shape)

    if rows is None or cols is None:
        condition = tf.logical_and(
            tf.greater_equal(tf.shape(image_tensor)[1], min_dim),
            tf.greater_equal(tf.shape(image_tensor)[2], min_dim))
        message = 'image size must be >= {} in both height and width.'.format(
            min_dim)
        with tf.control_dependencies([tf.Assert(condition, [message])]):
            return tf.identity(image_tensor)

    # Both sizes are plain numbers from here on.
    if not (rows >= min_dim and cols >= min_dim):
        raise ValueError(
            'image size must be >= %d in both height and width; image dim = %d,%d' %
            (min_dim, rows, cols))

    return image_tensor
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Both prediction heads are built with `gradient_conv2d411` inside the
    'gradientconv2d' variable scope, and their outputs are reshaped to one
    row per anchor.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    # Static sizes of the incoming feature map.
    # NOTE(review): these getters can presumably return None for dimensions
    # that are not statically known - confirm that gradient_conv2d411 accepts
    # None entries in `inputs_hwc`.
    features_height = static_shape.get_height(image_features.get_shape())
    features_width = static_shape.get_width(image_features.get_shape())
    features_depth = static_shape.get_depth(image_features.get_shape())
    # Clamp the feature depth into [self._min_depth, self._max_depth].
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
        with tf.variable_scope('gradientconv2d'):
            # NOTE(review): `inputs_hwc` carries the depth of image_features,
            # yet `net` may have been produced by the 1x1 layers above with
            # `depth` outputs - confirm which channel count the helper needs.
            box_encodings = gradient_conv2d411(
                inputs=net,
                inputs_hwc=[
                    features_height, features_width, features_depth
                ],
                num_outputs=num_predictions_per_location * self._box_code_size,
                scope='BoxEncodingPredictor')
            # Dropout is applied after the box head has consumed `net`, so it
            # only affects the class head.
            if self._use_dropout:
                net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            class_predictions_with_background = gradient_conv2d411(
                inputs=net,
                inputs_hwc=[
                    features_height, features_width, features_depth
                ],
                num_outputs=num_predictions_per_location * num_class_slots,
                scope='ClassPredictor')
            if self._apply_sigmoid_to_scores:
                class_predictions_with_background = tf.sigmoid(
                    class_predictions_with_background)

    # Shape of the input feature map; judging by the helper's name, static
    # entries where known and run-time entries otherwise - TODO confirm.
    combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
        image_features)
    # Flatten height, width and predictions-per-location into one anchor axis.
    box_encodings = tf.reshape(
        box_encodings, tf.stack([
            combined_feature_map_shape[0],
            combined_feature_map_shape[1] * combined_feature_map_shape[2] *
            num_predictions_per_location, 1, self._box_code_size
        ]))
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        tf.stack([
            combined_feature_map_shape[0],
            combined_feature_map_shape[1] * combined_feature_map_shape[2] *
            num_predictions_per_location, num_class_slots
        ]))
    return {
        BOX_ENCODINGS: box_encodings,
        CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background
    }
def test_return_correct_width(self):
    """get_width must return 384 for the shape [32, 299, 384, 3]."""
    shape_under_test = tf.TensorShape(dims=[32, 299, 384, 3])
    actual_width = static_shape.get_width(shape_under_test)
    self.assertEqual(384, actual_width)
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    feature_shape = image_features.get_shape()
    # Channel count of the incoming feature map, clamped into
    # [self._min_depth, self._max_depth]; used as the width of the optional
    # extra 1x1 layers below.
    features_depth = static_shape.get_depth(feature_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor. Nothing is added
        # when `depth` or `self._num_layers_before_predictor` is 0.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
        # The predictor convolutions emit raw values: no activation and no
        # normalizer.
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            # One group of `self._box_code_size` outputs per prediction and
            # spatial location.
            box_encodings = slim.conv2d(
                net, num_predictions_per_location * self._box_code_size,
                [self._kernel_size, self._kernel_size],
                scope='BoxEncodingPredictor')
            # Dropout comes after the box head, so only the class head sees
            # it.
            if self._use_dropout:
                net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            class_predictions_with_background = slim.conv2d(
                net, num_predictions_per_location * num_class_slots,
                [self._kernel_size, self._kernel_size],
                scope='ClassPredictor')
            if self._apply_sigmoid_to_scores:
                class_predictions_with_background = tf.sigmoid(
                    class_predictions_with_background)

    # Choose the two leading output dimensions [batch, num_anchors]. At most
    # one of them may be -1 in tf.reshape.
    batch_size = static_shape.get_batch_size(feature_shape)
    if batch_size is not None:
        leading_dims = [batch_size, -1]
    else:
        features_height = static_shape.get_height(feature_shape)
        features_width = static_shape.get_width(feature_shape)
        if features_height is None or features_width is None:
            # Neither the batch size nor the anchor count is static. The
            # product below would be `None * None` (a TypeError), so the
            # batch size is read at run time and -1 absorbs the anchors.
            leading_dims = [tf.shape(image_features)[0], -1]
        else:
            flattened_predictions_size = (features_height * features_width *
                                          num_predictions_per_location)
            leading_dims = [-1, flattened_predictions_size]

    box_encodings = tf.reshape(
        box_encodings, leading_dims + [1, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background, leading_dims + [num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    features_depth = static_shape.get_depth(image_features.get_shape())
    # Width of the optional extra layers: the feature depth clamped into
    # [self._min_depth, self._max_depth].
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            box_encodings = slim.conv2d(
                net, num_predictions_per_location * self._box_code_size,
                [self._kernel_size, self._kernel_size],
                scope='BoxEncodingPredictor')
            # The box head reads `net` before dropout; only the class head
            # is regularised.
            if self._use_dropout:
                net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            class_predictions_with_background = slim.conv2d(
                net, num_predictions_per_location * num_class_slots,
                [self._kernel_size, self._kernel_size],
                scope='ClassPredictor')
            if self._apply_sigmoid_to_scores:
                class_predictions_with_background = tf.sigmoid(
                    class_predictions_with_background)

    batch_size = static_shape.get_batch_size(image_features.get_shape())
    flattened_predictions_size = None
    if batch_size is None:
        features_height = static_shape.get_height(image_features.get_shape())
        features_width = static_shape.get_width(image_features.get_shape())
        if features_height is None or features_width is None:
            # Fully dynamic input: multiplying the unknown sizes would raise
            # a TypeError, so fall back to the run-time batch size and let
            # tf.reshape infer the anchor dimension.
            batch_size = tf.shape(image_features)[0]
        else:
            flattened_predictions_size = (features_height * features_width *
                                          num_predictions_per_location)

    if flattened_predictions_size is not None:
        # Unknown batch, static anchors: -1 stands for the batch dimension.
        box_encodings = tf.reshape(
            box_encodings,
            [-1, flattened_predictions_size, 1, self._box_code_size])
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            [-1, flattened_predictions_size, num_class_slots])
    else:
        # Batch size available (static or run time): -1 stands for the
        # anchor dimension.
        box_encodings = tf.reshape(
            box_encodings, [batch_size, -1, 1, self._box_code_size])
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            [batch_size, -1, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    input_shape = image_features.get_shape()
    features_depth = static_shape.get_depth(input_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    box_channels = num_predictions_per_location * self._box_code_size
    class_channels = num_predictions_per_location * num_class_slots
    kernel = [self._kernel_size, self._kernel_size]

    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
        # Both heads are plain convolutions: activation and normalizer are
        # switched off for them.
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            box_encodings = slim.conv2d(
                net, box_channels, kernel, scope='BoxEncodingPredictor')
            if self._use_dropout:
                # Applied after the box head, hence class head only.
                net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            class_predictions_with_background = slim.conv2d(
                net, class_channels, kernel, scope='ClassPredictor')
            if self._apply_sigmoid_to_scores:
                class_predictions_with_background = tf.sigmoid(
                    class_predictions_with_background)

    # Work out [batch, num_anchors] for the reshapes; only one of the two may
    # be left to tf.reshape as -1.
    batch_size = static_shape.get_batch_size(input_shape)
    num_anchors = -1
    if batch_size is None:
        features_height = static_shape.get_height(input_shape)
        features_width = static_shape.get_width(input_shape)
        if features_height is not None and features_width is not None:
            num_anchors = (features_height * features_width *
                           num_predictions_per_location)
            batch_size = -1
        else:
            # No static size at all. The anchor count cannot be computed in
            # Python (it would be `None * None`), so the batch size is taken
            # from the run-time shape instead.
            batch_size = tf.shape(image_features)[0]

    box_encodings = tf.reshape(
        box_encodings, [batch_size, num_anchors, 1, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [batch_size, num_anchors, num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        angle_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          1] holding one angle value per anchor.
        score_predictions: A float tensor of shape [batch_size, num_anchors,
          1] representing the score predictions for the proposals.
    """
    feature_shape = image_features.get_shape()
    features_depth = static_shape.get_depth(feature_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
         slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(net, depth, [1, 1],
                                  scope='Conv2d_%d_1x1_%d' % (i, depth))
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            # The box head is the only one with an activation: it overrides
            # the scope default with a sigmoid.
            box_encodings = slim.conv2d(
                net, num_predictions_per_location * self._box_code_size,
                [self._kernel_size, self._kernel_size],
                activation_fn=tf.nn.sigmoid,
                scope='BoxEncodingPredictor')
            angle_encodings = slim.conv2d(
                net, num_predictions_per_location * 1,
                [self._kernel_size, self._kernel_size],
                scope='AngleEncodingPredictor')
            # NOTE: no sigmoid is applied to the scores in this method; an
            # earlier `tf.sigmoid(score_predictions)` call was disabled.
            score_predictions = slim.conv2d(
                net, num_predictions_per_location * 1,
                [self._kernel_size, self._kernel_size],
                scope='ScorePredictor')

    # Leading output dimensions [batch, num_anchors]; tf.reshape may infer at
    # most one of them (-1).
    batch_size = static_shape.get_batch_size(feature_shape)
    if batch_size is not None:
        leading_dims = [batch_size, -1]
    else:
        features_height = static_shape.get_height(feature_shape)
        features_width = static_shape.get_width(feature_shape)
        if features_height is None or features_width is None:
            # Fully dynamic input: the static product below would be
            # `None * None` and raise a TypeError, so the batch size comes
            # from the run-time shape and -1 covers the anchors.
            leading_dims = [tf.shape(image_features)[0], -1]
        else:
            flattened_predictions_size = (features_height * features_width *
                                          num_predictions_per_location)
            leading_dims = [-1, flattened_predictions_size]

    box_encodings = tf.reshape(
        box_encodings, leading_dims + [1, self._box_code_size])
    angle_encodings = tf.reshape(angle_encodings, leading_dims + [1, 1])
    score_predictions = tf.reshape(score_predictions, leading_dims + [1])

    return {
        BOX_ENCODINGS: box_encodings,
        ANGLE_ENCODINGS: angle_encodings,
        SCORE_PREDICTIONS: score_predictions
    }