def test_die_on_tensor_shape_with_rank_three(self):
    """Checks that every static-shape accessor rejects a rank-3 shape.

    Each accessor is called inside its own ``assertRaises`` context. If all
    calls share a single context, the first ``ValueError`` leaves the ``with``
    block and the remaining accessors are never exercised.
    """
    tensor_shape = tf.TensorShape(dims=[32, 299, 384])
    accessors = (
        static_shape.get_batch_size,
        static_shape.get_height,
        static_shape.get_width,
        static_shape.get_depth,
    )
    for accessor in accessors:
        with self.assertRaises(ValueError):
            accessor(tensor_shape)
def pad_to_multiple(tensor, multiple):
    """Returns the tensor zero padded to the specified multiple.

    Appends 0s to the end of the height and width dimensions (axes 1 and 2)
    of the tensor until both dimensions are a multiple of the input argument
    'multiple'.

    E.g. given an input tensor of shape [1, 3, 5, 1] and an input multiple of
    4, PadToMultiple will append 0s so that the resulting tensor will be of
    shape [1, 4, 8, 1].

    Dimensions that are statically known are rounded up with Python
    arithmetic; dimensions that are unknown at graph-construction time are
    read from `tf.shape` and rounded up with tensor ops.

    Args:
      tensor: rank 4 tensor, where
        tensor -> [batch_size, height, width, channels]. The zero padding is
        created with the same dtype as this tensor.
      multiple: the multiple to pad to.

    Returns:
      padded_tensor: the tensor zero padded to the specified multiple.
    """
    tensor_shape = tensor.get_shape()
    batch_size = static_shape.get_batch_size(tensor_shape)
    tensor_height = static_shape.get_height(tensor_shape)
    tensor_width = static_shape.get_width(tensor_shape)
    tensor_depth = static_shape.get_depth(tensor_shape)

    # Any dimension that is not statically known is replaced by its dynamic
    # counterpart so it can still be used to build the padding shapes below.
    if batch_size is None:
        batch_size = tf.shape(tensor)[0]

    if tensor_height is None:
        tensor_height = tf.shape(tensor)[1]
        padded_tensor_height = tf.to_int32(
            tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))
        ) * multiple
    else:
        padded_tensor_height = int(
            math.ceil(float(tensor_height) / multiple) * multiple)

    if tensor_width is None:
        tensor_width = tf.shape(tensor)[2]
        padded_tensor_width = tf.to_int32(
            tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))
        ) * multiple
    else:
        padded_tensor_width = int(
            math.ceil(float(tensor_width) / multiple) * multiple)

    if tensor_depth is None:
        tensor_depth = tf.shape(tensor)[3]

    # Use tf.concat instead of tf.pad to preserve static shape.
    # With static (Python int) sizes these guards skip the concat when no
    # padding is needed.
    # NOTE(review): with dynamic (tensor) sizes the outcome of `!=` depends on
    # the TensorFlow version's tensor-comparison semantics -- confirm.
    if padded_tensor_height != tensor_height:
        # Match the input dtype so the concat also works for non-float32
        # tensors (tf.zeros would otherwise use its default dtype).
        height_pad = tf.zeros(
            [
                batch_size,
                padded_tensor_height - tensor_height,
                tensor_width,
                tensor_depth,
            ],
            dtype=tensor.dtype)
        tensor = tf.concat([tensor, height_pad], 1)

    if padded_tensor_width != tensor_width:
        # The height has already been padded at this point, so the width
        # padding must span the padded height.
        width_pad = tf.zeros(
            [
                batch_size,
                padded_tensor_height,
                padded_tensor_width - tensor_width,
                tensor_depth,
            ],
            dtype=tensor.dtype)
        tensor = tf.concat([tensor, width_pad], 2)

    return tensor
def test_return_correct_batchSize(self):
    """Checks that get_batch_size returns the leading dim of a rank-4 shape."""
    expected_batch_size = 32
    shape = tf.TensorShape(dims=[expected_batch_size, 299, 384, 3])
    actual_batch_size = static_shape.get_batch_size(shape)
    self.assertEqual(expected_batch_size, actual_batch_size)
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    feature_map_shape = image_features.get_shape()

    # Channel count of the incoming feature map, clamped to the configured
    # [min_depth, max_depth] range; used as the width of the optional extra
    # 1x1 conv layers.
    features_depth = static_shape.get_depth(feature_map_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
            slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        # (Original annotation: in the author's configuration both values
        # are zero, so no extra layers are added -- verify against your
        # config.)
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net, depth, [1, 1],
                    scope='Conv2d_%d_1x1_%d' % (i, depth))
        # The prediction heads emit raw values: no activation and no
        # normalization.
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            # Output depth: one box code per prediction at each location.
            box_encodings = slim.conv2d(
                net, num_predictions_per_location * self._box_code_size,
                [self._kernel_size, self._kernel_size],
                scope='BoxEncodingPredictor')
            # Dropout is applied after the box head has been built, so it
            # only affects the class prediction head.
            if self._use_dropout:
                net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            # Output depth: one score per class slot per prediction.
            class_predictions_with_background = slim.conv2d(
                net, num_predictions_per_location * num_class_slots,
                [self._kernel_size, self._kernel_size],
                scope='ClassPredictor')
            if self._apply_sigmoid_to_scores:
                class_predictions_with_background = tf.sigmoid(
                    class_predictions_with_background)

    # Flatten the spatial grid into a single anchor dimension. tf.reshape
    # accepts only one inferred (-1) dimension, so choose which one to infer.
    batch_size = static_shape.get_batch_size(feature_map_shape)
    if batch_size is not None:
        leading_dims = [batch_size, -1]
    else:
        features_height = static_shape.get_height(feature_map_shape)
        features_width = static_shape.get_width(feature_map_shape)
        if features_height is None or features_width is None:
            # Neither the batch size nor the spatial size is statically
            # known: multiplying None would raise TypeError, so read the
            # batch size at run time and let reshape infer the anchor count.
            leading_dims = [tf.shape(image_features)[0], -1]
        else:
            flattened_predictions_size = (
                features_height * features_width *
                num_predictions_per_location)
            leading_dims = [-1, flattened_predictions_size]

    box_encodings = tf.reshape(
        box_encodings, leading_dims + [1, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background, leading_dims + [num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.
    """
    input_shape = image_features.get_shape()

    # Width of the optional extra 1x1 conv layers: the input channel count
    # clamped to [self._min_depth, self._max_depth].
    features_depth = static_shape.get_depth(input_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    # Add a slot for the background class.
    num_class_slots = self.num_classes + 1
    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
            slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net, depth, [1, 1],
                    scope='Conv2d_%d_1x1_%d' % (i, depth))
        # Prediction heads use no activation and no normalization.
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            box_encodings = slim.conv2d(
                net, num_predictions_per_location * self._box_code_size,
                [self._kernel_size, self._kernel_size],
                scope='BoxEncodingPredictor')
            # Dropout is inserted after the box head is built, so only the
            # class head sees it.
            if self._use_dropout:
                net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            class_predictions_with_background = slim.conv2d(
                net, num_predictions_per_location * num_class_slots,
                [self._kernel_size, self._kernel_size],
                scope='ClassPredictor')
            if self._apply_sigmoid_to_scores:
                class_predictions_with_background = tf.sigmoid(
                    class_predictions_with_background)

    # Collapse height/width/predictions into one anchor axis. Only one
    # dimension of a reshape may be inferred (-1), so pick it based on which
    # sizes are statically known.
    batch_size = static_shape.get_batch_size(input_shape)
    if batch_size is not None:
        leading_dims = [batch_size, -1]
    else:
        features_height = static_shape.get_height(input_shape)
        features_width = static_shape.get_width(input_shape)
        if features_height is None or features_width is None:
            # The static anchor count cannot be computed (multiplying None
            # would raise TypeError); use the run-time batch size instead and
            # let reshape infer the anchor count.
            leading_dims = [tf.shape(image_features)[0], -1]
        else:
            flattened_predictions_size = (
                features_height * features_width *
                num_predictions_per_location)
            leading_dims = [-1, flattened_predictions_size]

    box_encodings = tf.reshape(
        box_encodings, leading_dims + [1, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background, leading_dims + [num_class_slots])
    return {BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location):
    """Computes encoded object locations, angles and corresponding scores.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        angle_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          1] holding one angle value per anchor.
        score_predictions: A float tensor of shape
          [batch_size, num_anchors, 1] representing the score predictions for
          the proposals.
    """
    input_shape = image_features.get_shape()

    # Width of the optional extra 1x1 conv layers: the input channel count
    # clamped to [self._min_depth, self._max_depth].
    features_depth = static_shape.get_depth(input_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)

    net = image_features
    with slim.arg_scope(self._conv_hyperparams), \
            slim.arg_scope([slim.dropout], is_training=self._is_training):
        # Add additional conv layers before the predictor.
        if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
                net = slim.conv2d(
                    net, depth, [1, 1],
                    scope='Conv2d_%d_1x1_%d' % (i, depth))
        # Default for the heads below: no activation and no normalization.
        with slim.arg_scope([slim.conv2d], activation_fn=None,
                            normalizer_fn=None, normalizer_params=None):
            # The box head explicitly overrides the scope default and applies
            # a sigmoid to its output.
            box_encodings = slim.conv2d(
                net, num_predictions_per_location * self._box_code_size,
                [self._kernel_size, self._kernel_size],
                activation_fn=tf.nn.sigmoid,
                scope='BoxEncodingPredictor')
            # One angle channel per prediction.
            angle_encodings = slim.conv2d(
                net, num_predictions_per_location * 1,
                [self._kernel_size, self._kernel_size],
                scope='AngleEncodingPredictor')
            # One score channel per prediction. No sigmoid is applied here;
            # the scores are returned as produced by the conv layer.
            score_predictions = slim.conv2d(
                net, num_predictions_per_location * 1,
                [self._kernel_size, self._kernel_size],
                scope='ScorePredictor')

    # Collapse height/width/predictions into one anchor axis. Only one
    # dimension of a reshape may be inferred (-1), so pick it based on which
    # sizes are statically known.
    batch_size = static_shape.get_batch_size(input_shape)
    if batch_size is not None:
        leading_dims = [batch_size, -1]
    else:
        features_height = static_shape.get_height(input_shape)
        features_width = static_shape.get_width(input_shape)
        if features_height is None or features_width is None:
            # The static anchor count cannot be computed (multiplying None
            # would raise TypeError); use the run-time batch size instead and
            # let reshape infer the anchor count.
            leading_dims = [tf.shape(image_features)[0], -1]
        else:
            flattened_predictions_size = (
                features_height * features_width *
                num_predictions_per_location)
            leading_dims = [-1, flattened_predictions_size]

    box_encodings = tf.reshape(
        box_encodings, leading_dims + [1, self._box_code_size])
    angle_encodings = tf.reshape(angle_encodings, leading_dims + [1, 1])
    score_predictions = tf.reshape(score_predictions, leading_dims + [1])
    return {
        BOX_ENCODINGS: box_encodings,
        ANGLE_ENCODINGS: angle_encodings,
        SCORE_PREDICTIONS: score_predictions
    }