def test_die_on_tensor_shape_with_rank_three(self): tensor_shape = tf.TensorShape(dims=[32, 299, 384]) with self.assertRaises(ValueError): static_shape.get_batch_size(tensor_shape) static_shape.get_height(tensor_shape) static_shape.get_width(tensor_shape) static_shape.get_depth(tensor_shape)
def pad_to_multiple(tensor, multiple): """Returns the tensor zero padded to the specified multiple. Appends 0s to the end of the first and second dimension (height and width) of the tensor until both dimensions are a multiple of the input argument 'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input multiple of 4, PadToMultiple will append 0s so that the resulting tensor will be of shape [1, 4, 8, 1]. Args: tensor: rank 4 float32 tensor, where tensor -> [batch_size, height, width, channels]. multiple: the multiple to pad to. Returns: padded_tensor: the tensor zero padded to the specified multiple. """ tensor_shape = tensor.get_shape() batch_size = static_shape.get_batch_size(tensor_shape) tensor_height = static_shape.get_height(tensor_shape) tensor_width = static_shape.get_width(tensor_shape) tensor_depth = static_shape.get_depth(tensor_shape) if batch_size is None: batch_size = tf.shape(tensor)[0] if tensor_height is None: tensor_height = tf.shape(tensor)[1] padded_tensor_height = tf.to_int32( tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))) * multiple else: padded_tensor_height = int( math.ceil(float(tensor_height) / multiple) * multiple) if tensor_width is None: tensor_width = tf.shape(tensor)[2] padded_tensor_width = tf.to_int32( tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))) * multiple else: padded_tensor_width = int( math.ceil(float(tensor_width) / multiple) * multiple) if tensor_depth is None: tensor_depth = tf.shape(tensor)[3] # Use tf.concat instead of tf.pad to preserve static shape if padded_tensor_height != tensor_height: height_pad = tf.zeros([ batch_size, padded_tensor_height - tensor_height, tensor_width, tensor_depth ]) tensor = tf.concat([tensor, height_pad], 1) if padded_tensor_width != tensor_width: width_pad = tf.zeros([ batch_size, padded_tensor_height, padded_tensor_width - tensor_width, tensor_depth ]) tensor = tf.concat([tensor, width_pad], 2) return tensor
def build(self, input_shapes): """Creates the variables of the layer.""" if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]): raise ValueError( 'This box predictor was constructed with %d heads,' 'but there are %d inputs.' % (len( self._prediction_heads[BOX_ENCODINGS]), len(input_shapes))) for stack_index, input_shape in enumerate(input_shapes): net = [] # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth(input_shape) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}'.format( depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net.append( keras.Conv2D( depth, [1, 1], name='SharedConvolutions_%d/Conv2d_%d_1x1_%d' % (stack_index, i, depth), padding='SAME', **self._conv_hyperparams.params())) net.append( self._conv_hyperparams.build_batch_norm( training=(self._is_training and not self._freeze_batchnorm), name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm' % (stack_index, i, depth))) net.append( self._conv_hyperparams.build_activation_layer( name= 'SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation' % (stack_index, i, depth), )) # Until certain bugs are fixed in checkpointable lists, # this net must be appended only once it's been filled with layers self._shared_nets.append(net) self.built = True
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location oriented_box_encodings: A float tensor of shape [batch_size, num_anchors, 1, oriented_code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): #regular box encoding predictions box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') #oriented box encoding predictions oriented_box_encodings = slim.conv2d( net, num_predictions_per_location * self._oriented_box_code_size, [self._kernel_size, self._kernel_size], scope='OrientedBoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor') if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( image_features) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) oriented_box_encodings = tf.reshape( oriented_box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, 4, 2 ])) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) return { BOX_ENCODINGS: box_encodings, BOX_ENCODINGS_ORIENTED: oriented_box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background }
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ box_encodings_list = [] class_predictions_list = [] # TODO: Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] for (image_feature, num_predictions_per_location, box_predictor_scope) in zip( image_features, num_predictions_per_location_list, box_predictor_scopes): with box_predictor_scope: # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_feature with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth(image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info('depth of additional conv before box predictor: {}'. format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): if self._use_depthwise: box_encodings = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='BoxEncodingPredictor_depthwise') box_encodings = slim.conv2d( box_encodings, num_predictions_per_location * self._box_code_size, [1, 1], scope='BoxEncodingPredictor') else: box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) if self._use_depthwise: class_predictions_with_background = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='ClassPredictor_depthwise') class_predictions_with_background = slim.conv2d( class_predictions_with_background, num_predictions_per_location * num_class_slots, [1, 1], scope='ClassPredictor') else: class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor', biases_initializer=tf.constant_initializer( self._class_prediction_bias_init)) if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = (shape_utils. combined_static_and_dynamic_shape( image_feature)) box_encodings = tf.reshape( box_encodings, tf.stack([combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size])) box_encodings_list.append(box_encodings) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots])) class_predictions_list.append(class_predictions_with_background) return {BOX_ENCODINGS: tf.concat(box_encodings_list, axis=1), CLASS_PREDICTIONS_WITH_BACKGROUND: tf.concat(class_predictions_list, axis=1)}
def test_return_correct_depth(self): tensor_shape = tf.TensorShape(dims=[32, 299, 384, 3]) self.assertEqual(3, static_shape.get_depth(tensor_shape))
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: A dictionary containing: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, q, code_size] representing the location of the objects, where q is 1 or the number of classes. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. (optional) Predictions from other heads. """ predictions = { BOX_ENCODINGS: [], CLASS_PREDICTIONS_WITH_BACKGROUND: [], } for head_name in self._other_heads.keys(): predictions[head_name] = [] # TODO(rathodv): Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] for (image_feature, num_predictions_per_location, box_predictor_scope) in zip(image_features, num_predictions_per_location_list, box_predictor_scopes): net = image_feature with box_predictor_scope: with slim.arg_scope(self._conv_hyperparams_fn()): with slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth( image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}' .format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], reuse=tf.AUTO_REUSE, scope='Conv2d_%d_1x1_%d' % (i, depth)) sorted_keys = sorted(self._other_heads.keys()) sorted_keys.append(BOX_ENCODINGS) sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND) for head_name in sorted_keys: if head_name == BOX_ENCODINGS: head_obj = self._box_prediction_head elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND: head_obj = self._class_prediction_head else: head_obj = self._other_heads[head_name] prediction = head_obj.predict( features=net, num_predictions_per_location= num_predictions_per_location) predictions[head_name].append(prediction) return predictions