def test_equal_static_shape_along_first_dim_succeeds(self):
    """Statically shaped tensors sharing a first dimension of 4 pass."""
    first = tf.constant(np.zeros([4, 2, 2, 1]))
    second = tf.constant(np.zeros([4, 7, 2]))
    with self.test_session() as sess:
        first_shape = shape_utils.combined_static_and_dynamic_shape(first)
        second_shape = shape_utils.combined_static_and_dynamic_shape(second)
        assert_op = shape_utils.assert_shape_equal_along_first_dimension(
            first_shape, second_shape)
        # Running the op must not raise.
        sess.run(assert_op)
def test_unequal_static_shape_along_first_dim_raises_exception(self):
    """Static first dimensions 4 and 6 are rejected with a ValueError."""
    first = tf.constant(np.zeros([4, 2, 2, 1]))
    second = tf.constant(np.zeros([6, 2, 3, 1]))
    with self.assertRaisesRegexp(ValueError, 'Unequal first dimension'):
        first_shape = shape_utils.combined_static_and_dynamic_shape(first)
        second_shape = shape_utils.combined_static_and_dynamic_shape(second)
        shape_utils.assert_shape_equal_along_first_dimension(
            first_shape, second_shape)
def test_equal_dynamic_shape_along_first_dim_succeeds(self):
    """Placeholders fed with matching first dimensions (5) pass the check."""
    rank4_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
    rank1_input = tf.placeholder(tf.float32, shape=[None])
    rank4_shape = shape_utils.combined_static_and_dynamic_shape(rank4_input)
    rank1_shape = shape_utils.combined_static_and_dynamic_shape(rank1_input)
    assert_op = shape_utils.assert_shape_equal_along_first_dimension(
        rank4_shape, rank1_shape)
    feeds = {
        rank4_input: np.zeros([5, 2, 2, 3]),
        rank1_input: np.zeros([5]),
    }
    with self.test_session() as sess:
        sess.run(assert_op, feed_dict=feeds)
def _find_interval_containing_new_value(x, new_value):
    """Find the index of x (ascending-ordered) after which new_value occurs.

    Args:
      x: tensor that is reshaped to a single row of length `x.shape[0]`.
      new_value: tensor that is reshaped to a single column of length
        `new_value.shape[0]`.

    Returns:
      The `tf.argmin` along axis 1 of the difference between neighbouring
      columns of the `new_value >= x` comparison table.
    """
    num_new = shape_utils.combined_static_and_dynamic_shape(new_value)[0]
    num_x = shape_utils.combined_static_and_dynamic_shape(x)[0]
    new_value_column = tf.reshape(new_value, shape=(num_new, 1))
    x_row = tf.reshape(x, shape=(1, num_x))
    # Entry [i, j] is 1 when new_value[i] >= x[j], else 0.
    at_or_above = tf.cast(new_value_column >= x_row, dtype=tf.int32)
    # A 1 -> 0 transition between neighbouring columns shows up as -1.
    step = at_or_above[:, 1:] - at_or_above[:, :-1]
    return tf.argmin(step, axis=1)
def test_unequal_dynamic_shape_along_first_dim_raises_tf_assert(self):
    """Feeding first dimensions 1 and 2 trips the runtime assertion."""
    rank4_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
    rank3_input = tf.placeholder(tf.float32, shape=[None, None, 3])
    rank4_shape = shape_utils.combined_static_and_dynamic_shape(rank4_input)
    rank3_shape = shape_utils.combined_static_and_dynamic_shape(rank3_input)
    assert_op = shape_utils.assert_shape_equal_along_first_dimension(
        rank4_shape, rank3_shape)
    feeds = {
        rank4_input: np.zeros([1, 2, 2, 3]),
        rank3_input: np.zeros([2, 4, 3]),
    }
    with self.test_session() as sess:
        with self.assertRaises(tf.errors.InvalidArgumentError):
            sess.run(assert_op, feed_dict=feeds)
def test_equal_dynamic_shape_along_first_dim_succeeds(self):
    """The dynamic-shape assertion passes when both inputs have 5 rows."""
    placeholders = (
        tf.placeholder(tf.float32, shape=[None, None, None, 3]),
        tf.placeholder(tf.float32, shape=[None]),
    )
    big, small = placeholders
    assert_op = shape_utils.assert_shape_equal_along_first_dimension(
        shape_utils.combined_static_and_dynamic_shape(big),
        shape_utils.combined_static_and_dynamic_shape(small))
    with self.test_session() as sess:
        sess.run(
            assert_op,
            feed_dict={big: np.zeros([5, 2, 2, 3]), small: np.zeros([5])})
def _build_(self, fea0, fea1, ind0, ind1, score_size, neg_fea,
            matched_class0, neg_matched_class, reuse_vars, scope,
            tile_fea1=True):
    """Scores `fea0` in mini-batches of 64 rows via `tf.map_fn`.

    When `fea1` is None or `tile_fea1` is false, the call is forwarded
    unchanged to `self._build_inner` and its two results are returned.
    Otherwise `fea0[0]` is zero-padded to a multiple of 64 rows, split
    into chunks, each chunk is scored by `self._build_inner`, and the
    padded rows are dropped from the merged result.

    Returns:
      A `(scores, loss)` pair; on the chunked path `loss` is always None
      and `scores` carries a new leading axis of size 1.
    """
    print(
        'Warning: Do not use this function (DeepCrossSimilarity._build) for training'
    )
    if fea1 is None or not tile_fea1:
        scores, loss = self._build_inner(
            fea0, fea1, ind0, ind1, score_size, neg_fea, matched_class0,
            neg_matched_class, reuse_vars, scope, tile_fea1)
        return scores, loss

    chunk_size = 64

    def score_chunk(chunk):
        # `_build_inner` is called with a leading axis of size 1 added.
        chunk_scores, _ = self._build_inner(
            chunk[tf.newaxis], fea1, ind0, ind1, score_size, neg_fea,
            matched_class0, neg_matched_class, reuse_vars, scope, tile_fea1)
        return chunk_scores[0]

    rows = fea0[0]
    rows_shape = shape_utils.combined_static_and_dynamic_shape(rows)
    # Number of rows needed to reach the next multiple of chunk_size.
    pad_rows = tf.mod(chunk_size - tf.mod(rows_shape[0], chunk_size),
                      chunk_size)
    padded = tf.pad(rows, [[0, pad_rows], [0, 0]])
    chunks = tf.reshape(padded, [-1, chunk_size, rows_shape[-1]])
    chunked_scores = tf.map_fn(
        score_chunk,
        chunks,
        dtype=tf.float32,
        parallel_iterations=1,
        back_prop=False,
        swap_memory=True,
        infer_shape=True,
        name='efficient_memory_deep_cs')
    chunked_shape = shape_utils.combined_static_and_dynamic_shape(
        chunked_scores)
    merged = tf.reshape(chunked_scores, [-1] + chunked_shape[2:])
    merged_shape = shape_utils.combined_static_and_dynamic_shape(merged)
    # Drop the scores that belong to the padding rows.
    trimmed = merged[:(merged_shape[0] - pad_rows)]
    return trimmed[tf.newaxis], None
def test_unequal_dynamic_shape_along_first_dim_raises_tf_assert(self):
    """A first-dimension mismatch at feed time raises InvalidArgumentError."""
    big = tf.placeholder(tf.float32, shape=[None, None, None, 3])
    small = tf.placeholder(tf.float32, shape=[None, None, 3])
    assert_op = shape_utils.assert_shape_equal_along_first_dimension(
        shape_utils.combined_static_and_dynamic_shape(big),
        shape_utils.combined_static_and_dynamic_shape(small))
    mismatched_feeds = {
        big: np.zeros([1, 2, 2, 3]),
        small: np.zeros([2, 4, 3]),
    }
    with self.test_session() as sess, self.assertRaises(
            tf.errors.InvalidArgumentError):
        sess.run(assert_op, feed_dict=mismatched_feeds)
def calibration_fn(class_predictions_with_background):
    """Calibrate predictions via 1-d linear interpolation.

    Prediction scores are linearly interpolated with a single
    class-agnostic function approximation, so every entry of the input
    (including the 0-indexed background class) is transformed.

    Args:
      class_predictions_with_background: tf.float32 tensor of shape
        [batch_size, num_anchors, num_classes + 1] containing scores on
        the interval [0,1]. This is usually produced by a sigmoid or
        softmax layer and the result of calling the `predict` method of a
        detection model.

    Returns:
      tf.float32 tensor of the same shape as the input with values on the
      interval [0, 1].
    """
    # The interpolation works on a flat vector of scores.
    scores_1d = tf.reshape(class_predictions_with_background, shape=[-1])
    knots_x, knots_y = _function_approximation_proto_to_tf_tensors(
        calibration_config.function_approximation.x_y_pairs)
    interpolated = _tf_linear_interp1d(scores_1d, knots_x, knots_y)

    # Restore the shape of the input.
    target_shape = shape_utils.combined_static_and_dynamic_shape(
        class_predictions_with_background)
    return tf.reshape(interpolated, shape=target_shape,
                      name='calibrate_scores')
def calibration_fn(class_predictions_with_background):
    """Calibrate predictions per class via 1-d linear interpolation.

    The input is split along its last axis into one tensor per class id.
    Each class that has an entry in the class-id-to-function mapping built
    from `calibration_config` is passed through `_tf_linear_interp1d`;
    classes without an entry are left unchanged (and logged).

    Args:
      class_predictions_with_background: tensor whose last axis indexes
        class ids.

    Returns:
      Tensor with the same shape as the input holding the calibrated
      scores.
    """
    class_id_function_dict = _get_class_id_function_dict(calibration_config)

    # Tensors are split by class and then recombined at the end to recover
    # the input's original shape. If a class id does not have calibration
    # parameters, it is left unchanged.
    class_tensors = tf.unstack(class_predictions_with_background, axis=-1)
    calibrated_class_tensors = []
    for class_id, class_tensor in enumerate(class_tensors):
        flat_class_tensor = tf.reshape(class_tensor, shape=[-1])
        if class_id in class_id_function_dict:
            output_tensor = _tf_linear_interp1d(
                x_to_interpolate=flat_class_tensor,
                fn_x=class_id_function_dict[class_id][0],
                fn_y=class_id_function_dict[class_id][1])
        else:
            # Message previously read "not not found".
            tf.logging.info(
                'Calibration parameters for class id `%d` not found',
                class_id)
            output_tensor = flat_class_tensor
        calibrated_class_tensors.append(output_tensor)

    # Stacking on axis 1 puts the class id back on the last axis before
    # the final reshape.
    combined_calibrated_tensor = tf.stack(calibrated_class_tensors, axis=1)
    input_shape = shape_utils.combined_static_and_dynamic_shape(
        class_predictions_with_background)
    calibrated_class_predictions_with_background = tf.reshape(
        combined_calibrated_tensor,
        shape=input_shape,
        name='calibrate_scores')
    return calibrated_class_predictions_with_background
def _predict(self, image_features, **kwargs):
    """Builds all-zero predictions sized from the first feature map.

    Args:
      image_features: sequence of feature tensors; only the first is used.
        Its dimensions 1 and 2 are multiplied to get the anchor count.
      **kwargs: unused.

    Returns:
      Dict with zero-valued box encodings and class predictions, plus mask
      predictions when `self._predict_mask` is set. The class axis has
      `self.num_classes` slots, or one more when
      `self._add_background_class` is set.
    """
    first_feature = image_features[0]
    feature_shape = shape_utils.combined_static_and_dynamic_shape(
        first_feature)
    batch_size = feature_shape[0]
    num_anchors = feature_shape[1] * feature_shape[2]
    code_size = 4
    # Scalar zero derived from the input, added to every output below.
    zero = tf.reduce_sum(0 * first_feature)

    if self._add_background_class:
        num_class_slots = self.num_classes + 1
    else:
        num_class_slots = self.num_classes

    box_shape = (batch_size, num_anchors, 1, code_size)
    class_shape = (batch_size, num_anchors, num_class_slots)
    mask_shape = (batch_size, num_anchors, self.num_classes,
                  DEFAULT_MASK_SIZE, DEFAULT_MASK_SIZE)
    box_encodings = zero + tf.zeros(box_shape, dtype=tf.float32)
    class_predictions_with_background = zero + tf.zeros(
        class_shape, dtype=tf.float32)
    masks = zero + tf.zeros(mask_shape, dtype=tf.float32)

    predictions_dict = {}
    predictions_dict[box_predictor.BOX_ENCODINGS] = box_encodings
    predictions_dict[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] = (
        class_predictions_with_background)
    if self._predict_mask:
        predictions_dict[box_predictor.MASK_PREDICTIONS] = masks
    return predictions_dict
def reduced_score(inp, reuse_vars=True):
    """Scores the features at the unique ids, then expands via index maps.

    Args:
      inp: 4-tuple `(solo_fea, ids0, ids1, target_score_inds)`.
        `solo_fea[0]` and `solo_fea[1]` are gathered with the unique ids
        returned by `unique_with_inverse` for `ids0` and `ids1`.
      reuse_vars: forwarded to the variable scope and to
        `self._cross_similarity._build`.

    Returns:
      `(scores, nscores)`: the scores gathered back with both index maps,
      and the product of dimensions 1 and 2 of the reduced score tensor.
    """
    solo_fea, ids0, ids1, target_score_inds = inp
    # Only the unique ids and the index maps are needed here.
    unique0, expand0, _ = unique_with_inverse(ids0)
    unique1, expand1, _ = unique_with_inverse(ids1)
    compact_fea0 = tf.gather(solo_fea[0], unique0)
    compact_fea1 = tf.gather(solo_fea[1], unique1)

    with tf.variable_scope(scope, reuse=reuse_vars) as sc:
        self._cross_similarity._target_score_inds = target_score_inds
        compact_scores, _ = self._cross_similarity._build(
            compact_fea0[tf.newaxis, ...],
            compact_fea1[tf.newaxis, ...],
            None, None, score_size, None, None, None,
            reuse_vars=reuse_vars, scope=sc)

    assert (compact_scores.shape[-1] == 1)
    # [1, m', l', 1]
    compact_shape = shape_utils.combined_static_and_dynamic_shape(
        compact_scores)
    nscores = compact_shape[1] * compact_shape[2]
    # [m', l', 1]
    compact_scores = tf.reshape(compact_scores, compact_shape[1:])
    # [m, l', 1]
    expanded_rows = tf.gather(compact_scores, expand0)
    # [m, l, 1]
    scores = tf.gather(expanded_rows, expand1, axis=1)
    return scores, nscores
def calibration_fn(class_predictions_with_background):
    """Calibrate predictions via 1-d linear interpolation.

    Prediction scores are linearly interpolated based on a class-agnostic
    function approximation. Note that the 0-indexed background class, if
    present, is also transformed.

    Args:
      class_predictions_with_background: tf.float32 tensor of shape
        [batch_size, num_anchors, num_classes + 1] containing scores on
        the interval [0,1]. This is usually produced by a sigmoid or
        softmax layer and the result of calling the `predict` method of a
        detection model.

    Returns:
      tf.float32 tensor of shape [batch_size, num_anchors, num_classes] if
      background class is not present (else shape is
      [batch_size, num_anchors, num_classes + 1]) on the interval [0, 1].
    """
    original_scores = class_predictions_with_background
    # Flatten, interpolate, then restore the input's shape.
    flattened = tf.reshape(original_scores, shape=[-1])
    x_knots, y_knots = _function_approximation_proto_to_tf_tensors(
        calibration_config.function_approximation.x_y_pairs)
    updated_scores = _tf_linear_interp1d(flattened, x_knots, y_knots)

    restored_shape = shape_utils.combined_static_and_dynamic_shape(
        original_scores)
    calibrated = tf.reshape(
        updated_scores, shape=restored_shape, name='calibrate_scores')
    return calibrated
def _batch_decode(self, box_encodings):
    """Decodes a batch of box encodings with respect to the anchors.

    Args:
      box_encodings: A float32 tensor of shape [batch_size, num_anchors,
        box_code_size] containing box encodings.

    Returns:
      decoded_boxes: A float32 tensor of shape [batch_size, num_anchors, 4]
        containing the decoded boxes.
      decoded_keypoints: A float32 tensor of shape [batch_size,
        num_anchors, num_keypoints, 2] containing the decoded keypoints if
        the decoded box list has a keypoints field, None otherwise.
    """
    encodings_shape = shape_utils.combined_static_and_dynamic_shape(
        box_encodings)
    batch_size = encodings_shape[0]
    num_anchors = encodings_shape[1]

    # Repeat the anchors once per batch element, then flatten both the
    # anchors and the encodings so the box coder sees one box per row.
    anchors_with_batch_axis = tf.expand_dims(self.anchors.get(), 0)
    tiled_anchor_boxes = tf.tile(anchors_with_batch_axis, [batch_size, 1, 1])
    flat_anchors = box_list.BoxList(tf.reshape(tiled_anchor_boxes, [-1, 4]))
    flat_encodings = tf.reshape(
        box_encodings, [-1, self._box_coder.code_size])
    decoded = self._box_coder.decode(flat_encodings, flat_anchors)

    decoded_keypoints = None
    if decoded.has_field(fields.BoxListFields.keypoints):
        flat_keypoints = decoded.get_field(fields.BoxListFields.keypoints)
        num_keypoints = flat_keypoints.get_shape()[1]
        keypoints_shape = tf.stack(
            [batch_size, num_anchors, num_keypoints, 2])
        decoded_keypoints = tf.reshape(flat_keypoints, keypoints_shape)

    boxes_shape = tf.stack([batch_size, num_anchors, 4])
    decoded_boxes = tf.reshape(decoded.get(), boxes_shape)
    return decoded_boxes, decoded_keypoints
def select_random_box(boxlist, default_box=None, seed=None, scope=None):
    """Selects a random bounding box from a `BoxList`.

    Args:
      boxlist: A BoxList.
      default_box: A [1, 4] float32 tensor. If no boxes are present in
        `boxlist`, this default box will be returned. If None, will use a
        default box of [[-1., -1., -1., -1.]].
      seed: Random seed.
      scope: Name scope.

    Returns:
      bbox: A [1, 4] tensor with a random bounding box.
      valid: A bool tensor indicating whether a valid bounding box is
        returned (True) or whether the default box is returned (False).
    """
    with tf.name_scope(scope, 'SelectRandomBox'):
        bboxes = boxlist.get()
        combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes)
        number_of_boxes = combined_shape[0]
        # Compare against None explicitly: `default_box or ...` would take
        # the truth value of a caller-supplied tensor/array, which raises
        # for graph tensors and is ambiguous for multi-element arrays.
        if default_box is None:
            default_box = tf.constant([[-1., -1., -1., -1.]])

        def select_box():
            random_index = tf.random_uniform(
                [], maxval=number_of_boxes, dtype=tf.int32, seed=seed)
            return (tf.expand_dims(bboxes[random_index], axis=0),
                    tf.constant(True))

        return tf.cond(
            tf.greater_equal(number_of_boxes, 1),
            true_fn=select_box,
            false_fn=lambda: (default_box, tf.constant(False)))
def nearest_neighbor_upsampling(input_tensor, scale):
    """Nearest neighbor upsampling implementation.

    Maps an input tensor with shape [batch_size, height, width, channels]
    to [batch_size, height * scale, width * scale, channels]. This
    implementation only uses reshape and broadcasting to make it TPU
    compatible.

    Args:
      input_tensor: A float32 tensor of size [batch, height_in, width_in,
        channels].
      scale: An integer multiple to scale resolution of input data.

    Returns:
      data_up: A float32 tensor of size
        [batch, height_in*scale, width_in*scale, channels].
    """
    with tf.name_scope('nearest_neighbor_upsampling'):
        input_shape = shape_utils.combined_static_and_dynamic_shape(
            input_tensor)
        batch_size, height, width, channels = input_shape
        # Insert singleton axes after height and width; multiplying by
        # ones broadcasts each pixel `scale` times along both of them.
        with_repeat_axes = tf.reshape(
            input_tensor, [batch_size, height, 1, width, 1, channels])
        replicator = tf.ones(
            [1, 1, scale, 1, scale, 1], dtype=input_tensor.dtype)
        replicated = with_repeat_axes * replicator
        upsampled_shape = [
            batch_size, height * scale, width * scale, channels
        ]
        return tf.reshape(replicated, upsampled_shape)
def test_combined_static_dynamic_shape(self):
    """Trailing dims of the combined shape are [2, 3] for several sizes."""
    expected_trailing_dims = [2, 3]
    for leading_dim in (2, 3, 4):
        zeros = tf.zeros((leading_dim, 2, 3))
        shape = shape_utils.combined_static_and_dynamic_shape(zeros)
        self.assertListEqual(shape[1:], expected_trailing_dims)
def predict(self, preprocessed_inputs, true_image_shapes):
    """Predicts unpostprocessed tensors from input tensor.

    Runs the feature extractor and the box predictor on a batch of images.
    A side effect of calling this method is that `self._anchors` is
    populated with a box_list.BoxList of anchors generated for the
    extracted feature maps.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] image tensor.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes of
        true images in the resized images, as resized images can be padded
        with zeros. Not read by this method.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) preprocessed_inputs: the input image tensor, unchanged.
        2) box_encodings: the predictor's box encodings concatenated along
          axis 1 with axis 2 squeezed out, i.e. shape
          [batch_size, num_anchors, box_code_dimension].
        3) class_predictions_with_background: the predictor's class
          predictions concatenated along axis 1, shape
          [batch_size, num_anchors, num_classes+1].
        4) feature_maps: the list of extracted feature maps.
        5) anchors: the tensor returned by `self._anchors.get()`.
    """
    with tf.variable_scope(None, self._extract_features_scope,
                           [preprocessed_inputs]):
        feature_maps = self._feature_extractor.extract_features(
            preprocessed_inputs)

    spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
    inputs_shape = shape_utils.combined_static_and_dynamic_shape(
        preprocessed_inputs)
    per_map_anchors = self._anchor_generator.generate(
        spatial_dims, im_height=inputs_shape[1], im_width=inputs_shape[2])
    self._anchors = box_list_ops.concatenate(per_map_anchors)

    raw_predictions = self._box_predictor.predict(
        feature_maps, self._anchor_generator.num_anchors_per_location())
    all_box_encodings = tf.concat(raw_predictions['box_encodings'], axis=1)
    box_encodings = tf.squeeze(all_box_encodings, axis=2)
    class_predictions_with_background = tf.concat(
        raw_predictions['class_predictions_with_background'], axis=1)

    return {
        'preprocessed_inputs': preprocessed_inputs,
        'box_encodings': box_encodings,
        'class_predictions_with_background':
            class_predictions_with_background,
        'feature_maps': feature_maps,
        'anchors': self._anchors.get()
    }
def _batch_decode(self, box_encodings):
    """Decodes a batch of box encodings with respect to the anchors.

    Args:
      box_encodings: A float32 tensor of shape [batch_size, num_anchors,
        box_code_size] containing box encodings.

    Returns:
      decoded_boxes: A float32 tensor of shape [batch_size, num_anchors, 4]
        containing the decoded boxes.
      decoded_keypoints: A float32 tensor of shape [batch_size,
        num_anchors, num_keypoints, 2] containing the decoded keypoints if
        the decoded box list has a keypoints field, None otherwise.
    """
    shape = shape_utils.combined_static_and_dynamic_shape(box_encodings)
    batch_size = shape[0]

    def _unflatten(flat_tensor, trailing_dims):
        # Restore the leading [batch, anchors] axes of a flat result.
        return tf.reshape(
            flat_tensor, tf.stack([shape[0], shape[1]] + trailing_dims))

    # One copy of the anchors per batch element, flattened to rows of 4.
    tiled_anchor_boxes = tf.tile(
        tf.expand_dims(self.anchors.get(), 0), [batch_size, 1, 1])
    tiled_anchors_boxlist = box_list.BoxList(
        tf.reshape(tiled_anchor_boxes, [-1, 4]))
    decoded_boxlist = self._box_coder.decode(
        tf.reshape(box_encodings, [-1, self._box_coder.code_size]),
        tiled_anchors_boxlist)

    keypoints_field = fields.BoxListFields.keypoints
    if decoded_boxlist.has_field(keypoints_field):
        keypoints = decoded_boxlist.get_field(keypoints_field)
        num_keypoints = keypoints.get_shape()[1]
        decoded_keypoints = _unflatten(keypoints, [num_keypoints, 2])
    else:
        decoded_keypoints = None

    decoded_boxes = _unflatten(decoded_boxlist.get(), [4])
    return decoded_boxes, decoded_keypoints
def _onehot_labels(class_hist, min_nmatch, neg_class_hist):
    """Derives at most one positive label per entry from a class histogram.

    A class is a candidate when its histogram value is at least
    `min_nmatch` and it survives `_clear_negative_classes`. Index 0 of the
    last axis is dropped (presumably the background slot -- confirm with
    the caller), and only the candidate that is also the argmax of the
    remaining histogram is kept.

    Returns:
      labels: bool tensor with at most one True along the last axis.
      fg: bool tensor (last axis of size 1), True where any label is set.
      bg: logical negation of `fg`.
    """
    # Match happens when at least min_nmatch
    # objects belongs to the same foreground class.
    meets_threshold = tf.less_equal(np.float32(min_nmatch), class_hist)
    candidates = tf.to_float(meets_threshold)
    candidates = _clear_negative_classes(candidates, neg_class_hist)
    candidates = tf.cast(candidates, dtype=tf.bool)[..., 1:]
    num_classes = shape_utils.combined_static_and_dynamic_shape(
        candidates)[-1]

    # Choose at most one positive label
    top_class = tf.argmax(class_hist[..., 1:], axis=-1)
    top_class_onehot = tf.cast(tf.one_hot(top_class, num_classes), tf.bool)
    labels = tf.logical_and(candidates, top_class_onehot)

    fg = tf.reduce_any(labels, axis=-1, keep_dims=True)
    bg = tf.logical_not(fg)
    return labels, fg, bg
def _coordinates_to_heatmap_dense(y_grid, x_grid, y_coordinates,
                                  x_coordinates, sigma, channel_onehot,
                                  channel_weights=None):
    """Dense version of coordinates to heatmap that uses an outer product.

    A Gaussian of width `sigma` centred on the floored coordinates is
    evaluated on the grid for every instance, multiplied by the instance's
    one-hot channel row (and optional weight), and reduced with a max over
    the instance axis.

    Returns:
      The heatmap, clamped below at 0 and wrapped in `tf.stop_gradient`.
    """
    onehot_shape = shape_utils.combined_static_and_dynamic_shape(
        channel_onehot)
    num_instances, num_channels = onehot_shape

    # A trailing axis lets the grids broadcast against the coordinates.
    x_grid = tf.expand_dims(x_grid, 2)
    y_grid = tf.expand_dims(y_grid, 2)
    # The raw center coordinates in the output space.
    dx = x_grid - tf.math.floor(x_coordinates)
    dy = y_grid - tf.math.floor(y_coordinates)
    dist_sq = dx**2 + dy**2
    gaussian = tf.exp(-dist_sq / (2 * sigma * sigma))

    gaussian_with_channel_axis = tf.expand_dims(gaussian, axis=-1)
    onehot_4d = tf.reshape(
        channel_onehot, (1, 1, num_instances, num_channels))
    per_instance_per_channel = gaussian_with_channel_axis * onehot_4d

    if channel_weights is not None:
        weights_4d = tf.reshape(channel_weights, (1, 1, num_instances, 1))
        per_instance_per_channel = per_instance_per_channel * weights_4d

    # Take maximum along the "instance" dimension so that all per-instance
    # heatmaps of the same class are merged together.
    merged = tf.reduce_max(per_instance_per_channel, axis=2)
    # Maximum of an empty tensor is -inf, the following is to avoid that.
    merged = tf.maximum(merged, 0)
    return tf.stop_gradient(merged)
def _predict(self, image_features, **kwargs):
    """Builds all-zero predictions sized from the first feature map.

    Args:
      image_features: sequence of feature tensors; only the first is used.
        Its dimensions 1 and 2 are multiplied to get the anchor count.
      **kwargs: unused.

    Returns:
      Dict with zero-valued box encodings and class predictions (with
      `self.num_classes + 1` class slots), plus mask predictions when
      `self._predict_mask` is set.
    """
    reference_feature = image_features[0]
    reference_shape = shape_utils.combined_static_and_dynamic_shape(
        reference_feature)
    batch_size = reference_shape[0]
    num_anchors = reference_shape[1] * reference_shape[2]
    code_size = 4
    # Scalar zero derived from the input, added to every output below.
    zero = tf.reduce_sum(0 * reference_feature)

    def _zeros_like_output(shape):
        return zero + tf.zeros(shape, dtype=tf.float32)

    box_encodings = _zeros_like_output(
        (batch_size, num_anchors, 1, code_size))
    class_predictions_with_background = _zeros_like_output(
        (batch_size, num_anchors, self.num_classes + 1))
    masks = _zeros_like_output(
        (batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
         DEFAULT_MASK_SIZE))

    predictions_dict = {
        box_predictor.BOX_ENCODINGS: box_encodings,
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background,
    }
    if self._predict_mask:
        predictions_dict[box_predictor.MASK_PREDICTIONS] = masks
    return predictions_dict
def _create_regression_targets_3d(self, anchors, groundtruth_boxes_3d,
                                  match):
    """Returns a regression target for each anchor.

    Args:
      anchors: a BoxList representing N anchors
      groundtruth_boxes_3d: box list whose `get()` tensor holds the
        groundtruth boxes; unmatched and ignored anchors are filled with
        `tf.zeros(6)` when gathering.
      match: a matcher.Match object

    Returns:
      reg_targets: a float32 tensor with shape [N, box_code_dimension]
    """
    zero_box = tf.zeros(6)
    gathered_gt_boxes = match.gather_based_on_match(
        groundtruth_boxes_3d.get(),
        unmatched_value=zero_box,
        ignored_value=tf.zeros(6))
    gathered_gt_boxlist = box_list.Box3dList(gathered_gt_boxes)
    encoded_targets = self._box_coder.encode_3d(gathered_gt_boxlist, anchors)

    num_anchors = shape_utils.combined_static_and_dynamic_shape(
        match.match_results)[0]
    # Zero out the unmatched and ignored regression targets.
    default_targets = tf.tile(
        self._default_regression_target_3d(), [num_anchors, 1])
    is_matched = match.matched_column_indicator()
    return tf.where(is_matched, encoded_targets, default_targets)
def tile_and_reshape_cobj_prop(prop, k):
    """Repeats every leading-axis entry of `prop` `k` times in a row.

    Since we have one feature vector for each co-object (each co-object is
    for k images) we need to repeat each co-object feature vector k times.

    Returns:
      Tensor with the same trailing dimensions as `prop`; the repeat axis
      is folded into the leading axis.
    """
    prop_shape = shape_utils.combined_static_and_dynamic_shape(prop)
    # A new axis 1 is tiled k times; all other axes keep a multiple of 1.
    multiples = [1, k] + [1] * (len(prop_shape) - 1)
    repeated = tf.tile(prop[:, tf.newaxis], multiples)
    folded_shape = [-1] + prop_shape[1:]
    return tf.reshape(repeated, folded_shape)
def predict(self, preprocessed_inputs, true_image_shapes):
    """Predicts unpostprocessed tensors from input tensor.

    Takes an input batch of images and runs it through the feature
    extractor and box predictor to yield unpostprocessed predictions.

    A side effect of calling the predict method is that `self._anchors` is
    populated with a box_list.BoxList of anchors.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] image tensor.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes of
        true images in the resized images, as resized images can be padded
        with zeros. Not read by this method.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) preprocessed_inputs: the input image tensor, unchanged.
        2) box_encodings: float tensor of shape
          [batch_size, num_anchors, box_code_dimension] (predictor output
          concatenated on axis 1, axis 2 squeezed).
        3) class_predictions_with_background: float tensor of shape
          [batch_size, num_anchors, num_classes+1] (predictor output
          concatenated on axis 1).
        4) feature_maps: the list of extracted feature maps.
        5) anchors: the tensor returned by `self._anchors.get()`.
    """
    extractor = self._feature_extractor
    with tf.variable_scope(None, self._extract_features_scope,
                           [preprocessed_inputs]):
        feature_maps = extractor.extract_features(preprocessed_inputs)

    feature_map_spatial_dims = self._get_feature_map_spatial_dims(
        feature_maps)
    image_shape = shape_utils.combined_static_and_dynamic_shape(
        preprocessed_inputs)
    im_height, im_width = image_shape[1], image_shape[2]
    self._anchors = box_list_ops.concatenate(
        self._anchor_generator.generate(
            feature_map_spatial_dims,
            im_height=im_height,
            im_width=im_width))

    predictor_output = self._box_predictor.predict(
        feature_maps, self._anchor_generator.num_anchors_per_location())
    box_encodings = tf.squeeze(
        tf.concat(predictor_output['box_encodings'], axis=1), axis=2)
    class_predictions_with_background = tf.concat(
        predictor_output['class_predictions_with_background'], axis=1)

    predictions_dict = dict([
        ('preprocessed_inputs', preprocessed_inputs),
        ('box_encodings', box_encodings),
        ('class_predictions_with_background',
         class_predictions_with_background),
        ('feature_maps', feature_maps),
        ('anchors', self._anchors.get()),
    ])
    return predictions_dict
def convert_proposal_inds(proposal_inds):
    """Swaps the last two axes and merges the first two into one.

    The input is laid out as
    [meta_batch_size, self.ncobj_proposals, k_shot]; the result is
    [meta_batch_size*k_shot, self.ncobj_proposals].
    """
    # [N, M, J] ==> [meta_batch_size, k_shot, self.ncobj_proposals]
    swapped = tf.transpose(proposal_inds, perm=[0, 2, 1])
    swapped_shape = shape_utils.combined_static_and_dynamic_shape(swapped)
    ncobj_proposals = swapped_shape[2]
    # ==> [meta_batch_size*k_shot, self.ncobj_proposals]
    return tf.reshape(swapped, [-1, ncobj_proposals])
def _match_when_rows_are_non_empty():
    """Performs matching when the rows of similarity matrix are non empty.

    Each column is first assigned the row with the highest similarity.
    When a matched threshold is configured, columns whose best similarity
    is below the unmatched threshold, or between the two thresholds, are
    overwritten with -1 / -2 (which value goes where depends on
    `self._negatives_lower_than_unmatched`). When
    `self._force_match_for_each_row` is set, the arg-max column of every
    valid row is additionally forced to match that row.

    Returns:
      matches: int32 tensor indicating the row each column matches to.
    """
    # Matches for each column
    matches = tf.argmax(input=similarity_matrix, axis=0,
                        output_type=tf.int32)

    # Deal with matched and unmatched threshold
    if self._matched_threshold is not None:
        best_per_column = tf.reduce_max(
            input_tensor=similarity_matrix, axis=0)
        below_unmatched_threshold = tf.greater(
            self._unmatched_threshold, best_per_column)
        between_thresholds = tf.logical_and(
            tf.greater_equal(best_per_column, self._unmatched_threshold),
            tf.greater(self._matched_threshold, best_per_column))

        if self._negatives_lower_than_unmatched:
            below_marker, between_marker = -1, -2
        else:
            below_marker, between_marker = -2, -1
        matches = self._set_values_using_indicator(
            matches, below_unmatched_threshold, below_marker)
        matches = self._set_values_using_indicator(
            matches, between_thresholds, between_marker)

    if not self._force_match_for_each_row:
        return matches

    num_columns = shape_utils.combined_static_and_dynamic_shape(
        similarity_matrix)[1]
    best_column_per_row = tf.argmax(
        input=similarity_matrix, axis=1, output_type=tf.int32)
    # One-hot row per similarity row, zeroed out for invalid rows.
    valid_row_weights = tf.cast(
        tf.expand_dims(valid_rows, axis=-1), dtype=tf.float32)
    forced_indicators = (
        tf.one_hot(best_column_per_row, depth=num_columns) *
        valid_row_weights)
    forced_row_per_column = tf.argmax(
        input=forced_indicators, axis=0, output_type=tf.int32)
    column_is_forced = tf.cast(
        tf.reduce_max(input_tensor=forced_indicators, axis=0), tf.bool)
    return tf.compat.v1.where(
        column_is_forced, forced_row_per_column, matches)
def predict(self, preprocessed_inputs, true_image_shapes, states=None,
            state_name='lstm_state', feature_scope=None):
    """Runs the recurrent feature extractor and the box predictor.

    Side effects: sets `self._batch_size`, `self._states` and
    `self._anchors`.

    Args:
      preprocessed_inputs: image tensor; dimensions 1 and 2 are used as
        the image height and width for anchor generation.
      true_image_shapes: not read by this method.
      states: forwarded to the feature extractor and stored on `self`.
      state_name: forwarded to the feature extractor.
      feature_scope: forwarded to the feature extractor as `scope`.

    Returns:
      Dict with the inputs, box encodings, class predictions, feature
      maps, anchors and the extractor's `states_and_outputs`; a 'step'
      entry is added when `states` is not None.
    """
    with tf.variable_scope(self._extract_features_scope,
                           values=[preprocessed_inputs],
                           reuse=tf.AUTO_REUSE):
        feature_maps = self._feature_extractor.extract_features(
            preprocessed_inputs,
            states,
            state_name,
            unroll_length=self._unroll_length,
            scope=feature_scope)

    spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
    inputs_shape = shape_utils.combined_static_and_dynamic_shape(
        preprocessed_inputs)
    # NOTE(review): `/` yields a float under Python 3 -- confirm intended.
    self._batch_size = preprocessed_inputs.shape[
        0].value / self._unroll_length
    self._states = states
    self._anchors = box_list_ops.concatenate(
        self._anchor_generator.generate(
            spatial_dims,
            im_height=inputs_shape[1],
            im_width=inputs_shape[2]))

    raw_predictions = self._box_predictor.predict(
        feature_maps, self._anchor_generator.num_anchors_per_location())

    # Multiscale_anchor_generator currently has a different dim compared to
    # ssd_anchor_generator. Current fix is to check the dim of the
    # box_encodings tensor. If dim is not 3(multiscale_anchor_generator),
    # squeeze the 3rd dim.
    # TODO(yinxiao): Remove this check once the anchor generator has unified
    # dimension.
    raw_box_encodings = raw_predictions['box_encodings']
    first_rank = len(raw_box_encodings[0].get_shape().as_list())
    box_encodings = tf.concat(raw_box_encodings, axis=1)
    if first_rank != 3:
        box_encodings = tf.squeeze(box_encodings, axis=2)

    class_predictions_with_background = tf.concat(
        raw_predictions['class_predictions_with_background'], axis=1)

    predictions_dict = {
        'preprocessed_inputs': preprocessed_inputs,
        'box_encodings': box_encodings,
        'class_predictions_with_background':
            class_predictions_with_background,
        'feature_maps': feature_maps,
        'anchors': self._anchors.get(),
        'states_and_outputs': self._feature_extractor.states_and_outputs,
    }
    # In cases such as exporting the model, the states is always zero. Thus
    # the step should be ignored.
    if states is not None:
        predictions_dict['step'] = self._feature_extractor.step
    return predictions_dict
def _coordinates_to_heatmap_sparse(y_grid, x_grid, y_coordinates,
                                   x_coordinates, sigma, channel_onehot,
                                   channel_weights=None):
    """Sparse version of coordinates to heatmap using tf.scatter.

    A Gaussian of width `sigma` centred on the floored coordinates is
    evaluated on the grid for every instance and scattered, with a max
    reduction, into the channel selected by the arg-max of that instance's
    row in `channel_onehot`.

    Raises:
      RuntimeError: if the installed TensorFlow does not provide
        `tf.tensor_scatter_nd_max`.

    Returns:
      The heatmap transposed so that channels are the last axis, clamped
      below at 0 and wrapped in `tf.stop_gradient`.
    """
    if not hasattr(tf, 'tensor_scatter_nd_max'):
        # Message previously misspelled the library name as "tensowflow".
        raise RuntimeError(
            ('Please upgrade tensorflow to use `tensor_scatter_nd_max` or set '
             'compute_heatmap_sparse=False'))
    _, num_channels = (
        shape_utils.combined_static_and_dynamic_shape(channel_onehot))

    height, width = shape_utils.combined_static_and_dynamic_shape(y_grid)
    # A trailing axis lets the grids broadcast against the coordinates.
    x_grid = tf.expand_dims(x_grid, 2)
    y_grid = tf.expand_dims(y_grid, 2)
    # The raw center coordinates in the output space.
    x_diff = x_grid - tf.math.floor(x_coordinates)
    y_diff = y_grid - tf.math.floor(y_coordinates)
    squared_distance = x_diff**2 + y_diff**2
    gaussian_map = tf.exp(-squared_distance / (2 * sigma * sigma))

    if channel_weights is not None:
        gaussian_map = (
            gaussian_map * channel_weights[tf.newaxis, tf.newaxis, :])

    # One scatter index (the channel) per instance.
    channel_indices = tf.argmax(channel_onehot, axis=1)
    channel_indices = channel_indices[:, tf.newaxis]

    heatmap_init = tf.zeros((num_channels, height, width))

    # Put the instance axis first so each instance map is one update.
    gaussian_map = tf.transpose(gaussian_map, (2, 0, 1))
    heatmap = tf.tensor_scatter_nd_max(
        heatmap_init, channel_indices, gaussian_map)

    # Maximum of an empty tensor is -inf, the following is to avoid that.
    heatmap = tf.maximum(heatmap, 0)

    return tf.stop_gradient(tf.transpose(heatmap, (1, 2, 0)))
def _build_(self, fea0, fea1, ind0, ind1, score_size, neg_fea,
            matched_class0, neg_matched_class, reuse_vars, scope):
    """Scores `fea0` in mini-batches of 64 rows via `tf.map_fn`.

    The first two axes of `fea0` are merged, the rows are zero-padded to a
    multiple of 64, each chunk is scored by `self._build_inner`, and the
    scores (minus the padding) are reshaped to the first two dimensions of
    `fea0` followed by `[1, 1]`. Also sets `self._joined_fea`.

    Returns:
      `(scores, None)`.
    """
    print(
        'Warning: Do not use this function (K1CrossSimilarity._build) for training'
    )
    chunk_size = 64

    def score_chunk(chunk):
        # `_build_inner` is called with a leading axis of size 1 added.
        chunk_scores, _ = self._build_inner(
            chunk[tf.newaxis], fea1, ind0, ind1, score_size, neg_fea,
            matched_class0, neg_matched_class, reuse_vars, scope)
        return chunk_scores[0]

    fea0_shape = shape_utils.combined_static_and_dynamic_shape(fea0)
    rows = tf.reshape(fea0, [-1] + fea0_shape[2:])
    num_rows = fea0_shape[0] * fea0_shape[1]
    # Number of rows needed to reach the next multiple of chunk_size.
    pad_rows = tf.mod(chunk_size - tf.mod(num_rows, chunk_size), chunk_size)
    rows = tf.pad(rows, [[0, pad_rows], [0, 0]])
    chunks = tf.reshape(rows, [-1, chunk_size, fea0_shape[-1]])

    chunked_scores = tf.map_fn(
        score_chunk,
        chunks,
        dtype=tf.float32,
        parallel_iterations=1,
        back_prop=False,
        swap_memory=True,
        infer_shape=True,
        name='memory_efficient_k1')
    chunked_shape = shape_utils.combined_static_and_dynamic_shape(
        chunked_scores)
    merged = tf.reshape(chunked_scores, [-1] + chunked_shape[2:])
    merged_shape = shape_utils.combined_static_and_dynamic_shape(merged)
    # Drop the scores that belong to the padding rows.
    trimmed = merged[:(merged_shape[0] - pad_rows)]
    scores = tf.reshape(trimmed, fea0_shape[:2] + [1, 1])
    # Ignores postconvline
    self._joined_fea = fea0[:, :, tf.newaxis]
    return scores, None
def predict(self, features, num_predictions_per_location):
  """Predicts class scores for every spatial location of a feature map.

  Args:
    features: A float tensor of shape [batch_size, height, width, channels]
      containing image features.
    num_predictions_per_location: Number of box predictions to be made per
      spatial location.

  Returns:
    class_predictions_with_background: A tensor of shape
      [batch_size, num_anchors, num_class_slots] when
      `self._return_flat_predictions` is True, otherwise a tensor of shape
      [batch, height, width, num_predictions_per_location * num_class_slots].
  """
  net = features
  if self._use_dropout:
    net = slim.dropout(net, keep_prob=self._dropout_keep_prob)

  conv_op = (
      functools.partial(slim.separable_conv2d, depth_multiplier=1)
      if self._use_depthwise else slim.conv2d)

  predictions = conv_op(
      net,
      num_predictions_per_location * self._num_class_slots,
      [self._kernel_size, self._kernel_size],
      activation_fn=None,
      stride=1,
      padding='SAME',
      normalizer_fn=None,
      biases_initializer=tf.constant_initializer(
          self._class_prediction_bias_init),
      scope=self._scope)

  feature_shape = shape_utils.combined_static_and_dynamic_shape(features)
  batch_size, height, width = feature_shape[0:3]

  # Expose the class axis so the score converter acts on one prediction at a
  # time.
  per_prediction_shape = [
      batch_size, height, width, num_predictions_per_location,
      self._num_class_slots
  ]
  predictions = tf.reshape(predictions, per_prediction_shape)
  predictions = self._score_converter_fn(predictions)

  if self._return_flat_predictions:
    output_shape = [batch_size, -1, self._num_class_slots]
  else:
    output_shape = [
        batch_size, height, width,
        num_predictions_per_location * self._num_class_slots
    ]
  return tf.reshape(predictions, output_shape)
def _build(self, fea0, fea1, score_size, reuse_vars):
  """Turns (optionally paired) features into per-entry value histograms.

  When `fea1` is given, `fea0` and `fea1` are tiled against each other and
  concatenated on the last axis; otherwise `fea0` just gains a new axis. The
  joined tensor is stored on `self._joined_fea`, reduced modulo
  `PROPOSALS_OFFSETS`, cast to int32 and counted with `tf.bincount` along the
  last axis.

  Args:
    fea0: First feature tensor.
    fea1: Optional second feature tensor, or None.
    score_size: Number of histogram bins (used as both min and max length).
    reuse_vars: Unused here.

  Returns:
    A float tensor with the leading dimensions of the joined features and a
    trailing histogram dimension.
  """
  tile_count = shape_utils.combined_static_and_dynamic_shape(fea0)[1]

  if fea1 is None:
    joined = fea0[:, :, tf.newaxis]
  else:
    tiled_fea0 = tf.tile(fea0[:, :, tf.newaxis], [1, 1, tile_count, 1])
    tiled_fea1 = tf.tile(fea1[:, tf.newaxis], [1, tile_count, 1, 1])
    joined = tf.concat((tiled_fea0, tiled_fea1), axis=-1)
  self._joined_fea = joined

  wrapped = tf.mod(joined, PROPOSALS_OFFSETS)
  wrapped_shape = shape_utils.combined_static_and_dynamic_shape(wrapped)
  rows = tf.to_int32(tf.reshape(wrapped, [-1] + [wrapped_shape[-1]]))

  def count_values(row):
    return tf.bincount(row, minlength=score_size, maxlength=score_size)

  histograms = tf.map_fn(count_values, rows, dtype=tf.int32)
  return tf.to_float(tf.reshape(histograms, wrapped_shape[:-1] + [-1]))
def _match_when_rows_are_non_empty():
  """Performs matching when the rows of similarity matrix are non empty.

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  # Best-scoring row for each column.
  matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

  # Relabel columns whose best score falls under the thresholds.
  if self._matched_threshold is not None:
    best_scores = tf.reduce_max(similarity_matrix, 0)
    is_below_unmatched = tf.greater(self._unmatched_threshold, best_scores)
    is_between_thresholds = tf.logical_and(
        tf.greater_equal(best_scores, self._unmatched_threshold),
        tf.greater(self._matched_threshold, best_scores))
    if self._negatives_lower_than_unmatched:
      below_label, between_label = -1, -2
    else:
      below_label, between_label = -2, -1
    matches = self._set_values_using_indicator(
        matches, is_below_unmatched, below_label)
    matches = self._set_values_using_indicator(
        matches, is_between_thresholds, between_label)

  if not self._force_match_for_each_row:
    # NOTE(review): `> 0` does not count columns matched to row 0 -- confirm
    # whether `>= 0` was intended.
    num_anchors = tf.reduce_sum(
        tf.cast(tf.greater(matches, 0), tf.int32))
    return tf.cond(num_anchors < self._minimum_anchor_num,
                   true_fn=_match_when_not_enough_anchors,
                   false_fn=lambda: matches)

  # Give every row its best column, overriding the threshold-based labels.
  matrix_shape = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)
  best_column_per_row = tf.argmax(similarity_matrix, 1,
                                  output_type=tf.int32)
  column_indicators = tf.one_hot(best_column_per_row, depth=matrix_shape[1])
  forced_row_ids = tf.argmax(column_indicators, 0, output_type=tf.int32)
  forced_column_mask = tf.cast(
      tf.reduce_max(column_indicators, 0), tf.bool)
  # Where the mask is True take the forced row id, otherwise keep `matches`.
  return tf.where(forced_column_mask, forced_row_ids, matches)
def _predict(self, image_features, num_predictions_per_location):
  """Returns all-zero image-level class predictions.

  Args:
    image_features: Feature tensor whose first dimension is the batch size.
    num_predictions_per_location: Unused here.

  Returns:
    A dict mapping `class_predictor.IMAGE_LEVEL_CLASS_PREDICTIONS` to a
    float32 tensor of shape [batch_size, self.num_classes].
  """
  batch_size = shape_utils.combined_static_and_dynamic_shape(
      image_features)[0]
  # A zero-valued scalar computed from the features; adding it leaves the
  # values at zero while making the output depend on `image_features`.
  feature_dependent_zero = tf.reduce_sum(0 * image_features)
  zero_predictions = tf.zeros(
      (batch_size, self.num_classes), dtype=tf.float32)
  class_predictions = feature_dependent_zero + zero_predictions
  return {
      class_predictor.IMAGE_LEVEL_CLASS_PREDICTIONS: class_predictions
  }
def _match_when_rows_are_empty():
  """Performs matching when the rows of similarity matrix are empty.

  With no rows to match against, every column is reported as unmatched (-1).

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  num_columns = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)[1]
  all_ones = tf.ones([num_columns], dtype=tf.int32)
  return -1 * all_ones
def _match_when_rows_are_empty():
  """Performs matching when the rows of similarity matrix are empty.

  With no rows to match against, every column is reported as unmatched (-1).

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  num_columns = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)[1]
  all_ones = tf.ones([num_columns], dtype=tf.int32)
  return -1 * all_ones
def calibration_fn(class_predictions_with_background):
  """Calibrate predictions per class via 1-d linear interpolation.

  Prediction scores are linearly interpolated with class-specific function
  approximations. Note that after calibration, an anchor's class scores will
  not necessarily sum to 1, and score ordering may change, depending on each
  class' calibration parameters.

  Classes without calibration parameters do not raise: a message is logged
  and their scores are passed through unchanged.

  Args:
    class_predictions_with_background: tf.float32 tensor of shape
      [batch_size, num_anchors, num_classes + 1] containing scores on the
      interval [0,1]. This is usually produced by a sigmoid or softmax layer
      and the result of calling the `predict` method of a detection model.

  Returns:
    tf.float32 tensor of the same shape as the input with values on the
    interval [0, 1].
  """
  class_id_function_dict = _get_class_id_function_dict(calibration_config)

  # Tensors are split by class and then recombined at the end to recover
  # the input's original shape. If a class id does not have calibration
  # parameters, it is left unchanged.
  class_tensors = tf.unstack(class_predictions_with_background, axis=-1)
  calibrated_class_tensors = []
  for class_id, class_tensor in enumerate(class_tensors):
    flat_class_tensor = tf.reshape(class_tensor, shape=[-1])
    if class_id in class_id_function_dict:
      output_tensor = _tf_linear_interp1d(
          x_to_interpolate=flat_class_tensor,
          fn_x=class_id_function_dict[class_id][0],
          fn_y=class_id_function_dict[class_id][1])
    else:
      tf.logging.info(
          'Calibration parameters for class id `%d` not found', class_id)
      output_tensor = flat_class_tensor
    calibrated_class_tensors.append(output_tensor)

  # Stack the per-class columns back together and restore the input shape.
  combined_calibrated_tensor = tf.stack(calibrated_class_tensors, axis=1)
  input_shape = shape_utils.combined_static_and_dynamic_shape(
      class_predictions_with_background)
  calibrated_class_predictions_with_background = tf.reshape(
      combined_calibrated_tensor,
      shape=input_shape,
      name='calibrate_scores')
  return calibrated_class_predictions_with_background
def _match_when_rows_are_non_empty():
  """Performs matching when the rows of similarity matrix are non empty.

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  # Best-scoring row for each column.
  matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

  # Relabel columns whose best score falls under the thresholds.
  if self._matched_threshold is not None:
    best_scores = tf.reduce_max(similarity_matrix, 0)
    is_below_unmatched = tf.greater(self._unmatched_threshold, best_scores)
    is_between_thresholds = tf.logical_and(
        tf.greater_equal(best_scores, self._unmatched_threshold),
        tf.greater(self._matched_threshold, best_scores))
    if self._negatives_lower_than_unmatched:
      below_label, between_label = -1, -2
    else:
      below_label, between_label = -2, -1
    matches = self._set_values_using_indicator(
        matches, is_below_unmatched, below_label)
    matches = self._set_values_using_indicator(
        matches, is_between_thresholds, between_label)

  if not self._force_match_for_each_row:
    return matches

  # Give every row its best column, overriding the threshold-based labels.
  matrix_shape = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)
  best_column_per_row = tf.argmax(similarity_matrix, 1,
                                  output_type=tf.int32)
  column_indicators = tf.one_hot(best_column_per_row, depth=matrix_shape[1])
  forced_row_ids = tf.argmax(column_indicators, 0, output_type=tf.int32)
  forced_column_mask = tf.cast(
      tf.reduce_max(column_indicators, 0), tf.bool)
  # Where the mask is True take the forced row id, otherwise keep `matches`.
  return tf.where(forced_column_mask, forced_row_ids, matches)
def retinanet(images,
              num_classes,
              num_anchors_per_loc,
              resnet_arch='resnet50',
              is_training=True):
  """Builds box and class prediction heads on top of the RetinaNet FPN.

  Args:
    images: input batch of images with shape (batch_size, h, w, 3)
    num_classes: number of classes for prediction
    num_anchors_per_loc: number of anchors at each feature map spatial
      location
    resnet_arch: name of the resnet architecture; must be a key of
      `RESNET_ARCH_BLOCK`
    is_training: indicate training or not

  Returns:
    A dict with the following items:
      box_pred: per-level box outputs reshaped to (batch_size, -1, 4) and
        concatenated along axis 1
      cls_pred: per-level class outputs reshaped to
        (batch_size, -1, num_classes + 1) and concatenated along axis 1
      feature_map_list: list of the feature map tensors, one per level
  """
  assert resnet_arch in list(
      RESNET_ARCH_BLOCK.keys()), "resnet architecture not defined"

  # One extra slot on top of `num_classes`.
  num_slots = num_classes + 1
  class_pred, box_pred, feature_map_list = [], [], []

  with tf.variable_scope('retinanet'):
    batch_size = combined_static_and_dynamic_shape(images)[0]
    features = retinanet_fpn(
        images,
        block_layers=RESNET_ARCH_BLOCK[resnet_arch],
        is_training=is_training)

    # The same variables are reused across all pyramid levels.
    with tf.variable_scope('class_net', reuse=tf.AUTO_REUSE):
      for level in features.keys():
        level_features = features[level]
        class_outputs = share_weight_class_net(
            level_features,
            level,
            num_slots,
            num_anchors_per_loc,
            is_training=is_training)
        class_pred.append(
            tf.reshape(class_outputs, shape=[batch_size, -1, num_slots]))
        feature_map_list.append(features[level])

    with tf.variable_scope('box_net', reuse=tf.AUTO_REUSE):
      for level in features.keys():
        box_outputs = share_weight_box_net(
            features[level],
            level,
            num_anchors_per_loc,
            is_training=is_training)
        box_pred.append(tf.reshape(box_outputs, shape=[batch_size, -1, 4]))

    return dict(
        box_pred=tf.concat(box_pred, axis=1),
        cls_pred=tf.concat(class_pred, axis=1),
        feature_map_list=feature_map_list)
def _sim(self, pos_fea, neg_fea, scope):
  """Computes similarity between positive features and negative features.

  Args:
    pos_fea: Positive features, reshaped below from [MBS*K, M, d] to
      [MBS, K*M, d].
    neg_fea: Negative features of shape [MBS, L, 1, 1, d]; axes 2 and 3 are
      squeezed away.
    scope: Variable scope (or name) for the cross-similarity variables. It is
      replaced by the shared pairwise scope when
      `self._share_weights_with_pairwise_cs` is set.

  Returns:
    The first output of `self._cross_similarity._build`.
  """
  # Reshape neg_fea [MBS, L, 1, 1, d] ==> [MBS, L, d]
  neg_fea = tf.squeeze(neg_fea, [2, 3])

  # Reshape pos_fea: [MBS*K, M, d] ==> [MBS, K*M, d]
  neg_shape = shape_utils.combined_static_and_dynamic_shape(neg_fea)
  original_pos_shape = shape_utils.combined_static_and_dynamic_shape(pos_fea)
  batch_size = neg_shape[0]
  pos_fea = tf.reshape(pos_fea, [batch_size, -1, original_pos_shape[-1]])

  share_weights = self._share_weights_with_pairwise_cs
  if share_weights:
    scope = PairwiseCrossSimilarity.k2_scope['pairwise_cross_similarity']

  pos_shape = shape_utils.combined_static_and_dynamic_shape(pos_fea)
  build_kwargs = {}

  ## Only compute sim to topk nn in the negative bags
  use_topk = self._topk and not isinstance(self._cross_similarity,
                                           CosineCrossSimilarity)
  if use_topk:
    cosine_similarity = CosineCrossSimilarity()
    fast_sim, _ = cosine_similarity._build(pos_fea, neg_fea, None, None, 1,
                                           None, None, None, False, None)
    # The cheap similarity is only used for selection, not for training.
    fast_sim = tf.stop_gradient(fast_sim[..., 0])
    _, topk_inds = tf.nn.top_k(fast_sim, self._topk, sorted=False)
    topk_inds = tf.reshape(topk_inds, [batch_size, -1])
    neg_fea = util.batched_gather(topk_inds, neg_fea)
    neg_fea = tf.reshape(neg_fea,
                         pos_shape[:2] + [self._topk, neg_shape[-1]])
    build_kwargs['tile_fea1'] = False

  with tf.variable_scope(scope, 'k1_cross_similarity') as entered_scope:
    self._cross_similarity._target_score_inds = self._target_score_inds
    sim, _ = self._cross_similarity._build(pos_fea, neg_fea, None, None, 1,
                                           None, None, None, False, None,
                                           **build_kwargs)

  # Remember the entered scope so the pairwise similarity shares it.
  if share_weights:
    PairwiseCrossSimilarity.k2_scope[
        'pairwise_cross_similarity'] = entered_scope
  return sim
def _match_when_rows_are_non_empty():
  """Performs matching when the rows of similarity matrix are non empty.

  Returns:
    matches: int32 tensor indicating the row each column matches to.
  """
  # Best-scoring row for each column.
  matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

  # Relabel columns whose best score falls under the thresholds.
  if self._matched_threshold is not None:
    best_scores = tf.reduce_max(similarity_matrix, 0)
    is_below_unmatched = tf.greater(self._unmatched_threshold, best_scores)
    is_between_thresholds = tf.logical_and(
        tf.greater_equal(best_scores, self._unmatched_threshold),
        tf.greater(self._matched_threshold, best_scores))
    if self._negatives_lower_than_unmatched:
      below_label, between_label = -1, -2
    else:
      below_label, between_label = -2, -1
    matches = self._set_values_using_indicator(
        matches, is_below_unmatched, below_label)
    matches = self._set_values_using_indicator(
        matches, is_between_thresholds, between_label)

  if not self._force_match_for_each_row:
    return matches

  # Give every valid row its best column; rows flagged invalid in
  # `valid_rows` are zeroed out so they cannot force a match.
  matrix_shape = shape_utils.combined_static_and_dynamic_shape(
      similarity_matrix)
  best_column_per_row = tf.argmax(similarity_matrix, 1,
                                  output_type=tf.int32)
  one_hot_columns = tf.one_hot(best_column_per_row, depth=matrix_shape[1])
  row_validity = tf.cast(tf.expand_dims(valid_rows, axis=-1),
                         dtype=tf.float32)
  column_indicators = one_hot_columns * row_validity
  forced_row_ids = tf.argmax(column_indicators, 0, output_type=tf.int32)
  forced_column_mask = tf.cast(
      tf.reduce_max(column_indicators, 0), tf.bool)
  # Where the mask is True take the forced row id, otherwise keep `matches`.
  return tf.where(forced_column_mask, forced_row_ids, matches)
def _predict(self, image_features, num_predictions_per_location):
  """Returns all-zero box encodings and class predictions.

  Args:
    image_features: Feature tensor; dimension 0 is used as the batch size and
      the product of dimensions 1 and 2 as the number of anchors.
    num_predictions_per_location: Unused here.

  Returns:
    A dict with
      `box_predictor.BOX_ENCODINGS`: float32 zeros of shape
        [batch_size, num_anchors, 1, 4].
      `box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND`: float32 zeros of
        shape [batch_size, num_anchors, self.num_classes + 1].
  """
  feature_shape = shape_utils.combined_static_and_dynamic_shape(
      image_features)
  batch_size = feature_shape[0]
  num_anchors = feature_shape[1] * feature_shape[2]
  code_size = 4

  # A zero-valued scalar computed from the features; adding it leaves the
  # values at zero while making the outputs depend on `image_features`.
  feature_dependent_zero = tf.reduce_sum(0 * image_features)

  box_encodings = feature_dependent_zero + tf.zeros(
      (batch_size, num_anchors, 1, code_size), dtype=tf.float32)
  class_predictions_with_background = feature_dependent_zero + tf.zeros(
      (batch_size, num_anchors, self.num_classes + 1), dtype=tf.float32)

  predictions = {}
  predictions[box_predictor.BOX_ENCODINGS] = box_encodings
  predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] = (
      class_predictions_with_background)
  return predictions
def _get_feature_map_spatial_dims(self, feature_maps):
  """Returns the (height, width) pair of every feature map in a list.

  Args:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i].

  Returns:
    a list of pairs (height, width) for each feature map in feature_maps
  """
  spatial_dims = []
  for feature_map in feature_maps:
    map_shape = shape_utils.combined_static_and_dynamic_shape(feature_map)
    spatial_dims.append((map_shape[1], map_shape[2]))
  return spatial_dims
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
  """Gathers rows of `params` along axis 0 using a one-hot matrix product.

  TODO(rathodv, jonathanhuang): enable sparse matmul option.

  Args:
    params: A float32 Tensor. The tensor from which to gather values.
      Must be at least rank 1.
    indices: A Tensor. Must be one of the following types: int32, int64.
      Must be in range [0, params.shape[0])
    scope: A name for the operation (optional).

  Returns:
    A Tensor. Has the same type as params. Values from params gathered
    from indices given by indices, with shape indices.shape + params.shape[1:].
  """
  with tf.name_scope(scope, 'MatMulGather'):
    source_shape = shape_utils.combined_static_and_dynamic_shape(params)
    index_shape = shape_utils.combined_static_and_dynamic_shape(indices)
    num_rows = source_shape[0]

    # Collapse everything but axis 0 so the gather becomes a 2-D matmul.
    flat_params = tf.reshape(params, [num_rows, -1])
    selector = tf.one_hot(indices, num_rows)
    flat_gathered = tf.matmul(selector, flat_params)

    output_shape = tf.stack(index_shape + source_shape[1:])
    return tf.reshape(flat_gathered, output_shape)
def predict(self,
            preprocessed_inputs,
            true_image_shapes,
            states=None,
            state_name='lstm_state',
            feature_scope=None):
  """Runs feature extraction and box prediction on preprocessed inputs.

  Side effects: sets `self._batch_size`, `self._states` and `self._anchors`.

  Args:
    preprocessed_inputs: Image tensor; dimensions 1 and 2 are used as the
      image height and width for anchor generation.
    true_image_shapes: Unused here.
    states: Optional states forwarded to the feature extractor. When None, no
      'step' entry is added to the result.
    state_name: Name forwarded to the feature extractor.
    feature_scope: Scope forwarded to the feature extractor.

  Returns:
    A dict with keys 'preprocessed_inputs', 'box_encodings',
    'class_predictions_with_background', 'feature_maps', 'anchors',
    'states_and_outputs' and, when `states` is not None, 'step'.
  """
  with tf.variable_scope(
      self._extract_features_scope,
      values=[preprocessed_inputs],
      reuse=tf.AUTO_REUSE):
    feature_maps = self._feature_extractor.extract_features(
        preprocessed_inputs,
        states,
        state_name,
        unroll_length=self._unroll_length,
        scope=feature_scope)

  feature_map_spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
  image_shape = shape_utils.combined_static_and_dynamic_shape(
      preprocessed_inputs)

  # NOTE(review): `/` is true division under Python 3, so this can be a
  # float -- confirm whether `//` was intended.
  self._batch_size = preprocessed_inputs.shape[0].value / self._unroll_length
  self._states = states

  generated_anchors = self._anchor_generator.generate(
      feature_map_spatial_dims,
      im_height=image_shape[1],
      im_width=image_shape[2])
  self._anchors = box_list_ops.concatenate(generated_anchors)

  prediction_dict = self._box_predictor.predict(
      feature_maps, self._anchor_generator.num_anchors_per_location())

  # Multiscale_anchor_generator currently has a different dim compared to
  # ssd_anchor_generator. Current fix is to check the dim of the box_encodings
  # tensor. If dim is not 3(multiscale_anchor_generator), squeeze the 3rd dim.
  # TODO(yinxiao): Remove this check once the anchor generator has unified
  # dimension.
  first_encoding_rank = len(
      prediction_dict['box_encodings'][0].get_shape().as_list())
  box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1)
  if first_encoding_rank != 3:
    box_encodings = tf.squeeze(box_encodings, axis=2)

  class_predictions_with_background = tf.concat(
      prediction_dict['class_predictions_with_background'], axis=1)

  predictions_dict = {
      'preprocessed_inputs': preprocessed_inputs,
      'box_encodings': box_encodings,
      'class_predictions_with_background': class_predictions_with_background,
      'feature_maps': feature_maps,
      'anchors': self._anchors.get(),
      'states_and_outputs': self._feature_extractor.states_and_outputs,
  }

  # In cases such as exporting the model, the states is always zero. Thus the
  # step should be ignored.
  if states is not None:
    predictions_dict['step'] = self._feature_extractor.step
  return predictions_dict
def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
                                width_scale=None):
  """Nearest neighbor upsampling implementation.

  Maps an input tensor with shape [batch_size, height, width, channels] to
  [batch_size, height * h_scale, width * w_scale, channels]. Only reshape and
  broadcasting are used, which keeps the implementation TPU compatible.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data in both
      height and width dimensions.
    height_scale: An integer multiple to scale the height of input image. This
      option when provided overrides `scale` option.
    width_scale: An integer multiple to scale the width of input image. This
      option when provided overrides `scale` option.

  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].

  Raises:
    ValueError: If `scale` is falsy and either `height_scale` or
      `width_scale` is None.
  """
  if not scale:
    if height_scale is None or width_scale is None:
      raise ValueError(
          'Provide either `scale` or `height_scale` and `width_scale`.')

  with tf.name_scope('nearest_neighbor_upsampling'):
    # Per-axis factors fall back to the shared `scale`.
    h_scale = height_scale if height_scale is not None else scale
    w_scale = width_scale if width_scale is not None else scale

    input_shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
    (batch_size, height, width, channels) = input_shape

    # Insert unit axes after height and width, then broadcast them to the
    # requested factors by multiplying with ones.
    expanded = tf.reshape(input_tensor,
                          [batch_size, height, 1, width, 1, channels])
    replicator = tf.ones([1, 1, h_scale, 1, w_scale, 1],
                         dtype=input_tensor.dtype)
    output_tensor = expanded * replicator
    return tf.reshape(
        output_tensor,
        [batch_size, height * h_scale, width * w_scale, channels])
def _compute_clip_window(self, preprocessed_images, true_image_shapes):
  """Computes clip window to use during post_processing.

  The window of each image is its true (unpadded) extent expressed as a
  fraction of the padded, preprocessed image size. When `true_image_shapes`
  is None a default window of [0, 0, 1, 1] is returned.

  Args:
    preprocessed_images: the [batch, height, width, channels] image
        tensor.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is
      of the form [height, width, channels] indicating the shapes
      of true images in the resized images, as resized images can be padded
      with zeros. Or None if the clip window should cover the full image.

  Returns:
    a 2-D float32 tensor of the form [batch_size, 4] containing the clip
    window for each image in the batch in normalized coordinates (relative to
    the resized dimensions) where each clip window is of the form [ymin, xmin,
    ymax, xmax] or a default clip window of [0, 0, 1, 1].
  """
  if true_image_shapes is None:
    return tf.constant([0, 0, 1, 1], dtype=tf.float32)

  padded_shape = shape_utils.combined_static_and_dynamic_shape(
      preprocessed_images)
  true_heights, true_widths, _ = tf.unstack(
      tf.to_float(true_image_shapes), axis=1)
  padded_height = tf.to_float(padded_shape[1])
  padded_width = tf.to_float(padded_shape[2])

  ymin = tf.zeros_like(true_heights)
  xmin = tf.zeros_like(true_widths)
  ymax = true_heights / padded_height
  xmax = true_widths / padded_width
  return tf.stack([ymin, xmin, ymax, xmax], axis=1)
def nearest_neighbor_upsampling(input_tensor, scale):
  """Nearest neighbor upsampling implementation.

  Maps an input tensor with shape [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. Only reshape and
  tile are used to keep it compatible with certain hardware.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data.

  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].
  """
  input_shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
  batch_size, height, width, channels = (input_shape[0], input_shape[1],
                                         input_shape[2], input_shape[3])
  expanded_shape = [batch_size, height, 1, width, 1, channels]
  upsampled_shape = [batch_size, height * scale, width * scale, channels]

  # Insert unit axes after height and width and repeat along them.
  expanded = tf.reshape(input_tensor, expanded_shape)
  tiled = tf.tile(expanded, [1, 1, scale, 1, scale, 1])
  return tf.reshape(tiled, upsampled_shape)
def _create_regression_targets(self, anchors, groundtruth_boxes, match):
  """Returns a regression target for each anchor.

  Matched anchors get the box-coder encoding of their groundtruth box; all
  other anchors get `self._default_regression_target()`.

  Args:
    anchors: a BoxList representing N anchors
    groundtruth_boxes: a BoxList representing M groundtruth_boxes
    match: a matcher.Match object

  Returns:
    reg_targets: a float32 tensor with shape [N, box_code_dimension]
  """
  gathered_boxes = match.gather_based_on_match(
      groundtruth_boxes.get(),
      unmatched_value=tf.zeros(4),
      ignored_value=tf.zeros(4))
  gathered_boxlist = box_list.BoxList(gathered_boxes)

  # Carry keypoints along when the groundtruth provides them.
  keypoints_field = fields.BoxListFields.keypoints
  if groundtruth_boxes.has_field(keypoints_field):
    groundtruth_keypoints = groundtruth_boxes.get_field(keypoints_field)
    gathered_keypoints = match.gather_based_on_match(
        groundtruth_keypoints,
        unmatched_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]),
        ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
    gathered_boxlist.add_field(keypoints_field, gathered_keypoints)

  encoded_targets = self._box_coder.encode(gathered_boxlist, anchors)

  # Zero out the unmatched and ignored regression targets.
  num_anchors = shape_utils.combined_static_and_dynamic_shape(
      match.match_results)[0]
  default_targets = tf.tile(
      self._default_regression_target(), [num_anchors, 1])
  is_matched = match.matched_column_indicator()
  return tf.where(is_matched, encoded_targets, default_targets)
def nearest_neighbor_upsampling(input_tensor, scale):
  """Nearest neighbor upsampling implementation.

  Maps an input tensor with shape [batch_size, height, width, channels] to
  [batch_size, height * scale, width * scale, channels]. Only reshape and
  broadcasting are used, which keeps the implementation TPU compatible.

  Args:
    input_tensor: A float32 tensor of size [batch, height_in, width_in,
      channels].
    scale: An integer multiple to scale resolution of input data.

  Returns:
    data_up: A float32 tensor of size
      [batch, height_in*scale, width_in*scale, channels].
  """
  with tf.name_scope('nearest_neighbor_upsampling'):
    input_shape = shape_utils.combined_static_and_dynamic_shape(input_tensor)
    (batch_size, height, width, channels) = input_shape

    # Insert unit axes after height and width, then broadcast them to
    # `scale` by multiplying with ones.
    expanded = tf.reshape(input_tensor,
                          [batch_size, height, 1, width, 1, channels])
    replicator = tf.ones([1, 1, scale, 1, scale, 1],
                         dtype=input_tensor.dtype)
    output_tensor = expanded * replicator
    return tf.reshape(output_tensor,
                      [batch_size, height * scale, width * scale, channels])
def select_random_box(boxlist, default_box=None, seed=None, scope=None):
  """Selects a random bounding box from a `BoxList`.

  Args:
    boxlist: A BoxList.
    default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
      this default box will be returned. If None, will use a default box of
      [[-1., -1., -1., -1.]].
    seed: Random seed.
    scope: Name scope.

  Returns:
    bbox: A [1, 4] tensor with a random bounding box.
    valid: A bool tensor indicating whether a valid bounding box is returned
      (True) or whether the default box is returned (False).
  """
  with tf.name_scope(scope, 'SelectRandomBox'):
    bboxes = boxlist.get()
    combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes)
    number_of_boxes = combined_shape[0]

    # Compare against None explicitly: truth-testing a caller-supplied tensor
    # (as `default_box or ...` would) is not allowed in graph mode.
    if default_box is None:
      default_box = tf.constant([[-1., -1., -1., -1.]])

    def select_box():
      random_index = tf.random_uniform([],
                                       maxval=number_of_boxes,
                                       dtype=tf.int32,
                                       seed=seed)
      return tf.expand_dims(bboxes[random_index], axis=0), tf.constant(True)

    # Fall back to the default box when the list is empty.
    return tf.cond(
        tf.greater_equal(number_of_boxes, 1),
        true_fn=select_box,
        false_fn=lambda: (default_box, tf.constant(False)))
def _batch_decode(self, box_encodings):
  """Decodes a batch of box encodings with respect to the anchors.

  The anchors are repeated once per batch element so that encodings and
  anchors can be flattened and decoded in a single box-coder call.

  Args:
    box_encodings: A float32 tensor of shape
      [batch_size, num_anchors, box_code_size] containing box encodings.

  Returns:
    decoded_boxes: A float32 tensor of shape
      [batch_size, num_anchors, 4] containing the decoded boxes.
  """
  encodings_shape = shape_utils.combined_static_and_dynamic_shape(
      box_encodings)
  batch_size, num_anchors = encodings_shape[0], encodings_shape[1]
  code_size = self._box_coder.code_size

  batched_anchors = tf.expand_dims(self.anchors.get(), 0)
  tiled_anchors = tf.tile(batched_anchors, [batch_size, 1, 1])
  flat_anchor_boxlist = box_list.BoxList(
      tf.reshape(tiled_anchors, [-1, code_size]))

  flat_encodings = tf.reshape(box_encodings, [-1, code_size])
  decoded = self._box_coder.decode(flat_encodings, flat_anchor_boxlist)

  return tf.reshape(decoded.get(), tf.stack([batch_size, num_anchors, 4]))
def test_combines_static_dynamic_shape(self):
  """Unknown dims come back as tensors, known dims as Python ints."""
  placeholder = tf.placeholder(tf.float32, shape=(None, 2, 3))
  dims = shape_utils.combined_static_and_dynamic_shape(placeholder)
  # The first dimension is unknown statically, so it must be a tensor.
  self.assertTrue(tf.contrib.framework.is_tensor(dims[0]))
  self.assertListEqual(dims[1:], [2, 3])
def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
  """Unstacks all tensors in `tensor_dict` along 0th dimension.

  Unstacks tensor from the tensor dict along 0th dimension and returns a
  tensor_dict containing values that are lists of unstacked tensors.

  Tensors in the `tensor_dict` are expected to be of one of the three shapes:
  1. [batch_size]
  2. [batch_size, height, width, channels]
  3. [batch_size, num_boxes, d1, d2, ... dn]

  When unpad_groundtruth_tensors is set to true, unstacked tensors of form 3
  above are sliced along the `num_boxes` dimension using the value in tensor
  field.InputDataFields.num_groundtruth_boxes.

  Note that this function has a static list of input data fields and has to be
  kept in sync with the InputDataFields defined in core/standard_fields.py

  Args:
    tensor_dict: A dictionary of batched groundtruth tensors.
    unpad_groundtruth_tensors: Whether to remove padding along `num_boxes`
      dimension of the groundtruth tensors.

  Returns:
    A dictionary where the keys are from fields.InputDataFields and values are
    a list of unstacked (optionally unpadded) tensors.

  Raises:
    ValueError: If unpad_groundtruth_tensors is True and `tensor_dict` does
      not contain `num_groundtruth_boxes` tensor.
  """
  unbatched_tensor_dict = {
      key: tf.unstack(tensor) for key, tensor in tensor_dict.items()
  }
  if not unpad_groundtruth_tensors:
    return unbatched_tensor_dict

  num_boxes_key = fields.InputDataFields.num_groundtruth_boxes
  if num_boxes_key not in unbatched_tensor_dict:
    raise ValueError('`num_groundtruth_boxes` not found in tensor_dict. '
                     'Keys available: {}'.format(
                         unbatched_tensor_dict.keys()))

  def remove_padding(num_gt, padded_tensor):
    # Keep the first `num_gt` entries of axis 0 and every other axis whole.
    tensor_shape = shape_utils.combined_static_and_dynamic_shape(
        padded_tensor)
    slice_begin = tf.zeros([len(tensor_shape)], dtype=tf.int32)
    slice_size = tf.stack(
        [num_gt] + [-1 if dim is None else dim for dim in tensor_shape[1:]])
    return tf.slice(padded_tensor, slice_begin, slice_size)

  unpad_keys = set([
      # List of input data fields that are padded along the num_boxes
      # dimension. This list has to be kept in sync with InputDataFields in
      # standard_fields.py.
      fields.InputDataFields.groundtruth_instance_masks,
      fields.InputDataFields.groundtruth_classes,
      fields.InputDataFields.groundtruth_boxes,
      fields.InputDataFields.groundtruth_keypoints,
      fields.InputDataFields.groundtruth_group_of,
      fields.InputDataFields.groundtruth_difficult,
      fields.InputDataFields.groundtruth_is_crowd,
      fields.InputDataFields.groundtruth_area,
      fields.InputDataFields.groundtruth_weights
  ]).intersection(set(unbatched_tensor_dict.keys()))

  unbatched_unpadded_tensor_dict = {}
  for key in unpad_keys:
    per_image_pairs = zip(unbatched_tensor_dict[num_boxes_key],
                          unbatched_tensor_dict[key])
    unbatched_unpadded_tensor_dict[key] = [
        remove_padding(num_gt, padded_tensor)
        for num_gt, padded_tensor in per_image_pairs
    ]
  unbatched_tensor_dict.update(unbatched_unpadded_tensor_dict)

  return unbatched_tensor_dict
def postprocess(self, prediction_dict, true_image_shapes):
  """Turns raw prediction tensors into final detections.

  The box encodings are decoded with `self._batch_decode`, the class
  predictions are converted with `self._score_conversion_fn`, the first
  class column is sliced off when `self._add_background_class` is set, and
  the result is handed to `self._non_max_suppression_fn` together with the
  clip window from `self._compute_clip_window`. See base class for output
  format conventions. Note that by default scores are to be interpreted as
  logits, but if a score_conversion_fn is used, scores are remapped (and may
  thus have a different interpretation).

  Args:
    prediction_dict: a dictionary holding prediction tensors with
      1) preprocessed_inputs: a [batch, height, width, channels] image
        tensor.
      2) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
        box_code_dimension] containing predicted boxes.
      3) class_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_anchors, num_classes+1] containing class
        predictions (logits) for each of the anchors. Note that this tensor
        *includes* background class predictions.
      4) mask_predictions: (optional) a 5-D float tensor of shape
        [batch_size, num_anchors, q, mask_height, mask_width]. `q` can be
        either number of classes or 1 depending on whether a separate mask
        is predicted per class.
      5) feature_maps: (optional) an iterable of tensors; each one is
        reshaped to [batch_size, -1] and the results are concatenated into
        an op named 'raw_box_features'.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
      the form [height, width, channels] indicating the shapes of true
      images in the resized images, as resized images can be padded with
      zeros. Or None, if the clip window should cover the full image.

  Returns:
    detections: a dictionary containing the following fields
      detection_boxes: [batch, max_detections, 4]
      detection_scores: [batch, max_detections]
      detection_classes: [batch, max_detections]
      detection_keypoints: [batch, max_detections, num_keypoints, 2] (if
        encoded in the prediction_dict 'box_encodings')
      detection_masks: [batch_size, max_detections, mask_height, mask_width]
        (optional)
      num_detections: [batch]

  Raises:
    ValueError: if prediction_dict does not contain `box_encodings` or
      `class_predictions_with_background` fields.
  """
  required_keys = ('box_encodings', 'class_predictions_with_background')
  if any(key not in prediction_dict for key in required_keys):
    raise ValueError('prediction_dict does not contain expected entries.')

  with tf.name_scope('Postprocessor'):
    preprocessed_images = prediction_dict['preprocessed_inputs']
    raw_encodings = tf.identity(
        prediction_dict['box_encodings'], 'raw_box_encodings')
    class_predictions = prediction_dict['class_predictions_with_background']

    decoded_boxes, detection_keypoints = self._batch_decode(raw_encodings)
    decoded_boxes = tf.identity(decoded_boxes, 'raw_box_locations')
    detection_boxes = tf.expand_dims(decoded_boxes, axis=2)

    detection_scores = tf.identity(
        self._score_conversion_fn(class_predictions), 'raw_box_scores')
    if self._add_background_class:
      # Drop index 0 of the last axis (the background column).
      detection_scores = tf.slice(detection_scores, [0, 0, 1], [-1, -1, -1])

    batch_size = shape_utils.combined_static_and_dynamic_shape(
        preprocessed_images)[0]

    if 'feature_maps' in prediction_dict:
      flattened_maps = [
          tf.reshape(feature_map, [batch_size, -1])
          for feature_map in prediction_dict['feature_maps']
      ]
      # The result is not consumed in this method; the op is created only so
      # a tensor named 'raw_box_features' exists in the graph (presumably to
      # be fetched by name elsewhere -- confirm against callers).
      tf.identity(tf.concat(flattened_maps, 1), 'raw_box_features')

    if detection_keypoints is None:
      additional_fields = None
    else:
      additional_fields = {
          fields.BoxListFields.keypoints: detection_keypoints}

    clip_window = self._compute_clip_window(preprocessed_images,
                                            true_image_shapes)
    nms_outputs = self._non_max_suppression_fn(
        detection_boxes,
        detection_scores,
        clip_window=clip_window,
        additional_fields=additional_fields,
        masks=prediction_dict.get('mask_predictions'))
    (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
     nmsed_additional_fields, num_detections) = nms_outputs

    result_fields = fields.DetectionResultFields
    detection_dict = {
        result_fields.detection_boxes: nmsed_boxes,
        result_fields.detection_scores: nmsed_scores,
        result_fields.detection_classes: nmsed_classes,
        result_fields.num_detections: tf.to_float(num_detections)
    }

    has_keypoints = (
        nmsed_additional_fields is not None and
        fields.BoxListFields.keypoints in nmsed_additional_fields)
    if has_keypoints:
      detection_dict[result_fields.detection_keypoints] = (
          nmsed_additional_fields[fields.BoxListFields.keypoints])
    if nmsed_masks is not None:
      detection_dict[result_fields.detection_masks] = nmsed_masks
    return detection_dict
def predict(self, preprocessed_inputs, true_image_shapes):
  """Predicts unpostprocessed tensors from input tensor.

  Runs a batch of images through the feature extractor and the box predictor
  and collects the raw outputs.

  Side effects: `self._anchors` is set to the concatenated output of the
  anchor generator, and `self._batched_prediction_tensor_names` is set to
  every key of the returned dictionary except 'anchors'. The anchors must be
  constructed before the postprocess or loss functions can be called.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] image tensor.
    true_image_shapes: int32 tensor of shape [batch, 3] where each row is
      of the form [height, width, channels] indicating the shapes
      of true images in the resized images, as resized images can be padded
      with zeros. Not read by this method.

  Returns:
    prediction_dict: a dictionary holding "raw" prediction tensors:
      1) preprocessed_inputs: the [batch, height, width, channels] image
        tensor.
      2) feature_maps: the feature extractor output; a list of tensors where
        the ith tensor has shape [batch, height_i, width_i, depth_i].
      3) anchors: 2-D float tensor of shape [num_anchors, 4] containing
        the generated anchors in normalized coordinates.
      4) one entry per key returned by the box predictor, each being the
        predictor's list of tensors concatenated along axis 1. This
        includes:
        box_encodings: float tensor of shape [batch_size, num_anchors,
          box_code_dimension] containing predicted boxes. A 4-D result
          whose axis 2 has static size 1 is squeezed to this 3-D form.
        class_predictions_with_background: 3-D float tensor of shape
          [batch_size, num_anchors, num_classes+1] containing class
          predictions (logits) for each of the anchors. Note that this
          tensor *includes* background class predictions (at class index 0).
  """
  batchnorm_updates_collections = (
      tf.GraphKeys.UPDATE_OPS if not self._inplace_batchnorm_update else None)

  def batchnorm_scope():
    # Built on demand so the training flags are only read on the non-Keras
    # paths.
    return slim.arg_scope(
        [slim.batch_norm],
        is_training=(self._is_training and not self._freeze_batchnorm),
        updates_collections=batchnorm_updates_collections)

  extractor = self._feature_extractor
  if extractor.is_keras_model:
    feature_maps = extractor(preprocessed_inputs)
  else:
    with batchnorm_scope():
      with tf.variable_scope(None, self._extract_features_scope,
                             [preprocessed_inputs]):
        feature_maps = extractor.extract_features(preprocessed_inputs)

  spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
  image_shape = shape_utils.combined_static_and_dynamic_shape(
      preprocessed_inputs)
  anchor_grids = self._anchor_generator.generate(
      spatial_dims,
      im_height=image_shape[1],
      im_width=image_shape[2])
  self._anchors = box_list_ops.concatenate(anchor_grids)

  box_predictor = self._box_predictor
  if box_predictor.is_keras_model:
    predictor_results_dict = box_predictor(feature_maps)
  else:
    with batchnorm_scope():
      predictor_results_dict = box_predictor.predict(
          feature_maps, self._anchor_generator.num_anchors_per_location())

  predictions_dict = {
      'preprocessed_inputs': preprocessed_inputs,
      'feature_maps': feature_maps,
      'anchors': self._anchors.get()
  }
  for name, tensor_list in predictor_results_dict.items():
    merged = tf.concat(tensor_list, axis=1)
    has_singleton_axis = (
        name == 'box_encodings' and
        merged.shape.ndims == 4 and
        merged.shape[2] == 1)
    if has_singleton_axis:
      merged = tf.squeeze(merged, axis=2)
    predictions_dict[name] = merged

  self._batched_prediction_tensor_names = [
      key for key in predictions_dict if key != 'anchors']
  return predictions_dict
def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None,
           unmatched_class_label=None, groundtruth_weights=None):
  """Assign classification and regression targets to each anchor.

  For a given set of anchors and groundtruth detections, match anchors to
  groundtruth_boxes and assign classification and regression targets to each
  anchor as well as weights based on the resulting match (specifying, e.g.,
  which anchors should not contribute to training loss).

  `unmatched_class_label` is forwarded to the classification-target builder
  as the label for anchors that are not matched to anything.

  Side effect: a `scores` field equal to `1 - groundtruth_labels[:, 0]` is
  added to `groundtruth_boxes`.
  NOTE(review): presumably column 0 of the labels marks background -- confirm.
  When `groundtruth_labels` is None the default labels are all ones, so these
  scores are all zero.

  Args:
    anchors: a BoxList representing N anchors
    groundtruth_boxes: a BoxList representing M groundtruth boxes
    groundtruth_labels: a tensor of shape [M, d_1, ... d_k]
      with labels for each of the ground_truth boxes. The subshape
      [d_1, ... d_k] can be empty (corresponding to scalar inputs). When set
      to None, a tensor of ones with shape [M, 1] is used, i.e. a binary
      problem where all ground_truth boxes get a positive label (of 1).
    unmatched_class_label: a float32 tensor with shape [d_1, d_2, ..., d_k]
      which is consistent with the classification target for each
      anchor (and can be empty for scalar targets). This shape must thus be
      compatible with the groundtruth labels that are passed to "assign"
      (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]). If set to None,
      the constant [0] is used.
    groundtruth_weights: a float tensor of shape [M] indicating the weight to
      assign to all anchors match to a particular groundtruth box. The
      weights must be in [0., 1.]. If None, all weights are set to 1. Rows
      whose weight is not greater than 0 are passed to the matcher as
      invalid. Additionally, `cls_weights` and `reg_weights` are calculated
      using groundtruth weights as an added safety.

  Returns:
    cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k],
      where the subshape [d_1, ..., d_k] is compatible with
      groundtruth_labels which has shape [num_gt_boxes, d_1, d_2, ... d_k].
    cls_weights: a float32 tensor with shape [num_anchors]
    reg_targets: a float32 tensor with shape
      [num_anchors, box_code_dimension]
    reg_weights: a float32 tensor with shape [num_anchors]
    match: a matcher.Match object encoding the match between anchors and
      groundtruth boxes, with rows corresponding to groundtruth boxes
      and columns corresponding to anchors.

  Raises:
    ValueError: if anchors or groundtruth_boxes are not of type
      box_list.BoxList
  """
  if not isinstance(anchors, box_list.BoxList):
    raise ValueError('anchors must be an BoxList')
  if not isinstance(groundtruth_boxes, box_list.BoxList):
    raise ValueError('groundtruth_boxes must be an BoxList')

  if unmatched_class_label is None:
    unmatched_class_label = tf.constant([0], tf.float32)

  if groundtruth_labels is None:
    ones_shape = tf.expand_dims(groundtruth_boxes.num_boxes(), 0)
    groundtruth_labels = tf.expand_dims(tf.ones(ones_shape), -1)

  shape_of = shape_utils.combined_static_and_dynamic_shape

  # Per-box label shape must equal the shape of the unmatched label.
  label_subshape = shape_of(groundtruth_labels)[1:]
  unmatched_shape = shape_of(unmatched_class_label)
  subshape_check = shape_utils.assert_shape_equal(label_subshape,
                                                  unmatched_shape)

  # There must be exactly one label row per groundtruth box.
  label_count = shape_of(groundtruth_labels)[:1]
  box_count = shape_of(groundtruth_boxes.get())[:1]
  count_check = shape_utils.assert_shape_equal(label_count, box_count)

  if groundtruth_weights is None:
    # Fall back to the dynamic box count when the static one is None or 0.
    num_gt_boxes = (groundtruth_boxes.num_boxes_static() or
                    groundtruth_boxes.num_boxes())
    groundtruth_weights = tf.ones([num_gt_boxes], dtype=tf.float32)

  # set scores on the gt boxes
  scores = 1 - groundtruth_labels[:, 0]
  groundtruth_boxes.add_field(fields.BoxListFields.scores, scores)

  with tf.control_dependencies([subshape_check, count_check]):
    similarity = self._similarity_calc.compare(groundtruth_boxes, anchors)
    valid_rows = tf.greater(groundtruth_weights, 0)
    match = self._matcher.match(similarity, valid_rows=valid_rows)
    reg_targets = self._create_regression_targets(anchors,
                                                  groundtruth_boxes,
                                                  match)
    cls_targets = self._create_classification_targets(groundtruth_labels,
                                                      unmatched_class_label,
                                                      match)
    if self._weight_regression_loss_by_score:
      regression_weight_source = groundtruth_weights * scores
    else:
      regression_weight_source = groundtruth_weights
    reg_weights = self._create_regression_weights(match,
                                                  regression_weight_source)
    cls_weights = self._create_classification_weights(match,
                                                      groundtruth_weights)

  num_anchors = anchors.num_boxes_static()
  if num_anchors is not None:
    reg_targets, cls_targets, reg_weights, cls_weights = [
        self._reset_target_shape(target, num_anchors)
        for target in (reg_targets, cls_targets, reg_weights, cls_weights)
    ]

  return cls_targets, cls_weights, reg_targets, reg_weights, match
def _predict(self, image_features, num_predictions_per_location):
  """Computes encoded object locations and corresponding confidences.

  Optionally stacks 1x1 convolutions on the input features, then applies one
  convolution for box encodings and one for class scores. Dropout (when
  enabled) is applied only to the input of the class convolution, and a
  sigmoid is applied to the class scores when
  `self._apply_sigmoid_to_scores` is set.

  Args:
    image_features: A float tensor of shape [batch_size, height, width,
      channels] containing features for a batch of images.
    num_predictions_per_location: an integer representing the number of box
      predictions to be made per spatial location in the feature map.

  Returns:
    A dictionary containing the following tensors.
      box_encodings: A float tensor of shape
        [batch_size, num_anchors, 1, code_size] representing the
        location of the objects, where
        num_anchors = feat_height * feat_width * num_predictions_per_location
      class_predictions_with_background: A float tensor of shape
        [batch_size, num_anchors, num_classes + 1] representing the class
        predictions for the proposals.
  """
  input_depth = static_shape.get_depth(image_features.get_shape())
  # Clamp the intermediate depth into [self._min_depth, self._max_depth].
  depth = max(min(input_depth, self._max_depth), self._min_depth)

  # Add a slot for the background class.
  num_class_slots = self.num_classes + 1

  net = image_features
  with slim.arg_scope(self._conv_hyperparams):
    with slim.arg_scope([slim.dropout], is_training=self._is_training):
      # Add additional conv layers before the predictor; none are added when
      # the clamped depth is not positive.
      extra_layers = self._num_layers_before_predictor if depth > 0 else 0
      for layer_index in range(extra_layers):
        net = slim.conv2d(
            net, depth, [1, 1],
            scope='Conv2d_%d_1x1_%d' % (layer_index, depth))

      # The two prediction convolutions are plain linear layers.
      with slim.arg_scope([slim.conv2d], activation_fn=None,
                          normalizer_fn=None, normalizer_params=None):
        box_encodings = slim.conv2d(
            net, num_predictions_per_location * self._box_code_size,
            [self._kernel_size, self._kernel_size],
            scope='BoxEncodingPredictor')

        # Dropout only affects the class branch: the box branch above has
        # already consumed the undropped features.
        class_net = net
        if self._use_dropout:
          class_net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
        class_predictions_with_background = slim.conv2d(
            class_net, num_predictions_per_location * num_class_slots,
            [self._kernel_size, self._kernel_size], scope='ClassPredictor')
        if self._apply_sigmoid_to_scores:
          class_predictions_with_background = tf.sigmoid(
              class_predictions_with_background)

  feature_shape = shape_utils.combined_static_and_dynamic_shape(
      image_features)

  def flattened_shape(*trailing_dims):
    # [batch, height * width * predictions_per_location, *trailing_dims]
    anchors_per_image = (feature_shape[1] * feature_shape[2] *
                         num_predictions_per_location)
    return tf.stack([feature_shape[0], anchors_per_image] +
                    list(trailing_dims))

  box_encodings = tf.reshape(
      box_encodings, flattened_shape(1, self._box_code_size))
  class_predictions_with_background = tf.reshape(
      class_predictions_with_background, flattened_shape(num_class_slots))

  return {
      BOX_ENCODINGS: box_encodings,
      CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background
  }
def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None,
           groundtruth_weights=None, **params):
  """Assign classification and regression targets to each anchor.

  For a given set of anchors and groundtruth detections, match anchors to
  groundtruth_boxes and assign classification and regression targets to each
  anchor as well as weights based on the resulting match (specifying, e.g.,
  which anchors should not contribute to training loss).

  Anchors that are not matched to anything are given a classification target
  of self._unmatched_cls_target; the per-box label shape is checked against
  the shape of that attribute.

  Args:
    anchors: a BoxList representing N anchors
    groundtruth_boxes: a BoxList representing M groundtruth boxes
    groundtruth_labels: a tensor of shape [M, d_1, ... d_k]
      with labels for each of the ground_truth boxes. The subshape
      [d_1, ... d_k] can be empty (corresponding to scalar inputs). When set
      to None, a tensor of ones with shape [M, 1] is used, i.e. a binary
      problem where all ground_truth boxes get a positive label (of 1).
    groundtruth_weights: a float tensor of shape [M] indicating the weight to
      assign to all anchors match to a particular groundtruth box. The
      weights must be in [0., 1.]. If None, all weights are set to 1.
    **params: Additional keyword arguments for specific implementations of
      the Matcher; forwarded unchanged to `self._matcher.match`.

  Returns:
    cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k],
      where the subshape [d_1, ..., d_k] is compatible with
      groundtruth_labels which has shape [num_gt_boxes, d_1, d_2, ... d_k].
    cls_weights: a float32 tensor with shape [num_anchors]
    reg_targets: a float32 tensor with shape
      [num_anchors, box_code_dimension]
    reg_weights: a float32 tensor with shape [num_anchors]
    match: a matcher.Match object encoding the match between anchors and
      groundtruth boxes, with rows corresponding to groundtruth boxes
      and columns corresponding to anchors.

  Raises:
    ValueError: if anchors or groundtruth_boxes are not of type
      box_list.BoxList
  """
  if not isinstance(anchors, box_list.BoxList):
    raise ValueError('anchors must be an BoxList')
  if not isinstance(groundtruth_boxes, box_list.BoxList):
    raise ValueError('groundtruth_boxes must be an BoxList')

  if groundtruth_labels is None:
    ones_shape = tf.expand_dims(groundtruth_boxes.num_boxes(), 0)
    groundtruth_labels = tf.expand_dims(tf.ones(ones_shape), -1)

  shape_of = shape_utils.combined_static_and_dynamic_shape

  # Per-box label shape must equal the shape of the unmatched class target.
  label_subshape = shape_of(groundtruth_labels)[1:]
  unmatched_shape = shape_of(self._unmatched_cls_target)
  subshape_check = shape_utils.assert_shape_equal(label_subshape,
                                                  unmatched_shape)

  # There must be exactly one label row per groundtruth box.
  label_count = shape_of(groundtruth_labels)[:1]
  box_count = shape_of(groundtruth_boxes.get())[:1]
  count_check = shape_utils.assert_shape_equal(label_count, box_count)

  if groundtruth_weights is None:
    # Fall back to the dynamic box count when the static one is None or 0.
    num_gt_boxes = (groundtruth_boxes.num_boxes_static() or
                    groundtruth_boxes.num_boxes())
    groundtruth_weights = tf.ones([num_gt_boxes], dtype=tf.float32)

  with tf.control_dependencies([subshape_check, count_check]):
    similarity = self._similarity_calc.compare(groundtruth_boxes, anchors)
    match = self._matcher.match(similarity, **params)
    reg_targets = self._create_regression_targets(anchors,
                                                  groundtruth_boxes,
                                                  match)
    cls_targets = self._create_classification_targets(groundtruth_labels,
                                                      match)
    reg_weights = self._create_regression_weights(match, groundtruth_weights)
    cls_weights = self._create_classification_weights(match,
                                                      groundtruth_weights)

  num_anchors = anchors.num_boxes_static()
  if num_anchors is not None:
    reg_targets, cls_targets, reg_weights, cls_weights = [
        self._reset_target_shape(target, num_anchors)
        for target in (reg_targets, cls_targets, reg_weights, cls_weights)
    ]

  return cls_targets, cls_weights, reg_targets, reg_weights, match