def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training):
  """Generate the detection priors from the box detections and FPN features.

  This corresponds to the Fig. 4 of the ShapeMask paper at
  https://arxiv.org/pdf/1904.03239.pdf

  Args:
    fpn_features: a dictionary of FPN features.
    boxes: a float tensor of shape [batch_size, num_instances, 4]
      representing the tight gt boxes from dataloader/detection.
    outer_boxes: a float tensor of shape [batch_size, num_instances, 4]
      representing the loose gt boxes from dataloader/detection.
    classes: a int Tensor of shape [batch_size, num_instances]
      of instance classes.
    is_training: training mode or not.

  Returns:
    instance_features: a float Tensor of shape [batch_size * num_instances,
      mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
      instance feature crop.
    detection_priors: A float Tensor of shape [batch_size * num_instances,
      mask_size, mask_size, 1].
  """
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope('prior_mask'):
    # NOTE(review): static shapes are required here; dynamic batch/instance
    # dims would make get_shape() return None — confirm inputs are statically
    # shaped by the dataloader.
    batch_size, num_instances, _ = boxes.get_shape().as_list()
    outer_boxes = tf.cast(outer_boxes, tf.float32)
    boxes = tf.cast(boxes, tf.float32)
    # Crop instance features from the FPN pyramid using the loose boxes, then
    # project them with the shape-prior FC layer.
    instance_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_features, outer_boxes, output_size=self._mask_crop_size)
    instance_features = self._shape_prior_fc(instance_features)

    shape_priors = self._get_priors()

    # Get uniform priors for each outer box.
    uniform_priors = tf.ones([
        batch_size, num_instances, self._mask_crop_size, self._mask_crop_size
    ])
    uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
        uniform_priors, boxes, outer_boxes, self._mask_crop_size)

    # Classify shape priors using uniform priors + instance features.
    prior_distribution = self._classify_shape_priors(
        tf.cast(instance_features, tf.float32), uniform_priors, classes)

    # Select the per-class prior bank, weight each prior by the predicted
    # distribution, and sum over the prior axis to form one prior per instance.
    instance_priors = tf.gather(shape_priors, classes)
    instance_priors *= tf.expand_dims(tf.expand_dims(tf.cast(
        prior_distribution, tf.float32), axis=-1), axis=-1)
    instance_priors = tf.reduce_sum(instance_priors, axis=2)
    # Re-crop the fused prior from the tight box into the outer box frame.
    detection_priors = spatial_transform_ops.crop_mask_in_target_box(
        instance_priors, boxes, outer_boxes, self._mask_crop_size)

    return instance_features, detection_priors
def build_model(self, params, mode=None):
  """Lazily builds, caches and returns the RetinaNet `tf.keras.Model`."""
  # Guard clause: the model is built exactly once and cached afterwards.
  if self._keras_model is not None:
    return self._keras_model

  with keras_utils.maybe_enter_backend_graph():
    outputs = self.model_outputs(self._input_layer, mode)
    built = tf.keras.models.Model(
        inputs=self._input_layer, outputs=outputs, name='retinanet')
    assert built is not None, 'Fail to build tf.keras.Model.'
    # Attach the optimizer so training drivers can reach it off the model.
    built.optimizer = self.build_optimizer()
    self._keras_model = built

  return self._keras_model
def __call__(self, features, is_training=None):
  """Runs the shared RPN head on every pyramid level.

  Args:
    features: a `dict` of per-level feature tensors keyed by `int` level.
    is_training: `bool`, whether the model is in training mode.

  Returns:
    A `(scores_outputs, box_outputs)` pair of `dict`s keyed by level.
  """
  scores_outputs = {}
  box_outputs = {}
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope('rpn_head'):
    for level in range(self._min_level, self._max_level + 1):
      # The same head (shared weights) is applied at each level.
      scores_outputs[level], box_outputs[level] = self._shared_rpn_heads(
          features[level], self._anchors_per_location, level, is_training)
  return scores_outputs, box_outputs
def __call__(self, multilevel_features, is_training=None):
  """Returns the FPN features for a given multilevel features.

  Args:
    multilevel_features: a `dict` containing `int` keys for continuous feature
      levels, e.g., [2, 3, 4, 5]. The values are corresponding features with
      shape [batch_size, height_l, width_l, num_filters].
    is_training: `bool` if True, the model is in training mode.

  Returns:
    a `dict` containing `int` keys for continuous feature levels
    [min_level, min_level + 1, ..., max_level]. The values are corresponding
    FPN features with shape [batch_size, height_l, width_l, fpn_feat_dims].

  Raises:
    ValueError: if the backbone does not provide a level at or below the
      requested FPN minimum level.
  """
  input_levels = list(multilevel_features.keys())
  if min(input_levels) > self._min_level:
    raise ValueError('The minimum backbone level %d should be ' %
                     (min(input_levels)) +
                     'less or equal to FPN minimum level %d.:' %
                     (self._min_level))
  # The top-down path can only start from the highest level the backbone
  # actually provides (capped at the FPN max level).
  backbone_max_level = min(max(input_levels), self._max_level)
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope('fpn'):
    # Adds lateral connections (1x1 projections of the backbone features).
    feats_lateral = {}
    for level in range(self._min_level, backbone_max_level + 1):
      feats_lateral[level] = self._lateral_conv2d_op[level](
          multilevel_features[level])

    # Adds top-down path: upsample the coarser level 2x and merge with the
    # lateral projection one level below.
    feats = {backbone_max_level: feats_lateral[backbone_max_level]}
    for level in range(backbone_max_level - 1, self._min_level - 1, -1):
      feats[level] = spatial_transform_ops.nearest_upsampling(
          feats[level + 1], 2) + feats_lateral[level]

    # Adds post-hoc 3x3 convolution kernel.
    for level in range(self._min_level, backbone_max_level + 1):
      feats[level] = self._post_hoc_conv2d_op[level](feats[level])

    # Adds coarser FPN levels introduced for RetinaNet, each produced by a
    # strided conv on the previous level (activated from the second one on).
    for level in range(backbone_max_level + 1, self._max_level + 1):
      feats_in = feats[level - 1]
      if level > backbone_max_level + 1:
        feats_in = self._activation_op(feats_in)
      feats[level] = self._coarse_conv2d_op[level](feats_in)
    if self._use_batch_norm:
      # Adds batch_norm layer.
      for level in range(self._min_level, self._max_level + 1):
        feats[level] = self._norm_activations[level](
            feats[level], is_training=is_training)
  return feats
def __call__(self, features, mask_logits, classes, is_training): """Generate instance masks from FPN features and detection priors. This corresponds to the Fig. 5-6 of the ShapeMask paper at https://arxiv.org/pdf/1904.03239.pdf Args: features: a float Tensor of shape [batch_size, num_instances, mask_crop_size, mask_crop_size, num_downsample_channels]. This is the instance feature crop. mask_logits: a float Tensor of shape [batch_size, num_instances, mask_crop_size, mask_crop_size] indicating predicted mask logits. classes: a int Tensor of shape [batch_size, num_instances] of instance classes. is_training: a bool indicating whether in training mode. Returns: mask_outputs: instance mask prediction as a float Tensor of shape [batch_size, num_instances, mask_size, mask_size]. """ # Extract the foreground mean features # with tf.variable_scope('fine_mask', reuse=tf.AUTO_REUSE): with keras_utils.maybe_enter_backend_graph(), tf.name_scope( 'fine_mask'): mask_probs = tf.nn.sigmoid(mask_logits) # Compute instance embedding for hard average. binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype) instance_embedding = tf.reduce_sum( features * tf.expand_dims(binary_mask, axis=-1), axis=(2, 3)) instance_embedding /= tf.expand_dims( tf.reduce_sum(binary_mask, axis=(2, 3)) + 1e-20, axis=-1) # Take the difference between crop features and mean instance features. features -= tf.expand_dims(tf.expand_dims(instance_embedding, axis=2), axis=2) features += self._fine_mask_fc(tf.expand_dims(mask_probs, axis=-1)) # Decoder to generate upsampled segmentation mask. mask_logits = self.decoder_net(features, is_training) if self._use_category_for_mask: mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3]) mask_logits = tf.gather(mask_logits, tf.expand_dims(classes, -1), batch_dims=2) mask_logits = tf.squeeze(mask_logits, axis=2) else: mask_logits = mask_logits[..., 0] return mask_logits
def __call__(self, fpn_features, is_training=None):
  """Computes RetinaNet class and box predictions for every FPN level.

  Args:
    fpn_features: a `dict` of per-level FPN feature tensors keyed by level.
    is_training: `bool`, whether the model is in training mode.

  Returns:
    A `(class_outputs, box_outputs)` pair of `dict`s keyed by level.
  """
  class_outputs = {}
  box_outputs = {}
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope(
      'retinanet_head'):
    for level in range(self._min_level, self._max_level + 1):
      level_feats = fpn_features[level]
      # Class and box subnets run independently on the same level features.
      class_outputs[level] = self.class_net(
          level_feats, level, is_training=is_training)
      box_outputs[level] = self.box_net(
          level_feats, level, is_training=is_training)
  return class_outputs, box_outputs
def __call__(self, inputs, is_training=None):
  """Returns the ResNet model for a given size and number of output classes.

  Args:
    inputs: a `Tensor` with shape [batch_size, height, width, 3] representing
      a batch of images.
    is_training: `bool` if True, the model is in training mode.

  Returns:
    a `dict` containing `int` keys for continuous feature levels [2, 3, 4, 5].
    The values are corresponding feature hierarchy in ResNet with shape
    [batch_size, height_l, width_l, num_filters].
  """
  # Scope name encodes the depth, e.g. 'resnet50'.
  scope_name = 'resnet%s' % self._resnet_depth
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope(scope_name):
    return self._resnet_fn(inputs, is_training)
def __call__(self, inputs, is_training=None):
  """Instantiates a SpineNet backbone and applies it to `inputs`.

  Args:
    inputs: a batch of input images.
    is_training: unused here; training behavior is handled by the layers.

  Returns:
    The SpineNet endpoint features for `inputs`.
  """
  with keras_utils.maybe_enter_backend_graph():
    backbone = SpineNet(
        input_specs=self._input_specs,
        min_level=self._min_level,
        max_level=self._max_level,
        block_specs=self._block_specs,
        endpoints_num_filters=self._endpoints_num_filters,
        resample_alpha=self._resample_alpha,
        block_repeats=self._block_repeats,
        filter_size_scale=self._filter_size_scale,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activation=self._activation,
        use_sync_bn=self._use_sync_bn,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon)
    return backbone(inputs)
def __call__(self, features, detection_priors, classes, is_training):
  """Generate instance masks from FPN features and detection priors.

  This corresponds to the Fig. 5-6 of the ShapeMask paper at
  https://arxiv.org/pdf/1904.03239.pdf

  Args:
    features: a float Tensor of shape [batch_size, num_instances,
      mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
      instance feature crop.
    detection_priors: a float Tensor of shape [batch_size, num_instances,
      mask_crop_size, mask_crop_size, 1]. This is the detection prior for
      the instance.
    classes: a int Tensor of shape [batch_size, num_instances]
      of instance classes.
    is_training: a bool indicating whether in training mode.

  Returns:
    mask_outputs: instance mask prediction as a float Tensor of shape
      [batch_size, num_instances, mask_size, mask_size].
  """
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope('coarse_mask'):
    # Lift the single-channel priors into the feature dimension and fuse
    # them additively with the cropped instance features.
    prior_embedding = self._coarse_mask_fc(
        tf.expand_dims(detection_priors, axis=-1))
    features += prior_embedding

    mask_logits = self.decoder_net(features, is_training)

    # Gather the logits with right input class.
    if self._use_category_for_mask:
      # Bring the class axis forward, then pick each instance's class
      # channel (batch_dims=2: per batch, per instance).
      mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
      mask_logits = tf.gather(
          mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
      mask_logits = tf.squeeze(mask_logits, axis=2)
    else:
      mask_logits = mask_logits[..., 0]

    return mask_logits
def __call__(self, roi_features, is_training=None):
  """Box and class branches for the Mask-RCNN model.

  Args:
    roi_features: A ROI feature tensor of shape
      [batch_size, num_rois, height_l, width_l, num_filters].
    is_training: `boolean`, if True if model is in training mode.

  Returns:
    class_outputs: a tensor with a shape of
      [batch_size, num_rois, num_classes], representing the class predictions.
    box_outputs: a tensor with a shape of
      [batch_size, num_rois, num_classes * 4], representing the box
      predictions.
    score_outputs: a tensor produced by the score branch
      (`self._score_predict`) with shape [batch_size, num_rois, ...];
      NOTE(review): exact last dimension depends on the head's construction —
      confirm against the layer definition.
  """
  with keras_utils.maybe_enter_backend_graph(), tf.name_scope(
      'fast_rcnn_head'):
    # reshape inputs before FC: fold batch and ROI dims so convs see a
    # 4-D tensor.
    _, num_rois, height, width, filters = roi_features.get_shape().as_list()

    net = tf.reshape(roi_features, [-1, height, width, filters])
    for i in range(self._num_convs):
      net = self._conv_ops[i](net)
      if self._use_batch_norm:
        net = self._conv_bn_ops[i](net, is_training=is_training)

    # After any conv, the channel count becomes _num_filters; otherwise the
    # input filter count is kept.
    filters = self._num_filters if self._num_convs > 0 else filters
    # Flatten spatial dims per ROI before the fully-connected tower.
    net = tf.reshape(net, [-1, num_rois, height * width * filters])
    for i in range(self._num_fcs):
      net = self._fc_ops[i](net)
      if self._use_batch_norm:
        net = self._fc_bn_ops[i](net, is_training=is_training)

    class_outputs = self._class_predict(net)
    box_outputs = self._box_predict(net)
    score_outputs = self._score_predict(net)
    return class_outputs, box_outputs, score_outputs
def __call__(self, roi_features, class_indices, is_training=None):
  """Mask branch for the Mask-RCNN model.

  Args:
    roi_features: A ROI feature tensor of shape
      [batch_size, num_rois, height_l, width_l, num_filters].
    class_indices: a Tensor of shape [batch_size, num_rois], indicating
      which class the ROI is.
    is_training: `boolean`, if True if model is in training mode.

  Returns:
    mask_outputs: a tensor with a shape of
      [batch_size, num_rois, mask_height, mask_width], representing the mask
      prediction for each ROI's own class (gathered via `class_indices`).

  Raises:
    ValueError: If boxes is not a rank-3 tensor or the last dimension of
      boxes is not 4.
  """
  with keras_utils.maybe_enter_backend_graph():
    with tf.name_scope('mask_head'):
      # Fold batch and ROI dims so the conv stack sees a 4-D tensor.
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()
      net = tf.reshape(roi_features, [-1, height, width, filters])

      for i in range(self._num_convs):
        net = self._conv2d_ops[i](net)
        if self._use_batch_norm:
          net = self._norm_activation()(net, is_training=is_training)

      # 2x upsampling to the mask target resolution.
      net = self._mask_conv_transpose(net)
      if self._use_batch_norm:
        net = self._norm_activation()(net, is_training=is_training)

      # Per-class 1x1 logits, then restore the [batch, num_rois, ...] layout.
      mask_outputs = self._conv2d_op(
          self._num_classes,
          kernel_size=(1, 1),
          strides=(1, 1),
          padding='valid',
          name='mask_fcn_logits')(
              net)
      mask_outputs = tf.reshape(mask_outputs, [
          -1, num_rois, self._mask_target_size, self._mask_target_size,
          self._num_classes
      ])

      with tf.name_scope('masks_post_processing'):
        # TODO(pengchong): Figure out the way not to use the static inferred
        # batch size.
        batch_size, num_masks = class_indices.get_shape().as_list()
        # Move the class axis ahead of the spatial axes so gather_nd can
        # index [batch, mask, class] triples.
        mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])
        # Constructs indices for gather: one (batch, mask, class) triple per
        # ROI, selecting each ROI's own class channel.
        batch_indices = tf.tile(
            tf.expand_dims(tf.range(batch_size), axis=1), [1, num_masks])
        mask_indices = tf.tile(
            tf.expand_dims(tf.range(num_masks), axis=0), [batch_size, 1])
        gather_indices = tf.stack(
            [batch_indices, mask_indices, class_indices], axis=2)
        mask_outputs = tf.gather_nd(mask_outputs, gather_indices)

      return mask_outputs
def build_model(self, params, mode=None): if self._keras_model is None: with keras_utils.maybe_enter_backend_graph():