def get_pixel_fb_classification(self, x, anchor_stride, anchor_per_location):
    '''
    Get the pixel-wise classification of foreground vs. background
    :return:
    '''
    sh_in = x.get_shape().as_list()[-1]

    # Here 2*anchor_per_location = 6: the 2 is the binary foreground/background
    # classification and anchor_per_location = 3
    x = ops.conv_layer(x, k_shape=[1, 1, sh_in, 2 * anchor_per_location],
                       stride=anchor_stride, padding='VALID',
                       scope_name='rpn_class_raw', trainable=True)
    logging.info('RPN - Conv Class: %s', str(x.get_shape().as_list()))

    # Convert [batch_size, h, w, 2*anchor_per_location] to
    # [batch_size, h*w*anchor_per_location, 2]: for each image, at each pixel,
    # classify each of the 3 anchors as foreground or background
    self.rpn_class_logits = tf.reshape(x, [tf.shape(x)[0], -1, 2])
    # self.rpn_class_logits = tf.reshape(x, [x.get_shape().as_list()[0], -1, 2])
    logging.info('rpn_class_logits: %s', self.rpn_class_logits.get_shape().as_list())

    # Apply a softmax classification to get output probabilities
    self.rpn_class_probs = tf.nn.softmax(self.rpn_class_logits, name='rpn_class_xxx')
    logging.info('rpn_class_probs: %s', self.rpn_class_probs.get_shape().as_list())

    print('(RPN) Class Logits (shape) ', self.rpn_class_logits.shape)
    print('(RPN) Class Probs (shape) ', self.rpn_class_probs.shape)
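# Illustrative shape check (not part of the model graph): shows how the
# [batch, h, w, 2*anchor_per_location] raw score map above flattens into
# [batch, h*w*anchor_per_location, 2] FG/BG logits. The sizes here are made-up
# examples and only rely on numpy reshape semantics mirroring tf.reshape.
def _demo_rpn_class_reshape():
    import numpy as np
    batch, h, w, anchors_per_location = 2, 4, 4, 3
    raw = np.random.rand(batch, h, w, 2 * anchors_per_location)   # hypothetical conv output
    logits = raw.reshape(batch, -1, 2)                            # -> (2, 48, 2)
    assert logits.shape == (batch, h * w * anchors_per_location, 2)
    return logits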
def build(self):
    shared = ops.conv_layer(
        self.xrpn, k_shape=[3, 3, self.xrpn.get_shape().as_list()[-1], 512],
        stride=self.rpn_anchor_stride, padding='SAME',
        scope_name='rpn_conv_shared', trainable=True)
    shared = ops.activation(shared, 'relu', scope_name='rpn_relu_shared')
    logging.info('RPN - Shared_conv: %s', str(shared.get_shape().as_list()))

    ## Classification output: binary (foreground/background), pixel-wise
    self.get_pixel_fb_classification(shared, self.rpn_anchor_stride,
                                     len(self.rpn_anchor_ratios))

    ## Bounding box output: the coordinates, height and width of the bounding box
    self.get_bounding_box(shared, self.rpn_anchor_stride,
                          len(self.rpn_anchor_ratios))
def get_bounding_box(self, x, anchor_stride, anchor_per_location):
    '''
    ALL ABOUT THIS MODULE

    Input:
        anchor_stride: controls the number of anchor positions, for instance:
            if stride = 1, feature_map = 32x32, anchors per location = 9,
                then number of anchors = 32 * 32 * 9
            if stride = 2, feature_map = 32x32, anchors per location = 9,
                then number of anchors = (32/2) * (32/2) * 9
        anchor_per_location: how many anchors to build per location

    Outputs: this module generates 4 values per anchor
        self.rpn_bbox = [batch_size, num_anchors, (dy, dx, log(dh), log(dw))]
        1. dy = refinement of the box center y
        2. dx = refinement of the box center x
        3. log(dh) = log-scale refinement of the box height
        4. log(dw) = log-scale refinement of the box width

    This is a linear (regression) output; no activation is applied.
    :param x:
    :return:
    '''
    sh_in = x.get_shape().as_list()[-1]

    # Here 4*anchor_per_location = 12 (with anchor_per_location = 3), where 4 is the
    # number of bounding box outputs per anchor
    x = ops.conv_layer(x, k_shape=[1, 1, sh_in, 4 * anchor_per_location],
                       stride=anchor_stride, padding='VALID',
                       scope_name='rpn_bbox_pred', trainable=True)
    logging.info('RPN - Conv Bbox: %s', str(x.get_shape().as_list()))

    # The shape of rpn_bbox is [batch_size, num_anchors, 4]: for each image, at each
    # feature map position, every anchor gets 4 outputs -> dy, dx, log(dh), log(dw).
    # Because predictions are made at every position, many overlapping boxes are
    # produced; non-max suppression is used later to prune them.
    self.rpn_bbox = tf.reshape(x, [tf.shape(x)[0], -1, 4])
    # self.rpn_bbox = tf.reshape(x, [x.get_shape().as_list()[0], -1, 4])
    logging.info('rpn_bbox: %s', self.rpn_bbox.get_shape().as_list())

    print('(RPN) Bbox (shape) ', self.rpn_bbox.shape)
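# Illustrative sketch (not part of this class): how (dy, dx, log(dh), log(dw)) deltas
# like the ones predicted above are conventionally applied to an anchor box in
# Faster/Mask R-CNN style pipelines. The anchor and delta values are made up; the
# actual decoding in this repo may live elsewhere (e.g. the proposal layer).
def _demo_apply_box_deltas():
    import numpy as np
    anchor = np.array([10.0, 10.0, 50.0, 50.0])                # y1, x1, y2, x2
    deltas = np.array([0.1, -0.05, np.log(1.2), np.log(0.8)])  # dy, dx, log(dh), log(dw)
    h, w = anchor[2] - anchor[0], anchor[3] - anchor[1]
    cy, cx = anchor[0] + 0.5 * h, anchor[1] + 0.5 * w
    cy, cx = cy + deltas[0] * h, cx + deltas[1] * w            # shift the center
    h, w = h * np.exp(deltas[2]), w * np.exp(deltas[3])        # rescale height/width
    return np.array([cy - 0.5 * h, cx - 0.5 * w, cy + 0.5 * h, cx + 0.5 * w])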
def classifier_with_fpn_tf(self):
    rois_shape = self.pooled_rois.get_shape().as_list()

    # Note: we don't perform batch normalization because, as per the Matterport GitHub
    # implementation of Mask RCNN, batch norm does not perform well with very small batches.

    # FC Layer 1
    x = tf.concat([
        tf.stack([
            ops.activation(
                ops.conv_layer(
                    self.pooled_rois[i],
                    k_shape=self.pool_shape + [rois_shape[-1], 1024],
                    stride=1, padding='VALID', scope_name='mrcnn_class_conv1'),
                'relu', 'FC1_relu')
            for i in range(0, rois_shape[0])
        ])
    ], axis=0)
    self.FC1 = x if self.DEBUG else []

    # FC Layer 2
    x = tf.concat([
        tf.stack([
            ops.activation(
                ops.conv_layer(x[i], k_shape=[1, 1, 1024, 1024], stride=1,
                               padding='VALID', scope_name='mrcnn_class_conv2'),
                'relu', 'FC2_relu')
            for i in range(0, rois_shape[0])
        ])
    ], axis=0)
    self.FC2 = x if self.DEBUG else []

    # Squeeze out the two singleton dimensions (axes 2 and 3):
    # [num_batch, num_proposals, 1, 1, 1024] -> [num_batch, num_proposals, 1024]
    shared = tf.squeeze(x, [2, 3])
    self.shared = shared if self.DEBUG else []

    with tf.variable_scope('mrcnn_class_scores'):
        mrcnn_class_logits = tf.concat([
            tf.stack([
                ops.fc_layers(shared[i], k_shape=[1024, self.num_classes],
                              scope_name='mrcnn_class_logits')
                for i in range(0, rois_shape[0])
            ])
        ], axis=0)

        self.mrcnn_class_probs = tf.concat([
            tf.stack([
                ops.activation(mrcnn_class_logits[i], 'softmax', scope_name='mrcnn_class')
                for i in range(0, rois_shape[0])
            ])
        ], axis=0)

    with tf.variable_scope('mrcnn_class_bbox'):
        x = tf.concat([
            tf.stack([
                ops.fc_layers(shared[i], k_shape=[1024, self.num_classes * 4],
                              scope_name='mrcnn_bbox')
                for i in range(0, rois_shape[0])
            ])
        ], axis=0)
        s = tf.shape(x)
        self.mrcnn_bbox = tf.reshape(x, [s[0], s[1], self.num_classes, 4], name="mrcnn_bbox")
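# Shape walk-through of the head above (illustrative, numpy only): a pool-sized
# 'VALID' convolution over a pooled ROI acts as a fully connected layer, so each
# ROI collapses to a 1x1x1024 vector, which the squeeze turns into [num_rois, 1024].
# All sizes below are hypothetical.
def _demo_fpn_head_shapes():
    import numpy as np
    num_rois, pool, channels = 5, 7, 256
    pooled = np.zeros((num_rois, pool, pool, channels))   # one image's pooled ROIs
    # A [pool, pool] 'VALID' conv with 1024 filters reduces each ROI to 1x1x1024:
    fc1 = np.zeros((pooled.shape[0], 1, 1, 1024))
    shared = fc1.squeeze(axis=(1, 2))                     # -> (num_rois, 1024)
    assert shared.shape == (num_rois, 1024)
    return shared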
def identity_block(self, x_in, filters, stage, block):
    '''
    No convolution is applied to the shortcut (the layer used for the skip connection)
    '''
    f1, f2, f3 = filters
    conv_name = 'res' + str(stage) + block + '_branch'
    bn_name = 'bn' + str(stage) + block + '_branch'
    relu_name = 'relu' + str(stage) + block + '_branch'
    x_shape = x_in.get_shape().as_list()

    ## BRANCH 2a
    x = ops.conv_layer(x_in, [1, 1, x_shape[-1], f1], stride=1, padding='SAME',
                       scope_name=conv_name + '2a')
    x = tf.layers.batch_normalization(x, axis=-1, name=bn_name + '2a', trainable=False)
    # x = ops.batch_norm(x, axis=[0, 1, 2], scope_name=bn_name + '2a')
    # x = BatchNorm(name=bn_name + '2a')(x, training=False)
    x = ops.activation(x, 'relu', relu_name + '2a')
    logging.info('%s: %s', str(conv_name + '2a'), str(x.get_shape().as_list()))

    ## BRANCH 2b
    x = ops.conv_layer(x, [3, 3, f1, f2], stride=1, padding='SAME',
                       scope_name=conv_name + '2b')
    x = tf.layers.batch_normalization(x, axis=-1, name=bn_name + '2b', trainable=False)
    # x = ops.batch_norm(x, axis=[0, 1, 2], scope_name=bn_name + '2b')
    # x = BatchNorm(name=bn_name + '2b')(x, training=False)
    x = ops.activation(x, 'relu', relu_name + '2b')
    logging.info('%s: %s', str(conv_name + '2b'), str(x.get_shape().as_list()))

    ## BRANCH 2c
    x = ops.conv_layer(x, [1, 1, f2, f3], stride=1, padding='SAME',
                       scope_name=conv_name + '2c')
    x = tf.layers.batch_normalization(x, axis=-1, name=bn_name + '2c', trainable=False)
    # x = ops.batch_norm(x, axis=[0, 1, 2], scope_name=bn_name + '2c')
    # x = BatchNorm(name=bn_name + '2c')(x, training=False)
    logging.info('%s: %s', str(conv_name + '2c'), str(x.get_shape().as_list()))

    ## Add the shortcut (element-wise residual connection)
    x = x + x_in
    x = ops.activation(x, 'relu', relu_name + '_out')
    logging.info('%s: %s', str(relu_name + '_out'), str(x.get_shape().as_list()))

    return x
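# Note on the skip connection above (illustrative only): x + x_in requires the last
# branch width f3 to equal the input channel depth, which is why identity blocks are
# used inside a stage only after conv_block has already set that depth. A minimal
# numpy check with made-up shapes:
def _demo_identity_block_shapes():
    import numpy as np
    x_in = np.zeros((1, 32, 32, 256))
    f1, f2, f3 = 64, 64, 256                 # f3 must match x_in's channel depth
    branch = np.zeros((1, 32, 32, f3))       # stands in for the 2a-2b-2c branch output
    out = branch + x_in                      # element-wise residual addition
    assert out.shape == x_in.shape
    return out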
def fpn_top_down_graph(self):
    '''
    Feature Pyramid Network (top-down pathway):

    Detecting objects at different scales is difficult, and doing it naively is time
    consuming and memory intensive. Here C2, C3, C4, C5 can be thought of as the
    feature maps of each backbone stage (each C is further down-sampled); they are
    used to build the Feature Pyramid Network.
    P2, P3, P4, P5, P6 are the feature map layers used for prediction.
    '''
    logging.info('Initiating FPN TOP-DOWN .................................')

    # Feature Map 1
    M5 = ops.conv_layer(self.C5, [1, 1, self.C5.get_shape().as_list()[-1], 256],
                        stride=1, padding='SAME', scope_name='fpn_c5p5',
                        trainable=True)  # 1x1 conv to reduce the channel depth
    logging.info('FPN - M5: %s', str(M5.get_shape().as_list()))

    # Feature Map 2
    m4_c = ops.conv_layer(self.C4, [1, 1, self.C4.get_shape().as_list()[-1], 256],
                          stride=1, padding='SAME', scope_name='fpn_c4p4', trainable=True)
    m4_up = KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(M5)
    M4 = KL.Add(name="fpn_p4add")([m4_up, m4_c])
    logging.info('FPN - M4: %s', str(M4.get_shape().as_list()))

    # Feature Map 3
    m3_c = ops.conv_layer(self.C3, [1, 1, self.C3.get_shape().as_list()[-1], 256],
                          stride=1, padding='SAME', scope_name='fpn_c3p3', trainable=True)
    m3_up = KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(M4)
    M3 = KL.Add(name="fpn_p3add")([m3_up, m3_c])
    logging.info('FPN - M3: %s', str(M3.get_shape().as_list()))

    # Feature Map 4
    m2_c = ops.conv_layer(self.C2, [1, 1, self.C2.get_shape().as_list()[-1], 256],
                          stride=1, padding='SAME', scope_name='fpn_c2p2', trainable=True)
    m2_up = KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(M3)
    M2 = KL.Add(name="fpn_p2add")([m2_up, m2_c])
    logging.info('FPN - M2: %s', str(M2.get_shape().as_list()))

    #### CREATE THE FEATURE MAPS FOR PREDICTION
    self.P2 = ops.conv_layer(M2, [3, 3, 256, 256], stride=1, padding='SAME',
                             scope_name='fpn_p2', trainable=True)
    self.P3 = ops.conv_layer(M3, [3, 3, 256, 256], stride=1, padding='SAME',
                             scope_name='fpn_p3', trainable=True)
    self.P4 = ops.conv_layer(M4, [3, 3, 256, 256], stride=1, padding='SAME',
                             scope_name='fpn_p4', trainable=True)
    self.P5 = ops.conv_layer(M5, [3, 3, 256, 256], stride=1, padding='SAME',
                             scope_name='fpn_p5', trainable=True)
    # P6 is a stride-2 subsampling of P5
    self.P6 = tf.layers.max_pooling2d(self.P5, pool_size=1, strides=2,
                                      padding='SAME', name='fpn_p6')

    logging.info('FPN - P2 = %s, P3 = %s, P4 = %s, P5 = %s, P6 = %s:',
                 str(self.P2.get_shape().as_list()),
                 str(self.P3.get_shape().as_list()),
                 str(self.P4.get_shape().as_list()),
                 str(self.P5.get_shape().as_list()),
                 str(self.P6.get_shape().as_list()))

    print('(FPN) P2: (shape) ', self.P2.shape)
    print('(FPN) P3: (shape) ', self.P3.shape)
    print('(FPN) P4: (shape) ', self.P4.shape)
    print('(FPN) P5: (shape) ', self.P5.shape)
    print('(FPN) P6: (shape) ', self.P6.shape)
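# Illustrative sketch of the top-down merge used above: each coarser map is upsampled
# 2x (nearest neighbour, matching Keras UpSampling2D defaults) and added element-wise
# to the 1x1-projected lateral map of the same spatial size. Pure numpy, with
# hypothetical sizes.
def _demo_fpn_merge():
    import numpy as np
    m5 = np.random.rand(1, 8, 8, 256)                 # coarser pyramid level
    c4_lateral = np.random.rand(1, 16, 16, 256)       # 1x1-conv projection of C4
    m5_up = m5.repeat(2, axis=1).repeat(2, axis=2)    # nearest-neighbour 2x upsample
    m4 = m5_up + c4_lateral                           # element-wise add -> (1, 16, 16, 256)
    assert m4.shape == c4_lateral.shape
    return m4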
def fpn_bottom_up_graph(self):
    '''
    Here we implement a ResNet-50/101 backbone, making sure that at every stage we
    capture the feature map to be used by the top-down FPN network, which further
    processes these feature maps.

    :param input_image:
    :param stage_5:
    :return:
    '''
    assert self.resnet_model in ["resnet50", "resnet101"]
    h, w = self.conf.IMAGE_SHAPE[:2]
    logging.info('Image height = %s, width = %s ................', str(h), str(w))
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception("Image size must be divisible by 2 at least 6 times "
                        "to avoid fractions when downscaling and upscaling. "
                        "For example, use 256, 320, 384, 448, 512, ... etc.")

    logging.info('Initiating FPN BOTTOM-UP .................................')

    x = tf.pad(self.input_image, paddings=[[0, 0], [3, 3], [3, 3], [0, 0]])
    logging.info('Zero_padded: %s', str(x.get_shape().as_list()))

    # STAGE 1
    logging.info('STAGE 1 ...........................')
    x = ops.conv_layer(x, [7, 7, 3, 64], stride=2, padding='VALID', scope_name='conv1')
    x = tf.layers.batch_normalization(x, axis=-1, name='bn_conv1', trainable=False)
    # x = BatchNorm(name='bn_conv1')(x, training=False)
    # x = ops.batch_norm(x, axis=[0, 1, 2], scope_name='bn_conv1')
    x = ops.activation(x, 'relu', 'relu_conv1')
    logging.info('Conv2D: %s', str(x.get_shape().as_list()))
    x = tf.layers.max_pooling2d(x, pool_size=3, strides=2, padding="SAME")
    logging.info('MaxPool2d: %s', str(x.get_shape().as_list()))
    # self.C1 = x

    # STAGE 2
    logging.info('STAGE 2 ...........................')
    x = self.conv_block(x, filters=[64, 64, 256], strides=1, stage=2, block='a')
    x = self.identity_block(x, filters=[64, 64, 256], stage=2, block='b')
    x = self.identity_block(x, filters=[64, 64, 256], stage=2, block='c')
    self.C2 = x

    # STAGE 3
    logging.info('STAGE 3 ...........................')
    x = self.conv_block(x, filters=[128, 128, 512], strides=2, stage=3, block='a')
    x = self.identity_block(x, filters=[128, 128, 512], stage=3, block='b')
    x = self.identity_block(x, filters=[128, 128, 512], stage=3, block='c')
    x = self.identity_block(x, filters=[128, 128, 512], stage=3, block='d')
    self.C3 = x

    # STAGE 4
    logging.info('STAGE 4 ...........................')
    x = self.conv_block(x, filters=[256, 256, 1024], strides=2, stage=4, block='a')
    block_count = {"resnet50": 5, "resnet101": 22}[self.resnet_model]
    for i in range(block_count):
        x = self.identity_block(x, filters=[256, 256, 1024], stage=4, block=chr(98 + i))
    self.C4 = x

    # STAGE 5
    logging.info('STAGE 5 ...........................')
    if self.stage_5:
        x = self.conv_block(x, filters=[512, 512, 2048], strides=2, stage=5, block='a')
        x = self.identity_block(x, filters=[512, 512, 2048], stage=5, block='b')
        x = self.identity_block(x, filters=[512, 512, 2048], stage=5, block='c')
        self.C5 = x
    else:
        self.C5 = None

    # print('(FPN) C1: (shape) ', self.C1.shape)
    print('(FPN) C2: (shape) ', self.C2.shape)
    print('(FPN) C3: (shape) ', self.C3.shape)
    print('(FPN) C4: (shape) ', self.C4.shape)
    # Guard against stage_5 being disabled, in which case C5 is None
    if self.C5 is not None:
        print('(FPN) C5: (shape) ', self.C5.shape)
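# Why the 2**6 divisibility check above matters (illustrative): the backbone and the
# P6 max-pool halve the spatial size six times in total (stride-2 conv1, the stride-2
# max-pool, stride-2 conv_blocks in stages 3-5, then P6), so an image side that is not
# a multiple of 64 would produce fractional sizes somewhere in the pyramid. The sizes
# printed by this hypothetical helper are the P2..P6 spatial sides:
def _demo_pyramid_sizes(side=512):
    sizes = [side // (2 ** k) for k in range(2, 7)]   # e.g. side=512 -> [128, 64, 32, 16, 8]
    return sizes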