def maskrcnn_upXconv_head(feature, num_category, seed_gen, num_convs, norm=None, fp16=False):
    """
    Mask head: ``num_convs`` 3x3 convolutions (each optionally followed by
    GroupNorm), one stride-2 transposed convolution, and a final 1x1
    convolution that emits per-class mask logits.

    Args:
        feature: roi feature maps, Num_boxes x NumChannel x H_roi x W_roi
        num_category (int): number of total classes
        seed_gen: seed source; ``next()`` is called once for the shared kernel
            initializer and once for every layer built here
        num_convs (int): number of convolution layers before the deconv
        norm (str or None): either None or 'GN'
        fp16 (bool): if True, the input is cast to float16, layers are built
            under ``mixed_precision_scope(mixed=True)``, GroupNorm is applied
            on a float32 cast, and the result is cast back to float32

    Returns:
        mask_logits: Num_boxes x num_category x (2 * H_roi) x (2 * W_roi)
    """
    assert norm in (None, 'GN'), norm

    net = tf.cast(feature, tf.float16) if fp16 else feature

    # The distribution name passed to the initializer depends on the TF version.
    normal_name = 'untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal'

    with mixed_precision_scope(mixed=fp16):
        # c2's MSRAFill is fan_out
        msra_init = tf.variance_scaling_initializer(
            scale=2.0, mode='fan_out', seed=seed_gen.next(), distribution=normal_name)
        with argscope([Conv2D, Conv2DTranspose],
                      data_format='channels_first',
                      kernel_initializer=msra_init):
            for k in range(num_convs):
                net = Conv2D('fcn{}'.format(k), net, cfg.MRCNN.HEAD_DIM, 3,
                             activation=tf.nn.relu, seed=seed_gen.next())
                if norm is None:
                    continue
                # GroupNorm runs on a float32 tensor; restore float16 afterwards.
                if fp16:
                    net = tf.cast(net, tf.float32)
                net = GroupNorm('gn{}'.format(k), net)
                if fp16:
                    net = tf.cast(net, tf.float16)

            # 2x upsampling
            net = Conv2DTranspose('deconv', net, cfg.MRCNN.HEAD_DIM, 2, strides=2,
                                  activation=tf.nn.relu, seed=seed_gen.next())
            net = Conv2D('conv', net, num_category, 1, seed=seed_gen.next())

    return tf.cast(net, tf.float32) if fp16 else net
def boxclass_2fc_head(feature, seed_gen, fp16=False):
    """
    Two fully connected ReLU layers ('fc6', 'fc7') shared by the class and
    box branch.

    Args:
        feature: the roi feature map, Num_boxes x Num_channels x H_roi x W_roi
        seed_gen: seed source; ``next()`` is called once, for the kernel
            initializer that both layers share
        fp16 (bool): if True, the input is cast to float16, the layers are
            built under ``mixed_precision_scope(mixed=True)`` with a float16
            initializer, and the result is cast back to float32

    Returns:
        2D head feature: Num_boxes x Num_features
    """
    fc_dim = cfg.FPN.BOXCLASS_FC_HEAD_DIM
    hidden = tf.cast(feature, tf.float16) if fp16 else feature

    with mixed_precision_scope(mixed=fp16):
        init_dtype = tf.float16 if fp16 else tf.float32
        # One initializer object (one seed draw) is reused by both layers.
        fc_init = tf.variance_scaling_initializer(dtype=init_dtype, seed=seed_gen.next())
        for layer_name in ('fc6', 'fc7'):
            hidden = FullyConnected(layer_name, hidden, fc_dim,
                                    kernel_initializer=fc_init, activation=tf.nn.relu)

    if not fp16:
        return hidden
    return tf.cast(hidden, tf.float32)
def rpn_head(featuremap, channel, num_anchors, seed_gen, fp16=False):
    """
    The RPN head: takes one FPN feature map and outputs, for every anchor at
    every pixel, an objectness logit and encoded box logits.

    label logits indicate whether there is an object for a certain anchor in
    one pixel. box logits use the box encoding from the Faster R-CNN paper
    (https://arxiv.org/abs/1506.01497, page 5), so that they are consistent
    with the encoded ground truth boxes.

    Args:
        featuremap: feature map for a single FPN layer, i.e. one from P23456,
            BS x NumChannel x H_feature x W_feature
        channel: NumChannel of the feature map, scalar, default 256
        num_anchors (NA): # of anchors for each pixel in the current feature
            map, scalar, default 3
        seed_gen: seed source; ``next()`` is called once for the shared kernel
            initializer and once per convolution
        fp16 (bool): if True, the input is cast to float16, the layers are
            built under ``mixed_precision_scope(mixed=True)``, and both
            outputs are cast back to float32

    Returns:
        label_logits: BS x H_feature x W_feature x NA
        box_logits: BS x (NA * 4) x H_feature x W_feature, encoded
    """
    if fp16:
        featuremap = tf.cast(featuremap, tf.float16)

    with mixed_precision_scope(mixed=fp16), \
            argscope(Conv2D,
                     data_format='channels_first',
                     kernel_initializer=tf.random_normal_initializer(
                         stddev=0.01, seed=seed_gen.next())):
        # BS x NumChannel x H_feature x W_feature
        shared = Conv2D('conv0', featuremap, channel, 3,
                        activation=tf.nn.relu, seed=seed_gen.next())
        # BS x NA x H_feature x W_feature
        objectness_nchw = Conv2D('class', shared, num_anchors, 1, seed=seed_gen.next())
        # BS x (NA*4) x H_feature x W_feature
        box_logits = Conv2D('box', shared, 4 * num_anchors, 1, seed=seed_gen.next())
        # BS x H_feature x W_feature x NA
        label_logits = tf.transpose(objectness_nchw, [0, 2, 3, 1])

    if not fp16:
        return label_logits, box_logits
    return tf.cast(label_logits, tf.float32), tf.cast(box_logits, tf.float32)
def fpn_model(features, seed_gen, fp16=False):
    """
    Build FPN features from the four ResNet stages: 1x1 lateral convolutions,
    a top-down pathway that adds 2x-upsampled coarser levels, 3x3 output
    convolutions, and a stride-2 pooling of the coarsest output for p6.

    Args:
        features ([tf.Tensor]): ResNet features c2-c5
        seed_gen: seed source; ``next()`` is called once for the shared kernel
            initializer and once per convolution
        fp16 (bool): if True, layers are built under
            ``mixed_precision_scope(mixed=True)``, the unpooling matrix is
            float16, and every output is cast to float32. The inputs are used
            as given; no cast is applied to them here.

    Returns:
        [tf.Tensor]: FPN features p2-p6
    """
    assert len(features) == 4, features
    out_channels = cfg.FPN.NUM_CHANNEL
    apply_gn = cfg.FPN.NORM == 'GN'
    layout = 'channels_first' if cfg.TRAIN.FPN_NCHW else 'channels_last'
    unpool_dtype = 'float16' if fp16 else 'float32'

    def upsample2x(name, x):
        # FixedUnPooling with an all-ones 2x2 matrix is used here because
        # tf.image.resize (e.g. resize_nearest_neighbor) is not aligned.
        return FixedUnPooling(
            name, x, 2,
            unpool_mat=np.ones((2, 2), dtype=unpool_dtype),
            data_format=layout)

    with mixed_precision_scope(mixed=fp16), \
            argscope(Conv2D,
                     data_format=layout,
                     activation=tf.identity,
                     use_bias=True,
                     kernel_initializer=tf.variance_scaling_initializer(
                         scale=1., seed=seed_gen.next())):
        # Lateral 1x1 convolutions, ordered c2..c5.
        laterals_2345 = [
            Conv2D('lateral_1x1_c{}'.format(i + 2), c, out_channels, 1, seed=seed_gen.next())
            for i, c in enumerate(features)
        ]
        if apply_gn:
            laterals_2345 = [
                GroupNorm('gn_c{}'.format(i + 2), c)
                for i, c in enumerate(laterals_2345)
            ]

        # Top-down pathway, ordered c5..c2: the coarsest lateral is taken as
        # is; every finer level adds the 2x-upsampled previous sum.
        merged_5432 = [laterals_2345[-1]]
        for step, lateral in enumerate(laterals_2345[-2::-1], start=1):
            upsampled = upsample2x('upsample_lat{}'.format(6 - step), merged_5432[-1])
            merged_5432.append(lateral + upsampled)

        # 3x3 output convolutions, back in p2..p5 order.
        p2345 = [
            Conv2D('posthoc_3x3_p{}'.format(i + 2), c, out_channels, 3, seed=seed_gen.next())
            for i, c in enumerate(merged_5432[::-1])
        ]
        if apply_gn:
            p2345 = [GroupNorm('gn_p{}'.format(i + 2), c) for i, c in enumerate(p2345)]

        # p6 is p5 subsampled by 2 (pool_size=1, strides=2).
        p6 = MaxPooling('maxpool_p6', p2345[-1], pool_size=1, strides=2,
                        data_format=layout, padding='VALID')

        pyramid = p2345 + [p6]
        if fp16:
            return [tf.cast(level, tf.float32) for level in pyramid]
        return pyramid
def resnet_fpn_backbone(image, num_blocks, seed_gen, fp16=False):
    """
    ResNet backbone for FPN. The image is padded so that its spatial size
    becomes a multiple of ``cfg.FPN.RESOLUTION_REQUIREMENT`` before the stem
    convolution.

    Args:
        image: BS x NumChannel x H_image x W_image
        num_blocks: list of resnet block numbers for c2-c5
        seed_gen: seed source; ``next()`` is called for the stem convolution
            and the object is also passed on to every ``resnet_group``
        fp16 (bool): if True, the image is cast to float16 and the layers are
            built under ``mixed_precision_scope(mixed=True)``. The returned
            features are not cast back here.

    Returns:
        Resnet features: c2-c5
    """
    freeze_at = cfg.BACKBONE.FREEZE_AT

    # Extra bottom/right padding that rounds H and W up to the next multiple
    # of the required resolution.
    hw = tf.shape(image)[2:]
    stride = float(cfg.FPN.RESOLUTION_REQUIREMENT)
    hw_in_strides = tf.ceil(tf.cast(hw, tf.float32) / stride)
    padded_hw = tf.cast(hw_in_strides * stride, tf.int32)
    extra_hw = padded_hw - hw

    assert len(num_blocks) == 4, num_blocks

    if fp16:
        image = tf.cast(image, tf.float16)

    with mixed_precision_scope(mixed=fp16):
        # Stem: conv0 + pool0, frozen when FREEZE_AT > 0.
        with backbone_scope(freeze=freeze_at > 0):
            in_channels = image.shape[1]
            base_pad = maybe_reverse_pad(2, 3)
            paddings = tf.stack([
                [0, 0],
                [0, 0],
                [base_pad[0], base_pad[1] + extra_hw[0]],
                [base_pad[0], base_pad[1] + extra_hw[1]],
            ])
            net = tf.pad(image, paddings)
            # Dynamic padding erases the static shape; restore the channel dim.
            net.set_shape([None, in_channels, None, None])
            net = Conv2D('conv0', net, 64, 7, strides=2, padding='VALID',
                         seed=seed_gen.next())
            net = tf.pad(net, [[0, 0], [0, 0],
                               maybe_reverse_pad(0, 1), maybe_reverse_pad(0, 1)])
            net = MaxPooling('pool0', net, 3, strides=2, padding='VALID')

        # First residual group, frozen when FREEZE_AT > 1.
        with backbone_scope(freeze=freeze_at > 1):
            c2 = resnet_group('group0', net, resnet_bottleneck, 64,
                              num_blocks[0], 1, seed_gen=seed_gen)

        # Remaining groups are never frozen.
        with backbone_scope(freeze=False):
            c3 = resnet_group('group1', c2, resnet_bottleneck, 128,
                              num_blocks[1], 2, seed_gen=seed_gen)
            c4 = resnet_group('group2', c3, resnet_bottleneck, 256,
                              num_blocks[2], 2, seed_gen=seed_gen)
            c5 = resnet_group('group3', c4, resnet_bottleneck, 512,
                              num_blocks[3], 2, seed_gen=seed_gen)

    # 32x downsampling up to now
    # size of c5: ceil(input/32)
    return c2, c3, c4, c5