Example #1
def maskrcnn_upXconv_head(feature, num_category, seed_gen, num_convs, norm=None, fp16=False):
    """
    Args:
        feature: roi feature maps, Num_boxes x NumChannel x H_roi x W_roi
        num_category (int): number of total classes
        seed_gen: deterministic seed generator for the layer initializers
        num_convs (int): number of convolution layers
        norm (str or None): either None or 'GN'
        fp16 (bool): whether to run the head in float16 under a mixed-precision scope

    Returns:
        mask_logits: Num_boxes x num_category x (2 * H_roi) x (2 * W_roi)
    """
    assert norm in [None, 'GN'], norm
    l = feature
    if fp16:
        l = tf.cast(l, tf.float16)
    with mixed_precision_scope(mixed=fp16):
      with argscope([Conv2D, Conv2DTranspose], data_format='channels_first',
                  kernel_initializer=tf.variance_scaling_initializer(
                      scale=2.0, mode='fan_out', seed=seed_gen.next(),
                      distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')):
        # c2's MSRAFill is fan_out
        for k in range(num_convs):
            l = Conv2D('fcn{}'.format(k), l, cfg.MRCNN.HEAD_DIM, 3, activation=tf.nn.relu, seed=seed_gen.next())
            if norm is not None:
                if fp16: l = tf.cast(l, tf.float32)
                l = GroupNorm('gn{}'.format(k), l)
                if fp16: l = tf.cast(l, tf.float16)
        l = Conv2DTranspose('deconv', l, cfg.MRCNN.HEAD_DIM, 2, strides=2, activation=tf.nn.relu, seed=seed_gen.next()) # 2x upsampling
        l = Conv2D('conv', l, num_category, 1, seed=seed_gen.next())
    if fp16:
        l = tf.cast(l, tf.float32)
    return l
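
A minimal tf.keras shape sketch of the same head, to check the output-shape claim in the docstring. It uses channels_last so it also runs on CPU (the original is channels_first), and head_dim = 256, num_category = 81, num_convs = 4 are hypothetical stand-ins for cfg.MRCNN.HEAD_DIM and the real call-site arguments, not values read from the config.

# Sketch only: same topology, channels_last, hypothetical sizes.
import tensorflow as tf

head_dim, num_category, num_convs = 256, 81, 4
roi = tf.keras.Input(shape=(14, 14, 256))      # Num_boxes x H_roi x W_roi x C
x = roi
for k in range(num_convs):
    x = tf.keras.layers.Conv2D(head_dim, 3, padding='same', activation='relu',
                               name='fcn{}'.format(k))(x)
x = tf.keras.layers.Conv2DTranspose(head_dim, 2, strides=2, activation='relu',
                                    name='deconv')(x)            # 2x upsampling
mask_logits = tf.keras.layers.Conv2D(num_category, 1, name='conv')(x)
print(tf.keras.Model(roi, mask_logits).output_shape)             # (None, 28, 28, 81)
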
Example #2
def boxclass_2fc_head(feature, seed_gen, fp16=False):
    """
    Two fully connected layers for the class and box branch

    Args:
        feature: the roi feature map, Num_boxes x Num_channels x H_roi x W_roi
        seed_gen: deterministic seed generator for the layer initializers
        fp16 (bool): whether to run the head in float16 under a mixed-precision scope

    Returns:
        2D head feature: Num_boxes x Num_features
    """
    dim = cfg.FPN.BOXCLASS_FC_HEAD_DIM
    if fp16:
        feature = tf.cast(feature, tf.float16)

    with mixed_precision_scope(mixed=fp16):
        init = tf.variance_scaling_initializer(
            dtype=tf.float16 if fp16 else tf.float32, seed=seed_gen.next())
        hidden = FullyConnected('fc6',
                                feature,
                                dim,
                                kernel_initializer=init,
                                activation=tf.nn.relu)
        hidden = FullyConnected('fc7',
                                hidden,
                                dim,
                                kernel_initializer=init,
                                activation=tf.nn.relu)

    if fp16:
        hidden = tf.cast(hidden, tf.float32)

    return hidden
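
A shape sketch of the two-FC head in tf.keras. dim = 1024 is an assumed value for cfg.FPN.BOXCLASS_FC_HEAD_DIM and the 256 x 7 x 7 roi shape is hypothetical; tensorpack's FullyConnected flattens its input internally, which the explicit Flatten stands in for here.

# Sketch only: two dense layers over a flattened roi feature.
import tensorflow as tf

dim = 1024
roi = tf.keras.Input(shape=(256, 7, 7))            # Num_boxes x Num_channels x H_roi x W_roi
x = tf.keras.layers.Flatten()(roi)
x = tf.keras.layers.Dense(dim, activation='relu', name='fc6')(x)
x = tf.keras.layers.Dense(dim, activation='relu', name='fc7')(x)
print(tf.keras.Model(roi, x).output_shape)         # (None, 1024) -> Num_boxes x Num_features
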
Example #3
def rpn_head(featuremap, channel, num_anchors, seed_gen, fp16=False):
    """
    The RPN head that takes a feature map from the FPN and outputs label logits and box logits.
    For every pixel on the feature map, there are a certain number of anchors.
    The outputs are:
    label logits: indicate whether there is an object for a certain anchor at a given pixel
    box logits: the encoded box logits from the Faster R-CNN paper https://arxiv.org/abs/1506.01497
                (page 5), in order to be consistent with the ground-truth encoded boxes

    Args:
        featuremap: feature map for a single FPN layer, i.e. one from P23456, BS x NumChannel x H_feature x W_feature
        channel: NumChannel of the feature map, scalar, default 256
        num_anchors (NA): number of anchors for each pixel in the current feature map, scalar, default 3
        seed_gen: deterministic seed generator for the layer initializers
        fp16 (bool): whether to run the head in float16 under a mixed-precision scope
    Returns:
        label_logits: BS x H_feature x W_feature x NA
        box_logits: BS x (NA * 4) x H_feature x W_feature, encoded
    """
    if fp16:
        featuremap = tf.cast(featuremap, tf.float16)

    with mixed_precision_scope(mixed=fp16):
        with argscope(Conv2D,
                      data_format='channels_first',
                      kernel_initializer=tf.random_normal_initializer(
                          stddev=0.01, seed=seed_gen.next())):
            hidden = Conv2D('conv0',
                            featuremap,
                            channel,
                            3,
                            activation=tf.nn.relu,
                            seed=seed_gen.next())
            # BS x NumChannel x H_feature x W_feature
            label_logits = Conv2D('class',
                                  hidden,
                                  num_anchors,
                                  1,
                                  seed=seed_gen.next())
            # BS x NA x H_feature x W_feature
            box_logits = Conv2D('box',
                                hidden,
                                4 * num_anchors,
                                1,
                                seed=seed_gen.next())
            # BS x (NA*4) x H_feature x W_feature

            label_logits = tf.transpose(
                label_logits, [0, 2, 3, 1])  # BS x H_feature x W_feature x NA

    if fp16:
        label_logits = tf.cast(label_logits, tf.float32)
        box_logits = tf.cast(box_logits, tf.float32)

    return label_logits, box_logits
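
A shape sketch of the RPN head in tf.keras, channels_last so it runs on CPU (the original is channels_first and transposes label_logits afterwards). channel = 256 and num_anchors = 3 follow the defaults named in the docstring; the 64 x 64 feature size is hypothetical.

# Sketch only: one 3x3 conv followed by two 1x1 conv branches.
import tensorflow as tf

channel, num_anchors = 256, 3
fmap = tf.keras.Input(shape=(64, 64, channel))                    # one FPN level
hidden = tf.keras.layers.Conv2D(channel, 3, padding='same', activation='relu',
                                name='conv0')(fmap)
label_logits = tf.keras.layers.Conv2D(num_anchors, 1, name='class')(hidden)   # objectness
box_logits = tf.keras.layers.Conv2D(4 * num_anchors, 1, name='box')(hidden)   # encoded deltas
print(tf.keras.Model(fmap, [label_logits, box_logits]).output_shape)
# [(None, 64, 64, 3), (None, 64, 64, 12)]
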
Example #4
def fpn_model(features, seed_gen, fp16=False):
    """
    Args:
        features ([tf.Tensor]): ResNet features c2-c5
        seed_gen: deterministic seed generator for the layer initializers
        fp16 (bool): whether to build the FPN in float16 under a mixed-precision scope

    Returns:
        [tf.Tensor]: FPN features p2-p6
    """
    assert len(features) == 4, features
    num_channel = cfg.FPN.NUM_CHANNEL

    use_gn = cfg.FPN.NORM == 'GN'

    def upsample2x(name, x):
        dtype_str = 'float16' if fp16 else 'float32'
        return FixedUnPooling(
            name, x, 2, unpool_mat=np.ones((2, 2), dtype=dtype_str),
            data_format='channels_first' if cfg.TRAIN.FPN_NCHW else 'channels_last')

        # tf.image.resize is, again, not aligned.
        # with tf.name_scope(name):
        #     shape2d = tf.shape(x)[2:]
        #     x = tf.transpose(x, [0, 2, 3, 1])
        #     x = tf.image.resize_nearest_neighbor(x, shape2d * 2, align_corners=True)
        #     x = tf.transpose(x, [0, 3, 1, 2])
        #     return x

    with mixed_precision_scope(mixed=fp16):
      with argscope(Conv2D, data_format='channels_first' if cfg.TRAIN.FPN_NCHW else 'channels_last',
                  activation=tf.identity, use_bias=True,
                  kernel_initializer=tf.variance_scaling_initializer(scale=1., seed=seed_gen.next())):
        lat_2345 = [Conv2D('lateral_1x1_c{}'.format(i + 2), c, num_channel, 1, seed=seed_gen.next())
                    for i, c in enumerate(features)]
        if use_gn:
            lat_2345 = [GroupNorm('gn_c{}'.format(i + 2), c) for i, c in enumerate(lat_2345)]
        lat_sum_5432 = []
        for idx, lat in enumerate(lat_2345[::-1]):
            if idx == 0:
                lat_sum_5432.append(lat)
            else:
                lat = lat + upsample2x('upsample_lat{}'.format(6 - idx), lat_sum_5432[-1])
                lat_sum_5432.append(lat)
        p2345 = [Conv2D('posthoc_3x3_p{}'.format(i + 2), c, num_channel, 3, seed=seed_gen.next())
                 for i, c in enumerate(lat_sum_5432[::-1])]
        if use_gn:
            p2345 = [GroupNorm('gn_p{}'.format(i + 2), c) for i, c in enumerate(p2345)]
        p6 = MaxPooling('maxpool_p6', p2345[-1], pool_size=1, strides=2, data_format='channels_first' if cfg.TRAIN.FPN_NCHW else 'channels_last', padding='VALID')

        if fp16:
            return [tf.cast(l, tf.float32) for l in p2345] + [tf.cast(p6, tf.float32)]

        return p2345 + [p6]
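
A top-down merge sketch of the FPN in tf.keras (channels_last), with hypothetical c2-c5 shapes and num_channel = 256 standing in for cfg.FPN.NUM_CHANNEL. Nearest-neighbor 2x upsampling is used here, which is what FixedUnPooling with a 2x2 matrix of ones amounts to (each value replicated into a 2x2 block).

# Sketch only: lateral 1x1 convs, top-down add, 3x3 smoothing, stride-2 p6.
import tensorflow as tf

num_channel = 256
c2 = tf.keras.Input(shape=(64, 64, 256))
c3 = tf.keras.Input(shape=(32, 32, 512))
c4 = tf.keras.Input(shape=(16, 16, 1024))
c5 = tf.keras.Input(shape=(8, 8, 2048))

laterals = [tf.keras.layers.Conv2D(num_channel, 1)(c) for c in (c2, c3, c4, c5)]
merged = [laterals[-1]]                              # start from the coarsest level (c5)
for lat in laterals[-2::-1]:                         # c4, c3, c2 laterals
    up = tf.keras.layers.UpSampling2D(2, interpolation='nearest')(merged[-1])
    merged.append(tf.keras.layers.Add()([lat, up]))
p2345 = [tf.keras.layers.Conv2D(num_channel, 3, padding='same')(m) for m in merged[::-1]]
p6 = tf.keras.layers.MaxPooling2D(pool_size=1, strides=2)(p2345[-1])
print([t.shape.as_list() for t in p2345 + [p6]])
# [[None, 64, 64, 256], [None, 32, 32, 256], [None, 16, 16, 256],
#  [None, 8, 8, 256], [None, 4, 4, 256]]
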
Example #5
def resnet_fpn_backbone(image, num_blocks, seed_gen, fp16=False):
    """
    Args:
        image: BS x NumChannel x H_image x W_image
        num_blocks: list of resnet block numbers for c2-c5
        seed_gen: deterministic seed generator for the layer initializers
        fp16 (bool): whether to run the backbone in float16 under a mixed-precision scope
    Returns:
        Resnet features: c2-c5
    """
    freeze_at = cfg.BACKBONE.FREEZE_AT
    shape2d = tf.shape(image)[2:]
    mult = float(cfg.FPN.RESOLUTION_REQUIREMENT)
    new_shape2d = tf.cast(
        tf.ceil(tf.cast(shape2d, tf.float32) / mult) * mult, tf.int32)
    pad_shape2d = new_shape2d - shape2d
    assert len(num_blocks) == 4, num_blocks

    if fp16:
        image = tf.cast(image, tf.float16)

    with mixed_precision_scope(mixed=fp16):
        with backbone_scope(freeze=freeze_at > 0):
            chan = image.shape[1]
            pad_base = maybe_reverse_pad(2, 3)
            l = tf.pad(
                image,
                tf.stack([[0, 0], [0, 0],
                          [pad_base[0], pad_base[1] + pad_shape2d[0]],
                          [pad_base[0], pad_base[1] + pad_shape2d[1]]]))
            l.set_shape([None, chan, None, None])
            l = Conv2D('conv0',
                       l,
                       64,
                       7,
                       strides=2,
                       padding='VALID',
                       seed=seed_gen.next())
            l = tf.pad(l, [[0, 0], [0, 0],
                           maybe_reverse_pad(0, 1),
                           maybe_reverse_pad(0, 1)])
            l = MaxPooling('pool0', l, 3, strides=2, padding='VALID')
        with backbone_scope(freeze=freeze_at > 1):
            c2 = resnet_group('group0',
                              l,
                              resnet_bottleneck,
                              64,
                              num_blocks[0],
                              1,
                              seed_gen=seed_gen)
        with backbone_scope(freeze=False):
            c3 = resnet_group('group1',
                              c2,
                              resnet_bottleneck,
                              128,
                              num_blocks[1],
                              2,
                              seed_gen=seed_gen)
            c4 = resnet_group('group2',
                              c3,
                              resnet_bottleneck,
                              256,
                              num_blocks[2],
                              2,
                              seed_gen=seed_gen)
            c5 = resnet_group('group3',
                              c4,
                              resnet_bottleneck,
                              512,
                              num_blocks[3],
                              2,
                              seed_gen=seed_gen)

    # 32x downsampling up to now
    # size of c5: ceil(input/32)
    return c2, c3, c4, c5
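
A numeric sketch of the padding arithmetic above, assuming cfg.FPN.RESOLUTION_REQUIREMENT = 32 (the stride of c5): the input is padded up to a multiple of 32 so that every FPN level has an integer spatial size. The 800 x 1202 input size is hypothetical.

# Sketch only: pad H, W up to a multiple of `mult`, then report c2-c5 sizes.
import math

def fpn_pad_and_strides(h, w, mult=32):
    new_h = int(math.ceil(h / mult) * mult)
    new_w = int(math.ceil(w / mult) * mult)
    # c2-c5 come out at strides 4, 8, 16, 32 of the padded input
    sizes = {'c{}'.format(i + 2): (new_h // s, new_w // s)
             for i, s in enumerate((4, 8, 16, 32))}
    return (new_h, new_w), sizes

print(fpn_pad_and_strides(800, 1202))
# ((800, 1216), {'c2': (200, 304), 'c3': (100, 152), 'c4': (50, 76), 'c5': (25, 38)})
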