def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    # RCNN
    roi_out = roipooling(conv_out,
                         rois,
                         cntk.MAX_POOLING, (roi_dim, roi_dim),
                         spatial_scale=1 / 16.0)
    print("roi_out shape:{}".format(roi_out.shape))
    fc_out = fc_layers(roi_out)

    # prediction head
    fea_map_dim = globalvars['fea_map_dim']
    W_pred = parameter(shape=(fea_map_dim, globalvars['num_classes']),
                       init=normal(scale=0.01),
                       name="cls_score.W")
    b_pred = parameter(shape=globalvars['num_classes'],
                       init=0,
                       name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(fea_map_dim, globalvars['num_classes'] * 4),
                       init=normal(scale=0.001),
                       name="bbox_regr.W")
    b_regr = parameter(shape=globalvars['num_classes'] * 4,
                       init=0,
                       name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
Example #2
0
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    # RCNN
    roi_out = roipooling(conv_out,
                         rois,
                         cntk.MAX_POOLING,
                         (cfg["MODEL"].ROI_DIM, cfg["MODEL"].ROI_DIM),
                         spatial_scale=1 / 16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES),
                       init=normal(scale=0.01),
                       name="cls_score.W")
    b_pred = parameter(shape=cfg["DATA"].NUM_CLASSES,
                       init=0,
                       name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES * 4),
                       init=normal(scale=0.001),
                       name="bbox_regr.W")
    b_regr = parameter(shape=cfg["DATA"].NUM_CLASSES * 4,
                       init=0,
                       name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
Example #3
0
def create_imagenet_model_bottleneck(input, num_stack_layers, num_classes, stride1x1, stride3x3):
    c_map = [64, 128, 256, 512, 1024, 2048]

    # conv1 and max pooling
    conv1 = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
    pool1 = MaxPooling((3,3), strides=(2,2), pad=True)(conv1)

    # conv2_x
    r2_1 = resnet_bottleneck_inc(pool1, c_map[2], c_map[0], (1, 1), (1, 1))
    r2_2 = resnet_bottleneck_stack(r2_1, num_stack_layers[0], c_map[2], c_map[0])

    # conv3_x
    r3_1 = resnet_bottleneck_inc(r2_2, c_map[3], c_map[1], stride1x1, stride3x3)
    r3_2 = resnet_bottleneck_stack(r3_1, num_stack_layers[1], c_map[3], c_map[1])

    # conv4_x
    r4_1 = resnet_bottleneck_inc(r3_2, c_map[4], c_map[2], stride1x1, stride3x3)
    r4_2 = resnet_bottleneck_stack(r4_1, num_stack_layers[2], c_map[4], c_map[2])

    # conv5_x
    r5_1 = resnet_bottleneck_inc(r4_2, c_map[5], c_map[3], stride1x1, stride3x3)
    r5_2 = resnet_bottleneck_stack(r5_1, num_stack_layers[3], c_map[5], c_map[3])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(r5_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
Example #4
0
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (cfg["MODEL"].ROI_DIM, cfg["MODEL"].ROI_DIM), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=cfg["DATA"].NUM_CLASSES, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=cfg["DATA"].NUM_CLASSES*4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
Example #5
0
def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, globalvars['num_classes']), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=globalvars['num_classes'], init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, globalvars['num_classes']*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=globalvars['num_classes']*4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def create_alexnet():

    # Input variables denoting the features and label data
    feature_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # apply model to input
    # remove mean value 
    mean_removed_features = minus(feature_var, constant(114), name='mean_removed_input')
    
    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU) 
            Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Activation(activation=relu, name='relu1'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3,3), (2,2), name='pool1'),

            Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'), 
            Activation(activation=relu, name='relu2'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3,3), (2,2), name='pool2'),

            Convolution2D((3,3), 384, init=normal(0.01), name='conv3'), 
            Activation(activation=relu, name='relu3'),
            Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'), 
            Activation(activation=relu, name='relu4'),
            Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'), 
            Activation(activation=relu, name='relu5'), 
            MaxPooling((3,3), (2,2), name='pool5'), 

            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, init=normal(0.01), name='fc8')
            ])(mean_removed_features)

    # loss and metric
    ce  = cross_entropy_with_softmax(z, label_var)
    pe  = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }
Example #7
0
def create_alexnet():

    # Input variables denoting the features and label data
    feature_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # apply model to input
    # remove mean value
    input = minus(feature_var, constant(114), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Activation(activation=relu, name='relu1'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3,3), (2,2), name='pool1'),

            Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Activation(activation=relu, name='relu2'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3,3), (2,2), name='pool2'),

            Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
            Activation(activation=relu, name='relu3'),
            Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Activation(activation=relu, name='relu4'),
            Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Activation(activation=relu, name='relu5'),
            MaxPooling((3,3), (2,2), name='pool5'),

            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, init=normal(0.01), name='fc8')
            ])(input)

    # loss and metric
    ce  = cross_entropy_with_softmax(z, label_var)
    pe  = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }
Example #8
0
    def create_model(self):

        mean_removed_features = minus(self.input,
                                      constant(114),
                                      name='mean_removed_input')

        with default_options(activation=None, pad=True, bias=True):
            self.model = Sequential([
                Convolution2D((11, 11),
                              96,
                              init=normal(0.01),
                              pad=False,
                              name='conv1'),
                Activation(activation=relu, name='relu1'),
                self.__local_response_normalization(1.0,
                                                    2,
                                                    0.0001,
                                                    0.75,
                                                    name='norm1'),
                MaxPooling((3, 3), (2, 2), name='pool1'),
                Convolution2D((5, 5),
                              192,
                              init=normal(0.01),
                              init_bias=0.1,
                              name='conv2'),
                Activation(activation=relu, name='relu2'),
                self.__local_response_normalization(1.0,
                                                    2,
                                                    0.0001,
                                                    0.75,
                                                    name='norm2'),
                MaxPooling((3, 3), (2, 2), name='pool2'),
                Convolution2D((3, 3), 384, init=normal(0.01), name='conv3'),
                Activation(activation=relu, name='relu3'),
                Convolution2D((3, 3),
                              384,
                              init=normal(0.01),
                              init_bias=0.1,
                              name='conv4'),
                Activation(activation=relu, name='relu4'),
                Convolution2D((3, 3),
                              256,
                              init=normal(0.01),
                              init_bias=0.1,
                              name='conv5'),
                Activation(activation=relu, name='relu5'),
                MaxPooling((3, 3), (2, 2), name='pool5'),
                Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
                Activation(activation=relu, name='relu6'),
                Dropout(0.5, name='drop6'),
                Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
                Activation(activation=relu, name='relu7'),
                Dropout(0.5, name='drop7'),
                Dense(self.number_labels, init=normal(0.01), name='fc8')
            ])(mean_removed_features)
Example #9
0
 def __init__(self,
              hidden_dim,
              input_size,
              recurrent_min_abs=0,
              recurrent_max_abs=None,
              recurrent_kernel_initializer=1.0,
              input_kernel_initializer=normal(0.001),
              activation=C.relu,
              name=None):
     self._hidden_dim = hidden_dim
     self._recurrent_min_abs = recurrent_min_abs
     self._recurrent_max_abs = recurrent_max_abs
     self._recurrent_initializer = recurrent_kernel_initializer
     self._input_initializer = input_kernel_initializer
     self._activation = activation
     self._input_size = input_size
Example #10
0
def create_cifar10_model(input, num_stack_layers, num_classes):
    c_map = [16, 32, 64]

    conv = conv_bn_relu(input, (3, 3), c_map[0])
    r1 = resnet_basic_stack(conv, num_stack_layers, c_map[0])

    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers-1, c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers-1, c_map[2])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(r3_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
def create_cifar10_model(input, num_stack_layers, num_classes):
    c_map = [16, 32, 64]

    conv = conv_bn_relu(input, (3, 3), c_map[0])
    r1 = resnet_basic_stack(conv, num_stack_layers, c_map[0])

    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers - 1, c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers - 1, c_map[2])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(r3_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
Example #12
0
def create_imagenet_model_basic(input, num_stack_layers, num_classes):
    c_map = [64, 128, 256, 512]

    conv = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
    pool1 = MaxPooling((3, 3), strides=(2, 2), pad=True)(conv)
    r1 = resnet_basic_stack(pool1, num_stack_layers[0], c_map[0])

    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers[1], c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers[2], c_map[2])

    r4_1 = resnet_basic_inc(r3_2, c_map[3])
    r4_2 = resnet_basic_stack(r4_1, num_stack_layers[3], c_map[3])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(r4_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
Example #13
0
def InceptionV4_ResNet_model(input, num_classes):
    l = pre_block(input)

    # 32 x 32 x 128
    A1 = Res_A_stack(l,5, 64, 128)

    RA = reduction_A(A1, 32, 32, 64, 64)

    # 16 x 16 x 224
    B1 = Res_B_stack(RA, 10, 128, 224)

    # 16 x 16 x 448
    RB = reduction_B(B1, 32, 64, 32, 64, 128)

    # 8 x 8 x 480
    C1 = Res_C_stack(RB, 5, 192, 480)

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(C1)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
Example #14
0
def Inception_ResNet_v2_model(input, num_classes):
    l = pre_block(input)

    # 32 x 32 x 128
    A1 = Res_A_stack(l, 5, 32, 32, 32, 48, 64, 128)

    # 32 x 32 x 128
    RA = reduction_A(A1, 64, 32, 64, 64)

    # 16 x 16 x 256
    B1 = Res_B_stack(RA, 10, 128, 64, 96, 128, 256)

    # 16 x 16 x 256
    RB = reduction_B(B1, 32, 64, 32, 64, 32, 64, 128)

    # 8 x 8 x 512
    C1 = Res_C_stack(RB, 5, 128, 64, 128, 256, 512)

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(8, 8))(C1)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                                init = normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]), name="rpn_cls_score_rshp")
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)], 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info, param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                         [(p_rpn_labels, rpn_labels), (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                         'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, p_rpn_bbox_pred, p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                          [(p_rpn_bbox_pred, rpn_bbox_pred), (p_rpn_bbox_targets, rpn_bbox_targets),
                                           (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                          'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
Example #16
0
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_widht, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    rpn_conv_3x3 = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                                init = normal(scale=0.01), init_bias=0.1)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info, param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # For loss functions: ignore label predictions for the 'ignore label',
        # i.e. set target and prediction to 0 --> needs to be softmaxed before
        rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
        ignore = user_function(IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
        rpn_cls_prob_ignore = ignore.outputs[0]
        fg_targets = ignore.outputs[1]
        bg_targets = 1 - fg_targets
        rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

        # RPN losses
        rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore, rpn_labels_ignore, axis=0)
        rpn_loss_bbox = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
        rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox), name="rpn_losses")

    return rpn_rois, rpn_losses
Example #17
0
def create_rpn(conv_out,
               scaled_gt_boxes,
               im_info,
               cfg,
               add_loss_functions=True):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        cfg:             The configuration dictionary
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    num_channels = cfg["MODEL"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3),
                               num_channels,
                               activation=relu,
                               pad=True,
                               strides=1,
                               init=normal(scale=0.01),
                               init_bias=0.0)(conv_out)
    rpn_cls_score = Convolution(
        (1, 1),
        18,
        activation=None,
        name="rpn_cls_score",
        init=normal(scale=0.01),
        init_bias=0.0)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution(
        (1, 1),
        36,
        activation=None,
        name="rpn_bbox_pred",
        init=normal(scale=0.01),
        init_bias=0.0)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(
        rpn_cls_score,
        (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]),
        name="rpn_cls_score_rshp")
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm,
                                 [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob,
                                   rpn_cls_score.shape,
                                   name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois = create_proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred,
                                     im_info, cfg)

    rpn_losses = None
    if (add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        proposal_layer_params = "'feat_stride': {}\n'scales':\n - {}". \
            format(cfg["MODEL"].FEATURE_STRIDE, "\n - ".join([str(v) for v in cfg["DATA"].PROPOSAL_LAYER_SCALES]))
        atl = user_function(
            AnchorTargetLayer(
                rpn_cls_score,
                scaled_gt_boxes,
                im_info,
                rpn_batch_size=cfg["TRAIN"].RPN_BATCHSIZE,
                rpn_fg_fraction=cfg["TRAIN"].RPN_FG_FRACTION,
                clobber_positives=cfg["TRAIN"].RPN_CLOBBER_POSITIVES,
                positive_overlap=cfg["TRAIN"].RPN_POSITIVE_OVERLAP,
                negative_overlap=cfg["TRAIN"].RPN_NEGATIVE_OVERLAP,
                param_str=proposal_layer_params))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp,
                                            rpn_labels_ignore,
                                            axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(
            rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(
            normalized_rpn_cls_loss,
            [(p_rpn_labels, rpn_labels),
             (p_rpn_cls_score_rshp, rpn_cls_score_rshp)], 'CE_with_ignore',
            'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg.SIGMA_RPN_L1, p_rpn_bbox_pred,
                                     p_rpn_bbox_targets,
                                     p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(
            rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(
            normalized_rpn_bbox_loss,
            [(p_rpn_bbox_pred, rpn_bbox_pred),
             (p_rpn_bbox_targets, rpn_bbox_targets),
             (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
            'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls,
                          reduced_rpn_loss_bbox,
                          name="rpn_losses")

    return rpn_rois, rpn_losses
Example #18
0
def create_rpn(conv_out,
               scaled_gt_boxes,
               im_info,
               add_loss_functions=True,
               proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_widht, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    rpn_conv_3x3 = Convolution((3, 3),
                               256,
                               activation=relu,
                               pad=True,
                               strides=1,
                               init=normal(scale=0.01),
                               init_bias=0.1)(conv_out)
    rpn_cls_score = Convolution(
        (1, 1),
        18,
        activation=None,
        name="rpn_cls_score",
        init=normal(scale=0.01),
        init_bias=0.1)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution(
        (1, 1),
        36,
        activation=None,
        name="rpn_bbox_pred",
        init=normal(scale=0.01),
        init_bias=0.1)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # proposal layer
    rpn_rois_raw = user_function(
        ProposalLayer(rpn_cls_prob_reshape,
                      rpn_bbox_pred,
                      im_info,
                      param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if (add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(
            AnchorTargetLayer(rpn_cls_score,
                              scaled_gt_boxes,
                              im_info,
                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # For loss functions: ignore label predictions for the 'ignore label',
        # i.e. set target and prediction to 0 --> needs to be softmaxed before
        rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
        ignore = user_function(
            IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
        rpn_cls_prob_ignore = ignore.outputs[0]
        fg_targets = ignore.outputs[1]
        bg_targets = 1 - fg_targets
        rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

        # RPN losses
        rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore,
                                                  rpn_labels_ignore,
                                                  axis=0)
        rpn_loss_bbox = user_function(
            SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets,
                         rpn_bbox_inside_weights))
        rpn_losses = plus(reduce_sum(rpn_loss_cls),
                          reduce_sum(rpn_loss_bbox),
                          name="rpn_losses")

    return rpn_rois, rpn_losses