def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    """Create the Fast R-CNN prediction heads (class scores and box regression).

    Args:
        conv_out: convolutional feature map from the base network.
        rois: region-of-interest coordinates to pool over.
        fc_layers: fully connected layers applied to the pooled ROI features.

    Returns:
        (cls_score, bbox_pred): per-class classification scores and per-class
        bounding-box regression outputs (4 values per class).
    """
    # RCNN: max-pool the conv features inside each ROI to a fixed roi_dim x roi_dim grid.
    # NOTE(review): roi_dim and globalvars are read from module scope — confirm they are defined there.
    # (Removed leftover debug print of roi_out.shape.)
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (roi_dim, roi_dim), spatial_scale=1 / 16.0)
    fc_out = fc_layers(roi_out)

    # Hoist the repeated globalvars lookups.
    fea_map_dim = globalvars['fea_map_dim']
    num_classes = globalvars['num_classes']

    # prediction head: one score per class
    W_pred = parameter(shape=(fea_map_dim, num_classes), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head: 4 box-offset values per class
    W_regr = parameter(shape=(fea_map_dim, num_classes * 4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=num_classes * 4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    """Create the Fast R-CNN prediction heads (class scores and box regression).

    Args:
        conv_out: convolutional feature map from the base network.
        rois: region-of-interest coordinates to pool over.
        fc_layers: fully connected layers applied to the pooled ROI features.
        cfg: configuration (uses cfg["MODEL"].ROI_DIM and cfg["DATA"].NUM_CLASSES).

    Returns:
        (cls_score, bbox_pred): per-class classification scores and per-class
        bounding-box regression outputs (4 values per class).
    """
    num_classes = cfg["DATA"].NUM_CLASSES
    pooled_dim = cfg["MODEL"].ROI_DIM

    # RCNN: max-pool conv features inside each ROI to a fixed pooled_dim x pooled_dim grid
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING,
                         (pooled_dim, pooled_dim), spatial_scale=1 / 16.0)
    fc_out = fc_layers(roi_out)

    # prediction head: one score per class
    W_pred = parameter(shape=(4096, num_classes), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head: 4 box-offset values per class
    W_regr = parameter(shape=(4096, num_classes * 4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=num_classes * 4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def create_imagenet_model_bottleneck(input, num_stack_layers, num_classes, stride1x1, stride3x3):
    """Assemble a bottleneck-style ResNet classifier for ImageNet.

    Args:
        input: input image tensor.
        num_stack_layers: per-stage counts of stacked bottleneck blocks (4 entries).
        num_classes: number of output classes of the final Dense layer.
        stride1x1, stride3x3: strides used by the downsampling block of stages 3-5.

    Returns:
        The class-score output node.
    """
    feature_maps = [64, 128, 256, 512, 1024, 2048]

    # conv1 and max pooling
    net = conv_bn_relu(input, (7, 7), feature_maps[0], strides=(2, 2))
    net = MaxPooling((3, 3), strides=(2, 2), pad=True)(net)

    # conv2_x (no spatial downsampling: stride (1, 1))
    net = resnet_bottleneck_inc(net, feature_maps[2], feature_maps[0], (1, 1), (1, 1))
    net = resnet_bottleneck_stack(net, num_stack_layers[0], feature_maps[2], feature_maps[0])

    # conv3_x
    net = resnet_bottleneck_inc(net, feature_maps[3], feature_maps[1], stride1x1, stride3x3)
    net = resnet_bottleneck_stack(net, num_stack_layers[1], feature_maps[3], feature_maps[1])

    # conv4_x
    net = resnet_bottleneck_inc(net, feature_maps[4], feature_maps[2], stride1x1, stride3x3)
    net = resnet_bottleneck_stack(net, num_stack_layers[2], feature_maps[4], feature_maps[2])

    # conv5_x
    net = resnet_bottleneck_inc(net, feature_maps[5], feature_maps[3], stride1x1, stride3x3)
    net = resnet_bottleneck_stack(net, num_stack_layers[3], feature_maps[5], feature_maps[3])

    # Global average pooling and output
    pooled = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(net)
    return Dense(num_classes, init=normal(0.01))(pooled)
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    """Create the Fast R-CNN prediction heads (class scores and box regression).

    Args:
        conv_out: convolutional feature map from the base network.
        rois: region-of-interest coordinates to pool over.
        fc_layers: fully connected layers applied to the pooled ROI features.
        cfg: configuration (uses cfg["MODEL"].ROI_DIM and cfg["DATA"].NUM_CLASSES).

    Returns:
        (cls_score, bbox_pred): per-class classification scores and per-class
        bounding-box regression outputs (4 values per class).
    """
    # RCNN: max-pool conv features inside each ROI to a fixed ROI_DIM x ROI_DIM grid
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING,
                         (cfg["MODEL"].ROI_DIM, cfg["MODEL"].ROI_DIM), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head: one score per class
    # NOTE(review): 4096 is assumed to be the output width of fc_layers — confirm against the base model
    W_pred = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=cfg["DATA"].NUM_CLASSES, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head: 4 box-offset values per class
    W_regr = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=cfg["DATA"].NUM_CLASSES*4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    """Create the Fast R-CNN prediction heads (class scores and box regression).

    Args:
        conv_out: convolutional feature map from the base network.
        rois: region-of-interest coordinates to pool over.
        fc_layers: fully connected layers applied to the pooled ROI features.

    Returns:
        (cls_score, bbox_pred): per-class classification scores and per-class
        bounding-box regression outputs (4 values per class).
    """
    num_classes = globalvars['num_classes']

    # RCNN: max-pool conv features inside each ROI to a fixed roi_dim x roi_dim grid
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING,
                         (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head: one score per class
    W_pred = parameter(shape=(4096, num_classes), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head: 4 box-offset values per class
    W_regr = parameter(shape=(4096, num_classes*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=num_classes*4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def create_alexnet():
    """Build the AlexNet training graph: model, loss, and error metrics.

    Reads num_channels, image_height, image_width and num_classes from module
    scope. Returns a dict with keys 'feature', 'label', 'ce', 'pe', 'pe5'
    and 'output'.
    """
    # Input variables denoting the features and label data
    feature_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # apply model to input
    # remove mean value (constant 114 per channel to roughly zero-center inputs)
    mean_removed_features = minus(feature_var, constant(114), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Activation(activation=relu, name='relu1'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3,3), (2,2), name='pool1'),

            Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Activation(activation=relu, name='relu2'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3,3), (2,2), name='pool2'),

            Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
            Activation(activation=relu, name='relu3'),
            Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Activation(activation=relu, name='relu4'),
            Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Activation(activation=relu, name='relu5'),
            MaxPooling((3,3), (2,2), name='pool5'),

            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, init=normal(0.01), name='fc8')
            ])(mean_removed_features)

    # loss and metric: cross entropy, top-1 error, top-5 error
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }
def create_alexnet():
    """Build the AlexNet training graph: model, loss, and error metrics.

    Reads num_channels, image_height, image_width and num_classes from module
    scope. Returns a dict with keys 'feature', 'label', 'ce', 'pe', 'pe5'
    and 'output'.
    """
    # Input variables denoting the features and label data
    feature_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # apply model to input
    # remove mean value (constant 114 per channel to roughly zero-center inputs)
    # Fixed: local was named `input`, shadowing the builtin; renamed to match
    # the other AlexNet builders in this file.
    mean_removed_features = minus(feature_var, constant(114), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Activation(activation=relu, name='relu1'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3,3), (2,2), name='pool1'),

            Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Activation(activation=relu, name='relu2'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3,3), (2,2), name='pool2'),

            Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
            Activation(activation=relu, name='relu3'),
            Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Activation(activation=relu, name='relu4'),
            Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Activation(activation=relu, name='relu5'),
            MaxPooling((3,3), (2,2), name='pool5'),

            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, init=normal(0.01), name='fc8')
            ])(mean_removed_features)

    # loss and metric: cross entropy, top-1 error, top-5 error
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }
def create_model(self):
    """Build the AlexNet-style network graph and store it in self.model.

    Uses self.input as the network input and self.number_labels as the size
    of the final Dense layer. Returns nothing; the built model is kept on
    the instance.
    """
    # remove mean value (constant 114 to roughly zero-center inputs)
    mean_removed_features = minus(self.input, constant(114), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        self.model = Sequential([
            # Convolution and ReLU are separate layers so the pre-activation
            # outputs are individually named (e.g. for feature extraction)
            Convolution2D((11, 11), 96, init=normal(0.01), pad=False, name='conv1'),
            Activation(activation=relu, name='relu1'),
            self.__local_response_normalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3, 3), (2, 2), name='pool1'),

            Convolution2D((5, 5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Activation(activation=relu, name='relu2'),
            self.__local_response_normalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3, 3), (2, 2), name='pool2'),

            Convolution2D((3, 3), 384, init=normal(0.01), name='conv3'),
            Activation(activation=relu, name='relu3'),
            Convolution2D((3, 3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Activation(activation=relu, name='relu4'),
            Convolution2D((3, 3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Activation(activation=relu, name='relu5'),
            MaxPooling((3, 3), (2, 2), name='pool5'),

            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(self.number_labels, init=normal(0.01), name='fc8')
        ])(mean_removed_features)
def __init__(self, hidden_dim,
             input_size,
             recurrent_min_abs=0,
             recurrent_max_abs=None,
             recurrent_kernel_initializer=1.0,
             input_kernel_initializer=normal(0.001),
             activation=C.relu,
             name=None):
    """Store the configuration of the recurrent unit.

    Args:
        hidden_dim: dimensionality of the hidden state.
        input_size: dimensionality of the input vectors.
        recurrent_min_abs: lower bound on the absolute value of the
            recurrent weights (default 0, i.e. no lower clipping).
        recurrent_max_abs: upper bound on the absolute value of the
            recurrent weights, or None for no upper clipping.
        recurrent_kernel_initializer: initializer (or scalar) for the
            recurrent weights.
        input_kernel_initializer: initializer for the input weights
            (default: normal with scale 0.001).
        activation: activation function applied by the unit (default C.relu).
        name: optional layer name.
            NOTE(review): `name` is not stored here — confirm it is consumed
            elsewhere or intentionally unused.
    """
    self._hidden_dim = hidden_dim                           # hidden state size
    self._recurrent_min_abs = recurrent_min_abs             # lower clip for recurrent weights
    self._recurrent_max_abs = recurrent_max_abs             # upper clip for recurrent weights
    self._recurrent_initializer = recurrent_kernel_initializer
    self._input_initializer = input_kernel_initializer
    self._activation = activation
    self._input_size = input_size
def create_cifar10_model(input, num_stack_layers, num_classes):
    """Assemble a basic ResNet classifier for CIFAR-10.

    Args:
        input: input image tensor.
        num_stack_layers: number of basic blocks stacked in the first stage;
            later stages use one block fewer plus a downsampling block.
        num_classes: number of output classes of the final Dense layer.

    Returns:
        The class-score output node.
    """
    channels = [16, 32, 64]

    # stage 1: no downsampling
    net = conv_bn_relu(input, (3, 3), channels[0])
    net = resnet_basic_stack(net, num_stack_layers, channels[0])

    # stage 2: downsampling block followed by a shorter stack
    net = resnet_basic_inc(net, channels[1])
    net = resnet_basic_stack(net, num_stack_layers-1, channels[1])

    # stage 3
    net = resnet_basic_inc(net, channels[2])
    net = resnet_basic_stack(net, num_stack_layers-1, channels[2])

    # Global average pooling and output
    pooled = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(net)
    return Dense(num_classes, init=normal(0.01))(pooled)
def create_cifar10_model(input, num_stack_layers, num_classes):
    """Assemble a basic ResNet classifier for CIFAR-10.

    Args:
        input: input image tensor.
        num_stack_layers: number of basic blocks stacked in the first stage;
            later stages use one block fewer plus a downsampling block.
        num_classes: number of output classes of the final Dense layer.

    Returns:
        The class-score output node.
    """
    c_map = [16, 32, 64]

    conv = conv_bn_relu(input, (3, 3), c_map[0])
    r1 = resnet_basic_stack(conv, num_stack_layers, c_map[0])

    # each later stage starts with a downsampling "inc" block
    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers - 1, c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers - 1, c_map[2])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(r3_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
def create_imagenet_model_basic(input, num_stack_layers, num_classes):
    """Assemble a basic-block ResNet classifier for ImageNet.

    Args:
        input: input image tensor.
        num_stack_layers: per-stage counts of stacked basic blocks (4 entries).
        num_classes: number of output classes of the final Dense layer.

    Returns:
        The class-score output node.
    """
    c_map = [64, 128, 256, 512]

    # stem: 7x7 conv followed by max pooling, then the first stage
    net = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
    net = MaxPooling((3, 3), strides=(2, 2), pad=True)(net)
    net = resnet_basic_stack(net, num_stack_layers[0], c_map[0])

    # stages 2-4: each starts with a downsampling "inc" block
    for stage in range(1, 4):
        net = resnet_basic_inc(net, c_map[stage])
        net = resnet_basic_stack(net, num_stack_layers[stage], c_map[stage])

    # Global average pooling and output
    pooled = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(net)
    return Dense(num_classes, init=normal(0.01))(pooled)
def InceptionV4_ResNet_model(input, num_classes):
    """Assemble an Inception-v4/ResNet hybrid classifier.

    Args:
        input: input image tensor.
        num_classes: number of output classes of the final Dense layer.

    Returns:
        The class-score output node.
    """
    stem = pre_block(input)

    # 32 x 32 x 128
    res_a = Res_A_stack(stem, 5, 64, 128)
    red_a = reduction_A(res_a, 32, 32, 64, 64)

    # 16 x 16 x 224
    res_b = Res_B_stack(red_a, 10, 128, 224)

    # 16 x 16 x 448
    red_b = reduction_B(res_b, 32, 64, 32, 64, 128)

    # 8 x 8 x 480
    res_c = Res_C_stack(red_b, 5, 192, 480)

    # Global average pooling and output
    pooled = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(res_c)
    return Dense(num_classes, init=normal(0.01))(pooled)
def Inception_ResNet_v2_model(input, num_classes):
    """Assemble an Inception-ResNet-v2-style classifier.

    Args:
        input: input image tensor.
        num_classes: number of output classes of the final Dense layer.

    Returns:
        The class-score output node.
    """
    stem = pre_block(input)

    # 32 x 32 x 128
    res_a = Res_A_stack(stem, 5, 32, 32, 32, 48, 64, 128)

    # 32 x 32 x 128
    red_a = reduction_A(res_a, 64, 32, 64, 64)

    # 16 x 16 x 256
    res_b = Res_B_stack(red_a, 10, 128, 64, 96, 128, 256)

    # 16 x 16 x 256
    red_b = reduction_B(res_b, 32, 64, 32, 64, 32, 64, 128)

    # 8 x 8 x 512
    res_c = Res_C_stack(red_b, 5, 128, 64, 128, 256, 512)

    # Global average pooling and output
    pooled = AveragePooling(filter_shape=(8, 8))(res_c)
    return Dense(num_classes, init=normal(0.01))(pooled)
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.
        conv_bias_init: initial bias value for the RPN convolution layers.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''
    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    # NOTE(review): reads module-level cfg (cfg["CNTK"], cfg["TRAIN"]) — confirm it is in scope.
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                               init=normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 2(bg/fg) * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score,
                                 (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]),
                                 name="rpn_cls_score_rshp")
    # wrap the softmax in an as_block so it appears as a single named node in the graph
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer: converts anchor scores + box deltas into ROI proposals
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                               param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]            # per-anchor labels: 1=fg, 0=bg, -1=ignore
        rpn_bbox_targets = atl.outputs[1]      # regression targets per anchor
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        # anchors with label < 0 are excluded ("ignore") from the cls loss
        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                             [(p_rpn_labels, rpn_labels),
                                              (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                             'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, p_rpn_bbox_pred,
                                     p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                              [(p_rpn_bbox_pred, rpn_bbox_pred),
                                               (p_rpn_bbox_targets, rpn_bbox_targets),
                                               (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                              'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True, proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''
    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    rpn_conv_3x3 = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                               init = normal(scale=0.01), init_bias=0.1)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 2(bg/fg) * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # proposal layer: converts anchor scores + box deltas into ROI proposals
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                               param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]            # per-anchor labels: 1=fg, 0=bg, -1=ignore
        rpn_bbox_targets = atl.outputs[1]      # regression targets per anchor
        rpn_bbox_inside_weights = atl.outputs[2]

        # For loss functions: ignore label predictions for the 'ignore label',
        # i.e. set target and prediction to 0 --> needs to be softmaxed before
        rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
        ignore = user_function(IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
        rpn_cls_prob_ignore = ignore.outputs[0]
        fg_targets = ignore.outputs[1]
        bg_targets = 1 - fg_targets
        rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

        # RPN losses
        rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore, rpn_labels_ignore, axis=0)
        rpn_loss_bbox = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
        rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox), name="rpn_losses")

    return rpn_rois, rpn_losses
def create_rpn(conv_out, scaled_gt_boxes, im_info, cfg, add_loss_functions=True):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        cfg:             The configuration dictionary
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''
    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    num_channels = cfg["MODEL"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                               init=normal(scale=0.01), init_bias=0.0)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=0.0)(rpn_conv_3x3)  # 2(bg/fg) * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=0.0)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score,
                                 (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]),
                                 name="rpn_cls_score_rshp")
    # wrap the softmax in an as_block so it appears as a single named node in the graph
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer: converts anchor scores + box deltas into ROI proposals
    rpn_rois = create_proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg)

    rpn_losses = None
    if (add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        proposal_layer_params = "'feat_stride': {}\n'scales':\n - {}".\
            format(cfg["MODEL"].FEATURE_STRIDE,
                   "\n - ".join([str(v) for v in cfg["DATA"].PROPOSAL_LAYER_SCALES]))
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                              rpn_batch_size=cfg["TRAIN"].RPN_BATCHSIZE,
                                              rpn_fg_fraction=cfg["TRAIN"].RPN_FG_FRACTION,
                                              clobber_positives=cfg["TRAIN"].RPN_CLOBBER_POSITIVES,
                                              positive_overlap=cfg["TRAIN"].RPN_POSITIVE_OVERLAP,
                                              negative_overlap=cfg["TRAIN"].RPN_NEGATIVE_OVERLAP,
                                              param_str=proposal_layer_params))
        rpn_labels = atl.outputs[0]            # per-anchor labels: 1=fg, 0=bg, -1=ignore
        rpn_bbox_targets = atl.outputs[1]      # regression targets per anchor
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        # anchors with label < 0 are excluded ("ignore") from the cls loss
        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                             [(p_rpn_labels, rpn_labels),
                                              (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                             'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        # NOTE(review): cfg.SIGMA_RPN_L1 uses attribute access while every other
        # cfg read here uses cfg["SECTION"].KEY — confirm cfg supports both
        # (e.g. an EasyDict) or whether this should be a section lookup.
        rpn_loss_bbox = SmoothL1Loss(cfg.SIGMA_RPN_L1, p_rpn_bbox_pred,
                                     p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                              [(p_rpn_bbox_pred, rpn_bbox_pred),
                                               (p_rpn_bbox_targets, rpn_bbox_targets),
                                               (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                              'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''
    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    rpn_conv_3x3 = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                               init=normal(scale=0.01), init_bias=0.1)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 2(bg/fg) * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # proposal layer: converts anchor scores + box deltas into ROI proposals
    rpn_rois_raw = user_function(
        ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                      param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if (add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(
            AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]            # per-anchor labels: 1=fg, 0=bg, -1=ignore
        rpn_bbox_targets = atl.outputs[1]      # regression targets per anchor
        rpn_bbox_inside_weights = atl.outputs[2]

        # For loss functions: ignore label predictions for the 'ignore label',
        # i.e. set target and prediction to 0 --> needs to be softmaxed before
        rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
        ignore = user_function(
            IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
        rpn_cls_prob_ignore = ignore.outputs[0]
        fg_targets = ignore.outputs[1]
        bg_targets = 1 - fg_targets
        rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

        # RPN losses
        rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore, rpn_labels_ignore, axis=0)
        rpn_loss_bbox = user_function(
            SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
        rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox), name="rpn_losses")

    return rpn_rois, rpn_losses