Code example #1
def dronenet(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],              # one list per predictor layer
                                     [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                     [1.0, 2.0, 0.5]],             # n_predictor_layers = 4 below
            two_boxes_for_ar1=True,
            steps=None,  # if None, the anchor-box layers infer the steps from the feature map sizes
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False,
            bottleneck=True,
            reduction=0.0,
            dropout_rate=None,
            weight_decay=1e-4):


    n_predictor_layers = 4 # The number of predictor conv layers in this network (the original SSD300 uses 6).
    n_classes += 1 # Account for the background class.
    l2_reg = l2_regularization # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
    if scales:
        if len(scales) != n_predictor_layers+1:
            raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales)))
    else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
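        # For example, min_scale=0.1 and max_scale=0.9 with 4 predictor layers
        # yield scales = [0.1, 0.3, 0.5, 0.7, 0.9] (n_predictor_layers + 1 values).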

    if len(variances) != 4:
        raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError("All variances must be >0, but the variances given are {}".format(variances))

    if (not (steps is None)) and (len(steps) != n_predictor_layers):
        raise ValueError("You must provide exactly one step value per predictor layer.")

    if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
        raise ValueError("You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)
    if not (subtract_mean is None):
        x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1)
    if not (divide_by_stddev is None):
        x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1)




    def dense_block(prevDense, stage, nb_layers, nb_filter, growth_rate, bottleneck=True, dropout_rate=None, weight_decay=1e-4,
                    grow_nb_filters=True):
        ''' Build a dense block in which the output of each conv_block is concatenated with its input and fed to the subsequent ones.
            # Arguments
                prevDense: input tensor
                stage: index of the dense block
                nb_layers: the number of conv_block layers to append to the model
                nb_filter: number of filters
                growth_rate: growth rate
                bottleneck: whether to include a bottleneck 1x1 convolution in each conv_block
                dropout_rate: dropout rate
                weight_decay: weight decay factor
                grow_nb_filters: flag deciding whether the number of filters is allowed to grow
        '''

        for i in range(nb_layers):
            branch = i + 1
            dense = conv_block(prevDense, stage, branch, growth_rate, bottleneck, dropout_rate, weight_decay)
            #print('layer', stage, branch, nb_filter, prevDense.shape)
            prevDense = concatenate([prevDense, dense], axis=3, name='concat_'+str(stage)+'_'+str(branch))
            #print('concate', stage, nb_filter, prevDense.shape)

            if grow_nb_filters:
                nb_filter += growth_rate
        print('dense', stage, nb_filter, prevDense.shape)
        return prevDense, nb_filter




    def conv_block(prevConv, stage, branch, nb_filter, bottleneck=True, dropout_rate=None, weight_decay=1e-4):
        '''Apply BatchNorm, ReLU, an optional bottleneck 1x1 Conv2D, a 3x3 Conv2D, and optional dropout.
            # Arguments
                prevConv: input tensor
                stage: index of the dense block
                branch: index of the conv_block within the dense block
                nb_filter: number of filters
                bottleneck: whether to include the bottleneck 1x1 convolution
                dropout_rate: dropout rate
                weight_decay: weight decay factor
        '''
        eps = 1.1e-5
        conv_name_base = 'conv' + str(stage) + '_' + str(branch)
        relu_name_base = 'relu' + str(stage) + '_' + str(branch)

        prevConv = BatchNormalization(epsilon=eps, axis=3, name=conv_name_base+'_x1_bn')(prevConv)
        prevConv = Activation('relu', name=relu_name_base+'_x1')(prevConv)

        if bottleneck:
            inter_channel = nb_filter * 4  # Obtained from https://github.com/liuzhuang13/DenseNet/blob/master/densenet.lua
            prevConv = Conv2D(inter_channel, (1, 1), kernel_initializer='he_normal',
                          padding='same', use_bias=False, kernel_regularizer=l2(weight_decay), name=conv_name_base+'_x1')(prevConv)

            if dropout_rate:
                prevConv = Dropout(dropout_rate)(prevConv)

        prevConv = BatchNormalization(epsilon=eps, axis=3, name=conv_name_base+'_x2_bn')(prevConv)
        prevConv = Activation('relu', name=relu_name_base+'_x2')(prevConv)
        prevConv = Conv2D(nb_filter, (3, 3), kernel_initializer='he_normal', padding='same', use_bias=False, name=conv_name_base+'_x2')(prevConv)

        if dropout_rate:
            prevConv = Dropout(dropout_rate)(prevConv)

        return prevConv



    def transition_block(prevTran, stage, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
        ''' Apply BatchNorm, ReLU, a 1x1 convolution with optional compression, and optional dropout.
            (Unlike the original DenseNet transition block, the pooling is applied
            separately after each transition block where needed.)
            # Arguments
                prevTran: input tensor
                stage: index of the dense block
                nb_filter: number of filters
                compression: calculated as 1 - reduction; reduces the number of feature maps in the transition block
                dropout_rate: dropout rate
                weight_decay: weight decay factor
        '''

        eps = 1.1e-5
        conv_name_base = 'conv' + str(stage) + '_blk'
        relu_name_base = 'relu' + str(stage) + '_blk'
        pool_name_base = 'poolD' + str(stage)

        prevTran = BatchNormalization(epsilon=eps, axis=3, name=conv_name_base+'_bn')(prevTran)
        prevTran = Activation('relu', name=relu_name_base)(prevTran)
        prevTran = Conv2D(int(nb_filter * compression), (1, 1), activation='relu', kernel_initializer='he_normal',
                          padding='same', use_bias=False, kernel_regularizer=l2(weight_decay), name=conv_name_base)(prevTran)

        if dropout_rate:
            prevTran = Dropout(dropout_rate)(prevTran)

        # if stage !=3:
        #     prevTran = MaxPooling2D(pool_size=(2, 2), name=pool_name_base)(prevTran)
        print('tran', stage, prevTran.shape)
        return prevTran



    # DenseNet Parameters

    eps = 1.1e-5
    nb_filter = 64
    t_nb_filter = 256
    growth_rate = 32
    nb_layers = [5, 7, 7, 7]
    compression = 1.0 - reduction
    conv1_1 = Conv2D(64, (3, 3), strides=(2, 2), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1)
    conv1_2 = BatchNormalization(epsilon=eps, axis=3, name='conv1_2_bn')(conv1_1)
    conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_2)
    conv1_3 = BatchNormalization(epsilon=eps, axis=3, name='conv1_3_bn')(conv1_2)
    conv1_3 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_3')(conv1_3)
    pool1_3 = AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1_3')(conv1_3)

    # Add dense block 1
    stage = 1
    conv1, nb_filter = dense_block(pool1_3, stage, nb_layers[0], nb_filter, growth_rate, bottleneck=bottleneck,
                                   dropout_rate=None, weight_decay=weight_decay)
    trans1 = transition_block(conv1, stage, t_nb_filter, compression=compression, weight_decay=weight_decay)
    pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(trans1)
    nb_filter = int(nb_filter * compression)


    # Add dense block 2
    stage = 2
    conv2, nb_filter = dense_block(pool1, stage, nb_layers[1], nb_filter, growth_rate, bottleneck=bottleneck,
                                   dropout_rate=None, weight_decay=weight_decay)
    trans2 = transition_block(conv2, stage, t_nb_filter, compression=compression, weight_decay=weight_decay)
    pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(trans2)
    nb_filter = int(nb_filter * compression)


    # Add dense block 3
    stage = 3
    conv3, nb_filter = dense_block(pool2, stage, nb_layers[2], nb_filter, growth_rate, bottleneck=bottleneck,
                                   dropout_rate=None, weight_decay=weight_decay)
    trans3 = transition_block(conv3, stage, t_nb_filter, compression=compression, weight_decay=weight_decay)
    pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(trans3)
    nb_filter = int(nb_filter * compression)


    # Add dense block 4
    stage = 4
    conv4, nb_filter = dense_block(pool3, stage, nb_layers[3], nb_filter, 26,  # growth rate hard-coded to 26 for this block
                                   bottleneck=bottleneck, dropout_rate=None, weight_decay=weight_decay)
    trans4 = transition_block(conv4, stage, t_nb_filter, compression=compression, weight_decay=weight_decay)
    #pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(trans4)
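
    # Top-down feature fusion in the style of a Feature Pyramid Network (FPN):
    # the deepest feature map (trans4) is reduced to 256 channels (M5), then
    # repeatedly upsampled 2x and added to the matching transition-block outputs
    # (trans3, trans2, trans1), yielding predictor inputs M5..M2 at increasing
    # spatial resolution.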




    M5 = BatchNormalization(epsilon=eps, axis=3, name='m5_bn1')(trans4)

    M5 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='M5P')(M5)

    M4 = UpSampling2D(size=(2, 2))(M5)

    # M4E = ZeroPadding2D(padding=((0, 1), (1, 0)), name='trans3_padding')(trans3)

    M4 = Add()([M4, trans3])

    M4 = BatchNormalization(epsilon=eps, axis=3, name='M4_bn')(M4)

    M4 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='M4P')(M4)

    M3 = UpSampling2D(size=(2, 2))(M4)

    # M3E = ZeroPadding2D(padding=((1, 1), (1, 1)), name='trans3_padding')(trans2)

    M3 = Add()([M3, trans2])

    M3 = BatchNormalization(epsilon=eps, axis=3, name='M3_bn')(M3)

    M3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='M3P')(M3)

    M2 = UpSampling2D(size=(2, 2))(M3)

    # M2E = ZeroPadding2D(padding=((2, 2), (2, 2)), name='trans3_padding')(trans1)

    M2 = Add()([M2, trans1])

    M2 = BatchNormalization(epsilon=eps, axis=3, name='M2_bn')(M2)

    M2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='M2P')(M2)

    print('M5', M5.shape)
    print('M4', M4.shape)
    print('M3', M3.shape)
    print('M2', M2.shape)
    print()


    ### Build the convolutional predictor layers on top of the base network
    # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    # conv6_2_mbox = BatchNormalization(epsilon=eps, axis=3, name='conv6_2_mbox_conf_bn1')(M2)
    # conv6_2_mbox = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf1')(conv6_2_mbox)
    conv6_2_mbox_conf = BatchNormalization(epsilon=eps, axis=3, name='conv6_2_mbox_conf_bn2')(M2)
    conv6_2_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf2')(conv6_2_mbox_conf)

    # conv7_2_mbox = BatchNormalization(epsilon=eps, axis=3, name='conv7_2_mbox_conf_bn1')(M3)
    # conv7_2_mbox = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf1')(conv7_2_mbox)
    conv7_2_mbox_conf = BatchNormalization(epsilon=eps, axis=3, name='conv7_2_mbox_conf_bn2')(M3)
    conv7_2_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf2')(conv7_2_mbox_conf)

    # conv8_2_mbox = BatchNormalization(epsilon=eps, axis=3, name='conv8_2_mbox_conf_bn1')(M4)
    # conv8_2_mbox = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf1')(conv8_2_mbox)
    conv8_2_mbox_conf = BatchNormalization(epsilon=eps, axis=3, name='conv8_2_mbox_conf_bn2')(M4)
    conv8_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf2')(conv8_2_mbox_conf)

    # conv9_2_mbox = BatchNormalization(epsilon=eps, axis=3, name='conv9_2_mbox_conf_bn1')(M5)
    # conv9_2_mbox = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf1')(conv9_2_mbox)
    conv9_2_mbox_conf = BatchNormalization(epsilon=eps, axis=3, name='conv9_2_mbox_conf_bn2')(M5)
    conv9_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf2')(conv9_2_mbox_conf)


    print('conv6_2_mbox_conf', conv6_2_mbox_conf.shape)
    print('conv7_2_mbox_conf', conv7_2_mbox_conf.shape)
    print('conv8_2_mbox_conf', conv8_2_mbox_conf.shape)
    print('conv9_2_mbox_conf', conv9_2_mbox_conf.shape)
    print()

    # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv6_2_mbox_loc = BatchNormalization(epsilon=eps, axis=3, name='conv6_2_mbox_loc_bn')(M2)
    conv6_2_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2_mbox_loc)
    conv7_2_mbox_loc = BatchNormalization(epsilon=eps, axis=3, name='conv7_2_mbox_loc_bn')(M3)
    conv7_2_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2_mbox_loc)
    conv8_2_mbox_loc = BatchNormalization(epsilon=eps, axis=3, name='conv8_2_mbox_loc_bn')(M4)
    conv8_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2_mbox_loc)
    conv9_2_mbox_loc = BatchNormalization(epsilon=eps, axis=3, name='conv9_2_mbox_loc_bn')(M5)
    conv9_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2_mbox_loc)


    print('conv6_2_mbox_loc', conv6_2_mbox_loc.shape)
    print('conv7_2_mbox_loc', conv7_2_mbox_loc.shape)
    print('conv8_2_mbox_loc', conv8_2_mbox_loc.shape)
    print('conv9_2_mbox_loc', conv9_2_mbox_loc.shape)
    print()

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)
    conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)


    print('conv6_2_mbox_priorbox', conv6_2_mbox_priorbox.shape)
    print('conv7_2_mbox_priorbox', conv7_2_mbox_priorbox.shape)
    print('conv8_2_mbox_priorbox', conv8_2_mbox_priorbox.shape)
    print('conv9_2_mbox_priorbox', conv9_2_mbox_priorbox.shape)
    print()

    ### Reshape
    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them

    conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)


    print('conv6_2_mbox_conf_reshape', conv6_2_mbox_conf_reshape.shape)
    print('conv7_2_mbox_conf_reshape', conv7_2_mbox_conf_reshape.shape)
    print('conv8_2_mbox_conf_reshape', conv8_2_mbox_conf_reshape.shape)
    print('conv9_2_mbox_conf_reshape', conv9_2_mbox_conf_reshape.shape)
    print()

    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss

    conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)


    print('conv6_2_mbox_loc_reshape', conv6_2_mbox_loc_reshape.shape)
    print('conv7_2_mbox_loc_reshape', conv7_2_mbox_loc_reshape.shape)
    print('conv8_2_mbox_loc_reshape', conv8_2_mbox_loc_reshape.shape)
    print('conv9_2_mbox_loc_reshape', conv9_2_mbox_loc_reshape.shape)
    print()

    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`

    conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)


    print('conv6_2_mbox_priorbox_reshape', conv6_2_mbox_priorbox_reshape.shape)
    print('conv7_2_mbox_priorbox_reshape', conv7_2_mbox_priorbox_reshape.shape)
    print('conv8_2_mbox_priorbox_reshape', conv8_2_mbox_priorbox_reshape.shape)
    print('conv9_2_mbox_priorbox_reshape', conv9_2_mbox_priorbox_reshape.shape)
    print()

    ### Concatenate the predictions from the different layers

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv6_2_mbox_conf_reshape,
                                                       conv7_2_mbox_conf_reshape,
                                                       conv8_2_mbox_conf_reshape,
                                                       conv9_2_mbox_conf_reshape])

    print('mbox_conf', mbox_conf.shape)
    print()

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv6_2_mbox_loc_reshape,
                                                     conv7_2_mbox_loc_reshape,
                                                     conv8_2_mbox_loc_reshape,
                                                     conv9_2_mbox_loc_reshape])

    print('mbox_loc', mbox_loc.shape)
    print()

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv6_2_mbox_priorbox_reshape,
                                                               conv7_2_mbox_priorbox_reshape,
                                                               conv8_2_mbox_priorbox_reshape,
                                                               conv9_2_mbox_priorbox_reshape])

    print('mbox_priorbox', mbox_priorbox.shape)
    print()

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf)

    print('mbox_conf_softmax', mbox_conf_softmax.shape)
    print()

    # Concatenate the class and box predictions and the anchors to one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox])

    print('predictions', predictions.shape)

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
                                               iou_threshold=iou_threshold,
                                               top_k=top_k,
                                               nms_max_output_size=nms_max_output_size,
                                               coords=coords,
                                               normalize_coords=normalize_coords,
                                               img_height=img_height,
                                               img_width=img_width,
                                               name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh,
                                                   iou_threshold=iou_threshold,
                                                   top_k=top_k,
                                                   nms_max_output_size=nms_max_output_size,
                                                   coords=coords,
                                                   normalize_coords=normalize_coords,
                                                   img_height=img_height,
                                                   img_width=img_width,
                                                   name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([ conv6_2_mbox_conf._keras_shape[1:3],
                                     conv7_2_mbox_conf._keras_shape[1:3],
                                     conv8_2_mbox_conf._keras_shape[1:3],
                                     conv9_2_mbox_conf._keras_shape[1:3]])
        return model, predictor_sizes
    else:
        return model
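
A minimal usage sketch for the model above (not part of the original listing). The import paths for the custom layers are assumptions based on the pierluigiferrari/ssd_keras project layout that this code appears to follow; `scales` is passed explicitly because `min_scale`/`max_scale` default to None.

# Hedged usage sketch: the custom-layer import paths below are assumptions.
import numpy as np
from keras.models import Model
from keras.layers import (Input, Lambda, Conv2D, BatchNormalization, Activation,
                          Dropout, MaxPooling2D, AveragePooling2D, UpSampling2D,
                          Add, Reshape, Concatenate, concatenate)
from keras.regularizers import l2
from keras import backend as K
# from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes                   # assumed path
# from keras_layers.keras_layer_DecodeDetections import DecodeDetections         # assumed path
# from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast # assumed path

model = dronenet(image_size=(300, 300, 3),
                 n_classes=1,       # one foreground class; the background class is added internally
                 mode='training',
                 scales=[0.1, 0.3, 0.5, 0.7, 0.9])  # n_predictor_layers + 1 = 5 values
model.summary()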
Code example #2
    def __init__(self, kwargs):
        # `kwargs` is a plain dict of configuration parameters;
        # all allowed keys will be initialized as class attributes.
        required_keys = set(['image_size', 'n_classes', 'mode','l2_regularization', 'min_scale', 'max_scale', 'scales','n_predictor_layers','training_info_path',\
                         'aspect_ratios_global','aspect_ratios_per_layer', 'two_boxes_for_ar1', 'steps', 'offsets', 'clip_boxes',\
                         'variances', 'coords','normalize_coords', 'subtract_mean','divide_by_stddev','swap_channels','confidence_thresh',\
                         'iou_threshold', 'top_k', 'nms_max_output_size', 'return_predictor_sizes','build_base_model'])

        # Initialize all allowed keys to None.
        self.__dict__.update((key, None) for key in required_keys)

        # and update the given keys by their given values
        self.__dict__.update((key, value) for key, value in kwargs.items()
                             if key in required_keys)

        self.params = kwargs

        # Some evaluation of the input values and re-calculations, if required.
        self.init_params()

        # At the end, all build information will be stored/provided in `self.models_info`.
        self.models_info = {'training':None, 'inference':None, 'inference_fast':None,\
                            'simple_model':None, 'predictor_sizes':None}

        # The input placeholder.
        x = Input(shape=(self.img_height, self.img_width, self.img_channels))
        '''
        Model constructors:

            - input_layer_constructor  : normalizes the input or swaps the channels, if required.
            - feature_maps_constructor : builds the network structure through which the features are extracted.
            - predictors_constructor   : builds the network structure through which the prediction tensors are created.
            - anchors_constructor      : creates all default anchors according to the network structure.
            - output_constructor       : assembles the final predictions tensor.

        '''
        input_layer_constructor = construct_input_layer(self.params)
        feature_maps_constructor = construct_feature_maps(self.params)
        predictors_constructor = construct_predictors(self.params)
        anchors_constructor = construct_default_anchors(self.params)
        output_constructor = construct_model_output(self.params)

        # Keep references to the custom constructors.
        self.costume_input_layer = input_layer_constructor
        self.costume_feature_maps = feature_maps_constructor
        self.costume_predictors = predictors_constructor
        self.costume_anchors = anchors_constructor
        self.costume_output = output_constructor

        # main model construction
        x1 = input_layer_constructor(x)
        features = feature_maps_constructor(x1)
        classes, bboxes = predictors_constructor(features)
        anchors = anchors_constructor(bboxes)
        predictions = output_constructor(classes, bboxes, anchors)

        if self.mode == 'training':
            self.main_model = Model(inputs=x, outputs=predictions)

        if self.mode == 'inference':
            decoded_predictions = DecodeDetections(
                confidence_thresh=self.confidence_thresh,
                iou_threshold=self.iou_threshold,
                top_k=self.top_k,
                nms_max_output_size=self.nms_max_output_size,
                coords=self.coords,
                normalize_coords=self.normalize_coords,
                img_height=self.img_height,
                img_width=self.img_width,
                name='decoded_predictions')(predictions)
            self.main_model = Model(inputs=x, outputs=decoded_predictions)

        if self.mode == 'inference_fast':
            decoded_predictions = DecodeDetectionsFast(
                confidence_thresh=self.confidence_thresh,
                iou_threshold=self.iou_threshold,
                top_k=self.top_k,
                nms_max_output_size=self.nms_max_output_size,
                coords=self.coords,
                normalize_coords=self.normalize_coords,
                img_height=self.img_height,
                img_width=self.img_width,
                name='decoded_predictions')(predictions)
            self.main_model = Model(inputs=x, outputs=decoded_predictions)

        # build model
        if self.build_base_model:
            #######################################################################
            # Construct the simple base model: feature maps and raw predictors only.
            base_model_input = Input(shape=(self.img_height, self.img_width,
                                            self.img_channels))
            base_features = feature_maps_constructor(base_model_input)
            base_classes, base_bboxes = predictors_constructor(base_features)
            self.base_model = Model(inputs=base_model_input,
                                    outputs=base_classes + base_bboxes)

        # build predictor sizes
        if self.return_predictor_sizes:
            # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
            predictors_sizes = [
                class_predictor._keras_shape[1:3]
                for class_predictor in classes
            ]
            self.predictor_sizes = np.array(predictors_sizes)
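
A hypothetical instantiation of the class this `__init__` belongs to. The class name `SSDModelBuilder` and the parameter values are assumptions; the sketch only illustrates that the configuration is passed as a plain dict (note the signature takes `kwargs` positionally, not as `**kwargs`).

# Hypothetical usage; the class name and parameter values are assumptions.
params = {
    'image_size': (300, 300, 3),
    'n_classes': 2,
    'mode': 'training',
    'scales': [0.1, 0.3, 0.5, 0.7, 0.9],
    'aspect_ratios_global': [0.5, 1.0, 2.0],
    'two_boxes_for_ar1': True,
    'variances': [0.1, 0.1, 0.2, 0.2],
    'normalize_coords': True,
    'build_base_model': False,
    'return_predictor_sizes': True,
}
builder = SSDModelBuilder(params)          # the dict is passed positionally
training_model = builder.main_model
predictor_sizes = builder.predictor_sizes  # set because return_predictor_sizes=True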
Code example #3
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False):

    n_predictor_layers = 4 # The number of predictor conv layers in the network
    n_classes += 1 # Account for the background class.
    l2_reg = l2_regularization # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
    if scales:
        if len(scales) != n_predictor_layers+1:
            raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales)))
    else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)

    if len(variances) != 4: # We need one variance value for each of the four box coordinates
        raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError("All variances must be >0, but the variances given are {}".format(variances))

    if (not (steps is None)) and (len(steps) != n_predictor_layers):
        raise ValueError("You must provide exactly one step value per predictor layer.")

    if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
        raise ValueError("You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)
    if not (subtract_mean is None):
        x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1)
    if not (divide_by_stddev is None):
        x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1)
    if mode == 'hardware':
        x1 = x  # in 'hardware' mode, bypass the preprocessing Lambda layers entirely
    conv1 = Conv2D(32, (5, 5), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1')(x1)
    conv1 = BatchNormalization(axis=3, momentum=0.99, name='bn1')(conv1) # axis=3 because the data format is channels-last: (batch, height, width, channels)
    # conv1 = ELU(name='elu1')(conv1)
    conv1 = LeakyReLU(0.125)(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2), name='pool1')(conv1)
    #48
    conv2 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2')(pool1)
    conv2 = BatchNormalization(axis=3, momentum=0.99, name='bn2')(conv2)
    conv2 = LeakyReLU(0.125)(conv2)
    conv2 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2)
    conv2 = BatchNormalization(axis=3, momentum=0.99, name='bn2_2')(conv2)
    conv2 = LeakyReLU(0.125)(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2), name='pool2')(conv2)
    # 64
    conv3 = Conv2D(128, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3')(pool2)
    conv3 = BatchNormalization(axis=3, momentum=0.99, name='bn3')(conv3)
    conv3 = LeakyReLU(0.125)(conv3)
    conv3 = Conv2D(128, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3)
    conv3 = BatchNormalization(axis=3, momentum=0.99, name='bn3_2')(conv3)
    conv3 = LeakyReLU(0.125)(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2), name='pool3')(conv3)
    # 64
    conv4 = Conv2D(256, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4')(pool3)
    conv4 = BatchNormalization(axis=3, momentum=0.99, name='bn4')(conv4)
    conv4 = LeakyReLU(0.125)(conv4)
    conv4 = Conv2D(256, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4)
    conv4 = BatchNormalization(axis=3, momentum=0.99, name='bn4_2')(conv4)
    conv4 = LeakyReLU(0.125)(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2), name='pool4')(conv4)
    # 48
    conv5 = Conv2D(128, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5')(pool4)
    conv5 = BatchNormalization(axis=3, momentum=0.99, name='bn5')(conv5)
    conv5 = LeakyReLU(0.125)(conv5)
    pool5 = MaxPooling2D(pool_size=(2, 2), name='pool5')(conv5)
    # 48
    conv6 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6')(pool5)
    conv6 = BatchNormalization(axis=3, momentum=0.99, name='bn6')(conv6)
    conv6 = LeakyReLU(0.125)(conv6)
#     pool6 = MaxPooling2D(pool_size=(2, 2), name='pool6')(conv6)

    pool6 = conv6
    # 32
    conv7 = Conv2D(32, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7')(pool6)
    conv7 = BatchNormalization(axis=3, momentum=0.99, name='bn7')(conv7)
    conv7 = LeakyReLU(0.125)(conv7)
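
    # At this point the feature map is downsampled by a factor of 32 relative
    # to the input (five 2x2 max-pooling layers; pool6 is disabled above).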

    # The next part is to add the convolutional predictor layers on top of the base network
    # that we defined above. Note that I use the term "base network" differently than the paper does.
    # To me, the base network is everything that is not convolutional predictor layers or anchor
    # box layers. In this case we'll have four predictor layers, but of course you could
    # easily rewrite this into an arbitrarily deep base network and add an arbitrary number of
    # predictor layers on top of the base network by simply following the pattern shown here.

    # Build the convolutional predictor layers on top of conv layers 4, 5, 6, and 7.
    # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization)
    # We predict `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes`
    # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4`
    # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)`
    conv4 = Conv2D(256, (1,1))(conv4)
    conv5 = Conv2D(192, (1,1))(conv5)
    conv6 = Conv2D(128, (1,1))(conv6)
    conv7 = Conv2D(96, (1,1))(conv7)

    classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes4')(conv4)
    classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes5')(conv5)
    classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes6')(conv6)
    classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes7')(conv7)
    # Output shape of `boxes`: `(batch, height, width, n_boxes * 4)`
    boxes4 = Conv2D(n_boxes[0] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes4')(conv4)
    boxes5 = Conv2D(n_boxes[1] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes5')(conv5)
    boxes6 = Conv2D(n_boxes[2] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes6')(conv6)
    boxes7 = Conv2D(n_boxes[3] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes7')(conv7)

    # Generate the anchor boxes
    # Output shape of `anchors`: `(batch, height, width, n_boxes, 8)`
    anchors4 = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0],
                           two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0],
                           clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors4')(boxes4)
    anchors5 = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1],
                           two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1],
                           clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors5')(boxes5)
    anchors6 = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2],
                           two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2],
                           clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors6')(boxes6)
    anchors7 = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3],
                           two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3],
                           clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors7')(boxes7)

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    classes4_reshaped = Reshape((-1, n_classes), name='classes4_reshape')(classes4)
    classes5_reshaped = Reshape((-1, n_classes), name='classes5_reshape')(classes5)
    classes6_reshaped = Reshape((-1, n_classes), name='classes6_reshape')(classes6)
    classes7_reshaped = Reshape((-1, n_classes), name='classes7_reshape')(classes7)
    # Reshape the box coordinate predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes4)
    boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes5)
    boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(boxes6)
    boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(boxes7)
    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4)
    anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5)
    anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6)
    anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7)

    # Concatenate the predictions from the different layers and the associated anchor box tensors
    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1
    # Output shape of `classes_concat`: (batch, n_boxes_total, n_classes)
    classes_concat = Concatenate(axis=1, name='classes_concat')([classes4_reshaped,
                                                                 classes5_reshaped,
                                                                 classes6_reshaped,
                                                                 classes7_reshaped])

    # Output shape of `boxes_concat`: (batch, n_boxes_total, 4)
    boxes_concat = Concatenate(axis=1, name='boxes_concat')([boxes4_reshaped,
                                                             boxes5_reshaped,
                                                             boxes6_reshaped,
                                                             boxes7_reshaped])

    # Output shape of `anchors_concat`: (batch, n_boxes_total, 8)
    anchors_concat = Concatenate(axis=1, name='anchors_concat')([anchors4_reshaped,
                                                                 anchors5_reshaped,
                                                                 anchors6_reshaped,
                                                                 anchors7_reshaped])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    classes_softmax = Activation('softmax', name='classes_softmax')(classes_concat)

    # Concatenate the class and box coordinate predictions and the anchors to one large predictions tensor
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')([classes_softmax, boxes_concat, anchors_concat])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
                                               iou_threshold=iou_threshold,
                                               top_k=top_k,
                                               nms_max_output_size=nms_max_output_size,
                                               coords=coords,
                                               normalize_coords=normalize_coords,
                                               img_height=img_height,
                                               img_width=img_width,
                                               name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh,
                                                   iou_threshold=iou_threshold,
                                                   top_k=top_k,
                                                   nms_max_output_size=nms_max_output_size,
                                                   coords=coords,
                                                   normalize_coords=normalize_coords,
                                                   img_height=img_height,
                                                   img_width=img_width,
                                                   name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'hardware':
        model = Model(inputs=x, outputs=[classes4,classes5,classes6,classes7,boxes4,boxes5,boxes6, boxes7])
    elif mode == 'anchor':
        model = Model(inputs=x, outputs=[anchors4, anchors5, anchors6, anchors7])
    else:
        raise ValueError("`mode` must be one of 'training', 'inference', 'inference_fast', 'hardware' or 'anchor', but received '{}'.".format(mode))

    if return_predictor_sizes:
        # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
        predictor_sizes = np.array([classes4._keras_shape[1:3],
                                    classes5._keras_shape[1:3],
                                    classes6._keras_shape[1:3],
                                    classes7._keras_shape[1:3]])
        return model, predictor_sizes
    else:
        return model
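
A minimal usage sketch for this variant (with the same assumed custom-layer imports as in code example #1). The defaults here are self-consistent, so only `image_size` and `n_classes` are required; the extra 'hardware' and 'anchor' modes expose the raw per-layer tensors.

# Hedged usage sketch; the input size is illustrative.
model = build_model(image_size=(96, 96, 3),
                    n_classes=1,
                    mode='training')   # scales computed from min_scale/max_scale via np.linspace
model.summary()

# 'hardware' mode bypasses input preprocessing and returns the raw per-layer
# class and box tensors instead of a decoded predictions tensor:
hw_model = build_model(image_size=(96, 96, 3), n_classes=1, mode='hardware')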
Code example #4
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0005,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False,
                n_predictor_layers=2):

    # `n_predictor_layers` is the number of predictor conv layers in the network (taken from the parameter above).
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:  # We need one variance value for each of the four box coordinates
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".format(
                len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
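    # e.g. aspect_ratios_global=[0.5, 1.0, 2.0] with two_boxes_for_ar1=True yields 4 boxes per cell on every layer.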

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
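        # Reorders the channel axis; e.g. swap_channels=[2, 1, 0] turns a BGR input into RGB.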
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################
    input_index = 0

    model_input = Input(shape=(img_height, img_width, img_channels),
                        name=f'input_{input_index}')
    # x = Input(shape=(img_height, img_width, img_channels), name=f'input_{input_index}')
    # model_input = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)
    # if not (subtract_mean is None):
    #     model_input = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1)
    # if not (divide_by_stddev is None):
    #     model_input = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1)
    # if swap_channels:
    #     model_input = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1)
    """
    Feature Extraction
    """
    input_conv = Conv2D(32, (3, 3), (2, 2),
                        padding='same',
                        kernel_regularizer=l2(l2_reg),
                        name=f"input_{input_index}_conv")(model_input)
    input_relu = ReLU(name=f"input_{input_index}_relu")(input_conv)

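    # `invResBlock` is defined elsewhere in this project; judging by its expand/squeeze/strides
    # arguments it is presumably a MobileNetV2-style inverted residual block (compare `_bottleneck` in code example #6).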
    block_1 = invResBlock(input_relu,
                          index=1,
                          expand=32,
                          dw_mult=3,
                          squeeze=32,
                          strides=2)
    block_2 = invResBlock(block_1,
                          index=2,
                          expand=32,
                          dw_mult=3,
                          squeeze=64,
                          strides=2)
    block_3 = invResBlock(block_2,
                          index=3,
                          expand=48,
                          dw_mult=3,
                          squeeze=128,
                          strides=2)
    block_4 = invResBlock(block_3,
                          index=4,
                          expand=96,
                          dw_mult=3,
                          squeeze=256,
                          strides=2)
    """
    End of Feature Extraction
    """
    """
    SSD Layers
    """
    classes3 = Conv2D(n_boxes[0] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes4')(block_3)
    classes4 = Conv2D(n_boxes[1] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes5')(block_4)
    # classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes6')(conv6)
    # classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes7')(conv7)

    boxes3 = Conv2D(n_boxes[0] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes4')(block_3)
    boxes4 = Conv2D(n_boxes[1] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes5')(block_4)

    anchors3 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[0],
                           next_scale=scales[1],
                           aspect_ratios=aspect_ratios[0],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[0],
                           this_offsets=offsets[0],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors4')(boxes3)
    anchors4 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[1],
                           next_scale=scales[2],
                           aspect_ratios=aspect_ratios[1],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[1],
                           this_offsets=offsets[1],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors5')(boxes4)

    classes3_reshaped = Reshape((-1, n_classes),
                                name='classes4_reshape')(classes3)
    classes4_reshaped = Reshape((-1, n_classes),
                                name='classes5_reshape')(classes4)

    boxes3_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes3)
    boxes4_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes4)

    anchors3_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors3)
    anchors4_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors4)

    classes_concat = Concatenate(axis=1, name='classes_concat')([
        classes3_reshaped,
        classes4_reshaped,
    ])

    # Output shape of `boxes_concat`: (batch, n_boxes_total, 4)
    boxes_concat = Concatenate(axis=1, name='boxes_concat')([
        boxes3_reshaped,
        boxes4_reshaped,
    ])

    # Output shape of `anchors_concat`: (batch, n_boxes_total, 8)
    anchors_concat = Concatenate(axis=1, name='anchors_concat')([
        anchors3_reshaped,
        anchors4_reshaped,
    ])

    classes_softmax = Softmax(name='classes_softmax')(classes_concat)

    # Concatenate the class and box coordinate predictions and the anchors to one large predictions tensor
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [classes_softmax, boxes_concat, anchors_concat])
    """
    End of SSD Layers
    """

    if mode == 'training':
        model = Model(inputs=model_input, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=model_input, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=model_input, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
        predictor_sizes = np.array(
            [classes3.shape.as_list()[1:3],
             classes4.shape.as_list()[1:3]])
        return model, predictor_sizes
    else:
        return model
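A hypothetical call for this variant (assuming `invResBlock` and the custom `AnchorBoxes`/`DecodeDetections` layers are importable): with the default `n_predictor_layers=2`, `scales` must contain n_predictor_layers + 1 = 3 entries, or be left as None to be derived from `min_scale`/`max_scale`:

model = build_model(image_size=(256, 256, 3),
                    n_classes=3,
                    mode='training',
                    scales=[0.1, 0.4, 0.7],
                    n_predictor_layers=2)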
Code example #5
File: keras_ssd7.py (project: t3pa/autonomous_ros)
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False):
    '''
    Build a Keras model with SSD architecture, see references.

    The model consists of convolutional feature layers and a number of convolutional
    predictor layers that take their input from different feature layers.
    The model is fully convolutional.

    The implementation found here is a smaller version of the original architecture
    used in the paper (where the base network consists of a modified VGG-16 extended
    by a few convolutional feature layers), but of course it could easily be changed to
    an arbitrarily large SSD architecture by following the general design pattern used here.
    This implementation has 7 convolutional layers and 4 convolutional predictor
    layers that take their input from layers 4, 5, 6, and 7, respectively.

    Most of the arguments that this function takes are only needed for the anchor
    box layers. In case you're training the network, the parameters passed here must
    be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
    trained weights, the parameters passed here must be the same as the ones used
    to produce the trained weights.

    Some of these arguments are explained in more detail in the documentation of the
    `SSDBoxEncoder` class.

    Note: Requires Keras v2.0 or later. Training currently works only with the
    TensorFlow backend (v1.0 or later).

    Arguments:
        image_size (tuple): The input image size in the format `(height, width, channels)`.
        n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
        mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
            the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
            the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
            'inference' follows the exact procedure of the original Caffe implementation, while
            'inference_fast' uses a faster prediction decoding procedure.
        l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
        min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images.
        max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images. All scaling factors between the smallest and the
            largest will be linearly interpolated. Note that the second to last of the linearly interpolated
            scaling factors will actually be the scaling factor for the last predictor layer, while the last
            scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
            if `two_boxes_for_ar1` is `True`.
        scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
            This list must be one element longer than the number of predictor layers. The first `k` elements are the
            scaling factors for the `k` predictor layers, while the last element is used for the second box
            for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
            last scaling factor must be passed either way, even if it is not being used. If a list is passed,
            this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
        aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
            generated. This list is valid for all predictor layers. The original implementation uses more aspect ratios
            for some predictor layers and fewer for others. If you want to do that, too, then use the next argument instead.
        aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each predictor layer.
            This allows you to set the aspect ratios for each predictor layer individually. If a list is passed,
            it overrides `aspect_ratios_global`.
        two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
            If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
            using the scaling factor for the respective layer, the second one will be generated using the
            geometric mean of said scaling factor and the next bigger scaling factor.
        steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
            pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
            the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
            If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
            If no steps are provided, then they will be computed such that the anchor box center points will form an
            equidistant grid within the image dimensions.
        offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either floats or tuples of two floats. These numbers represent for each predictor layer how many
            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
            as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
            of the step size specified in the `steps` argument. If the list contains floats, then that value will
            be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
            `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size,
            which is also the recommended setting.
        clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
        variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
            its respective variance value.
        coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
            of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
            and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
            i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
        subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
            of any shape that is broadcast-compatible with the image shape. The elements of this array will be
            subtracted from the image pixel intensity values. For example, pass a list of three integers
            to perform per-channel mean normalization for color images.
        divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
            floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
            intensity values will be divided by the elements of this array. For example, pass a list
            of three integers to perform per-channel standard deviation normalization for color images.
        swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
            image channels should be swapped.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box's confidence score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
        return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
            a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
            you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
            to get them this way. They are only relevant for training anyway (`SSDBoxEncoder` needs to know the
            spatial dimensions of the predictor layers); for inference you don't need them.

    Returns:
        model: The Keras SSD model.
        predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
            of the output tensor shape for each convolutional predictor layer. During
            training, the generator function needs this in order to transform
            the ground truth labels into tensors of identical structure as the
            output tensors of the model, which is in turn needed for the cost
            function.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    n_predictor_layers = 4  # The number of predictor conv layers in the network
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:  # We need one variance value for each of the four box coordinates
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    conv1 = Conv2D(32, (5, 5),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv1')(x1)
    conv1 = BatchNormalization(axis=3, momentum=0.99, name='bn1')(conv1)  # Keras defaults to the channels-last data format [batch, height, width, channels], hence axis=3
    conv1 = ELU(name='elu1')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2), name='pool1')(conv1)

    conv2 = Conv2D(48, (3, 3),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv2')(pool1)
    conv2 = BatchNormalization(axis=3, momentum=0.99, name='bn2')(conv2)
    conv2 = ELU(name='elu2')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2), name='pool2')(conv2)

    conv3 = Conv2D(64, (3, 3),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv3')(pool2)
    conv3 = BatchNormalization(axis=3, momentum=0.99, name='bn3')(conv3)
    conv3 = ELU(name='elu3')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2), name='pool3')(conv3)

    conv4 = Conv2D(64, (3, 3),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv4')(pool3)
    conv4 = BatchNormalization(axis=3, momentum=0.99, name='bn4')(conv4)
    conv4 = ELU(name='elu4')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2), name='pool4')(conv4)

    conv5 = Conv2D(48, (3, 3),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv5')(pool4)
    conv5 = BatchNormalization(axis=3, momentum=0.99, name='bn5')(conv5)
    conv5 = ELU(name='elu5')(conv5)
    pool5 = MaxPooling2D(pool_size=(2, 2), name='pool5')(conv5)

    conv6 = Conv2D(48, (3, 3),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv6')(pool5)
    conv6 = BatchNormalization(axis=3, momentum=0.99, name='bn6')(conv6)
    conv6 = ELU(name='elu6')(conv6)
    pool6 = MaxPooling2D(pool_size=(2, 2), name='pool6')(conv6)

    conv7 = Conv2D(32, (3, 3),
                   strides=(1, 1),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name='conv7')(pool6)
    conv7 = BatchNormalization(axis=3, momentum=0.99, name='bn7')(conv7)
    conv7 = ELU(name='elu7')(conv7)

    # The next part is to add the convolutional predictor layers on top of the base network
    # that we defined above. Note that I use the term "base network" differently than the paper does.
    # To me, the base network is everything that is not convolutional predictor layers or anchor
    # box layers. In this case we'll have four predictor layers, but of course you could
    # easily rewrite this into an arbitrarily deep base network and add an arbitrary number of
    # predictor layers on top of the base network by simply following the pattern shown here.

    # Build the convolutional predictor layers on top of conv layers 4, 5, 6, and 7.
    # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization)
    # We predict `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes`
    # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4`
    # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)`
    classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes4')(conv4)
    classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes5')(conv5)
    classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes6')(conv6)
    classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes7')(conv7)
    # Output shape of `boxes`: `(batch, height, width, n_boxes * 4)`
    boxes4 = Conv2D(n_boxes[0] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes4')(conv4)
    boxes5 = Conv2D(n_boxes[1] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes5')(conv5)
    boxes6 = Conv2D(n_boxes[2] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes6')(conv6)
    boxes7 = Conv2D(n_boxes[3] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes7')(conv7)

    # Generate the anchor boxes
    # Output shape of `anchors`: `(batch, height, width, n_boxes, 8)`
    anchors4 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[0],
                           next_scale=scales[1],
                           aspect_ratios=aspect_ratios[0],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[0],
                           this_offsets=offsets[0],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors4')(boxes4)
    anchors5 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[1],
                           next_scale=scales[2],
                           aspect_ratios=aspect_ratios[1],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[1],
                           this_offsets=offsets[1],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors5')(boxes5)
    anchors6 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[2],
                           next_scale=scales[3],
                           aspect_ratios=aspect_ratios[2],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[2],
                           this_offsets=offsets[2],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors6')(boxes6)
    anchors7 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[3],
                           next_scale=scales[4],
                           aspect_ratios=aspect_ratios[3],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[3],
                           this_offsets=offsets[3],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors7')(boxes7)

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    classes4_reshaped = Reshape((-1, n_classes),
                                name='classes4_reshape')(classes4)
    classes5_reshaped = Reshape((-1, n_classes),
                                name='classes5_reshape')(classes5)
    classes6_reshaped = Reshape((-1, n_classes),
                                name='classes6_reshape')(classes6)
    classes7_reshaped = Reshape((-1, n_classes),
                                name='classes7_reshape')(classes7)
    # Reshape the box coordinate predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes4)
    boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes5)
    boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(boxes6)
    boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(boxes7)
    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4)
    anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5)
    anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6)
    anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7)

    # Concatenate the predictions from the different layers and the associated anchor box tensors
    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1
    # Output shape of `classes_concat`: (batch, n_boxes_total, n_classes)
    classes_concat = Concatenate(axis=1, name='classes_concat')([
        classes4_reshaped, classes5_reshaped, classes6_reshaped,
        classes7_reshaped
    ])

    # Output shape of `boxes_concat`: (batch, n_boxes_total, 4)
    boxes_concat = Concatenate(axis=1, name='boxes_concat')(
        [boxes4_reshaped, boxes5_reshaped, boxes6_reshaped, boxes7_reshaped])

    # Output shape of `anchors_concat`: (batch, n_boxes_total, 8)
    anchors_concat = Concatenate(axis=1, name='anchors_concat')([
        anchors4_reshaped, anchors5_reshaped, anchors6_reshaped,
        anchors7_reshaped
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    classes_softmax = Activation('softmax',
                                 name='classes_softmax')(classes_concat)

    # Concatenate the class and box coordinate predictions and the anchors to one large predictions tensor
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [classes_softmax, boxes_concat, anchors_concat])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
        predictor_sizes = np.array([
            K.int_shape(classes4)[1:3], K.int_shape(classes5)[1:3],
            K.int_shape(classes6)[1:3], K.int_shape(classes7)[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
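As a sketch of an assumed call (not from the source file, and assuming the module's custom layers are on the path): passing `return_predictor_sizes=True` also hands back the predictor grid sizes that `SSDBoxEncoder` needs during training:

model, predictor_sizes = build_model(image_size=(300, 480, 3),
                                     n_classes=5,
                                     mode='training',
                                     return_predictor_sizes=True)
print(predictor_sizes)  # four (height, width) pairs, one per predictor layer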
Code example #6
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False):
    '''
    Build a Keras model with SSD architecture, see references.

    The model consists of convolutional feature layers and a number of convolutional
    predictor layers that take their input from different feature layers.
    The model is fully convolutional.

    The implementation found here is a smaller version of the original architecture
    used in the paper (where the base network consists of a modified VGG-16 extended
    by a few convolutional feature layers), but of course it could easily be changed to
    an arbitrarily large SSD architecture by following the general design pattern used here.
    This implementation has 7 convolutional layers and 4 convolutional predictor
    layers that take their input from layers 4, 5, 6, and 7, respectively.

    Most of the arguments that this function takes are only needed for the anchor
    box layers. In case you're training the network, the parameters passed here must
    be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
    trained weights, the parameters passed here must be the same as the ones used
    to produce the trained weights.

    Some of these arguments are explained in more detail in the documentation of the
    `SSDBoxEncoder` class.

    Note: Requires Keras v2.0 or later. Training currently works only with the
    TensorFlow backend (v1.0 or later).

    Arguments:
        image_size (tuple): The input image size in the format `(height, width, channels)`.
        n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
        mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
            the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
            the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
            'inference' follows the exact procedure of the original Caffe implementation, while
            'inference_fast' uses a faster prediction decoding procedure.
        l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
        min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images.
        max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images. All scaling factors between the smallest and the
            largest will be linearly interpolated. Note that the second to last of the linearly interpolated
            scaling factors will actually be the scaling factor for the last predictor layer, while the last
            scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
            if `two_boxes_for_ar1` is `True`.
        scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
            This list must be one element longer than the number of predictor layers. The first `k` elements are the
            scaling factors for the `k` predictor layers, while the last element is used for the second box
            for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
            last scaling factor must be passed either way, even if it is not being used. If a list is passed,
            this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
        aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
            generated. This list is valid for all predictor layers. The original implementation uses more aspect ratios
            for some predictor layers and fewer for others. If you want to do that, too, then use the next argument instead.
        aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each predictor layer.
            This allows you to set the aspect ratios for each predictor layer individually. If a list is passed,
            it overrides `aspect_ratios_global`.
        two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
            If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
            using the scaling factor for the respective layer, the second one will be generated using the
            geometric mean of said scaling factor and the next bigger scaling factor.
        steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
            pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
            the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
            If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
            If no steps are provided, then they will be computed such that the anchor box center points will form an
            equidistant grid within the image dimensions.
        offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either floats or tuples of two floats. These numbers represent for each predictor layer how many
            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
            as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
            of the step size specified in the `steps` argument. If the list contains floats, then that value will
            be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
            `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size,
            which is also the recommended setting.
        clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
        variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
            its respective variance value.
        coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
            of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
            and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
            i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
        subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
            of any shape that is broadcast-compatible with the image shape. The elements of this array will be
            subtracted from the image pixel intensity values. For example, pass a list of three integers
            to perform per-channel mean normalization for color images.
        divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
            floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
            intensity values will be divided by the elements of this array. For example, pass a list
            of three integers to perform per-channel standard deviation normalization for color images.
        swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
            image channels should be swapped.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box's confidence score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
        return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
            a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
            you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
            to get them this way. They are only relevant for training anyway (`SSDBoxEncoder` needs to know the
            spatial dimensions of the predictor layers); for inference you don't need them.

    Returns:
        model: The Keras SSD model.
        predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
            of the output tensor shape for each convolutional predictor layer. During
            training, the generator function needs this in order to transform
            the ground truth labels into tensors of identical structure as the
            output tensors of the model, which is in turn needed for the cost
            function.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers
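
    # Worked example (assuming the usual SSD300-style defaults: three aspect
    # ratios for the first and the last two predictor layers, five for the
    # middle three): with `two_boxes_for_ar1=True`, every ratio list that
    # contains 1.0 gets one extra box, so `n_boxes == [4, 6, 6, 6, 4, 4]`.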

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)
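
    # Note: with `swap_channels=[2, 1, 0]` (the default), `input_channel_swap`
    # simply reverses the channel order of a 3-channel input, e.g. RGB -> BGR,
    # which is the channel order that weights ported from Caffe expect.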

    ############################################################################
    # Define the building blocks of the MobileNetV2 base network below.
    ############################################################################

    def _conv_block(inputs, filters, kernel, strides):

        channel_axis = 1 if K.image_data_format() == 'channels_first' else -1

        x = Conv2D(filters,
                   kernel,
                   padding='same',
                   use_bias=False,
                   strides=strides)(inputs)
        x = BatchNormalization(axis=channel_axis)(x)
        return ReLU(6)(x)

    def correct_pad(inputs, kernel_size):

        img_dim = 2 if K.image_data_format() == 'channels_first' else 1
        input_size = K.int_shape(inputs)[img_dim:(img_dim + 2)]

        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)

        if input_size[0] is None:
            adjust = (1, 1)
        else:
            adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2)

        correct = (kernel_size[0] // 2, kernel_size[1] // 2)

        return ((correct[0] - adjust[0], correct[0]), (correct[1] - adjust[1],
                                                       correct[1]))
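
    # Example: for a 38x38 input and kernel_size=3, `correct_pad` returns
    # ((0, 1), (0, 1)), i.e. pad one row at the bottom and one column on the
    # right, so that the stride-2 'valid' depthwise convolution below yields
    # exactly 19x19; for an odd input such as 75x75 it returns ((1, 1), (1, 1)).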

    def _bottleneck(inputs, filters, kernel, t, s, r=False, name='bottleneck'):
        # Note: `r` and `name` are accepted for signature compatibility but are
        # not used; whether the residual connection is added is decided below
        # from the stride and the channel counts instead.
        channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
        tchannel = K.int_shape(inputs)[channel_axis] * t  # expanded channel count

        # 1x1 expansion convolution
        x = _conv_block(inputs, tchannel, (1, 1), (1, 1))

        if s == 2:
            x = ZeroPadding2D(padding=correct_pad(x, 3))(x)

        padding = 'same' if s == 1 else 'valid'

        # 3x3 depthwise convolution
        x = DepthwiseConv2D(kernel,
                            strides=(s, s),
                            depth_multiplier=1,
                            use_bias=False,
                            padding=padding)(x)
        x = BatchNormalization(axis=channel_axis)(x)
        x = Activation('relu')(x)  # the original MobileNetV2 uses ReLU6 here

        # 1x1 linear projection back down to `filters` channels
        x = Conv2D(filters, (1, 1), strides=(1, 1), padding='same')(x)
        x = BatchNormalization(axis=channel_axis)(x)
        if s == 1 and filters == K.int_shape(inputs)[channel_axis]:
            x = Add()([x, inputs])
        return x

    def _inverted_residual_block(inputs,
                                 filters,
                                 kernel,
                                 t,
                                 strides,
                                 n,
                                 name='inv_res_block'):
        # The first bottleneck may downsample; the remaining n - 1 use stride 1.
        x = _bottleneck(inputs, filters, kernel, t, strides, name=name)

        for i in range(1, n):
            x = _bottleneck(x, filters, kernel, t, 1, r=True)

        return x
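
    # For illustration: `_inverted_residual_block(conv1, 24, (3, 3), t=6, strides=2, n=2)`
    # (as called below) first expands its 16 input channels to 16 * 6 = 96 with
    # the 1x1 expansion convolution, applies a stride-2 3x3 depthwise
    # convolution, projects down to 24 channels, and then stacks one more
    # stride-1 bottleneck whose residual connection is active because its input
    # and output both have 24 channels.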

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)
    '''MOBILENET_V2'''
    conv0 = _conv_block(x1, 32, (3, 3), strides=(2, 2))
    conv1 = _inverted_residual_block(conv0, 16, (3, 3), t=1, strides=1, n=1)
    conv2 = _inverted_residual_block(conv1, 24, (3, 3), t=6, strides=2, n=2)
    conv3 = _inverted_residual_block(conv2, 32, (3, 3), t=6, strides=2, n=3)
    conv4 = _inverted_residual_block(conv3, 64, (3, 3), t=6, strides=2, n=4)
    conv5 = _inverted_residual_block(conv4, 64, (3, 3), t=6, strides=1, n=3)
    conv6 = _inverted_residual_block(conv5, 96, (3, 3), t=6, strides=2, n=3)
    conv7 = _inverted_residual_block(conv6, 160, (3, 3), t=6, strides=1, n=1)
    conv8 = _inverted_residual_block(conv7, 160, (3, 3), t=6, strides=1, n=1)
    conv9 = _conv_block(conv8, 1280 // 2, (1, 1), strides=(1, 1))
    # x = GlobalAveragePooling2D()(x)
    # x = Reshape((1, 1, 1280))(x)
    # x = Dropout(0.3, name='Dropout')(x)
    # x = Conv2D(k, (1, 1), padding='same')(x)

    # x = Activation('softmax', name='softmax')(x)
    '''END OF MOBILENET_V2'''
    '''RESNET-18'''
    # model = ResnetBuilder.build_resnet_18((), )
    '''END OF RESNET-18'''

    # The next part is to add the convolutional predictor layers on top of the base network
    # that we defined above. Note that I use the term "base network" differently than the paper does.
    # To me, the base network is everything that is not convolutional predictor layers or anchor
    # box layers. In this case we'll have six predictor layers, but of course you could
    # easily rewrite this into an arbitrarily deep base network and add an arbitrary number of
    # predictor layers on top of the base network by simply following the pattern shown here.

    # Build the convolutional predictor layers on top of conv layers 4 through 9.
    # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization)
    # We predict `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes`
    # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4`
    # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)`
    conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3),
                                    padding='same',
                                    kernel_initializer='he_normal',
                                    kernel_regularizer=l2(l2_reg),
                                    name='conv4_3_norm_mbox_conf')(conv4)
    fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='fc7_mbox_conf')(conv5)
    conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv6_2_mbox_conf')(conv6)
    conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv7_2_mbox_conf')(conv7)
    conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv8_2_mbox_conf')(conv8)
    conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv9_2_mbox_conf')(conv9)
    # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv4_3_norm_mbox_loc')(conv4)
    fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                          padding='same',
                          kernel_initializer='he_normal',
                          kernel_regularizer=l2(l2_reg),
                          name='fc7_mbox_loc')(conv5)
    conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv6_2_mbox_loc')(conv6)
    conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv7_2_mbox_loc')(conv7)
    conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv8_2_mbox_loc')(conv8)
    conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv9_2_mbox_loc')(conv9)

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)

    # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
    conv4_3_norm_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
    fc7_mbox_priorbox = AnchorBoxes(img_height,
                                    img_width,
                                    this_scale=scales[1],
                                    next_scale=scales[2],
                                    aspect_ratios=aspect_ratios[1],
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    this_steps=steps[1],
                                    this_offsets=offsets[1],
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    coords=coords,
                                    normalize_coords=normalize_coords,
                                    name='fc7_mbox_priorbox')(fc7_mbox_loc)
    conv6_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    conv7_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    conv8_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    conv9_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)
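
    # Note: the last axis of the anchor tensors has size 8 because each anchor
    # stores its four box coordinates followed by its four variance values.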

    ### Reshape

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    conv4_3_norm_mbox_conf_reshape = Reshape(
        (-1, n_classes),
        name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
    fc7_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
    conv6_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)
    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    conv4_3_norm_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
    fc7_mbox_loc_reshape = Reshape((-1, 4),
                                   name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
    conv6_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)
    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv4_3_norm_mbox_priorbox_reshape = Reshape(
        (-1, 8),
        name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
    fc7_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
    conv6_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)

    ### Concatenate the predictions from the different layers

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv4_3_norm_mbox_conf_reshape, fc7_mbox_conf_reshape,
        conv6_2_mbox_conf_reshape, conv7_2_mbox_conf_reshape,
        conv8_2_mbox_conf_reshape, conv9_2_mbox_conf_reshape
    ])

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv4_3_norm_mbox_loc_reshape, fc7_mbox_loc_reshape,
        conv6_2_mbox_loc_reshape, conv7_2_mbox_loc_reshape,
        conv8_2_mbox_loc_reshape, conv9_2_mbox_loc_reshape
    ])

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv4_3_norm_mbox_priorbox_reshape, fc7_mbox_priorbox_reshape,
        conv6_2_mbox_priorbox_reshape, conv7_2_mbox_priorbox_reshape,
        conv8_2_mbox_priorbox_reshape, conv9_2_mbox_priorbox_reshape
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class and box predictions and the anchors to one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv4_3_norm_mbox_conf._keras_shape[1:3],
            fc7_mbox_conf._keras_shape[1:3],
            conv6_2_mbox_conf._keras_shape[1:3],
            conv7_2_mbox_conf._keras_shape[1:3],
            conv8_2_mbox_conf._keras_shape[1:3],
            conv9_2_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
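
# Usage sketch (illustrative only; the image size, class count and scale list
# below are assumptions chosen to satisfy the sanity checks above -- six
# predictor layers require len(scales) == 7):
#
# model, predictor_sizes = dronenet(image_size=(300, 300, 3),
#                                   n_classes=5,
#                                   mode='training',
#                                   scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05],
#                                   return_predictor_sizes=True)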
Code Example #7
def ssd_300(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 100, 300],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):
    '''
    #############################################################################
    # Build a Keras model with the SSD300 architecture; see the paper for details.
    #############################################################################

    The base network is a VGG-16 with the classification layers removed, and the model
    is extended with the SSD architecture, as described in the paper.

    Most of the arguments that this function takes are needed to configure the anchor
    box layers.

    If you are training the network, the parameters passed here must be the
    same as the ones used to set up `SSDBoxEncoder`.

    If you are loading pre-trained weights, the parameters passed here must be
    the same as the ones that produced those pre-trained weights.

    Some of these arguments are explained in more detail in the documentation of the
    `SSDBoxEncoder` class.

    Note: Requires Keras v2.0 or later, and currently only works with the
    TensorFlow backend (v1.0 or later).

    ############################################################################################
    # Detailed explanation of the arguments passed to this function
    ###########################################################################################

    Arguments:
        * image_size (tuple):
                - The size of the input images in the format `(height, width, channels)`.
        * n_classes (int):
                - The number of positive classes (object classes, not the background class),
                  e.g. 20 for Pascal VOC, 80 for MS COCO.
        * mode (str, optional):
                - One of 'training', 'inference' and 'inference_fast'.
                - In 'training' mode, the model outputs the raw prediction tensor, while
                  in 'inference' and 'inference_fast' modes, the raw predictions are decoded
                  into absolute coordinates and filtered via confidence thresholding,
                  non-maximum suppression, and top-k filtering.
                - The difference between the 'inference' and 'inference_fast' modes is that
                  'inference' follows the procedure of the original Caffe implementation, while
                  'inference_fast' uses a faster prediction decoding procedure.
        * l2_regularization (float, optional):
                - The L2 regularization rate. Applies to all convolutional layers.
                  Set to zero to deactivate L2 regularization.
        * min_scale (float, optional):
                - The smallest scaling factor for the anchor boxes; it belongs to the lowest predictor layer.
        * max_scale (float, optional):
                - The largest scaling factor for the anchor boxes; it belongs to the highest predictor layer.
                - All scaling factors for the anchor boxes of the predictor layers in between
                  the lowest and the highest one are linearly interpolated within [min_scale, max_scale].
                - Note that the second-to-last of the linearly interpolated scaling factors will
                  actually be the scaling factor for the last predictor layer, while the last
                  scaling factor is used for the second box with aspect ratio 1 in the last
                  predictor layer, if `two_boxes_for_ar1` is set to `True`.
        * scales (list, optional):
                - A list of floats containing the scaling factors to be applied to each
                  convolutional predictor layer.
                - This list must be one element longer than the number of predictor layers.
                - The first k elements are the k scaling factors for the first k predictor layers,
                  while the last element is used for the second box with aspect ratio 1
                  in the last predictor layer, if `two_boxes_for_ar1` is set to true.
                - This additional scaling factor must be passed either way, even if
                  it is not being used. Two further notes:
                    + If this list is passed, it overrides the
                      min_scale and max_scale values.
                    + All scaling factor elements must be greater than zero.
        * aspect_ratios_global (list, optional):
                - A list containing the aspect ratios for which the anchor boxes
                  are to be generated.
                - This list will be used for all predictor layers in the model.
        * aspect_ratios_per_layer (list, optional):
                - A list containing one aspect ratio list for each predictor layer.
                - This allows you to set the aspect ratios for each predictor layer
                  individually, which is the case for the original SSD300 implementation.
                - If this list is passed, it overrides `aspect_ratios_global`.
        * two_boxes_for_ar1 (bool, optional):
                - Only relevant for aspect ratio 1.
                - Will be ignored otherwise.
                - If `True`, two anchor boxes will be generated for aspect
                  ratio 1.
                  + The first anchor box will be generated using the scaling factor of the respective layer.
                  + The second anchor box will be generated using the geometric mean of the
                    scaling factor of the layer in question and the scaling factor of the next layer.
        * steps (list, optional):
                - `None` or a list with as many elements as there are predictor
                  layers.
                - The elements can be either ints/floats or tuples of ints/floats.
                - These numbers represent, for each predictor layer, how many pixels apart the
                  anchor box center points are vertically and horizontally.
                - If the list contains ints/floats, that value will be used for both spatial
                  dimensions.
                - If the list contains tuples of two ints/floats, they represent
                  `(step_height, step_width)`.
                - If no steps are provided, they will be computed such that the anchor box
                  center points form an equidistant grid within the image width and
                  height.
        * offsets (list, optional):
                - `None` or a list with as many elements as there are predictor layers.
                - The elements can be either floats or tuples of two floats.
                - These numbers represent, for each predictor layer, how many pixels from the
                  top-left corner of the image the anchor box center points are.
                - A few important notes here:
                    + The offsets are not absolute pixel values, but fractions 
                      of the step size specified in the `steps` argument.
                      If the list contains floats, then that value will
                      be used for both spatial dimensions.
                      If the list contains tuples of two floats, then they represent
                      `(vertical_offset, horizontal_offset)`. If no offsets are provided, 
                      then they will default to 0.5 of the step size.
        * clip_boxes (bool, optional):
                - If `True`, clips the anchor box coordinates so that they stay within the image boundaries.
        * variances (list, optional):
                - A list of 4 floats > 0.
                - The anchor box offset for each coordinate will be divided by
                  its respective variance value.
        * coords (str, optional):
                - The box coordinate format to be used internally by the model,
                  i.e. this is not the input format of the ground truth boxes.
                  Can be 'centroids' for the format `(cx, cy, w, h)`, 'minmax' for the format
                  `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        * normalize_coords (bool, optional):
                - Set this to `True` if the model uses relative instead of absolute coordinates,
                  i.e. if the model predicts box coordinates within [0, 1] instead of absolute coordinates.
        * subtract_mean (array-like, optional):
                - `None` or an array-like object of integers or floating
                  point values.
                - The elements of this array will be subtracted from the image pixel intensity values. 
                  For example, pass a list of three integers to perform per-channel mean normalization 
                  for color images.
        * divide_by_stddev (array-like, optional): 
                - `None` or an array-like object of non-zero integers or floating point 
                  values of any shape that is broadcast-compatible with the image shape.
                  The image pixel intensity values will be divided by the elements of this array. 
                  For example, pass a list of three integers to perform per-channel standard 
                  deviation normalization for color images.
        * swap_channels (list, optional): 
                - Either `False` or a list of integers representing the desired order 
                  in which the input image channels should be swapped.
        * confidence_thresh (float, optional): 
                - A float in [0,1), the minimum classification confidence in a specific positive class 
                  in order to be considered for the non-maximum suppression stage for the respective class.
                - A lower value will result in a larger part of the selection process being 
                  done by the non-maximum suppression stage, while a larger value will result in a 
                  larger part of the selection process happening in the confidence thresholding stage.
        * iou_threshold (float, optional): 
                - A float in [0,1]. All boxes that have a Jaccard similarity of 
                  greater than `iou_threshold` with a locally maximal box will be 
                  removed from the set of predictions for a given class, 
                  where 'maximal' refers to the box's confidence score.
        * top_k (int, optional):
                - The number of highest scoring predictions to be kept for 
                  each batch item after the non-maximum suppression stage.
        * nms_max_output_size (int, optional):
                - The maximal number of predictions that will be left over after the NMS stage.
        * return_predictor_sizes (bool, optional):
                - If `True`, this function not only returns the model, but also
                  a list containing the spatial dimensions of the predictor layers. 
                  This isn't strictly necessary since you can always get their sizes easily 
                  via the Keras API, but it's convenient and less error-prone
                  to get them this way. They are only relevant for training anyway 
                  (SSDBoxEncoder needs to know the
                  spatial dimensions of the predictor layers), for inference you don't need them.

    Returns:
        model: The Keras SSD300 model.
        predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
            of the output tensor shape for each convolutional predictor layer. During
            training, the generator function needs this in order to transform
            the ground truth labels into tensors of identical structure as the
            output tensors of the model, which is in turn needed for the cost
            function.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    #########################################################################
    # Build the SSD300 model in Keras
    #########################################################################

    # The number of predictor conv layers in the original SSD300
    n_predictor_layers = 6
    # Add one class to account for the background
    n_classes += 1
    # Make the internal name for the L2 regularization rate shorter
    l2_reg = l2_regularization
    # Get the dimensions of the input images
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way before building the model.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:
        # If no explicit list of scaling factors was passed, compute the list of
        # scaling factors from `min_scale` and `max_scale`.
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
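        # Example: min_scale=0.2 and max_scale=0.9 with six predictor layers give
        # scales = [0.2, 0.31667, 0.43333, 0.55, 0.66667, 0.78333, 0.9] -- one
        # scale per predictor layer plus the extra last value (see the docstring).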

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received."
            .format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer.
    # These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:
        # If only a global aspect ratio list was passed, then the number of boxes
        # predicted per cell is the same for every predictor layer.
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the custom Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    ############################################################################
    # Build the SSD network.
    ############################################################################

    # Input layer (its size equals the size of the input images)
    x = Input(shape=(img_height, img_width, img_channels))

    # Image preprocessing layers (normalize the input image)

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    # Rebuild the VGG-16 base network

    # Block 1
    conv1_1 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_1')(x1)
    conv1_2 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_2')(conv1_1)
    pool1 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool1')(conv1_2)

    # Block 2
    conv2_1 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_1')(pool1)
    conv2_2 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_2')(conv2_1)
    pool2 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool2')(conv2_2)

    # Block 3
    conv3_1 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_1')(pool2)
    conv3_2 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_2')(conv3_1)
    conv3_3 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_3')(conv3_2)
    pool3 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool3')(conv3_3)

    # Block 4
    conv4_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_1')(pool3)
    conv4_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_2')(conv4_1)
    conv4_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_3')(conv4_2)
    pool4 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool4')(conv4_3)

    # Block 5
    conv5_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_1')(pool4)
    conv5_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_2')(conv5_1)
    conv5_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_3')(conv5_2)
    pool5 = MaxPooling2D(pool_size=(3, 3),
                         strides=(1, 1),
                         padding='same',
                         name='pool5')(conv5_3)
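
    # Note that, unlike in the original VGG-16, pool5 uses a 3x3 window with
    # stride 1, so the spatial resolution is preserved; the dilated (atrous) 3x3
    # convolution in fc6 below (dilation rate 6) then provides a large receptive
    # field, as described in the SSD paper.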

    # End of the VGG-16 base network

    #################################################################
    # Start building the predictor layers
    #################################################################

    # Part 1: Chain up the additional convolutional feature layers

    # FC6
    fc6 = Conv2D(1024, (3, 3),
                 dilation_rate=(6, 6),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc6')(pool5)

    # FC7
    fc7 = Conv2D(1024, (1, 1),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc7')(fc6)

    # conv6 block (conv8_2 in the paper's naming)
    conv6_1 = Conv2D(256, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_1')(fc7)
    conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv6_padding')(conv6_1)
    conv6_2 = Conv2D(512, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_2')(conv6_1)

    # conv7 block (conv9_2 in the paper's naming)
    conv7_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_1')(conv6_2)
    conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv7_padding')(conv7_1)
    conv7_2 = Conv2D(256, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_2')(conv7_1)

    # conv8 block (conv10_2 in the paper's naming)
    conv8_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_1')(conv7_2)
    conv8_2 = Conv2D(256, (3, 3),
                     strides=(1, 1),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_2')(conv8_1)

    # conv9 block (conv11_2 in the paper's naming)
    conv9_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_1')(conv8_2)
    conv9_2 = Conv2D(256, (3, 3),
                     strides=(1, 1),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_2')(conv9_1)

    # Part 2: Create the convolutional predictor layers

    # Feed conv4_3 into the L2 normalization layer
    conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
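
    # conv4_3 lies much earlier in the network than the other predictor inputs
    # and its features have a different scale, so it is L2-normalized with a
    # learnable per-channel scale initialized to 20, following the SSD paper
    # (which adopts this technique from ParseNet).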

    ### Build the convolutional predictor layers on top of the base network

    # Part 2.1: Build the confidence layers: one confidence score per class for every predicted box

    # We predict `n_classes` confidence values for each box, hence the confidence
    # predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    conv4_3_norm_mbox_conf = Conv2D(
        n_boxes[0] * n_classes, (3, 3),
        padding='same',
        kernel_initializer='he_normal',
        kernel_regularizer=l2(l2_reg),
        name='conv4_3_norm_mbox_conf')(conv4_3_norm)
    fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='fc7_mbox_conf')(fc7)
    conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv6_2_mbox_conf')(conv6_2)
    conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv7_2_mbox_conf')(conv7_2)
    conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv8_2_mbox_conf')(conv8_2)
    conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv9_2_mbox_conf')(conv9_2)

    # Part 2.2: Build the localization layers: four box coordinates for every predicted box

    # We predict 4 box coordinates for each box,
    # hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv4_3_norm_mbox_loc')(conv4_3_norm)
    fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                          padding='same',
                          kernel_initializer='he_normal',
                          kernel_regularizer=l2(l2_reg),
                          name='fc7_mbox_loc')(fc7)
    conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv6_2_mbox_loc')(conv6_2)
    conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv7_2_mbox_loc')(conv7_2)
    conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv8_2_mbox_loc')(conv8_2)
    conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv9_2_mbox_loc')(conv9_2)

    # Part 2.3: Generate the anchor boxes

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)

    # Output shape of the anchors: `(batch, height, width, n_boxes, 8)`
    conv4_3_norm_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
    fc7_mbox_priorbox = AnchorBoxes(img_height,
                                    img_width,
                                    this_scale=scales[1],
                                    next_scale=scales[2],
                                    aspect_ratios=aspect_ratios[1],
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    this_steps=steps[1],
                                    this_offsets=offsets[1],
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    coords=coords,
                                    normalize_coords=normalize_coords,
                                    name='fc7_mbox_priorbox')(fc7_mbox_loc)
    conv6_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    conv7_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    conv8_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    conv9_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)

    ### Reshape

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    conv4_3_norm_mbox_conf_reshape = Reshape(
        (-1, n_classes),
        name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
    fc7_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
    conv6_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)

    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis so that the
    # smooth L1 loss can be computed on them
    conv4_3_norm_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
    fc7_mbox_loc_reshape = Reshape((-1, 4),
                                   name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
    conv6_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)

    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv4_3_norm_mbox_priorbox_reshape = Reshape(
        (-1, 8),
        name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
    fc7_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
    conv6_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)

    ### Concatenate the predictions from the different layers

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all
    # layers, so we concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv4_3_norm_mbox_conf_reshape, fc7_mbox_conf_reshape,
        conv6_2_mbox_conf_reshape, conv7_2_mbox_conf_reshape,
        conv8_2_mbox_conf_reshape, conv9_2_mbox_conf_reshape
    ])

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv4_3_norm_mbox_loc_reshape, fc7_mbox_loc_reshape,
        conv6_2_mbox_loc_reshape, conv7_2_mbox_loc_reshape,
        conv8_2_mbox_loc_reshape, conv9_2_mbox_loc_reshape
    ])

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv4_3_norm_mbox_priorbox_reshape, fc7_mbox_priorbox_reshape,
        conv6_2_mbox_priorbox_reshape, conv7_2_mbox_priorbox_reshape,
        conv8_2_mbox_priorbox_reshape, conv9_2_mbox_priorbox_reshape
    ])

    # The box coordinate predictions will go into the loss function as they are,
    # but for the class predictions we first apply a softmax activation layer
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class predictions, box predictions, and anchors into one large prediction vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv4_3_norm_mbox_conf._keras_shape[1:3],
            fc7_mbox_conf._keras_shape[1:3],
            conv6_2_mbox_conf._keras_shape[1:3],
            conv7_2_mbox_conf._keras_shape[1:3],
            conv8_2_mbox_conf._keras_shape[1:3],
            conv9_2_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
Code example #8
def ssd_300(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 100, 300],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):
    '''
    Build a Keras model of the SSD300 architecture.
    The base network used is VGG16.

    Note: Requires Keras >= v2.0 and a TensorFlow backend >= v1.0.

    Arguments:
        image_size (tuple): The input image size `(height, width, channels)`.
        n_classes (int): The number of classes, e.g. 20 for the Pascal VOC dataset, 80 for the MS COCO dataset.
        mode (str, optional): One of 'training', 'inference' and 'inference_fast'.
            In 'training' mode, the model output is the raw prediction tensor.
            In 'inference' and 'inference_fast' modes, the raw predictions are decoded into coordinates and filtered via a confidence threshold.
        l2_regularization (float, optional): The L2-regularization rate. Applied to all convolutional layers.
        min_scale (float, optional): The smallest scaling factor for the anchor box sizes, measured as a fraction of
            the shorter side of the input image.
        max_scale (float, optional): The largest scaling factor for the anchor box sizes.
        scales (list, optional): A list of floats containing the scaling factors for the convolutional predictor layers.
            This list must be one element longer than the number of predictor layers; the extra last element is the
            "next scale" used for the second aspect-ratio-1 box of the last layer.
            If `scales` is given, interpolating between `min_scale` and `max_scale` to build the list is skipped.
        aspect_ratios_global (list, optional): The list of aspect ratios from which the anchor boxes are generated.
            This list is shared by all prediction layers.
        aspect_ratios_per_layer (list, optional): A list of aspect-ratio lists, one per prediction layer.
            If given, it overrides `aspect_ratios_global`.
        two_boxes_for_ar1 (bool, optional): Only relevant when an aspect ratio list contains 1; ignored otherwise.
            If `True`, two anchor boxes are generated for aspect ratio 1: the first using the layer's scale, the second
            using the geometric mean of that scale and the next scale.
        steps (list, optional): `None`, or a list with as many elements as there are predictor layers.
            Each element states, for its predictor layer, how many pixels apart the anchor box centers are.
            A step can also be a pair of two numbers `(step_width, step_height)`.
            If no steps are given, they are computed so that the anchor box centers are spaced evenly across the image.
        offsets (list, optional): `None`, or a list with one number per predictor layer stating how far the first anchor
            box center lies from the top and left borders of the image.
        clip_boxes (bool, optional): If `True`, clip the anchor box coordinates to lie within the image boundaries.
        variances (list, optional): A list of 4 floats > 0. The anchor box offset for each coordinate is divided by its
            respective variance value.
        coords (str, optional): The box coordinate format used internally by the model (i.e. it is not the input format of the ground truth labels).
            Can be 'centroids' format `(cx, cy, w, h)` (box center coordinates, width,
            and height), 'minmax' format `(xmin, xmax, ymin, ymax)`, or 'corners' format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model is meant to use relative instead of absolute coordinates,
            i.e. if the model predicts box coordinates within [0, 1] instead of absolute coordinates.
        subtract_mean (array-like, optional): `None`, or an array-like object whose shape is broadcastable to the image shape.
            Its values are subtracted from the pixel values of the image.
            For example, pass a list of three integers to perform per-channel mean normalization of the image channels.
        divide_by_stddev (array-like, optional): `None`, or an array-like object. Analogous to `subtract_mean`, but the pixel
            values are divided by these values for standardization.
        swap_channels (list, optional): Either `False`, or a list of integers representing the desired order into which
            the input image channels should be swapped.
        confidence_thresh (float, optional): A float in [0, 1), the minimum classification confidence a class must reach
            for a box to be considered in the decoding stage.
        iou_threshold (float, optional): A float in [0, 1]. During non-maximum suppression, any box whose Jaccard similarity
            with a higher-scoring box is greater than or equal to `iou_threshold` is removed.
        top_k (int, optional): The number of highest-scoring predictions kept per batch item after the non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximum number of predictions that will be passed out of the NMS stage.
        return_predictor_sizes (bool, optional): If `True`, the function returns not only the model, but also
            a list containing the spatial dimensions of the predictor layers.

    Returns:
        model: The Keras SSD300 model.
        predictor_sizes (optional): A numpy array containing the `(height, width)` parts of the output tensor shape of each convolutional predictor layer.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[
        1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    # Build the list of scales
    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
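    # Worked example (illustrative values, not arguments used in this file):
    # min_scale=0.1, max_scale=0.9 with n_predictor_layers=6 gives
    # scales = [0.1, 0.2333, 0.3667, 0.5, 0.6333, 0.7667, 0.9],
    # i.e. n_predictor_layers + 1 = 7 evenly spaced scaling factors.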

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".
            format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer (these are only needed for the anchor box layers).
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this to know how many channels the predictor layers must have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, the number of boxes is the same for every layer.
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
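    # For example, with the default `aspect_ratios_per_layer` above and
    # `two_boxes_for_ar1=True`, every list contains 1, so
    # n_boxes == [3+1, 5+1, 5+1, 5+1, 3+1, 3+1] == [4, 6, 6, 6, 4, 4].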

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define the functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)
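    # For example, with the default swap_channels=[2, 1, 0] the channel order is
    # reversed, which converts a BGR input tensor to RGB (or vice versa).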

    ############################################################################
    # Step 1: Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent Lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    ############################################################################
    # Step 1.1: Build the VGG16 base network
    ############################################################################

    conv1_1 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_1')(x1)
    conv1_2 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_2')(conv1_1)
    pool1 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool1')(conv1_2)

    conv2_1 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_1')(pool1)
    conv2_2 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_2')(conv2_1)
    pool2 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool2')(conv2_2)

    conv3_1 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_1')(pool2)
    conv3_2 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_2')(conv3_1)
    conv3_3 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_3')(conv3_2)
    pool3 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool3')(conv3_3)

    conv4_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_1')(pool3)
    conv4_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_2')(conv4_1)
    conv4_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_3')(conv4_2)
    pool4 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool4')(conv4_3)

    conv5_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_1')(pool4)
    conv5_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_2')(conv5_1)
    conv5_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_3')(conv5_2)
    pool5 = MaxPooling2D(pool_size=(3, 3),
                         strides=(1, 1),
                         padding='same',
                         name='pool5')(conv5_3)

    ############################################################################
    # Step 1.2: Apply (3 x 3) convolutional filters to compute the feature maps.
    ############################################################################

    fc6 = Conv2D(1024, (3, 3),
                 dilation_rate=(6, 6),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc6')(pool5)
    print('fully connected 6: ', fc6.get_shape())
    fc7 = Conv2D(1024, (1, 1),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc7')(fc6)
    print('fully connected 7: ', fc7.get_shape())
    conv6_1 = Conv2D(256, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_1')(fc7)
    conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv6_padding')(conv6_1)
    conv6_2 = Conv2D(512, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_2')(conv6_1)
    print('conv6_2: ', conv6_2.get_shape())
    conv7_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_1')(conv6_2)
    conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv7_padding')(conv7_1)
    conv7_2 = Conv2D(256, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_2')(conv7_1)
    print('conv7_2: ', conv7_2.get_shape())
    conv8_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_1')(conv7_2)
    conv8_2 = Conv2D(256, (3, 3),
                     strides=(1, 1),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_2')(conv8_1)
    print('conv8_2: ', conv8_2.get_shape())
    conv9_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_1')(conv8_2)
    conv9_2 = Conv2D(256, (3, 3),
                     strides=(1, 1),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_2')(conv9_1)
    print('conv9_2: ', conv9_2.get_shape())
    # Feed conv4_3 into the L2 normalization layer
    conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
    print('conv4_3_norm.shape: ', conv4_3_norm.get_shape())
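    # Background: following the SSD paper, conv4_3 is L2-normalized (a ParseNet
    # technique) because its activations live on a different scale than the
    # deeper feature maps; gamma_init=20 is the initial value of the learnable
    # per-channel rescaling factor.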

    ############################################################################
    # Step 1.3: Compute the class probability distribution output for each default bounding box.
    ############################################################################

    ### Build the convolutional predictor layers on top of the base network
    # We predict confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    conv4_3_norm_mbox_conf = Conv2D(
        n_boxes[0] * n_classes, (3, 3),
        padding='same',
        kernel_initializer='he_normal',
        kernel_regularizer=l2(l2_reg),
        name='conv4_3_norm_mbox_conf')(conv4_3_norm)
    print('conv4_3_norm_mbox_conf.shape: ', conv4_3_norm_mbox_conf.get_shape())
    fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='fc7_mbox_conf')(fc7)
    print('fc7_mbox_conf.shape: ', fc7_mbox_conf.get_shape())
    conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv6_2_mbox_conf')(conv6_2)
    conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv7_2_mbox_conf')(conv7_2)
    conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv8_2_mbox_conf')(conv8_2)
    conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv9_2_mbox_conf')(conv9_2)
    print('conv9_2_mbox_conf: ', conv9_2_mbox_conf.get_shape())

    ############################################################################
    # Step 1.4: Compute the offset outputs of the default bounding boxes for each cell of the feature maps.
    ############################################################################

    # We predict 4 coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv4_3_norm_mbox_loc')(conv4_3_norm)
    print('conv4_3_norm_mbox_loc: ', conv4_3_norm_mbox_loc.get_shape())
    fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                          padding='same',
                          kernel_initializer='he_normal',
                          kernel_regularizer=l2(l2_reg),
                          name='fc7_mbox_loc')(fc7)
    conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv6_2_mbox_loc')(conv6_2)
    conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv7_2_mbox_loc')(conv7_2)
    conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv8_2_mbox_loc')(conv8_2)
    conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv9_2_mbox_loc')(conv9_2)
    print('conv9_2_mbox_loc: ', conv9_2_mbox_loc.get_shape())

    ############################################################################
    # Step 1.5: Generate the anchor boxes that serve as the reference for the predicted bounding box offsets.
    ############################################################################

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation of the model)
    # Output shape of the anchors: `(batch, height, width, n_boxes, 8)`
    conv4_3_norm_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
    print('conv4_3_norm_mbox_priorbox: ',
          conv4_3_norm_mbox_priorbox.get_shape())
    fc7_mbox_priorbox = AnchorBoxes(img_height,
                                    img_width,
                                    this_scale=scales[1],
                                    next_scale=scales[2],
                                    aspect_ratios=aspect_ratios[1],
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    this_steps=steps[1],
                                    this_offsets=offsets[1],
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    coords=coords,
                                    normalize_coords=normalize_coords,
                                    name='fc7_mbox_priorbox')(fc7_mbox_loc)
    print('fc7_mbox_priorbox: ', fc7_mbox_priorbox.get_shape())
    conv6_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    print('conv6_2_mbox_priorbox: ', conv6_2_mbox_priorbox.get_shape())
    conv7_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    print('conv7_2_mbox_priorbox: ', conv7_2_mbox_priorbox.get_shape())
    conv8_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    print('conv8_2_mbox_priorbox: ', conv8_2_mbox_priorbox.get_shape())
    conv9_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)
    print('conv9_2_mbox_priorbox: ', conv9_2_mbox_priorbox.get_shape())

    ############################################################################
    # Step 2: Reshape the output tensors
    ############################################################################

    ############################################################################
    # Step 2.1: Reshape the class prediction outputs
    ############################################################################

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis so that softmax can be computed on them.
    conv4_3_norm_mbox_conf_reshape = Reshape(
        (-1, n_classes),
        name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
    fc7_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
    conv6_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)
    print('conv4_3_norm_mbox_conf_reshape: ',
          conv4_3_norm_mbox_conf_reshape.get_shape())
    print('fc7_mbox_conf_reshape: ', fc7_mbox_conf_reshape.get_shape())
    print('conv6_2_mbox_conf_reshape: ', conv6_2_mbox_conf_reshape.get_shape())
    print('conv7_2_mbox_conf_reshape: ', conv7_2_mbox_conf_reshape.get_shape())
    print('conv8_2_mbox_conf_reshape: ', conv8_2_mbox_conf_reshape.get_shape())
    print('conv9_2_mbox_conf_reshape: ', conv9_2_mbox_conf_reshape.get_shape())

    ############################################################################
    # Step 2.2: Reshape the bounding box prediction outputs
    ############################################################################

    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis so that the smooth L1 loss can be computed on them
    conv4_3_norm_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
    fc7_mbox_loc_reshape = Reshape((-1, 4),
                                   name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
    conv6_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)
    print('conv4_3_norm_mbox_loc_reshape: ',
          conv4_3_norm_mbox_loc_reshape.get_shape())
    print('fc7_mbox_loc_reshape: ', fc7_mbox_loc_reshape.get_shape())
    print('conv6_2_mbox_loc_reshape: ', conv6_2_mbox_loc_reshape.get_shape())
    print('conv7_2_mbox_loc_reshape: ', conv7_2_mbox_loc_reshape.get_shape())
    print('conv8_2_mbox_loc_reshape: ', conv8_2_mbox_loc_reshape.get_shape())
    print('conv9_2_mbox_loc_reshape: ', conv9_2_mbox_loc_reshape.get_shape())

    ############################################################################
    # Step 2.3: Reshape the anchor box outputs
    ############################################################################

    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv4_3_norm_mbox_priorbox_reshape = Reshape(
        (-1, 8),
        name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
    fc7_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
    conv6_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)
    print('conv4_3_norm_mbox_priorbox_reshape: ',
          conv4_3_norm_mbox_priorbox_reshape.get_shape())
    print('fc7_mbox_priorbox_reshape: ', fc7_mbox_priorbox_reshape.get_shape())
    print('conv6_2_mbox_priorbox_reshape: ',
          conv6_2_mbox_priorbox_reshape.get_shape())
    print('conv7_2_mbox_priorbox_reshape: ',
          conv7_2_mbox_priorbox_reshape.get_shape())
    print('conv8_2_mbox_priorbox_reshape: ',
          conv8_2_mbox_priorbox_reshape.get_shape())
    print('conv9_2_mbox_priorbox_reshape: ',
          conv9_2_mbox_priorbox_reshape.get_shape())

    ############################################################################
    # Step 3: Concatenate the boxes of the different layers
    ############################################################################

    ############################################################################
    # Step 3.1: Concatenate the confidence outputs
    ############################################################################

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all prediction layers,
    # so we concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv4_3_norm_mbox_conf_reshape, fc7_mbox_conf_reshape,
        conv6_2_mbox_conf_reshape, conv7_2_mbox_conf_reshape,
        conv8_2_mbox_conf_reshape, conv9_2_mbox_conf_reshape
    ])
    print('mbox_conf.shape: ', mbox_conf.get_shape())

    ############################################################################
    # Step 3.2: Concatenate the localization outputs
    ############################################################################

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv4_3_norm_mbox_loc_reshape, fc7_mbox_loc_reshape,
        conv6_2_mbox_loc_reshape, conv7_2_mbox_loc_reshape,
        conv8_2_mbox_loc_reshape, conv9_2_mbox_loc_reshape
    ])

    print('mbox_loc.shape: ', mbox_loc.get_shape())

    ############################################################################
    # Step 3.3: Concatenate the anchor box outputs
    ############################################################################

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv4_3_norm_mbox_priorbox_reshape, fc7_mbox_priorbox_reshape,
        conv6_2_mbox_priorbox_reshape, conv7_2_mbox_priorbox_reshape,
        conv8_2_mbox_priorbox_reshape, conv9_2_mbox_priorbox_reshape
    ])

    print('mbox_priorbox.shape: ', mbox_priorbox.get_shape())

    ############################################################################
    # Step 4: Compute the output
    ############################################################################

    ############################################################################
    # Step 4.1: Apply the softmax activation to the class confidences
    ############################################################################

    # The box coordinate predictions will go into the loss function as they are,
    # but for the class predictions we first apply a softmax activation layer
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class predictions, box predictions, and anchors into one large prediction vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])
    print('predictions.shape: ', predictions.get_shape())
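    # Sanity check: for a 300x300 input with the default configuration the six
    # predictor maps are 38x38, 19x19, 10x10, 5x5, 3x3 and 1x1 with
    # n_boxes = [4, 6, 6, 6, 4, 4], so
    # n_boxes_total = 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732.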
    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv4_3_norm_mbox_conf._keras_shape[1:3],
            fc7_mbox_conf._keras_shape[1:3],
            conv6_2_mbox_conf._keras_shape[1:3],
            conv7_2_mbox_conf._keras_shape[1:3],
            conv8_2_mbox_conf._keras_shape[1:3],
            conv9_2_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
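
A minimal usage sketch for the function above: building and compiling an SSD300-style training model. The `SSDLoss` import path and its arguments follow the ssd_keras-style codebase these examples appear to come from, but they are assumptions here, not something defined in the code above:

from keras.optimizers import Adam
from keras_loss_function.keras_ssd_loss import SSDLoss  # assumption: provided by this codebase

# Build the model in training mode; the seven scales are illustrative
# Pascal VOC-style values (one more scale than the six predictor layers).
model = ssd_300(image_size=(300, 300, 3),
                n_classes=20,
                mode='training',
                scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05])

# SSD multibox loss: 3:1 hard-negative mining ratio, localization weight alpha=1.0.
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=Adam(lr=0.001), loss=ssd_loss.compute_loss)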
Code example #9
File: keras_ssd512.py  Project: zhangfx123/SWIPENet
def ssd_512(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0]],
            two_boxes_for_ar1=True,
            steps=[8, 4, 2],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):

    n_predictor_layers = 3  # This variant uses 3 predictor conv layers (the original SSD512 has 7).
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[
        1], image_size[2]

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".
            format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)
    # 512  step=1
    conv1_1 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_1')(x1)
    conv1_2 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_2')(conv1_1)
    pool1 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool1')(conv1_2)
    # 256  step=2
    conv2_1 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_1')(pool1)
    conv2_2 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_2')(conv2_1)
    pool2 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool2')(conv2_2)
    # 128  step=4
    conv3_1 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_1')(pool2)
    conv3_2 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_2')(conv3_1)
    conv3_3 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_3')(conv3_2)
    pool3 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool3')(conv3_3)
    # 64  step=8
    conv4_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_1')(pool3)
    conv4_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_2')(conv4_1)
    conv4_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_3')(conv4_2)
    pool4 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool4')(conv4_3)
    # 32  step=16
    conv5_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_1')(pool4)
    conv5_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_2')(conv5_1)
    conv5_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_3')(conv5_2)
    # 16  step=32
    pool5 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool5')(conv5_3)
    # 16  step=32
    dilateconv6_1 = Conv2D(512, (3, 3),
                           strides=(1, 1),
                           dilation_rate=2,
                           activation='relu',
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='dilateconv6_1')(pool5)
    dilateconv6_2 = Conv2D(512, (3, 3),
                           strides=(1, 1),
                           dilation_rate=2,
                           activation='relu',
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='dilateconv6_2')(dilateconv6_1)
    dilateconv6_3 = Conv2D(512, (3, 3),
                           strides=(1, 1),
                           dilation_rate=2,
                           activation='relu',
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='dilateconv6_3')(dilateconv6_2)
    dilateconv6_4 = Conv2D(512, (3, 3),
                           strides=(1, 1),
                           dilation_rate=2,
                           activation='relu',
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='dilateconv6_4')(dilateconv6_3)
    conv7_add = concatenate([pool5, dilateconv6_4], axis=3)
    # 32  step=16
    deconv1_1 = Conv2DTranspose(512, (3, 3),
                                strides=(2, 2),
                                activation='relu',
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='deconv1_1')(conv7_add)
    deconv1_2 = Conv2DTranspose(512, (3, 3),
                                activation='relu',
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='deconv1_2')(deconv1_1)
    deconv1 = concatenate([deconv1_2, conv5_3], axis=3)
    # 64  step=8
    deconv2_1 = Conv2DTranspose(512, (3, 3),
                                strides=(2, 2),
                                activation='relu',
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='deconv2_1')(deconv1)
    deconv2_2 = Conv2DTranspose(512, (3, 3),
                                activation='relu',
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='deconv2_2')(deconv2_1)
    deconv2 = concatenate([deconv2_2, conv4_3], axis=3)
    # 128  step=4
    deconv3_1 = Conv2DTranspose(512, (3, 3),
                                strides=(2, 2),
                                activation='relu',
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='deconv3_1')(deconv2)
    deconv3_2 = Conv2DTranspose(256, (3, 3),
                                activation='relu',
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='deconv3_2')(deconv3_1)
    deconv3 = concatenate([deconv3_2, conv3_3], axis=3)
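    # deconv1 (step 16), deconv2 (step 8) and deconv3 (step 4) are the three
    # fused feature maps that feed the predictor heads below.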

    # Feed conv4_3 into the L2 normalization layer
    # conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
    # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    deconv1_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='deconv1_mbox_conf')(deconv1)
    deconv2_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='deconv2_mbox_conf')(deconv2)
    deconv3_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='deconv3_mbox_conf')(deconv3)

    # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    deconv1_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='deconv1_mbox_loc')(deconv1)
    deconv2_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='deconv2_mbox_loc')(deconv2)
    deconv3_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='deconv3_mbox_loc')(deconv3)

    # Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)
    # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
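    # (the last axis holds the 4 anchor box coordinates followed by the 4 variance values)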
    deconv1_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='deconv1_mbox_priorbox')(deconv1_mbox_loc)
    deconv2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[1],
        next_scale=scales[2],
        aspect_ratios=aspect_ratios[1],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[1],
        this_offsets=offsets[1],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='deconv2_mbox_priorbox')(deconv2_mbox_loc)
    deconv3_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='deconv3_mbox_priorbox')(deconv3_mbox_loc)

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    deconv1_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='deconv1_mbox_conf_reshape')(deconv1_mbox_conf)
    deconv2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='deconv2_mbox_conf_reshape')(deconv2_mbox_conf)
    deconv3_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='deconv3_mbox_conf_reshape')(deconv3_mbox_conf)

    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    deconv1_mbox_loc_reshape = Reshape(
        (-1, 4), name='deconv1_mbox_loc_reshape')(deconv1_mbox_loc)
    deconv2_mbox_loc_reshape = Reshape(
        (-1, 4), name='deconv2_mbox_loc_reshape')(deconv2_mbox_loc)
    deconv3_mbox_loc_reshape = Reshape(
        (-1, 4), name='deconv3_mbox_loc_reshape')(deconv3_mbox_loc)

    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    deconv1_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='deconv1_mbox_priorbox_reshape')(deconv1_mbox_priorbox)
    deconv2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='deconv2_mbox_priorbox_reshape')(deconv2_mbox_priorbox)
    deconv3_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='deconv3_mbox_priorbox_reshape')(deconv3_mbox_priorbox)

    # Concatenate the predictions from the different layers
    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        deconv1_mbox_conf_reshape, deconv2_mbox_conf_reshape,
        deconv3_mbox_conf_reshape
    ])

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        deconv1_mbox_loc_reshape, deconv2_mbox_loc_reshape,
        deconv3_mbox_loc_reshape
    ])

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        deconv1_mbox_priorbox_reshape, deconv2_mbox_priorbox_reshape,
        deconv3_mbox_priorbox_reshape
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class and box predictions and the anchors to one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            deconv1_mbox_conf._keras_shape[1:3],
            deconv2_mbox_conf._keras_shape[1:3],
            deconv3_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
コード例 #10
0
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False):
    '''
    This model has 7 convolutional layers and 4 predictor layers; predictions are made from layers 4, 5, 6, and 7.

    Arguments:
        image_size (tuple): The input image size in the format `(height, width, channels)`.
        n_classes (int): The number of positive classes, e.g. 5.
        mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
            the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
            the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
            'inference' follows the exact procedure of the original Caffe implementation, while
            'inference_fast' uses a faster prediction decoding procedure.
        l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
        min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images.
        max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images. All scaling factors between the smallest and the
            largest will be linearly interpolated. Note that the second to last of the linearly interpolated
            scaling factors will actually be the scaling factor for the last predictor layer, while the last
            scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
            if `two_boxes_for_ar1` is `True`.
        scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
            This list must be one element longer than the number of predictor layers. The first `k` elements are the
            scaling factors for the `k` predictor layers, while the last element is used for the second box
            for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
            last scaling factor must be passed either way, even if it is not being used. If a list is passed,
            this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
        aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
            generated. This list is valid for all predictor layers. The original implementation uses more aspect ratios
            for some predictor layers and fewer for others. If you want to do that, too, then use the next argument instead.
        aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each predictor layer.
            This allows you to set the aspect ratios for each predictor layer individually. If a list is passed,
            it overrides `aspect_ratios_global`.
        two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
            If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
            using the scaling factor for the respective layer, the second one will be generated using the
            geometric mean of said scaling factor and the next bigger scaling factor.
        steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
            pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
            the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
            If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
            If no steps are provided, then they will be computed such that the anchor box center points will form an
            equidistant grid within the image dimensions.
        offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either floats or tuples of two floats. These numbers represent for each predictor layer how many
            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
            as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
            of the step size specified in the `steps` argument. If the list contains floats, then that value will
            be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
            `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size,
            which is also the recommended setting.
        clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
        variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
            its respective variance value.
        coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
            of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
            and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
            i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
        subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
            of any shape that is broadcast-compatible with the image shape. The elements of this array will be
            subtracted from the image pixel intensity values. For example, pass a list of three integers
            to perform per-channel mean normalization for color images.
        divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
            floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
            intensity values will be divided by the elements of this array. For example, pass a list
            of three integers to perform per-channel standard deviation normalization for color images.
        swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
            image channels should be swapped.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box's confidence score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
        return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
            a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
            you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
            to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the
            spatial dimensions of the predictor layers); for inference you don't need them.

    Returns:
        model: The Keras SSD model.
        predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
            of the output tensor shape for each convolutional predictor layer. During
            training, the generator function needs this in order to transform
            the ground truth labels into tensors of identical structure as the
            output tensors of the model, which is in turn needed for the cost
            function.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    n_predictor_layers = 4  # The number of predictor conv layers in the network
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:  # We need one variance value for each of the four box coordinates
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".format(
                len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
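        # Worked example: the default aspect_ratios_global=[0.5, 1.0, 2.0] with
        # two_boxes_for_ar1=True yields n_boxes = 3 + 1 = 4 boxes per cell on
        # every predictor layer.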

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # Add the network architecture here.
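
    # What follows is one hypothetical way to fill in that body (not the
    # author's code): 7 conv blocks with predictor heads on blocks 4-7, as the
    # docstring describes. It defines the `predictions` and
    # `classes4`..`classes7` tensors that the code below expects, using only
    # layer types that already appear in the surrounding examples.
    x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization, name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization, name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap, name='input_channel_swap')(x1)

    # Seven conv blocks; pooling halves the resolution after blocks 1-6, so
    # blocks 4-7 sit at strides 8, 16, 32 and 64 respectively.
    conv = x1
    feature_maps = []  # outputs of blocks 4-7, the four predictor inputs
    for block_id, filters in enumerate([32, 48, 64, 64, 48, 48, 32], start=1):
        conv = Conv2D(filters, (3, 3), padding='same', activation='relu',
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='conv{}'.format(block_id))(conv)
        if block_id >= 4:
            feature_maps.append(conv)
        if block_id < 7:
            conv = MaxPooling2D(pool_size=(2, 2), name='pool{}'.format(block_id))(conv)

    # One confidence head, one localization head and one anchor layer per
    # predictor feature map, followed by the usual reshape-and-concatenate.
    conf_layers, conf_reshaped, loc_reshaped, anchors_reshaped = [], [], [], []
    for i, fm in enumerate(feature_maps):
        lid = i + 4  # predictor layers are numbered 4-7
        conf = Conv2D(n_boxes[i] * n_classes, (3, 3), padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes{}'.format(lid))(fm)
        loc = Conv2D(n_boxes[i] * 4, (3, 3), padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='boxes{}'.format(lid))(fm)
        priors = AnchorBoxes(img_height, img_width, this_scale=scales[i],
                             next_scale=scales[i + 1],
                             aspect_ratios=aspect_ratios[i],
                             two_boxes_for_ar1=two_boxes_for_ar1,
                             this_steps=steps[i], this_offsets=offsets[i],
                             clip_boxes=clip_boxes, variances=variances,
                             coords=coords, normalize_coords=normalize_coords,
                             name='anchors{}'.format(lid))(loc)
        conf_layers.append(conf)
        conf_reshaped.append(Reshape((-1, n_classes), name='classes{}_reshape'.format(lid))(conf))
        loc_reshaped.append(Reshape((-1, 4), name='boxes{}_reshape'.format(lid))(loc))
        anchors_reshaped.append(Reshape((-1, 8), name='anchors{}_reshape'.format(lid))(priors))

    # The raw (4D) confidence heads are needed below for `predictor_sizes`.
    classes4, classes5, classes6, classes7 = conf_layers

    mbox_conf = Concatenate(axis=1, name='classes_concat')(conf_reshaped)
    mbox_loc = Concatenate(axis=1, name='boxes_concat')(loc_reshaped)
    mbox_priorbox = Concatenate(axis=1, name='anchors_concat')(anchors_reshaped)
    mbox_conf_softmax = Activation('softmax', name='classes_softmax')(mbox_conf)

    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])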

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
        predictor_sizes = np.array([
            classes4._keras_shape[1:3], classes5._keras_shape[1:3],
            classes6._keras_shape[1:3], classes7._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
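
# Hypothetical usage of `build_model` (illustrative values, not from the
# original source); with min_scale=0.1 and max_scale=0.9 the five anchor
# scales for the four predictor layers are interpolated automatically:
# model = build_model(image_size=(300, 480, 3), n_classes=5, mode='training',
#                     aspect_ratios_global=[0.5, 1.0, 2.0],
#                     two_boxes_for_ar1=True)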
コード例 #11
0
def MobileNetV2(input_shape,
                n_classes,
                mode='training',
                weights=None,
                input_tensor=None,
                min_scale=None,
                max_scale=None,
                scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05],
                aspect_ratios_global=None,
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0]],
                two_boxes_for_ar1=True,
                steps=[8, 16, 32, 64, 100, 300],
                offsets=None,
                clip_boxes=False,
                variances=[0.1, 0.1, 0.2, 0.2],
                coords='centroids',
                normalize_coords=True,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False,
                alpha=1.0,
                **kwargs):

    l2_reg = kwargs.get('l2_reg', 0.00005)
    # `alpha` is already an explicit parameter; reading it from kwargs here
    # would silently reset it to 1.0, so we keep the value as passed.
    preprocess = kwargs.get('preprocess', True)

    if not (weights is None or weights == 'imagenet' or os.path.exists(weights)):
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization), `imagenet` '
                         '(pre-training on ImageNet), '
                         'or the path to the weights file to be loaded.')

    # Determine proper input shape and default size.
    # If both input_shape and input_tensor are used, they should match

    if weights == 'imagenet' and alpha not in [0.35, 0.50, 0.75, 1.0, 1.3, 1.4]:
        raise ValueError('If imagenet weights are being loaded, '
                         'alpha can be one of `0.35`, `0.50`, `0.75`, '
                         '`1.0`, `1.3` or `1.4` only.')

    # prepare config
    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    img_height, img_width = input_shape[0], input_shape[1]

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    img_input = Input(shape=input_shape)
    if preprocess and mode != 'hardware':
        # Map pixel intensities from [0, 255] to roughly [-0.5, 0.5].
        x = Lambda(lambda z: z / 256. - 0.5)(img_input)
    else:
        x = img_input  # ensure `x` is defined for every mode/preprocess combination
    # 300
    x = _inverted_res_block(x,
                            filters=16,
                            alpha=alpha,
                            stride=1,
                            expansion=1,
                            block_id=0)
    # 150
    x = _inverted_res_block(x,
                            filters=24,
                            stride=2,
                            expansion=6,
                            block_id=1,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=24,
                            stride=1,
                            expansion=6,
                            block_id=2,
                            **kwargs)
    # 75
    x = _inverted_res_block(x,
                            filters=32,
                            stride=2,
                            expansion=6,
                            block_id=3,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=32,
                            stride=1,
                            expansion=6,
                            block_id=4,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=32,
                            stride=1,
                            expansion=6,
                            block_id=5,
                            **kwargs)
    # 38
    x = _inverted_res_block(x,
                            filters=64,
                            stride=2,
                            expansion=6,
                            block_id=6,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=64,
                            stride=1,
                            expansion=6,
                            block_id=7,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=64,
                            stride=1,
                            expansion=6,
                            block_id=8,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=64,
                            stride=1,
                            expansion=6,
                            block_id=9,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=96,
                            stride=1,
                            expansion=6,
                            block_id=10,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=96,
                            stride=1,
                            expansion=6,
                            block_id=11,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=96,
                            stride=1,
                            expansion=6,
                            block_id=12,
                            **kwargs)
    # 19
    conv0_pw = _inverted_res_block(x,
                                   filters=160,
                                   stride=2,
                                   expansion=6,
                                   block_id=13,
                                   **kwargs)
    x = _inverted_res_block(conv0_pw,
                            filters=160,
                            stride=1,
                            expansion=6,
                            block_id=14,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=160,
                            stride=1,
                            expansion=6,
                            block_id=15,
                            **kwargs)
    x = _inverted_res_block(x,
                            filters=320,
                            stride=1,
                            expansion=6,
                            block_id=16,
                            **kwargs)
    # 10

    # SSD lite
    x = Conv2D(1280,
               kernel_size=1,
               strides=(2, 2),
               use_bias=False,
               kernel_regularizer=l2(l2_reg),
               name='conv_ft_1')(x)
    x = BatchNormalization(epsilon=1e-3, momentum=0.999, name='bn_ft_1')(x)
    conv1_pw = ReLU(6., name='relu_ft_1')(x)
    # 5

    conv2_pw = LiteConv(conv1_pw, 2, 512)
    # 3
    conv3_pw = LiteConv(conv2_pw, 3, 256)
    # 2
    conv4_pw = LiteConv(conv3_pw, 4, 128)
    # 1
    conv5_pw = LiteConv(conv4_pw, 5, 128)

    conv0_mbox_conf = pred_cls(conv0_pw,
                               n_boxes[0] * n_classes,
                               name='conv0_mbox_conf')
    conv1_mbox_conf = pred_cls(conv1_pw,
                               n_boxes[1] * n_classes,
                               name='conv1_mbox_conf')
    conv2_mbox_conf = pred_cls(conv2_pw,
                               n_boxes[2] * n_classes,
                               name='conv2_mbox_conf')
    conv3_mbox_conf = pred_cls(conv3_pw,
                               n_boxes[3] * n_classes,
                               name='conv3_mbox_conf')
    conv4_mbox_conf = pred_cls(conv4_pw,
                               n_boxes[4] * n_classes,
                               name='conv4_mbox_conf')
    conv5_mbox_conf = pred_cls(conv5_pw,
                               n_boxes[5] * n_classes,
                               name='conv5_mbox_conf')
    # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv0_mbox_loc = pred_cls(conv0_pw, n_boxes[0] * 4, name='conv0_mbox_loc')
    conv1_mbox_loc = pred_cls(conv1_pw, n_boxes[1] * 4, name='conv1_mbox_loc')
    conv2_mbox_loc = pred_cls(conv2_pw, n_boxes[2] * 4, name='conv2_mbox_loc')
    conv3_mbox_loc = pred_cls(conv3_pw, n_boxes[3] * 4, name='conv3_mbox_loc')
    conv4_mbox_loc = pred_cls(conv4_pw, n_boxes[4] * 4, name='conv4_mbox_loc')
    conv5_mbox_loc = pred_cls(conv5_pw, n_boxes[5] * 4, name='conv5_mbox_loc')

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)

    # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
    conv0_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv0_mbox_priorbox')(conv0_mbox_loc)
    conv1_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[1],
        next_scale=scales[2],
        aspect_ratios=aspect_ratios[1],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[1],
        this_offsets=offsets[1],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv1_mbox_priorbox')(conv1_mbox_loc)
    conv2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv2_mbox_priorbox')(conv2_mbox_loc)
    conv3_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv3_mbox_priorbox')(conv3_mbox_loc)
    conv4_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv4_mbox_priorbox')(conv4_mbox_loc)
    conv5_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv5_mbox_priorbox')(conv5_mbox_loc)

    ### Reshape

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    conv0_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv0_mbox_conf_reshape')(conv0_mbox_conf)
    conv1_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv1_mbox_conf_reshape')(conv1_mbox_conf)
    conv2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv2_mbox_conf_reshape')(conv2_mbox_conf)
    conv3_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv3_mbox_conf_reshape')(conv3_mbox_conf)
    conv4_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv4_mbox_conf_reshape')(conv4_mbox_conf)
    conv5_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv5_mbox_conf_reshape')(conv5_mbox_conf)
    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    conv0_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv0_mbox_loc_reshape')(conv0_mbox_loc)
    conv1_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv1_mbox_loc_reshape')(conv1_mbox_loc)
    conv2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv2_mbox_loc_reshape')(conv2_mbox_loc)
    conv3_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv3_mbox_loc_reshape')(conv3_mbox_loc)
    conv4_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv4_mbox_loc_reshape')(conv4_mbox_loc)
    conv5_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv5_mbox_loc_reshape')(conv5_mbox_loc)
    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv0_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv0_mbox_priorbox_reshape')(conv0_mbox_priorbox)
    conv1_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv1_mbox_priorbox_reshape')(conv1_mbox_priorbox)
    conv2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv2_mbox_priorbox_reshape')(conv2_mbox_priorbox)
    conv3_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv3_mbox_priorbox_reshape')(conv3_mbox_priorbox)
    conv4_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv4_mbox_priorbox_reshape')(conv4_mbox_priorbox)
    conv5_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv5_mbox_priorbox_reshape')(conv5_mbox_priorbox)

    ### Concatenate the predictions from the different layers

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv0_mbox_conf_reshape, conv1_mbox_conf_reshape,
        conv2_mbox_conf_reshape, conv3_mbox_conf_reshape,
        conv4_mbox_conf_reshape, conv5_mbox_conf_reshape
    ])

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv0_mbox_loc_reshape, conv1_mbox_loc_reshape, conv2_mbox_loc_reshape,
        conv3_mbox_loc_reshape, conv4_mbox_loc_reshape, conv5_mbox_loc_reshape
    ])

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv0_mbox_priorbox_reshape, conv1_mbox_priorbox_reshape,
        conv2_mbox_priorbox_reshape, conv3_mbox_priorbox_reshape,
        conv4_mbox_priorbox_reshape, conv5_mbox_priorbox_reshape
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class and box predictions and the anchors to one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=img_input, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=img_input, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=img_input, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    # Load weights.
    if weights is not None:
        model.load_weights(weights)

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv0_mbox_conf._keras_shape[1:3],
            conv1_mbox_conf._keras_shape[1:3],
            conv2_mbox_conf._keras_shape[1:3],
            conv3_mbox_conf._keras_shape[1:3],
            conv4_mbox_conf._keras_shape[1:3],
            conv5_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
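
# Hypothetical usage of `MobileNetV2` (illustrative only; the default `scales`
# list already has the 7 entries required for the 6 predictor layers):
# ssd_lite = MobileNetV2(input_shape=(300, 300, 3), n_classes=20,
#                        mode='training', weights=None)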
コード例 #12
0
def ssd_300(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 100, 300],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):

    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(
                    n_predictor_layers, len(aspect_ratios_per_layer)))

    # Build the list of scales
    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(
                n_predictor_layers + 1, len(scales)))
    else:
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
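        # e.g. min_scale=0.2 and max_scale=0.9 with 6 predictor layers yield
        # 7 evenly spaced scales: [0.2, 0.317, 0.433, 0.55, 0.667, 0.783, 0.9].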

    if len(variances) != 4:
        raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError("All variances must be >0, but the variances given are {}".format(variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError("You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError("You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer (only needed for the anchor box layers).
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes predicted per cell for each predictor layer.
    # We need this to know how many channels the predictor layers must have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, the number of boxes is the same for every layer.
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define the functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack(
                [tensor[..., swap_channels[0]], tensor[..., swap_channels[1]], tensor[..., swap_channels[2]]], axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([tensor[..., swap_channels[0]], tensor[..., swap_channels[1]], tensor[..., swap_channels[2]],
                            tensor[..., swap_channels[3]]], axis=-1)

    ############################################################################
    # Step 1: Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    ############################################################################
    # Step 1.1: Compute the base network (VGG16)
    ############################################################################

    conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg), name='conv4_3')(x1)

    ############################################################################
    # Feed conv4_3 into the L2 normalization layer
    conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
    ############################################################################
    # Step 1.3: Compute the output class probability distribution for each default bounding box.
    ############################################################################

    ### Build the convolutional predictor layers on top of the base network
    # We predict confidence values for each box, so the confidence predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal',
                                    kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm)
    fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(x1)
    conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(x1)
    conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(x1)
    conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(x1)
    conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(x1)

    ############################################################################
    # Step 1.4: Compute the output offset parameters of the default bounding boxes for each cell of the feature maps.
    ############################################################################

    # We predict 4 coordinates for each box, so the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm)
    fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal',
                          kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(x1)
    conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(x1)
    conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(x1)
    conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(x1)
    conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(x1)

    ############################################################################
    # Step 1.5: Generate the anchor boxes that serve as the reference for predicting the offsets of the bounding boxes around the objects
    ############################################################################

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation)
    # Output shape of the anchors: `(batch, height, width, n_boxes, 8)`
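    # NOTE: the anchor layers below index scales[0]..scales[6] and
    # steps[0]..steps[5], i.e. six predictor layers, which is inconsistent
    # with the `n_predictor_layers = 4` validation at the top of this function;
    # as written, callers would need 7 scales and 6 steps/offsets here.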
    conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1],
                                             aspect_ratios=aspect_ratios[0],
                                             two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0],
                                             this_offsets=offsets[0], clip_boxes=clip_boxes,
                                             variances=variances, coords=coords, normalize_coords=normalize_coords,
                                             name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
    fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2],
                                    aspect_ratios=aspect_ratios[1],
                                    two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1],
                                    clip_boxes=clip_boxes,
                                    variances=variances, coords=coords, normalize_coords=normalize_coords,
                                    name='fc7_mbox_priorbox')(fc7_mbox_loc)
    conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3],
                                        aspect_ratios=aspect_ratios[2],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2],
                                        this_offsets=offsets[2], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords,
                                        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4],
                                        aspect_ratios=aspect_ratios[3],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3],
                                        this_offsets=offsets[3], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords,
                                        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5],
                                        aspect_ratios=aspect_ratios[4],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4],
                                        this_offsets=offsets[4], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords,
                                        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6],
                                        aspect_ratios=aspect_ratios[5],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[5],
                                        this_offsets=offsets[5], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords,
                                        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)

    ############################################################################
    # Step 2: Reshape the output tensors
    ############################################################################

    ############################################################################
    # Step 2.1: Reshape the class prediction outputs
    ############################################################################

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated on the last axis so that softmax can be computed over them.
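    # For example, the SSD300 conv4_3 feature map of 38 x 38 cells with 4 boxes
    # per cell would yield a tensor of shape (batch, 38*38*4, n_classes)
    # = (batch, 5776, n_classes).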
    conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(
        conv4_3_norm_mbox_conf)
    fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
    conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)

    ############################################################################
    # Step 2.2: Reshape the bounding box prediction outputs
    ############################################################################

    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the 4 box coordinates isolated on the last axis to compute the smooth L1 loss
    conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
    fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
    conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)

    ############################################################################
    # Step 2.3: Reshape the anchor box outputs
    ############################################################################

    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(
        conv4_3_norm_mbox_priorbox)
    fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
    conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)

    ############################################################################
    # Step 3: Concatenate the predictions from the different layers
    ############################################################################

    ############################################################################
    # Step 3.1: Concatenate the confidence outputs
    ############################################################################

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all predictor layers,
    # so we concatenate along axis 1, the number of boxes per layer.
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape,
                                                       fc7_mbox_conf_reshape,
                                                       conv6_2_mbox_conf_reshape,
                                                       conv7_2_mbox_conf_reshape,
                                                       conv8_2_mbox_conf_reshape,
                                                       conv9_2_mbox_conf_reshape])

    ############################################################################
    # Step 3.2: Concatenate the localization outputs
    ############################################################################

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape,
                                                     fc7_mbox_loc_reshape,
                                                     conv6_2_mbox_loc_reshape,
                                                     conv7_2_mbox_loc_reshape,
                                                     conv8_2_mbox_loc_reshape,
                                                     conv9_2_mbox_loc_reshape])
    ############################################################################
    # Step 3.3: Concatenate the anchor box outputs
    ############################################################################

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape,
                                                               fc7_mbox_priorbox_reshape,
                                                               conv6_2_mbox_priorbox_reshape,
                                                               conv7_2_mbox_priorbox_reshape,
                                                               conv8_2_mbox_priorbox_reshape,
                                                               conv9_2_mbox_priorbox_reshape])

    ############################################################################
    # Step 4: Compute the model output
    ############################################################################

    ############################################################################
    # Step 4.1: Apply a softmax activation to the class confidences
    ############################################################################

    # The box coordinate predictions will go into the loss function as they are,
    # but for the class predictions we first apply a softmax activation layer
    mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class predictions, box predictions and anchors into one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox])
    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
                                               iou_threshold=iou_threshold,
                                               top_k=top_k,
                                               nms_max_output_size=nms_max_output_size,
                                               coords=coords,
                                               normalize_coords=normalize_coords,
                                               img_height=img_height,
                                               img_width=img_width,
                                               name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh,
                                                   iou_threshold=iou_threshold,
                                                   top_k=top_k,
                                                   nms_max_output_size=nms_max_output_size,
                                                   coords=coords,
                                                   normalize_coords=normalize_coords,
                                                   img_height=img_height,
                                                   img_width=img_width,
                                                   name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3],
                                    fc7_mbox_conf._keras_shape[1:3],
                                    conv6_2_mbox_conf._keras_shape[1:3],
                                    conv7_2_mbox_conf._keras_shape[1:3],
                                    conv8_2_mbox_conf._keras_shape[1:3],
                                    conv9_2_mbox_conf._keras_shape[1:3]])
        return model, predictor_sizes
    else:
        return model
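
# --- Editor's note: a minimal decoding sketch (not part of the original
# example). It shows how the concatenated `predictions` tensor returned in
# 'training' mode, with the layout (batch, n_boxes_total, n_classes + 4 + 8)
# documented above, splits back into its parts. All sizes are illustrative.
import numpy as np

n_classes = 21                      # assumed: 20 object classes + background
batch, n_boxes_total = 2, 8732      # illustrative sizes
predictions = np.random.rand(batch, n_boxes_total, n_classes + 4 + 8)

mbox_conf = predictions[..., :n_classes]               # softmaxed class scores
mbox_loc = predictions[..., n_classes:n_classes + 4]   # predicted box offsets
mbox_priorbox = predictions[..., n_classes + 4:]       # anchor coords + variances

assert mbox_priorbox.shape == (batch, n_boxes_total, 8)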
コード例 #13
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False):
    """
    This model contains 7 convolutional layers, 4 of which are predictor layers (layers 4, 5, 6 and 7).
    :param image_size:
    :param n_classes:
    :param mode:
    :param l2_regularization:
    :param min_scale:
    :param max_scale:
    :param scales:
    :param aspect_ratios_global:
    :param aspect_ratios_per_layer:
    :param two_boxes_for_ar1:
    :param steps:
    :param offsets:
    :param clip_boxes:
    :param variances:
    :param coords:
    :param normalize_coords:
    :param subtract_mean:
    :param divide_by_stddev:
    :param swap_channels:
    :param confidence_thresh:
    :param iou_threshold:
    :param top_k:
    :param nms_max_output_size:
    :param return_predictor_sizes:
    :return:
    """
    # the number of predictor conv layers in the network
    n_predictor_layers = 4
    # account for the background class
    n_classes += 1
    # make the internal name shorter
    l2_reg = l2_regularization
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
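        # For example, min_scale=0.1, max_scale=0.9 and n_predictor_layers=4
        # give np.linspace(0.1, 0.9, 5) -> [0.1, 0.3, 0.5, 0.7, 0.9]: one scale
        # per predictor layer plus one extra, used as the `next_scale` of the
        # last layer.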

    if len(variances) != 4:  # We need one variance value for each of the four box coordinates
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) & two_boxes_for_ar1:
                n_boxes.append(len(ar) +
                               1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) & two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
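    # Example: aspect_ratios_global=[0.5, 1.0, 2.0] with two_boxes_for_ar1=True
    # yields n_boxes = 4 anchor boxes per cell for every predictor layer
    # (one per aspect ratio plus the extra box for aspect ratio 1).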

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)
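    # Example: with swap_channels=[2, 1, 0] an RGB input tensor is reordered
    # to BGR, e.g. to match weights ported from Caffe models.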

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # Add the network structure here
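    # NOTE: the architecture body is elided in this example; the layers that
    # produce `predictions` (and the `classes4`..`classes7` predictor outputs
    # referenced below) would need to be defined before the model can be built.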

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
        predictor_sizes = np.array([
            classes4._keras_shape[1:3], classes5._keras_shape[1:3],
            classes6._keras_shape[1:3], classes7._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
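
# --- Editor's note: a small numpy sketch (not part of the original example)
# of the standard SSD 'centroids' decoding that the comments in the next
# example describe:
#   cx = cx_pred * var_cx * w_anchor + cx_anchor
#   cy = cy_pred * var_cy * h_anchor + cy_anchor
#   w  = exp(w_pred * var_w) * w_anchor
#   h  = exp(h_pred * var_h) * h_anchor
# All numbers are illustrative.
import numpy as np

cx_pred, cy_pred, w_pred, h_pred = 0.2, -0.1, 0.05, -0.05   # raw network offsets
cx_a, cy_a, w_a, h_a = 150.0, 150.0, 60.0, 120.0            # anchor box (centroids)
var_cx, var_cy, var_w, var_h = 0.1, 0.1, 0.2, 0.2           # variances

cx = cx_pred * var_cx * w_a + cx_a       # 151.2
cy = cy_pred * var_cy * h_a + cy_a       # 148.8
w = np.exp(w_pred * var_w) * w_a         # ~60.6
h = np.exp(h_pred * var_h) * h_a         # ~118.8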
コード例 #14
def ssd_300(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 100, 300],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):

    bn_axis = 3
    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
    EARTH_RADIUS = tf.constant(6371000,
                               tf.float32)  # Radius in meters of Earth
    GOOGLE_CAR_CAMERA_HEIGHT = tf.cast(
        3, tf.float32
    )  # ballpark estimate of the number of meters that camera is off the ground
    MATH_PI = tf.cast(math.pi, tf.float32)
    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) & two_boxes_for_ar1:
                n_boxes.append(len(ar) +
                               1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) & two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################
    def identity_block(input_tensor, kernel_size, filters, stage, block):
        '''The identity_block is the block that has no conv layer at shortcut
        # Arguments
            input_tensor: input tensor
            kernel_size: default 3, the kernel size of the middle conv layer on the main path
            filters: list of integers, the numbers of filters of the 3 conv layers on the main path
            stage: integer, current stage label, used for generating layer names
            block: 'a','b'..., current block label, used for generating layer names
        '''
        nb_filter1, nb_filter2, nb_filter3 = filters
        bn_axis = 3

        conv_name_base = 'res' + str(stage) + block + '_branch'
        bn_name_base = 'bn' + str(stage) + block + '_branch'

        x = Conv2D(nb_filter1, (1, 1),
                   name=conv_name_base + '2a')(input_tensor)
        x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
        x = Activation('relu')(x)

        x = Conv2D(nb_filter2, (kernel_size, kernel_size),
                   padding='same',
                   name=conv_name_base + '2b')(x)
        x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
        x = Activation('relu')(x)

        x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c')(x)
        x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)

        x = Add()([x, input_tensor])
        x = Activation('relu')(x)
        return x

    def conv_block(input_tensor,
                   kernel_size,
                   filters,
                   stage,
                   block,
                   strides=(2, 2)):
        '''conv_block is the block that has a conv layer at shortcut
        # Arguments
            input_tensor: input tensor
            kernel_size: default 3, the kernel size of the middle conv layer on the main path
            filters: list of integers, the numbers of filters of the 3 conv layers on the main path
            stage: integer, current stage label, used for generating layer names
            block: 'a','b'..., current block label, used for generating layer names
        Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
        And the shortcut should have subsample=(2,2) as well
        '''
        nb_filter1, nb_filter2, nb_filter3 = filters
        if K.image_dim_ordering() == 'tf':
            bn_axis = 3
        else:
            bn_axis = 1
        conv_name_base = 'res' + str(stage) + block + '_branch'
        bn_name_base = 'bn' + str(stage) + block + '_branch'

        x = Conv2D(nb_filter1, (1, 1),
                   strides=strides,
                   name=conv_name_base + '2a')(input_tensor)
        x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
        x = Activation('relu')(x)

        x = Conv2D(nb_filter2, (kernel_size, kernel_size),
                   padding='same',
                   name=conv_name_base + '2b')(x)
        x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
        x = Activation('relu')(x)

        x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c')(x)
        x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)

        shortcut = Conv2D(nb_filter3, (1, 1),
                          strides=strides,
                          name=conv_name_base + '1')(input_tensor)
        shortcut = BatchNormalization(axis=bn_axis,
                                      name=bn_name_base + '1')(shortcut)

        x = Add()([x, shortcut])
        x = Activation('relu')(x)
        return x

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    def _atan2(y, x):
        """ My implementation of atan2 in tensorflow.  Returns in -pi .. pi."""
        tan = tf.atan(y / (x + 1e-8))  # this returns in -pi/2 .. pi/2

        one_map = tf.ones_like(tan)

        # correct quadrant error
        correction = tf.where(tf.less(x + 1e-8, 0.0),
                              3.141592653589793 * one_map, 0.0 * one_map)
        tan_c = tan + correction  # this returns in -pi/2 .. 3pi/2

        # bring to positive values
        correction = tf.where(tf.less(tan_c, 0.0),
                              2 * 3.141592653589793 * one_map, 0.0 * one_map)
        tan_zero_2pi = tan_c + correction  # this returns in 0 .. 2pi

        # make symmetric
        correction = tf.where(tf.greater(tan_zero_2pi, 3.141592653589793),
                              -2 * 3.141592653589793 * one_map, 0.0 * one_map)
        tan_final = tan_zero_2pi + correction  # this returns in -pi .. pi
        return tan_final
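    # Note: newer TensorFlow releases provide tf.atan2(y, x) natively, which
    # could replace the manual quadrant correction above.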

    def world_coordinates_to_streetview_pixel(lat,
                                              lng,
                                              lat1,
                                              lng1,
                                              yaw,
                                              image_width,
                                              image_height,
                                              height=0,
                                              zoom=None,
                                              object_dims=None,
                                              method=None):
        image_height = tf.constant(300, dtype=tf.float32)
        image_width = tf.constant(600, dtype=tf.float32)
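        # NOTE: the image_height/image_width arguments above are overridden by
        # these hard-coded 300x600 panorama dimensions used throughout this example.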

        EARTH_RADIUS = tf.cast(6371000,
                               tf.float32)  # Radius in meters of Earth
        GOOGLE_CAR_CAMERA_HEIGHT = tf.cast(
            3, tf.float32
        )  # ballpark estimate of the number of meters that camera is off the ground
        MATH_PI = tf.cast(math.pi, tf.float32)
        pitch = tf.constant(0, dtype=tf.float32)
        dx1 = tf.cos((lat1) * (MATH_PI / 180))
        dx11 = lng - lng1
        dxr = tf.sin(dx11 * (MATH_PI / 180))
        dx = dx1 * dxr

        dy11 = tf.subtract(lat, lat1)
        dyr = tf.multiply(dy11, (MATH_PI / 180))
        dy = tf.sin(dyr)
        look_at_angle = MATH_PI + _atan2(dx, dy) - yaw

        i = 2 * MATH_PI

        c = lambda x: tf.reduce_any(tf.greater(x, i))
        b = lambda x: tf.subtract(
            x,
            tf.cast(tf.greater(x, i), tf.float32) * (2 * MATH_PI))
        look_at_angle = tf.while_loop(c, b, [look_at_angle])

        t = lambda x: tf.reduce_any(tf.less(x, 0))
        d = lambda x: tf.add(
            x,
            tf.cast(tf.less(x, 0), tf.float32) * (2 * MATH_PI))
        look_at_angle = tf.while_loop(t, d, [look_at_angle])

        inner = dx * dx + dy * dy
        z = tf.multiply(tf.sqrt(tf.add(inner, 1e-10)),
                        tf.constant(6371000, tf.float32))
        # z = tf.where(tf.is_nan(z), tf.zeros_like(z), z)

        camhei_ = tf.fill(tf.shape(z), -GOOGLE_CAR_CAMERA_HEIGHT)

        x_ = tf.divide(tf.multiply(image_width, look_at_angle), (2 * MATH_PI))

        y_0 = tf.divide(image_height, tf.constant(2.0, dtype=tf.float32))
        y_1 = tf.multiply(image_height, tf.subtract(_atan2(camhei_, z), pitch))
        y_2 = tf.divide(y_1, MATH_PI)
        y_ = tf.subtract(y_0, y_2)

        return x_, y_

    def streetview_pixel_to_world_coordinates(lat1, lng1, yaw, image_width,
                                              image_height, x, y):
        EARTH_RADIUS = tf.cast(6371000,
                               tf.float32)  # Radius in meters of Earth
        GOOGLE_CAR_CAMERA_HEIGHT = tf.cast(
            3, tf.float32
        )  # ballpark estimate of the number of meters that camera is off the ground
        MATH_PI = tf.cast(math.pi, tf.float32)
        pitch = float(0)
        look_at_angle = x * (2 * math.pi) / image_width
        height = 0
        tilt_angle = (image_height / 2 - y) * math.pi / image_height + pitch
        tilt_angle = tf.cast(tilt_angle, tf.float32)
        z_ = K.minimum(np.float32(-1e-2), tilt_angle)
        z = tf.divide((-GOOGLE_CAR_CAMERA_HEIGHT), tf.tan(z_))
        dx = tf.sin(look_at_angle - MATH_PI + yaw) * z / EARTH_RADIUS
        dy = tf.cos(look_at_angle - MATH_PI + yaw) * z / EARTH_RADIUS
        lat = lat1 + tf.asin(dy) * (180 / MATH_PI)
        lng = lng1 + tf.asin(dx / tf.cos(lat1 *
                                         (MATH_PI / 180))) * (180 / MATH_PI)
        return lat, lng
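    # Note: this helper is (approximately) the inverse of
    # `world_coordinates_to_streetview_pixel` above: pixel -> (lat, lng) here,
    # (lat, lng) -> pixel there, under the same camera-height assumptions.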

    def zeroer(inp):
        z = K.ones_like(inp)
        return z

    def projector(y_input):
        y_in = y_input[:, :, :14]
        y_geo_1 = y_input[:, :, 14:17]
        y_geo_2 = y_input[:, :, 17:]

        cx = y_in[..., -12] * y_in[..., -4] * y_in[..., -6] + y_in[..., -8]  # cx = cx_pred * cx_variance * w_anchor + cx_anchor
        cy = y_in[..., -11] * y_in[..., -3] * y_in[..., -5] + y_in[..., -7]  # cy = cy_pred * cy_variance * h_anchor + cy_anchor
        w = tf.exp(y_in[..., -10] * y_in[..., -2]) * y_in[..., -6]  # w = exp(w_pred * variance_w) * w_anchor
        h = tf.exp(y_in[..., -9] * y_in[..., -1]) * y_in[..., -5]  # h = exp(h_pred * variance_h) * h_anchor

        # NOTE: the decoded w and h above are immediately overwritten here with
        # the raw products w_pred * variance_w and h_pred * variance_h (no exp,
        # no anchor scaling), as in the original example.
        w = y_in[..., -10] * y_in[..., -2]
        h = y_in[..., -9] * y_in[..., -1]
        cx = tf.where(tf.is_nan(cx), tf.ones_like(cx), cx) * 1e-8
        cy = tf.where(tf.is_nan(cy), tf.ones_like(cy), cy) * 1e-8
        w = tf.where(tf.is_nan(w), tf.ones_like(w), w) * 1e-8
        h = tf.where(tf.is_nan(h), tf.ones_like(h), h) * 1e-8
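        # NOTE: the `* 1e-8` above scales every value, not only the NaN
        # replacements; to replace just the NaNs, the multiplication would have
        # to happen inside `tf.where`. Kept as in the original example.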

        cx = tf.expand_dims(cx, axis=-1)
        cy = tf.expand_dims(cy, axis=-1)
        w = tf.expand_dims(w, axis=-1)
        h = tf.expand_dims(h, axis=-1)

        tensor = Concatenate(axis=-1, name='y_proj')([cx, cy, w, h])
        ind = 0
        xmin = tensor[..., ind] - tensor[..., ind + 2] / 2.0  # Set xmin
        ymin = tensor[..., ind + 1] - tensor[..., ind + 3] / 2.0  # Set ymin
        xmax = tensor[..., ind] + tensor[..., ind + 2] / 2.0  # Set xmax
        ymax = tensor[..., ind + 1] + tensor[..., ind + 3] / 2.0  # Set ymax

        normalize_coords = True
        tf_img_height = tf.constant(300, dtype=tf.float32, name='img_height')
        tf_img_width = tf.constant(600, dtype=tf.float32, name='img_width')
        tf_normalize_coords = tf.constant(normalize_coords,
                                          name='normalize_coords')

        def normalized_coords():
            xmin1 = tf.expand_dims(xmin * tf_img_width, axis=-1)
            ymin1 = tf.expand_dims(ymin * tf_img_height, axis=-1)
            xmax1 = tf.expand_dims(xmax * tf_img_width, axis=-1)
            ymax1 = tf.expand_dims(ymax * tf_img_height, axis=-1)
            return xmin1, ymin1, xmax1, ymax1

        def non_normalized_coords():
            return (tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1),
                    tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1))

        xmin, ymin, xmax, ymax = tf.cond(tf_normalize_coords,
                                         normalized_coords,
                                         non_normalized_coords)

        x = xmax - xmin
        x = x / 2
        x = xmin + x
        y = ymax

        lat, lng = streetview_pixel_to_world_coordinates(
            y_geo_1[:, 0, 1][0], y_geo_1[:, 0, 2][0], y_geo_1[:, 0, 0][0],
            600, 300, x, y)
        x_, y_ = world_coordinates_to_streetview_pixel(
            lat, lng, y_geo_2[:, 0, 1][0], y_geo_2[:, 0, 2][0],
            y_geo_2[:, 0, 0][0], 600, 300)

        x_h = (xmax - xmin) / 2.0
        y_h = (ymax - ymin) / 2.0

        xmin_ = x_ - x_h
        ymin_ = y_ - 2 * y_h
        xmax_ = x_ + x_h
        ymax_ = y_ + y_h

        xmin_ = tf.where(tf.is_nan(xmin_), tf.ones_like(xmin_), xmin_) * 1e-8
        ymin_ = tf.where(tf.is_nan(ymin_), tf.ones_like(ymin_), ymin_) * 1e-8
        xmax_ = tf.where(tf.is_nan(xmax_), tf.ones_like(xmax_), xmax_) * 1e-8
        ymax_ = tf.where(tf.is_nan(ymax_), tf.ones_like(ymax_), ymax_) * 1e-8

        cx_ = tf.divide(tf.add(xmin_, xmax_), tf.constant(2.0,
                                                          dtype=tf.float32))
        cy_ = tf.divide(tf.add(ymin_, ymax_), tf.constant(2.0,
                                                          dtype=tf.float32))
        w_ = tf.subtract(xmax_, xmin_)
        h_ = tf.subtract(ymax_, ymin_)

        cx_ = tf.where(tf.is_nan(cx_), tf.ones_like(cx_), cx_) * 1e-8
        cy_ = tf.where(tf.is_nan(cy_), tf.ones_like(cy_), cy_) * 1e-8
        w_ = tf.where(tf.is_nan(w_), tf.ones_like(w_), w_) * 1e-8
        h_ = tf.where(tf.is_nan(h_), tf.ones_like(h_), h_) * 1e-8

        y_out = tf.concat([cx_ / 600, cy_ / 300, w_ / 600, h_ / 300], -1)

        return Concatenate(axis=2, name='projected')([y_out, lat, lng])

    ############################################################################
    # Build the network.
    ############################################################################

    def ssdmod(x, geo_1, geo_2, suf):
        x1 = Lambda(identity_layer,
                    output_shape=(img_height, img_width, img_channels),
                    name='identity_layer' + '_' + suf)(x)
        if not (subtract_mean is None):
            x1 = Lambda(input_mean_normalization,
                        output_shape=(img_height, img_width, img_channels),
                        name='input_mean_normalization' + '_' + suf)(x1)
        if not (divide_by_stddev is None):
            x1 = Lambda(input_stddev_normalization,
                        output_shape=(img_height, img_width, img_channels),
                        name='input_stddev_normalization' + '_' + suf)(x1)
        if swap_channels:
            x1 = Lambda(input_channel_swap,
                        output_shape=(img_height, img_width, img_channels),
                        name='input_channel_swap' + '_' + suf)(x1)
        conv1_1 = Conv2D(64, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv1_1' + '_' + suf)(x1)
        conv1_2 = Conv2D(64, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv1_2' + '_' + suf)(conv1_1)
        pool1 = MaxPooling2D(pool_size=(2, 2),
                             strides=(2, 2),
                             padding='same',
                             name='pool1' + '_' + suf)(conv1_2)

        conv2_1 = Conv2D(128, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv2_1' + '_' + suf)(pool1)
        conv2_2 = Conv2D(128, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv2_2' + '_' + suf)(conv2_1)
        pool2 = MaxPooling2D(pool_size=(2, 2),
                             strides=(2, 2),
                             padding='same',
                             name='pool2' + '_' + suf)(conv2_2)

        conv3_1 = Conv2D(256, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv3_1' + '_' + suf)(pool2)
        conv3_2 = Conv2D(256, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv3_2' + '_' + suf)(conv3_1)
        conv3_3 = Conv2D(256, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv3_3' + '_' + suf)(conv3_2)
        pool3 = MaxPooling2D(pool_size=(2, 2),
                             strides=(2, 2),
                             padding='same',
                             name='pool3' + '_' + suf)(conv3_3)

        conv4_1 = Conv2D(512, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv4_1' + '_' + suf)(pool3)
        conv4_2 = Conv2D(512, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv4_2' + '_' + suf)(conv4_1)
        conv4_3 = Conv2D(512, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv4_3' + '_' + suf)(conv4_2)
        pool4 = MaxPooling2D(pool_size=(2, 2),
                             strides=(2, 2),
                             padding='same',
                             name='pool4' + '_' + suf)(conv4_3)

        conv5_1 = Conv2D(512, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv5_1' + '_' + suf)(pool4)
        conv5_2 = Conv2D(512, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv5_2' + '_' + suf)(conv5_1)
        conv5_3 = Conv2D(512, (3, 3),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv5_3' + '_' + suf)(conv5_2)
        pool5 = MaxPooling2D(pool_size=(3, 3),
                             strides=(1, 1),
                             padding='same',
                             name='pool5' + '_' + suf)(conv5_3)

        fc6 = Conv2D(1024, (3, 3),
                     dilation_rate=(6, 6),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='fc6' + '_' + suf)(pool5)

        fc7 = Conv2D(1024, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='fc7' + '_' + suf)(fc6)

        conv6_1 = Conv2D(256, (1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv6_1' + '_' + suf)(fc7)
        conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                                name='conv6adding' + '_' + suf)(conv6_1)
        conv6_2 = Conv2D(512, (3, 3),
                         strides=(2, 2),
                         activation='relu',
                         padding='valid',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv6_2' + '_' + suf)(conv6_1)

        conv7_1 = Conv2D(128, (1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv7_1' + '_' + suf)(conv6_2)
        conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                                name='conv7adding' + '_' + suf)(conv7_1)
        conv7_2 = Conv2D(256, (3, 3),
                         strides=(2, 2),
                         activation='relu',
                         padding='valid',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv7_2' + '_' + suf)(conv7_1)

        conv8_1 = Conv2D(128, (1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv8_1' + '_' + suf)(conv7_2)
        conv8_2 = Conv2D(256, (3, 3),
                         strides=(1, 1),
                         activation='relu',
                         padding='valid',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv8_2' + '_' + suf)(conv8_1)

        conv9_1 = Conv2D(128, (1, 1),
                         activation='relu',
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv9_1' + '_' + suf)(conv8_2)
        conv9_2 = Conv2D(256, (3, 3),
                         strides=(1, 1),
                         activation='relu',
                         padding='valid',
                         kernel_initializer='he_normal',
                         kernel_regularizer=l2(l2_reg),
                         name='conv9_2' + '_' + suf)(conv9_1)

        # Feed conv4_3 into the L2 normalization layer
        conv4_3_norm = L2Normalization(gamma_init=20,
                                       name='conv4_3_norm' + '_' +
                                       suf)(conv4_3)

        ### Build the convolutional predictor layers on top of the base network

        # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
        # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
        conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3),
                                        padding='same',
                                        kernel_initializer='he_normal',
                                        kernel_regularizer=l2(l2_reg),
                                        name='conv4_3_norm_mbox_conf' + '_' +
                                        suf)(conv4_3_norm)
        fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='fc7_mbox_conf' + '_' + suf)(fc7)
        conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv6_2_mbox_conf' + '_' +
                                   suf)(conv6_2)
        conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv7_2_mbox_conf' + '_' +
                                   suf)(conv7_2)
        conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv8_2_mbox_conf' + '_' +
                                   suf)(conv8_2)
        conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv9_2_mbox_conf' + '_' +
                                   suf)(conv9_2)
        # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
        # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`

        conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                                       padding='same',
                                       kernel_initializer='he_normal',
                                       kernel_regularizer=l2(l2_reg),
                                       name='conv4_3_norm_mbox_loc' + '_' +
                                       suf)(conv4_3_norm)
        fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='fc7_mbox_loc' + '_' + suf)(fc7)
        conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                                  padding='same',
                                  kernel_initializer='he_normal',
                                  kernel_regularizer=l2(l2_reg),
                                  name='conv6_2_mbox_loc' + '_' + suf)(conv6_2)
        conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                                  padding='same',
                                  kernel_initializer='he_normal',
                                  kernel_regularizer=l2(l2_reg),
                                  name='conv7_2_mbox_loc' + '_' + suf)(conv7_2)
        conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                                  padding='same',
                                  kernel_initializer='he_normal',
                                  kernel_regularizer=l2(l2_reg),
                                  name='conv8_2_mbox_loc' + '_' + suf)(conv8_2)
        conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                                  padding='same',
                                  kernel_initializer='he_normal',
                                  kernel_regularizer=l2(l2_reg),
                                  name='conv9_2_mbox_loc' + '_' + suf)(conv9_2)

        ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)

        # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
        conv4_3_norm_mbox_priorbox = AnchorBoxes(
            img_height,
            img_width,
            this_scale=scales[0],
            next_scale=scales[1],
            aspect_ratios=aspect_ratios[0],
            two_boxes_for_ar1=two_boxes_for_ar1,
            this_steps=steps[0],
            this_offsets=offsets[0],
            clip_boxes=clip_boxes,
            variances=variances,
            coords=coords,
            normalize_coords=normalize_coords,
            name='conv4_3_norm_mbox_priorbox' + '_' +
            suf)(conv4_3_norm_mbox_loc)
        fc7_mbox_priorbox = AnchorBoxes(img_height,
                                        img_width,
                                        this_scale=scales[1],
                                        next_scale=scales[2],
                                        aspect_ratios=aspect_ratios[1],
                                        two_boxes_for_ar1=two_boxes_for_ar1,
                                        this_steps=steps[1],
                                        this_offsets=offsets[1],
                                        clip_boxes=clip_boxes,
                                        variances=variances,
                                        coords=coords,
                                        normalize_coords=normalize_coords,
                                        name='fc7_mbox_priorbox' + '_' +
                                        suf)(fc7_mbox_loc)
        conv6_2_mbox_priorbox = AnchorBoxes(
            img_height,
            img_width,
            this_scale=scales[2],
            next_scale=scales[3],
            aspect_ratios=aspect_ratios[2],
            two_boxes_for_ar1=two_boxes_for_ar1,
            this_steps=steps[2],
            this_offsets=offsets[2],
            clip_boxes=clip_boxes,
            variances=variances,
            coords=coords,
            normalize_coords=normalize_coords,
            name='conv6_2_mbox_priorbox' + '_' + suf)(conv6_2_mbox_loc)
        conv7_2_mbox_priorbox = AnchorBoxes(
            img_height,
            img_width,
            this_scale=scales[3],
            next_scale=scales[4],
            aspect_ratios=aspect_ratios[3],
            two_boxes_for_ar1=two_boxes_for_ar1,
            this_steps=steps[3],
            this_offsets=offsets[3],
            clip_boxes=clip_boxes,
            variances=variances,
            coords=coords,
            normalize_coords=normalize_coords,
            name='conv7_2_mbox_priorbox' + '_' + suf)(conv7_2_mbox_loc)
        conv8_2_mbox_priorbox = AnchorBoxes(
            img_height,
            img_width,
            this_scale=scales[4],
            next_scale=scales[5],
            aspect_ratios=aspect_ratios[4],
            two_boxes_for_ar1=two_boxes_for_ar1,
            this_steps=steps[4],
            this_offsets=offsets[4],
            clip_boxes=clip_boxes,
            variances=variances,
            coords=coords,
            normalize_coords=normalize_coords,
            name='conv8_2_mbox_priorbox' + '_' + suf)(conv8_2_mbox_loc)
        conv9_2_mbox_priorbox = AnchorBoxes(
            img_height,
            img_width,
            this_scale=scales[5],
            next_scale=scales[6],
            aspect_ratios=aspect_ratios[5],
            two_boxes_for_ar1=two_boxes_for_ar1,
            this_steps=steps[5],
            this_offsets=offsets[5],
            clip_boxes=clip_boxes,
            variances=variances,
            coords=coords,
            normalize_coords=normalize_coords,
            name='conv9_2_mbox_priorbox' + '_' + suf)(conv9_2_mbox_loc)

        ### Reshape

        # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
        # We want the classes isolated in the last axis to perform softmax on them
        conv4_3_norm_mbox_conf_reshape = Reshape(
            (-1, n_classes), name='conv4_3_norm_mbox_conf_reshape' + '_' +
            suf)(conv4_3_norm_mbox_conf)
        fc7_mbox_conf_reshape = Reshape(
            (-1, n_classes),
            name='fc7_mbox_conf_reshape' + '_' + suf)(fc7_mbox_conf)
        conv6_2_mbox_conf_reshape = Reshape(
            (-1, n_classes),
            name='conv6_2_mbox_conf_reshape' + '_' + suf)(conv6_2_mbox_conf)
        conv7_2_mbox_conf_reshape = Reshape(
            (-1, n_classes),
            name='conv7_2_mbox_conf_reshape' + '_' + suf)(conv7_2_mbox_conf)
        conv8_2_mbox_conf_reshape = Reshape(
            (-1, n_classes),
            name='conv8_2_mbox_conf_reshape' + '_' + suf)(conv8_2_mbox_conf)
        conv9_2_mbox_conf_reshape = Reshape(
            (-1, n_classes),
            name='conv9_2_mbox_conf_reshape' + '_' + suf)(conv9_2_mbox_conf)
        # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
        # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
        conv4_3_norm_mbox_loc_reshape = Reshape(
            (-1, 4), name='conv4_3_norm_mbox_loc_reshape' + '_' +
            suf)(conv4_3_norm_mbox_loc)
        fc7_mbox_loc_reshape = Reshape(
            (-1, 4), name='fc7_mbox_loc_reshape' + '_' + suf)(fc7_mbox_loc)
        conv6_2_mbox_loc_reshape = Reshape(
            (-1, 4),
            name='conv6_2_mbox_loc_reshape' + '_' + suf)(conv6_2_mbox_loc)
        conv7_2_mbox_loc_reshape = Reshape(
            (-1, 4),
            name='conv7_2_mbox_loc_reshape' + '_' + suf)(conv7_2_mbox_loc)
        conv8_2_mbox_loc_reshape = Reshape(
            (-1, 4),
            name='conv8_2_mbox_loc_reshape' + '_' + suf)(conv8_2_mbox_loc)
        conv9_2_mbox_loc_reshape = Reshape(
            (-1, 4),
            name='conv9_2_mbox_loc_reshape' + '_' + suf)(conv9_2_mbox_loc)
        # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
        conv4_3_norm_mbox_priorbox_reshape = Reshape(
            (-1, 8), name='conv4_3_norm_mbox_priorbox_reshape' + '_' +
            suf)(conv4_3_norm_mbox_priorbox)
        fc7_mbox_priorbox_reshape = Reshape(
            (-1, 8),
            name='fc7_mbox_priorbox_reshape' + '_' + suf)(fc7_mbox_priorbox)
        conv6_2_mbox_priorbox_reshape = Reshape(
            (-1, 8), name='conv6_2_mbox_priorbox_reshape' + '_' +
            suf)(conv6_2_mbox_priorbox)
        conv7_2_mbox_priorbox_reshape = Reshape(
            (-1, 8), name='conv7_2_mbox_priorbox_reshape' + '_' +
            suf)(conv7_2_mbox_priorbox)
        conv8_2_mbox_priorbox_reshape = Reshape(
            (-1, 8), name='conv8_2_mbox_priorbox_reshape' + '_' +
            suf)(conv8_2_mbox_priorbox)
        conv9_2_mbox_priorbox_reshape = Reshape(
            (-1, 8), name='conv9_2_mbox_priorbox_reshape' + '_' +
            suf)(conv9_2_mbox_priorbox)

        mbox_conf = Concatenate(axis=1, name='mbox_conf' + '_' + suf)([
            conv4_3_norm_mbox_conf_reshape, fc7_mbox_conf_reshape,
            conv6_2_mbox_conf_reshape, conv7_2_mbox_conf_reshape,
            conv8_2_mbox_conf_reshape, conv9_2_mbox_conf_reshape
        ])

        # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
        mbox_loc = Concatenate(axis=1, name='mbox_loc' + '_' + suf)([
            conv4_3_norm_mbox_loc_reshape, fc7_mbox_loc_reshape,
            conv6_2_mbox_loc_reshape, conv7_2_mbox_loc_reshape,
            conv8_2_mbox_loc_reshape, conv9_2_mbox_loc_reshape
        ])

        # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
        mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox' + '_' + suf)([
            conv4_3_norm_mbox_priorbox_reshape, fc7_mbox_priorbox_reshape,
            conv6_2_mbox_priorbox_reshape, conv7_2_mbox_priorbox_reshape,
            conv8_2_mbox_priorbox_reshape, conv9_2_mbox_priorbox_reshape
        ])
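
        # Size sanity check: `n_boxes_total` is the sum over the six predictor
        # maps of height * width * n_boxes. With classic SSD300 map sizes
        # (38, 19, 10, 5, 3, 1 squared) and n_boxes = [4, 6, 6, 6, 4, 4],
        # that is 5776 + 2166 + 600 + 150 + 36 + 4 = 8732 boxes per image.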

        ### Build the sub-model that returns the concatenated predictions
        model = Model(inputs=[x, geo_1, geo_2],
                      outputs=[mbox_conf, mbox_loc, mbox_priorbox])

        return model

    def proj_net(inputt, branch):
        mbox_proj = Dense(16, kernel_initializer='normal',
                          activation='relu')(inputt)
        mbox_proj = Dense(4, kernel_initializer='normal',
                          activation='relu')(mbox_proj)
        return mbox_proj

    def proj_net1(inputt, branch):
        # Variant of `proj_net` kept for experimentation; apart from the
        # commented-out Conv2D path it is identical, and the `branch`
        # argument is unused in both helpers.
        # inputt = tf.keras.backend.expand_dims(inputt,axis=-1)
        # mbox_proj = Conv2D(128, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg))(inputt)
        mbox_proj = Dense(16, kernel_initializer='normal',
                          activation='relu')(inputt)
        mbox_proj = Dense(4, kernel_initializer='normal',
                          activation='relu')(mbox_proj)
        return mbox_proj

    def geo_regression(coords, branch):
        geo = Dense(16, kernel_initializer='normal', activation='relu')(coords)
        geo = Dense(8, kernel_initializer='normal', activation='relu')(geo)
        geo = Dense(2,
                    kernel_initializer='normal',
                    activation='relu',
                    name="geo_" + branch)(geo)
        return geo
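
    # `geo_regression` above is a small per-box MLP head (Dense 16 -> 8 -> 2,
    # all ReLU) that regresses two geographic coordinates for the given branch.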

    def crop(start):
        # Crops (or slices) a Tensor on a given dimension from start to end
        # example : to crop tensor x[:, :, 5:10]
        # call slice(2, 5, 10) as you want to crop on the second dimension
        def func(x):
            return x[:, :, start:]

        return Lambda(func)

    def croper(end):
        # Crops (or slices) a Tensor on a given dimension from start to end
        # example : to crop tensor x[:, :, 5:10]
        # call slice(2, 5, 10) as you want to crop on the second dimension
        def func(x):
            return x[:, :, :end]

        return Lambda(func)
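
    # Shape bookkeeping for the two helpers above (a minimal sketch, assuming
    # tensors of shape (batch, boxes, channels)):
    #   crop(-2)(t)   -> t[:, :, -2:]  e.g. (batch, boxes, 6) -> (batch, boxes, 2)
    #   croper(-2)(t) -> t[:, :, :-2]  e.g. (batch, boxes, 6) -> (batch, boxes, 4)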

    weights_path = 'weights/VGG_ILSVRC_16_layers_fc_reduced.h5'
    X = Input(shape=(img_height, img_width, img_channels))
    Z = Input(shape=(img_height, img_width, img_channels))
    X_geo = Input(shape=(17292, 3))
    Z_geo = Input(shape=(17292, 3))

    ssd1 = ssdmod(X, X_geo, Z_geo, "_1")
    ssd1.load_weights(weights_path, by_name=True)

    ssd2 = ssdmod(Z, Z_geo, X_geo, "_2")
    ssd2.load_weights(weights_path, by_name=True)

    ## FIXME: the layer lookups below hard-code the branch suffixes '__1' and '__2'
    mbox_conf = ssd1.get_layer(name="mbox_conf__1").output
    mbox_loc = ssd1.get_layer(name="mbox_loc__1").output
    mbox_priorbox = ssd1.get_layer(name="mbox_priorbox__1").output

    mbox_conf_2 = ssd2.get_layer(name="mbox_conf__2").output
    mbox_loc_2 = ssd2.get_layer(name="mbox_loc__2").output
    mbox_priorbox_2 = ssd2.get_layer(name="mbox_priorbox__2").output

    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax__1')(mbox_conf)
    mbox_conf_softmax_2 = Activation('softmax',
                                     name='mbox_conf_softmax__2')(mbox_conf_2)

    mbox_loc_tot = Concatenate(axis=2, name='predictions_tot__1')(
        [mbox_conf, mbox_loc, mbox_priorbox, X_geo, Z_geo])
    mbox_loc_tot_2 = Concatenate(axis=2, name='predictions_tot__2')(
        [mbox_conf_2, mbox_loc_2, mbox_priorbox_2, Z_geo, X_geo])
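
    # Note: concatenating the geo inputs along axis=2 here requires that the
    # total number of anchor boxes per image equal the 17292 geo points.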

    mbox_proj = Lambda(projector,
                       name='predictions' + '__1_mbox_proj')(mbox_loc_tot)
    mbox_proj_2 = Lambda(projector,
                         name='predictions' + '__2_mbox_proj')(mbox_loc_tot_2)

    # Split the projected tensors: crop(-2) keeps the last two channels,
    # croper(-2) keeps everything but the last two.
    coord_1 = crop(-2)(mbox_proj)
    coord_2 = crop(-2)(mbox_proj_2)

    mbox_proj = croper(-2)(mbox_proj)
    mbox_proj_2 = croper(-2)(mbox_proj_2)

    mbox_proj_1 = proj_net(mbox_proj, "_1")
    mbox_proj_2 = proj_net(mbox_proj_2, "_2")

    # NOTE: these assignments overwrite the crop(-2) results above. Also,
    # `mbox_proj_2` was just reassigned to a proj_net output, so coord_1 ends
    # up as the cropped projection while coord_2 ends up as a proj_net
    # output; the asymmetry looks unintentional.
    coord_1 = mbox_proj
    coord_2 = mbox_proj_2

    dist = distance_regression(mbox_conf, "_1")
    dist_2 = distance_regression(mbox_conf_2, "_2")

    geo = geo_regression(coord_1, "_1")
    geo_2 = geo_regression(coord_2, "_2")

    empty_1 = Lambda(zeroer)(dist)

    predictions = Concatenate(axis=2, name='predictions_1')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox, empty_1])
    predictions_2 = Concatenate(axis=2, name='predictions_2')(
        [mbox_conf_softmax_2, mbox_loc_2, mbox_priorbox_2, empty_1])

    predictions_proj = Concatenate(axis=2, name='predictions_1_proj')(
        [predictions, mbox_conf_softmax, mbox_proj_1, empty_1])
    predictions_proj_2 = Concatenate(axis=2, name='predictions_2_proj')([
        predictions_2, mbox_conf_softmax_2, mbox_proj_2, mbox_priorbox, empty_1
    ])

    if mode == 'training':

        model = Model(inputs=[X, Z, X_geo, Z_geo],
                      outputs=[
                          predictions, predictions_2, predictions_proj,
                          predictions_proj_2
                      ])

    elif mode == 'inference':

        siamese = Concatenate(axis=2, name='siamese')([mbox_conf, mbox_conf_2])
        predictions = Concatenate(axis=2, name='predictions_inference')(
            [predictions, predictions_2, predictions_proj, predictions_proj_2])

        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)

        model = Model(inputs=[X, Z, X_geo, Z_geo],
                      outputs=[decoded_predictions, siamese])

    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=[X, Z, X_geo, Z_geo], outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        # The predictor conv layers live inside the `ssd1` sub-model, so
        # query their output shapes through it rather than via the
        # out-of-scope tensors from `ssdmod`.
        predictor_sizes = np.array([
            ssd1.get_layer('conv4_3_norm_mbox_conf__1').output_shape[1:3],
            ssd1.get_layer('fc7_mbox_conf__1').output_shape[1:3],
            ssd1.get_layer('conv6_2_mbox_conf__1').output_shape[1:3],
            ssd1.get_layer('conv7_2_mbox_conf__1').output_shape[1:3],
            ssd1.get_layer('conv8_2_mbox_conf__1').output_shape[1:3],
            ssd1.get_layer('conv9_2_mbox_conf__1').output_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
Code example #15
def mobilenet_ssd_300(image_size,
                      n_classes,
                      mode='training',
                      l2_regularization=0.0005,
                      min_scale=None,
                      max_scale=None,
                      scales=None,
                      aspect_ratios_global=None,
                      aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                               [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                               [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                               [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                               [1.0, 2.0, 0.5],
                                               [1.0, 2.0, 0.5]],
                      two_boxes_for_ar1=True,
                      steps=[8, 16, 32, 64, 100, 300],
                      offsets=None,
                      clip_boxes=False,
                      variances=[0.1, 0.1, 0.2, 0.2],
                      coords='centroids',
                      normalize_coords=True,
                      subtract_mean=[123, 117, 104],
                      divide_by_stddev=None,
                      swap_channels=[2, 1, 0],
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      nms_max_output_size=400,
                      return_predictor_sizes=False):

    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received."
            .format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
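
    # With the default `aspect_ratios_per_layer` above and
    # `two_boxes_for_ar1=True`, this yields n_boxes == [4, 6, 6, 6, 4, 4].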

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    # print("Boxes:{}".format(n_boxes))

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)
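
    # E.g. the default swap_channels=[2, 1, 0] reverses the channel order,
    # turning an RGB input into BGR: pixel (R, G, B) -> stacked as (B, G, R).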

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    mobilenet = MobileNet(input_shape=(224, 224, 3),
                          include_top=False,
                          weights='imagenet')

    FeatureExtractor = Model(
        inputs=mobilenet.input,
        outputs=mobilenet.get_layer('conv_pw_5_relu').output)
    mobilenet_conv_pw_5_relu = FeatureExtractor(x1)
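
    # The backbone is MobileNet truncated at `conv_pw_5_relu`; the blocks
    # below rebuild the remaining depthwise-separable stages by hand so their
    # intermediate tensors are available as SSD feature maps. Note that the
    # backbone is instantiated with a fixed 224x224 input shape.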

    conv6dw = SeparableConv2D(256, (3, 3),
                              padding='same',
                              strides=(2, 2),
                              name='conv_dw_6')(mobilenet_conv_pw_5_relu)
    conv6dw = BatchNormalization(momentum=0.99, name='conv_dw_6_bn')(conv6dw)
    conv6dw = ReLU(6., name='conv_dw_6_relu')(conv6dw)

    conv6pw = Conv2D(256, (1, 1), padding='same', name='conv_pw_6')(conv6dw)
    conv6pw = BatchNormalization(momentum=0.99, name='conv_pw_6_bn')(conv6pw)
    conv6pw = ReLU(6., name='conv_pw_6_relu')(conv6pw)

    conv7dw = SeparableConv2D(512, (3, 3), padding='same',
                              name='conv_dw_7')(conv6pw)
    conv7dw = BatchNormalization(momentum=0.99, name='conv_dw_7_bn')(conv7dw)
    conv7dw = ReLU(6., name='conv_dw_7_relu')(conv7dw)

    conv7pw = Conv2D(512, (1, 1), padding='same', name='conv_pw_7')(conv7dw)
    conv7pw = BatchNormalization(momentum=0.99, name='conv_pw_7_bn')(conv7pw)
    conv7pw = ReLU(6., name='conv_pw_7_relu')(conv7pw)

    conv8dw = SeparableConv2D(512, (3, 3), padding='same',
                              name='conv_dw_8')(conv7pw)
    conv8dw = BatchNormalization(momentum=0.99, name='conv_dw_8_bn')(conv8dw)
    conv8dw = ReLU(6., name='conv_dw_8_relu')(conv8dw)

    conv8pw = Conv2D(512, (1, 1), padding='same', name='conv_pw_8')(conv8dw)
    conv8pw = BatchNormalization(momentum=0.99, name='conv_pw_8_bn')(conv8pw)
    conv8pw = ReLU(6., name='conv_pw_8_relu')(conv8pw)

    conv9dw = SeparableConv2D(512, (3, 3), padding='same',
                              name='conv_dw_9')(conv8pw)
    conv9dw = BatchNormalization(momentum=0.99, name='conv_dw_9_bn')(conv9dw)
    conv9dw = ReLU(6., name='conv_dw_9_relu')(conv9dw)

    conv9pw = Conv2D(512, (1, 1), padding='same', name='conv_pw_9')(conv9dw)
    conv9pw = BatchNormalization(momentum=0.99, name='conv_pw_9_bn')(conv9pw)
    conv9pw = ReLU(6., name='conv_pw_9_relu')(conv9pw)

    conv10dw = SeparableConv2D(512, (3, 3), padding='same',
                               name='conv_dw_10')(conv9pw)
    conv10dw = BatchNormalization(momentum=0.99,
                                  name='conv_dw_10_bn')(conv10dw)
    conv10dw = ReLU(6., name='conv_dw_10_relu')(conv10dw)

    conv10pw = Conv2D(512, (1, 1), padding='same', name='conv_pw_10')(conv10dw)
    conv10pw = BatchNormalization(momentum=0.99,
                                  name='conv_pw_10_bn')(conv10pw)
    conv10pw = ReLU(6., name='conv_pw_10_relu')(conv10pw)

    conv11dw = SeparableConv2D(512, (3, 3), padding='same',
                               name='conv_dw_11')(conv10pw)
    conv11dw = BatchNormalization(momentum=0.99,
                                  name='conv_dw_11_bn')(conv11dw)
    conv11dw = ReLU(6., name='conv_dw_11_relu')(conv11dw)

    conv11pw = Conv2D(512, (1, 1), padding='same', name='conv_pw_11')(conv11dw)
    conv11pw = BatchNormalization(momentum=0.99,
                                  name='conv_pw_11_bn')(conv11pw)
    conv11pw = ReLU(6., name='conv_pw_11_relu')(conv11pw)

    conv12dw = SeparableConv2D(512, (3, 3),
                               strides=(2, 2),
                               padding='same',
                               name='conv_dw_12')(conv11pw)
    conv12dw = BatchNormalization(momentum=0.99,
                                  name='conv_dw_12_bn')(conv12dw)
    conv12dw = ReLU(6., name='conv_dw_12_relu')(conv12dw)

    conv12pw = Conv2D(1024, (1, 1), padding='same',
                      name='conv_pw_12')(conv12dw)
    conv12pw = BatchNormalization(momentum=0.99,
                                  name='conv_pw_12_bn')(conv12pw)
    conv12pw = ReLU(6., name='conv_pw_12_relu')(conv12pw)

    conv13dw = SeparableConv2D(1024, (3, 3), padding='same',
                               name='conv_dw_13')(conv12pw)
    conv13dw = BatchNormalization(momentum=0.99,
                                  name='conv_dw_13_bn')(conv13dw)
    conv13dw = ReLU(6., name='conv_dw_13_relu')(conv13dw)

    conv13pw = Conv2D(1024, (1, 1), padding='same',
                      name='conv_pw_13')(conv13dw)
    conv13pw = BatchNormalization(momentum=0.99,
                                  name='conv_pw_13_bn')(conv13pw)
    conv13pw = ReLU(6., name='conv_pw_13_relu')(conv13pw)

    conv14_1 = Conv2D(256, (1, 1), padding='same', name='conv14_1')(conv13pw)
    conv14_1 = BatchNormalization(momentum=0.99, name='bn14_1')(conv14_1)
    conv14_1 = ReLU(6., name='conv14_1_relu')(conv14_1)

    conv14_2dw = SeparableConv2D(512, (3, 3),
                                 strides=(2, 2),
                                 padding='same',
                                 name='conv_dw_14_2')(conv14_1)
    conv14_2dw = BatchNormalization(momentum=0.99,
                                    name='conv_dw_14_2_bn')(conv14_2dw)
    conv14_2dw = ReLU(6., name='conv_dw_14_2_relu')(conv14_2dw)

    conv14_2pw = Conv2D(512, (1, 1), padding='same',
                        name='conv_pw_14_2')(conv14_2dw)
    conv14_2pw = BatchNormalization(momentum=0.99,
                                    name='conv_pw_14_2_bn')(conv14_2pw)
    conv14_2pw = ReLU(6., name='conv_pw_14_2_relu')(conv14_2pw)

    conv15_1 = Conv2D(128, (1, 1), padding='same', name='conv15_1')(conv14_2pw)
    conv15_1 = BatchNormalization(momentum=0.99, name='bn15_1')(conv15_1)
    conv15_1 = ReLU(6., name='conv15_1_relu')(conv15_1)

    conv15_2dw = SeparableConv2D(256, (3, 3), name='conv_dw_15_2')(conv15_1)
    conv15_2dw = BatchNormalization(momentum=0.99,
                                    name='conv_dw_15_2_bn')(conv15_2dw)
    conv15_2dw = ReLU(6., name='conv_dw_15_2_relu')(conv15_2dw)

    conv15_2pw = Conv2D(256, (1, 1), padding='same',
                        name='conv_pw_15_2')(conv15_2dw)
    conv15_2pw = BatchNormalization(momentum=0.99,
                                    name='conv_pw_15_2_bn')(conv15_2pw)
    conv15_2pw = ReLU(6., name='conv_pw_15_2_relu')(conv15_2pw)

    conv16_1 = Conv2D(128, (1, 1), padding='same', name='conv16_1')(conv15_2pw)
    conv16_1 = BatchNormalization(momentum=0.99, name='bn16_1')(conv16_1)
    conv16_1 = ReLU(6., name='conv16_1_relu')(conv16_1)

    conv16_2 = SeparableConv2D(256, (3, 3), name='conv16_2_')(conv16_1)
    conv16_2 = BatchNormalization(momentum=0.99, name='bn16_2')(conv16_2)
    conv16_2 = ReLU(6., name='conv16_2_relu')(conv16_2)

    conv5_mbox_loc = Conv2D(n_boxes[0] * 4, (1, 1),
                            padding='same',
                            name='conv5_mbox_loc')(mobilenet_conv_pw_5_relu)
    conv11_mbox_loc = Conv2D(n_boxes[1] * 4, (1, 1),
                             padding='same',
                             name='conv11_mbox_loc_')(conv11pw)
    conv13_mbox_loc = Conv2D(n_boxes[2] * 4, (1, 1),
                             padding='same',
                             name='conv13_mbox_loc_')(conv13pw)
    conv14_mbox_loc = Conv2D(n_boxes[3] * 4, (1, 1),
                             padding='same',
                             name='conv14_mbox_loc_')(conv14_2pw)
    conv15_mbox_loc = Conv2D(n_boxes[4] * 4, (1, 1),
                             padding='same',
                             name='conv15_mbox_loc_')(conv15_2pw)
    conv16_mbox_loc = Conv2D(n_boxes[5] * 4, (1, 1),
                             padding='same',
                             name='conv16_mbox_loc_')(conv16_2)

    conv5_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv5_mbox_loc_reshape')(conv5_mbox_loc)
    conv11_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv11_mbox_loc_reshape')(conv11_mbox_loc)
    conv13_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv13_mbox_loc_reshape')(conv13_mbox_loc)
    conv14_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv14_mbox_loc_reshape')(conv14_mbox_loc)
    conv15_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv15_mbox_loc_reshape')(conv15_mbox_loc)
    conv16_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv16_mbox_loc_reshape')(conv16_mbox_loc)

    conv5_mbox_conf = Conv2D(n_boxes[0] * n_classes, (1, 1),
                             padding='same',
                             name='conv5_mbox_conf')(mobilenet_conv_pw_5_relu)
    conv11_mbox_conf = Conv2D(n_boxes[1] * n_classes, (1, 1),
                              padding='same',
                              name='conv11_mbox_conf_')(conv11pw)
    conv13_mbox_conf = Conv2D(n_boxes[2] * n_classes, (1, 1),
                              padding='same',
                              name='conv13_mbox_conf_')(conv13pw)
    conv14_mbox_conf = Conv2D(n_boxes[3] * n_classes, (1, 1),
                              padding='same',
                              name='conv14_mbox_conf_')(conv14_2pw)
    conv15_mbox_conf = Conv2D(n_boxes[4] * n_classes, (1, 1),
                              padding='same',
                              name='conv15_mbox_conf_')(conv15_2pw)
    conv16_mbox_conf = Conv2D(n_boxes[5] * n_classes, (1, 1),
                              padding='same',
                              name='conv16_mbox_conf_')(conv16_2)

    conv5_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv5_mbox_conf_reshape')(conv5_mbox_conf)
    conv11_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv11_mbox_conf_reshape')(conv11_mbox_conf)
    conv13_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv13_mbox_conf_reshape')(conv13_mbox_conf)
    conv14_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv14_mbox_conf_reshape')(conv14_mbox_conf)
    conv15_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv15_mbox_conf_reshape')(conv15_mbox_conf)
    conv16_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv16_mbox_conf_reshape')(conv16_mbox_conf)

    conv5_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv5_mbox_priorbox')(mobilenet_conv_pw_5_relu)
    conv11_mbox_priorbox = AnchorBoxes(img_height,
                                       img_width,
                                       this_scale=scales[1],
                                       next_scale=scales[2],
                                       aspect_ratios=aspect_ratios[1],
                                       two_boxes_for_ar1=two_boxes_for_ar1,
                                       this_steps=steps[1],
                                       this_offsets=offsets[1],
                                       clip_boxes=clip_boxes,
                                       variances=variances,
                                       coords=coords,
                                       normalize_coords=normalize_coords,
                                       name='conv11_mbox_priorbox')(conv11pw)
    conv13_mbox_priorbox = AnchorBoxes(img_height,
                                       img_width,
                                       this_scale=scales[2],
                                       next_scale=scales[3],
                                       aspect_ratios=aspect_ratios[2],
                                       two_boxes_for_ar1=two_boxes_for_ar1,
                                       this_steps=steps[2],
                                       this_offsets=offsets[2],
                                       clip_boxes=clip_boxes,
                                       variances=variances,
                                       coords=coords,
                                       normalize_coords=normalize_coords,
                                       name='conv13_mbox_priorbox')(conv13pw)
    conv14_mbox_priorbox = AnchorBoxes(img_height,
                                       img_width,
                                       this_scale=scales[3],
                                       next_scale=scales[4],
                                       aspect_ratios=aspect_ratios[3],
                                       two_boxes_for_ar1=two_boxes_for_ar1,
                                       this_steps=steps[3],
                                       this_offsets=offsets[3],
                                       clip_boxes=clip_boxes,
                                       variances=variances,
                                       coords=coords,
                                       normalize_coords=normalize_coords,
                                       name='conv14_mbox_priorbox')(conv14_2pw)
    conv15_mbox_priorbox = AnchorBoxes(img_height,
                                       img_width,
                                       this_scale=scales[4],
                                       next_scale=scales[5],
                                       aspect_ratios=aspect_ratios[4],
                                       two_boxes_for_ar1=two_boxes_for_ar1,
                                       this_steps=steps[4],
                                       this_offsets=offsets[4],
                                       clip_boxes=clip_boxes,
                                       variances=variances,
                                       coords=coords,
                                       normalize_coords=normalize_coords,
                                       name='conv15_mbox_priorbox')(conv15_2pw)
    conv16_mbox_priorbox = AnchorBoxes(img_height,
                                       img_width,
                                       this_scale=scales[5],
                                       next_scale=scales[6],
                                       aspect_ratios=aspect_ratios[5],
                                       two_boxes_for_ar1=two_boxes_for_ar1,
                                       this_steps=steps[5],
                                       this_offsets=offsets[5],
                                       clip_boxes=clip_boxes,
                                       variances=variances,
                                       coords=coords,
                                       normalize_coords=normalize_coords,
                                       name='conv16_mbox_priorbox')(conv16_2)

    conv5_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv5_mbox_priorbox_reshape')(conv5_mbox_priorbox)
    conv11_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv11_mbox_priorbox_reshape')(conv11_mbox_priorbox)
    conv13_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv13_mbox_priorbox_reshape')(conv13_mbox_priorbox)
    conv14_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv14_mbox_priorbox_reshape')(conv14_mbox_priorbox)
    conv15_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv15_mbox_priorbox_reshape')(conv15_mbox_priorbox)
    conv16_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv16_mbox_priorbox_reshape')(conv16_mbox_priorbox)

    mbox_loc = concatenate([
        conv5_mbox_loc_reshape, conv11_mbox_loc_reshape,
        conv13_mbox_loc_reshape, conv14_mbox_loc_reshape,
        conv15_mbox_loc_reshape, conv16_mbox_loc_reshape
    ],
                           axis=1,
                           name='mbox_loc')

    mbox_conf = concatenate([
        conv5_mbox_conf_reshape, conv11_mbox_conf_reshape,
        conv13_mbox_conf_reshape, conv14_mbox_conf_reshape,
        conv15_mbox_conf_reshape, conv16_mbox_conf_reshape
    ],
                            axis=1,
                            name='mbox_conf')

    mbox_priorbox = concatenate([
        conv5_mbox_priorbox_reshape, conv11_mbox_priorbox_reshape,
        conv13_mbox_priorbox_reshape, conv14_mbox_priorbox_reshape,
        conv15_mbox_priorbox_reshape, conv16_mbox_priorbox_reshape
    ],
                                axis=1,
                                name='mbox_priorbox')

    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    predictions = concatenate([mbox_conf_softmax, mbox_loc, mbox_priorbox],
                              axis=2,
                              name='predictions')
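
    # Per-box layout of `predictions`: [class confidences (n_classes),
    # box offsets (4), anchor geometry + variances (8)], i.e. shape
    # (batch, n_boxes_total, n_classes + 12).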

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv5_mbox_conf._keras_shape[1:3],
            conv11_mbox_conf._keras_shape[1:3],
            conv13_mbox_conf._keras_shape[1:3],
            conv14_mbox_conf._keras_shape[1:3],
            conv15_mbox_conf._keras_shape[1:3],
            conv16_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
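
# Example usage (a sketch; assumes an input size matching the fixed 224x224
# MobileNet backbone above and anchor scales derived from min/max):
# model = mobilenet_ssd_300(image_size=(224, 224, 3),
#                           n_classes=20,
#                           mode='training',
#                           min_scale=0.2,
#                           max_scale=0.9)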
Code example #16
# Configure the decode-detections layer based on the model mode
if model_mode == "inference":
    decode_layer = DecodeDetections(
        img_height=img_height,
        img_width=img_width,
        confidence_thresh=conf_thresh,
        iou_threshold=0.45,
        top_k=200,
        nms_max_output_size=400,
    )
elif model_mode == "inference_fast":
    decode_layer = DecodeDetectionsFast(
        img_height=img_height,
        img_width=img_width,
        confidence_thresh=conf_thresh,
        iou_threshold=0.45,
        top_k=200,
        nms_max_output_size=400,
    )
else:
    raise ValueError(
        "`model_mode` must be 'inference' or 'inference_fast', but received '{}'."
        .format(model_mode))

# Finally, load the model. `custom_objects` must map the class name recorded
# in the saved model to the matching layer *class* (the layer's parameters
# are restored from the saved config), so register the class of the
# configured decode layer under its own name.
model = load_model(
    weights_path,
    custom_objects={
        "AnchorBoxes": AnchorBoxes,
        "L2Normalization": L2Normalization,
        type(decode_layer).__name__: type(decode_layer),
        "compute_loss": ssd_loss.compute_loss,
    },
)
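
# Note: `ssd_loss` must already be defined before `load_model` is called,
# e.g. (a sketch, assuming the SSDLoss class from the same codebase):
# from keras_loss_function.keras_ssd_loss import SSDLoss
# ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)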
Code example #17
File: keras_ssd300.py  Project: CVBase-Bupt/SSD-keras
def ssd_300(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 100, 300],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):
    '''
    Build the SSD300 model with Keras; it is based on a modified VGG-16
    (a reduced atrous VGG-16).

    If you are training the network, the arguments passed here must match
    the ones used to set up `SSDBoxEncoder`. If you are loading trained
    weights, the arguments passed here must match the ones used when the
    weights were produced. Some arguments are explained in more detail in
    the documentation of the `SSDBoxEncoder` class.
    Note: Requires Keras v2.0 or later. Currently works only with the
    TensorFlow backend (v1.0 or later).

    Arguments:
        image_size (tuple): The input image size `(height, width, channels)`.
        n_classes (int): The number of object classes, not counting the
            background class, e.g. 20 for Pascal VOC, 80 for MS COCO.
        mode (str, optional): (1) 'training': the model outputs the raw
            prediction tensor. (2) 'inference' and (3) 'inference_fast': the
            raw predictions are decoded into absolute coordinates and
            filtered via confidence thresholding, non-maximum suppression
            and top-k. The difference between the latter two modes is that
            'inference' strictly follows the original Caffe implementation,
            while 'inference_fast' uses a faster decoding procedure.
        l2_regularization (float, optional): The L2 regularization rate,
            applied to all convolutional layers. Set to zero to deactivate
            L2 regularization.
        min_scale (float, optional), max_scale (float, optional),
            scales (list, optional)
        aspect_ratios_per_layer (list, optional): The aspect ratios for each
            predictor layer.
        two_boxes_for_ar1 (bool, optional): Whether to generate anchors at
            two scales when the aspect ratio is 1.
        steps (list, optional): How far apart two adjacent cells of a
            predictor layer are in the original image, i.e. the downsampling
            factor.
        offsets (list, optional): Fractional offsets of the anchor centers.
        clip_boxes (bool, optional): If `True`, clip default boxes that lie
            outside the image boundaries.
        variances (list, optional): Each box offset coordinate is divided by
            its corresponding variance value.
        coords (str, optional): 'centroids' (cx, cy, w, h),
            'minmax' (xmin, xmax, ymin, ymax) or
            'corners' (xmin, ymin, xmax, ymax).
        normalize_coords (bool, optional): Coordinate normalization: the
            model predicts box coordinates within [0, 1] (relative) rather
            than absolute coordinates.
        subtract_mean (array-like, optional): The mean to subtract.
        divide_by_stddev (array-like, optional): The standard deviation to
            divide by.
        swap_channels (list, optional): `False`, or a list of integers
            representing the desired order into which the input image
            channels should be swapped.
        confidence_thresh (float, optional): A float in [0, 1); the
            confidence threshold applied before NMS.
        iou_threshold (float, optional): A float in [0, 1]. Boxes whose
            Jaccard similarity with a locally maximal-confidence box is
            greater than `iou_threshold` are removed from the set of
            predictions for the given class.
        top_k (int, optional): The number of highest-scoring predictions
            kept per batch item after the non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximum number of
            predictions remaining after the NMS stage.
        return_predictor_sizes (bool, optional):
            If `True`, this function returns not only the model, but also a
            list with the spatial dimensions of the feature maps used for
            prediction. This is not strictly necessary, since their sizes
            can easily be obtained through the Keras API, but getting them
            this way is convenient and less error-prone. They are only
            relevant for training (SSDBoxEncoder needs to know the spatial
            dimensions of the predictor layers); they are not needed for
            inference.
    Returns:
        model: The SSD300 network.
        predictor_sizes (optional):
            A Numpy array holding `(height, width)` for each predictor
            feature map. During training, the data generator needs this to
            convert the ground truth labels into tensors with the same
            structure as the model's output tensor, for the loss
            computation.
    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    n_predictor_layers = 6  # The number of predictor conv layers is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_per_layer is None:
        raise ValueError("`aspect_ratios_per_layer` cannot be None.")
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute them from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
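
        # E.g. min_scale=0.2, max_scale=0.9 gives the 7 values
        # [0.2, 0.317, 0.433, 0.55, 0.667, 0.783, 0.9]: one scale per
        # predictor layer plus one extra, used as the "next scale" of the
        # last layer.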

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received."
            .format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the default (anchor) box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. Only the anchor box
    # (priorbox) layers need these.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer

    # Compute the number of boxes predicted per cell for each predictor
    # layer, i.e. how many channels the predictor layers need.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    conv1_1 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_1')(x1)
    conv1_2 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_2')(conv1_1)
    pool1 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool1')(conv1_2)

    conv2_1 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_1')(pool1)
    conv2_2 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_2')(conv2_1)
    pool2 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool2')(conv2_2)

    conv3_1 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_1')(pool2)
    conv3_2 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_2')(conv3_1)
    conv3_3 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_3')(conv3_2)
    pool3 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool3')(conv3_3)

    conv4_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_1')(pool3)
    conv4_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_2')(conv4_1)
    conv4_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_3')(conv4_2)
    pool4 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool4')(conv4_3)

    conv5_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_1')(pool4)
    conv5_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_2')(conv5_1)
    conv5_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_3')(conv5_2)
    pool5 = MaxPooling2D(pool_size=(3, 3),
                         strides=(1, 1),
                         padding='same',
                         name='pool5')(conv5_3)

    fc6 = Conv2D(1024, (3, 3),
                 dilation_rate=(6, 6),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc6')(pool5)
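
    # fc6 uses an atrous (dilated) 3x3 kernel with rate 6, giving an
    # effective receptive field of 3 + (3 - 1) * (6 - 1) = 13 pixels per
    # side; this is how the reduced VGG-16 replaces the fully connected fc6.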

    fc7 = Conv2D(1024, (1, 1),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc7')(fc6)

    conv6_1 = Conv2D(256, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_1')(fc7)
    conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv6_padding')(conv6_1)
    conv6_2 = Conv2D(512, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_2')(conv6_1)

    conv7_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_1')(conv6_2)
    conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv7_padding')(conv7_1)
    conv7_2 = Conv2D(256, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_2')(conv7_1)

    conv8_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_1')(conv7_2)
    conv8_2 = Conv2D(256, (3, 3),
                     strides=(1, 1),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_2')(conv8_1)

    conv9_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_1')(conv8_2)
    conv9_2 = Conv2D(256, (3, 3),
                     strides=(1, 1),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_2')(conv9_1)
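
    # conv8_2 and conv9_2 use 'valid' 3x3 convolutions with stride 1, so each
    # trims 2 pixels per side; in SSD300 these maps go 5x5 -> 3x3 -> 1x1.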

    # Feed conv4_3 into an L2 normalization layer. As in the SSD paper,
    # conv4_3 activations are larger in scale than the other feature maps,
    # so they are L2-normalized and rescaled by a learnable gamma
    # (initialized to 20).
    conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)

    ### Build the convolutional predictor layers on top of the base network

    # Each box predicts `n_classes` confidence values, so the confidence
    # predictors have depth `n_boxes * n_classes`.
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    conv4_3_norm_mbox_conf = Conv2D(
        n_boxes[0] * n_classes, (3, 3),
        padding='same',
        kernel_initializer='he_normal',
        kernel_regularizer=l2(l2_reg),
        name='conv4_3_norm_mbox_conf')(conv4_3_norm)
    fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='fc7_mbox_conf')(fc7)
    conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv6_2_mbox_conf')(conv6_2)
    conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv7_2_mbox_conf')(conv7_2)
    conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv8_2_mbox_conf')(conv8_2)
    conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv9_2_mbox_conf')(conv9_2)

    # Each box predicts 4 coordinate offsets, so the loc layers have depth n_boxes * 4
    # Localization output shape: `(batch, height, width, n_boxes * 4)`
    conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv4_3_norm_mbox_loc')(conv4_3_norm)
    fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                          padding='same',
                          kernel_initializer='he_normal',
                          kernel_regularizer=l2(l2_reg),
                          name='fc7_mbox_loc')(fc7)
    conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv6_2_mbox_loc')(conv6_2)
    conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv7_2_mbox_loc')(conv7_2)
    conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv8_2_mbox_loc')(conv8_2)
    conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv9_2_mbox_loc')(conv9_2)

    ### Generate the prior boxes (default boxes)
    # Prior box output shape: `(batch, height, width, n_boxes, 8)`
    conv4_3_norm_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
    fc7_mbox_priorbox = AnchorBoxes(img_height,
                                    img_width,
                                    this_scale=scales[1],
                                    next_scale=scales[2],
                                    aspect_ratios=aspect_ratios[1],
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    this_steps=steps[1],
                                    this_offsets=offsets[1],
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    coords=coords,
                                    normalize_coords=normalize_coords,
                                    name='fc7_mbox_priorbox')(fc7_mbox_loc)
    conv6_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    conv7_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    conv8_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    conv9_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)

    ### Reshape

    # Reshape the class predictions into 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the class scores isolated in the last axis so softmax can be applied to them
    conv4_3_norm_mbox_conf_reshape = Reshape(
        (-1, n_classes),
        name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
    fc7_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
    conv6_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)

    # Reshape the box offsets into 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four coordinates isolated in the last axis to compute the localization loss
    conv4_3_norm_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
    fc7_mbox_loc_reshape = Reshape((-1, 4),
                                   name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
    conv6_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)

    # Reshape the default boxes into 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv4_3_norm_mbox_priorbox_reshape = Reshape(
        (-1, 8),
        name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
    fc7_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
    conv6_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)

    ### Concatenate the predictions from the different layers
    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all
    # predictor layers, so we concatenate along axis 1, the number of boxes per layer

    # Class predictions mbox_conf: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv4_3_norm_mbox_conf_reshape, fc7_mbox_conf_reshape,
        conv6_2_mbox_conf_reshape, conv7_2_mbox_conf_reshape,
        conv8_2_mbox_conf_reshape, conv9_2_mbox_conf_reshape
    ])

    # Box offsets mbox_loc: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv4_3_norm_mbox_loc_reshape, fc7_mbox_loc_reshape,
        conv6_2_mbox_loc_reshape, conv7_2_mbox_loc_reshape,
        conv8_2_mbox_loc_reshape, conv9_2_mbox_loc_reshape
    ])

    # Default boxes mbox_priorbox: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv4_3_norm_mbox_priorbox_reshape, fc7_mbox_priorbox_reshape,
        conv6_2_mbox_priorbox_reshape, conv7_2_mbox_priorbox_reshape,
        conv8_2_mbox_priorbox_reshape, conv9_2_mbox_priorbox_reshape
    ])
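
    # Worked example (hedged): with the canonical SSD300 grids of 38x38, 19x19,
    # 10x10, 5x5, 3x3 and 1x1 and n_boxes = [4, 6, 6, 6, 4, 4], this gives
    # n_boxes_total = 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732;
    # the actual grid sizes here depend on this network's own feature maps.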

    # These predictions will be fed into the loss function during training,
    # but the class predictions first go through a softmax activation layer
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class, offset, and default-box predictions: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)

    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)

    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)

    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv4_3_norm_mbox_conf._keras_shape[1:3],
            fc7_mbox_conf._keras_shape[1:3],
            conv6_2_mbox_conf._keras_shape[1:3],
            conv7_2_mbox_conf._keras_shape[1:3],
            conv8_2_mbox_conf._keras_shape[1:3],
            conv9_2_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
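
A minimal usage sketch for dronenet (hypothetical; the 300x300 input, 20 classes
and the scale values are illustrative assumptions, not from the original source,
and the call assumes the function's validation constants agree with the six
predictor layers actually built above, since the body indexes scales[0] through
scales[6] and therefore needs seven scaling factors):

model = dronenet(image_size=(300, 300, 3),
                 n_classes=20,
                 mode='training',
                 scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05])
model.summary()
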
Code example #18
def FCOS(
    image_size,
    n_classes,
    mode='training',
    l2_regularization=0.0005,
    clip_boxes_boundary=True,
    subtract_mean=[123, 117, 104],
    divide_by_stddev=None,
    swap_channels=[2, 1, 0],
    anchor_stride_list=[8, 16, 32, 64, 128],
    confidence_thresh=0.01,
    iou_threshold=0.45,
    top_k=200,
    nms_max_output_size=400,
    # batch_size = 16
):
    # FCOS NET

    # n_classes += 1 # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[
        1], image_size[2]

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    def linspace(start, end, num):
        return np.array(np.linspace(start, end, num), np.float32)

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if not (subtract_mean is None):
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if not (divide_by_stddev is None):
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    outputs = ResnetBuilder.build_resnet_50(
        x1, (img_channels, img_height, img_width), n_classes)
    # batch_size = tf.shape(x1)[0]
    # model.compile(loss="categorical_crossentropy", optimizer="sgd")
    C3, C4, C5 = outputs[1:]

    # upsample C5 to get P5 from the FPN paper
    P5 = Conv2D(256,
                kernel_size=1,
                strides=1,
                padding='same',
                name='C5_reduced',
                kernel_initializer='he_normal')(C5)
    # Replace plain upsampling with a deconvolution (Conv2DTranspose), following DSSD
    P5_upsampled = Conv2DTranspose(256, (2, 2),
                                   strides=(2, 2),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   name='P5_upsampled')(P5)
    # P5_upsampled = UpsampleLike(name='P5_upsampled')([P5, C4])
    P5 = keras.layers.Conv2D(256,
                             kernel_size=3,
                             strides=1,
                             padding='same',
                             name='P5',
                             kernel_initializer='he_normal')(P5)

    # add P5 elementwise to C4
    P4 = keras.layers.Conv2D(256,
                             kernel_size=1,
                             strides=1,
                             padding='same',
                             name='C4_reduced',
                             kernel_initializer='he_normal')(C4)
    P4 = keras.layers.Add(name='P4_merged')([P5_upsampled, P4])
    # P4_upsampled = UpsampleLike(name='P4_upsampled')([P4, C3])
    P4_upsampled = Conv2DTranspose(256, (2, 2),
                                   strides=(2, 2),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   name='P4_upsampled')(P4)
    P4 = keras.layers.Conv2D(256,
                             kernel_size=3,
                             strides=1,
                             padding='same',
                             name='P4',
                             kernel_initializer='he_normal')(P4)

    # add P4 elementwise to C3
    P3 = keras.layers.Conv2D(256,
                             kernel_size=1,
                             strides=1,
                             padding='same',
                             name='C3_reduced',
                             kernel_initializer='he_normal')(C3)
    P3 = keras.layers.Add(name='P3_merged')([P4_upsampled, P3])
    P3 = keras.layers.Conv2D(256,
                             kernel_size=3,
                             strides=1,
                             padding='same',
                             name='P3',
                             kernel_initializer='he_normal')(P3)

    # "P6 is obtained via a 3x3 stride-2 conv on C5"
    P6 = keras.layers.Conv2D(256,
                             kernel_size=3,
                             strides=2,
                             padding='same',
                             name='P6',
                             kernel_initializer='he_normal')(C5)

    # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
    P7 = keras.layers.Activation('relu', name='C6_relu')(P6)
    P7 = keras.layers.Conv2D(256,
                             kernel_size=3,
                             strides=2,
                             padding='same',
                             name='P7',
                             kernel_initializer='he_normal')(P7)
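
    # Note (hedged): assuming a standard ResNet-50 backbone where C3/C4/C5 have
    # strides 8/16/32, P3-P7 end up with strides 8, 16, 32, 64 and 128, which
    # matches the default anchor_stride_list=[8, 16, 32, 64, 128] above.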

    ### Build the prediction subnets on top of the pyramid features
    # classification branch
    options = {
        'kernel_size': 3,
        'strides': 1,
        'padding': 'same',
    }
    subnet_class_scores = []
    subnet_class_prob = []
    rpn_cnt_scores = []
    # subnet_loc = []
    rpn_bbox = []
    output = [P3, P4, P5, P6, P7]
    for index, feature_map in enumerate(output):
        subnet_tmp = feature_map
        for i in range(4):
            subnet_tmp = keras.layers.Conv2D(
                filters=256,
                activation='relu',
                name='subnet_P{}_tmp{}'.format(index, i),
                kernel_initializer=cfgs.SUBNETS_WEIGHTS_INITIALIZER,
                bias_initializer=cfgs.SUBNETS_BIAS_INITIALIZER,
                **options)(subnet_tmp)
        subnet_tmp1 = keras.layers.Conv2D(
            filters=n_classes,
            kernel_initializer=cfgs.FINAL_CONV_WEIGHTS_INITIALIZER,
            bias_initializer=cfgs.FINAL_CONV_BIAS_INITIALIZER,
            name='pyramid_class_P{}'.format(index),
            **options)(subnet_tmp)
        # rpn_box_scores = Lambda(lambda inputs: tf.reshape(inputs,
        #                                      [batch_size, -1, n_classes], name='pyramid_class_P{}_reshape'.format(index)))(subnet_tmp1)
        rpn_box_scores = Reshape(
            (-1, n_classes),
            name='pyramid_class_P{}_reshape'.format(index))(subnet_tmp1)

        # rpn_box_scores = tf.reshape(subnet_tmp1,[batch_size, -1, n_classes], name='pyramid_class_P{}_reshape'.format(index))
        # rpn_box_probs2 = tf.nn.sigmoid(rpn_box_scores, name='pyramid_class_P{}_sigmoid'.format(index))
        # rpn_box_probs = Lambda(lambda inputs:tf.nn.sigmoid(inputs, name='pyramid_class_P{}_sigmoid'.format(index))(rpn_box_scores))
        rpn_box_probs = Activation(
            'sigmoid',
            name='pyramid_class_P{}_sigmoid'.format(index))(rpn_box_scores)
        subnet_class_scores.append(rpn_box_scores)
        subnet_class_prob.append(rpn_box_probs)
        ## center-ness branch
        subnet_tmp1 = keras.layers.Conv2D(
            filters=1,
            kernel_initializer=cfgs.SUBNETS_WEIGHTS_INITIALIZER,
            bias_initializer=cfgs.SUBNETS_BIAS_INITIALIZER,
            name='pyramid_center_P{}'.format(index),
            **options)(subnet_tmp)
        # subnet_tmp = tf.reshape(subnet_tmp1, [batch_size, -1],
        #                             name='pyramid_center_P{}_reshape'.format(index))
        # subnet_tmp = Lambda(lambda inputs: tf.reshape(inputs,[batch_size, -1], name='pyramid_center_P{}_reshape'.format(index)))(subnet_tmp1)
        subnet_tmp = Reshape(
            (-1, 1),
            name='pyramid_center_P{}_reshape'.format(index))(subnet_tmp1)

        rpn_cnt_scores.append(subnet_tmp)
        # localization branch
        subnet_loc_tmp = feature_map
        for i in range(4):
            subnet_loc_tmp = keras.layers.Conv2D(
                filters=256,
                activation='relu',
                name='subnet_loc_P{}_tmp{}'.format(index, i),
                kernel_initializer=cfgs.SUBNETS_WEIGHTS_INITIALIZER,
                bias_initializer=cfgs.SUBNETS_BIAS_INITIALIZER,
                **options)(subnet_loc_tmp)
        subnet_loc_tmp = keras.layers.Conv2D(
            filters=4,
            kernel_initializer=cfgs.SUBNETS_WEIGHTS_INITIALIZER,
            bias_initializer=cfgs.SUBNETS_BIAS_INITIALIZER,
            name='pyramid_loc_P{}'.format(index),
            **options)(subnet_loc_tmp)

        # rpn_box_offset = Lambda(lambda inputs:  keras.backend.exp(inputs) * anchor_stride_list[index])(subnet_loc_tmp)

        rpn_box_offset = keras_exp_stride(
            stride=anchor_stride_list[index])(subnet_loc_tmp)
        rpn_box_offset = get_rpn_bbox(
            stride=anchor_stride_list[index])(rpn_box_offset)
        rpn_box_offset = Reshape(
            (-1, 4), name='rpn_box_P{}_reshape'.format(index))(rpn_box_offset)
        # rpn_box_offset = Lambda(lambda inputs: tf.reshape(inputs,[batch_size, -1,4], name='rpn_box_P{}_reshape'.format(index)))(rpn_box_offset)
        rpn_bbox.append(rpn_box_offset)

    # subnet_class_scores = tf.concat(subnet_class_scores, axis=1)
    # subnet_class_scores = Lambda(lambda inputs:  tf.concat(inputs, axis=1))(subnet_class_scores)
    subnet_class_scores = Concatenate(axis=1)(subnet_class_scores)
    subnet_class_prob = Concatenate(axis=1)(subnet_class_prob)
    # subnet_class_prob = Lambda(lambda inputs: tf.concat(inputs, axis=1))(subnet_class_prob)
    # subenet_center = Concatenate(axis=1)(subenet_center)
    rpn_cnt_scores = Concatenate(axis=1)(rpn_cnt_scores)
    rpn_cnt_prob = Activation('sigmoid',
                              name='rpn_cnt_prob_sigmoid')(rpn_cnt_scores)
    # rpn_cnt_prob = Lambda(lambda inputs:tf.expand_dims(inputs, axis=2))(rpn_cnt_prob)
    # Broadcast the center-ness probability to the output of every class
    rpn_cnt_prob = keras_boardcoast(n_classes=n_classes)(rpn_cnt_prob)
    # rpn_cnt_scores = Lambda(lambda inputs: tf.expand_dims(inputs, axis=2))(rpn_cnt_scores)
    rpn_bbox = Concatenate(axis=1)(rpn_bbox)

    # rpn_bbox = Lambda(lambda inputs: tf.concat(inputs, axis=1))(rpn_bbox)

    # predictions: (batch, n_locations_total, class scores + class probs + center-ness score + per-class center-ness probs + (xmin, ymin, xmax, ymax))
    predictions = Concatenate(axis=2, name='predictions')([
        subnet_class_scores, subnet_class_prob, rpn_cnt_scores, rpn_cnt_prob,
        rpn_bbox
    ])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            clip_boxes_boundary=clip_boxes_boundary,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            clip_boxes_boundary=clip_boxes_boundary,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions_fast')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    return model
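
The custom layers keras_exp_stride, get_rpn_bbox and keras_boardcoast used in
FCOS are not defined in this listing. Based on the commented-out Lambda in the
localization branch above, a plausible stand-in for keras_exp_stride is
sketched here; it is an assumption, not the original implementation:

from keras import backend as K
from keras.layers import Lambda

def keras_exp_stride(stride):
    # Exponentiate the raw (l, t, r, b) regression outputs and scale them by
    # the feature map's stride, so distances are predicted in
    # stride-normalized units (inferred from the commented-out Lambda above).
    return Lambda(lambda t: K.exp(t) * stride)
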
Code example #19
def ssd_300(image_size,
            n_classes,
            input_tensor=None,
            mode='training',
            groups=3,
            scale_factor=1.0,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 100, 300],
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):
    '''
    Build a Keras model with SSD300 architecture, see references.

    The base network is a reduced atrous VGG-16, extended by the SSD architecture,
    as described in the paper.

    Most of the arguments that this function takes are only needed for the anchor
    box layers. In case you're training the network, the parameters passed here must
    be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
    trained weights, the parameters passed here must be the same as the ones used
    to produce the trained weights.

    Some of these arguments are explained in more detail in the documentation of the
    `SSDBoxEncoder` class.

    Note: Requires Keras v2.0 or later. Currently works only with the
    TensorFlow backend (v1.0 or later).

    Arguments:
        image_size (tuple): The input image size in the format `(height, width, channels)`.
        input_tensor: Tensor with shape (batch, height, width, channels)
        n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
        mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
            the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
            the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
            'inference' follows the exact procedure of the original Caffe implementation, while
            'inference_fast' uses a faster prediction decoding procedure.
        min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images.
        max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images. All scaling factors between the smallest and the
            largest will be linearly interpolated. Note that the second to last of the linearly interpolated
            scaling factors will actually be the scaling factor for the last predictor layer, while the last
            scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
            if `two_boxes_for_ar1` is `True`.
        scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
            This list must be one element longer than the number of predictor layers. The first `k` elements are the
            scaling factors for the `k` predictor layers, while the last element is used for the second box
            for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
            last scaling factor must be passed either way, even if it is not being used. If a list is passed,
            this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
        aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
            generated. This list is valid for all prediction layers.
        aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
            This allows you to set the aspect ratios for each predictor layer individually, which is the case for the
            original SSD300 implementation. If a list is passed, it overrides `aspect_ratios_global`.
        two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
            If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
            using the scaling factor for the respective layer, the second one will be generated using the
            geometric mean of said scaling factor and the next larger scaling factor.
        steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
            pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
            the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
            If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
            If no steps are provided, then they will be computed such that the anchor box center points will form an
            equidistant grid within the image dimensions.
        offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either floats or tuples of two floats. These numbers represent for each predictor layer how many
            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
            as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
            of the step size specified in the `steps` argument. If the list contains floats, then that value will
            be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
            `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
        clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
        variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
            its respective variance value.
        coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
            of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
            and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
            i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
        subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
            of any shape that is broadcast-compatible with the image shape. The elements of this array will be
            subtracted from the image pixel intensity values. For example, pass a list of three integers
            to perform per-channel mean normalization for color images.
        divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
            floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
            intensity values will be divided by the elements of this array. For example, pass a list
            of three integers to perform per-channel standard deviation normalization for color images.
        swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
            image channels should be swapped.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box's confidence score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
        return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
            a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
            you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
            to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the
            spatial dimensions of the predictor layers); for inference you don't need them.

    Returns:
        model: The Keras SSD300 model.
        predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
            of the output tensor shape for each convolutional predictor layer. During
            training, the generator function needs this in order to transform
            the ground truth labels into tensors of identical structure as the
            output tensors of the model, which is in turn needed for the cost
            function.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''
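
    # Worked example (hedged, illustrative values): min_scale=0.2, max_scale=0.9
    # with six predictor layers gives scales = np.linspace(0.2, 0.9, 7)
    # ~= [0.2, 0.317, 0.433, 0.55, 0.667, 0.783, 0.9]; the first six scale the six
    # predictor layers and the seventh is only used for the second aspect-ratio-1
    # box of the last layer when `two_boxes_for_ar1` is True.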

    n_predictor_layers = 6  # The number of predictor conv layers in the network is 6 for the original SSD300.
    n_classes += 1  # Account for the background class.
    img_height, img_width, img_channels = image_size[0], image_size[
        1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be pased, but {} values were received.".
            format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) +
                               1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
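
    # Worked example (hedged): with the default `aspect_ratios_per_layer` and
    # `two_boxes_for_ar1=True`, each 3-ratio layer gets 3 + 1 = 4 boxes per cell
    # and each 5-ratio layer gets 5 + 1 = 6, i.e. n_boxes = [4, 6, 6, 6, 4, 4].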

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    #############################################################################
    # Functions for Shufflenetv1 architeture
    #############################################################################

    def _shuffle_unit(inputs,
                      in_channels,
                      out_channels,
                      groups,
                      bottleneck_ratio,
                      strides=2,
                      stage=1,
                      block=1,
                      linear=False):
        """
        creates a shuffleunit
        Parameters
        ----------
        inputs:
            Input tensor with `channels_last` data format
        in_channels:
            number of input channels
        out_channels:
            number of output channels
        strides:
            An integer or tuple/list of 2 integers,
            specifying the strides of the convolution along the width and height.
        groups: int(1)
            number of groups per channel
        bottleneck_ratio: float
            bottleneck ratio implies the ratio of bottleneck channels to output channels.
            For example, bottleneck ratio = 1 : 4 means the output feature map is 4 times
            the width of the bottleneck feature map.
        stage: int(1)
            stage number
        block: int(1)
            block number
        Returns
        -------
            Output tensor of the shuffle unit; if `linear` is True, a tuple of
            the activated output and the pre-activation tensor.
        """
        if K.image_data_format() == 'channels_last':
            bn_axis = -1
        else:
            bn_axis = 1

        prefix = 'stage%d/block%d' % (stage, block)

        # default: 1/4 of the output channel of a ShuffleNet Unit
        bottleneck_channels = int(out_channels * bottleneck_ratio)
        groups = (1 if stage == 2 and block == 1 else groups)

        x = _group_conv(inputs,
                        in_channels,
                        out_channels=bottleneck_channels,
                        groups=groups,  # already reduced to 1 for stage 2, block 1 above
                        name='%s/1x1_gconv_1' % prefix)
        x = BatchNormalization(axis=bn_axis, name='%s/bn_gconv_1' % prefix)(x)
        x = Activation('relu', name='%s/relu_gconv_1' % prefix)(x)

        x = Lambda(channel_shuffle,
                   arguments={'groups': groups},
                   name='%s/channel_shuffle' % prefix)(x)
        x = DepthwiseConv2D(kernel_size=(3, 3),
                            padding="same",
                            use_bias=False,
                            strides=strides,
                            name='%s/1x1_dwconv_1' % prefix)(x)
        x = BatchNormalization(axis=bn_axis, name='%s/bn_dwconv_1' % prefix)(x)

        x = _group_conv(
            x,
            bottleneck_channels,
            out_channels=out_channels if strides == 1 else out_channels -
            in_channels,
            groups=groups,
            name='%s/1x1_gconv_2' % prefix)
        x = BatchNormalization(axis=bn_axis, name='%s/bn_gconv_2' % prefix)(x)

        if strides < 2:
            ret = Add(name='%s/add' % prefix)([x, inputs])
        else:
            avg = AveragePooling2D(pool_size=3,
                                   strides=2,
                                   padding='same',
                                   name='%s/avg_pool' % prefix)(inputs)
            ret = Concatenate(bn_axis, name='%s/concat' % prefix)([x, avg])

        if linear:
            return Activation('relu', name='%s/relu_out' % prefix)(ret), ret

        ret = Activation('relu', name='%s/relu_out' % prefix)(ret)

        return ret

    def _group_conv(x,
                    in_channels,
                    out_channels,
                    groups,
                    kernel=1,
                    stride=1,
                    name=''):
        """
        grouped convolution


        Parameters
        ----------
        x:
            Input tensor with `channels_last` data format
        in_channels:
            number of input channels
        out_channels:
            number of output channels
        groups:
            number of groups per channel
        kernel: int(1)
            An integer or tuple/list of 2 integers, specifying the
            width and height of the 2D convolution window.
            Can be a single integer to specify the same value for
            all spatial dimensions.
        stride: int(1)
            An integer or tuple/list of 2 integers,
            specifying the strides of the convolution along the width and height.
            Can be a single integer to specify the same value for all spatial dimensions.
        name: str
            A string specifying the layer name

        Returns
        -------
            Output tensor of the grouped convolution

        """
        if groups == 1:
            return Conv2D(filters=out_channels,
                          kernel_size=kernel,
                          padding='same',
                          use_bias=False,
                          strides=stride,
                          name=name)(x)

        # number of input channels per group
        ig = in_channels // groups
        group_list = []

        assert out_channels % groups == 0

        for i in range(groups):
            offset = i * ig
            group = Lambda(lambda z: z[:, :, :, offset:offset + ig],
                           name='%s/g%d_slice' % (name, i))(x)
            group_list.append(
                Conv2D(int(0.5 + out_channels / groups),
                       kernel_size=kernel,
                       strides=stride,
                       use_bias=False,
                       padding='same',
                       name='%s_/g%d' % (name, i))(group))
        return Concatenate(name='%s/concat' % name)(group_list)
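
    # Worked example (hedged): with groups=3, in_channels=240, out_channels=480,
    # each of the 3 slices takes ig = 240 // 3 = 80 input channels, produces
    # int(0.5 + 480 / 3) = 160 output channels, and the concat restores 480.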

    def channel_shuffle(x, groups):
        """

        Parameters
        ----------
        x:
            Input tensor with `channels_last` data format
        groups: int
            number of groups per channel


        Returns
        -------
            channel shuffled output tensor


        Examples
        --------
        Example for a 1D Array with 3 groups

        >>> d = np.array([0,1,2,3,4,5,6,7,8])
        >>> x = np.reshape(d, (3,3))
        >>> x = np.transpose(x, [1,0])
        >>> x = np.reshape(x, (9,))
        '[0 1 2 3 4 5 6 7 8] --> [0 3 6 1 4 7 2 5 8]'


        """
        height, width, in_channels = x.shape.as_list()[1:]
        channels_per_group = in_channels // groups

        x = K.reshape(x, [-1, height, width, groups, channels_per_group])
        x = K.permute_dimensions(x, (0, 1, 2, 4, 3))  # transpose
        x = K.reshape(x, [-1, height, width, in_channels])

        return x

    def _conv_blockSSD(inputs, filters, block_id=11):
        channel_axis = -1
        x = ZeroPadding2D(padding=(1, 1),
                          name='conv_pad_%d_1' % block_id)(inputs)
        x = Conv2D(filters, (1, 1),
                   padding='valid',
                   use_bias=False,
                   strides=(1, 1),
                   name='conv__%d_1' % block_id)(x)
        x = BatchNormalization(axis=channel_axis,
                               name='conv_%d_bn_1' % block_id)(x)
        x = Activation('relu', name='conv_%d_relu_1' % block_id)(x)
        conv = Conv2D(filters * 2, (3, 3),
                      padding='valid',
                      use_bias=False,
                      strides=(2, 2),
                      name='conv__%d_2' % block_id)(x)
        x = BatchNormalization(axis=channel_axis,
                               name='conv_%d_bn_2' % block_id)(conv)
        x = Activation('relu', name='conv_%d_relu_2' % block_id)(x)
        return x, conv

    ############################################################################
    # Build the network.
    ############################################################################

    if input_tensor is not None:
        x = Input(tensor=input_tensor,
                  shape=(img_height, img_width, img_channels))
    else:
        x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if not (divide_by_stddev is None):
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if not (subtract_mean is None):
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)

    num_shuffle_units = [3, 7, 3]
    out_dim_stage_two = {1: 144, 2: 200, 3: 240, 4: 272, 8: 384}
    exp = np.insert(np.arange(0, len(num_shuffle_units), dtype=np.float32), 0,
                    0)
    out_channels_in_stage = 2**exp
    out_channels_in_stage *= out_dim_stage_two[
        groups]  # calculate output channels for each stage
    out_channels_in_stage *= scale_factor
    out_channels_in_stage[0] = 24  # the first stage always has 24 output channels
    out_channels_in_stage = out_channels_in_stage.astype(int)

    # Get shufflenet architecture
    shufflenetv1 = shufflenet.ShuffleNet(groups=groups,
                                         scale_factor=scale_factor,
                                         input_shape=(img_height, img_width,
                                                      img_channels),
                                         include_top=False)
    FeatureExtractor = Model(
        inputs=shufflenetv1.input,
        outputs=shufflenetv1.get_layer('stage3/block8/add').output)

    # Stage 3 last block unit
    shuffle_unit13 = FeatureExtractor(x1)
    layer = Activation('relu', name='stage3/block8/relu_out')(shuffle_unit13)

    layer = _shuffle_unit(layer,
                          in_channels=out_channels_in_stage[4 - 2],
                          out_channels=out_channels_in_stage[4 - 1],
                          strides=2,
                          groups=groups,
                          bottleneck_ratio=0.25,
                          stage=4,
                          block=1)
    layer = _shuffle_unit(layer,
                          in_channels=out_channels_in_stage[4 - 1],
                          out_channels=out_channels_in_stage[4 - 1],
                          strides=1,
                          groups=groups,
                          bottleneck_ratio=0.25,
                          stage=4,
                          block=2)
    layer = _shuffle_unit(layer,
                          in_channels=out_channels_in_stage[4 - 1],
                          out_channels=out_channels_in_stage[4 - 1],
                          strides=1,
                          groups=groups,
                          bottleneck_ratio=0.25,
                          stage=4,
                          block=3)
    layer, shuffle_unit17 = _shuffle_unit(
        layer,
        in_channels=out_channels_in_stage[4 - 1],
        out_channels=out_channels_in_stage[4 - 1],
        strides=1,
        groups=groups,
        bottleneck_ratio=0.25,
        stage=4,
        block=4,
        linear=True)

    layer, conv18_2 = _conv_blockSSD(layer, 256, block_id=18)
    layer, conv19_2 = _conv_blockSSD(layer, 128, block_id=19)
    layer, conv20_2 = _conv_blockSSD(layer, 128, block_id=20)
    layer, conv21_2 = _conv_blockSSD(layer, 64, block_id=21)

    ### Build the convolutional predictor layers on top of the base network

    # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    conv13_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3),
                              padding='same',
                              name='conv13_mbox_conf')(shuffle_unit13)
    conv17_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                              padding='same',
                              name='conv17_mbox_conf')(shuffle_unit17)
    conv18_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                                padding='same',
                                name='conv18_2_mbox_conf')(conv18_2)
    conv19_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                                padding='same',
                                name='conv19_2_mbox_conf')(conv19_2)
    conv20_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                                padding='same',
                                name='conv20_2_mbox_conf')(conv20_2)
    conv21_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                                padding='same',
                                name='conv21_2_mbox_conf')(conv21_2)

    # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv13_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                             padding='same',
                             name='conv13_mbox_loc')(shuffle_unit13)
    conv17_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                             padding='same',
                             name='conv17_mbox_loc')(shuffle_unit17)
    conv18_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                               padding='same',
                               name='conv18_2_mbox_loc')(conv18_2)
    conv19_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                               padding='same',
                               name='conv19_2_mbox_loc')(conv19_2)
    conv20_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                               padding='same',
                               name='conv20_2_mbox_loc')(conv20_2)
    conv21_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                               padding='same',
                               name='conv21_2_mbox_loc')(conv21_2)

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)

    # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
    conv13_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv13_mbox_priorbox')(conv13_mbox_loc)
    conv17_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[1],
        next_scale=scales[2],
        aspect_ratios=aspect_ratios[1],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[1],
        this_offsets=offsets[1],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv17_mbox_priorbox')(conv17_mbox_loc)
    conv18_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv18_2_mbox_priorbox')(conv18_2_mbox_loc)
    conv19_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv19_2_mbox_priorbox')(conv19_2_mbox_loc)
    conv20_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv20_2_mbox_priorbox')(conv20_2_mbox_loc)
    conv21_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv21_2_mbox_priorbox')(conv21_2_mbox_loc)

    ### Reshape

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    conv13_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv13_mbox_conf_reshape')(conv13_mbox_conf)
    conv17_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv17_mbox_conf_reshape')(conv17_mbox_conf)
    conv18_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv18_2_mbox_conf_reshape')(conv18_2_mbox_conf)
    conv19_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv19_2_mbox_conf_reshape')(conv19_2_mbox_conf)
    conv20_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv20_2_mbox_conf_reshape')(conv20_2_mbox_conf)
    conv21_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv21_2_mbox_conf_reshape')(conv21_2_mbox_conf)

    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    conv13_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv13_mbox_loc_reshape')(conv13_mbox_loc)
    conv17_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv17_mbox_loc_reshape')(conv17_mbox_loc)
    conv18_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv18_2_mbox_loc_reshape')(conv18_2_mbox_loc)
    conv19_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv19_2_mbox_loc_reshape')(conv19_2_mbox_loc)
    conv20_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv20_2_mbox_loc_reshape')(conv20_2_mbox_loc)
    conv21_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv21_2_mbox_loc_reshape')(conv21_2_mbox_loc)

    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv13_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv13_mbox_priorbox_reshape')(conv13_mbox_priorbox)
    conv17_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv17_mbox_priorbox_reshape')(conv17_mbox_priorbox)
    conv18_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv18_2_mbox_priorbox_reshape')(conv18_2_mbox_priorbox)
    conv19_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv19_2_mbox_priorbox_reshape')(conv19_2_mbox_priorbox)
    conv20_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv20_2_mbox_priorbox_reshape')(conv20_2_mbox_priorbox)
    conv21_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv21_2_mbox_priorbox_reshape')(conv21_2_mbox_priorbox)

    ### Concatenate the predictions from the different layers

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv13_mbox_conf_reshape, conv17_mbox_conf_reshape,
        conv18_2_mbox_conf_reshape, conv19_2_mbox_conf_reshape,
        conv20_2_mbox_conf_reshape, conv21_2_mbox_conf_reshape
    ])

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv13_mbox_loc_reshape, conv17_mbox_loc_reshape,
        conv18_2_mbox_loc_reshape, conv19_2_mbox_loc_reshape,
        conv20_2_mbox_loc_reshape, conv21_2_mbox_loc_reshape
    ])

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv13_mbox_priorbox_reshape, conv17_mbox_priorbox_reshape,
        conv18_2_mbox_priorbox_reshape, conv19_2_mbox_priorbox_reshape,
        conv20_2_mbox_priorbox_reshape, conv21_2_mbox_priorbox_reshape
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class and box predictions and the anchors to one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            # Note: unlike the other decoder arguments, `normalize_coords` is
            # hard-coded to False here rather than passed through for inference.
            normalize_coords=False,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    return model
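
For orientation, here is a minimal sketch of how the raw training-mode output tensor can be split back into its three parts. The shapes and variable names below are illustrative assumptions for this sketch, not part of the model code above:

import numpy as np

# Assumed toy dimensions: batch of 8, 100 boxes total, 5 classes incl. background.
n_classes = 5
preds = np.zeros((8, 100, n_classes + 4 + 8))  # (batch, n_boxes_total, n_classes + 4 + 8)

class_conf = preds[..., :n_classes]                # softmaxed class confidences
box_offsets = preds[..., n_classes:n_classes + 4]  # predicted box offsets
anchor_data = preds[..., n_classes + 4:]           # anchor coordinates + variances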
Code Example #20
def ssd_512(image_size,
            n_classes,
            mode='training',
            l2_regularization=0.0005,
            min_scale=None,
            max_scale=None,
            scales=None,
            aspect_ratios_global=None,
            aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
                                     [1.0, 2.0, 0.5], [1.0, 2.0, 0.5]],
            two_boxes_for_ar1=True,
            steps=[8, 16, 32, 64, 128, 256, 512],  # one step value per predictor layer (7 for SSD512)
            offsets=None,
            clip_boxes=False,
            variances=[0.1, 0.1, 0.2, 0.2],
            coords='centroids',
            normalize_coords=True,
            subtract_mean=[123, 117, 104],
            divide_by_stddev=None,
            swap_channels=[2, 1, 0],
            confidence_thresh=0.01,
            iou_threshold=0.45,
            top_k=200,
            nms_max_output_size=400,
            return_predictor_sizes=False):
    '''
    Build a Keras model with SSD512 architecture, see references.

    The base network is a reduced atrous VGG-16, extended by the SSD architecture,
    as described in the paper.

    Most of the arguments that this function takes are only needed for the anchor
    box layers. In case you're training the network, the parameters passed here must
    be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
    trained weights, the parameters passed here must be the same as the ones used
    to produce the trained weights.

    Some of these arguments are explained in more detail in the documentation of the
    `SSDBoxEncoder` class.

    Note: Requires Keras v2.0 or later. Currently works only with the
    TensorFlow backend (v1.0 or later).

    Arguments:
        image_size (tuple): The input image size in the format `(height, width, channels)`.
        n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
        mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
            the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
            the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
            'inference' follows the exact procedure of the original Caffe implementation, while
            'inference_fast' uses a faster prediction decoding procedure.
        l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
            Set to zero to deactivate L2-regularization.
        min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images.
        max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
            of the shorter side of the input images. All scaling factors between the smallest and the
            largest will be linearly interpolated. Note that the second to last of the linearly interpolated
            scaling factors will actually be the scaling factor for the last predictor layer, while the last
            scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
            if `two_boxes_for_ar1` is `True`.
        scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
            This list must be one element longer than the number of predictor layers. The first `k` elements are the
            scaling factors for the `k` predictor layers, while the last element is used for the second box
            for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
            last scaling factor must be passed either way, even if it is not being used.
            If a list is passed, this argument overrides `min_scale` and `max_scale`. All scaling factors
            must be greater than zero.
        aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
            generated. This list is valid for all prediction layers.
        aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
            This allows you to set the aspect ratios for each predictor layer individually, which is the case for the
            original SSD512 implementation. If a list is passed, it overrides `aspect_ratios_global`.
        two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
            If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
            using the scaling factor for the respective layer, the second one will be generated using
            the geometric mean of said scaling factor and the next bigger scaling factor.
        steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
            pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
            the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
            If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
            If no steps are provided, then they will be computed such that the anchor box center points will form an
            equidistant grid within the image dimensions.
        offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
            either floats or tuples of two floats. These numbers represent for each predictor layer how many
            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
            as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
            of the step size specified in the `steps` argument. If the list contains floats, then that value will
            be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
            `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
        clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
        variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
            its respective variance value.
        coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
            of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
            and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
            i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
        subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
            of any shape that is broadcast-compatible with the image shape. The elements of this array will be
            subtracted from the image pixel intensity values. For example, pass a list of three integers
            to perform per-channel mean normalization for color images.
        divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
            floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
            intensity values will be divided by the elements of this array. For example, pass a list
            of three integers to perform per-channel standard deviation normalization for color images.
        swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
            image channels should be swapped.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box's confidence score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
        return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
            a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
            you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
            to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the
            spatial dimensions of the predictor layers); for inference you don't need them.

    Returns:
        model: The Keras SSD512 model.
        predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
            of the output tensor shape for each convolutional predictor layer. During
            training, the generator function needs this in order to transform
            the ground truth labels into tensors of identical structure as the
            output tensors of the model, which is in turn needed for the cost
            function.

    References:
        https://arxiv.org/abs/1512.02325v5
    '''

    n_predictor_layers = 7  # The number of predictor conv layers in the network is 7 for the original SSD512
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
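        # Example with illustrative values: min_scale=0.1, max_scale=0.9 and seven
        # predictor layers give np.linspace(0.1, 0.9, 8), i.e. scales of roughly
        # [0.1, 0.214, 0.329, 0.443, 0.557, 0.671, 0.786, 0.9].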

    if len(variances) != 4:
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".
            format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers
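        # Example: aspect_ratios_global=[1.0, 2.0, 0.5] with two_boxes_for_ar1=True
        # gives 4 boxes per cell for every predictor layer, i.e. n_boxes = [4] * 7.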

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)
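
    # With the default swap_channels=[2, 1, 0], the swap above converts RGB input
    # to the BGR channel order expected by weights ported from the original Caffe
    # models.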

    ############################################################################
    # Build the network.
    ############################################################################

    x = Input(shape=(img_height, img_width, img_channels))

    # The following identity layer is only needed so that the subsequent lambda layers can be optional.
    x1 = Lambda(identity_layer,
                output_shape=(img_height, img_width, img_channels),
                name='identity_layer')(x)
    if subtract_mean is not None:
        x1 = Lambda(input_mean_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_mean_normalization')(x1)
    if divide_by_stddev is not None:
        x1 = Lambda(input_stddev_normalization,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_stddev_normalization')(x1)
    if swap_channels:
        x1 = Lambda(input_channel_swap,
                    output_shape=(img_height, img_width, img_channels),
                    name='input_channel_swap')(x1)
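
    # The optional Lambda layers above bake the preprocessing (mean subtraction,
    # stddev division, channel swap) into the graph itself, so raw images can be
    # fed to the model without a separate preprocessing step.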

    conv1_1 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_1')(x1)
    conv1_2 = Conv2D(64, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv1_2')(conv1_1)
    pool1 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool1')(conv1_2)

    conv2_1 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_1')(pool1)
    conv2_2 = Conv2D(128, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv2_2')(conv2_1)
    pool2 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool2')(conv2_2)

    conv3_1 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_1')(pool2)
    conv3_2 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_2')(conv3_1)
    conv3_3 = Conv2D(256, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv3_3')(conv3_2)
    pool3 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool3')(conv3_3)

    conv4_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_1')(pool3)
    conv4_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_2')(conv4_1)
    conv4_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv4_3')(conv4_2)
    pool4 = MaxPooling2D(pool_size=(2, 2),
                         strides=(2, 2),
                         padding='same',
                         name='pool4')(conv4_3)

    conv5_1 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_1')(pool4)
    conv5_2 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_2')(conv5_1)
    conv5_3 = Conv2D(512, (3, 3),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv5_3')(conv5_2)
    pool5 = MaxPooling2D(pool_size=(3, 3),
                         strides=(1, 1),
                         padding='same',
                         name='pool5')(conv5_3)

    fc6 = Conv2D(1024, (3, 3),
                 dilation_rate=(6, 6),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc6')(pool5)
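    # fc6 (and fc7 below) are the convolutionalized versions of VGG-16's fully
    # connected layers; the dilation rate of 6 implements the atrous trick from
    # the SSD paper.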

    fc7 = Conv2D(1024, (1, 1),
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=l2(l2_reg),
                 name='fc7')(fc6)

    conv6_1 = Conv2D(256, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_1')(fc7)
    conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv6_padding')(conv6_1)
    conv6_2 = Conv2D(512, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv6_2')(conv6_1)

    conv7_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_1')(conv6_2)
    conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv7_padding')(conv7_1)
    conv7_2 = Conv2D(256, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv7_2')(conv7_1)

    conv8_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_1')(conv7_2)
    conv8_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv8_padding')(conv8_1)
    conv8_2 = Conv2D(256, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv8_2')(conv8_1)

    conv9_1 = Conv2D(128, (1, 1),
                     activation='relu',
                     padding='same',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_1')(conv8_2)
    conv9_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                            name='conv9_padding')(conv9_1)
    conv9_2 = Conv2D(256, (3, 3),
                     strides=(2, 2),
                     activation='relu',
                     padding='valid',
                     kernel_initializer='he_normal',
                     kernel_regularizer=l2(l2_reg),
                     name='conv9_2')(conv9_1)

    conv10_1 = Conv2D(128, (1, 1),
                      activation='relu',
                      padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='conv10_1')(conv9_2)
    conv10_1 = ZeroPadding2D(padding=((1, 1), (1, 1)),
                             name='conv10_padding')(conv10_1)
    conv10_2 = Conv2D(256, (4, 4),
                      strides=(1, 1),
                      activation='relu',
                      padding='valid',
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='conv10_2')(conv10_1)

    # Feed conv4_3 into the L2 normalization layer
    conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
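    # This is ParseNet-style L2 normalization with a learnable scale (initialized
    # to 20), used in the original SSD because conv4_3's feature magnitudes differ
    # from those of the deeper layers.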

    ### Build the convolutional predictor layers on top of the base network

    # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
    # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
    conv4_3_norm_mbox_conf = Conv2D(
        n_boxes[0] * n_classes, (3, 3),
        padding='same',
        kernel_initializer='he_normal',
        kernel_regularizer=l2(l2_reg),
        name='conv4_3_norm_mbox_conf')(conv4_3_norm)
    fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3),
                           padding='same',
                           kernel_initializer='he_normal',
                           kernel_regularizer=l2(l2_reg),
                           name='fc7_mbox_conf')(fc7)
    conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv6_2_mbox_conf')(conv6_2)
    conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv7_2_mbox_conf')(conv7_2)
    conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv8_2_mbox_conf')(conv8_2)
    conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv9_2_mbox_conf')(conv9_2)
    conv10_2_mbox_conf = Conv2D(n_boxes[6] * n_classes, (3, 3),
                                padding='same',
                                kernel_initializer='he_normal',
                                kernel_regularizer=l2(l2_reg),
                                name='conv10_2_mbox_conf')(conv10_2)
    # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
    # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
    conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3),
                                   padding='same',
                                   kernel_initializer='he_normal',
                                   kernel_regularizer=l2(l2_reg),
                                   name='conv4_3_norm_mbox_loc')(conv4_3_norm)
    fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3),
                          padding='same',
                          kernel_initializer='he_normal',
                          kernel_regularizer=l2(l2_reg),
                          name='fc7_mbox_loc')(fc7)
    conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv6_2_mbox_loc')(conv6_2)
    conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv7_2_mbox_loc')(conv7_2)
    conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv8_2_mbox_loc')(conv8_2)
    conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3),
                              padding='same',
                              kernel_initializer='he_normal',
                              kernel_regularizer=l2(l2_reg),
                              name='conv9_2_mbox_loc')(conv9_2)
    conv10_2_mbox_loc = Conv2D(n_boxes[6] * 4, (3, 3),
                               padding='same',
                               kernel_initializer='he_normal',
                               kernel_regularizer=l2(l2_reg),
                               name='conv10_2_mbox_loc')(conv10_2)

    ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)

    # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
    conv4_3_norm_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[0],
        next_scale=scales[1],
        aspect_ratios=aspect_ratios[0],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[0],
        this_offsets=offsets[0],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
    fc7_mbox_priorbox = AnchorBoxes(img_height,
                                    img_width,
                                    this_scale=scales[1],
                                    next_scale=scales[2],
                                    aspect_ratios=aspect_ratios[1],
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    this_steps=steps[1],
                                    this_offsets=offsets[1],
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    coords=coords,
                                    normalize_coords=normalize_coords,
                                    name='fc7_mbox_priorbox')(fc7_mbox_loc)
    conv6_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[2],
        next_scale=scales[3],
        aspect_ratios=aspect_ratios[2],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[2],
        this_offsets=offsets[2],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
    conv7_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[3],
        next_scale=scales[4],
        aspect_ratios=aspect_ratios[3],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[3],
        this_offsets=offsets[3],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
    conv8_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[4],
        next_scale=scales[5],
        aspect_ratios=aspect_ratios[4],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[4],
        this_offsets=offsets[4],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
    conv9_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[5],
        next_scale=scales[6],
        aspect_ratios=aspect_ratios[5],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[5],
        this_offsets=offsets[5],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)
    conv10_2_mbox_priorbox = AnchorBoxes(
        img_height,
        img_width,
        this_scale=scales[6],
        next_scale=scales[7],
        aspect_ratios=aspect_ratios[6],
        two_boxes_for_ar1=two_boxes_for_ar1,
        this_steps=steps[6],
        this_offsets=offsets[6],
        clip_boxes=clip_boxes,
        variances=variances,
        coords=coords,
        normalize_coords=normalize_coords,
        name='conv10_2_mbox_priorbox')(conv10_2_mbox_loc)

    ### Reshape

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
    conv4_3_norm_mbox_conf_reshape = Reshape(
        (-1, n_classes),
        name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
    fc7_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
    conv6_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
    conv7_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
    conv8_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
    conv9_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)
    conv10_2_mbox_conf_reshape = Reshape(
        (-1, n_classes), name='conv10_2_mbox_conf_reshape')(conv10_2_mbox_conf)
    # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    conv4_3_norm_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
    fc7_mbox_loc_reshape = Reshape((-1, 4),
                                   name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
    conv6_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
    conv7_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
    conv8_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
    conv9_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)
    conv10_2_mbox_loc_reshape = Reshape(
        (-1, 4), name='conv10_2_mbox_loc_reshape')(conv10_2_mbox_loc)
    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    conv4_3_norm_mbox_priorbox_reshape = Reshape(
        (-1, 8),
        name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
    fc7_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
    conv6_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
    conv7_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
    conv8_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
    conv9_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)
    conv10_2_mbox_priorbox_reshape = Reshape(
        (-1, 8), name='conv10_2_mbox_priorbox_reshape')(conv10_2_mbox_priorbox)

    ### Concatenate the predictions from the different layers

    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1, the number of boxes per layer
    # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
    mbox_conf = Concatenate(axis=1, name='mbox_conf')([
        conv4_3_norm_mbox_conf_reshape, fc7_mbox_conf_reshape,
        conv6_2_mbox_conf_reshape, conv7_2_mbox_conf_reshape,
        conv8_2_mbox_conf_reshape, conv9_2_mbox_conf_reshape,
        conv10_2_mbox_conf_reshape
    ])

    # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
    mbox_loc = Concatenate(axis=1, name='mbox_loc')([
        conv4_3_norm_mbox_loc_reshape, fc7_mbox_loc_reshape,
        conv6_2_mbox_loc_reshape, conv7_2_mbox_loc_reshape,
        conv8_2_mbox_loc_reshape, conv9_2_mbox_loc_reshape,
        conv10_2_mbox_loc_reshape
    ])

    # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
    mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([
        conv4_3_norm_mbox_priorbox_reshape, fc7_mbox_priorbox_reshape,
        conv6_2_mbox_priorbox_reshape, conv7_2_mbox_priorbox_reshape,
        conv8_2_mbox_priorbox_reshape, conv9_2_mbox_priorbox_reshape,
        conv10_2_mbox_priorbox_reshape
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)

    # Concatenate the class and box predictions and the anchors to one large predictions vector
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
    predictions = Concatenate(axis=2, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_priorbox])

    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        predictor_sizes = np.array([
            conv4_3_norm_mbox_conf._keras_shape[1:3],
            fc7_mbox_conf._keras_shape[1:3],
            conv6_2_mbox_conf._keras_shape[1:3],
            conv7_2_mbox_conf._keras_shape[1:3],
            conv8_2_mbox_conf._keras_shape[1:3],
            conv9_2_mbox_conf._keras_shape[1:3],
            conv10_2_mbox_conf._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
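
A hedged usage sketch for ssd_512: the concrete scale and step values below are illustrative assumptions that merely satisfy the argument checks above (seven predictor layers require eight scales and seven steps); they are not values prescribed by this document.

model = ssd_512(image_size=(512, 512, 3),
                n_classes=20,  # e.g. Pascal VOC
                mode='training',
                scales=[0.07, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.05],
                steps=[8, 16, 32, 64, 128, 256, 512])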
Code Example #21
def build_model(image_size,
                n_classes,
                mode='training',
                l2_regularization=0.0,
                min_scale=0.1,
                max_scale=0.9,
                scales=None,
                aspect_ratios_global=[0.5, 1.0, 2.0],
                aspect_ratios_per_layer=None,
                two_boxes_for_ar1=True,
                steps=None,
                offsets=None,
                clip_boxes=False,
                variances=[1.0, 1.0, 1.0, 1.0],
                coords='centroids',
                normalize_coords=False,
                subtract_mean=None,
                divide_by_stddev=None,
                swap_channels=False,
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400,
                return_predictor_sizes=False):
    '''
    Build a Keras model with SSD architecture, see references.

    The model consists of convolutional feature layers and a number of convolutional
    predictor layers that take their input from different feature layers.
    The model is fully convolutional.
    '''

    n_predictor_layers = 4  # The number of predictor conv layers in the network
    n_classes += 1  # Account for the background class.
    l2_reg = l2_regularization  # Make the internal name shorter.
    img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

    ############################################################################
    # Get a few exceptions out of the way.
    ############################################################################

    if aspect_ratios_global is None and aspect_ratios_per_layer is None:
        raise ValueError(
            "`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified."
        )
    if aspect_ratios_per_layer:
        if len(aspect_ratios_per_layer) != n_predictor_layers:
            raise ValueError(
                "It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}."
                .format(n_predictor_layers, len(aspect_ratios_per_layer)))

    if (min_scale is None or max_scale is None) and scales is None:
        raise ValueError(
            "Either `min_scale` and `max_scale` or `scales` need to be specified."
        )
    if scales:
        if len(scales) != n_predictor_layers + 1:
            raise ValueError(
                "It must be either scales is None or len(scales) == {}, but len(scales) == {}."
                .format(n_predictor_layers + 1, len(scales)))
    else:  # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
        scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)

    if len(variances) != 4:  # We need one variance value for each of the four box coordinates
        raise ValueError(
            "4 variance values must be passed, but {} values were received.".
            format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            "All variances must be >0, but the variances given are {}".format(
                variances))

    if (steps is not None) and (len(steps) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one step value per predictor layer.")

    if (offsets is not None) and (len(offsets) != n_predictor_layers):
        raise ValueError(
            "You must provide exactly one offset value per predictor layer.")

    ############################################################################
    # Compute the anchor box parameters.
    ############################################################################

    # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
    if aspect_ratios_per_layer:
        aspect_ratios = aspect_ratios_per_layer
    else:
        aspect_ratios = [aspect_ratios_global] * n_predictor_layers

    # Compute the number of boxes to be predicted per cell for each predictor layer.
    # We need this so that we know how many channels the predictor layers need to have.
    if aspect_ratios_per_layer:
        n_boxes = []
        for ar in aspect_ratios_per_layer:
            if (1 in ar) and two_boxes_for_ar1:
                n_boxes.append(len(ar) + 1)  # +1 for the second box for aspect ratio 1
            else:
                n_boxes.append(len(ar))
    else:  # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
            n_boxes = len(aspect_ratios_global) + 1
        else:
            n_boxes = len(aspect_ratios_global)
        n_boxes = [n_boxes] * n_predictor_layers

    if steps is None:
        steps = [None] * n_predictor_layers
    if offsets is None:
        offsets = [None] * n_predictor_layers

    ############################################################################
    # Define functions for the Lambda layers below.
    ############################################################################

    def identity_layer(tensor):
        return tensor

    def input_mean_normalization(tensor):
        return tensor - np.array(subtract_mean)

    def input_stddev_normalization(tensor):
        return tensor / np.array(divide_by_stddev)

    def input_channel_swap(tensor):
        if len(swap_channels) == 3:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]]
            ],
                           axis=-1)
        elif len(swap_channels) == 4:
            return K.stack([
                tensor[..., swap_channels[0]], tensor[..., swap_channels[1]],
                tensor[..., swap_channels[2]], tensor[..., swap_channels[3]]
            ],
                           axis=-1)

    ############################################################################
    # Build the network.
    ############################################################################
    base_model = MobileNet(input_shape=(img_height, img_width, img_channels),
                           weights=None,
                           include_top=False)
    # NOTE: Hard-coded local path to the pre-trained MobileNet weights; adjust it
    # to wherever `mobilenet_1_0_224_tf_no_top.h5` lives on your system.
    base_model.load_weights("G:/keras_weights/mobilenet_1_0_224_tf_no_top.h5")
    # base_model.summary()  # uncomment to inspect the backbone
    x = base_model.input
    conv4 = base_model.get_layer("conv_pw_4_relu").output
    conv5 = base_model.get_layer("conv_pw_6_relu").output
    conv6 = base_model.get_layer("conv_pw_12_relu").output
    conv7 = base_model.get_layer("conv_pw_13_relu").output
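
    # Under standard MobileNet v1 striding, these taps correspond to increasingly
    # coarse feature maps (roughly 28x28, 14x14, 7x7, and 7x7 for a 224x224
    # input); the exact sizes depend on img_height and img_width.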

    # Build the convolutional predictor layers on top of conv layers 4, 5, 6, and 7.
    # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization)
    # We predict `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes`
    # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4`
    # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)`
    classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes4')(conv4)
    classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes5')(conv5)
    classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes6')(conv6)
    classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3),
                      strides=(1, 1),
                      padding="same",
                      kernel_initializer='he_normal',
                      kernel_regularizer=l2(l2_reg),
                      name='classes7')(conv7)
    # Output shape of `boxes`: `(batch, height, width, n_boxes * 4)`
    boxes4 = Conv2D(n_boxes[0] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes4')(conv4)
    boxes5 = Conv2D(n_boxes[1] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes5')(conv5)
    boxes6 = Conv2D(n_boxes[2] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes6')(conv6)
    boxes7 = Conv2D(n_boxes[3] * 4, (3, 3),
                    strides=(1, 1),
                    padding="same",
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(l2_reg),
                    name='boxes7')(conv7)

    # Generate the anchor boxes
    # Output shape of `anchors`: `(batch, height, width, n_boxes, 8)`
    anchors4 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[0],
                           next_scale=scales[1],
                           aspect_ratios=aspect_ratios[0],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[0],
                           this_offsets=offsets[0],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors4')(boxes4)
    anchors5 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[1],
                           next_scale=scales[2],
                           aspect_ratios=aspect_ratios[1],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[1],
                           this_offsets=offsets[1],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors5')(boxes5)
    anchors6 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[2],
                           next_scale=scales[3],
                           aspect_ratios=aspect_ratios[2],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[2],
                           this_offsets=offsets[2],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors6')(boxes6)
    anchors7 = AnchorBoxes(img_height,
                           img_width,
                           this_scale=scales[3],
                           next_scale=scales[4],
                           aspect_ratios=aspect_ratios[3],
                           two_boxes_for_ar1=two_boxes_for_ar1,
                           this_steps=steps[3],
                           this_offsets=offsets[3],
                           clip_boxes=clip_boxes,
                           variances=variances,
                           coords=coords,
                           normalize_coords=normalize_coords,
                           name='anchors7')(boxes7)

    # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
    # We want the classes isolated in the last axis to perform softmax on them
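    # For instance, a (hypothetical) 32x32 feature map with 4 boxes per cell would
    # contribute 32 * 32 * 4 = 4096 rows here; the actual feature map sizes depend
    # on the input resolution.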
    classes4_reshaped = Reshape((-1, n_classes),
                                name='classes4_reshape')(classes4)
    classes5_reshaped = Reshape((-1, n_classes),
                                name='classes5_reshape')(classes5)
    classes6_reshaped = Reshape((-1, n_classes),
                                name='classes6_reshape')(classes6)
    classes7_reshaped = Reshape((-1, n_classes),
                                name='classes7_reshape')(classes7)
    # Reshape the box coordinate predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
    # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
    boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes4)
    boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes5)
    boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(boxes6)
    boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(boxes7)
    # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
    anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4)
    anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5)
    anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6)
    anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7)

    # Concatenate the predictions from the different layers and the associated anchor box tensors
    # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
    # so we want to concatenate along axis 1
    # Output shape of `classes_concat`: (batch, n_boxes_total, n_classes)
    classes_concat = Concatenate(axis=1, name='classes_concat')([
        classes4_reshaped, classes5_reshaped, classes6_reshaped,
        classes7_reshaped
    ])

    # Output shape of `boxes_concat`: (batch, n_boxes_total, 4)
    boxes_concat = Concatenate(axis=1, name='boxes_concat')(
        [boxes4_reshaped, boxes5_reshaped, boxes6_reshaped, boxes7_reshaped])

    # Output shape of `anchors_concat`: (batch, n_boxes_total, 8)
    anchors_concat = Concatenate(axis=1, name='anchors_concat')([
        anchors4_reshaped, anchors5_reshaped, anchors6_reshaped,
        anchors7_reshaped
    ])

    # The box coordinate predictions will go into the loss function just the way they are,
    # but for the class predictions, we'll apply a softmax activation layer first
    classes_softmax = Activation('softmax',
                                 name='classes_softmax')(classes_concat)

    # Concatenate the class and box coordinate predictions and the anchors into one large predictions tensor
    # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
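    # For example, with 2 object classes (+ background) each box row holds
    # 3 + 4 + 8 = 15 values.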
    predictions = Concatenate(axis=2, name='predictions')(
        [classes_softmax, boxes_concat, anchors_concat])

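    # In 'training' mode the raw `predictions` tensor goes straight to the loss
    # function. In the two inference modes it is decoded on the fly (confidence
    # thresholding, per-class NMS, top-k filtering); assuming the DecodeDetections
    # layers follow the usual ssd_keras convention, the decoded output has shape
    # `(batch, top_k, 6)` with entries (class_id, confidence, xmin, ymin, xmax, ymax).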
    if mode == 'training':
        model = Model(inputs=x, outputs=predictions)
    elif mode == 'inference':
        decoded_predictions = DecodeDetections(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    elif mode == 'inference_fast':
        decoded_predictions = DecodeDetectionsFast(
            confidence_thresh=confidence_thresh,
            iou_threshold=iou_threshold,
            top_k=top_k,
            nms_max_output_size=nms_max_output_size,
            coords=coords,
            normalize_coords=normalize_coords,
            img_height=img_height,
            img_width=img_width,
            name='decoded_predictions')(predictions)
        model = Model(inputs=x, outputs=decoded_predictions)
    else:
        raise ValueError(
            "`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'."
            .format(mode))

    if return_predictor_sizes:
        # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
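        # Note: `_keras_shape` only exists on tensors from older standalone Keras;
        # under newer Keras versions, `K.int_shape(classes4)[1:3]` (with
        # `from keras import backend as K`) would presumably be the equivalent.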
        predictor_sizes = np.array([
            classes4._keras_shape[1:3], classes5._keras_shape[1:3],
            classes6._keras_shape[1:3], classes7._keras_shape[1:3]
        ])
        return model, predictor_sizes
    else:
        return model
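

if __name__ == '__main__':
    # -----------------------------------------------------------------------
    # Usage sketch: build the network in training mode and compile it with the
    # SSD multibox loss. This is a minimal example; the import path for
    # `SSDLoss` assumes the ssd_keras layout, and the scales/aspect ratios
    # below are placeholder hyperparameters, so adjust both to your setup.
    # -----------------------------------------------------------------------
    from keras.optimizers import Adam
    from keras_loss_function.keras_ssd_loss import SSDLoss  # assumed module path

    model, predictor_sizes = dronenet(
        image_size=(300, 300, 3),
        n_classes=2,  # object classes, excluding the background class
        mode='training',
        scales=[0.1, 0.2, 0.37, 0.54, 0.71],  # n_predictor_layers + 1 = 5 values
        aspect_ratios_per_layer=[[1.0, 2.0, 0.5],  # one list per predictor layer
                                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                 [1.0, 2.0, 0.5]],
        steps=[8, 16, 32, 64],
        offsets=[0.5, 0.5, 0.5, 0.5],
        return_predictor_sizes=True)

    ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
    model.compile(optimizer=Adam(lr=0.001), loss=ssd_loss.compute_loss)
    model.summary()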