# Example 1
def SSD_VGG16(
    config,
    label_maps,
    num_predictions=10,
    is_training=True,
):
    """ This network follows the official caffe implementation of SSD: https://github.com/chuanqi305/ssd
    1. Changes made to VGG16 config D layers:
        - fc6 and fc7 is converted into convolutional layers instead of fully connected layers specify in the VGG paper
        - atrous convolution is used to turn fc6 and fc7 into convolutional layers
        - pool5 size is changed from (2, 2) to (3, 3) and its strides is changed from (2, 2) to (1, 1)
        - l2 normalization is used only on the output of conv4_3 because it has different scales compared to other layers. To learn more read SSD paper section 3.1 PASCAL VOC2007
    2. In Keras:
        - padding "same" is equivalent to padding 1 in caffe
        - padding "valid" is equivalent to padding 0 (no padding) in caffe
        - Atrous Convolution is referred to as dilated convolution in Keras and can be used by specifying dilation rate in Conv2D
    3. The name of each layer in the network is renamed to match the official caffe implementation

    Args:
        - config: python dict as read from the config file
        - label_maps: A python list containing the classes
        - num_predictions: The number of predictions to produce as final output
        - is_training: whether the model is constructed for training purpose or inference purpose

    Returns:
        - A keras version of SSD300 with VGG16 as backbone network.

    Code References:
        - https://github.com/chuanqi305/ssd
        - https://github.com/pierluigiferrari/ssd_keras/blob/master/models/keras_ssd300.py

    Paper References:
        - Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016).
          SSD: Single Shot MultiBox Detector. https://arxiv.org/abs/1512.02325
    """
    model_config = config["model"]
    # Square RGB input: (size, size, 3).
    input_shape = (model_config["input_size"], model_config["input_size"], 3)
    num_classes = len(label_maps) + 1  # for background class
    l2_reg = model_config["l2_regularization"]
    kernel_initializer = model_config["kernel_initializer"]
    default_boxes_config = model_config["default_boxes"]
    extra_box_for_ar_1 = model_config["extra_box_for_ar_1"]

    # construct the base network and extra feature layers
    base_network = VGG16(input_shape=input_shape,
                         classes=num_classes,
                         weights='imagenet',
                         include_top=False)
    # Truncate VGG16 after block5_conv3; the classifier head is not used.
    base_network = Model(inputs=base_network.input,
                         outputs=base_network.get_layer('block5_conv3').output)
    # NOTE(review): renaming via the private `_name` attribute relies on Keras
    # internals — TODO confirm this still works with the pinned Keras version.
    base_network.get_layer("input_1")._name = "input"
    for layer in base_network.layers:
        # Rename Keras layers to caffe-style names:
        #   "block1_pool"  -> "pool1"
        #   "block1_conv1" -> "conv1_1"
        if "pool" in layer.name:
            new_name = layer.name.replace("block", "")
            new_name = new_name.split("_")
            new_name = f"{new_name[1]}{new_name[0]}"
        else:
            new_name = layer.name.replace("conv", "")
            new_name = new_name.replace("block", "conv")
        base_network.get_layer(layer.name)._name = new_name
        # NOTE(review): assigning `_kernel_initializer`/`_kernel_regularizer`
        # after the layer is built may have no effect on existing weights —
        # verify this is intentional.
        base_network.get_layer(layer.name)._kernel_initializer = "he_normal"
        base_network.get_layer(layer.name)._kernel_regularizer = l2(l2_reg)
        layer.trainable = False  # each layer of the base network should not be trainable

    # 3x3 conv + ReLU helper for the extra feature layers.
    def conv_block_1(x,
                     filters,
                     name,
                     padding='valid',
                     dilation_rate=(1, 1),
                     strides=(1, 1)):
        return Conv2D(filters,
                      kernel_size=(3, 3),
                      strides=strides,
                      activation='relu',
                      padding=padding,
                      dilation_rate=dilation_rate,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=l2(l2_reg),
                      name=name)(x)

    # 1x1 conv + ReLU helper (channel-reduction layers).
    def conv_block_2(x,
                     filters,
                     name,
                     padding='valid',
                     dilation_rate=(1, 1),
                     strides=(1, 1)):
        return Conv2D(filters,
                      kernel_size=(1, 1),
                      strides=strides,
                      activation='relu',
                      padding=padding,
                      dilation_rate=dilation_rate,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=l2(l2_reg),
                      name=name)(x)

    # pool5: (3, 3) pooling with stride 1 keeps the spatial resolution
    # (see docstring point 1).
    pool5 = MaxPool2D(pool_size=(3, 3),
                      strides=(1, 1),
                      padding="same",
                      name="pool5")(base_network.get_layer('conv5_3').output)

    # fc6/fc7: VGG fully-connected layers recast as (dilated) convolutions.
    fc6 = conv_block_1(x=pool5,
                       filters=1024,
                       padding="same",
                       dilation_rate=(6, 6),
                       name="fc6")
    fc7 = conv_block_2(x=fc6, filters=1024, padding="same", name="fc7")
    # Extra feature layers conv8-conv11: each pair is a 1x1 reduction followed
    # by a 3x3 conv; conv8_2/conv9_2 downsample with stride 2, conv10_2/conv11_2
    # shrink via "valid" padding.
    conv8_1 = conv_block_2(x=fc7, filters=256, padding="valid", name="conv8_1")
    conv8_2 = conv_block_1(x=conv8_1,
                           filters=512,
                           padding="same",
                           strides=(2, 2),
                           name="conv8_2")
    conv9_1 = conv_block_2(x=conv8_2,
                           filters=128,
                           padding="valid",
                           name="conv9_1")
    conv9_2 = conv_block_1(x=conv9_1,
                           filters=256,
                           padding="same",
                           strides=(2, 2),
                           name="conv9_2")
    conv10_1 = conv_block_2(x=conv9_2,
                            filters=128,
                            padding="valid",
                            name="conv10_1")
    conv10_2 = conv_block_1(x=conv10_1,
                            filters=256,
                            padding="valid",
                            name="conv10_2")
    conv11_1 = conv_block_2(x=conv10_2,
                            filters=128,
                            padding="valid",
                            name="conv11_1")
    conv11_2 = conv_block_1(x=conv11_1,
                            filters=256,
                            padding="valid",
                            name="conv11_2")

    model = Model(inputs=base_network.input, outputs=conv11_2)

    # construct the prediction layers (conf, loc, & default_boxes)
    # Default-box scales are linearly spaced between min_scale and max_scale,
    # one per feature map layer (SSD paper eq. 4).
    scales = np.linspace(default_boxes_config["min_scale"],
                         default_boxes_config["max_scale"],
                         len(default_boxes_config["layers"]))
    mbox_conf_layers = []
    mbox_loc_layers = []
    mbox_default_boxes_layers = []
    for i, layer in enumerate(default_boxes_config["layers"]):
        num_default_boxes = get_number_default_boxes(
            layer["aspect_ratios"], extra_box_for_ar_1=extra_box_for_ar_1)
        x = model.get_layer(layer["name"]).output
        layer_name = layer["name"]

        # conv4_3 has different scales compared to other feature map layers
        if layer_name == "conv4_3":
            layer_name = f"{layer_name}_norm"
            x = L2Normalization(gamma_init=20, name=layer_name)(x)

        # Class-confidence head: num_default_boxes * num_classes channels,
        # reshaped to (num_boxes, num_classes).
        layer_mbox_conf = Conv2D(filters=num_default_boxes * num_classes,
                                 kernel_size=(3, 3),
                                 padding='same',
                                 kernel_initializer=kernel_initializer,
                                 kernel_regularizer=l2(l2_reg),
                                 name=f"{layer_name}_mbox_conf")(x)
        layer_mbox_conf_reshape = Reshape(
            (-1, num_classes),
            name=f"{layer_name}_mbox_conf_reshape")(layer_mbox_conf)
        # Localization head: 4 box offsets per default box.
        layer_mbox_loc = Conv2D(filters=num_default_boxes * 4,
                                kernel_size=(3, 3),
                                padding='same',
                                kernel_initializer=kernel_initializer,
                                kernel_regularizer=l2(l2_reg),
                                name=f"{layer_name}_mbox_loc")(x)
        layer_mbox_loc_reshape = Reshape(
            (-1, 4), name=f"{layer_name}_mbox_loc_reshape")(layer_mbox_loc)
        # Default (anchor) boxes: next_scale falls back to 1 on the last layer.
        layer_default_boxes = DefaultBoxes(
            image_shape=input_shape,
            scale=scales[i],
            next_scale=scales[i + 1]
            if i + 1 <= len(default_boxes_config["layers"]) - 1 else 1,
            aspect_ratios=layer["aspect_ratios"],
            variances=default_boxes_config["variances"],
            extra_box_for_ar_1=extra_box_for_ar_1,
            name=f"{layer_name}_default_boxes")(x)
        # 8 = 4 box coordinates + 4 variances per default box.
        layer_default_boxes_reshape = Reshape(
            (-1, 8),
            name=f"{layer_name}_default_boxes_reshape")(layer_default_boxes)
        mbox_conf_layers.append(layer_mbox_conf_reshape)
        mbox_loc_layers.append(layer_mbox_loc_reshape)
        mbox_default_boxes_layers.append(layer_default_boxes_reshape)

    # concentenate class confidence predictions from different feature map layers
    mbox_conf = Concatenate(axis=-2, name="mbox_conf")(mbox_conf_layers)
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)
    # concentenate object location predictions from different feature map layers
    mbox_loc = Concatenate(axis=-2, name="mbox_loc")(mbox_loc_layers)
    # concentenate default boxes from different feature map layers
    mbox_default_boxes = Concatenate(
        axis=-2, name="mbox_default_boxes")(mbox_default_boxes_layers)
    # concatenate confidence score predictions, bounding box predictions, and default boxes
    predictions = Concatenate(axis=-1, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_default_boxes])

    # Training model emits raw predictions (loss decodes them); the inference
    # model appends NMS-style decoding producing num_predictions outputs.
    if is_training:
        return Model(inputs=base_network.input, outputs=predictions)

    decoded_predictions = DecodeSSDPredictions(
        input_size=model_config["input_size"],
        num_predictions=num_predictions,
        name="decoded_predictions")(predictions)
    return Model(inputs=base_network.input, outputs=decoded_predictions)
# Example 2
def TBPP_VGG16(
    config,
    num_predictions=10,
    is_training=True,
):
    """ Construct a TextBoxes++ style detector with a VGG16 backbone.

    Structure mirrors SSD_VGG16 but is specialized for text detection:
        - num_classes is fixed to 2 (text vs background), so no label_maps
          argument is needed
        - the prediction heads use (3, 5) kernels instead of (3, 3)
        - an additional "quad" head predicts 8 values per default box
          (quadrilateral corner offsets)
        - the input is zero-padded by 2 pixels before entering VGG16

    Args:
        - config: python dict as read from the config file
        - num_predictions: The number of predictions to produce as final output
        - is_training: whether the model is constructed for training purpose or inference purpose

    Returns:
        - A keras model producing raw predictions (training) or decoded
          predictions via DecodeTBPPPredictions (inference).
    """
    model_config = config["model"]
    input_shape = (model_config["input_size"], model_config["input_size"], 3)
    num_classes = 2  # 1 for text and 1 for background
    l2_reg = model_config["l2_regularization"]
    kernel_initializer = model_config["kernel_initializer"]
    default_boxes_config = model_config["default_boxes"]
    extra_box_for_ar_1 = model_config["extra_box_for_ar_1"]

    # NOTE(review): `input_tensor` is rebound to the ZeroPadding2D output, so
    # the original Input tensor is only reachable through the Keras graph —
    # confirm `base_network.input` resolves to the un-padded Input layer.
    input_tensor = Input(shape=input_shape)
    input_tensor = ZeroPadding2D(padding=(2, 2))(input_tensor)

    # construct the base network and extra feature layers
    base_network = VGG16(input_tensor=input_tensor,
                         classes=num_classes,
                         weights='imagenet',
                         include_top=False)

    # Truncate VGG16 after block5_conv3; the classifier head is not used.
    base_network = Model(inputs=base_network.input,
                         outputs=base_network.get_layer('block5_conv3').output)
    # NOTE(review): renaming via the private `_name` attribute relies on Keras
    # internals — TODO confirm this still works with the pinned Keras version.
    base_network.get_layer("input_1")._name = "input"
    for layer in base_network.layers:
        # Rename Keras layers to caffe-style names:
        #   "block1_pool"  -> "pool1"
        #   "block1_conv1" -> "conv1_1"
        if "pool" in layer.name:
            new_name = layer.name.replace("block", "")
            new_name = new_name.split("_")
            new_name = f"{new_name[1]}{new_name[0]}"
        else:
            new_name = layer.name.replace("conv", "")
            new_name = new_name.replace("block", "conv")
        base_network.get_layer(layer.name)._name = new_name
        # NOTE(review): assigning `_kernel_initializer`/`_kernel_regularizer`
        # after the layer is built may have no effect on existing weights —
        # verify this is intentional.
        base_network.get_layer(layer.name)._kernel_initializer = "he_normal"
        base_network.get_layer(layer.name)._kernel_regularizer = l2(l2_reg)
        layer.trainable = False  # each layer of the base network should not be trainable

    # 3x3 conv + ReLU helper for the extra feature layers.
    def conv_block_1(x,
                     filters,
                     name,
                     padding='valid',
                     dilation_rate=(1, 1),
                     strides=(1, 1)):
        return Conv2D(filters,
                      kernel_size=(3, 3),
                      strides=strides,
                      activation='relu',
                      padding=padding,
                      dilation_rate=dilation_rate,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=l2(l2_reg),
                      name=name)(x)

    # 1x1 conv + ReLU helper (channel-reduction layers).
    def conv_block_2(x,
                     filters,
                     name,
                     padding='valid',
                     dilation_rate=(1, 1),
                     strides=(1, 1)):
        return Conv2D(filters,
                      kernel_size=(1, 1),
                      strides=strides,
                      activation='relu',
                      padding=padding,
                      dilation_rate=dilation_rate,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=l2(l2_reg),
                      name=name)(x)

    # pool5: (3, 3) pooling with stride 1 keeps the spatial resolution.
    pool5 = MaxPool2D(pool_size=(3, 3),
                      strides=(1, 1),
                      padding="same",
                      name="pool5")(base_network.get_layer('conv5_3').output)

    # fc6/fc7: VGG fully-connected layers recast as (dilated) convolutions.
    fc6 = conv_block_1(x=pool5,
                       filters=1024,
                       padding="same",
                       dilation_rate=(6, 6),
                       name="fc6")
    fc7 = conv_block_2(x=fc6, filters=1024, padding="same", name="fc7")
    # Extra feature layers conv8-conv11, as in SSD300.
    conv8_1 = conv_block_2(x=fc7, filters=256, padding="valid", name="conv8_1")
    conv8_2 = conv_block_1(x=conv8_1,
                           filters=512,
                           padding="same",
                           strides=(2, 2),
                           name="conv8_2")
    conv9_1 = conv_block_2(x=conv8_2,
                           filters=128,
                           padding="valid",
                           name="conv9_1")
    conv9_2 = conv_block_1(x=conv9_1,
                           filters=256,
                           padding="same",
                           strides=(2, 2),
                           name="conv9_2")
    conv10_1 = conv_block_2(x=conv9_2,
                            filters=128,
                            padding="valid",
                            name="conv10_1")
    conv10_2 = conv_block_1(x=conv10_1,
                            filters=256,
                            padding="valid",
                            name="conv10_2")
    conv11_1 = conv_block_2(x=conv10_2,
                            filters=128,
                            padding="valid",
                            name="conv11_1")
    conv11_2 = conv_block_1(x=conv11_1,
                            filters=256,
                            padding="valid",
                            name="conv11_2")

    model = Model(inputs=base_network.input, outputs=conv11_2)

    # construct the prediction layers (conf, loc, & default_boxes)
    # Default-box scales are linearly spaced between min_scale and max_scale,
    # one per feature map layer.
    scales = np.linspace(default_boxes_config["min_scale"],
                         default_boxes_config["max_scale"],
                         len(default_boxes_config["layers"]))
    mbox_conf_layers = []
    mbox_loc_layers = []
    mbox_quad_layers = []
    mbox_default_boxes_layers = []
    for i, layer in enumerate(default_boxes_config["layers"]):
        num_default_boxes = get_number_default_boxes(
            layer["aspect_ratios"], extra_box_for_ar_1=extra_box_for_ar_1)
        x = model.get_layer(layer["name"]).output
        layer_name = layer["name"]

        # conv4_3 has different scales compared to other feature map layers
        if layer_name == "conv4_3":
            layer_name = f"{layer_name}_norm"
            x = L2Normalization(gamma_init=20, name=layer_name)(x)

        # Class-confidence head; (3, 5) kernels suit elongated text regions.
        layer_mbox_conf = Conv2D(filters=num_default_boxes * num_classes,
                                 kernel_size=(3, 5),
                                 padding='same',
                                 kernel_initializer=kernel_initializer,
                                 kernel_regularizer=l2(l2_reg),
                                 name=f"{layer_name}_mbox_conf")(x)
        layer_mbox_conf_reshape = Reshape(
            (-1, num_classes),
            name=f"{layer_name}_mbox_conf_reshape")(layer_mbox_conf)
        # Localization head: 4 box offsets per default box.
        layer_mbox_loc = Conv2D(filters=num_default_boxes * 4,
                                kernel_size=(3, 5),
                                padding='same',
                                kernel_initializer=kernel_initializer,
                                kernel_regularizer=l2(l2_reg),
                                name=f"{layer_name}_mbox_loc")(x)
        layer_mbox_loc_reshape = Reshape(
            (-1, 4), name=f"{layer_name}_mbox_loc_reshape")(layer_mbox_loc)
        # Quad head: 8 values per default box (quadrilateral corners).
        layer_mbox_quad = Conv2D(filters=num_default_boxes * 8,
                                 kernel_size=(3, 5),
                                 padding='same',
                                 kernel_initializer=kernel_initializer,
                                 kernel_regularizer=l2(l2_reg),
                                 name=f"{layer_name}_mbox_quad")(x)
        layer_mbox_quad_reshape = Reshape(
            (-1, 8), name=f"{layer_name}_mbox_quad_reshape")(layer_mbox_quad)
        # Default (anchor) boxes: next_scale falls back to 1 on the last layer.
        layer_default_boxes = DefaultBoxes(
            image_shape=input_shape,
            scale=scales[i],
            next_scale=scales[i + 1]
            if i + 1 <= len(default_boxes_config["layers"]) - 1 else 1,
            aspect_ratios=layer["aspect_ratios"],
            variances=default_boxes_config["variances"],
            extra_box_for_ar_1=extra_box_for_ar_1,
            name=f"{layer_name}_default_boxes")(x)
        # 8 = 4 box coordinates + 4 variances per default box.
        layer_default_boxes_reshape = Reshape(
            (-1, 8),
            name=f"{layer_name}_default_boxes_reshape")(layer_default_boxes)
        mbox_conf_layers.append(layer_mbox_conf_reshape)
        mbox_loc_layers.append(layer_mbox_loc_reshape)
        mbox_quad_layers.append(layer_mbox_quad_reshape)
        mbox_default_boxes_layers.append(layer_default_boxes_reshape)

    # concentenate class confidence predictions from different feature map layers
    mbox_conf = Concatenate(axis=-2, name="mbox_conf")(mbox_conf_layers)
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)
    # concentenate object location predictions from different feature map layers
    mbox_loc = Concatenate(axis=-2, name="mbox_loc")(mbox_loc_layers)
    # concentenate object quad predictions from different feature map layers
    mbox_quad = Concatenate(axis=-2, name="mbox_quad")(mbox_quad_layers)
    # concentenate default boxes from different feature map layers
    mbox_default_boxes = Concatenate(
        axis=-2, name="mbox_default_boxes")(mbox_default_boxes_layers)
    # concatenate confidence score predictions, bounding box predictions, and default boxes
    predictions = Concatenate(axis=-1, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_quad, mbox_default_boxes])

    # Training model emits raw predictions; inference appends TBPP decoding.
    if is_training:
        return Model(inputs=base_network.input, outputs=predictions)

    decoded_predictions = DecodeTBPPPredictions(
        input_size=model_config["input_size"],
        num_predictions=num_predictions,
        name="decoded_predictions")(predictions)
    return Model(inputs=base_network.input, outputs=decoded_predictions)
# Example 3
def SSD_MOBILENET(
    config,
    label_maps,
    num_predictions=10,
    is_training=True,
):
    """ Construct an SSD network that uses MobileNetV1 backbone.

    Args:
        - config: python dict as read from the config file
        - label_maps: A python list containing the classes
        - num_predictions: The number of predictions to produce as final output
        - is_training: whether the model is constructed for training purpose or inference purpose

    Returns:
        - A keras version of SSD300 with MobileNetV1 as backbone network.

    Code References:
        - https://github.com/chuanqi305/MobileNet-SSD
    """
    model_config = config["model"]
    # Square RGB input: (size, size, 3).
    input_shape = (model_config["input_size"], model_config["input_size"], 3)
    num_classes = len(label_maps) + 1  # for background class
    l2_reg = model_config["l2_regularization"]
    kernel_initializer = model_config["kernel_initializer"]
    default_boxes_config = model_config["default_boxes"]
    extra_box_for_ar_1 = model_config["extra_box_for_ar_1"]
    # construct the base network and extra feature layers
    base_network = MobileNet(
        input_shape=input_shape,
        alpha=config["model"]["width_multiplier"],
        depth_multiplier=config["model"]["depth_multiplier"],
        classes=num_classes,
        weights='imagenet',
        include_top=False)
    # NOTE(review): renaming via the private `_name` attribute relies on Keras
    # internals — TODO confirm this still works with the pinned Keras version.
    base_network.get_layer("input_1")._name = "input"
    for layer in base_network.layers:
        # NOTE(review): assigning `_kernel_initializer`/`_kernel_regularizer`
        # after the layer is built may have no effect on existing weights —
        # verify this is intentional.
        base_network.get_layer(layer.name)._kernel_initializer = "he_normal"
        base_network.get_layer(layer.name)._kernel_regularizer = l2(l2_reg)
        layer.trainable = False  # each layer of the base network should not be trainable

    # Feature taps from the MobileNetV1 backbone used as detection sources.
    conv11 = base_network.get_layer("conv_pw_11_relu").output
    conv13 = base_network.get_layer("conv_pw_13_relu").output

    # 1x1 Conv -> BatchNorm -> ReLU channel-reduction block (no bias; BN
    # supplies the affine term).
    def conv_block_1(x, filters, name):
        x = Conv2D(filters=filters,
                   kernel_size=(1, 1),
                   padding="valid",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name=name,
                   use_bias=False)(x)
        x = BatchNormalization(name=f"{name}/bn")(x)
        x = ReLU(name=f"{name}/relu")(x)
        return x

    # 3x3 stride-2 Conv -> BatchNorm -> ReLU downsampling block.
    def conv_block_2(x, filters, name):
        x = Conv2D(filters=filters,
                   kernel_size=(3, 3),
                   padding="same",
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(l2_reg),
                   name=name,
                   use_bias=False,
                   strides=(2, 2))(x)
        x = BatchNormalization(name=f"{name}/bn")(x)
        x = ReLU(name=f"{name}/relu")(x)
        return x

    # Extra feature layers conv14-conv17: each pair is a 1x1 reduction
    # followed by a stride-2 3x3 conv that halves the spatial size.
    conv14_1 = conv_block_1(x=conv13, filters=256, name="conv14_1")
    conv14_2 = conv_block_2(x=conv14_1, filters=512, name="conv14_2")
    conv15_1 = conv_block_1(x=conv14_2, filters=128, name="conv15_1")
    conv15_2 = conv_block_2(x=conv15_1, filters=256, name="conv15_2")
    conv16_1 = conv_block_1(x=conv15_2, filters=128, name="conv16_1")
    conv16_2 = conv_block_2(x=conv16_1, filters=256, name="conv16_2")
    conv17_1 = conv_block_1(x=conv16_2, filters=128, name="conv17_1")
    conv17_2 = conv_block_2(x=conv17_1, filters=256, name="conv17_2")
    model = Model(inputs=base_network.input, outputs=conv17_2)
    # construct the prediction layers (conf, loc, & default_boxes)
    # Default-box scales are linearly spaced between min_scale and max_scale,
    # one per feature map layer.
    scales = np.linspace(default_boxes_config["min_scale"],
                         default_boxes_config["max_scale"],
                         len(default_boxes_config["layers"]))
    mbox_conf_layers = []
    mbox_loc_layers = []
    mbox_default_boxes_layers = []
    for i, layer in enumerate(default_boxes_config["layers"]):
        num_default_boxes = get_number_default_boxes(
            layer["aspect_ratios"], extra_box_for_ar_1=extra_box_for_ar_1)
        x = model.get_layer(layer["name"]).output
        layer_name = layer["name"]

        # Class-confidence head: num_default_boxes * num_classes channels,
        # reshaped to (num_boxes, num_classes).
        layer_mbox_conf = Conv2D(filters=num_default_boxes * num_classes,
                                 kernel_size=(3, 3),
                                 padding='same',
                                 kernel_initializer=kernel_initializer,
                                 kernel_regularizer=l2(l2_reg),
                                 name=f"{layer_name}_mbox_conf")(x)
        layer_mbox_conf_reshape = Reshape(
            (-1, num_classes),
            name=f"{layer_name}_mbox_conf_reshape")(layer_mbox_conf)
        # Localization head: 4 box offsets per default box.
        layer_mbox_loc = Conv2D(filters=num_default_boxes * 4,
                                kernel_size=(3, 3),
                                padding='same',
                                kernel_initializer=kernel_initializer,
                                kernel_regularizer=l2(l2_reg),
                                name=f"{layer_name}_mbox_loc")(x)
        layer_mbox_loc_reshape = Reshape(
            (-1, 4), name=f"{layer_name}_mbox_loc_reshape")(layer_mbox_loc)
        # Default (anchor) boxes: next_scale falls back to 1 on the last layer.
        layer_default_boxes = DefaultBoxes(
            image_shape=input_shape,
            scale=scales[i],
            next_scale=scales[i + 1]
            if i + 1 <= len(default_boxes_config["layers"]) - 1 else 1,
            aspect_ratios=layer["aspect_ratios"],
            variances=default_boxes_config["variances"],
            extra_box_for_ar_1=extra_box_for_ar_1,
            name=f"{layer_name}_default_boxes")(x)
        # 8 = 4 box coordinates + 4 variances per default box.
        layer_default_boxes_reshape = Reshape(
            (-1, 8),
            name=f"{layer_name}_default_boxes_reshape")(layer_default_boxes)
        mbox_conf_layers.append(layer_mbox_conf_reshape)
        mbox_loc_layers.append(layer_mbox_loc_reshape)
        mbox_default_boxes_layers.append(layer_default_boxes_reshape)

    # concentenate class confidence predictions from different feature map layers
    mbox_conf = Concatenate(axis=-2, name="mbox_conf")(mbox_conf_layers)
    mbox_conf_softmax = Activation('softmax',
                                   name='mbox_conf_softmax')(mbox_conf)
    # concentenate object location predictions from different feature map layers
    mbox_loc = Concatenate(axis=-2, name="mbox_loc")(mbox_loc_layers)
    # concentenate default boxes from different feature map layers
    mbox_default_boxes = Concatenate(
        axis=-2, name="mbox_default_boxes")(mbox_default_boxes_layers)
    # concatenate confidence score predictions, bounding box predictions, and default boxes
    predictions = Concatenate(axis=-1, name='predictions')(
        [mbox_conf_softmax, mbox_loc, mbox_default_boxes])

    # Training model emits raw predictions; inference appends SSD decoding.
    if is_training:
        return Model(inputs=base_network.input, outputs=predictions)

    decoded_predictions = DecodeSSDPredictions(
        input_size=model_config["input_size"],
        num_predictions=num_predictions,
        name="decoded_predictions")(predictions)

    return Model(inputs=base_network.input, outputs=decoded_predictions)