Example #1
    def add_backbone_model(self, cnn):
        # --------------------------------------------------------------------------
        # Resnet-34 backbone model -- modified for SSD
        # --------------------------------------------------------------------------

        # Input 300x300, output 150x150
        cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
        cnn.mpool(3, 3, 2, 2, mode='SAME')

        resnet34_layers = [3, 4, 6, 3]
        version = 'v1'

        # ResNet-34 block group 1
        # Input 150x150, output 75x75
        for i in range(resnet34_layers[0]):
            # Last argument forces residual_block to use projection shortcut, even
            # though the numbers of input and output channels are equal
            resnet_model.residual_block(cnn, 64, 1, version, i == 0)

        # ResNet-34 block group 2
        # Input 75x75, output 38x38
        for i in range(resnet34_layers[1]):
            stride = 2 if i == 0 else 1
            resnet_model.residual_block(cnn, 128, stride, version, i == 0)

        # ResNet-34 block group 3
        # This block group is modified: the first layer uses stride=1 so that the
        # image size does not change within this group of layers
        # Input 38x38, output 38x38
        for i in range(resnet34_layers[2]):
            # The following line is intentionally commented out to differentiate from
            # the original ResNet-34 model
            # stride = 2 if i == 0 else 1
            resnet_model.residual_block(cnn, 256, stride, version, i == 0)
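
Both examples rely on resnet_model.residual_block(cnn, depth, stride, version, use_projection). The sketch below is a rough illustration, not the actual tf_cnn_benchmarks implementation, of what such a 'v1' basic block does on the ConvNetBuilder-style cnn.conv / top_layer API used above; the name residual_block_sketch and its exact padding and argument handling are assumptions.

    import tensorflow as tf

    def residual_block_sketch(cnn, depth, stride, version, projection_shortcut=False):
        # Save the block input before any conv call advances cnn.top_layer.
        input_layer = cnn.top_layer
        in_size = cnn.top_size
        if projection_shortcut or stride != 1:
            # 1x1 projection so the shortcut matches the new depth / spatial size.
            shortcut = cnn.conv(depth, 1, 1, stride, stride, activation=None,
                                use_batch_norm=True, bias=None,
                                input_layer=input_layer, num_channels_in=in_size)
        else:
            shortcut = input_layer
        # Two 3x3 convs; only the first may be strided (post-activation 'v1' form;
        # `version` is accepted only for signature parity in this sketch).
        cnn.conv(depth, 3, 3, stride, stride, use_batch_norm=True, bias=None,
                 input_layer=input_layer, num_channels_in=in_size)
        residual = cnn.conv(depth, 3, 3, 1, 1, activation=None,
                            use_batch_norm=True, bias=None)
        cnn.top_layer = tf.nn.relu(shortcut + residual)
        cnn.top_size = depth
        return cnn.top_layer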
Example #2
    def add_inference(self, cnn):
        # TODO(haoyuzhang): check batch norm params for resnet34 in reference model?
        cnn.use_batch_norm = True
        cnn.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True}

        # --------------------------------------------------------------------------
        # Resnet-34 backbone model -- modified for SSD
        # --------------------------------------------------------------------------

        # Input 300x300, output 150x150
        cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
        cnn.mpool(3, 3, 2, 2, mode='SAME')

        resnet34_layers = [3, 4, 6, 3]
        version = 'v1'

        # ResNet-34 block group 1
        # Input 150x150, output 75x75
        for i in range(resnet34_layers[0]):
            # Last argument forces residual_block to use projection shortcut, even
            # though the numbers of input and output channels are equal
            resnet_model.residual_block(cnn, 64, 1, version, i == 0)

        # ResNet-34 block group 2
        # Input 75x75, output 38x38
        for i in range(resnet34_layers[1]):
            stride = 2 if i == 0 else 1
            resnet_model.residual_block(cnn, 128, stride, version, i == 0)

        # ResNet-34 block group 3
        # This block group is modified: the first layer uses stride=1 so that the
        # image size does not change within this group of layers
        # Input 38x38, output 38x38
        for i in range(resnet34_layers[2]):
            # The following line is intentionally commented out to differentiate from
            # the original ResNet-34 model
            # stride = 2 if i == 0 else 1
            resnet_model.residual_block(cnn, 256, stride, version, i == 0)

        # ResNet-34 block group 4: removed final block group
        # The following 3 lines are intentionally commented out to differentiate from
        # the original ResNet-34 model
        # for i in range(resnet34_layers[3]):
        #   stride = 2 if i == 0 else 1
        #   resnet_model.residual_block(cnn, 512, stride, version, i == 0)

        # Create saver with mapping from variable names in checkpoint of backbone
        # model to variables in SSD model
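        # (Illustrative sketch, not the real helper.) _collect_backbone_vars
        # presumably builds a {checkpoint_name: variable} map by stripping this
        # model's variable-scope prefix, so a plain ResNet-34 checkpoint can be
        # restored into the SSD graph, roughly:
        #   var_map = {}
        #   for var in tf.global_variables():
        #     if var.op.name.startswith(backbone_scope):  # prefix name is assumed
        #       var_map[var.op.name[len(backbone_scope):]] = var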
        if not self.backbone_saver:
            backbone_var_list = self._collect_backbone_vars()
            self.backbone_saver = tf.train.Saver(backbone_var_list)

        # --------------------------------------------------------------------------
        # SSD additional layers
        # --------------------------------------------------------------------------

        def add_ssd_layer(cnn, depth, k_size, stride, mode):
            return cnn.conv(depth, k_size, k_size, stride, stride,
                            mode=mode,
                            bias=None,
                            kernel_initializer=tf.contrib.layers.xavier_initializer())

        # Activations for feature maps of different layers
        self.activations = [cnn.top_layer]
        # Conv7_1, Conv7_2
        # Input 38x38, output 19x19
        add_ssd_layer(cnn, 256, 1, 1, 'valid')
        self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))

        # Conv8_1, Conv8_2
        # Input 19x19, output 10x10
        add_ssd_layer(cnn, 256, 1, 1, 'valid')
        self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))

        # Conv9_1, Conv9_2
        # Input 10x10, output 5x5
        add_ssd_layer(cnn, 128, 1, 1, 'valid')
        self.activations.append(add_ssd_layer(cnn, 256, 3, 2, 'same'))

        # Conv10_1, Conv10_2
        # Input 5x5, output 3x3
        add_ssd_layer(cnn, 128, 1, 1, 'valid')
        self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))

        # Conv11_1, Conv11_2
        # Input 3x3, output 1x1
        add_ssd_layer(cnn, 128, 1, 1, 'valid')
        self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))
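        # self.activations now holds six feature maps -- the 38x38 backbone output
        # plus the 19x19, 10x10, 5x5, 3x3 and 1x1 outputs appended above -- one per
        # SSD prediction head.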

        self.loc = []
        self.conf = []

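        # self.num_dboxes and self.out_chan are defined elsewhere in the class; for
        # a standard SSD300 head the values would plausibly be (assumed here, not
        # taken from this snippet):
        #   num_dboxes = [4, 6, 6, 6, 4, 4]               # default boxes per location
        #   out_chan   = [256, 512, 512, 256, 256, 256]   # channels of each feature map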
        for nd, ac, oc in zip(self.num_dboxes, self.activations,
                              self.out_chan):
            self.loc.append(
                tf.reshape(
                    cnn.conv(nd * 4, 3, 3, 1, 1,
                             input_layer=ac,
                             num_channels_in=oc,
                             activation=None,
                             bias=None,
                             kernel_initializer=tf.contrib.layers.xavier_initializer()),
                    [ac.get_shape()[0], 4, -1]))
            self.conf.append(
                tf.reshape(
                    cnn.conv(nd * self.label_num, 3, 3, 1, 1,
                             input_layer=ac,
                             num_channels_in=oc,
                             activation=None,
                             bias=None,
                             kernel_initializer=tf.contrib.layers.xavier_initializer()),
                    [ac.get_shape()[0], self.label_num, -1]))

        # Shape of locs: [batch_size, 4, NUM_SSD_BOXES]
        # Shape of confs: [batch_size, label_num, NUM_SSD_BOXES]
        locs, confs = tf.concat(self.loc, 2), tf.concat(self.conf, 2)

        # Pack location and confidence outputs into a single output layer
        # Shape of logits: [batch_size, 4+label_num, NUM_SSD_BOXES]
        logits = tf.concat([locs, confs], 1)

        cnn.top_layer = logits
        cnn.top_size = 4 + self.label_num

        return cnn.top_layer
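
Downstream code would typically unpack this combined layer again; a minimal hedged sketch of such a consumer, assuming the shapes noted in the comments above and a known label_num:

    # logits: [batch_size, 4 + label_num, NUM_SSD_BOXES], as produced by add_inference
    locs, confs = tf.split(logits, [4, label_num], axis=1)
    # locs:  [batch_size, 4,         NUM_SSD_BOXES] -> box offsets per default box
    # confs: [batch_size, label_num, NUM_SSD_BOXES] -> per-class scores per box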