def add_backbone_model(self, cnn):
  """Adds the modified ResNet-34 backbone (SSD variant) to `cnn`.

  Builds the convolution stack in place on the stateful `cnn` builder:
  a 7x7/2 stem + 3x3/2 max-pool, then ResNet-34 block groups 1-3.
  Group 3 is modified to keep stride 1 so the 38x38 feature map size is
  preserved for the SSD detection heads; the final ResNet-34 block group
  is omitted entirely.

  Args:
    cnn: convnet builder object; layers are appended to its top layer.
  """
  # --------------------------------------------------------------------------
  # Resnet-34 backbone model -- modified for SSD
  # --------------------------------------------------------------------------

  # Input 300x300, output 150x150
  cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
  cnn.mpool(3, 3, 2, 2, mode='SAME')

  resnet34_layers = [3, 4, 6, 3]
  version = 'v1'

  # ResNet-34 block group 1
  # Input 150x150, output 75x75
  for i in range(resnet34_layers[0]):
    # Last argument forces residual_block to use projection shortcut, even
    # though the numbers of input and output channels are equal.
    # BUG FIX: the `i == 0` argument was missing here, contradicting the
    # comment above and the equivalent backbone code in add_inference.
    resnet_model.residual_block(cnn, 64, 1, version, i == 0)

  # ResNet-34 block group 2
  # Input 75x75, output 38x38
  for i in range(resnet34_layers[1]):
    stride = 2 if i == 0 else 1
    resnet_model.residual_block(cnn, 128, stride, version, i == 0)

  # ResNet-34 block group 3
  # This block group is modified: first layer uses stride=1 so that the image
  # size does not change in group of layers
  # Input 38x38, output 38x38
  for i in range(resnet34_layers[2]):
    # The following line is intentionally commented out to differentiate from
    # the original ResNet-34 model
    # stride = 2 if i == 0 else 1
    # NOTE: `stride` deliberately carries over from the group-2 loop above,
    # where its final value is 1, so every layer in this group uses stride 1.
    resnet_model.residual_block(cnn, 256, stride, version, i == 0)
def add_inference(self, cnn):
  """Builds the full SSD inference graph on `cnn` and returns the logits.

  Constructs the modified ResNet-34 backbone, creates (once) a saver for
  restoring backbone variables from a checkpoint, appends the SSD extra
  feature layers, and attaches per-feature-map localization/confidence
  convolution heads. The packed output has shape
  [batch_size, 4 + label_num, NUM_SSD_BOXES].

  Args:
    cnn: convnet builder object; layers are appended to its top layer.

  Returns:
    The logits tensor (also assigned to cnn.top_layer).
  """
  # TODO(haoyuzhang): check batch norm params for resnet34 in reference model?
  cnn.use_batch_norm = True
  cnn.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True}

  # --------------------------------------------------------------------------
  # Resnet-34 backbone model -- modified for SSD
  # --------------------------------------------------------------------------

  # Input 300x300, output 150x150
  cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
  cnn.mpool(3, 3, 2, 2, mode='SAME')

  resnet34_layers = [3, 4, 6, 3]
  version = 'v1'

  # ResNet-34 block group 1
  # Input 150x150, output 75x75
  for i in range(resnet34_layers[0]):
    # Last argument forces residual_block to use projection shortcut, even
    # though the numbers of input and output channels are equal
    resnet_model.residual_block(cnn, 64, 1, version, i == 0)

  # ResNet-34 block group 2
  # Input 75x75, output 38x38
  for i in range(resnet34_layers[1]):
    stride = 2 if i == 0 else 1
    resnet_model.residual_block(cnn, 128, stride, version, i == 0)

  # ResNet-34 block group 3
  # This block group is modified: first layer uses stride=1 so that the image
  # size does not change in group of layers
  # Input 38x38, output 38x38
  for i in range(resnet34_layers[2]):
    # The following line is intentionally commented out to differentiate from
    # the original ResNet-34 model
    # stride = 2 if i == 0 else 1
    # NOTE(review): `stride` carries over from the group-2 loop, where its
    # final value is 1, so all layers in this group use stride 1.
    resnet_model.residual_block(cnn, 256, stride, version, i == 0)

  # ResNet-34 block group 4: removed final block group
  # The following 3 lines are intentionally commented out to differentiate from
  # the original ResNet-34 model
  # for i in range(resnet34_layers[3]):
  #   stride = 2 if i == 0 else 1
  #   resnet_model.residual_block(cnn, 512, stride, version, i == 0)

  # Create saver with mapping from variable names in checkpoint of backbone
  # model to variables in SSD model. Created only once so repeated calls to
  # add_inference reuse the same saver.
  if not self.backbone_saver:
    backbone_var_list = self._collect_backbone_vars()
    self.backbone_saver = tf.train.Saver(backbone_var_list)

  # --------------------------------------------------------------------------
  # SSD additional layers
  # --------------------------------------------------------------------------

  def add_ssd_layer(cnn, depth, k_size, stride, mode):
    # Bias-free conv with Xavier-initialized kernels; returns the new layer
    # so feature-map activations can be collected for the detection heads.
    return cnn.conv(
        depth,
        k_size,
        k_size,
        stride,
        stride,
        mode=mode,
        bias=None,
        kernel_initializer=tf.contrib.layers.xavier_initializer())

  # Activations for feature maps of different layers; starts with the
  # backbone's 38x38 output.
  self.activations = [cnn.top_layer]

  # Conv7_1, Conv7_2
  # Input 38x38, output 19x19
  add_ssd_layer(cnn, 256, 1, 1, 'valid')
  self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))

  # Conv8_1, Conv8_2
  # Input 19x19, output 10x10
  add_ssd_layer(cnn, 256, 1, 1, 'valid')
  self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))

  # Conv9_1, Conv9_2
  # Input 10x10, output 5x5
  add_ssd_layer(cnn, 128, 1, 1, 'valid')
  self.activations.append(add_ssd_layer(cnn, 256, 3, 2, 'same'))

  # Conv10_1, Conv10_2
  # Input 5x5, output 3x3
  add_ssd_layer(cnn, 128, 1, 1, 'valid')
  self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))

  # Conv11_1, Conv11_2
  # Input 3x3, output 1x1
  add_ssd_layer(cnn, 128, 1, 1, 'valid')
  self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))

  # One localization head (nd * 4 channels) and one confidence head
  # (nd * label_num channels) per collected feature map; each is a 3x3
  # conv with no activation and no bias, reshaped so boxes lie along the
  # last dimension.
  self.loc = []
  self.conf = []

  for nd, ac, oc in zip(self.num_dboxes, self.activations, self.out_chan):
    self.loc.append(
        tf.reshape(
            cnn.conv(
                nd * 4,
                3,
                3,
                1,
                1,
                input_layer=ac,
                num_channels_in=oc,
                activation=None,
                bias=None,
                kernel_initializer=tf.contrib.layers.xavier_initializer()),
            [ac.get_shape()[0], 4, -1]))
    self.conf.append(
        tf.reshape(
            cnn.conv(
                nd * self.label_num,
                3,
                3,
                1,
                1,
                input_layer=ac,
                num_channels_in=oc,
                activation=None,
                bias=None,
                kernel_initializer=tf.contrib.layers.xavier_initializer()),
            [ac.get_shape()[0], self.label_num, -1]))

  # Shape of locs: [batch_size, 4, NUM_SSD_BOXES]
  # Shape of confs: [batch_size, label_num, NUM_SSD_BOXES]
  locs, confs = tf.concat(self.loc, 2), tf.concat(self.conf, 2)

  # Pack location and confidence outputs into a single output layer
  # Shape of logits: [batch_size, 4+label_num, NUM_SSD_BOXES]
  logits = tf.concat([locs, confs], 1)

  cnn.top_layer = logits
  cnn.top_size = 4 + self.label_num

  return cnn.top_layer