import tensorflow as tf

slim = tf.contrib.slim

# Project-local layer helpers used below. The exact import paths may differ in
# the repository layout (e.g. `from nets import custom_layers`).
import custom_layers
import btree_layers


def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16'):
    """Oxford Net VGG 16-Layers version D Example.

    Note: All the fully_connected layers have been transformed to conv2d
    layers. To use in classification mode, resize input to 224x224.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes.
      is_training: whether or not the model is being trained.
      dropout_keep_prob: the probability that activations are kept in the
        dropout layers during training.
      spatial_squeeze: whether or not to squeeze the spatial dimensions of the
        outputs. Useful to remove unnecessary dimensions for classification.
      scope: Optional scope for the variables.
    Returns:
      the last op containing the log predictions and end_points dict.
    """
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                            outputs_collections=end_points_collection):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [2, 2], scope='pool5')
            # Use conv2d instead of fully_connected layers.
            net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                               scope='dropout6')
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                               scope='dropout7')
            # The last layer always outputs 1000 ImageNet logits; they are
            # padded to `num_classes` below.
            net = slim.conv2d(net, 1000, [1, 1],
                              activation_fn=None,
                              normalizer_fn=None,
                              scope='fc8')
            # Convert end_points_collection into an end_points dict.
            end_points = slim.utils.convert_collection_to_dict(
                end_points_collection)
            if spatial_squeeze:
                # net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                net = custom_layers.spatial_squeeze(net)
            net = custom_layers.pad_logits(net, pad=(num_classes - 1000, 0))
            end_points[sc.name + '/fc8'] = net
            return net, end_points
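

# Example usage (a minimal sketch, not part of the original file): building the
# VGG-16 graph for inference on a batch of 224x224 RGB images. Any weight
# regularizer/initializer defaults are assumed to be provided by an outer
# slim.arg_scope set up by the caller.
def _vgg_16_usage_example(num_classes=1000):
    images = tf.placeholder(tf.float32, [None, 224, 224, 3], name='images')
    logits, end_points = vgg_16(images,
                                num_classes=num_classes,
                                is_training=False,
                                dropout_keep_prob=1.0)
    # `logits` has shape [batch_size, num_classes] once spatially squeezed.
    predictions = tf.nn.softmax(logits, name='predictions')
    return predictions, end_points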


def mobilenets(inputs,
               num_classes=1000,
               width_multiplier=1.0,
               is_training=True,
               dropout_keep_prob=0.5,
               scope='MobileNets'):
    """MobileNets implementation.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes.
      width_multiplier: multiplier applied to the number of channels in every
        depthwise-separable block.
      is_training: whether or not the model is being trained.
      dropout_keep_prob: the probability that activations are kept in the
        dropout layers during training.
      scope: Optional scope for the variables.
    Returns:
      the last op containing the log predictions and end_points dict.
    """
    # MobileNets kernel size and padding (for layers with stride > 1).
    kernel_size = [3, 3]
    padding = [(kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2]

    def mobilenet_block(net, num_out_channels, stride=[1, 1], scope=None):
        """Basic MobileNet block combining:
         - depthwise conv + BN + relu;
         - 1x1 conv + BN + relu.
        """
        with tf.variable_scope(scope, 'block', [net]) as sc:
            num_out_channels = int(num_out_channels * width_multiplier)
            if stride[0] == 1 and stride[1] == 1:
                # Depthwise convolution with stride=1.
                net = custom_layers.depthwise_convolution2d(
                    net, kernel_size, depth_multiplier=1, stride=stride,
                    scope='conv_dw')
            else:
                # Mimic CAFFE padding if stride > 1.
                net = custom_layers.pad2d(net, pad=padding)
                net = custom_layers.depthwise_convolution2d(
                    net, kernel_size, padding='VALID', depth_multiplier=1,
                    stride=stride, scope='conv_dw')
            # Pointwise convolution.
            net = slim.conv2d(net, num_out_channels, [1, 1], scope='conv_pw')
            return net

    with tf.variable_scope(scope, 'MobileNets', [inputs]) as sc:
        end_points = {}
        # First full convolution...
        net = custom_layers.pad2d(inputs, pad=padding)
        net = slim.conv2d(net, 32, kernel_size, stride=[2, 2], padding='VALID',
                          scope='conv1')
        # net = slim.conv2d(inputs, 32, kernel_size, stride=[2, 2],
        #                   scope='conv1')
        # Then, MobileNet blocks!
        net = mobilenet_block(net, 64, scope='block2')
        net = mobilenet_block(net, 128, stride=[2, 2], scope='block3')
        net = mobilenet_block(net, 128, scope='block4')
        net = mobilenet_block(net, 256, stride=[2, 2], scope='block5')
        net = mobilenet_block(net, 256, scope='block6')
        net = mobilenet_block(net, 512, stride=[2, 2], scope='block7')
        # Intermediate blocks...
        for i in range(5):
            net = mobilenet_block(net, 512, scope='block%i' % (i + 8))
        # Final blocks.
        net = mobilenet_block(net, 1024, stride=[2, 2], scope='block13')
        net = mobilenet_block(net, 1024, scope='block14')
        # Spatial pooling + fully connected layer.
        net = custom_layers.spatial_mean(net, keep_dims=True,
                                         scope='spatial_mean14')
        net = slim.conv2d(net, 1000, [1, 1],
                          activation_fn=None,
                          normalizer_fn=None,
                          normalizer_params=None,
                          biases_initializer=tf.zeros_initializer(),
                          scope='conv_fc15')
        net = custom_layers.spatial_squeeze(net)
        # net = slim.fully_connected(net, 1000, scope='fc15')
        # Logits padding...
        net = custom_layers.pad_logits(net, pad=(num_classes - 1000, 0))
        return net, end_points
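

# Example usage (a minimal sketch, not part of the original file): a thinner
# MobileNet obtained with width_multiplier=0.5. Since the multiplier scales the
# output channels of every depthwise-separable block, the cost of those blocks
# shrinks roughly with the square of the multiplier.
def _mobilenets_usage_example(num_classes=1000):
    images = tf.placeholder(tf.float32, [None, 224, 224, 3], name='images')
    logits, _ = mobilenets(images,
                           num_classes=num_classes,
                           width_multiplier=0.5,
                           is_training=False)
    return tf.nn.softmax(logits, name='predictions')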


def mobilenets_btree(inputs,
                     num_classes=1000,
                     kernel_size=[3, 3],
                     width_multiplier=1.0,
                     dropouts=[0.5],
                     pad_logits=True,
                     is_training=True,
                     reuse=None,
                     scope='MobileNets'):
    """MobileNets implementation with B-tree (split pointwise) blocks.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes.
      kernel_size: kernel size of the depthwise convolutions.
      width_multiplier: multiplier applied to the number of channels in every
        depthwise-separable block.
      dropouts: list of keep probabilities for the dropout layers during
        training.
      pad_logits: whether to pad the logits to `num_classes` outputs.
      is_training: whether or not the model is being trained.
      reuse: whether or not the network and its variables should be reused.
      scope: Optional scope for the variables.
    Returns:
      the last op containing the log predictions and end_points dict.
    """
    # MobileNets kernel size and padding (for layers with stride > 1).
    # kernel_size = [3, 3]
    padding = [(kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2]

    def mobilenet_block(net, num_out_channels, stride=[1, 1], scope=None):
        """Basic MobileNet block combining:
         - depthwise conv + BN + relu;
         - 1x1 conv + BN + relu.
        """
        with tf.variable_scope(scope, 'block', [net]) as sc:
            num_out_channels = int(num_out_channels * width_multiplier)
            if stride[0] == 1 and stride[1] == 1:
                # Depthwise convolution with stride=1.
                net = custom_layers.depthwise_convolution2d(
                    net, kernel_size, depth_multiplier=1, stride=stride,
                    scope='conv_dw')
            else:
                # Mimic CAFFE padding if stride > 1 => usually better accuracy.
                net = custom_layers.pad2d(net, pad=padding)
                net = custom_layers.depthwise_convolution2d(
                    net, kernel_size, padding='VALID', depth_multiplier=1,
                    stride=stride, scope='conv_dw')
            # Pointwise convolution.
            net = slim.conv2d(net, num_out_channels, [1, 1], scope='conv_pw')
            return net

    def mobilenet_block_btree_v1(net, num_out_channels, stride=[1, 1],
                                 split=2, scope=None):
        """Basic MobileNet block combining:
         - depthwise conv + BN + relu;
         - split 1x1 conv + BN + relu.
        """
        with tf.variable_scope(scope, 'block', [net]) as sc:
            num_out_channels = int(num_out_channels * width_multiplier)
            # Depthwise convolution with stride=1.
            net = custom_layers.depthwise_convolution2d(
                net, kernel_size, depth_multiplier=1, stride=stride,
                scope='conv_dw')
            # Split-pointwise convolution.
            net = btree_layers.conv2d_1x1_split(
                net, num_out_channels, split=split, scope='conv_pw_split')
            return net

    def mobilenet_block_btree_v2(net, num_out_channels, stride=[1, 1],
                                 split=2, scope=None):
        """Combination of ResNets block and B-tree."""
        with tf.variable_scope(scope, 'block', [net]) as sc:
            # Start with Batch Norm.
            net = custom_layers.batch_norm(net)
            # Depthwise convolution with stride=1.
            net = custom_layers.depthwise_convolution2d(
                net, kernel_size, depth_multiplier=1, stride=stride,
                activation_fn=None, scope='conv_dw')
            # Split-pointwise convolution.
            num_out_channels = int(num_out_channels * width_multiplier)
            net = btree_layers.conv2d_1x1_split(
                net, num_out_channels, split=split, activation_fn=None,
                normalizer_fn=None, scope='conv_pw_split')
            return net

    with tf.variable_scope(scope, 'MobileNets', [inputs], reuse=reuse) as sc:
        end_points = {}
        # First full convolution...
        net = custom_layers.pad2d(inputs, pad=padding)
        net = slim.conv2d(net, 32, kernel_size, stride=[2, 2], padding='VALID',
                          scope='conv1')
        # net = slim.conv2d(inputs, 32, kernel_size, stride=[2, 2],
        #                   padding='SAME', scope='conv1')
        # Then, MobileNet blocks!
        net = mobilenet_block(net, 64, scope='block2')
        net = mobilenet_block(net, 128, stride=[2, 2], scope='block3')
        net = mobilenet_block(net, 128, scope='block4')
        net = mobilenet_block(net, 256, stride=[2, 2], scope='block5')
        net = mobilenet_block(net, 256, scope='block6')
        net = mobilenet_block(net, 512, stride=[2, 2], scope='block7')
        # Intermediate blocks...
        for i in range(8, 16):
            with tf.variable_scope(scope, 'resblock_%i' % i, [net]) as sc:
                # Residual block...
                res = net
                net = mobilenet_block_btree_v2(net, 512, split=4,
                                               scope='block%i_a' % i)
                net = btree_layers.translate_channels(
                    net, delta=64, scope='ch_translate_%i_a' % i)
                net = mobilenet_block_btree_v2(net, 512, split=4,
                                               scope='block%i_b' % i)
                net = btree_layers.translate_channels(
                    net, delta=-64, scope='ch_translate_%i_b' % i)
                net = mobilenet_block_btree_v2(net, 512, split=4,
                                               scope='block%i_c' % i)
                net = tf.add(res, net, name='residual_sum_%i' % i)
                net = custom_layers.batch_norm(net)
        # Final blocks.
        net = mobilenet_block(net, 1024, stride=[2, 2], scope='block13')
        net = mobilenet_block(net, 1024, scope='block14')
        # Spatial pooling + fully connected layer.
        net = custom_layers.spatial_mean(net, keep_dims=True,
                                         scope='spatial_mean14')
        net = slim.conv2d(net, num_classes, [1, 1],
                          activation_fn=None,
                          normalizer_fn=None,
                          normalizer_params=None,
                          biases_initializer=tf.zeros_initializer(),
                          scope='conv_fc15')
        net = custom_layers.spatial_squeeze(net)
        # Logits padding: get everyone to the same number of classes.
        if pad_logits:
            net = custom_layers.pad_logits(net, pad=(num_classes - 1000, 0))
        return net, end_points
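

# Example usage (a minimal sketch, not part of the original file): building the
# B-tree variant twice with shared weights via the `reuse` argument, e.g. one
# graph for training and one for evaluation.
def _mobilenets_btree_usage_example(train_images, eval_images, num_classes=1000):
    train_logits, _ = mobilenets_btree(train_images,
                                       num_classes=num_classes,
                                       is_training=True,
                                       scope='MobileNets')
    eval_logits, _ = mobilenets_btree(eval_images,
                                      num_classes=num_classes,
                                      is_training=False,
                                      reuse=True,
                                      scope='MobileNets')
    return train_logits, eval_logits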