def mobilenet_v2_net(inputs, is_training=False):
    """Build the MobileNetV2 backbone named by ``config.model_name``.

    The conv definition is looked up as the module-level name
    ``"V2_DEF_" + config.model_name`` and passed to
    ``mobilenet_v2.mobilenet_base`` under ``mobilenet_v2.training_scope()``.
    A fixed list of layer numbers is then chosen for that definition and
    the matching ``"layer_<n>"`` endpoints are collected.

    Args:
        inputs: Input passed straight through to ``mobilenet_base``.
        is_training: Forwarded to ``mobilenet_base``.

    Returns:
        A tuple ``(logits, end_point_map)``. ``logits`` is the first value
        returned by ``mobilenet_base``; ``end_point_map`` is the list of
        selected endpoints in reverse layer order (highest layer number
        first).

    Raises:
        KeyError: If no module-level ``V2_DEF_<model_name>`` exists, or a
            selected ``"layer_<n>"`` endpoint is missing.
        ValueError: If the conv definition matches none of the known
            variants, so no layer list can be chosen.
    """
    conv_def = globals()["V2_DEF_" + config.model_name]
    with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()):
        logits, end_points = mobilenet_v2.mobilenet_base(
            inputs,
            num_classes=0,
            conv_defs=conv_def,
            is_training=is_training)

    # Layer numbers whose endpoints are exported, per network variant.
    if conv_def == V2_DEF_tiny:
        layers = [5, 8, 11, 12]
    elif conv_def == V2_DEF_small:
        layers = [6, 10, 14, 16]
    elif conv_def == V2_DEF_medium:
        layers = [5, 8, 11, 13, 14]
    elif conv_def == V2_DEF_large:
        layers = [6, 10, 14, 17, 19]
    elif conv_def == V2_DEF_very_large:
        layers = [5, 8, 11, 13, 15, 16]
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on `layers`.
        raise ValueError(
            "No endpoint layers are defined for model_name {!r}".format(
                config.model_name))

    # When the first configured stride is 2, layer 2 is exported as well.
    if config.strides[0] == 2:
        layers.insert(0, 2)

    end_point_map = [
        end_points["layer_{}".format(layer_no)] for layer_no in layers
    ]
    end_point_map.reverse()
    return logits, end_point_map
def testMobilenetBase(self):
    """mobilenet_base should hand back the pre-pooling feature map."""
    tf.reset_default_graph()
    with slim.arg_scope((mobilenet.depth_multiplier,), min_depth=32):
        images = tf.placeholder(tf.float32, (10, 224, 224, 16))
        features, _ = mobilenet_v2.mobilenet_base(
            images,
            conv_defs=mobilenet_v2.V2_DEF,
            depth_multiplier=0.1)
    # The output is still a 7x7 spatial map, i.e. no global pooling was applied.
    actual_shape = features.get_shape().as_list()
    self.assertEqual(actual_shape, [10, 7, 7, 128])
def _mobilenet_v2(net,
                  depth_multiplier,
                  output_stride,
                  divisible_by=None,
                  reuse=None,
                  scope=None,
                  final_endpoint=None):
    """Run the MobileNetV2 base network inside a reusable variable scope.

    This is a thin wrapper that exists so callers can pass ``reuse``.

    Args:
        net: Input tensor of shape [batch_size, height, width, channels].
        depth_multiplier: Float channel multiplier applied to every
            convolution; expected to be greater than zero. Values in (0, 1)
            shrink the model.
        output_stride: Requested ratio of input to output spatial
            resolution, forwarded to ``mobilenet_base``. Documented values
            are 8, 16 and 32.
        divisible_by: Integer that every layer's channel count must be a
            multiple of. ``None`` selects 8 when ``depth_multiplier`` is
            exactly 1.0 and 1 otherwise.
        reuse: Whether to reuse the model variables.
        scope: Optional variable scope; defaults to 'MobilenetV2'.
        final_endpoint: Endpoint at which to stop building. A falsy value
            falls back to ``_MOBILENET_V2_FINAL_ENDPOINT``.

    Returns:
        Whatever ``mobilenet_v2.mobilenet_base`` returns for these settings.
    """
    is_full_width = depth_multiplier == 1.0
    if divisible_by is None:
        divisible_by = 8 if is_full_width else 1
    # min_depth follows the same rule as the divisible_by default.
    min_depth = 8 if is_full_width else 1

    with tf.variable_scope(scope, 'MobilenetV2', [net], reuse=reuse) as scope:
        stop_at = final_endpoint or _MOBILENET_V2_FINAL_ENDPOINT
        return mobilenet_v2.mobilenet_base(
            net,
            conv_defs=mobilenet_v2.V2_DEF,
            depth_multiplier=depth_multiplier,
            min_depth=min_depth,
            divisible_by=divisible_by,
            final_endpoint=stop_at,
            output_stride=output_stride,
            scope=scope)
def testMultiplier(self):
    """A per-op multiplier_func overrides the default depth scaling."""
    make_op = mobilenet.op
    custom_def = copy.deepcopy(mobilenet_v2.V2_DEF)

    def inverse_multiplier(output_params, multiplier):
        # Divide instead of multiply, so a multiplier of 0.1 widens the layer.
        output_params['num_outputs'] = output_params['num_outputs'] / multiplier

    custom_def['spec'][0] = make_op(
        slim.conv2d,
        kernel_size=(3, 3),
        multiplier_func=inverse_multiplier,
        num_outputs=16)

    images = tf.placeholder(tf.float32, (10, 224, 224, 16))
    mobilenet_v2.mobilenet_base(
        images, conv_defs=custom_def, depth_multiplier=0.1)

    conv_depths = [
        conv.outputs[0].get_shape().as_list()[-1]
        for conv in find_ops('Conv2D')
    ]
    # Expect the first layer to be 160 (16 / 0.1), and the other layers
    # to be max(original size * 0.1, 8).
    self.assertEqual([160, 8, 48, 8, 48], conv_depths[:5])
def testWithOutputStride8(self):
    """With output_stride=8 a 224x224 input yields a 28x28 feature map."""
    # Build the model before drawing the random input, as the original did.
    model = mobilenet_v2.mobilenet_base(
        conv_defs=mobilenet_v2.V2_DEF, output_stride=8)
    images = torch.randn(10, 16, 224, 224)
    out = model(images)
    spatial_dims = list(out.shape)[2:4]
    self.assertEqual(spatial_dims, [28, 28])
def testWithOutputStride16AndExplicitPadding(self):
    """With output_stride=16 and explicit padding the output is 14x14."""
    # Build the model before drawing the random input, as the original did.
    model = mobilenet_v2.mobilenet_base(
        output_stride=16,
        use_explicit_padding=True,
        conv_defs=mobilenet_v2.V2_DEF,
        multiplier=0.1)
    images = torch.randn(10, 16, 224, 224)
    out = model(images)
    spatial_dims = list(out.shape)[2:4]
    self.assertEqual(spatial_dims, [14, 14])
def testMobilenetBase(self):
    """mobilenet_base should hand back the pre-pooling feature map."""
    # Build the model before drawing the random input, as the original did.
    model = mobilenet_v2.mobilenet_base(
        min_depth=32, conv_defs=mobilenet_v2.V2_DEF, multiplier=0.1)
    images = torch.randn(10, 16, 224, 224)
    out = model(images)
    # Still a 7x7 spatial map in channels-first layout, so no pooling was applied.
    actual_shape = list(out.shape)
    self.assertEqual(actual_shape, [10, 128, 7, 7])
def _build_graph(self):
    """Assemble the detection graph on top of the MobileNet backbone.

    Runs ``mobilenet_base`` on ``self.images``, derives five pyramid
    feature maps (p3..p7) from the 'Stage2', 'Stage3' and 'Stage4'
    endpoints, and applies ``self._detect_head`` to each of them.

    When ``self.mode`` is 'train' or 'val' it builds ``self.loss`` and
    ``self.train_op``. Otherwise it decodes boxes for the first image of
    the batch, runs per-class non-max suppression and stores
    ``[scores, bbox, class_id]`` in ``self.detection_pred``.

    NOTE(review): the block nesting below (what sits inside each
    ``tf.variable_scope``) was reconstructed from whitespace-collapsed
    source -- confirm against the original file, because scope nesting
    affects op and optimizer-slot names.
    """
    _, endpoints = mobilenet_base(self.images, num_classes=self.num_classes)
    # Stop gradient doesn't work for unclear reasons. I confirmed that the Mobilenet
    # weights still changed.
    #endpoints['Stage2'] = tf.stop_gradient(endpoints['Stage2'])
    #endpoints['Stage3'] = tf.stop_gradient(endpoints['Stage3'])
    #endpoints['Stage4'] = tf.stop_gradient(endpoints['Stage4'])
    with tf.variable_scope('fcos'):
        with tf.variable_scope('pyramid'):
            # Project each backbone stage to 256 channels with a 1x1 conv, stride 1.
            c3 = self._bn_activation_conv(endpoints['Stage2'], 256, 1, 1)
            c4 = self._bn_activation_conv(endpoints['Stage3'], 256, 1, 1)
            c5 = self._bn_activation_conv(endpoints['Stage4'], 256, 1, 1)
            #_get_pyramid(feat, feature_size, top_feat=None)
            # Top-down pathway: each level receives the result from the level above it.
            p5 = self._get_pyramid(c5, 256)
            p4, top_down = self._get_pyramid(c4, 256, p5)
            p3, _ = self._get_pyramid(c3, 256, top_down)
            # Two extra levels built from p5 with 3x3, stride-2 convs.
            p6 = self._bn_activation_conv(p5, 256, 3, 2)
            p7 = self._bn_activation_conv(p6, 256, 3, 2)
        with tf.variable_scope('head'):
            # The paper indicates that heads are shared between pyramids.
            # I don't think that's implemented here.
            p3conf, p3reg, p3center = self._detect_head(p3)
            p4conf, p4reg, p4center = self._detect_head(p4)
            p5conf, p5reg, p5center = self._detect_head(p5)
            p6conf, p6reg, p6center = self._detect_head(p6)
            p7conf, p7reg, p7center = self._detect_head(p7)
            # Move channels to the last axis so all later indexing is channels-last.
            if self.data_format == 'channels_first':
                p3conf = tf.transpose(p3conf, [0, 2, 3, 1])
                p3reg = tf.transpose(p3reg, [0, 2, 3, 1])
                p3center = tf.transpose(p3center, [0, 2, 3, 1])
                p4conf = tf.transpose(p4conf, [0, 2, 3, 1])
                p4reg = tf.transpose(p4reg, [0, 2, 3, 1])
                p4center = tf.transpose(p4center, [0, 2, 3, 1])
                p5conf = tf.transpose(p5conf, [0, 2, 3, 1])
                p5reg = tf.transpose(p5reg, [0, 2, 3, 1])
                p5center = tf.transpose(p5center, [0, 2, 3, 1])
                p6conf = tf.transpose(p6conf, [0, 2, 3, 1])
                p6reg = tf.transpose(p6reg, [0, 2, 3, 1])
                p6center = tf.transpose(p6center, [0, 2, 3, 1])
                p7conf = tf.transpose(p7conf, [0, 2, 3, 1])
                p7reg = tf.transpose(p7reg, [0, 2, 3, 1])
                p7center = tf.transpose(p7center, [0, 2, 3, 1])
            # Dynamic [height, width] of each level, read from the centerness map.
            p3shape = [tf.shape(p3center)[1], tf.shape(p3center)[2]]
            p4shape = [tf.shape(p4center)[1], tf.shape(p4center)[2]]
            p5shape = [tf.shape(p5center)[1], tf.shape(p5center)[2]]
            p6shape = [tf.shape(p6center)[1], tf.shape(p6center)[2]]
            p7shape = [tf.shape(p7center)[1], tf.shape(p7center)[2]]
            # Float row (h) and column (w) index vectors for each level.
            h3 = tf.range(0., tf.cast(p3shape[0], tf.float32), dtype=tf.float32)
            w3 = tf.range(0., tf.cast(p3shape[1], tf.float32), dtype=tf.float32)
            h4 = tf.range(0., tf.cast(p4shape[0], tf.float32), dtype=tf.float32)
            w4 = tf.range(0., tf.cast(p4shape[1], tf.float32), dtype=tf.float32)
            h5 = tf.range(0., tf.cast(p5shape[0], tf.float32), dtype=tf.float32)
            w5 = tf.range(0., tf.cast(p5shape[1], tf.float32), dtype=tf.float32)
            h6 = tf.range(0., tf.cast(p6shape[0], tf.float32), dtype=tf.float32)
            w6 = tf.range(0., tf.cast(p6shape[1], tf.float32), dtype=tf.float32)
            h7 = tf.range(0., tf.cast(p7shape[0], tf.float32), dtype=tf.float32)
            w7 = tf.range(0., tf.cast(p7shape[1], tf.float32), dtype=tf.float32)
            # Per-level coordinate grids in feature-map units.
            [grid_x3, grid_y3] = tf.meshgrid(w3, h3)
            [grid_x4, grid_y4] = tf.meshgrid(w4, h4)
            [grid_x5, grid_y5] = tf.meshgrid(w5, h5)
            [grid_x6, grid_y6] = tf.meshgrid(w6, h6)
            [grid_x7, grid_y7] = tf.meshgrid(w7, h7)
            # Multipliers used below to scale feature-map coordinates per level.
            stride_3, stride_4, stride_5, stride_6, stride_7 = 8, 16, 32, 64, 128

            if self.mode == 'train' or self.mode == 'val':
                total_loss = []
                for i in range(self.batch_size):
                    gt_i = self.ground_truth[i, ...]
                    # Keep only the rows before the row holding the minimum
                    # first-column value -- presumably padding rows carry the
                    # smallest value; confirm against the data pipeline.
                    slice_index = tf.argmin(gt_i, axis=0)[0]
                    gt_i = tf.gather(
                        gt_i, tf.range(0, slice_index, dtype=tf.int64))
                    # Geometric mean of columns 2 and 3 -- presumably box
                    # height and width; TODO confirm column layout.
                    gt_size = tf.sqrt(gt_i[..., 2] * gt_i[..., 3])

                    # As according to the paper Feature Pyramid Networks, only train on objects at a specific spatial scale
                    # ^^Hardcoded^^
                    #pyramid_sizes = [64., 128., 256., 512.]
                    pyramid_sizes = [32., 64., 128., 256.]
                    # Both bounds are inclusive, so a box whose size equals a
                    # threshold is assigned to two adjacent levels.
                    g3 = tf.boolean_mask(gt_i, gt_size <= pyramid_sizes[0])
                    g4 = tf.boolean_mask(
                        gt_i,
                        tf.cast(gt_size >= pyramid_sizes[0], tf.float32) *
                        tf.cast(gt_size <= pyramid_sizes[1], tf.float32) > 0.)
                    g5 = tf.boolean_mask(
                        gt_i,
                        tf.cast(gt_size >= pyramid_sizes[1], tf.float32) *
                        tf.cast(gt_size <= pyramid_sizes[2], tf.float32) > 0.)
                    g6 = tf.boolean_mask(
                        gt_i,
                        tf.cast(gt_size >= pyramid_sizes[2], tf.float32) *
                        tf.cast(gt_size <= pyramid_sizes[3], tf.float32) > 0.)
                    g7 = tf.boolean_mask(gt_i, gt_size >= pyramid_sizes[3])

                    # If the pyramid 3 layer has a training sample, compute the loss on that layer, else 0 loss
                    #_compute_one_image_loss(heatmap_pred, dist_pred, center_pred, ground_truth, grid_y, grid_x, stride, pshape):
                    loss3 = tf.cond(
                        tf.shape(g3)[0] > 0,
                        lambda: self._compute_one_image_loss(
                            p3conf[i, ...], p3reg[i, ...], p3center[
                                i, ...], g3, grid_y3, grid_x3, stride_3, p3shape),
                        lambda: 0.)
                    loss4 = tf.cond(
                        tf.shape(g4)[0] > 0,
                        lambda: self._compute_one_image_loss(
                            p4conf[i, ...], p4reg[i, ...], p4center[
                                i, ...], g4, grid_y4, grid_x4, stride_4, p4shape),
                        lambda: 0.)
                    loss5 = tf.cond(
                        tf.shape(g5)[0] > 0,
                        lambda: self._compute_one_image_loss(
                            p5conf[i, ...], p5reg[i, ...], p5center[
                                i, ...], g5, grid_y5, grid_x5, stride_5, p5shape),
                        lambda: 0.)
                    loss6 = tf.cond(
                        tf.shape(g6)[0] > 0,
                        lambda: self._compute_one_image_loss(
                            p6conf[i, ...], p6reg[i, ...], p6center[
                                i, ...], g6, grid_y6, grid_x6, stride_6, p6shape),
                        lambda: 0.)
                    loss7 = tf.cond(
                        tf.shape(g7)[0] > 0,
                        lambda: self._compute_one_image_loss(
                            p7conf[i, ...], p7reg[i, ...], p7center[
                                i, ...], g7, grid_y7, grid_x7, stride_7, p7shape),
                        lambda: 0.)
                    total_loss.append(loss3 + loss4 + loss5 + loss6 + loss7)
                # Mean per-image loss plus L2 weight decay. The L2 term sums over
                # every trainable variable, while the optimizer below only
                # updates variables collected under the "fcos" scope.
                self.loss = tf.reduce_mean(
                    total_loss) + self.weight_decay * tf.add_n([
                        tf.nn.l2_loss(var) for var in tf.trainable_variables()
                    ])
                optimizer = tf.train.AdamOptimizer(self.lr)
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                train_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "fcos")
                train_op = optimizer.minimize(self.loss,
                                              global_step=self.global_step,
                                              var_list=train_vars)
                # Group the UPDATE_OPS collection with the optimizer step.
                self.train_op = tf.group([update_ops, train_op])
            else:
                # Test mode
                # Only batch element 0 is decoded. The score is
                # sigmoid(class logit) * sigmoid(centerness), flattened to
                # rows of num_classes values.
                p3conf = tf.reshape(
                    tf.sigmoid(p3conf[0, ...]) * tf.sigmoid(p3center[0, ...]),
                    [-1, self.num_classes])
                p4conf = tf.reshape(
                    tf.sigmoid(p4conf[0, ...]) * tf.sigmoid(p4center[0, ...]),
                    [-1, self.num_classes])
                p5conf = tf.reshape(
                    tf.sigmoid(p5conf[0, ...]) * tf.sigmoid(p5center[0, ...]),
                    [-1, self.num_classes])
                p6conf = tf.reshape(
                    tf.sigmoid(p6conf[0, ...]) * tf.sigmoid(p6center[0, ...]),
                    [-1, self.num_classes])
                p7conf = tf.reshape(
                    tf.sigmoid(p7conf[0, ...]) * tf.sigmoid(p7center[0, ...]),
                    [-1, self.num_classes])
                pconf = tf.concat([p3conf, p4conf, p5conf, p6conf, p7conf],
                                  axis=0)

                p3reg = p3reg[0, ...]
                p4reg = p4reg[0, ...]
                p5reg = p5reg[0, ...]
                p6reg = p6reg[0, ...]
                p7reg = p7reg[0, ...]
                # Add a trailing axis so the grids broadcast against the
                # single-channel regression slices below.
                grid_y3 = tf.expand_dims(grid_y3, axis=-1)
                grid_x3 = tf.expand_dims(grid_x3, axis=-1)
                grid_y4 = tf.expand_dims(grid_y4, axis=-1)
                grid_x4 = tf.expand_dims(grid_x4, axis=-1)
                grid_y5 = tf.expand_dims(grid_y5, axis=-1)
                grid_x5 = tf.expand_dims(grid_x5, axis=-1)
                grid_y6 = tf.expand_dims(grid_y6, axis=-1)
                grid_x6 = tf.expand_dims(grid_x6, axis=-1)
                grid_y7 = tf.expand_dims(grid_y7, axis=-1)
                grid_x7 = tf.expand_dims(grid_x7, axis=-1)

                # Box edges = grid position -/+ regression channel. Channels
                # 0/1 offset x and 2/3 offset y -- presumably
                # (left, right, top, bottom) distances; confirm with the loss.
                p3_y1 = grid_y3 - p3reg[..., 2:3]
                p3_y2 = grid_y3 + p3reg[..., 3:4]
                p3_x1 = grid_x3 - p3reg[..., 0:1]
                p3_x2 = grid_x3 + p3reg[..., 1:2]
                p4_y1 = grid_y4 - p4reg[..., 2:3]
                p4_y2 = grid_y4 + p4reg[..., 3:4]
                p4_x1 = grid_x4 - p4reg[..., 0:1]
                p4_x2 = grid_x4 + p4reg[..., 1:2]
                p5_y1 = grid_y5 - p5reg[..., 2:3]
                p5_y2 = grid_y5 + p5reg[..., 3:4]
                p5_x1 = grid_x5 - p5reg[..., 0:1]
                p5_x2 = grid_x5 + p5reg[..., 1:2]
                p6_y1 = grid_y6 - p6reg[..., 2:3]
                p6_y2 = grid_y6 + p6reg[..., 3:4]
                p6_x1 = grid_x6 - p6reg[..., 0:1]
                p6_x2 = grid_x6 + p6reg[..., 1:2]
                p7_y1 = grid_y7 - p7reg[..., 2:3]
                p7_y2 = grid_y7 + p7reg[..., 3:4]
                p7_x1 = grid_x7 - p7reg[..., 0:1]
                p7_x2 = grid_x7 + p7reg[..., 1:2]

                # Flatten to [N, 4] in (y1, x1, y2, x2) order and scale by the
                # level's stride value.
                p3bbox = tf.reshape(
                    tf.concat([p3_y1, p3_x1, p3_y2, p3_x2], axis=-1),
                    [-1, 4]) * stride_3
                p4bbox = tf.reshape(
                    tf.concat([p4_y1, p4_x1, p4_y2, p4_x2], axis=-1),
                    [-1, 4]) * stride_4
                p5bbox = tf.reshape(
                    tf.concat([p5_y1, p5_x1, p5_y2, p5_x2], axis=-1),
                    [-1, 4]) * stride_5
                p6bbox = tf.reshape(
                    tf.concat([p6_y1, p6_x1, p6_y2, p6_x2], axis=-1),
                    [-1, 4]) * stride_6
                p7bbox = tf.reshape(
                    tf.concat([p7_y1, p7_x1, p7_y2, p7_x2], axis=-1),
                    [-1, 4]) * stride_7
                pbbox = tf.concat([p3bbox, p4bbox, p5bbox, p6bbox, p7bbox],
                                  axis=0)

                # Per-class keep mask: score must reach the NMS score threshold.
                filter_mask = tf.greater_equal(pconf, self.nms_score_threshold)
                scores = []
                class_id = []
                bbox = []
                # The last class index is skipped -- presumably a background
                # class; confirm against the label map.
                for i in range(self.num_classes - 1):
                    scoresi = tf.boolean_mask(pconf[:, i], filter_mask[:, i])
                    bboxi = tf.boolean_mask(pbbox, filter_mask[:, i])
                    selected_indices = tf.image.non_max_suppression(
                        bboxi,
                        scoresi,
                        self.nms_max_boxes,
                        self.nms_iou_threshold,
                    )
                    scores.append(tf.gather(scoresi, selected_indices))
                    bbox.append(tf.gather(bboxi, selected_indices))
                    # One class id per kept box, with value i.
                    class_id.append(
                        tf.ones_like(tf.gather(scoresi, selected_indices),
                                     tf.int32) * i)
                bbox = tf.concat(bbox, axis=0)
                scores = tf.concat(scores, axis=0)
                class_id = tf.concat(class_id, axis=0)
                self.detection_pred = [scores, bbox, class_id]
import cv2
import numpy as np
from mobilenet_v2 import mobilenet_base
import tensorflow as tf

# Smoke-test script: push one JPEG through mobilenet_base and print the
# returned endpoints.

# Decode the image, add a batch axis and rescale pixel values with x / 128 - 1.
image = tf.image.decode_jpeg(tf.read_file("test.jpg"))
images = tf.cast(tf.expand_dims(image, 0), tf.float32) / 128. - 1
# The decoded image has no static channel count, so pin it to 3 before resizing.
images.set_shape((None, None, None, 3))
images = tf.image.resize_images(images, (128, 64))

net, end = mobilenet_base(images)

# Print the whole endpoint collection first, then each entry on its own line.
print(end)
print("========== end ============")
for part in end:
    print(part)

x = end["Predictions"]
print(x)