def extract_base_features_small(self, preprocessed_inputs):
  """Extract the small base model features.

  Variables are created under the scope of <scope>/MobilenetV2_2/

  Args:
    preprocessed_inputs: preprocessed input images of shape:
      [batch, width, height, depth].

  Returns:
    net: the last feature map created from the base feature extractor.
    end_points: a dictionary of feature maps created.
  """
  scope_name = self._base_network_scope + '_2'
  with tf.variable_scope(scope_name, reuse=self._reuse_weights) as base_scope:
    if self._low_res:
      # Integer division keeps the resize target an int under Python 3.
      size_small = preprocessed_inputs.get_shape().as_list()[1] // 2
      inputs_small = tf.image.resize_images(preprocessed_inputs,
                                            [size_small, size_small])
      # Create end point handle for tflite deployment.
      with tf.name_scope(None):
        inputs_small = tf.identity(
            inputs_small, name='normalized_input_image_tensor_small')
    else:
      inputs_small = preprocessed_inputs
    net, end_points = mobilenet_v2.mobilenet_base(
        inputs_small,
        depth_multiplier=self._depth_multipliers[1],
        conv_defs=mobilenet_defs.mobilenet_v2_lite_def(
            is_quantized=self._is_quantized, low_res=self._low_res),
        use_explicit_padding=self._use_explicit_padding,
        scope=base_scope)
  return net, end_points
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  # The variable scope must stay active because `scope` is passed to
  # mobilenet_base below; only the training_scope/depth_multiplier
  # arg_scopes remain disabled.
  # with slim.arg_scope(
  #     mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
  #     slim.arg_scope(
  #         [mobilenet.depth_multiplier], min_depth=self._min_depth):
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with (slim.arg_scope(self._conv_hyperparams_fn())
          if self._override_base_feature_extractor_hyperparams
          else context_manager.IdentityContextManager()):
      _, image_features = mobilenet_v2.mobilenet_base(
          ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
          final_endpoint='layer_19',
          depth_multiplier=self._depth_multiplier,
          conv_defs=self._conv_defs,
          use_explicit_padding=self._use_explicit_padding,
          scope=scope)
def testMobilenetBase(self):
  tf.reset_default_graph()
  # Verifies that mobilenet_base returns pre-pooling layer.
  with slim.arg_scope((mobilenet.depth_multiplier,), min_depth=32):
    net, _ = mobilenet_v2.mobilenet_base(
        tf.placeholder(tf.float32, (10, 224, 224, 16)),
        conv_defs=mobilenet_v2.V2_DEF,
        depth_multiplier=0.1)
    self.assertEqual(net.get_shape().as_list(), [10, 7, 7, 128])
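# For reference, the asserted shape follows from MobileNetV2's overall output
# stride of 32 and its final 1x1 convolution, which is 1280 channels wide in
# the standard V2_DEF. A simplified sketch of the arithmetic (the real
# implementation also rounds channel counts to a divisor):
def scaled_depth(depth, multiplier, min_depth):
  return max(int(depth * multiplier), min_depth)

assert 224 // 32 == 7                      # spatial size of the last feature map
assert scaled_depth(1280, 0.1, 32) == 128  # channels after the depth multiplier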
def test_mobilenet_v2_lite_def_low_res(self):
  net, _ = mobilenet_v2.mobilenet_base(
      tf.placeholder(tf.float32, (10, 320, 320, 3)),
      min_depth=8,
      depth_multiplier=1.0,
      conv_defs=mobilenet_defs.mobilenet_v2_lite_def(low_res=True),
      use_explicit_padding=True,
      scope='MobilenetV2')
  self.assertEqual(net.get_shape().as_list(), [10, 20, 20, 320])
def test_mobilenet_v2_lite_def_is_quantized(self):
  net, _ = mobilenet_v2.mobilenet_base(
      tf.placeholder(tf.float32, (10, 320, 320, 3)),
      min_depth=8,
      depth_multiplier=1.0,
      conv_defs=mobilenet_defs.mobilenet_v2_lite_def(is_quantized=True),
      use_explicit_padding=True,
      scope='MobilenetV2')
  self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
  self._assert_contains_op('MobilenetV2/expanded_conv_16/project/Relu6')
def _image_to_head(self, is_training, reuse=None):
  with slim.arg_scope(mobilenet_v2.training_scope(is_training=is_training)):
    net, endpoints = mobilenet_v2.mobilenet_base(self._image,
                                                 conv_defs=CTPN_DEF)
  self.variables_to_restore = slim.get_variables_to_restore()
  self._act_summaries.append(net)
  self._layers['head'] = net
  return net
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  # Disabled alternative layout, kept for reference:
  # feature_map_layout = {
  #     'from_layer': ['layer_15/expansion_output', 'layer_19',
  #                    '', '', '', ''],
  #     'layer_depth': [-1, -1, 512, 256, 256, 128],
  #     'use_depthwise': self._use_depthwise,
  #     'use_explicit_padding': self._use_explicit_padding,
  # }
  feature_map_layout = {
      'from_layer': ['layer_5/expansion_output', 'layer_6/expansion_output',
                     'layer_7/expansion_output', 'layer_10/expansion_output',
                     'layer_15/expansion_output', 'layer_19'],
      'layer_depth': [-1, -1, -1, -1, -1, -1],
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_utils.resnet_arg_scope()), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams
            else context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def _extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  feature_map_layout = {
      'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(
            is_training=(self._is_training and self._batch_norm_trainable),
            bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      # TODO(b/68150321): Enable fused batch norm once quantization
      # supports it.
      with slim.arg_scope([slim.batch_norm], fused=False):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams):
      # TODO(b/68150321): Enable fused batch norm once quantization
      # supports it.
      with slim.arg_scope([slim.batch_norm], fused=False):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  return feature_maps.values()
def MobileNet(depth_multiplier, imgs_in, weight_decay, batch_norm_momentum,
              is_training):
  with tf.contrib.slim.arg_scope(
      mobilenet_v2.training_scope(is_training=is_training,
                                  weight_decay=weight_decay,
                                  bn_decay=batch_norm_momentum)):
    features, _ = mobilenet_v2.mobilenet_base(
        imgs_in,
        depth_multiplier=depth_multiplier,
        finegrain_classification_mode=depth_multiplier < 1,
        output_stride=16)
  return features
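# A minimal call sketch for the wrapper above; the placeholder name, shape and
# hyperparameter values are illustrative assumptions, not part of the original.
images = tf.placeholder(tf.float32, (1, 224, 224, 3))
features = MobileNet(depth_multiplier=0.5,
                     imgs_in=images,
                     weight_decay=4e-5,
                     batch_norm_momentum=0.997,
                     is_training=False)
# With output_stride=16, a 224x224 input yields a 14x14 feature map, and
# finegrain_classification_mode keeps the final layer at its full width even
# though the 0.5 multiplier shrinks the rest of the network.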
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  feature_map_layout = {
      'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams
            else context_manager.IdentityContextManager()):
        # TODO(b/68150321): Enable fused batch norm once quantization
        # supports it.
        with slim.arg_scope([slim.batch_norm], fused=False):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      # TODO(b/68150321): Enable fused batch norm once quantization
      # supports it.
      with slim.arg_scope([slim.batch_norm], fused=False):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  return feature_maps.values()
def test_mobilenet_v2_lite_def_reduced(self):
  net, features = mobilenet_v2.mobilenet_base(
      tf.placeholder(tf.float32, (10, 320, 320, 3)),
      min_depth=8,
      depth_multiplier=1.0,
      conv_defs=mobilenet_defs.mobilenet_v2_lite_def(reduced=True),
      use_explicit_padding=True,
      scope='MobilenetV2')
  self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
  self.assertEqual(
      features['layer_3/expansion_output'].get_shape().as_list(),
      [10, 160, 160, 48])
  self.assertEqual(
      features['layer_4/expansion_output'].get_shape().as_list(),
      [10, 80, 80, 72])
def build_encoder(network, inputs, is_training,
                  depth_multiplier=None, output_stride=16):
  if network == "mobilenet_v2":
    return mobilenet_v2_slim.mobilenet_base(
        inputs,
        conv_defs=mobilenet_v2_slim.V2_DEF,
        depth_multiplier=depth_multiplier,
        final_endpoint="layer_18",
        output_stride=output_stride,
        is_training=is_training)
  else:
    raise NotImplementedError
def test_mobilenet_v2_lite_def(self):
  net, features = mobilenet_v2.mobilenet_base(
      tf.placeholder(tf.float32, (10, 320, 320, 3)),
      min_depth=8,
      depth_multiplier=1.0,
      conv_defs=mobilenet_defs.mobilenet_v2_lite_def(),
      use_explicit_padding=True,
      scope='MobilenetV2')
  self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
  self._assert_contains_op('MobilenetV2/expanded_conv_16/project/Identity')
  self.assertEqual(
      features['layer_3/expansion_output'].get_shape().as_list(),
      [10, 160, 160, 96])
  self.assertEqual(
      features['layer_4/expansion_output'].get_shape().as_list(),
      [10, 80, 80, 144])
def _mobilenet_v2(net,
                  depth_multiplier,
                  output_stride,
                  conv_defs=None,
                  divisible_by=None,
                  reuse=None,
                  scope=None,
                  final_endpoint=None):
  """Auxiliary function to add support for 'reuse' to mobilenet_v2.

  Args:
    net: Input tensor of shape [batch_size, height, width, channels].
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous
      convolution if necessary to prevent the network from reducing the
      spatial resolution of the activation maps. Allowed values are 8
      (accurate fully convolutional mode), 16 (fast fully convolutional
      mode), 32 (classification mode).
    conv_defs: MobileNet conv def.
    divisible_by: None (use default setting) or an integer that ensures all
      layers' channel counts will be divisible by this number. Used in
      MobileNet.
    reuse: Reuse model variables.
    scope: Optional variable scope.
    final_endpoint: The endpoint to construct the network up to.

  Returns:
    Features extracted by MobileNetV2.
  """
  if divisible_by is None:
    divisible_by = 8 if depth_multiplier == 1.0 else 1
  if conv_defs is None:
    conv_defs = mobilenet_v2.V2_DEF
  with tf.variable_scope(
      scope, 'MobilenetV2', [net], reuse=reuse) as scope:
    return mobilenet_v2.mobilenet_base(
        net,
        conv_defs=conv_defs,
        depth_multiplier=depth_multiplier,
        min_depth=8 if depth_multiplier == 1.0 else 1,
        divisible_by=divisible_by,
        final_endpoint=final_endpoint or _MOBILENET_V2_FINAL_ENDPOINT,
        output_stride=output_stride,
        scope=scope)
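# Usage sketch for the wrapper above (assumed placeholder shapes, and
# _MOBILENET_V2_FINAL_ENDPOINT defined in the enclosing module): the second
# call passes reuse=True, so both towers share one set of MobilenetV2
# variables, which is the point of threading 'reuse' through.
image_a = tf.placeholder(tf.float32, (1, 513, 513, 3))
image_b = tf.placeholder(tf.float32, (1, 513, 513, 3))
net_a, _ = _mobilenet_v2(image_a, depth_multiplier=1.0, output_stride=16)
net_b, _ = _mobilenet_v2(image_b, depth_multiplier=1.0, output_stride=16,
                         reuse=True)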
def _extract_proposal_features(self, preprocessed_inputs, scope):
  """Extracts first stage RPN features.

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float32 tensor
      representing a batch of images.
    scope: A scope name.

  Returns:
    rpn_feature_map: A tensor with shape [batch, height, width, depth]
    activations: A dictionary mapping feature extractor tensor names to
      tensors

  Raises:
    InvalidArgumentError: If the spatial size of `preprocessed_inputs`
      (height or width) is less than 33.
    ValueError: If the created network is missing the required activation.
  """
  # print('###faster_rcnn_mobilenet_v2_feature_extractor.py### - extract_proposal_features')
  preprocessed_inputs.get_shape().assert_has_rank(4)
  preprocessed_inputs = shape_utils.check_min_image_dim(
      min_dim=33, image_tensor=preprocessed_inputs)
  with slim.arg_scope(
      mobilenet_v2.training_scope(is_training=self._train_batch_norm,
                                  weight_decay=self._weight_decay)):
    with tf.variable_scope('MobilenetV2',
                           reuse=self._reuse_weights) as scope:
      params = {}
      if self._skip_last_stride:
        # Not called by default; overrides the conv_defs from
        # slim.nets.mobilenet.mobilenet_v2.
        params['conv_defs'] = _get_mobilenet_conv_no_last_stride_defs(
            conv_depth_ratio_in_percentage=self.
            _conv_depth_ratio_in_percentage)
      _, endpoints = mobilenet_v2.mobilenet_base(
          preprocessed_inputs,
          final_endpoint='layer_19',  # actually 'MobilenetV2/Conv_1'
          min_depth=self._min_depth,
          depth_multiplier=self._depth_multiplier,
          scope=scope,
          **params)
  return endpoints['layer_19'], endpoints
def testMultiplier(self):
  op = mobilenet.op
  new_def = copy.deepcopy(mobilenet_v2.V2_DEF)

  def inverse_multiplier(output_params, multiplier):
    # Cast back to int: dividing by a float multiplier would otherwise
    # leave a float channel count, which conv2d rejects.
    output_params['num_outputs'] = int(
        output_params['num_outputs'] / multiplier)

  new_def['spec'][0] = op(
      slim.conv2d,
      kernel_size=(3, 3),
      multiplier_func=inverse_multiplier,
      num_outputs=16)
  _ = mobilenet_v2.mobilenet_base(
      tf.placeholder(tf.float32, (10, 224, 224, 16)),
      conv_defs=new_def,
      depth_multiplier=0.1)
  s = [op.outputs[0].get_shape().as_list()[-1]
       for op in find_ops('Conv2D')]
  # Expect first layer to be 160 (16 / 0.1), and other layers
  # their max(original size * 0.1, 8).
  self.assertEqual([160, 8, 48, 8, 48], s[:5])
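# A plausible reading of the five asserted widths, assuming the standard
# V2_DEF spec (depthwise convolutions are not Conv2D ops, so only the stem
# conv and the 1x1 expansion/projection convs appear in the list):
#   160 : stem conv, 16 / 0.1 via inverse_multiplier
#     8 : expanded_conv projection, max(int(16 * 0.1), 8) = 8
#    48 : expanded_conv_1 expansion, 6 * 8 input channels
#     8 : expanded_conv_1 projection, max(int(24 * 0.1), 8) = 8
#    48 : expanded_conv_2 expansion, 6 * 8 input channels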
def _mobilenet_v2(net,
                  depth_multiplier,
                  output_stride,
                  divisible_by=None,
                  reuse=None,
                  scope=None,
                  final_endpoint=None):
  """Auxiliary function to add support for 'reuse' to mobilenet_v2.

  Args:
    net: Input tensor of shape [batch_size, height, width, channels].
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous
      convolution if necessary to prevent the network from reducing the
      spatial resolution of the activation maps. Allowed values are 8
      (accurate fully convolutional mode), 16 (fast fully convolutional
      mode), 32 (classification mode).
    divisible_by: None (use default setting) or an integer that ensures all
      layers' channel counts will be divisible by this number. Used in
      MobileNet.
    reuse: Reuse model variables.
    scope: Optional variable scope.
    final_endpoint: The endpoint to construct the network up to.

  Returns:
    Features extracted by MobileNetV2.
  """
  if divisible_by is None:
    divisible_by = 8 if depth_multiplier == 1.0 else 1
  with tf.variable_scope(
      scope, 'MobilenetV2', [net], reuse=reuse) as scope:
    return mobilenet_v2.mobilenet_base(
        net,
        conv_defs=mobilenet_v2.V2_DEF,
        depth_multiplier=depth_multiplier,
        min_depth=8 if depth_multiplier == 1.0 else 1,
        divisible_by=divisible_by,
        final_endpoint=final_endpoint or _MOBILENET_V2_FINAL_ENDPOINT,
        output_stride=output_stride,
        scope=scope)
def extract_base_features_large(self, preprocessed_inputs):
  """Extract the large base model features.

  Variables are created under the scope of <scope>/MobilenetV2_1/

  Args:
    preprocessed_inputs: preprocessed input images of shape:
      [batch, width, height, depth].

  Returns:
    net: the last feature map created from the base feature extractor.
    end_points: a dictionary of feature maps created.
  """
  scope_name = self._base_network_scope + '_1'
  with tf.variable_scope(scope_name, reuse=self._reuse_weights) as base_scope:
    net, end_points = mobilenet_v2.mobilenet_base(
        preprocessed_inputs,
        depth_multiplier=self._depth_multipliers[0],
        conv_defs=mobilenet_defs.mobilenet_v2_lite_def(
            is_quantized=self._is_quantized),
        use_explicit_padding=self._use_explicit_padding,
        scope=base_scope)
  return net, end_points
def extract_features(self, inputs):
  """Extracts features from inputs.

  This function adds 4 additional feature maps on top of
  'layer_15/expansion_output' and 'layer_19' in the base Mobilenet v2
  network.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels],
      holding the input images.

  Returns:
    a list of 6 float tensors of shape [batch_size, height, width, channels],
    holding feature map tensors to be fed to box predictor.
  """
  feature_map_specs_dict = {
      'layer_name': ['layer_15/expansion_output', 'layer_19',
                     None, None, None, None],
      'layer_depth': [None, None, 512, 256, 256, 128]
  }
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)):
      _, end_points = mobilenet_v2.mobilenet_base(
          inputs,
          final_endpoint='layer_19',
          depth_multiplier=self._depth_multiplier,
          scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.ssd_feature_maps(
          feature_map_tensor_dict=end_points,
          feature_map_specs_dict=feature_map_specs_dict,
          depth_multiplier=1,
          use_depthwise=self._use_depthwise,
          insert_1x1_conv=True)
  feature_map_list = list(feature_maps.values())
  return feature_map_list
def _extract_proposal_features(self, preprocessed_inputs, scope):
  """Extracts first stage RPN features.

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float32 tensor
      representing a batch of images.
    scope: A scope name.

  Returns:
    rpn_feature_map: A tensor with shape [batch, height, width, depth]
    activations: A dictionary mapping feature extractor tensor names to
      tensors

  Raises:
    InvalidArgumentError: If the spatial size of `preprocessed_inputs`
      (height or width) is less than 33.
    ValueError: If the created network is missing the required activation.
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      _, activations = mobilenet_v2.mobilenet_base(
          preprocessed_inputs,
          final_endpoint='layer_19',
          min_depth=self._min_depth,
          depth_multiplier=self._depth_multiplier,
          scope=scope)
  return activations['layer_19'], activations
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams
            else context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=self._conv_defs,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier),
                             self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise,
            use_explicit_padding=self._use_explicit_padding)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features.
        padding = 'VALID' if self._use_explicit_padding else 'SAME'
        kernel_size = 3
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(slim.separable_conv2d,
                                        depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          if self._use_explicit_padding:
            last_feature_map = ops.fixed_padding(
                last_feature_map, kernel_size)
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[kernel_size, kernel_size],
              stride=2,
              padding=padding,
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps
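# Illustrative mapping for the FPN block above, assuming fpn_min_level=2 and
# fpn_max_level=6 (these values come from the extractor's configuration):
# levels P2..P5 are built from the listed MobilenetV2 endpoints, and P6 comes
# from the stride-2 'bottom_up' convolution appended after the loop.
feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
fpn_min_level, fpn_max_level = 2, 6
base_fpn_max_level = min(fpn_max_level, 5)
for level in range(fpn_min_level, base_fpn_max_level + 1):
  print('P{} <- {}'.format(level, feature_blocks[level - 2]))
# Prints: P2 <- layer_4, P3 <- layer_7, P4 <- layer_14, P5 <- layer_19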
def _build_model(self, **kwargs):
    d = dict()
    num_classes = self.num_classes
    frontend = kwargs.pop('frontend', 'resnet_v2_50')
    num_anchors = kwargs.pop('num_anchors', 9)

    if 'resnet_v2' in frontend:
        d['feature_map'] = self.X - [[[123.68, 116.779, 103.939]]]
        frontend_dir = os.path.join('pretrained_models',
                                    '{}.ckpt'.format(frontend))
        with slim.arg_scope(resnet_v2.resnet_arg_scope()):
            logits, end_points = resnet_v2.resnet_v2_50(
                d['feature_map'], is_training=self.is_train)
        d['init_fn'] = slim.assign_from_checkpoint_fn(
            model_path=frontend_dir,
            var_list=slim.get_model_variables(frontend))
        convs = [end_points[frontend + '/block{}'.format(x)]
                 for x in [4, 2, 1]]
    elif 'mobilenet_v2' in frontend:
        d['feature_map'] = (2.0 / 255.0) * self.X - 1.0
        frontend_dir = os.path.join('pretrained_models',
                                    'mobilenet_v2_1.4_224',
                                    '{}.ckpt'.format(frontend))
        with slim.arg_scope(mobilenet_v2.training_scope()):
            _, end_points = mobilenet_v2.mobilenet_base(
                d['feature_map'], is_training=self.is_train)
        convs = [end_points[x] for x in ['layer_19', 'layer_14', 'layer_7']]
    else:
        # TODO: build convNet
        raise NotImplementedError("Build own convNet!")

    with tf.variable_scope('layer5'):
        d['s_5'] = conv_layer(convs[0], 256, (1, 1), (1, 1))
        d['cls_head5'] = build_head_cls(d['s_5'], num_anchors,
                                        num_classes + 1)
        d['loc_head5'] = build_head_loc(d['s_5'], num_anchors)
        d['flat_cls_head5'] = tf.reshape(
            d['cls_head5'],
            (tf.shape(d['cls_head5'])[0], -1, num_classes + 1))
        d['flat_loc_head5'] = tf.reshape(
            d['loc_head5'], (tf.shape(d['loc_head5'])[0], -1, 4))

    with tf.variable_scope('layer6'):
        d['s_6'] = conv_layer(d['s_5'], 256, (3, 3), (2, 2))
        d['cls_head6'] = build_head_cls(d['s_6'], num_anchors,
                                        num_classes + 1)
        d['loc_head6'] = build_head_loc(d['s_6'], num_anchors)
        d['flat_cls_head6'] = tf.reshape(
            d['cls_head6'],
            (tf.shape(d['cls_head6'])[0], -1, num_classes + 1))
        d['flat_loc_head6'] = tf.reshape(
            d['loc_head6'], (tf.shape(d['loc_head6'])[0], -1, 4))

    with tf.variable_scope('layer7'):
        d['s_7'] = conv_layer(tf.nn.relu(d['s_6']), 256, (3, 3), (2, 2))
        d['cls_head7'] = build_head_cls(d['s_7'], num_anchors,
                                        num_classes + 1)
        d['loc_head7'] = build_head_loc(d['s_7'], num_anchors)
        d['flat_cls_head7'] = tf.reshape(
            d['cls_head7'],
            (tf.shape(d['cls_head7'])[0], -1, num_classes + 1))
        d['flat_loc_head7'] = tf.reshape(
            d['loc_head7'], (tf.shape(d['loc_head7'])[0], -1, 4))

    with tf.variable_scope('layer4'):
        d['up4'] = resize_to_target(d['s_5'], convs[1])
        d['s_4'] = conv_layer(convs[1], 256, (1, 1), (1, 1)) + d['up4']
        d['cls_head4'] = build_head_cls(d['s_4'], num_anchors,
                                        num_classes + 1)
        d['loc_head4'] = build_head_loc(d['s_4'], num_anchors)
        d['flat_cls_head4'] = tf.reshape(
            d['cls_head4'],
            (tf.shape(d['cls_head4'])[0], -1, num_classes + 1))
        d['flat_loc_head4'] = tf.reshape(
            d['loc_head4'], (tf.shape(d['loc_head4'])[0], -1, 4))

    with tf.variable_scope('layer3'):
        d['up3'] = resize_to_target(d['s_4'], convs[2])
        d['s_3'] = conv_layer(convs[2], 256, (1, 1), (1, 1)) + d['up3']
        d['cls_head3'] = build_head_cls(d['s_3'], num_anchors,
                                        num_classes + 1)
        d['loc_head3'] = build_head_loc(d['s_3'], num_anchors)
        d['flat_cls_head3'] = tf.reshape(
            d['cls_head3'],
            (tf.shape(d['cls_head3'])[0], -1, num_classes + 1))
        d['flat_loc_head3'] = tf.reshape(
            d['loc_head3'], (tf.shape(d['loc_head3'])[0], -1, 4))

    with tf.variable_scope('head'):
        d['cls_head'] = tf.concat(
            (d['flat_cls_head3'], d['flat_cls_head4'], d['flat_cls_head5'],
             d['flat_cls_head6'], d['flat_cls_head7']), axis=1)
        d['loc_head'] = tf.concat(
            (d['flat_loc_head3'], d['flat_loc_head4'], d['flat_loc_head5'],
             d['flat_loc_head6'], d['flat_loc_head7']), axis=1)
        d['logits'] = tf.concat((d['loc_head'], d['cls_head']), axis=2)
        d['pred'] = tf.concat(
            (d['loc_head'], tf.nn.softmax(d['cls_head'], axis=-1)), axis=2)
    return d
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.99)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with slim.arg_scope(
          training_scope(l2_weight_decay=4e-5,
                         is_training=self._is_training)):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_18',
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
  multiplier_func = functools.partial(
      _apply_multiplier,
      multiplier=self._depth_multiplier,
      min_depth=self._min_depth)
  with tf.variable_scope('MnasFPN', reuse=self._reuse_weights):
    with slim.arg_scope(
        training_scope(l2_weight_decay=1e-4,
                       is_training=self._is_training)):
      # Create C6 by downsampling C5.
      c6 = slim.max_pool2d(
          _maybe_pad(image_features['layer_18'],
                     self._use_explicit_padding),
          [3, 3],
          stride=[2, 2],
          padding='VALID' if self._use_explicit_padding else 'SAME',
          scope='C6_downsample')
      c6 = slim.conv2d(
          c6,
          multiplier_func(self._fpn_layer_depth),
          [1, 1],
          activation_fn=tf.identity,
          normalizer_fn=slim.batch_norm,
          weights_regularizer=None,  # this 1x1 has no kernel regularizer.
          padding='VALID',
          scope='C6_Conv1x1')
      image_features['C6'] = tf.identity(c6)  # Needed for quantization.
      for k in sorted(image_features.keys()):
        tf.logging.error('{}: {}'.format(k, image_features[k]))
      mnasfpn_inputs = [
          image_features['layer_7'],   # C3
          image_features['layer_14'],  # C4
          image_features['layer_18'],  # C5
          image_features['C6']         # C6
      ]
      self._verify_config(mnasfpn_inputs)
      feature_maps = mnasfpn(
          mnasfpn_inputs,
          head_def=self._head_def,
          output_channel=self._fpn_layer_depth,
          use_explicit_padding=self._use_explicit_padding,
          use_native_resize_op=self._use_native_resize_op,
          multiplier_func=multiplier_func)
  return feature_maps
def backbone_net(inputs, image_size, is_training=True, depth_multiplier=0.5,
                 **kwargs):
    pad_to_multiple = 10
    use_explicit_padding = False
    depth_multiplier = depth_multiplier
    print('construct backbone_net for image_size', image_size,
          'depth_multiplier = ', depth_multiplier)
    use_depthwise = True
    override_base_feature_extractor_hyperparams = False
    reuse_weights = None
    min_depth = 16

    specs = [
        op(slim.conv2d, stride=2, num_outputs=64, kernel_size=[3, 3],
           activation_fn=tf.nn.elu),
        # todo: Depthwise Conv3x3
        op(ops.expanded_conv, stride=1, kernel_size=[3, 3], num_outputs=64),
        # 56^2 x 64, Bottleneck, t=2, c=64, n=5, s=2
        op(slim.max_pool2d, kernel_size=[3, 3], padding='SAME', stride=1),
        op(ops.expanded_conv, stride=2, num_outputs=64, kernel_size=[3, 3]),
    ]
    for _ in range(0, 4):
        specs.append(
            op(ops.expanded_conv, stride=1, num_outputs=64,
               kernel_size=[3, 3]))
    # 28^2 x 64, Bottleneck, t=2, c=128, n=1, s=2
    specs.append(
        op(ops.expanded_conv, stride=2, num_outputs=128, kernel_size=[3, 3]))
    # 14^2 x 128, Bottleneck, t=4, c=128, n=6, s=1
    mid_conv_n = kwargs.get('mid_conv_n', 4)
    for _ in range(0, mid_conv_n):
        specs.append(
            op(ops.expanded_conv, expansion_size=expand_input(4),
               num_outputs=128, stride=1))
    kernel_size = [5, 5]
    specs.append(op(ops.expanded_conv, stride=1, num_outputs=16, scope='S1'))
    specs.append(
        op(slim.conv2d, stride=2, kernel_size=[3, 3], num_outputs=32,
           scope='S2', activation_fn=tf.nn.elu))
    specs.append(
        op(slim.conv2d, stride=1, kernel_size=kernel_size, num_outputs=128,
           scope='S3', padding='VALID', activation_fn=tf.nn.elu))
    # print('specs = ', specs, ' len = ', len(specs))

    arch = dict(
        defaults={
            # Note: these parameters of batch norm affect the architecture
            # that's why they are here and not in training_scope.
            (slim.batch_norm,): {
                'center': True,
                'scale': True
            },
            (slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
                'normalizer_fn': slim.batch_norm,
                'activation_fn': tf.nn.relu6
            },
            (ops.expanded_conv,): {
                'expansion_size': expand_input(2),
                'split_expansion': 1,
                'normalizer_fn': slim.batch_norm,
                'residual': True,
            },
            (slim.conv2d, slim.separable_conv2d): {
                'padding': 'SAME',
                'weights_initializer': slim.xavier_initializer()
            }
        },
        spec=specs)

    print('input to backbone_net ', inputs)
    with tf.variable_scope('Backbone', reuse=reuse_weights) as scope:
        with slim.arg_scope(
            mobilenet_v2.training_scope(is_training=is_training,
                                        bn_decay=0.9997)), \
            slim.arg_scope(
                [mobilenet.depth_multiplier], min_depth=min_depth):
            with (slim.arg_scope(conv_hyperparams_fn(is_training=is_training))
                  if override_base_feature_extractor_hyperparams
                  else context_manager.IdentityContextManager()):
                _, image_features = mobilenet_v2.mobilenet_base(
                    od_ops.pad_to_multiple(inputs, pad_to_multiple),
                    depth_multiplier=depth_multiplier,
                    is_training=is_training,
                    use_explicit_padding=use_explicit_padding,
                    conv_defs=arch,
                    scope=scope)
                # do a fully connected layer here
                # TODO
                print('image image_features', image_features.keys())
                all_layers = []
                for layer_name in image_features.keys():
                    if re.match(r'^layer_\d+$', layer_name) is not None:
                        all_layers.append(layer_name)

                def layer_key(val):
                    return int(val.split('_')[1])

                all_layers.sort(key=layer_key)
                print('all_layers', all_layers)
                layer_15 = image_features[all_layers[-3]]
                layer_16 = image_features[all_layers[-2]]
                layer_17 = image_features[all_layers[-1]]
                # batch_size = tf.shape(S1)[0]
                S1 = slim.flatten(layer_15, scope='S1flatten')  # tf.reshape(S1, [batch_size, -1])
                S2 = slim.flatten(layer_16, scope='S2flatten')  # [batch_size, -1])
                S3 = slim.flatten(layer_17, scope='S3flatten')  # [batch_size, -1])
                before_dense = tf.concat([S1, S2, S3], 1)
                for l in all_layers:
                    print(l, image_features[l])
                # print('layer_17', layer_17)
                print('S1', S1)
                print('S2', S2)
                print('S3', S3)
                # to_test = slim.conv2d(image_features['layer_19'])
                print('before fully_connected', before_dense)
                with slim.arg_scope(
                        [slim.fully_connected],
                        weights_initializer=slim.xavier_initializer(),
                        normalizer_fn=None,
                        activation_fn=tf.nn.tanh):
                    fc_x = kwargs.get('fc_x_n', 2)
                    print('fully_connected before last x ', fc_x)
                    pre_chin = slim.fully_connected(before_dense, 34 * fc_x)
                    pre_left_eye_brow = slim.fully_connected(
                        before_dense, 10 * fc_x)
                    pre_right_eye_brow = slim.fully_connected(
                        before_dense, 10 * fc_x)
                    pre_nose = slim.fully_connected(before_dense, 18 * fc_x)
                    pre_left_eye = slim.fully_connected(
                        before_dense, 12 * fc_x)
                    pre_right_eye = slim.fully_connected(
                        before_dense, 12 * fc_x)
                    pre_mouth = slim.fully_connected(before_dense, 40 * fc_x)
                    chin = slim.fully_connected(pre_chin, 34)
                    left_eye_brow = slim.fully_connected(pre_left_eye_brow,
                                                         10)
                    right_eye_brow = slim.fully_connected(pre_right_eye_brow,
                                                          10)
                    nose = slim.fully_connected(pre_nose, 18)
                    left_eye = slim.fully_connected(pre_left_eye, 12)
                    right_eye = slim.fully_connected(pre_right_eye, 12)
                    mouth = slim.fully_connected(pre_mouth, 40)
                    landmarks = tf.concat([
                        chin, left_eye_brow, right_eye_brow, nose,
                        left_eye, right_eye, mouth
                    ], -1)
                    return image_features, landmarks, None
        shape=shift_buffer_shapes[9], name='shift_buffer_9')
]

# FINAL_NODE_NAME = "MobilenetV2/Conv_1/Relu6"
FINAL_NODE_NAME = "MobilenetV2/Logits/output"

in_tensor = tf.placeholder(tf.float32, shape=(1, 224, 224, 3), name='in_img')
print(torch_params(0)['normalizer_params']['param_initializers']
      ['moving_mean'].get_config())
in_img = tf.identity(in_tensor)
net, endpoints = mobilenet_v2.mobilenet_base(in_img, conv_defs=V2_DEF_TSM)

# Add the classifier.
with tf.variable_scope("MobilenetV2/Logits"):
    kernel_initializer = None
    bias_initializer = tf.zeros_initializer()
    if IMPORT_PYTORCH:
        kernel_initializer = torch_params(-1)["weights_initializer"]
        bias_initializer = torch_params(-1)["biases_initializer"]
    net = tf.nn.avg_pool(net, [1, 7, 7, 1], 1, "VALID", name="AvgPool")
    net = tf.squeeze(net, (1, 2))
    net = tf.layers.dense(net, 27, use_bias=True, trainable=False,
def backbone_net(inputs, image_size, is_training=True, depth_multiplier=0.5):
    pad_to_multiple = 14 if image_size == 112 else (10 if image_size == 80
                                                    else 8)
    use_explicit_padding = False
    depth_multiplier = depth_multiplier
    print('construct backbone_net for image_size', image_size,
          'depth_multiplier = ', depth_multiplier)
    use_depthwise = True
    override_base_feature_extractor_hyperparams = False
    reuse_weights = None
    min_depth = 16

    specs = [
        op(slim.conv2d, stride=2, num_outputs=64, kernel_size=[3, 3]),
        # todo: Depthwise Conv3x3
        op(slim.separable_conv2d, stride=1, kernel_size=[3, 3],
           num_outputs=None, multiplier_func=dummy_depth_multiplier),
        # 56^2 x 64, Bottleneck, t=2, c=64, n=5, s=2
        op(ops.expanded_conv, stride=2, num_outputs=64),
    ]
    for _ in range(0, 4):
        specs.append(op(ops.expanded_conv, stride=1, num_outputs=64))
    # 28^2 x 64, Bottleneck, t=2, c=128, n=1, s=2
    specs.append(op(ops.expanded_conv, stride=2, num_outputs=128))
    # 14^2 x 128, Bottleneck, t=4, c=128, n=6, s=1
    for _ in range(0, 6):
        specs.append(op(ops.expanded_conv, expansion_size=expand_input(4),
                        num_outputs=128, stride=1))
    kernel_size = [7, 7] if image_size == 112 else ([5, 5] if image_size == 80
                                                    else [4, 4])
    specs.append(op(ops.expanded_conv, stride=1, num_outputs=16, scope='S1'))
    specs.append(op(slim.conv2d, stride=2, kernel_size=[3, 3],
                    num_outputs=32, scope='S2'))
    specs.append(op(slim.conv2d, stride=1, kernel_size=kernel_size,
                    num_outputs=128, scope='S3', padding='VALID'))
    # print('specs = ', specs, ' len = ', len(specs))

    arch = dict(
        defaults={
            # Note: these parameters of batch norm affect the architecture
            # that's why they are here and not in training_scope.
            (slim.batch_norm,): {'center': True, 'scale': True},
            (slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
                'normalizer_fn': slim.batch_norm,
                'activation_fn': tf.nn.relu6
            },
            (ops.expanded_conv,): {
                'expansion_size': expand_input(2),
                'split_expansion': 1,
                'normalizer_fn': slim.batch_norm,
                'residual': True,
            },
            (slim.conv2d, slim.separable_conv2d): {
                'padding': 'SAME',
                'weights_initializer': slim.xavier_initializer()
            }
        },
        spec=specs)

    print('input to backbone_net ', inputs)
    with tf.variable_scope('Backbone', reuse=reuse_weights) as scope:
        with slim.arg_scope(
            mobilenet_v2.training_scope(is_training=is_training,
                                        bn_decay=0.9997)), \
            slim.arg_scope(
                [mobilenet.depth_multiplier], min_depth=min_depth):
            with (slim.arg_scope(conv_hyperparams_fn(is_training=is_training))
                  if override_base_feature_extractor_hyperparams
                  else context_manager.IdentityContextManager()):
                _, image_features = mobilenet_v2.mobilenet_base(
                    od_ops.pad_to_multiple(inputs, pad_to_multiple),
                    depth_multiplier=depth_multiplier,
                    is_training=is_training,
                    use_explicit_padding=use_explicit_padding,
                    conv_defs=arch,
                    scope=scope)
                # do a fully connected layer here
                # TODO
                layer_15 = image_features['layer_15']
                layer_16 = image_features['layer_16']
                layer_17 = image_features['layer_17']
                # batch_size = tf.shape(S1)[0]
                S1 = slim.flatten(layer_15, scope='S1flatten')  # tf.reshape(S1, [batch_size, -1])
                S2 = slim.flatten(layer_16, scope='S2flatten')  # [batch_size, -1])
                S3 = slim.flatten(layer_17, scope='S3flatten')  # [batch_size, -1])
                before_dense = tf.concat([S1, S2, S3], 1)
                for i in range(1, 18):
                    print('layer_' + str(i),
                          image_features['layer_' + str(i)])
                # print('layer_17', layer_17)
                print('S1', S1)
                print('S2', S2)
                print('S3', S3)
                # to_test = slim.conv2d(image_features['layer_19'])
                print('image image_features', image_features.keys())
                with slim.arg_scope([slim.batch_norm],
                                    is_training=is_training,
                                    center=True, scale=True):
                    return (image_features,
                            slim.fully_connected(
                                before_dense, 136,
                                activation_fn=tf.nn.relu6,
                                normalizer_fn=slim.batch_norm,
                                weights_initializer=slim.xavier_initializer()),
                            (image_features['layer_1'], inputs,
                             image_features['layer_2']))
def create_network(images, num_classes=None, add_logits=True, reuse=None,
                   create_summaries=True, weight_decay=1e-8):
    nonlinearity = tf.nn.elu
    conv_weight_init = tf.truncated_normal_initializer(stddev=1e-3)
    conv_bias_init = tf.zeros_initializer()
    conv_regularizer = slim.l2_regularizer(weight_decay)
    fc_weight_init = tf.truncated_normal_initializer(stddev=1e-3)
    fc_bias_init = tf.zeros_initializer()
    fc_regularizer = slim.l2_regularizer(weight_decay)

    def batch_norm_fn(x):
        return slim.batch_norm(x, scope=tf.get_variable_scope().name + "/bn")

    network = images
    network, _, networkFirst = base.mobilenet_base(network)

    feature1_dim = networkFirst.get_shape().as_list()[-1]
    print("feature1 dimensionality: ", feature1_dim)
    feature1 = slim.flatten(networkFirst)
    print("Feature1 Size: ", network.get_shape().as_list())

    feature_dim = network.get_shape().as_list()[-1]
    print("feature2 dimensionality: ", feature_dim)
    network = slim.flatten(network)
    print("Feature2 Size: ", network.get_shape().as_list())

    network = tf.concat([network, feature1], 1)
    print("Total Feature Size: ", network.get_shape().as_list())

    feature_dim = 128
    network = slim.dropout(network, keep_prob=0.6)
    network = slim.fully_connected(
        network, feature_dim, activation_fn=nonlinearity,  # feature_dim
        normalizer_fn=batch_norm_fn, weights_regularizer=fc_regularizer,
        scope="fc1", weights_initializer=fc_weight_init,
        biases_initializer=fc_bias_init)

    features = network

    # Features in rows, normalize axis 1.
    features = tf.nn.l2_normalize(features, dim=1)

    if add_logits:
        with slim.variable_scope.variable_scope("ball", reuse=reuse):
            weights = slim.model_variable(
                "mean_vectors", (feature_dim, int(num_classes)),
                initializer=tf.truncated_normal_initializer(stddev=1e-3),
                regularizer=None)
            scale = slim.model_variable(
                "scale", (), tf.float32,
                initializer=tf.constant_initializer(0., tf.float32),
                regularizer=slim.l2_regularizer(1e-1))
            if create_summaries:
                tf.summary.scalar("scale", scale)
            scale = tf.nn.softplus(scale)

        # Mean vectors in columns, normalize axis 0.
        weights_normed = tf.nn.l2_normalize(weights, dim=0)
        logits = scale * tf.matmul(features, weights_normed)
    else:
        logits = None
    return features, logits
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams
            else context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=_CONV_DEFS if self._use_depthwise else None,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier),
                             self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features.
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(slim.separable_conv2d,
                                        depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps