def testModelHasExpectedNumberOfParameters(self):
    """Checks the parameter count of the base network built with batch norm."""
    num_images = 5
    rows, cols = 224, 224
    images = tf.random_uniform((num_images, rows, cols, 3))
    conv_ops = [slim.conv2d, slim.separable_conv2d]
    with slim.arg_scope(conv_ops, normalizer_fn=slim.batch_norm):
        mobilenet_v1.mobilenet_v1_base(images)
        model_vars = slim.get_model_variables()
        param_count, _ = slim.model_analyzer.analyze_vars(model_vars)
        self.assertAlmostEqual(3217920, param_count)
def testOutputStride8BuildAndCheckAllEndPointsUptoConv2d_13(self):
    """Checks every endpoint shape up to Conv2d_13 with output_stride=8.

    The base network is built twice on the same input, once with default
    padding and once with explicit padding, and both endpoint dictionaries
    are compared against the same table of expected shapes.
    """
    batch_size = 5
    rows, cols = 224, 224
    images = tf.random_uniform((batch_size, rows, cols, 3))
    build_kwargs = dict(output_stride=8,
                        final_endpoint='Conv2d_13_pointwise')
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        normalizer_fn=slim.batch_norm):
        _, default_end_points = mobilenet_v1.mobilenet_v1_base(
            images, **build_kwargs)
        _, explicit_end_points = mobilenet_v1.mobilenet_v1_base(
            images, use_explicit_padding=True, **build_kwargs)

    # One row per separable block 1..13:
    # (spatial size, depthwise depth, pointwise depth).
    block_rows = [
        (112, 32, 64),
        (56, 64, 128),
        (56, 128, 128),
        (28, 128, 256),
        (28, 256, 256),
        (28, 256, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 1024),
        (28, 1024, 1024),
    ]
    expected = {'Conv2d_0': [batch_size, 112, 112, 32]}
    for block, (size, dw_depth, pw_depth) in enumerate(block_rows, start=1):
        expected['Conv2d_%d_depthwise' % block] = [
            batch_size, size, size, dw_depth]
        expected['Conv2d_%d_pointwise' % block] = [
            batch_size, size, size, pw_depth]

    for actual in (default_end_points, explicit_end_points):
        self.assertItemsEqual(expected.keys(), actual.keys())
        for name, shape in expected.items():
            self.assertTrue(name in actual)
            self.assertListEqual(actual[name].get_shape().as_list(), shape)
def testBuildAndCheckAllEndPointsApproximateFaceNet(self):
    """Checks every endpoint shape for 128x128 input, depth_multiplier=0.75.

    The base network is built with and without explicit padding and both
    endpoint dictionaries are compared against the same expected shapes.
    """
    batch_size = 5
    rows, cols = 128, 128
    images = tf.random_uniform((batch_size, rows, cols, 3))
    build_kwargs = dict(final_endpoint='Conv2d_13_pointwise',
                        depth_multiplier=0.75)
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        normalizer_fn=slim.batch_norm):
        _, default_end_points = mobilenet_v1.mobilenet_v1_base(
            images, **build_kwargs)
        _, explicit_end_points = mobilenet_v1.mobilenet_v1_base(
            images, use_explicit_padding=True, **build_kwargs)

    # For the Conv2d_0 layer FaceNet has depth=16
    expected = {'Conv2d_0': [batch_size, 64, 64, 24]}
    # One row per separable block 1..13:
    # (spatial size, depthwise depth, pointwise depth).
    block_rows = [
        (64, 24, 48),
        (32, 48, 96),
        (32, 96, 96),
        (16, 96, 192),
        (16, 192, 192),
        (8, 192, 384),
        (8, 384, 384),
        (8, 384, 384),
        (8, 384, 384),
        (8, 384, 384),
        (8, 384, 384),
        (4, 384, 768),
        (4, 768, 768),
    ]
    for block, (size, dw_depth, pw_depth) in enumerate(block_rows, start=1):
        expected['Conv2d_%d_depthwise' % block] = [
            batch_size, size, size, dw_depth]
        expected['Conv2d_%d_pointwise' % block] = [
            batch_size, size, size, pw_depth]

    for actual in (default_end_points, explicit_end_points):
        self.assertItemsEqual(expected.keys(), actual.keys())
        for name, shape in expected.items():
            self.assertTrue(name in actual)
            self.assertListEqual(actual[name].get_shape().as_list(), shape)
def testBuildOnlyUptoFinalEndpoint(self):
    """Checks that building stops at each requested final endpoint.

    For every endpoint name, a fresh graph is built up to that endpoint; the
    output op must live under that endpoint's scope and the returned
    endpoints must be exactly the names up to and including it.
    """
    batch_size = 5
    rows, cols = 224, 224
    # 'Conv2d_0' followed by the depthwise/pointwise pair of blocks 1..13.
    layer_names = ['Conv2d_0']
    for block in range(1, 14):
        layer_names.append('Conv2d_%d_depthwise' % block)
        layer_names.append('Conv2d_%d_pointwise' % block)

    for count, final_name in enumerate(layer_names, start=1):
        with tf.Graph().as_default():
            images = tf.random_uniform((batch_size, rows, cols, 3))
            net, built = mobilenet_v1.mobilenet_v1_base(
                images, final_endpoint=final_name)
            expected_prefix = 'MobilenetV1/' + final_name
            self.assertTrue(net.op.name.startswith(expected_prefix))
            self.assertItemsEqual(layer_names[:count], built.keys())
def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    derives multi-resolution feature maps from its endpoints. The input must
    be 256x256: this is checked statically when the spatial shape is known,
    and through a runtime `tf.Assert` otherwise.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]

    Raises:
      ValueError: if the statically known image height or width is not 256
        pixels.
    """
    image_shape = preprocessed_inputs.get_shape()
    image_shape.assert_has_rank(4)
    image_height = image_shape[1].value
    image_width = image_shape[2].value
    if image_height is None or image_width is None:
        # Spatial size is unknown while the graph is built, so the check has
        # to be attached to the tensor and evaluated at run time.
        shape_assert = tf.Assert(
            tf.logical_and(tf.equal(tf.shape(preprocessed_inputs)[1], 256),
                           tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
            ['image size must be 256 in both height and width.'])
        with tf.control_dependencies([shape_assert]):
            preprocessed_inputs = tf.identity(preprocessed_inputs)
    elif image_height != 256 or image_width != 256:
        raise ValueError('image size must be = 256 in both height and width;'
                         ' image dim = %d,%d' % (image_height, image_width))

    # Empty 'from_layer' entries and -1 depths/kernel sizes are interpreted
    # by multi_resolution_feature_maps (defined outside this block).
    feature_map_layout = {
        'from_layer': [
            'Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', ''
        ],
        'layer_depth': [-1, -1, 512, 256, 256],
        'conv_kernel_size': [-1, -1, 3, 3, 2],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with slim.arg_scope(self._conv_hyperparams):
        with slim.arg_scope([slim.batch_norm], fused=False):
            with tf.variable_scope('MobilenetV1',
                                   reuse=self._reuse_weights) as scope:
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)
                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(feature_maps.values())
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    derives multi-resolution feature maps from its endpoints.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    # Empty 'from_layer' entries and -1 depths are interpreted by
    # multi_resolution_feature_maps (defined outside this block).
    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        # NOTE(review): is_training is hard-coded to True here, independent
        # of any attribute on self -- confirm this is intended.
        with slim.arg_scope(
            mobilenet_v1.mobilenet_v1_arg_scope(
                is_training=True, regularize_depthwise=True)):
            # The user-supplied hyperparams override the MobileNet defaults
            # for the base network only when explicitly requested.
            with (slim.arg_scope(self._conv_hyperparams_fn())
                  if self._override_base_feature_extractor_hyperparams
                  else context_manager.IdentityContextManager()):
                # TODO(skligys): Enable fused batch norm once quantization
                # supports it.
                with slim.arg_scope([slim.batch_norm], fused=False):
                    _, image_features = mobilenet_v1.mobilenet_v1_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='Conv2d_13_pointwise',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        use_explicit_padding=self._use_explicit_padding,
                        scope=scope)
        with slim.arg_scope(self._conv_hyperparams_fn()):
            # TODO(skligys): Enable fused batch norm once quantization
            # supports it.
            with slim.arg_scope([slim.batch_norm], fused=False):
                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(feature_maps.values())
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    derives multi-resolution feature maps from its endpoints.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    # Empty 'from_layer' entries and -1 depths are interpreted by
    # multi_resolution_feature_maps (defined outside this block).
    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        # The base network's arg scope is in training mode only when both
        # flags on self are set.
        with slim.arg_scope(
            mobilenet_v1.mobilenet_v1_arg_scope(
                is_training=(self._batch_norm_trainable and
                             self._is_training))):
            # TODO(skligys): Enable fused batch norm once quantization
            # supports it.
            with slim.arg_scope([slim.batch_norm], fused=False):
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    use_explicit_padding=self._use_explicit_padding,
                    scope=scope)
        with slim.arg_scope(self._conv_hyperparams):
            # TODO(skligys): Enable fused batch norm once quantization
            # supports it.
            with slim.arg_scope([slim.batch_norm], fused=False):
                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(feature_maps.values())
def testOutputStride8BuildAndCheckAllEndPointsUptoConv2d_13(self):
    """Checks every endpoint shape up to Conv2d_13 with output_stride=8."""
    batch_size = 5
    rows, cols = 224, 224
    images = tf.random.uniform((batch_size, rows, cols, 3))
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        normalizer_fn=slim.batch_norm):
        _, built = mobilenet_v1.mobilenet_v1_base(
            images, output_stride=8, final_endpoint='Conv2d_13_pointwise')

    # One row per separable block 1..13:
    # (spatial size, depthwise depth, pointwise depth).
    block_rows = [
        (112, 32, 64),
        (56, 64, 128),
        (56, 128, 128),
        (28, 128, 256),
        (28, 256, 256),
        (28, 256, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 512),
        (28, 512, 1024),
        (28, 1024, 1024),
    ]
    expected = {'Conv2d_0': [batch_size, 112, 112, 32]}
    for block, (size, dw_depth, pw_depth) in enumerate(block_rows, start=1):
        expected['Conv2d_%d_depthwise' % block] = [
            batch_size, size, size, dw_depth]
        expected['Conv2d_%d_pointwise' % block] = [
            batch_size, size, size, pw_depth]

    self.assertItemsEqual(expected.keys(), built.keys())
    for name, shape in expected.items():
        self.assertTrue(name in built)
        self.assertListEqual(built[name].get_shape().as_list(), shape)
def testBuildAndCheckAllEndPointsApproximateFaceNet(self):
    """Checks every endpoint shape for 128x128 input, depth_multiplier=0.75."""
    batch_size = 5
    rows, cols = 128, 128
    images = tf.random.uniform((batch_size, rows, cols, 3))
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        normalizer_fn=slim.batch_norm):
        _, built = mobilenet_v1.mobilenet_v1_base(
            images, final_endpoint='Conv2d_13_pointwise',
            depth_multiplier=0.75)

    # For the Conv2d_0 layer FaceNet has depth=16
    expected = {'Conv2d_0': [batch_size, 64, 64, 24]}
    # One row per separable block 1..13:
    # (spatial size, depthwise depth, pointwise depth).
    block_rows = [
        (64, 24, 48),
        (32, 48, 96),
        (32, 96, 96),
        (16, 96, 192),
        (16, 192, 192),
        (8, 192, 384),
        (8, 384, 384),
        (8, 384, 384),
        (8, 384, 384),
        (8, 384, 384),
        (8, 384, 384),
        (4, 384, 768),
        (4, 768, 768),
    ]
    for block, (size, dw_depth, pw_depth) in enumerate(block_rows, start=1):
        expected['Conv2d_%d_depthwise' % block] = [
            batch_size, size, size, dw_depth]
        expected['Conv2d_%d_pointwise' % block] = [
            batch_size, size, size, pw_depth]

    self.assertItemsEqual(expected.keys(), built.keys())
    for name, shape in expected.items():
        self.assertTrue(name in built)
        self.assertListEqual(built[name].get_shape().as_list(), shape)
def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.

    Builds the MobilenetV1 base network up to `Conv2d_11_pointwise` with the
    arg scope in inference mode (`is_training=False`) and returns that
    endpoint.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name. The incoming value is not read; the network is
        built under the 'MobilenetV1' variable scope opened here.

    Returns:
      rpn_feature_map: The `Conv2d_11_pointwise` activation tensor
        (presumably [batch, height, width, depth] -- TODO confirm). Only
        this tensor is returned, not the full activations dictionary.

    Raises:
      InvalidArgumentError: At run time, if the spatial size of
        `preprocessed_inputs` (height or width) is less than 33.
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    tall_enough = tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33)
    wide_enough = tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)
    min_size_check = tf.Assert(
        tf.logical_and(tall_enough, wide_enough),
        ['image size must at least be 33 in both height and width.'])

    base_arg_scope = mobilenet_v1.mobilenet_v1_arg_scope(
        is_training=False, weight_decay=self._weight_decay)
    with tf.control_dependencies([min_size_check]):
        with slim.arg_scope(base_arg_scope):
            with tf.variable_scope('MobilenetV1',
                                   reuse=self._reuse_weights) as var_scope:
                # Optional override of the conv definitions when the last
                # stride should be skipped.
                extra_kwargs = {}
                if self._skip_last_stride:
                    extra_kwargs['conv_defs'] = (
                        _get_mobilenet_conv_no_last_stride_defs(
                            conv_depth_ratio_in_percentage=(
                                self._conv_depth_ratio_in_percentage)))
                _, end_points = mobilenet_v1.mobilenet_v1_base(
                    preprocessed_inputs,
                    final_endpoint='Conv2d_11_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=var_scope,
                    **extra_kwargs)
    return end_points['Conv2d_11_pointwise']
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    derives multi-resolution feature maps from its endpoints.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    # Empty 'from_layer' entries and -1 depths are interpreted by
    # multi_resolution_feature_maps (defined outside this block).
    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        with slim.arg_scope(
            mobilenet_v1.mobilenet_v1_arg_scope(
                is_training=None, regularize_depthwise=True)):
            # The user-supplied hyperparams override the MobileNet defaults
            # for the base network only when explicitly requested.
            with (slim.arg_scope(self._conv_hyperparams_fn())
                  if self._override_base_feature_extractor_hyperparams
                  else context_manager.IdentityContextManager()):
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    use_explicit_padding=self._use_explicit_padding,
                    scope=scope)
        with slim.arg_scope(self._conv_hyperparams_fn()):
            feature_maps = (
                feature_map_generators.multi_resolution_feature_maps(
                    feature_map_layout=feature_map_layout,
                    depth_multiplier=self._depth_multiplier,
                    min_depth=self._min_depth,
                    insert_1x1_conv=True,
                    image_features=image_features))

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(feature_maps.values())
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    derives multi-resolution feature maps from its endpoints. A runtime
    assertion requires the input height and width to be at least 33.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    # Empty 'from_layer' entries and -1 depths are interpreted by
    # multi_resolution_feature_maps (defined outside this block).
    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
    }

    with tf.control_dependencies([shape_assert]):
        with slim.arg_scope(self._conv_hyperparams):
            with slim.arg_scope([slim.batch_norm], fused=False):
                with tf.variable_scope('MobilenetV1',
                                       reuse=self._reuse_weights) as scope:
                    _, image_features = mobilenet_v1.mobilenet_v1_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='Conv2d_13_pointwise',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        scope=scope)
                    feature_maps = (
                        feature_map_generators.multi_resolution_feature_maps(
                            feature_map_layout=feature_map_layout,
                            depth_multiplier=self._depth_multiplier,
                            min_depth=self._min_depth,
                            insert_1x1_conv=True,
                            image_features=image_features))

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(feature_maps.values())
def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Extracts second stage box classifier features.

    Runs the last two entries of `mobilenet_v1._CONV_DEFS` on the cropped
    proposal features, ending at `Conv2d_13_pointwise`.

    Args:
      proposal_feature_maps: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
        representing the feature map cropped to each proposal.
      scope: A scope name; forwarded unchanged to
        `mobilenet_v1.mobilenet_v1_base`.

    Returns:
      proposal_classifier_features: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, height, width, depth]
        representing box classifier features for each proposal.
    """
    data_format = 'NHWC'

    with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights):
        with slim.arg_scope(
            [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
            stride=1,
            padding='SAME',
            data_format=data_format):
            with _batch_norm_arg_scope(
                [slim.conv2d, slim.separable_conv2d],
                batch_norm_scale=True,
                train_batch_norm=self._train_batch_norm):
                # NOTE(review): start_enum=12 presumably makes the two conv
                # defs be numbered 12 and 13 so that the final endpoint name
                # resolves -- confirm against mobilenet_v1_base.
                proposal_classifier_features, _ = (
                    mobilenet_v1.mobilenet_v1_base(
                        proposal_feature_maps,
                        final_endpoint='Conv2d_13_pointwise',
                        start_enum=12,
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        conv_defs=mobilenet_v1._CONV_DEFS[-2:],
                        scope=scope))
    return proposal_classifier_features
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    derives multi-resolution feature maps from its endpoints. A runtime
    assertion requires the input to be exactly 256x256.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.equal(tf.shape(preprocessed_inputs)[1], 256),
            tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
        ['image size must be 256 in both height and width.'])

    # Empty 'from_layer' entries and -1 depths/kernel sizes are interpreted
    # by multi_resolution_feature_maps (defined outside this block).
    feature_map_layout = {
        'from_layer': [
            'Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', ''
        ],
        'layer_depth': [-1, -1, 512, 256, 256],
        'conv_kernel_size': [-1, -1, 3, 3, 2],
    }

    with tf.control_dependencies([shape_assert]):
        with slim.arg_scope(self._conv_hyperparams):
            with tf.variable_scope('MobilenetV1',
                                   reuse=self._reuse_weights) as scope:
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)
                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(feature_maps.values())
def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.

    Builds the MobilenetV1 base network up to `Conv2d_11_pointwise` and
    returns that endpoint together with all endpoints built on the way.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name. The incoming value is not read; the network is
        built under the 'MobilenetV1' variable scope opened here.

    Returns:
      rpn_feature_map: The `Conv2d_11_pointwise` activation tensor
        (presumably [batch, height, width, depth] -- TODO confirm).
      activations: A dictionary mapping feature extractor tensor names to
        tensors

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33 (presumably raised through
        `shape_utils.check_min_image_dim` -- TODO confirm).
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    checked_inputs = shape_utils.check_min_image_dim(
        min_dim=33, image_tensor=preprocessed_inputs)

    base_arg_scope = mobilenet_v1.mobilenet_v1_arg_scope(
        is_training=self._train_batch_norm,
        weight_decay=self._weight_decay)
    with slim.arg_scope(base_arg_scope):
        with tf.variable_scope('MobilenetV1',
                               reuse=self._reuse_weights) as var_scope:
            # Optional override of the conv definitions when the last stride
            # should be skipped.
            extra_kwargs = {}
            if self._skip_last_stride:
                extra_kwargs['conv_defs'] = (
                    _get_mobilenet_conv_no_last_stride_defs(
                        conv_depth_ratio_in_percentage=(
                            self._conv_depth_ratio_in_percentage)))
            _, end_points = mobilenet_v1.mobilenet_v1_base(
                checked_inputs,
                final_endpoint='Conv2d_11_pointwise',
                min_depth=self._min_depth,
                depth_multiplier=self._depth_multiplier,
                scope=var_scope,
                **extra_kwargs)
    rpn_feature_map = end_points['Conv2d_11_pointwise']
    return rpn_feature_map, end_points
def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.

    Builds the MobilenetV1 base network up to `Conv2d_11_pointwise` and
    returns that endpoint together with all endpoints built on the way.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name. The incoming value is not read; the network is
        built under the 'MobilenetV1' variable scope opened here.

    Returns:
      rpn_feature_map: The `Conv2d_11_pointwise` activation tensor
        (presumably [batch, height, width, depth] -- TODO confirm).
      activations: A dictionary mapping feature extractor tensor names to
        tensors

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33 (presumably raised through
        `shape_utils.check_min_image_dim` -- TODO confirm).
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    checked_inputs = shape_utils.check_min_image_dim(
        min_dim=33, image_tensor=preprocessed_inputs)

    base_arg_scope = mobilenet_v1.mobilenet_v1_arg_scope(
        is_training=self._train_batch_norm,
        weight_decay=self._weight_decay)
    with slim.arg_scope(base_arg_scope):
        with tf.variable_scope('MobilenetV1',
                               reuse=self._reuse_weights) as var_scope:
            # Optional override of the conv definitions when the last stride
            # should be skipped.
            extra_kwargs = {}
            if self._skip_last_stride:
                extra_kwargs['conv_defs'] = (
                    _get_mobilenet_conv_no_last_stride_defs(
                        conv_depth_ratio_in_percentage=(
                            self._conv_depth_ratio_in_percentage)))
            _, end_points = mobilenet_v1.mobilenet_v1_base(
                checked_inputs,
                final_endpoint='Conv2d_11_pointwise',
                min_depth=self._min_depth,
                depth_multiplier=self._depth_multiplier,
                scope=var_scope,
                **extra_kwargs)
    rpn_feature_map = end_points['Conv2d_11_pointwise']
    return rpn_feature_map, end_points
def segMnet(inputs, multiplier):
    """Builds a two-channel segmentation head on top of MobilenetV1.

    The backbone output feeds five parallel branches (an average-pooled,
    convolved and bilinearly resized branch, one `conv2d` branch and three
    `atrous_conv2d` branches). The branches are concatenated on axis 3,
    combined by another `conv2d`, and upsampled by a transposed convolution
    with spatial stride 32 to the height and width of `inputs`.

    Args:
      inputs: image batch tensor handed to `mobilenet_v1_base`; its first
        three dynamic dimensions define the output shape.
      multiplier: depth multiplier for the backbone; also scales the
        1024-channel width used by every branch.

    Returns:
      The output of `tf.nn.conv2d_transpose`, requested with shape
      [inputs.shape[0], inputs.shape[1], inputs.shape[2], 2].
    """
    features, _ = mobilenet_v1.mobilenet_v1_base(
        inputs, depth_multiplier=multiplier)
    filters = int(1024 * multiplier)

    # NOTE: scope names (including the 'gobal' spelling) determine variable
    # names and are kept exactly as they were.
    with tf.variable_scope('gobal_average_pool'):
        feat_shape = tf.shape(features)
        window = [feat_shape[1], feat_shape[2]]
        pooled = tf.layers.average_pooling2d(
            features, window, [feat_shape[1], feat_shape[2]])
    with tf.variable_scope('gap_conv'):
        pooled_conv = batch_norm(conv2d(pooled, filters, filters, 1, 1))
    with tf.variable_scope('gap_resize'):
        feat_shape = tf.shape(features)
        branches = [tf.image.resize_bilinear(
            pooled_conv, [feat_shape[1], feat_shape[2]])]

    with tf.variable_scope('atrous0'):
        branches.append(batch_norm(conv2d(features, filters, filters, 1, 1)))
    # Last argument is presumably the dilation rate -- TODO confirm against
    # the atrous_conv2d helper.
    for branch_id, last_arg in enumerate((6, 12, 18), start=1):
        with tf.variable_scope('atrous%d' % branch_id):
            branches.append(batch_norm(
                atrous_conv2d(features, filters, filters, 3, last_arg)))

    with tf.variable_scope('concat'):
        merged = tf.concat(branches, 3)
    with tf.variable_scope('combine_conv'):
        merged = batch_norm(conv2d(merged, filters * 5, filters, 1, 1))

    with tf.variable_scope('deconv'):
        kernel_shape = [64, 64, 2, filters]
        upsample_strides = [1, 32, 32, 1]
        xavier = tf.contrib.layers.xavier_initializer()
        kernel = tf.Variable(xavier(kernel_shape))
        in_shape = tf.shape(inputs)
        target_shape = tf.stack(
            [in_shape[0], in_shape[1], in_shape[2], 2])
        logits = tf.nn.conv2d_transpose(
            merged, kernel, target_shape,
            strides=upsample_strides, padding='SAME',
            name='conv_transpose')
    return logits
def segMnet(inputs, multiplier):
    """Builds a two-channel segmentation output directly on MobilenetV1.

    The backbone output is upsampled by a single transposed convolution with
    spatial stride 32 to the height and width of `inputs`.

    Args:
      inputs: image batch tensor handed to `mobilenet_v1_base`; its first
        three dynamic dimensions define the output shape.
      multiplier: depth multiplier for the backbone; also scales the
        1024-channel input width assumed by the transposed-conv kernel.

    Returns:
      The output of `tf.nn.conv2d_transpose`, requested with shape
      [inputs.shape[0], inputs.shape[1], inputs.shape[2], 2].
    """
    features, _ = mobilenet_v1.mobilenet_v1_base(
        inputs, depth_multiplier=multiplier)
    filters = int(1024 * multiplier)

    with tf.variable_scope('deconv'):
        kernel_shape = [64, 64, 2, filters]
        upsample_strides = [1, 32, 32, 1]
        xavier = tf.contrib.layers.xavier_initializer()
        kernel = tf.Variable(xavier(kernel_shape))
        in_shape = tf.shape(inputs)
        target_shape = tf.stack(
            [in_shape[0], in_shape[1], in_shape[2], 2])
        logits = tf.nn.conv2d_transpose(
            features, kernel, target_shape,
            strides=upsample_strides, padding='SAME',
            name='conv_transpose')
    return logits
def testBuildCustomNetworkUsingConvDefs(self):
    """Checks that a custom `conv_defs` list drives the built network."""
    batch_size = 5
    rows, cols = 224, 224
    kernel_3x3 = [3, 3]
    custom_defs = [mobilenet_v1.Conv(kernel=kernel_3x3, stride=2, depth=32)]
    for stride, depth in ((1, 64), (2, 128), (1, 512)):
        custom_defs.append(
            mobilenet_v1.DepthSepConv(
                kernel=[3, 3], stride=stride, depth=depth))

    images = tf.random_uniform((batch_size, rows, cols, 3))
    net, built = mobilenet_v1.mobilenet_v1_base(
        images, final_endpoint='Conv2d_3_pointwise', conv_defs=custom_defs)

    self.assertTrue(net.op.name.startswith('MobilenetV1/Conv2d_3'))
    self.assertListEqual(net.get_shape().as_list(),
                         [batch_size, 56, 56, 512])
    # 'Conv2d_0' followed by the depthwise/pointwise pair of blocks 1..3.
    wanted = ['Conv2d_0']
    for block in range(1, 4):
        wanted.append('Conv2d_%d_depthwise' % block)
        wanted.append('Conv2d_%d_pointwise' % block)
    self.assertItemsEqual(built.keys(), wanted)
def testBuildCustomNetworkUsingConvDefs(self):
    """Checks that a custom `conv_defs` list drives the built network."""
    batch_size = 5
    rows, cols = 224, 224
    kernel_3x3 = [3, 3]
    custom_defs = [mobilenet_v1.Conv(kernel=kernel_3x3, stride=2, depth=32)]
    for stride, depth in ((1, 64), (2, 128), (1, 512)):
        custom_defs.append(
            mobilenet_v1.DepthSepConv(
                kernel=[3, 3], stride=stride, depth=depth))

    images = tf.random_uniform((batch_size, rows, cols, 3))
    net, built = mobilenet_v1.mobilenet_v1_base(
        images, final_endpoint='Conv2d_3_pointwise', conv_defs=custom_defs)

    self.assertTrue(net.op.name.startswith('MobilenetV1/Conv2d_3'))
    self.assertListEqual(net.get_shape().as_list(),
                         [batch_size, 56, 56, 512])
    # 'Conv2d_0' followed by the depthwise/pointwise pair of blocks 1..3.
    wanted = ['Conv2d_0']
    for block in range(1, 4):
        wanted.append('Conv2d_%d_depthwise' % block)
        wanted.append('Conv2d_%d_pointwise' % block)
    self.assertItemsEqual(built.keys(), wanted)
def make_network(self, is_train):
    """Builds the input placeholders, the network and its loss or outputs.

    In training mode, placeholders are created for the image batch, four
    heatmap label tensors and the per-keypoint validity flags, and the loss
    is the mean squared error between the head output and `label7`, masked
    by `valids > 0.1`. Otherwise only an image placeholder is created and
    the head output is registered as the network output.

    Args:
      is_train: whether to build the training graph (labels and loss) or
        the inference graph (outputs only).
    """
    if is_train:
        image = tf.placeholder(
            tf.float32, shape=[cfg.batch_size, *cfg.data_shape, 3])
        label15 = tf.placeholder(
            tf.float32,
            shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
        label11 = tf.placeholder(
            tf.float32,
            shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
        label9 = tf.placeholder(
            tf.float32,
            shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
        label7 = tf.placeholder(
            tf.float32,
            shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
        valids = tf.placeholder(
            tf.float32, shape=[cfg.batch_size, cfg.nr_skeleton])
        # All four label placeholders are registered as inputs, although
        # only label7 contributes to the loss below.
        self.set_inputs(image, label15, label11, label9, label7, valids)
    else:
        image = tf.placeholder(
            tf.float32, shape=[None, *cfg.data_shape, 3])
        self.set_inputs(image)

    mobilenet_fms, _ = mobilenet_v1_base(image)
    heatmap_outs = self.head_net(mobilenet_fms, is_train)

    # make loss
    if is_train:
        # Zero out label channels whose validity flag is not above 0.1.
        valid_mask = tf.to_float(
            tf.greater(
                tf.reshape(valids, (-1, 1, 1, cfg.nr_skeleton)), 0.1))
        label = label7 * valid_mask
        loss = tf.reduce_mean(tf.square(heatmap_outs - label))
        self.add_tower_summary('loss', loss)
        self.set_loss(loss)
    else:
        self.set_outputs(heatmap_outs)
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Builds the MobilenetV1 base network up to `Conv2d_13_pointwise` and
    passes its endpoints to `self._topdown_feature_maps`. A runtime
    assertion requires the input height and width to be at least 33.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    bottomup_features_names = [
        'Conv2d_11_pointwise', 'Conv2d_13_pointwise'
    ]
    num_appended_layers = 4
    appended_channel_num = [512, 256, 256, 256]

    with tf.control_dependencies([shape_assert]):
        with slim.arg_scope(self._conv_hyperparams):
            with tf.variable_scope('MobilenetV1',
                                   reuse=self._reuse_weights) as scope:
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    preprocessed_inputs,
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)
                topdown_features = self._topdown_feature_maps(
                    image_features,
                    bottomup_features_names=bottomup_features_names,
                    num_appended_layers=num_appended_layers,
                    appended_channel_num=appended_channel_num)

    # On Python 3 `.values()` is a view, not the list the docstring promises;
    # materialise it so callers can index and take len().
    return list(topdown_features.values())
def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Build the MobileNet-v1 trunk used for first-stage RPN features.

    Args:
        preprocessed_inputs: A [batch, height, width, channels] float32
            tensor representing a batch of images.
        scope: A scope name. It is not read here: the trunk is always built
            under the 'MobilenetV1' variable scope.

    Returns:
        rpn_feature_map: the 'Conv2d_11_pointwise' activation.
        activations: A dictionary mapping feature extractor tensor names to
            tensors.

    Raises:
        ValueError: If `preprocessed_inputs` is not rank 4.
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)

    # Graph-time assertion on the dynamic height and width.
    height_ok = tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33)
    width_ok = tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)
    min_size_check = tf.Assert(
        tf.logical_and(height_ok, width_ok),
        ['image size must at least be 33 in both height and width.'])

    with tf.control_dependencies([min_size_check]):
        trunk_arg_scope = mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=self._train_batch_norm,
            weight_decay=self._weight_decay)
        with slim.arg_scope(trunk_arg_scope):
            with tf.variable_scope(
                    'MobilenetV1',
                    reuse=self._reuse_weights) as mobilenet_scope:
                _, end_points = mobilenet_v1.mobilenet_v1_base(
                    preprocessed_inputs,
                    final_endpoint='Conv2d_11_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=mobilenet_scope)

    rpn_feature_map = end_points['Conv2d_11_pointwise']
    return rpn_feature_map, end_points
def testBuildBaseNetwork(self):
    """The default base network ends at Conv2d_13 and exposes all 27 end points."""
    batch_size = 5
    height, width = 224, 224
    inputs = tf.random_uniform((batch_size, height, width, 3))

    net, end_points = mobilenet_v1.mobilenet_v1_base(inputs)

    self.assertTrue(net.op.name.startswith('MobilenetV1/Conv2d_13'))
    self.assertListEqual(net.get_shape().as_list(),
                         [batch_size, 7, 7, 1024])

    # One plain convolution followed by 13 depthwise/pointwise pairs.
    expected_endpoints = ['Conv2d_0']
    for layer in range(1, 14):
        expected_endpoints.append('Conv2d_%d_depthwise' % layer)
        expected_endpoints.append('Conv2d_%d_pointwise' % layer)
    self.assertItemsEqual(end_points.keys(), expected_endpoints)
def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Build the MobileNet-v1 trunk and return its RPN feature map.

    Args:
        preprocessed_inputs: A [batch, height, width, channels] float32
            tensor representing a batch of images.
        scope: A scope name. It is not read here: the trunk is always built
            under the 'MobilenetV1' variable scope.

    Returns:
        rpn_feature_map: the 'Conv2d_11_pointwise' activation.

    Raises:
        ValueError: If `preprocessed_inputs` is not rank 4.
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)

    # Graph-time assertion on the dynamic height and width.
    tall_enough = tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33)
    wide_enough = tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)
    size_guard = tf.Assert(
        tf.logical_and(tall_enough, wide_enough),
        ['image size must at least be 33 in both height and width.'])

    with tf.control_dependencies([size_guard]):
        with tf.variable_scope(
                'MobilenetV1', reuse=self._reuse_weights) as trunk_scope:
            normalized_ops = [slim.conv2d, slim.separable_conv2d]
            with _batch_norm_arg_scope(
                    normalized_ops,
                    batch_norm_scale=True,
                    train_batch_norm=self._train_batch_norm):
                _, end_points = mobilenet_v1.mobilenet_v1_base(
                    preprocessed_inputs,
                    final_endpoint='Conv2d_11_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=trunk_scope)

    return end_points['Conv2d_11_pointwise']
def extract_features(self, preprocessed_inputs):
    """Extract pooling-pyramid feature maps from preprocessed inputs.

    The MobileNet-v1 base network is built up to 'Conv2d_13_pointwise' and
    its 'Conv2d_11_pointwise' activation is handed to
    ``feature_map_generators.pooling_pyramid_feature_maps``.

    Args:
        preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

    Returns:
        feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        with slim.arg_scope(
                mobilenet_v1.mobilenet_v1_arg_scope(
                    is_training=None, regularize_depthwise=True)):
            # The user-supplied hyperparameters replace the MobileNet
            # defaults for the base network only when explicitly requested.
            with (slim.arg_scope(self._conv_hyperparams_fn())
                  if self._override_base_feature_extractor_hyperparams
                  else context_manager.IdentityContextManager()):
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    use_explicit_padding=self._use_explicit_padding,
                    scope=scope)
        with slim.arg_scope(self._conv_hyperparams_fn()):
            feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
                base_feature_map_depth=0,
                num_layers=6,
                image_features={
                    'image_features': image_features['Conv2d_11_pointwise']
                })

    # On Python 3 ``.values()`` is a view that cannot be indexed; materialise
    # it so callers really receive the documented list.
    return list(feature_maps.values())
def testBuildOnlyUptoFinalEndpoint(self):
    """Each `final_endpoint` stops construction at exactly that layer."""
    batch_size = 5
    height, width = 224, 224

    # One plain convolution followed by 13 depthwise/pointwise pairs.
    endpoints = ['Conv2d_0']
    for layer in range(1, 14):
        endpoints.append('Conv2d_%d_depthwise' % layer)
        endpoints.append('Conv2d_%d_pointwise' % layer)

    for built_count, endpoint in enumerate(endpoints, start=1):
        # A fresh graph per end point keeps the builds independent.
        with tf.Graph().as_default():
            inputs = tf.random_uniform((batch_size, height, width, 3))
            out_tensor, end_points = mobilenet_v1.mobilenet_v1_base(
                inputs, final_endpoint=endpoint)
            expected_prefix = 'MobilenetV1/' + endpoint
            self.assertTrue(out_tensor.op.name.startswith(expected_prefix))
            self.assertItemsEqual(endpoints[:built_count],
                                  end_points.keys())
def testBuildBaseNetwork(self):
    """Check the output name, output shape and end-point keys of the base net."""
    batch_size = 5
    height, width = 224, 224
    inputs = tf.random_uniform((batch_size, height, width, 3))
    net, end_points = mobilenet_v1.mobilenet_v1_base(inputs)

    output_name = net.op.name
    self.assertTrue(output_name.startswith('MobilenetV1/Conv2d_13'))

    output_shape = net.get_shape().as_list()
    self.assertListEqual(output_shape, [batch_size, 7, 7, 1024])

    # Layers 1..13 each contribute a depthwise and a pointwise end point.
    separable_names = [
        'Conv2d_{}_{}'.format(layer, kind)
        for layer in range(1, 14)
        for kind in ('depthwise', 'pointwise')
    ]
    expected_endpoints = ['Conv2d_0'] + separable_names
    self.assertItemsEqual(end_points.keys(), expected_endpoints)
def mobilenet_v1_l2norm(inputs,
                        num_classes=1000,
                        dropout_keep_prob=0.999,
                        is_training=True,
                        min_depth=8,
                        depth_multiplier=1.0,
                        conv_defs=None,
                        prediction_fn=tf.contrib.layers.softmax,
                        spatial_squeeze=True,
                        reuse=None,
                        scope='MobilenetV1',
                        global_pool=False,
                        initial_scale=10.):
    """Mobilenet v1 classifier with L2-normalized features and scaled logits.

    After pooling, the features go through a 1x1 convolution and are
    L2-normalized along the channel axis. The logits are produced by
    ``conv2d_l2norm.conv2d`` (called with ``weight_norm=True``) and multiplied
    by a trainable scalar ``scale_factor``.

    Args:
        inputs: a tensor of shape [batch_size, height, width, channels].
        num_classes: number of predicted classes. If 0 or None, the logits
            layer is omitted and the L2-normalized features (before dropout)
            are returned instead.
        dropout_keep_prob: the percentage of activation values that are
            retained.
        is_training: whether is training or not.
        min_depth: Minimum depth value (number of channels) for all
            convolution ops. Enforced when depth_multiplier < 1, and not an
            active constraint when depth_multiplier >= 1.
        depth_multiplier: Float multiplier for the depth (number of channels)
            for all convolution ops. The value must be greater than zero.
            Typical usage will be to set this value in (0, 1) to reduce the
            number of parameters or computation cost of the model.
        conv_defs: A list of ConvDef namedtuples specifying the net
            architecture.
        prediction_fn: a function to get predictions out of logits.
        spatial_squeeze: if True, logits is of shape [B, C]; if False, logits
            is of shape [B, 1, 1, C], where B is batch_size and C is number
            of classes.
        reuse: whether or not the network and its variables should be reused.
            To be able to reuse, 'scope' must be given.
        scope: Optional variable_scope.
        global_pool: Optional boolean flag to control the avgpooling before
            the logits layer. If false or unset, pooling is done with a fixed
            window that reduces default-sized inputs to 1x1, while larger
            inputs lead to larger outputs. If true, any input size is pooled
            down to 1x1.
        initial_scale: initial value of the trainable scalar that multiplies
            the logits.

    Returns:
        net: the logits (pre-softmax activations) if num_classes is a
            non-zero integer, or the L2-normalized, non-dropped-out features
            if num_classes is 0 or None.
        end_points: a dictionary from components of the network to the
            corresponding activation.

    Raises:
        ValueError: Input rank is invalid.
    """
    input_shape = inputs.get_shape().as_list()
    if len(input_shape) != 4:
        raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
                         len(input_shape))

    with tf.variable_scope(scope, 'MobilenetV1', [inputs],
                           reuse=reuse) as scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            net, end_points = mobilenet_v1.mobilenet_v1_base(
                inputs,
                scope=scope,
                min_depth=min_depth,
                depth_multiplier=depth_multiplier,
                conv_defs=conv_defs)
            with tf.variable_scope('Logits'):
                if global_pool:
                    # Global average pooling. `keepdims` replaces the
                    # deprecated `keep_dims` spelling.
                    net = tf.reduce_mean(
                        net, [1, 2], keepdims=True, name='global_pool')
                    end_points['global_pool'] = net
                else:
                    # Pooling with a fixed kernel size.
                    kernel_size = _reduced_kernel_size_for_small_input(
                        net, [7, 7])
                    net = slim.avg_pool2d(
                        net, kernel_size, padding='VALID',
                        scope='AvgPool_1a')
                    end_points['AvgPool_1a'] = net

                # Linear 1x1 projection (no activation, no normalizer)
                # followed by per-position L2 normalization over channels.
                net = slim.conv2d(
                    net,
                    num_outputs=1024,
                    kernel_size=[1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='Conv2d_1c_1x1_negative')
                net = tf.nn.l2_normalize(net, axis=-1, name='Normalize_1a')
                end_points['Normalize_1a'] = net
                if not num_classes:
                    return net, end_points

                net = slim.dropout(
                    net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
                logits = conv2d_l2norm.conv2d(
                    net,
                    num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='Conv2d_1d_1x1_normalize',
                    weight_norm=True)
                # Trainable scalar applied to the logits.
                scale_factor = tf.get_variable(
                    'scale_factor', [],
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(initial_scale),
                    trainable=True)
                slim.summaries.add_scalar_summary(
                    scale_factor, 'scale_factor', 'scale_factors')
                logits = tf.multiply(logits, scale_factor)
                if spatial_squeeze:
                    logits = tf.squeeze(
                        logits, [1, 2], name='SpatialSqueeze')
            end_points['Logits'] = logits
            if prediction_fn:
                end_points['Predictions'] = prediction_fn(
                    logits, scope='Predictions')
    return logits, end_points
def _construct_model(model_type='resnet_v1_50'):
    """Build the requested CNN on a placeholder input and return its end points.

    Args:
        model_type: Name of the architecture to build.

    Returns:
        end_points: A dictionary from components of the network to the
            corresponding activations.

    Raises:
        ValueError: If the model_type is not supported.
    """
    # Placeholder input; it is created before model_type is validated.
    images = array_ops.placeholder(
        dtypes.float32, shape=(1, None, None, 3), name=_INPUT_NODE)

    def _headless_resnet(build_fn):
        # Every resnet variant is built without classifier or global pooling.
        return build_fn(
            images, num_classes=None, is_training=False, global_pool=False)

    # (model_type, builder) pairs. The builders are thunks, so only the
    # selected network is constructed.
    builders = (
        ('inception_resnet_v2',
         lambda: inception.inception_resnet_v2_base(images)),
        ('inception_resnet_v2-same',
         lambda: inception.inception_resnet_v2_base(
             images, align_feature_maps=True)),
        ('inception_v2',
         lambda: inception.inception_v2_base(images)),
        ('inception_v2-no-separable-conv',
         lambda: inception.inception_v2_base(
             images, use_separable_conv=False)),
        ('inception_v3',
         lambda: inception.inception_v3_base(images)),
        ('inception_v4',
         lambda: inception.inception_v4_base(images)),
        ('alexnet_v2',
         lambda: alexnet.alexnet_v2(images)),
        ('vgg_a',
         lambda: vgg.vgg_a(images)),
        ('vgg_16',
         lambda: vgg.vgg_16(images)),
        ('mobilenet_v1',
         lambda: mobilenet_v1.mobilenet_v1_base(images)),
        ('mobilenet_v1_075',
         lambda: mobilenet_v1.mobilenet_v1_base(
             images, depth_multiplier=0.75)),
        ('resnet_v1_50',
         lambda: _headless_resnet(resnet_v1.resnet_v1_50)),
        ('resnet_v1_101',
         lambda: _headless_resnet(resnet_v1.resnet_v1_101)),
        ('resnet_v1_152',
         lambda: _headless_resnet(resnet_v1.resnet_v1_152)),
        ('resnet_v1_200',
         lambda: _headless_resnet(resnet_v1.resnet_v1_200)),
        ('resnet_v2_50',
         lambda: _headless_resnet(resnet_v2.resnet_v2_50)),
        ('resnet_v2_101',
         lambda: _headless_resnet(resnet_v2.resnet_v2_101)),
        ('resnet_v2_152',
         lambda: _headless_resnet(resnet_v2.resnet_v2_152)),
        ('resnet_v2_200',
         lambda: _headless_resnet(resnet_v2.resnet_v2_200)),
    )

    for supported_type, build in builders:
        if model_type == supported_type:
            _, end_points = build()
            return end_points
    raise ValueError('Unsupported model_type %s.' % model_type)
def extract_features(self,
                     preprocessed_inputs,
                     state_saver=None,
                     state_name='lstm_state',
                     unroll_length=5,
                     scope=None):
    """Extracts features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <parent scope>/MobilenetV1/...
    <parent scope>/LSTM/...
    <parent scope>/FeatureMaps/...

    Args:
        preprocessed_inputs: A [batch, height, width, channels] float tensor
            representing a batch of consecutive frames from video clips.
        state_saver: A state saver object with methods `state` and
            `save_state`.
        state_name: A python string for the name to use with the state_saver.
        unroll_length: The number of steps to unroll the lstm.
        scope: The scope for the base network of the feature extractor.

    Returns:
        A list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with slim.arg_scope(
            mobilenet_v1.mobilenet_v1_arg_scope(
                is_training=self._is_training)):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams
              else context_manager.IdentityContextManager()):
            with slim.arg_scope([slim.batch_norm], fused=False):
                # Base network.
                with tf.variable_scope(
                        scope, self._base_network_scope,
                        reuse=self._reuse_weights) as scope:
                    net, image_features = mobilenet_v1.mobilenet_v1_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='Conv2d_13_pointwise',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
        with slim.arg_scope([slim.batch_norm],
                            fused=False,
                            is_training=self._is_training):
            # ConvLSTM layers.
            with tf.variable_scope(
                    'LSTM', reuse=self._reuse_weights) as lstm_scope:
                lstm_cell = lstm_cells.BottleneckConvLSTMCell(
                    filter_size=(3, 3),
                    output_size=(net.shape[1].value, net.shape[2].value),
                    num_units=max(self._min_depth, self._lstm_state_depth),
                    activation=tf.nn.relu6,
                    visualize_gates=True)

                # Split the batch axis into `unroll_length` chunks, one per
                # unrolled LSTM step.
                net_seq = list(tf.split(net, unroll_length))
                if state_saver is None:
                    init_state = lstm_cell.init_state(
                        state_name, net.shape[0].value / unroll_length,
                        tf.float32)
                else:
                    c = state_saver.state('%s_c' % state_name)
                    h = state_saver.state('%s_h' % state_name)
                    init_state = (c, h)

                # Identities added for inputing state tensors externally.
                c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
                h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
                init_state = (c_ident, h_ident)

                net_seq, states_out = rnn_decoder.rnn_decoder(
                    net_seq, init_state, lstm_cell, scope=lstm_scope)
                batcher_ops = None
                self._states_out = states_out
                if state_saver is not None:
                    self._step = state_saver.state('%s_step' % state_name)
                    batcher_ops = [
                        state_saver.save_state('%s_c' % state_name,
                                               states_out[-1][0]),
                        state_saver.save_state('%s_h' % state_name,
                                               states_out[-1][1]),
                        state_saver.save_state('%s_step' % state_name,
                                               self._step - 1)
                    ]
                # The concatenated LSTM output depends on the state-saving
                # ops, so running it also saves the state.
                with tf_ops.control_dependencies(batcher_ops):
                    image_features['Conv2d_13_pointwise_lstm'] = tf.concat(
                        net_seq, 0)

                # Identities added for reading output states, to be reused
                # externally.
                tf.identity(states_out[-1][0], name='lstm_state_out_c')
                tf.identity(states_out[-1][1], name='lstm_state_out_h')

            # SSD layers.
            with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=self._feature_map_layout,
                        depth_multiplier=(self._depth_multiplier),
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 ``.values()`` is a view that cannot be indexed; materialise
    # it so callers really receive the documented list.
    return list(feature_maps.values())
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed 256x256 inputs.

    Args:
        preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

    Returns:
        feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]

    Raises:
        ValueError: if the statically known image height or width is not
            256 pixels.
    """
    image_shape = preprocessed_inputs.get_shape()
    image_shape.assert_has_rank(4)
    image_height = image_shape[1].value
    image_width = image_shape[2].value

    if image_height is None or image_width is None:
        # Size unknown at graph-construction time: attach an assertion to
        # the input so the check happens when the graph is executed.
        shape_assert = tf.Assert(
            tf.logical_and(
                tf.equal(tf.shape(preprocessed_inputs)[1], 256),
                tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
            ['image size must be 256 in both height and width.'])
        with tf.control_dependencies([shape_assert]):
            preprocessed_inputs = tf.identity(preprocessed_inputs)
    elif image_height != 256 or image_width != 256:
        raise ValueError(
            'image size must be = 256 in both height and width;'
            ' image dim = %d,%d' % (image_height, image_width))

    feature_map_layout = {
        'from_layer': [
            'Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', ''
        ],
        'layer_depth': [-1, -1, 512, 256, 256],
        'conv_kernel_size': [-1, -1, 3, 3, 2],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        with slim.arg_scope(
                mobilenet_v1.mobilenet_v1_arg_scope(is_training=None)):
            # The user-supplied hyperparameters replace the MobileNet
            # defaults for the base network only when explicitly requested.
            with (slim.arg_scope(self._conv_hyperparams_fn())
                  if self._override_base_feature_extractor_hyperparams
                  else context_manager.IdentityContextManager()):
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    use_explicit_padding=self._use_explicit_padding,
                    scope=scope)
        with slim.arg_scope(self._conv_hyperparams_fn()):
            feature_maps = (
                feature_map_generators.multi_resolution_feature_maps(
                    feature_map_layout=feature_map_layout,
                    depth_multiplier=self._depth_multiplier,
                    min_depth=self._min_depth,
                    insert_1x1_conv=True,
                    image_features=image_features))

    # On Python 3 ``.values()`` is a view that cannot be indexed; materialise
    # it so callers really receive the documented list.
    return list(feature_maps.values())
def _construct_model(model_type='resnet_v1_50'):
    """Instantiate one supported CNN and return its end-point dictionary.

    Args:
        model_type: Type of model to be used.

    Returns:
        end_points: A dictionary from components of the network to the
            corresponding activations.

    Raises:
        ValueError: If the model_type is not supported.
    """
    # Placeholder input; it is created before model_type is validated.
    images = array_ops.placeholder(
        dtypes.float32, shape=(1, None, None, 3), name=_INPUT_NODE)

    # The resnet builders are named exactly like their model_type and all
    # take the same keyword arguments, so they are looked up by name.
    resnet_v1_types = ('resnet_v1_50', 'resnet_v1_101', 'resnet_v1_152',
                       'resnet_v1_200')
    resnet_v2_types = ('resnet_v2_50', 'resnet_v2_101', 'resnet_v2_152',
                       'resnet_v2_200')
    resnet_kwargs = {
        'num_classes': None,
        'is_training': False,
        'global_pool': False,
    }

    if model_type in resnet_v1_types:
        build_resnet = getattr(resnet_v1, model_type)
        outputs = build_resnet(images, **resnet_kwargs)
    elif model_type in resnet_v2_types:
        build_resnet = getattr(resnet_v2, model_type)
        outputs = build_resnet(images, **resnet_kwargs)
    elif model_type == 'inception_resnet_v2':
        outputs = inception.inception_resnet_v2_base(images)
    elif model_type == 'inception_resnet_v2-same':
        outputs = inception.inception_resnet_v2_base(
            images, align_feature_maps=True)
    elif model_type == 'inception_v2':
        outputs = inception.inception_v2_base(images)
    elif model_type == 'inception_v2-no-separable-conv':
        outputs = inception.inception_v2_base(
            images, use_separable_conv=False)
    elif model_type == 'inception_v3':
        outputs = inception.inception_v3_base(images)
    elif model_type == 'inception_v4':
        outputs = inception.inception_v4_base(images)
    elif model_type == 'alexnet_v2':
        outputs = alexnet.alexnet_v2(images)
    elif model_type == 'vgg_a':
        outputs = vgg.vgg_a(images)
    elif model_type == 'vgg_16':
        outputs = vgg.vgg_16(images)
    elif model_type == 'mobilenet_v1':
        outputs = mobilenet_v1.mobilenet_v1_base(images)
    elif model_type == 'mobilenet_v1_075':
        outputs = mobilenet_v1.mobilenet_v1_base(
            images, depth_multiplier=0.75)
    else:
        raise ValueError('Unsupported model_type %s.' % model_type)

    # Each builder returns (net, end_points); only the end points are kept.
    _, end_points = outputs
    return end_points
import tensorflow as tf
from nets import mobilenet_v1

# Build the MobileNet-v1 base network on a fixed-size input and print the
# static shape of its output and of every end point.
# The placeholder is named `inputs` so the builtin `input` is not shadowed.
inputs = tf.placeholder("float32", shape=[1, 1000, 800, 3])
net, features = mobilenet_v1.mobilenet_v1_base(inputs)

shape = net.get_shape().as_list()
print("Network shape", shape)
print("There are %d endpoints" % len(features))
for key in sorted(features):
    shape = features[key].get_shape().as_list()
    print("\t%s shape:" % key, shape)
def extract_features(self, preprocessed_inputs):
    """Extract SSD feature maps from preprocessed inputs using MobileNet-v1.

    Args:
        preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

    Returns:
        feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    feature_map_layout = {
        # The first two maps are taken from the MobileNet end points; the
        # four empty names ask the generator to create additional maps.
        'from_layer': [
            'Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', '', ''
        ],
        # -1 keeps the depth of the source layer as it is.
        'layer_depth': [-1, -1, 512, 256, 256, 128],
    }

    with tf.control_dependencies([shape_assert]):
        # Apply the configured convolution hyperparameters to every layer
        # created below.
        with slim.arg_scope(self._conv_hyperparams):
            with tf.variable_scope('MobilenetV1',
                                   reuse=self._reuse_weights) as scope:
                # `image_features` is a dictionary mapping end-point names
                # to their feature maps.
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    preprocessed_inputs,
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)
                # Select the requested end points and build the additional
                # maps described by the layout.
                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 ``.values()`` is a view that cannot be indexed; materialise
    # it so callers really receive the documented list.
    return list(feature_maps.values())
def extract_features(self, preprocessed_inputs):
    """Build MobileNet-v1 FPN feature maps for the configured pyramid levels.

    Args:
        preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

    Returns:
        feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    def _scaled_depth(depth):
        # Apply the depth multiplier, never dropping below the minimum depth.
        return max(int(depth * self._depth_multiplier), self._min_depth)

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        with slim.arg_scope(
                mobilenet_v1.mobilenet_v1_arg_scope(
                    is_training=None, regularize_depthwise=True)):
            if self._override_base_feature_extractor_hyperparams:
                base_scope = slim.arg_scope(self._conv_hyperparams_fn())
            else:
                base_scope = context_manager.IdentityContextManager()
            with base_scope:
                padded_inputs = ops.pad_to_multiple(
                    preprocessed_inputs, self._pad_to_multiple)
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    padded_inputs,
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    conv_defs=self._conv_defs,
                    use_explicit_padding=self._use_explicit_padding,
                    scope=scope)

        with slim.arg_scope(self._conv_hyperparams_fn()):
            with tf.variable_scope('fpn', reuse=self._reuse_weights):
                # Backbone end points for pyramid levels 2..5, in order.
                feature_blocks = [
                    'Conv2d_3_pointwise', 'Conv2d_5_pointwise',
                    'Conv2d_11_pointwise', 'Conv2d_13_pointwise'
                ]
                base_fpn_max_level = min(self._fpn_max_level, 5)
                base_levels = range(self._fpn_min_level,
                                    base_fpn_max_level + 1)
                selected_blocks = [
                    feature_blocks[level - 2] for level in base_levels
                ]
                fpn_features = (
                    feature_map_generators.fpn_top_down_feature_maps(
                        [(name, image_features[name])
                         for name in selected_blocks],
                        depth=_scaled_depth(self._additional_layer_depth),
                        use_depthwise=self._use_depthwise,
                        use_explicit_padding=self._use_explicit_padding))
                feature_maps = [
                    fpn_features['top_down_{}'.format(
                        feature_blocks[level - 2])]
                    for level in base_levels
                ]
                top_block = feature_blocks[base_fpn_max_level - 2]
                last_feature_map = fpn_features['top_down_{}'.format(
                    top_block)]

                # Construct coarse features
                if self._use_explicit_padding:
                    padding = 'VALID'
                else:
                    padding = 'SAME'
                kernel_size = 3
                for level in range(base_fpn_max_level + 1,
                                   self._fpn_max_level + 1):
                    if self._use_depthwise:
                        conv_op = functools.partial(
                            slim.separable_conv2d, depth_multiplier=1)
                    else:
                        conv_op = slim.conv2d
                    if self._use_explicit_padding:
                        last_feature_map = ops.fixed_padding(
                            last_feature_map, kernel_size)
                    layer_scope = 'bottom_up_Conv2d_{}'.format(
                        level - base_fpn_max_level + 13)
                    last_feature_map = conv_op(
                        last_feature_map,
                        num_outputs=_scaled_depth(
                            self._additional_layer_depth),
                        kernel_size=[kernel_size, kernel_size],
                        stride=2,
                        padding=padding,
                        scope=layer_scope)
                    feature_maps.append(last_feature_map)
    return feature_maps
def extract_features(self, preprocessed_inputs):
    """Build five FPN feature maps on top of the MobileNet-v1 base network.

    Three maps come from the top-down pathway over 'Conv2d_5_pointwise',
    'Conv2d_11_pointwise' and 'Conv2d_13_pointwise'; two coarser maps are
    produced by stride-2 3x3 convolutions on top of the last one.

    Args:
        preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

    Returns:
        feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    def _scaled_depth(depth):
        # Apply the depth multiplier, never dropping below the minimum depth.
        return max(int(depth * self._depth_multiplier), self._min_depth)

    backbone_layers = [
        'Conv2d_5_pointwise', 'Conv2d_11_pointwise', 'Conv2d_13_pointwise'
    ]

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
        with slim.arg_scope(
                mobilenet_v1.mobilenet_v1_arg_scope(
                    is_training=None, regularize_depthwise=True)):
            if self._override_base_feature_extractor_hyperparams:
                base_scope = slim.arg_scope(self._conv_hyperparams_fn())
            else:
                base_scope = context_manager.IdentityContextManager()
            with base_scope:
                padded_inputs = ops.pad_to_multiple(
                    preprocessed_inputs, self._pad_to_multiple)
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    padded_inputs,
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    use_explicit_padding=self._use_explicit_padding,
                    scope=scope)

        with slim.arg_scope(self._conv_hyperparams_fn()):
            with tf.variable_scope('fpn', reuse=self._reuse_weights):
                fpn_features = (
                    feature_map_generators.fpn_top_down_feature_maps(
                        [(name, image_features[name])
                         for name in backbone_layers],
                        depth=_scaled_depth(256)))
                previous_map = fpn_features['top_down_Conv2d_13_pointwise']
                coarse_maps = []
                for index in (14, 15):
                    previous_map = slim.conv2d(
                        previous_map,
                        num_outputs=_scaled_depth(256),
                        kernel_size=[3, 3],
                        stride=2,
                        padding='SAME',
                        scope='bottom_up_Conv2d_{}'.format(index))
                    coarse_maps.append(previous_map)

    top_down_maps = [
        fpn_features['top_down_Conv2d_5_pointwise'],
        fpn_features['top_down_Conv2d_11_pointwise'],
        fpn_features['top_down_Conv2d_13_pointwise'],
    ]
    return top_down_maps + coarse_maps
def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs via an EAST-style merge branch.

    The MobileNet-v1 base network is built up to 'Conv2d_13_pointwise'. Four
    of its end points are then merged through transposed convolutions, 1x1
    and 3x3 convolutions, and the final 'east_conv1_3x3' map is passed to
    ``feature_map_generators.multi_resolution_feature_maps``.

    Args:
        preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

    Returns:
        feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    feature_map_layout = {
        'from_layer': ['east_conv1_3x3'],
        'layer_depth': [-1],
    }

    with tf.control_dependencies([shape_assert]):
        with slim.arg_scope(self._conv_hyperparams):
            with tf.variable_scope('MobilenetV1',
                                   reuse=self._reuse_weights) as scope:
                _, image_features = mobilenet_v1.mobilenet_v1_base(
                    preprocessed_inputs,
                    final_endpoint='Conv2d_13_pointwise',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)

                # Merge branch (marked "by chenx" in the original source).
                east_conv_1 = image_features['Conv2d_3_pointwise']
                east_conv_2 = image_features['Conv2d_5_pointwise']
                east_conv_3 = image_features['Conv2d_11_pointwise']
                east_conv_4 = image_features['Conv2d_13_pointwise']

                # Stage 4.
                east_deconv4 = slim.conv2d_transpose(
                    east_conv_4, 512, [4, 4], 2,
                    padding='SAME', scope='east_deconv4')
                # NOTE(review): east_deconv4 is a stride-2 transposed conv of
                # east_conv_4, so its spatial size presumably differs from
                # east_conv_4's; concatenating the two on the channel axis
                # looks inconsistent - confirm the intended inputs.
                east_conv4_concat = tf.concat(
                    [east_conv_4, east_deconv4], axis=3)
                east_conv4_1x1 = slim.conv2d(
                    east_conv4_concat, 256, [1, 1], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv4_1x1')
                east_conv4_3x3 = slim.conv2d(
                    east_conv4_1x1, 256, [3, 3], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv4_3x3')
                image_features['east_conv4_3x3'] = east_conv4_3x3

                # Stage 3.
                east_deconv3 = slim.conv2d_transpose(
                    east_conv4_3x3, 256, [4, 4], 2,
                    padding='SAME', scope='east_deconv3')
                # NOTE(review): east_conv3_concat is never consumed; the two
                # stage-3 convolutions below read east_conv4_concat and
                # east_conv4_1x1 instead. This looks like a copy-paste slip
                # (expected east_conv3_concat / east_conv3_1x1) - confirm
                # before changing, since rewiring alters the trained graph.
                east_conv3_concat = tf.concat(
                    [east_conv_3, east_deconv3], axis=3)
                east_conv3_1x1 = slim.conv2d(
                    east_conv4_concat, 128, [1, 1], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv3_1x1')
                east_conv3_3x3 = slim.conv2d(
                    east_conv4_1x1, 128, [3, 3], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv3_3x3')
                image_features['east_conv3_3x3'] = east_conv3_3x3

                # Stage 2.
                # NOTE(review): east_deconv2 is never consumed; the concat
                # below uses east_deconv3. Possibly another copy-paste slip -
                # confirm.
                east_deconv2 = slim.conv2d_transpose(
                    east_conv3_3x3, 128, [4, 4], 2,
                    padding='SAME', scope='east_deconv2')
                east_conv2_concat = tf.concat(
                    [east_conv_2, east_deconv3], axis=3)
                east_conv2_1x1 = slim.conv2d(
                    east_conv2_concat, 64, [1, 1], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv2_1x1')
                east_conv2_3x3 = slim.conv2d(
                    east_conv2_1x1, 64, [3, 3], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv2_3x3')
                image_features['east_conv2_3x3'] = east_conv2_3x3

                # Stage 1.
                east_deconv1 = slim.conv2d_transpose(
                    east_conv2_3x3, 64, [4, 4], 2,
                    padding='SAME', scope='east_deconv1')
                east_conv1_concat = tf.concat(
                    [east_conv_1, east_deconv1], axis=3)
                east_conv1_1x1 = slim.conv2d(
                    east_conv1_concat, 32, [1, 1], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv1_1x1')
                east_conv1_3x3 = slim.conv2d(
                    east_conv1_1x1, 32, [3, 3], stride=1,
                    normalizer_fn=slim.batch_norm, scope='east_conv1_3x3')
                image_features['east_conv1_3x3'] = east_conv1_3x3

                feature_maps = (
                    feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features))

    # On Python 3 ``.values()`` is a view that cannot be indexed; materialise
    # it so callers really receive the documented list.
    return list(feature_maps.values())
def extract_features(self, preprocessed_inputs):
  """Build FPN feature maps on top of a MobilenetV1 backbone.

  The backbone is run up to 'Conv2d_13_pointwise'. The endpoints selected by
  the configured FPN level range are passed to the top-down FPN generator,
  and any levels above the deepest backbone level (5) are produced by
  repeatedly applying a stride-2 3x3 convolution to the coarsest map.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  def scaled_depth(base_depth):
    # Apply the depth multiplier but never go below the configured minimum.
    return max(int(base_depth * self._depth_multiplier), self._min_depth)

  with tf.variable_scope('MobilenetV1',
                         reuse=self._reuse_weights) as scope:
    backbone_arg_scope = mobilenet_v1.mobilenet_v1_arg_scope(
        is_training=None, regularize_depthwise=True)
    with slim.arg_scope(backbone_arg_scope):
      # The hyperparams function is deliberately called here, inside the
      # backbone arg scope, exactly where the original expression ran.
      if self._override_base_feature_extractor_hyperparams:
        override_scope = slim.arg_scope(self._conv_hyperparams_fn())
      else:
        override_scope = context_manager.IdentityContextManager()
      with override_scope:
        padded_inputs = ops.pad_to_multiple(
            preprocessed_inputs, self._pad_to_multiple)
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            padded_inputs,
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        # Backbone endpoints for FPN levels 2, 3, 4 and 5 respectively.
        feature_blocks = [
            'Conv2d_3_pointwise', 'Conv2d_5_pointwise',
            'Conv2d_11_pointwise', 'Conv2d_13_pointwise'
        ]
        base_fpn_max_level = min(self._fpn_max_level, 5)
        fpn_depth = scaled_depth(256)

        selected_blocks = [
            feature_blocks[level - 2]
            for level in range(self._fpn_min_level, base_fpn_max_level + 1)
        ]
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(name, image_features[name]) for name in selected_blocks],
            depth=fpn_depth)
        feature_maps = [
            fpn_features['top_down_{}'.format(name)]
            for name in selected_blocks
        ]

        coarsest_block = feature_blocks[base_fpn_max_level - 2]
        last_feature_map = fpn_features['top_down_{}'.format(coarsest_block)]

        # Construct coarse features beyond the deepest backbone level.
        extra_levels = self._fpn_max_level - base_fpn_max_level
        for offset in range(1, extra_levels + 1):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=fpn_depth,
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(offset + 13))
          feature_maps.append(last_feature_map)
  return feature_maps
def extract_features(self,
                     preprocessed_inputs,
                     state_saver=None,
                     state_name='lstm_state',
                     unroll_length=5,
                     scope=None):
  """Extracts features from preprocessed inputs.

  The features include the base network features, lstm features and SSD
  features, organized in the following name scope:

  <parent scope>/MobilenetV1/...
  <parent scope>/LSTM/...
  <parent scope>/FeatureMaps/...

  Side effects: stores the per-step LSTM states in `self._states_out` and,
  when a state saver is given, the saved step counter in `self._step`.

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float tensor
      representing a batch of consecutive frames from video clips.
    state_saver: A state saver object with methods `state` and `save_state`.
      When None, the initial LSTM state comes from the cell's `init_state`.
    state_name: A python string for the name to use with the state_saver.
    unroll_length: The number of steps to unroll the lstm. The base network
      output is split into this many pieces along the batch axis.
    scope: The scope for the base network of the feature extractor.

  Returns:
    A list of tensors where the ith tensor has shape [batch, height_i,
    width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with slim.arg_scope(
      mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
    with (slim.arg_scope(self._conv_hyperparams_fn())
          if self._override_base_feature_extractor_hyperparams else
          context_manager.IdentityContextManager()):
      with slim.arg_scope([slim.batch_norm], fused=False):
        # Base network.
        with tf.variable_scope(
            scope, self._base_network_scope,
            reuse=self._reuse_weights) as base_scope:
          net, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=base_scope)

  with slim.arg_scope(self._conv_hyperparams_fn()):
    with slim.arg_scope(
        [slim.batch_norm], fused=False, is_training=self._is_training):
      # ConvLSTM layers.
      with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
        lstm_cell = lstm_cells.BottleneckConvLSTMCell(
            filter_size=(3, 3),
            output_size=(net.shape[1].value, net.shape[2].value),
            num_units=max(self._min_depth, self._lstm_state_depth),
            activation=tf.nn.relu6,
            visualize_gates=True)

        net_seq = list(tf.split(net, unroll_length))
        if state_saver is None:
          # Floor division keeps the per-step batch size an int on Python 3;
          # true division would hand a float to init_state.
          init_state = lstm_cell.init_state(
              state_name, net.shape[0].value // unroll_length, tf.float32)
        else:
          c = state_saver.state('%s_c' % state_name)
          h = state_saver.state('%s_h' % state_name)
          init_state = (c, h)

        # Identities added for inputing state tensors externally.
        c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
        h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
        init_state = (c_ident, h_ident)

        net_seq, states_out = rnn_decoder.rnn_decoder(
            net_seq, init_state, lstm_cell, scope=lstm_scope)
        batcher_ops = None
        self._states_out = states_out
        if state_saver is not None:
          self._step = state_saver.state('%s_step' % state_name)
          batcher_ops = [
              state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
              state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
              state_saver.save_state('%s_step' % state_name, self._step - 1)
          ]
        # Tie the state-saving ops (if any) to the LSTM output so they run
        # whenever the features are evaluated.
        with tf_ops.control_dependencies(batcher_ops):
          image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)

        # Identities added for reading output states, to be reused externally.
        tf.identity(states_out[-1][0], name='lstm_state_out_c')
        tf.identity(states_out[-1][1], name='lstm_state_out_h')

      # SSD layers.
      with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=self._feature_map_layout,
            depth_multiplier=(self._depth_multiplier),
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)

  # Materialize the values so callers get a real list on Python 3 as well.
  return list(feature_maps.values())
trainable=embed_config['train_embedding'], is_training=False) with slim.arg_scope(arg_scope): embed_x, end_points = convolutional_alexnet( input_image, reuse=False, split=alexnet_config['split']) embed_z, end_points_z = convolutional_alexnet( template_image, reuse=True, split=alexnet_config['split']) elif feature_extactor == "mobilenet_v1": mobilenent_config = model_config['mobilenet_v1'] with slim.arg_scope( mobilenet_v1.mobilenet_v1_arg_scope(is_training=False)): with tf.variable_scope('MobilenetV1', reuse=False) as scope: embed_x, end_points = mobilenet_v1.mobilenet_v1_base( input_image, final_endpoint=mobilenent_config['final_endpoint'], conv_defs=mobilenet.CONV_DEFS, depth_multiplier=mobilenent_config['depth_multiplier'], scope=scope) with tf.variable_scope('MobilenetV1', reuse=True) as scope: embed_z, end_points_z = mobilenet_v1.mobilenet_v1_base( template_image, final_endpoint=mobilenent_config['final_endpoint'], conv_defs=mobilenet.CONV_DEFS, depth_multiplier=mobilenent_config['depth_multiplier'], scope=scope) else: raise ValueError( "Invalid feature extractor: {}".format(feature_extactor)) # build cross-correlation between features from template image and input image with tf.variable_scope('detection'):