def test_use_bounded_activations_clip_value(self, use_native_resize_op):
  tf_graph = tf.Graph()
  with tf_graph.as_default():
    image_features = [
        ('block2', 255 * tf.ones([4, 8, 8, 256], dtype=tf.float32)),
        ('block3', 255 * tf.ones([4, 4, 4, 256], dtype=tf.float32)),
        ('block4', 255 * tf.ones([4, 2, 2, 256], dtype=tf.float32)),
        ('block5', 255 * tf.ones([4, 1, 1, 256], dtype=tf.float32))
    ]
    feature_map_generators.fpn_top_down_feature_maps(
        image_features=image_features,
        depth=128,
        use_bounded_activations=True,
        use_native_resize_op=use_native_resize_op)

    expected_clip_by_value_ops = [
        'top_down/clip_by_value', 'top_down/clip_by_value_1',
        'top_down/clip_by_value_2', 'top_down/clip_by_value_3',
        'top_down/clip_by_value_4', 'top_down/clip_by_value_5',
        'top_down/clip_by_value_6'
    ]

    # Gathers activation tensors before and after clip_by_value operations.
    activations = {}
    for clip_by_value_op in expected_clip_by_value_ops:
      clip_input_tensor = tf_graph.get_operation_by_name(
          '{}/Minimum'.format(clip_by_value_op)).inputs[0]
      clip_output_tensor = tf_graph.get_tensor_by_name(
          '{}:0'.format(clip_by_value_op))
      activations.update({
          'before_{}'.format(clip_by_value_op): clip_input_tensor,
          'after_{}'.format(clip_by_value_op): clip_output_tensor,
      })

    expected_lower_bound = -feature_map_generators.ACTIVATION_BOUND
    expected_upper_bound = feature_map_generators.ACTIVATION_BOUND
    init_op = tf.global_variables_initializer()
    with self.test_session() as session:
      session.run(init_op)
      activations_output = session.run(activations)
      for clip_by_value_op in expected_clip_by_value_ops:
        # Before clipping, activations are beyond the expected bound because
        # of large input image_features values.
        activations_before_clipping = (
            activations_output['before_{}'.format(clip_by_value_op)])
        before_clipping_lower_bound = np.amin(activations_before_clipping)
        before_clipping_upper_bound = np.amax(activations_before_clipping)
        self.assertLessEqual(before_clipping_lower_bound, expected_lower_bound)
        self.assertGreaterEqual(before_clipping_upper_bound,
                                expected_upper_bound)

        # After clipping, activations are bounded as expected.
        activations_after_clipping = (
            activations_output['after_{}'.format(clip_by_value_op)])
        after_clipping_lower_bound = np.amin(activations_after_clipping)
        after_clipping_upper_bound = np.amax(activations_after_clipping)
        self.assertGreaterEqual(after_clipping_lower_bound,
                                expected_lower_bound)
        self.assertLessEqual(after_clipping_upper_bound, expected_upper_bound)
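# The call above, with use_bounded_activations=True, is expected to wrap each
# top-down feature map in a tf.clip_by_value op bounded symmetrically by
# +/- feature_map_generators.ACTIVATION_BOUND; those are the ops this test
# looks up by name. A minimal, hedged sketch of that behaviour (the bound of
# 6.0 is only an illustrative assumption, not taken from the library):
import tensorflow as tf

ACTIVATION_BOUND = 6.0  # assumed value, for illustration only


def bound_activation(feature_map):
  # tf.clip_by_value builds a scoped Minimum op followed by a Maximum op,
  # which is why the test reads '{}/Minimum' to recover the pre-clip tensor.
  return tf.clip_by_value(feature_map, -ACTIVATION_BOUND, ACTIVATION_BOUND)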
def test_use_bounded_activations_add_operations(self, use_native_resize_op):
  tf_graph = tf.Graph()
  with tf_graph.as_default():
    image_features = [
        ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
        ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
        ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
        ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
    ]
    feature_map_generators.fpn_top_down_feature_maps(
        image_features=image_features,
        depth=128,
        use_bounded_activations=True,
        use_native_resize_op=use_native_resize_op)

    expected_added_operations = dict.fromkeys([
        'top_down/clip_by_value', 'top_down/clip_by_value_1',
        'top_down/clip_by_value_2', 'top_down/clip_by_value_3',
        'top_down/clip_by_value_4', 'top_down/clip_by_value_5',
        'top_down/clip_by_value_6'
    ])
    op_names = {op.name: None for op in tf_graph.get_operations()}
    self.assertDictContainsSubset(expected_added_operations, op_names)
def test_use_bounded_activations_add_operations(self, use_native_resize_op):
  tf_graph = tf.Graph()
  with tf_graph.as_default():
    image_features = [
        ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
        ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
        ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
        ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
    ]
    feature_map_generators.fpn_top_down_feature_maps(
        image_features=image_features,
        depth=128,
        use_bounded_activations=True,
        use_native_resize_op=use_native_resize_op)

    expected_added_operations = dict.fromkeys([
        'top_down/clip_by_value', 'top_down/clip_by_value_1',
        'top_down/clip_by_value_2', 'top_down/clip_by_value_3',
        'top_down/clip_by_value_4', 'top_down/clip_by_value_5',
        'top_down/clip_by_value_6'
    ])
    op_names = {op.name: None for op in tf_graph.get_operations()}
    self.assertDictContainsSubset(expected_added_operations, op_names)
def test_get_expected_feature_map_shapes_with_depthwise(
    self, use_native_resize_op):
  image_features = [
      ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
      ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
      ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
      ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
  ]
  feature_maps = feature_map_generators.fpn_top_down_feature_maps(
      image_features=image_features,
      depth=128,
      use_depthwise=True,
      use_native_resize_op=use_native_resize_op)

  expected_feature_map_shapes = {
      'top_down_block2': (4, 8, 8, 128),
      'top_down_block3': (4, 4, 4, 128),
      'top_down_block4': (4, 2, 2, 128),
      'top_down_block5': (4, 1, 1, 128)
  }

  init_op = tf.global_variables_initializer()
  with self.test_session() as sess:
    sess.run(init_op)
    out_feature_maps = sess.run(feature_maps)
    out_feature_map_shapes = {
        key: value.shape for key, value in out_feature_maps.items()
    }
    self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
def test_get_expected_feature_map_shapes_with_depthwise(
    self, use_native_resize_op):
  image_features = [
      ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
      ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
      ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
      ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
  ]
  feature_maps = feature_map_generators.fpn_top_down_feature_maps(
      image_features=image_features,
      depth=128,
      use_depthwise=True,
      use_native_resize_op=use_native_resize_op)

  expected_feature_map_shapes = {
      'top_down_block2': (4, 8, 8, 128),
      'top_down_block3': (4, 4, 4, 128),
      'top_down_block4': (4, 2, 2, 128),
      'top_down_block5': (4, 1, 1, 128)
  }

  init_op = tf.global_variables_initializer()
  with self.test_session() as sess:
    sess.run(init_op)
    out_feature_maps = sess.run(feature_maps)
    out_feature_map_shapes = {
        key: value.shape for key, value in out_feature_maps.items()
    }
    self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
def test_get_expected_feature_map_shapes(self):
  image_features = [
      tf.random_uniform([4, 8, 8, 256], dtype=tf.float32),
      tf.random_uniform([4, 4, 4, 256], dtype=tf.float32),
      tf.random_uniform([4, 2, 2, 256], dtype=tf.float32),
      tf.random_uniform([4, 1, 1, 256], dtype=tf.float32),
  ]
  feature_maps = feature_map_generators.fpn_top_down_feature_maps(
      image_features=image_features, depth=128)

  expected_feature_map_shapes = {
      'top_down_feature_map_0': (4, 8, 8, 128),
      'top_down_feature_map_1': (4, 4, 4, 128),
      'top_down_feature_map_2': (4, 2, 2, 128),
      'top_down_feature_map_3': (4, 1, 1, 128)
  }

  init_op = tf.global_variables_initializer()
  with self.test_session() as sess:
    sess.run(init_op)
    out_feature_maps = sess.run(feature_maps)
    out_feature_map_shapes = {
        key: value.shape for key, value in out_feature_maps.items()
    }
    self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)

  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = self._resnet_base_fn(
            inputs=ops.pad_to_multiple(preprocessed_inputs,
                                       self._pad_to_multiple),
            num_classes=None,
            is_training=None,
            global_pool=False,
            output_stride=None,
            store_non_strided_activations=True,
            min_base_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            scope=scope)
      image_features = self._filter_features(image_features)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append('block{}'.format(level - 1))
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth))
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(
              fpn_features['top_down_block{}'.format(level - 1)])
        last_feature_map = fpn_features['top_down_block{}'.format(
            base_fpn_max_level - 1)]
        # Construct coarse features
        for i in range(base_fpn_max_level, self._fpn_max_level):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_block{}'.format(i))
          feature_maps.append(last_feature_map)
  return feature_maps
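# Worked example of the level-to-block indexing used above, assuming the
# illustrative values fpn_min_level=3 and fpn_max_level=7 (the constructor
# defaults used elsewhere in this codebase); the variables below stand in for
# the corresponding self._ attributes.
fpn_min_level, fpn_max_level = 3, 7
base_fpn_max_level = min(fpn_max_level, 5)  # -> 5
feature_block_list = [
    'block{}'.format(level - 1)
    for level in range(fpn_min_level, base_fpn_max_level + 1)
]
# -> ['block2', 'block3', 'block4']: the resnet endpoints fed into the FPN.
coarse_scopes = [
    'bottom_up_block{}'.format(i)
    for i in range(base_fpn_max_level, fpn_max_level)
]
# -> ['bottom_up_block5', 'bottom_up_block6']: the extra stride-2 convolutions
# appended after the top-down pathway.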
def test_get_expected_feature_map_shapes(self):
  print('\n================================================')
  print('test_get_expected_feature_map_shapes')

  image_features = [
      ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
      ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
      ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)),
      ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32))
  ]
  feature_maps = feature_map_generators.fpn_top_down_feature_maps(
      image_features=image_features, depth=128)

  expected_feature_map_shapes = {
      'top_down_block2': (4, 8, 8, 128),
      'top_down_block3': (4, 4, 4, 128),
      'top_down_block4': (4, 2, 2, 128),
      'top_down_block5': (4, 1, 1, 128)
  }

  init_op = tf.global_variables_initializer()
  with self.test_session() as sess:
    sess.run(init_op)
    out_feature_maps = sess.run(feature_maps)
    out_feature_map_shapes = {
        key: value.shape for key, value in out_feature_maps.items()
    }
    for key, value in out_feature_maps.items():
      print('{}: {}'.format(key, value.shape))
    self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')

  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)

  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = self._resnet_base_fn(
            inputs=ops.pad_to_multiple(preprocessed_inputs,
                                       self._pad_to_multiple),
            num_classes=None,
            is_training=None,
            global_pool=False,
            output_stride=None,
            store_non_strided_activations=True,
            scope=scope)
      image_features = self._filter_features(image_features)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key])
             for key in ['block2', 'block3', 'block4']],
            depth=256)
        last_feature_map = fpn_features['top_down_block4']
        coarse_features = {}
        for i in range(5, 7):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=256,
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_block{}'.format(i))
          coarse_features['bottom_up_block{}'.format(i)] = last_feature_map
  return [fpn_features['top_down_block2'],
          fpn_features['top_down_block3'],
          fpn_features['top_down_block4'],
          coarse_features['bottom_up_block5'],
          coarse_features['bottom_up_block6']]
def feature_map_generator(image_features):
  return feature_map_generators.fpn_top_down_feature_maps(
      image_features=image_features,
      depth=depth,
      use_depthwise=use_depthwise,
      use_explicit_padding=use_explicit_padding,
      use_bounded_activations=use_bounded_activations,
      use_native_resize_op=use_native_resize_op)
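# Hedged usage sketch for the closure above. It forwards `depth`,
# `use_depthwise`, `use_explicit_padding`, `use_bounded_activations` and
# `use_native_resize_op` captured from its enclosing scope (test parameters);
# the concrete values and tensor shapes below are illustrative assumptions.
import tensorflow as tf

depth = 128
use_depthwise = False
use_explicit_padding = False
use_bounded_activations = False
use_native_resize_op = False

image_features = [
    ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)),
    ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)),
]
feature_maps = feature_map_generator(image_features)
# Keys follow the input names: 'top_down_block2' and 'top_down_block3',
# each with `depth` output channels.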
def test_get_expected_feature_map_shapes(self):
  image_features = [
      tf.random_uniform([4, 8, 8, 256], dtype=tf.float32),
      tf.random_uniform([4, 4, 4, 256], dtype=tf.float32),
      tf.random_uniform([4, 2, 2, 256], dtype=tf.float32),
      tf.random_uniform([4, 1, 1, 256], dtype=tf.float32),
  ]
  feature_maps = feature_map_generators.fpn_top_down_feature_maps(
      image_features=image_features, depth=128)

  expected_feature_map_shapes = {
      'top_down_feature_map_0': (4, 8, 8, 128),
      'top_down_feature_map_1': (4, 4, 4, 128),
      'top_down_feature_map_2': (4, 2, 2, 128),
      'top_down_feature_map_3': (4, 1, 1, 128)
  }

  init_op = tf.global_variables_initializer()
  with self.test_session() as sess:
    sess.run(init_op)
    out_feature_maps = sess.run(feature_maps)
    out_feature_map_shapes = {
        key: value.shape for key, value in out_feature_maps.items()
    }
    self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes)
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=self._conv_defs,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise,
            use_explicit_padding=self._use_explicit_padding)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features
        padding = 'VALID' if self._use_explicit_padding else 'SAME'
        kernel_size = 3
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(
                slim.separable_conv2d, depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          if self._use_explicit_padding:
            last_feature_map = ops.fixed_padding(
                last_feature_map, kernel_size)
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[kernel_size, kernel_size],
              stride=2,
              padding=padding,
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps
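# Worked example of the indexing inside the 'fpn' scope above, again assuming
# the illustrative defaults fpn_min_level=3 and fpn_max_level=7.
feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
fpn_min_level, fpn_max_level = 3, 7
base_fpn_max_level = min(fpn_max_level, 5)  # -> 5
fpn_input_blocks = [
    feature_blocks[level - 2]
    for level in range(fpn_min_level, base_fpn_max_level + 1)
]
# -> ['layer_7', 'layer_14', 'layer_19']: FPN levels 3..5 map to indices 1..3.
coarse_scopes = [
    'bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19)
    for i in range(base_fpn_max_level + 1, fpn_max_level + 1)
]
# -> ['bottom_up_Conv2d_20', 'bottom_up_Conv2d_21'] for levels 6 and 7.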
def test_use_bounded_activations_clip_value(self, use_native_resize_op):
  tf_graph = tf.Graph()
  with tf_graph.as_default():
    image_features = [
        ('block2', 255 * tf.ones([4, 8, 8, 256], dtype=tf.float32)),
        ('block3', 255 * tf.ones([4, 4, 4, 256], dtype=tf.float32)),
        ('block4', 255 * tf.ones([4, 2, 2, 256], dtype=tf.float32)),
        ('block5', 255 * tf.ones([4, 1, 1, 256], dtype=tf.float32))
    ]
    feature_map_generators.fpn_top_down_feature_maps(
        image_features=image_features,
        depth=128,
        use_bounded_activations=True,
        use_native_resize_op=use_native_resize_op)

    expected_clip_by_value_ops = [
        'top_down/clip_by_value', 'top_down/clip_by_value_1',
        'top_down/clip_by_value_2', 'top_down/clip_by_value_3',
        'top_down/clip_by_value_4', 'top_down/clip_by_value_5',
        'top_down/clip_by_value_6'
    ]

    # Gathers activation tensors before and after clip_by_value operations.
    activations = {}
    for clip_by_value_op in expected_clip_by_value_ops:
      clip_input_tensor = tf_graph.get_operation_by_name(
          '{}/Minimum'.format(clip_by_value_op)).inputs[0]
      clip_output_tensor = tf_graph.get_tensor_by_name(
          '{}:0'.format(clip_by_value_op))
      activations.update({
          'before_{}'.format(clip_by_value_op): clip_input_tensor,
          'after_{}'.format(clip_by_value_op): clip_output_tensor,
      })

    expected_lower_bound = -feature_map_generators.ACTIVATION_BOUND
    expected_upper_bound = feature_map_generators.ACTIVATION_BOUND
    init_op = tf.global_variables_initializer()
    with self.test_session() as session:
      session.run(init_op)
      activations_output = session.run(activations)
      for clip_by_value_op in expected_clip_by_value_ops:
        # Before clipping, activations are beyond the expected bound because
        # of large input image_features values.
        activations_before_clipping = (
            activations_output['before_{}'.format(clip_by_value_op)])
        before_clipping_lower_bound = np.amin(activations_before_clipping)
        before_clipping_upper_bound = np.amax(activations_before_clipping)
        self.assertLessEqual(before_clipping_lower_bound, expected_lower_bound)
        self.assertGreaterEqual(before_clipping_upper_bound,
                                expected_upper_bound)

        # After clipping, activations are bounded as expected.
        activations_after_clipping = (
            activations_output['after_{}'.format(clip_by_value_op)])
        after_clipping_lower_bound = np.amin(activations_after_clipping)
        after_clipping_upper_bound = np.amax(activations_after_clipping)
        self.assertGreaterEqual(after_clipping_lower_bound,
                                expected_lower_bound)
        self.assertLessEqual(after_clipping_upper_bound, expected_upper_bound)
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=_CONV_DEFS if self._use_depthwise else None,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(
                slim.separable_conv2d, depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps
class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
  """SSD Feature Extractor using MobilenetV2 FPN features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams_fn,
               fpn_min_level=3,
               fpn_max_level=7,
               additional_layer_depth=256,
               reuse_weights=None,
               use_explicit_padding=False,
               use_depthwise=False,
               override_base_feature_extractor_hyperparams=False):
    """SSD FPN feature extractor based on Mobilenet v2 architecture.

    Args:
      is_training: whether the network is in training mode.
      depth_multiplier: float depth multiplier for feature extractor.
      min_depth: minimum feature extractor depth.
      pad_to_multiple: the nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams_fn: A function to construct tf slim arg_scope for
        conv2d and separable_conv2d ops in the layers that are added on top of
        the base feature extractor.
      fpn_min_level: the highest resolution feature map to use in FPN. The
        valid values are {2, 3, 4, 5} which map to MobileNet v2 layers
        {layer_4, layer_7, layer_14, layer_19}, respectively.
      fpn_max_level: the smallest resolution feature map to construct or use
        in FPN. FPN construction uses feature maps starting from fpn_min_level
        up to fpn_max_level. In the case that there are not enough feature
        maps in the backbone network, additional feature maps are created by
        applying stride 2 convolutions until we get the desired number of fpn
        levels.
      additional_layer_depth: additional feature map layer channel depth.
      reuse_weights: whether to reuse variables. Default is None.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is False.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams_fn`.
    """
    super(SSDMobileNetV2FpnFeatureExtractor, self).__init__(
        is_training=is_training,
        depth_multiplier=depth_multiplier,
        min_depth=min_depth,
        pad_to_multiple=pad_to_multiple,
        conv_hyperparams_fn=conv_hyperparams_fn,
        reuse_weights=reuse_weights,
        use_explicit_padding=use_explicit_padding,
        use_depthwise=use_depthwise,
        override_base_feature_extractor_hyperparams=
        override_base_feature_extractor_hyperparams)
    self._fpn_min_level = fpn_min_level
    self._fpn_max_level = fpn_max_level
    self._additional_layer_depth = additional_layer_depth
    self._conv_defs = None
    if self._use_depthwise:
      self._conv_defs = _create_modified_mobilenet_config()

  def preprocess(self, resized_inputs):
    """SSD preprocessing.

    Maps pixel values to the range [-1, 1].

    Args:
      resized_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
    """
    return (2.0 / 255.0) * resized_inputs - 1.0

  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              depth_multiplier=self._depth_multiplier,
              conv_defs=self._conv_defs,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      depth_fn = lambda d: max(int(d * self._depth_multiplier),
                               self._min_depth)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope('fpn', reuse=self._reuse_weights):
          feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
          base_fpn_max_level = min(self._fpn_max_level, 5)
          feature_block_list = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_block_list.append(feature_blocks[level - 2])
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key]) for key in feature_block_list],
              depth=depth_fn(self._additional_layer_depth),
              use_depthwise=self._use_depthwise,
              use_explicit_padding=self._use_explicit_padding)
          feature_maps = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_maps.append(fpn_features['top_down_{}'.format(
                feature_blocks[level - 2])])
          last_feature_map = fpn_features['top_down_{}'.format(
              feature_blocks[base_fpn_max_level - 2])]
          # Construct coarse features
          padding = 'VALID' if self._use_explicit_padding else 'SAME'
          kernel_size = 3
          for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
            if self._use_depthwise:
              conv_op = functools.partial(
                  slim.separable_conv2d, depth_multiplier=1)
            else:
              conv_op = slim.conv2d
            if self._use_explicit_padding:
              last_feature_map = ops.fixed_padding(
                  last_feature_map, kernel_size)
            last_feature_map = conv_op(
                last_feature_map,
                num_outputs=depth_fn(self._additional_layer_depth),
                kernel_size=[kernel_size, kernel_size],
                stride=2,
                padding=padding,
                scope='bottom_up_Conv2d_{}'.format(
                    i - base_fpn_max_level + 19))
            feature_maps.append(last_feature_map)
    return feature_maps
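# Hedged construction sketch for the extractor defined above. The
# `_dummy_conv_hyperparams_fn` below is a hypothetical placeholder: in the
# real pipeline this function is built from the model config; here it only
# returns a minimal slim arg_scope so the sketch is self-contained.
import tensorflow as tf

slim = tf.contrib.slim


def _dummy_conv_hyperparams_fn():
  # Minimal arg_scope for the conv layers added on top of the backbone.
  with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                      activation_fn=tf.nn.relu6,
                      normalizer_fn=None) as sc:
    return sc


feature_extractor = SSDMobileNetV2FpnFeatureExtractor(
    is_training=False,
    depth_multiplier=1.0,
    min_depth=16,
    pad_to_multiple=32,
    conv_hyperparams_fn=_dummy_conv_hyperparams_fn,
    fpn_min_level=3,
    fpn_max_level=7,
    additional_layer_depth=256)

images = tf.placeholder(tf.float32, [1, 320, 320, 3])
preprocessed = feature_extractor.preprocess(images)  # maps pixels to [-1, 1]
feature_maps = feature_extractor.extract_features(preprocessed)
# feature_maps is a list of five tensors, one per FPN level 3..7.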
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = [
            'Conv2d_3_pointwise', 'Conv2d_5_pointwise', 'Conv2d_11_pointwise',
            'Conv2d_13_pointwise'
        ]
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(256))
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=depth_fn(256),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13))
          feature_maps.append(last_feature_map)
  return feature_maps
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = [
            'Conv2d_3_pointwise', 'Conv2d_5_pointwise', 'Conv2d_11_pointwise',
            'Conv2d_13_pointwise'
        ]
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(256))
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=depth_fn(256),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13))
          feature_maps.append(last_feature_map)
  return feature_maps
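# Worked example of the MobilenetV1 indexing above, with the same illustrative
# defaults (fpn_min_level=3, fpn_max_level=7).
feature_blocks = [
    'Conv2d_3_pointwise', 'Conv2d_5_pointwise', 'Conv2d_11_pointwise',
    'Conv2d_13_pointwise'
]
fpn_min_level, fpn_max_level = 3, 7
base_fpn_max_level = min(fpn_max_level, 5)  # -> 5
fpn_input_blocks = [
    feature_blocks[level - 2]
    for level in range(fpn_min_level, base_fpn_max_level + 1)
]
# -> ['Conv2d_5_pointwise', 'Conv2d_11_pointwise', 'Conv2d_13_pointwise']
coarse_scopes = [
    'bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13)
    for i in range(base_fpn_max_level + 1, fpn_max_level + 1)
]
# -> ['bottom_up_Conv2d_14', 'bottom_up_Conv2d_15'] for the two coarse levels.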