def scope_fn():
  """Builds and returns a slim arg_scope from the enclosing configuration.

  NOTE(review): this is a closure — `batch_norm_params`, `affected_ops`,
  `hyperparams_config` and `normalizer_fn` are read from the enclosing
  function's scope, which is not visible here; confirm against the caller.

  Returns:
    The constructed arg_scope dictionary (`sc`), suitable for re-entry via
    `slim.arg_scope(sc)`.
  """
  # When no batch-norm parameters are configured, fall back to a no-op
  # context manager so the nesting structure stays uniform.
  with (slim.arg_scope([slim.batch_norm], **batch_norm_params)
        if batch_norm_params is not None else
        context_manager.IdentityContextManager()):
    # Apply the configured regularizer/initializer/activation/normalizer to
    # every op listed in `affected_ops`.
    with slim.arg_scope(
        affected_ops,
        weights_regularizer=_build_slim_regularizer(
            hyperparams_config.regularizer),
        weights_initializer=_build_initializer(
            hyperparams_config.initializer),
        activation_fn=_build_activation_fn(hyperparams_config.activation),
        normalizer_fn=normalizer_fn) as sc:
      return sc
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  # MobilenetV1 downsamples by 32; inputs below 33px would collapse to zero.
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  # layer_depth -1 means "use the base-network layer as-is"; positive values
  # request additional feature layers of that depth.
  feature_map_layout = {
      'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                     '', ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }
  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    # is_training=None defers the batch-norm mode to an outer arg_scope.
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        # TODO(skligys): Enable fused batch norm once quantization supports it.
        with slim.arg_scope([slim.batch_norm], fused=False):
          _, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs,
                                  self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      # TODO(skligys): Enable fused batch norm once quantization supports it.
      with slim.arg_scope([slim.batch_norm], fused=False):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  # NOTE(review): dict.values() is a view in Python 3; a sibling extractor in
  # this file wraps it in list(...) — confirm callers tolerate a view here.
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  # MobilenetV2 downsamples by 32; inputs below 33px would collapse to zero.
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  feature_map_layout = {
      'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '',
                     ''][:self._num_layers],
      'layer_depth': [-1, -1, 512, 256, 256, 128][:self._num_layers],
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    # FIX: was is_training=True, which forced training-mode batch norm even
    # at inference time. is_training=None defers the training/inference
    # switch to an outer arg_scope set by the meta architecture, matching the
    # sibling MobilenetV2 extractor in this file.
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  # ResNet has no depth-multiplier knob, so anything other than 1.0 is an
  # unsatisfiable configuration.
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')
  # ResNet downsamples by up to 128; reject inputs that would collapse.
  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)
  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        with slim.arg_scope(
            [resnet_v1.bottleneck],
            use_bounded_activations=self._use_bounded_activations):
          _, activations = self._resnet_base_fn(
              inputs=ops.pad_to_multiple(preprocessed_inputs,
                                         self._pad_to_multiple),
              num_classes=None,
              is_training=None,
              global_pool=False,
              output_stride=None,
              store_non_strided_activations=True,
              scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      # Pooling pyramid is built solely from the 'block3' activations.
      feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
          base_feature_map_depth=self._base_feature_map_depth,
          num_layers=self._num_layers,
          image_features={
              'image_features': self._filter_features(activations)['block3']
          })
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  # First two maps come from PNASNet cells; the rest are extra SSD layers.
  feature_map_layout = {
      'from_layer': ['Cell_7', 'Cell_11', '', '', '', ''][:self._num_layers],
      'layer_depth': [-1, -1, 512, 256, 256, 128][:self._num_layers],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }
  with slim.arg_scope(
      pnasnet_large_arg_scope_for_detection(
          is_batch_norm_training=self._is_training)):
    with slim.arg_scope(
        [slim.conv2d, slim.batch_norm, slim.separable_conv2d],
        reuse=self._reuse_weights):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = pnasnet.build_pnasnet_large(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            num_classes=None,
            is_training=self._is_training,
            final_endpoint='Cell_11')
  with tf.compat.v1.variable_scope('SSD_feature_maps',
                                   reuse=self._reuse_weights):
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  # MobilenetV1 downsamples by 32; inputs below 33px would collapse to zero.
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  # layer_depth -1 means "use the base-network layer as-is".
  feature_map_layout = {
      'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                     '', ''][:self._num_layers],
      'layer_depth': [-1, -1, 512, 256, 256, 128][:self._num_layers],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }
  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    # is_training=None defers the batch-norm mode to an outer arg_scope.
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      # Pooling pyramid built on Conv2d_11_pointwise; base_feature_map_depth=0
      # means no 1x1 projection before pooling.
      feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
          base_feature_map_depth=0,
          num_layers=6,
          image_features={
              'image_features': image_features['Conv2d_11_pointwise']
          })
  return list(feature_maps.values())
def scope_fn():
  """Builds an arg_scope with hard-coded conv hyperparameters.

  Returns:
    The constructed arg_scope dictionary (`sc`), suitable for re-entry via
    `slim.arg_scope(sc)`.
  """
  batch_norm = slim.batch_norm
  affected_ops = [slim.conv2d, slim.separable_conv2d, slim.fully_connected]
  # NOTE(review): attribute access such as `batch_norm.decay` assumes
  # `batch_norm` carries these config fields; `slim.batch_norm` is a plain
  # function — confirm the intended source of these values (likely a
  # hyperparams proto from the enclosing scope).
  batch_norm_params = {
      'decay': batch_norm.decay,
      'center': batch_norm.center,
      'scale': batch_norm.scale,
      'epsilon': batch_norm.epsilon,
      # Remove is_training parameter from here and deprecate it in the proto
      # once we refactor Faster RCNN models to set is_training through an outer
      # arg_scope in the meta architecture.
      # FIX: dropped redundant `True and` — `True and x` is just `x`.
      'is_training': batch_norm.train
  }
  # FIX: removed dead `if batch_norm_params is not None` guard —
  # batch_norm_params is a dict literal assigned just above and can never be
  # None here.
  with slim.arg_scope([slim.batch_norm], **batch_norm_params):
    with slim.arg_scope(
        affected_ops,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
        weights_regularizer=slim.l2_regularizer(0.0005),
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm) as sc:
      return sc
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    # is_training=None defers the batch-norm mode to an outer arg_scope.
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=self._conv_defs,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    # Depth of the extra layers, scaled by the multiplier but floored.
    depth_fn = lambda d: max(int(d * self._depth_multiplier),
                             self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        # Endpoints at strides 4/8/16/32; FPN level L maps to index L - 2.
        feature_blocks = [
            'layer_4', 'layer_7', 'layer_14', 'layer_19'
        ]
        # MobilenetV2 tops out at stride 32 (level 5); coarser levels are
        # synthesized below with stride-2 convs.
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise,
            use_explicit_padding=self._use_explicit_padding)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features
        padding = 'VALID' if self._use_explicit_padding else 'SAME'
        kernel_size = 3
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(slim.separable_conv2d,
                                        depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          if self._use_explicit_padding:
            last_feature_map = ops.fixed_padding(
                last_feature_map, kernel_size)
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[kernel_size, kernel_size],
              stride=2,
              padding=padding,
              scope='bottom_up_Conv2d_{}'.format(
                  i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps
def test_identity_context_manager(self):
  """IdentityContextManager should yield None when entered."""
  manager = context_manager.IdentityContextManager()
  with manager as yielded_value:
    self.assertIsNone(yielded_value)
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  # ResNet has no depth-multiplier knob, so anything other than 1.0 is an
  # unsatisfiable configuration.
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')
  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)
  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v2.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = self._resnet_base_fn(
            inputs=ops.pad_to_multiple(preprocessed_inputs,
                                       self._pad_to_multiple),
            num_classes=None,
            is_training=None,
            global_pool=False,
            output_stride=None,
            # NOTE(review): a sibling ResNet extractor in this file passes
            # store_non_strided_activations=True; confirm disabling it here
            # is intentional.
            # store_non_strided_activations=True,
            scope=scope)
      image_features = self._filter_features(image_features)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope(self._fpn_scope_name,
                             reuse=self._reuse_weights):
        # ResNet tops out at stride 32 (level 5); coarser levels are
        # synthesized below with stride-2 convs.
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        # FPN level L corresponds to ResNet 'block{L-1}'.
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append('block{}'.format(level - 1))
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=self._additional_layer_depth,
            use_explicit_padding=True)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(
              fpn_features['top_down_block{}'.format(level - 1)])
        last_feature_map = fpn_features['top_down_block{}'.format(
            base_fpn_max_level - 1)]
        # Construct coarse features
        for i in range(base_fpn_max_level, self._fpn_max_level):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=self._additional_layer_depth,
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_block{}'.format(i))
          feature_maps.append(last_feature_map)
  return feature_maps
def extract_features(self,
                     preprocessed_inputs,
                     state_saver=None,
                     state_name='lstm_state',
                     unroll_length=5,
                     scope=None):
  """Extracts features from preprocessed inputs.

  The features include the base network features, lstm features and SSD
  features, organized in the following name scope:

  <parent scope>/MobilenetV1/...
  <parent scope>/LSTM/...
  <parent scope>/FeatureMaps/...

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float tensor
      representing a batch of consecutive frames from video clips.
    state_saver: A state saver object with methods `state` and `save_state`.
    state_name: A python string for the name to use with the state_saver.
    unroll_length: The number of steps to unroll the lstm.
    scope: The scope for the base network of the feature extractor.

  Returns:
    A list of tensors where the ith tensor has shape [batch, height_i,
    width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with slim.arg_scope(
      mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
    with (slim.arg_scope(self._conv_hyperparams_fn())
          if self._override_base_feature_extractor_hyperparams else
          context_manager.IdentityContextManager()):
      with slim.arg_scope([slim.batch_norm], fused=False):
        # Base network.
        with tf.variable_scope(
            scope, self._base_network_scope,
            reuse=self._reuse_weights) as scope:
          net, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs,
                                  self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=scope)
  with slim.arg_scope(self._conv_hyperparams_fn()):
    with slim.arg_scope(
        [slim.batch_norm], fused=False, is_training=self._is_training):
      # ConvLSTM layers.
      with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
        lstm_cell = lstm_cells.BottleneckConvLSTMCell(
            filter_size=(3, 3),
            output_size=(net.shape[1].value, net.shape[2].value),
            num_units=max(self._min_depth, self._lstm_state_depth),
            activation=tf.nn.relu6,
            visualize_gates=True)
        # The batch dimension interleaves frames: split it back into a
        # sequence of `unroll_length` per-step tensors.
        net_seq = list(tf.split(net, unroll_length))
        if state_saver is None:
          init_state = lstm_cell.init_state(
              state_name, net.shape[0].value / unroll_length, tf.float32)
        else:
          c = state_saver.state('%s_c' % state_name)
          h = state_saver.state('%s_h' % state_name)
          init_state = (c, h)
        # Identities added for inputing state tensors externally.
        c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
        h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
        init_state = (c_ident, h_ident)
        net_seq, states_out = rnn_decoder.rnn_decoder(
            net_seq, init_state, lstm_cell, scope=lstm_scope)
        batcher_ops = None
        self._states_out = states_out
        if state_saver is not None:
          self._step = state_saver.state('%s_step' % state_name)
          batcher_ops = [
              state_saver.save_state('%s_c' % state_name,
                                     states_out[-1][0]),
              state_saver.save_state('%s_h' % state_name,
                                     states_out[-1][1]),
              state_saver.save_state('%s_step' % state_name, self._step - 1)
          ]
        # control_dependencies(None) is a no-op, so this is safe when no
        # state saver is supplied.
        with tf_ops.control_dependencies(batcher_ops):
          image_features['Conv2d_13_pointwise_lstm'] = tf.concat(
              net_seq, 0)
        # Identities added for reading output states, to be reused externally.
        tf.identity(states_out[-1][0], name='lstm_state_out_c')
        tf.identity(states_out[-1][1], name='lstm_state_out_h')
      # SSD layers.
      with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=self._feature_map_layout,
            depth_multiplier=(self._depth_multiplier),
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: if image height or width are not 256 pixels.
  """
  image_shape = preprocessed_inputs.get_shape()
  image_shape.assert_has_rank(4)
  image_height = image_shape[1].value
  image_width = image_shape[2].value

  # Statically-unknown sizes get a runtime assertion instead of a Python
  # ValueError; statically-known sizes are checked at graph-build time.
  if image_height is None or image_width is None:
    shape_assert = tf.Assert(
        tf.logical_and(tf.equal(tf.shape(preprocessed_inputs)[1], 256),
                       tf.equal(tf.shape(preprocessed_inputs)[2], 256)),
        ['image size must be 256 in both height and width.'])
    with tf.control_dependencies([shape_assert]):
      preprocessed_inputs = tf.identity(preprocessed_inputs)
  elif image_height != 256 or image_width != 256:
    raise ValueError('image size must be = 256 in both height and width;'
                     ' image dim = %d,%d' % (image_height, image_width))

  # Five maps (not six): the 256x256 input leaves no room for a sixth scale.
  feature_map_layout = {
      'from_layer': [
          'Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', ''
      ],
      'layer_depth': [-1, -1, 512, 256, 256],
      'conv_kernel_size': [-1, -1, 3, 3, 2],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(is_training=None)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def backbone_net(inputs, image_size, is_training=True, depth_multiplier=0.5):
  """Builds a MobilenetV2-style backbone for facial-landmark regression.

  Args:
    inputs: a [batch, height, width, channels] float image tensor; height and
      width are expected to equal `image_size`.
    image_size: input resolution (112, 80, or other); selects padding
      multiple and the final VALID-conv kernel size.
    is_training: whether to build the graph in training mode (batch norm).
    depth_multiplier: channel-width multiplier for the backbone.

  Returns:
    A tuple of (image_features dict, landmark logits tensor of width 136,
    (layer_1, inputs, layer_2) auxiliary tensors).
    NOTE(review): 136 presumably encodes 68 (x, y) landmark pairs — confirm
    with the training pipeline.
  """
  # Padding multiple keeps spatial dims divisible through the stride-2 stack.
  pad_to_multiple = 14 if image_size == 112 else (10 if image_size == 80
                                                  else 8)
  use_explicit_padding = False
  depth_multiplier = depth_multiplier
  print('construct backbone_net for image_size', image_size,
        'depth_multiplier = ', depth_multiplier)
  use_depthwise = True
  override_base_feature_extractor_hyperparams = False
  reuse_weights = None
  min_depth = 16
  specs = [
      op(slim.conv2d, stride=2, num_outputs=64, kernel_size=[3, 3]),
      # todo: Depthwise Conv3×3
      op(slim.separable_conv2d, stride=1, kernel_size=[3, 3],
         num_outputs=None,
         multiplier_func=dummy_depth_multiplier),
      # 562×64Bottleneck 2 64 5 2
      op(ops.expanded_conv, stride=2, num_outputs=64),
  ]
  for _ in range(0, 4):
    specs.append(op(ops.expanded_conv, stride=1, num_outputs=64))
  # 282×64Bottleneck212812
  specs.append(op(ops.expanded_conv, stride=2, num_outputs=128))
  # 142×128Bottleneck412861
  for _ in range(0, 6):
    specs.append(
        op(ops.expanded_conv, expansion_size=expand_input(4),
           num_outputs=128, stride=1))
  # Final VALID conv must exactly cover the remaining spatial extent, which
  # depends on the input resolution.
  kernel_size = [7, 7] if image_size == 112 else ([5, 5]
                                                  if image_size == 80
                                                  else [4, 4])
  specs.append(op(ops.expanded_conv, stride=1, num_outputs=16, scope='S1'))
  specs.append(op(slim.conv2d, stride=2, kernel_size=[3, 3], num_outputs=32,
                  scope='S2'))
  specs.append(op(slim.conv2d, stride=1, kernel_size=kernel_size,
                  num_outputs=128, scope='S3', padding='VALID'))
  # print('specs = ', specs, ' len = ', len(specs))
  arch = dict(
      defaults={
          # Note: these parameters of batch norm affect the architecture
          # that's why they are here and not in training_scope.
          (slim.batch_norm,): {'center': True, 'scale': True},
          (slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
              'normalizer_fn': slim.batch_norm,
              'activation_fn': tf.nn.relu6
          },
          (ops.expanded_conv,): {
              'expansion_size': expand_input(2),
              'split_expansion': 1,
              'normalizer_fn': slim.batch_norm,
              'residual': True,
          },
          (slim.conv2d, slim.separable_conv2d): {
              'padding': 'SAME',
              'weights_initializer': slim.xavier_initializer()
          }
      },
      spec=specs)
  print('input to backbone_net ', inputs)
  with tf.variable_scope('Backbone', reuse=reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=is_training,
                                    bn_decay=0.9997)), \
        slim.arg_scope([mobilenet.depth_multiplier], min_depth=min_depth):
      with (slim.arg_scope(conv_hyperparams_fn(is_training=is_training))
            if override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            od_ops.pad_to_multiple(inputs, pad_to_multiple),
            depth_multiplier=depth_multiplier,
            is_training=is_training,
            use_explicit_padding=use_explicit_padding,
            conv_defs=arch,
            scope=scope)
    # do a fully connected layer here
    # TODO
    # Multi-scale features feeding the landmark head.
    layer_15 = image_features['layer_15']
    layer_16 = image_features['layer_16']
    layer_17 = image_features['layer_17']
    # batch_size = tf.shape(S1)[0]
    S1 = slim.flatten(layer_15, scope='S1flatten')  # tf.reshape(S1, [batch_size, -1])
    S2 = slim.flatten(layer_16, scope='S2flatten')  # [batch_size, -1])
    S3 = slim.flatten(layer_17, scope='S3flatten')  # [batch_size, -1])
    before_dense = tf.concat([S1, S2, S3], 1)
    for i in range(1, 18):
      print('layer_' + str(i), image_features['layer_' + str(i)])
    # print('layer_17', layer_17)
    print('S1', S1)
    print('S2', S2)
    print('S3', S3)
    # to_test = slim.conv2d(image_features['layer_19'])
    print('image image_features', image_features.keys())
    with slim.arg_scope([slim.batch_norm], is_training=is_training,
                        center=True, scale=True):
      return image_features, slim.fully_connected(
          before_dense,
          136,
          activation_fn=tf.nn.relu6,
          normalizer_fn=slim.batch_norm,
          weights_initializer=slim.xavier_initializer()), (
              image_features['layer_1'], inputs, image_features['layer_2'])
def backbone_net(inputs, image_size, is_training=True, depth_multiplier=0.5,
                 **kwargs):
  """Builds a MobilenetV2-style backbone with a grouped landmark head.

  Args:
    inputs: a [batch, height, width, channels] float image tensor.
    image_size: input resolution (printed only; padding is fixed at 10 here).
    is_training: whether to build the graph in training mode (batch norm).
    depth_multiplier: channel-width multiplier for the backbone.
    **kwargs: optional overrides — 'mid_conv_n' (number of middle expanded
      convs, default 4) and 'fc_x_n' (hidden-width factor of the per-group
      fully connected layers, default 2).

  Returns:
    A tuple of (image_features dict, landmark logits tensor of width 136,
    None).
    NOTE(review): the head splits outputs per facial region
    (chin 34, brows 10+10, nose 18, eyes 12+12, mouth 40 = 136) —
    presumably 68 (x, y) landmark pairs; confirm with the training pipeline.
  """
  pad_to_multiple = 10
  use_explicit_padding = False
  depth_multiplier = depth_multiplier
  print('construct backbone_net for image_size', image_size,
        'depth_multiplier = ', depth_multiplier)
  use_depthwise = True
  override_base_feature_extractor_hyperparams = False
  reuse_weights = None
  min_depth = 16
  specs = [
      op(slim.conv2d, stride=2, num_outputs=64, kernel_size=[3, 3],
         activation_fn=tf.nn.elu),
      # todo: Depthwise Conv3×3
      op(ops.expanded_conv, stride=1, kernel_size=[3, 3], num_outputs=64),
      # 562×64Bottleneck 2 64 5 2
      op(slim.max_pool2d, kernel_size=[3, 3], padding='SAME', stride=1),
      op(ops.expanded_conv, stride=2, num_outputs=64, kernel_size=[3, 3]),
  ]
  for _ in range(0, 4):
    specs.append(
        op(ops.expanded_conv, stride=1, num_outputs=64, kernel_size=[3, 3]))
  # 282×64Bottleneck212812
  specs.append(
      op(ops.expanded_conv, stride=2, num_outputs=128, kernel_size=[3, 3]))
  # 142×128Bottleneck412861
  mid_conv_n = kwargs.get('mid_conv_n', 4)
  for _ in range(0, mid_conv_n):
    specs.append(
        op(ops.expanded_conv, expansion_size=expand_input(4),
           num_outputs=128, stride=1))
  kernel_size = [5, 5]
  specs.append(op(ops.expanded_conv, stride=1, num_outputs=16, scope='S1'))
  specs.append(
      op(slim.conv2d, stride=2, kernel_size=[3, 3], num_outputs=32,
         scope='S2', activation_fn=tf.nn.elu))
  specs.append(
      op(slim.conv2d, stride=1, kernel_size=kernel_size, num_outputs=128,
         scope='S3', padding='VALID', activation_fn=tf.nn.elu))
  # print('specs = ', specs, ' len = ', len(specs))
  arch = dict(
      defaults={
          # Note: these parameters of batch norm affect the architecture
          # that's why they are here and not in training_scope.
          (slim.batch_norm,): {
              'center': True,
              'scale': True
          },
          (slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
              'normalizer_fn': slim.batch_norm,
              'activation_fn': tf.nn.relu6
          },
          (ops.expanded_conv,): {
              'expansion_size': expand_input(2),
              'split_expansion': 1,
              'normalizer_fn': slim.batch_norm,
              'residual': True,
          },
          (slim.conv2d, slim.separable_conv2d): {
              'padding': 'SAME',
              'weights_initializer': slim.xavier_initializer()
          }
      },
      spec=specs)
  print('input to backbone_net ', inputs)
  with tf.variable_scope('Backbone', reuse=reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=is_training,
                                    bn_decay=0.9997)), \
        slim.arg_scope([mobilenet.depth_multiplier], min_depth=min_depth):
      with (slim.arg_scope(conv_hyperparams_fn(is_training=is_training))
            if override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            od_ops.pad_to_multiple(inputs, pad_to_multiple),
            depth_multiplier=depth_multiplier,
            is_training=is_training,
            use_explicit_padding=use_explicit_padding,
            conv_defs=arch,
            scope=scope)
    # do a fully connected layer here
    # TODO
    print('image image_features', image_features.keys())
    # Collect endpoints named 'layer_<n>' and sort them numerically so the
    # last three are the deepest, regardless of how many were produced.
    all_layers = []
    for layer_name in image_features.keys():
      if re.match('^layer_\\d+$', layer_name) is not None:
        all_layers.append(layer_name)

    def layer_key(val):
      return int(val.split('_')[1])

    all_layers.sort(key=layer_key)
    print('all_layers', all_layers)
    layer_15 = image_features[all_layers[-3]]
    layer_16 = image_features[all_layers[-2]]
    layer_17 = image_features[all_layers[-1]]
    # batch_size = tf.shape(S1)[0]
    S1 = slim.flatten(
        layer_15, scope='S1flatten')  # tf.reshape(S1, [batch_size, -1])
    S2 = slim.flatten(layer_16, scope='S2flatten')  # [batch_size, -1])
    S3 = slim.flatten(layer_17, scope='S3flatten')  # [batch_size, -1])
    before_dense = tf.concat([S1, S2, S3], 1)
    for l in all_layers:
      print(l, image_features[l])
    # print('layer_17', layer_17)
    print('S1', S1)
    print('S2', S2)
    print('S3', S3)
    # to_test = slim.conv2d(image_features['layer_19'])
    print('before fully_connected', before_dense)
    with slim.arg_scope(
        [slim.fully_connected],
        weights_initializer=slim.xavier_initializer(),
        normalizer_fn=None,
        activation_fn=tf.nn.tanh):
      fc_x = kwargs.get('fc_x_n', 2)
      print('fully_connected before last x ', fc_x)
      # One hidden layer per facial region, fc_x times wider than the output.
      pre_chin = slim.fully_connected(before_dense, 34 * fc_x)
      pre_left_eye_brow = slim.fully_connected(before_dense, 10 * fc_x)
      pre_right_eye_brow = slim.fully_connected(before_dense, 10 * fc_x)
      pre_nose = slim.fully_connected(before_dense, 18 * fc_x)
      pre_left_eye = slim.fully_connected(before_dense, 12 * fc_x)
      pre_right_eye = slim.fully_connected(before_dense, 12 * fc_x)
      pre_mouth = slim.fully_connected(before_dense, 40 * fc_x)
      chin = slim.fully_connected(pre_chin, 34)
      left_eye_brow = slim.fully_connected(pre_left_eye_brow, 10)
      right_eye_brow = slim.fully_connected(pre_right_eye_brow, 10)
      nose = slim.fully_connected(pre_nose, 18)
      left_eye = slim.fully_connected(pre_left_eye, 12)
      right_eye = slim.fully_connected(pre_right_eye, 12)
      mouth = slim.fully_connected(pre_mouth, 40)
      landmarks = tf.concat([
          chin, left_eye_brow, right_eye_brow, nose, left_eye, right_eye,
          mouth
      ], -1)
      return image_features, landmarks, None
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    # FPN depth, scaled by the multiplier but floored at min_depth.
    depth_fn = lambda d: max(int(d * self._depth_multiplier),
                             self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        # Endpoints at strides 4/8/16/32; FPN level L maps to index L - 2.
        feature_blocks = [
            'Conv2d_3_pointwise', 'Conv2d_5_pointwise',
            'Conv2d_11_pointwise', 'Conv2d_13_pointwise'
        ]
        # MobilenetV1 tops out at stride 32 (level 5); coarser levels are
        # synthesized below with stride-2 convs.
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(256))
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=depth_fn(256),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(
                  i - base_fpn_max_level + 13))
          feature_maps.append(last_feature_map)
  return feature_maps
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  # ResNet has no depth-multiplier knob, so anything other than 1.0 is an
  # unsatisfiable configuration.
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')
  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)
  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = self._resnet_base_fn(
            inputs=ops.pad_to_multiple(preprocessed_inputs,
                                       self._pad_to_multiple),
            num_classes=None,
            is_training=None,
            global_pool=False,
            output_stride=None,
            store_non_strided_activations=True,
            scope=scope)
    image_features = self._filter_features(image_features)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope(self._fpn_scope_name,
                             reuse=self._reuse_weights):
        # Fixed FPN over blocks 2-4, plus two synthesized coarser levels.
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key])
             for key in ['block2', 'block3', 'block4']],
            depth=256)
        last_feature_map = fpn_features['top_down_block4']
        coarse_features = {}
        for i in range(5, 7):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=256,
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_block{}'.format(i))
          coarse_features['bottom_up_block{}'.format(i)] = last_feature_map
  # Fixed fine-to-coarse ordering expected by the SSD head.
  return [
      fpn_features['top_down_block2'], fpn_features['top_down_block3'],
      fpn_features['top_down_block4'], coarse_features['bottom_up_block5'],
      coarse_features['bottom_up_block6']
  ]
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError if conv_defs is not provided or from_layer does not meet the
      size requirement.
  """
  if not self._conv_defs:
    raise ValueError('Must provide backbone conv defs.')
  if len(self._from_layer) != 2:
    raise ValueError('SSD input feature names are not provided.')
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  feature_map_layout = {
      'from_layer': [
          self._from_layer[0], self._from_layer[1], '', '', '', ''
      ],
      'layer_depth': [-1, -1, 128, 128, 128, 128],  #[-1, -1, 512, 256, 256, 128]
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }
  with tf.variable_scope(
      self._scope_name, reuse=self._reuse_weights) as scope:
    # is_training=None defers the batch-norm mode to an outer arg_scope.
    with slim.arg_scope(
        mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v3.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs,
                                self._pad_to_multiple),
            conv_defs=self._conv_defs,
            final_endpoint=self._from_layer[1],
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()