def __init__(self,
             num_classes,
             endpoints_num_filters=0,
             aggregation='top',
             dropout_rate=0.0,
             batch_norm_activation=nn_ops.BatchNormActivation(),
             data_format='channels_last'):
  """Initialize params to build classification head.

  Args:
    num_classes: the number of classes, including one background class.
    endpoints_num_filters: the number of filters of the optional embedding
      layer after the multiscale feature aggregation. If 0, no additional
      embedding layer is applied.
    aggregation: the method to aggregate the multiscale feature maps. If
      `top`, the feature map of the highest level is used directly. If
      `all`, every level is upsampled with nearest-neighbor interpolation
      to the spatial size of the lowest level and the results are averaged
      (the number of filters must match across levels).
    dropout_rate: the dropout rate of the optional dropout layer. If 0.0,
      no additional dropout layer is applied.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    data_format: An optional string from: `channels_last`,
      `channels_first`. Defaults to `channels_last`.
  """
  self._num_classes = num_classes
  self._endpoints_num_filters = endpoints_num_filters
  self._aggregation = aggregation
  self._dropout_rate = dropout_rate
  self._batch_norm_activation = batch_norm_activation
  self._data_format = data_format
def spinenet_builder( model_id, min_level=3, max_level=7, block_specs=build_block_specs(), use_native_resize_op=False, activation='swish', batch_norm_activation=nn_ops.BatchNormActivation(activation='swish'), init_drop_connect_rate=None, data_format='channels_last'): """Builds the SpineNet network.""" if model_id not in SCALING_MAP: raise ValueError( 'SpineNet {} is not a valid architecture.'.format(model_id)) scaling_params = SCALING_MAP[model_id] return SpineNet( min_level=min_level, max_level=max_level, block_specs=block_specs, endpoints_num_filters=scaling_params['endpoints_num_filters'], resample_alpha=scaling_params['resample_alpha'], use_native_resize_op=use_native_resize_op, block_repeats=scaling_params['block_repeats'], filter_size_scale=scaling_params['filter_size_scale'], activation=activation, batch_norm_activation=batch_norm_activation, init_drop_connect_rate=init_drop_connect_rate, data_format=data_format)
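# Usage sketch (hypothetical): building a SpineNet-49 backbone. This assumes
# 49 is a key in SCALING_MAP, as in the published SpineNet family, and that
# the returned SpineNet object is callable as `backbone(images, is_training)`.
def _example_spinenet_49(images, is_training):
  """Returns multiscale endpoints for `images` from a SpineNet-49 backbone."""
  backbone = spinenet_builder(model_id=49)  # Key assumed present in SCALING_MAP.
  return backbone(images, is_training=is_training)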
def resample_feature_map(feat, level, target_level, is_training, target_feat_dims=256, conv2d_op=tf.layers.conv2d, batch_norm_activation=nn_ops.BatchNormActivation(), name=None): """Resample input feature map to have target number of channels and width.""" feat_dims = feat.get_shape().as_list()[3] with tf.variable_scope('resample_{}'.format(name)): if feat_dims != target_feat_dims: feat = conv2d_op( feat, filters=target_feat_dims, kernel_size=(1, 1), padding='same') feat = batch_norm_activation( feat, is_training=is_training, relu=False, name='bn') if level < target_level: stride = int(2**(target_level-level)) feat = tf.layers.max_pooling2d( inputs=feat, pool_size=stride, strides=[stride, stride], padding='SAME') elif level > target_level: scale = int(2**(level - target_level)) feat = spatial_transform_ops.nearest_upsampling(feat, scale=scale) return feat
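# Usage sketch (shapes illustrative): downsampling a level-3 feature map to
# level 5. The pooling stride is 2**(5 - 3) = 4, so a [8, 64, 64, 128] input
# becomes [8, 16, 16, 256] after the 1x1 channel projection and pooling.
def _example_resample_3_to_5(feat3, is_training):
  """Resamples a level-3 feature map to level 5 with 256 output channels."""
  return resample_feature_map(
      feat3, level=3, target_level=5, is_training=is_training,
      target_feat_dims=256, name='3_to_5')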
def __init__(self,
             min_level=3,
             max_level=7,
             fpn_feat_dims=256,
             use_separable_conv=False,
             use_batch_norm=True,
             batch_norm_activation=nn_ops.BatchNormActivation()):
  """FPN initialization function.

  Args:
    min_level: `int` minimum level in FPN output feature maps.
    max_level: `int` maximum level in FPN output feature maps.
    fpn_feat_dims: `int` number of filters in FPN layers.
    use_separable_conv: `bool`, if `True`, use separable convolutions in
      FPN layers.
    use_batch_norm: 'bool', indicating whether batchnorm layers are added.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
  """
  self._min_level = min_level
  self._max_level = max_level
  self._fpn_feat_dims = fpn_feat_dims
  if use_separable_conv:
    self._conv2d_op = functools.partial(
        tf.layers.separable_conv2d, depth_multiplier=1)
  else:
    self._conv2d_op = tf.layers.conv2d
  self._use_batch_norm = use_batch_norm
  self._batch_norm_activation = batch_norm_activation
def __init__(
    self,
    num_classes,
    num_attributes,
    num_convs=0,
    num_filters=256,
    use_separable_conv=False,
    num_fcs=2,
    fc_dims=1024,
    activation='relu',
    use_batch_norm=True,
    batch_norm_activation=nn_ops.BatchNormActivation(activation='relu')):
  """Initialize params to build Fast R-CNN head with attribute prediction.

  Args:
    num_classes: an integer for the number of classes.
    num_attributes: an integer for the number of attributes.
    num_convs: `int` number of intermediate conv layers before the FC
      layers.
    num_filters: `int` number of filters of the intermediate conv layers.
    use_separable_conv: `bool`, indicating whether separable conv layers
      are used.
    num_fcs: `int` number of FC layers before the predictions.
    fc_dims: `int` dimension of the FC layers.
    activation: activation function. Support 'relu' and 'swish'.
    use_batch_norm: 'bool', indicating whether batchnorm layers are added.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
  """
  self._num_classes = num_classes
  self._num_attributes = num_attributes
  self._num_convs = num_convs
  self._num_filters = num_filters
  if use_separable_conv:
    self._conv2d_op = functools.partial(
        tf.layers.separable_conv2d,
        depth_multiplier=1,
        bias_initializer=tf.zeros_initializer())
  else:
    self._conv2d_op = functools.partial(
        tf.layers.conv2d,
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2, mode='fan_out', distribution='untruncated_normal'),
        bias_initializer=tf.zeros_initializer())
  self._num_fcs = num_fcs
  self._fc_dims = fc_dims
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._use_batch_norm = use_batch_norm
  self._batch_norm_activation = batch_norm_activation
def batch_norm_activation_generator(params):
  """Returns a BatchNormActivation op configured from `params`."""
  return nn_ops.BatchNormActivation(
      momentum=params.batch_norm_momentum,
      epsilon=params.batch_norm_epsilon,
      trainable=params.batch_norm_trainable,
      use_sync_bn=params.use_sync_bn,
      activation=params.activation)
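# Usage sketch (hypothetical `params` object): the generator only reads the
# five attributes below, so any config object exposing them works; the
# values here are illustrative, not defaults from any real config.
def _example_batch_norm_activation():
  """Builds a BatchNormActivation from a throwaway params object."""
  params = type('Params', (), dict(
      batch_norm_momentum=0.997,
      batch_norm_epsilon=1e-4,
      batch_norm_trainable=True,
      use_sync_bn=False,
      activation='relu'))
  return batch_norm_activation_generator(params)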
def __init__(self,
             min_level=3,
             max_level=7,
             block_specs=build_block_specs(),
             endpoints_num_filters=48,
             use_native_resize_op=False,
             se_ratio=0.2,
             block_repeats=1,
             filter_size_scale=1.0,
             activation='swish',
             batch_norm_activation=nn_ops.BatchNormActivation(
                 activation='swish'),
             init_drop_connect_rate=None,
             data_format='channels_last'):
  """SpineNetMBConv initialization function.

  Args:
    min_level: `int` minimum level in SpineNet endpoints.
    max_level: `int` maximum level in SpineNet endpoints.
    block_specs: a list of BlockSpec objects that specifies the SpineNet
      network topology. By default, the previously discovered architecture
      is used.
    endpoints_num_filters: `int` feature dimension applied to endpoints
      before sharing conv layers in head.
    use_native_resize_op: whether to use the native
      tf.image.nearest_neighbor_resize or the broadcast implementation to
      do upsampling.
    se_ratio: squeeze and excitation ratio for MBConv blocks.
    block_repeats: `int` number of repeats per block.
    filter_size_scale: `float` a scaling factor to uniformly scale feature
      dimension in SpineNet.
    activation: the activation function after cross-scale feature fusion.
      Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    init_drop_connect_rate: `float` initial drop connect rate.
    data_format: An optional string from: "channels_last",
      "channels_first". Defaults to "channels_last".
  """
  self._min_level = min_level
  self._max_level = max_level
  self._block_specs = block_specs
  self._endpoints_num_filters = endpoints_num_filters
  self._use_native_resize_op = use_native_resize_op
  self._se_ratio = se_ratio
  self._block_repeats = block_repeats
  self._filter_size_scale = filter_size_scale
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._batch_norm_activation = batch_norm_activation
  self._init_dc_rate = init_drop_connect_rate
  self._data_format = data_format
  self._dropblock = nn_ops.Dropblock()
def __init__(self,
             min_level,
             max_level,
             anchors_per_location,
             num_convs=2,
             num_filters=256,
             use_separable_conv=False,
             activation='relu',
             use_batch_norm=True,
             batch_norm_activation=nn_ops.BatchNormActivation(
                 activation='relu')):
  """Initialize params to build Region Proposal Network head.

  Args:
    min_level: `int` number of minimum feature level.
    max_level: `int` number of maximum feature level.
    anchors_per_location: `int` number of anchors per pixel location.
    num_convs: `int` number of intermediate conv layers before the
      prediction.
    num_filters: `int` number of filters of the intermediate conv layers.
    use_separable_conv: `bool`, indicating whether separable conv layers
      are used.
    activation: activation function. Support 'relu' and 'swish'.
    use_batch_norm: 'bool', indicating whether batchnorm layers are added.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
  """
  self._min_level = min_level
  self._max_level = max_level
  self._anchors_per_location = anchors_per_location
  self._num_convs = num_convs
  self._num_filters = num_filters
  if use_separable_conv:
    self._conv2d_op = functools.partial(
        tf.layers.separable_conv2d,
        depth_multiplier=1,
        bias_initializer=tf.zeros_initializer())
  else:
    self._conv2d_op = functools.partial(
        tf.layers.conv2d,
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        bias_initializer=tf.zeros_initializer())
  self._use_batch_norm = use_batch_norm
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._batch_norm_activation = batch_norm_activation
def resample_with_sepconv(feat,
                          target_width,
                          target_num_filters,
                          use_native_resize_op=False,
                          batch_norm_activation=nn_ops.BatchNormActivation(),
                          data_format='channels_last',
                          name=None,
                          is_training=False):
  """Match resolution and feature dimension to the target block."""
  _, height, width, num_filters = feat.get_shape().as_list()
  if width is None or num_filters is None:
    raise ValueError('Shape of feat is None (shape: {}).'.format(feat.shape))

  with tf.variable_scope('resample_with_sepconv_{}'.format(name)):
    # Down-sample.
    if width > target_width:
      if width % target_width != 0:
        raise ValueError('width ({}) is not divisible by '
                         'target_width ({}).'.format(width, target_width))
      while width > target_width:
        feat = nn_ops.depthwise_conv2d_fixed_padding(
            inputs=feat, kernel_size=3, strides=2, data_format=data_format)
        feat = batch_norm_activation(feat, is_training=is_training)
        width //= 2

    # Up-sample with NN interpolation.
    elif width < target_width:
      if target_width % width != 0:
        raise ValueError('target_width ({}) is not divisible by '
                         'width ({}).'.format(target_width, width))
      scale = target_width // width
      if use_native_resize_op:
        feat = tf.image.resize_nearest_neighbor(
            feat, [height * scale, width * scale])
      else:
        feat = spatial_transform_ops.nearest_upsampling(feat, scale=scale)

    # Match feature dimension to the target block.
    feat = nn_ops.conv2d_fixed_padding(
        inputs=feat,
        filters=target_num_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    feat = batch_norm_activation(feat, relu=False, is_training=is_training)

  return feat
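# Usage sketch (shapes and filter counts illustrative): halving a 64-wide
# feature map to width 32 applies one stride-2 separable conv (64 / 32 = 2),
# then the final 1x1 conv projects the result to `target_num_filters`.
def _example_resample_sepconv(feat, is_training):
  """Downsamples `feat` from width 64 to 32 and projects to 96 filters."""
  return resample_with_sepconv(
      feat, target_width=32, target_num_filters=96,
      name='w64_to_w32', is_training=is_training)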
def block_group(inputs, filters, strides, block_fn_cand, block_repeats, activation=tf.nn.swish, batch_norm_activation=nn_ops.BatchNormActivation(), dropblock=nn_ops.Dropblock(), drop_connect_rate=None, data_format='channels_last', name=None, is_training=False): """Creates one group of blocks for SpineNet.""" block_fn_candidates = { 'bottleneck': nn_blocks.bottleneck_block, 'residual': nn_blocks.residual_block, } if block_fn_cand not in block_fn_candidates: raise ValueError('Block function {} not implemented.'.format(block_fn_cand)) block_fn = block_fn_candidates[block_fn_cand] _, _, _, num_filters = inputs.get_shape().as_list() if block_fn_cand == 'bottleneck': use_projection = not (num_filters == (filters * 4) and strides == 1) else: use_projection = not (num_filters == filters and strides == 1) # Only the first block per block_group uses projection shortcut and strides. inputs = block_fn( inputs, filters, strides, use_projection=use_projection, activation=activation, batch_norm_activation=batch_norm_activation, dropblock=dropblock, drop_connect_rate=drop_connect_rate, data_format=data_format, is_training=is_training) for _ in range(1, block_repeats): inputs = block_fn( inputs, filters, 1, use_projection=False, activation=activation, batch_norm_activation=batch_norm_activation, dropblock=dropblock, drop_connect_rate=drop_connect_rate, data_format=data_format, is_training=is_training) return tf.identity(inputs, name)
def block_group(inputs, filters, strides, block_fn, block_repeats, conv2d_op=None, activation=tf.nn.swish, batch_norm_activation=nn_ops.BatchNormActivation(), dropblock=nn_ops.Dropblock(), drop_connect_rate=None, data_format='channels_last', name=None, is_training=False): """Creates one group of blocks for NAS-FPN.""" if block_fn == 'conv': inputs = conv2d_op(inputs, filters=filters, kernel_size=(3, 3), padding='same', data_format=data_format, name='conv') inputs = batch_norm_activation(inputs, is_training=is_training, relu=False, name='bn') inputs = dropblock(inputs, is_training=is_training) return inputs if block_fn != 'bottleneck': raise ValueError('Block function {} not implemented.'.format(block_fn)) _, _, _, num_filters = inputs.get_shape().as_list() block_fn = nn_blocks.bottleneck_block use_projection = not (num_filters == (filters * 4) and strides == 1) return resnet.block_group(inputs=inputs, filters=filters, strides=strides, use_projection=use_projection, block_fn=block_fn, block_repeats=block_repeats, activation=activation, batch_norm_activation=batch_norm_activation, dropblock=dropblock, drop_connect_rate=drop_connect_rate, data_format=data_format, name=name, is_training=is_training)
def __init__(self,
             min_level,
             max_level,
             num_classes,
             anchors_per_location,
             num_convs=4,
             num_filters=256,
             use_separable_conv=False,
             activation='relu',
             use_batch_norm=True,
             batch_norm_activation=nn_ops.BatchNormActivation(
                 activation='relu')):
  """Initialize params to build RetinaNet head.

  Args:
    min_level: `int` number of minimum feature level.
    max_level: `int` number of maximum feature level.
    num_classes: `int` number of classification categories.
    anchors_per_location: `int` number of anchors per pixel location.
    num_convs: `int` number of stacked convolutions before the last
      prediction layer.
    num_filters: `int` number of filters used in the head architecture.
    use_separable_conv: `bool` to indicate whether to use separable
      convolutions.
    activation: activation function. Support 'relu' and 'swish'.
    use_batch_norm: 'bool', indicating whether batchnorm layers are added.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
  """
  self._min_level = min_level
  self._max_level = max_level
  self._num_classes = num_classes
  self._anchors_per_location = anchors_per_location
  self._num_convs = num_convs
  self._num_filters = num_filters
  self._use_separable_conv = use_separable_conv
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._use_batch_norm = use_batch_norm
  self._batch_norm_activation = batch_norm_activation
def __init__(self, block_specs=build_block_specs(), batch_norm_activation=nn_ops.BatchNormActivation(), data_format='channels_last'): """EfficientNet initialization function. Args: block_specs: a list of BlockSpec objects that specifies the EfficientNet network. By default, the previously discovered EfficientNet-A1 is used. batch_norm_activation: an operation that includes a batch normalization layer followed by an optional activation layer. data_format: An optional string from: "channels_last", "channels_first". Defaults to "channels_last". """ self._block_specs = block_specs self._batch_norm_activation = batch_norm_activation self._data_format = data_format
def block_group(inputs, in_filters, out_filters, strides, expand_ratio, block_repeats, se_ratio=0.2, batch_norm_activation=nn_ops.BatchNormActivation(), dropblock=nn_ops.Dropblock(), drop_connect_rate=None, data_format='channels_last', name=None, is_training=False): """Creates one group of blocks for Mobile SpineNet.""" # Apply strides only to the first block in block_group. inputs = nn_blocks.mbconv_block( inputs, in_filters, out_filters, expand_ratio, strides, se_ratio=se_ratio, batch_norm_activation=batch_norm_activation, dropblock=dropblock, drop_connect_rate=drop_connect_rate, data_format=data_format, is_training=is_training) for _ in range(1, block_repeats): inputs = nn_blocks.mbconv_block( inputs, out_filters, out_filters, expand_ratio, 1, # strides se_ratio=se_ratio, batch_norm_activation=batch_norm_activation, dropblock=dropblock, drop_connect_rate=drop_connect_rate, data_format=data_format, is_training=is_training) return tf.identity(inputs, name)
def __init__(self, num_classes, level, num_convs=2, upsample_factor=1, upsample_num_filters=256, activation='relu', use_batch_norm=True, batch_norm_activation=nn_ops.BatchNormActivation( activation='relu')): """Initialize params to build segmentation head. Args: num_classes: `int` number of mask classification categories. The number of classes does not include background class. level: `int` feature level used for prediction. num_convs: `int` number of stacked convolution before the last prediction layer. upsample_factor: `int` number to specify the upsampling factor to generate finer mask. Default 1 means no upsampling is applied. upsample_num_filters: `int` number to specify the number of filters used in deconv for the upsampling operation. Default is 256. activation: activation function. Support 'relu' and 'swish'. use_batch_norm: 'bool', indicating whether batchnorm layers are added. batch_norm_activation: an operation that includes a batch normalization layer followed by an optional activation layer. """ self._num_classes = num_classes self._level = level self._num_convs = num_convs self._upsample_factor = upsample_factor self._upsample_num_filters = upsample_num_filters if activation == 'relu': self._activation = tf.nn.relu elif activation == 'swish': self._activation = tf.nn.swish else: raise ValueError('Activation {} not implemented.'.format(activation)) self._use_batch_norm = use_batch_norm self._batch_norm_activation = batch_norm_activation
def resample_with_alpha(feat,
                        input_block_fn,
                        target_width,
                        target_num_filters,
                        target_block_fn,
                        alpha=1.0,
                        use_native_resize_op=False,
                        batch_norm_activation=nn_ops.BatchNormActivation(),
                        data_format='channels_last',
                        name=None,
                        is_training=False):
  """Match resolution and feature dimension to the target block."""
  _, height, width, num_filters = feat.get_shape().as_list()
  if width is None or num_filters is None:
    raise ValueError('Shape of feat is None (shape: {}).'.format(feat.shape))

  if input_block_fn == 'bottleneck':
    num_filters /= 4
  new_num_filters = int(num_filters * alpha)

  with tf.variable_scope('resample_with_alpha_{}'.format(name)):
    # First 1x1 conv to reduce feature dimension to alpha*.
    feat = nn_ops.conv2d_fixed_padding(
        inputs=feat,
        filters=new_num_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    feat = batch_norm_activation(feat, is_training=is_training)

    # Down-sample.
    if width > target_width:
      # Apply stride-2 conv to reduce feature map size to 1/2.
      feat = nn_ops.conv2d_fixed_padding(
          inputs=feat,
          filters=new_num_filters,
          kernel_size=3,
          strides=2,
          data_format=data_format)
      feat = batch_norm_activation(feat, is_training=is_training)
      # Apply maxpool to further reduce feature map size if necessary.
      if width // target_width > 2:
        if width % target_width != 0:
          stride_size = 2
        else:
          stride_size = width // target_width // 2
        feat = tf.layers.max_pooling2d(
            inputs=feat,
            pool_size=3 if width / target_width <= 4 else 5,
            strides=stride_size,
            padding='SAME',
            data_format=data_format)
      # Use NN interpolation to resize if necessary. This could happen in
      # cases where `width` is not divisible by `target_width`.
      if feat.get_shape().as_list()[2] != target_width:
        feat = spatial_transform_ops.native_resize(
            feat, [int(target_width / width * height), target_width])
    # Up-sample with NN interpolation.
    elif width < target_width:
      if target_width % width != 0 or use_native_resize_op:
        feat = spatial_transform_ops.native_resize(
            feat, [int(target_width / width * height), target_width])
      else:
        scale = target_width // width
        feat = spatial_transform_ops.nearest_upsampling(feat, scale=scale)

    # Match feature dimension to the target block.
    if target_block_fn == 'bottleneck':
      target_num_filters *= 4
    feat = nn_ops.conv2d_fixed_padding(
        inputs=feat,
        filters=target_num_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    feat = batch_norm_activation(feat, relu=False, is_training=is_training)

  return feat
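# Sketch of the alpha arithmetic in `resample_with_alpha` (numbers
# illustrative): a bottleneck input with 1024 channels is first mapped back
# to its 1024 / 4 = 256 block filters, then scaled to int(256 * alpha); with
# alpha=0.5 the intermediate 1x1 conv therefore uses 128 filters.
def _example_resample_alpha_filters(num_filters=1024, alpha=0.5):
  """Computes the intermediate filter count for a bottleneck input."""
  block_filters = num_filters // 4  # Undo the 4x bottleneck expansion.
  return int(block_filters * alpha)  # 128 for the defaults above.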
def __init__(self,
             resnet_depth,
             dropblock=nn_ops.Dropblock(),
             activation='relu',
             batch_norm_activation=nn_ops.BatchNormActivation(),
             init_drop_connect_rate=None,
             data_format='channels_last',
             space_to_depth_block_size=1):
  """ResNet initialization function.

  Args:
    resnet_depth: `int` depth of ResNet backbone model.
    dropblock: a dropblock layer.
    activation: activation function. Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    init_drop_connect_rate: a 'float' number that specifies the initial
      drop connection rate. Note that the default `None` means no drop
      connection is applied.
    data_format: `str` either "channels_first" for
      `[batch, channels, height, width]` or "channels_last" for
      `[batch, height, width, channels]`.
    space_to_depth_block_size: an integer indicating the block size of the
      space-to-depth convolution for conv0. `0` means use the original
      conv2d in ResNet.
  """
  self._resnet_depth = resnet_depth
  self._dropblock = dropblock
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._batch_norm_activation = batch_norm_activation
  self._init_drop_connect_rate = init_drop_connect_rate
  self._data_format = data_format
  self._space_to_depth_block_size = space_to_depth_block_size

  model_params = {
      10: {'block': nn_blocks.residual_block, 'layers': [1, 1, 1, 1]},
      14: {'block': nn_blocks.bottleneck_block, 'layers': [1, 1, 1, 1]},
      18: {'block': nn_blocks.residual_block, 'layers': [2, 2, 2, 2]},
      26: {'block': nn_blocks.bottleneck_block, 'layers': [2, 2, 2, 2]},
      34: {'block': nn_blocks.residual_block, 'layers': [3, 4, 6, 3]},
      50: {'block': nn_blocks.bottleneck_block, 'layers': [3, 4, 6, 3]},
      101: {'block': nn_blocks.bottleneck_block, 'layers': [3, 4, 23, 3]},
      152: {'block': nn_blocks.bottleneck_block, 'layers': [3, 8, 36, 3]},
      200: {'block': nn_blocks.bottleneck_block, 'layers': [3, 24, 36, 3]},
  }

  if resnet_depth not in model_params:
    valid_resnet_depths = ', '.join(
        [str(depth) for depth in sorted(model_params.keys())])
    raise ValueError(
        'resnet_depth should be one of [{}]; got an invalid resnet_depth: '
        '{}.'.format(valid_resnet_depths, resnet_depth))

  params = model_params[resnet_depth]
  self._resnet_fn = self.resnet_v1_generator(
      params['block'], params['layers'], self._space_to_depth_block_size)
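# Usage sketch (hypothetical; assumes the enclosing class is named `Resnet`
# and is callable as `backbone(images, is_training)`): building a ResNet-50
# backbone with swish activations.
def _example_resnet50(images, is_training):
  """Returns ResNet-50 backbone features for `images`."""
  backbone = Resnet(resnet_depth=50, activation='swish')  # Class name assumed.
  return backbone(images, is_training=is_training)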
def block_group(inputs,
                filters,
                strides,
                use_projection,
                block_fn,
                block_repeats,
                activation=tf.nn.relu,
                batch_norm_activation=nn_ops.BatchNormActivation(),
                dropblock=nn_ops.Dropblock(),
                drop_connect_rate=None,
                data_format='channels_last',
                name=None,
                is_training=False):
  """Builds one group of blocks.

  Args:
    inputs: a `Tensor` of size `[batch, channels, height, width]`.
    filters: an `int` number of filters for the first two convolutions.
    strides: an `int` block stride. If greater than 1, this block will
      ultimately downsample the input.
    use_projection: a `bool` for whether this block should use a projection
      shortcut (versus the default identity shortcut). This is usually
      `True` for the first block of a block group, which may change the
      number of filters and the resolution.
    block_fn: the `function` for the block to use within the model.
    block_repeats: an `int` number of blocks to repeat in the group.
    activation: activation function. Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    dropblock: a drop block layer that is added after convolutions. Note
      that the default implementation does not apply any drop block.
    drop_connect_rate: a 'float' number that specifies the drop connection
      rate of the block. Note that the default `None` means no drop
      connection is applied.
    data_format: a `str` that specifies the data format.
    name: a `str` name for the Tensor output of the block layer.
    is_training: a `bool` if True, the model is in training mode.

  Returns:
    The output `Tensor` of the block layer.
  """
  # Only the first block per block_group uses projection shortcut and
  # strides.
  inputs = block_fn(
      inputs,
      filters,
      strides,
      use_projection=use_projection,
      activation=activation,
      batch_norm_activation=batch_norm_activation,
      dropblock=dropblock,
      drop_connect_rate=drop_connect_rate,
      data_format=data_format,
      is_training=is_training)
  for _ in range(1, block_repeats):
    inputs = block_fn(
        inputs,
        filters,
        1,
        use_projection=False,
        activation=activation,
        batch_norm_activation=batch_norm_activation,
        dropblock=dropblock,
        drop_connect_rate=drop_connect_rate,
        data_format=data_format,
        is_training=is_training)
  return tf.identity(inputs, name)
def __call__(self, images, is_training=False):
  """Generate a multiscale feature pyramid.

  Args:
    images: The input image tensor.
    is_training: `bool` if True, the model is in training mode.

  Returns:
    a `dict` containing `int` keys for continuous feature levels
    [min_level, min_level + 1, ..., max_level]. The values are
    corresponding features with shape [batch_size, height_l, width_l,
    endpoints_num_filters].
  """
  x = images
  with tf.variable_scope('efficientnet'):
    x = nn_ops.conv2d_fixed_padding(
        inputs=x,
        filters=32,
        kernel_size=3,
        strides=2,
        data_format=self._data_format)
    x = tf.identity(x, 'initial_conv')
    x = self._batch_norm_activation(x, is_training=is_training)

    endpoints = []
    for i, block_spec in enumerate(self._block_specs):
      bn_act = nn_ops.BatchNormActivation(activation=block_spec.act_fn)
      with tf.variable_scope('block_{}'.format(i)):
        for j in range(block_spec.num_repeats):
          strides = (1 if j > 0 else
                     efficientnet_constants.EFFICIENTNET_STRIDES[i])
          if block_spec.block_fn == 'conv':
            x = nn_ops.conv2d_fixed_padding(
                inputs=x,
                filters=block_spec.output_filters,
                kernel_size=block_spec.kernel_size,
                strides=strides,
                data_format=self._data_format)
            x = bn_act(x, is_training=is_training)
          elif block_spec.block_fn == 'mbconv':
            x_shape = x.get_shape().as_list()
            in_filters = (x_shape[1]
                          if self._data_format == 'channels_first'
                          else x_shape[-1])
            x = nn_blocks.mbconv_block(
                inputs=x,
                in_filters=in_filters,
                out_filters=block_spec.output_filters,
                expand_ratio=block_spec.expand_ratio,
                strides=strides,
                kernel_size=block_spec.kernel_size,
                se_ratio=block_spec.se_ratio,
                batch_norm_activation=bn_act,
                data_format=self._data_format,
                is_training=is_training)
          elif block_spec.block_fn == 'fused_mbconv':
            x_shape = x.get_shape().as_list()
            in_filters = (x_shape[1]
                          if self._data_format == 'channels_first'
                          else x_shape[-1])
            x = nn_blocks.fused_mbconv_block(
                inputs=x,
                in_filters=in_filters,
                out_filters=block_spec.output_filters,
                expand_ratio=block_spec.expand_ratio,
                strides=strides,
                kernel_size=block_spec.kernel_size,
                se_ratio=block_spec.se_ratio,
                batch_norm_activation=bn_act,
                data_format=self._data_format,
                is_training=is_training)
          else:
            raise ValueError(
                'Unsupported block_fn `{}`!'.format(block_spec.block_fn))
        x = tf.identity(x, 'endpoints')
        endpoints.append(x)

  return {2: endpoints[1], 3: endpoints[2], 4: endpoints[4], 5: endpoints[6]}
def __init__(
    self,
    num_classes,
    num_convs=0,
    num_filters=256,
    use_separable_conv=False,
    num_fcs=2,
    fc_dims=1024,
    # For the ViLD classifier: start.
    clip_dim=512,
    classifier_weight_path=None,
    normalize_classifier=False,
    normalize_visual=False,
    temperature=1.0,
    # Feature distillation.
    visual_feature_distill=None,
    max_distill_rois=300,
    # For the ViLD classifier: end.
    activation='relu',
    use_batch_norm=True,
    batch_norm_activation=nn_ops.BatchNormActivation(activation='relu'),
    class_agnostic_bbox_pred=False):
  """Initialize params to build Fast R-CNN box head.

  Args:
    num_classes: an integer for the number of classes.
    num_convs: `int` number of intermediate conv layers before the FC
      layers.
    num_filters: `int` number of filters of the intermediate conv layers.
    use_separable_conv: `bool`, indicating whether separable conv layers
      are used.
    num_fcs: `int` number of FC layers before the predictions.
    fc_dims: `int` dimension of the FC layers.
    clip_dim: `int` dimension of the CLIP text embeddings.
    classifier_weight_path: `str` for the text embeddings used as
      classifier.
    normalize_classifier: `bool`, indicating whether to normalize the
      classifier.
    normalize_visual: `bool`, indicating whether to normalize the visual
      features used for classification.
    temperature: `float`, temperature applied to the logits.
    visual_feature_distill: None or `str` in ['vanilla', 'double_branch']
      to specify the type of visual feature distillation.
    max_distill_rois: `int`, the number of precomputed rois used for
      distillation.
    activation: activation function. Support 'relu' and 'swish'.
    use_batch_norm: 'bool', indicating whether batchnorm layers are added.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    class_agnostic_bbox_pred: `bool`, indicating whether bboxes should be
      predicted for every class or not.
  """
  self._num_classes = num_classes
  self._num_convs = num_convs
  self._num_filters = num_filters
  if use_separable_conv:
    self._conv2d_op = functools.partial(
        tf.layers.separable_conv2d,
        depth_multiplier=1,
        bias_initializer=tf.zeros_initializer())
  else:
    self._conv2d_op = functools.partial(
        tf.layers.conv2d,
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2, mode='fan_out', distribution='untruncated_normal'),
        bias_initializer=tf.zeros_initializer())
  self._num_fcs = num_fcs
  self._fc_dims = fc_dims
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._use_batch_norm = use_batch_norm
  self._batch_norm_activation = batch_norm_activation
  self._class_agnostic_bbox_pred = class_agnostic_bbox_pred
  # CLIP classifier related.
  self._clip_dim = clip_dim
  self._classifier_weight_path = classifier_weight_path
  assert tf.gfile.Exists(self._classifier_weight_path)
  self._normalize_classifier = normalize_classifier
  self._normalize_visual = normalize_visual
  self._temperature = temperature
  # Feature distillation.
  self._feat_distill = visual_feature_distill
  self._max_distill_rois = max_distill_rois
  # Both the classifier and the visual features must be normalized.
  assert self._normalize_classifier and self._normalize_visual
def fused_mbconv_block(inputs,
                       in_filters,
                       out_filters,
                       expand_ratio,
                       strides,
                       kernel_size=3,
                       se_ratio=None,
                       batch_norm_activation=nn_ops.BatchNormActivation(),
                       dropblock=nn_ops.Dropblock(),
                       drop_connect_rate=None,
                       data_format='channels_last',
                       is_training=False):
  """The fused MBConv block with BN and DropBlock after convolutions.

  Args:
    inputs: a `Tensor` of size `[batch, channels, height, width]`.
    in_filters: an `int` number of filters for the input feature map.
    out_filters: an `int` number of filters for the output feature map.
    expand_ratio: an `int` number as the feature dimension expansion ratio.
    strides: an `int` block stride. If greater than 1, this block will
      ultimately downsample the input.
    kernel_size: kernel size for the fused expansion convolution.
    se_ratio: squeeze and excitation ratio.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    dropblock: a drop block layer that is added after convolutions. Note
      that the default implementation does not apply any drop block.
    drop_connect_rate: a 'float' number that specifies the drop connection
      rate of the block. Note that the default `None` means no drop
      connection is applied.
    data_format: a `str` that specifies the data format.
    is_training: a `bool` if True, the model is in training mode.

  Returns:
    The output `Tensor` of the block.
  """
  tf.logging.info('-----> Building fused mbconv block.')
  shortcut = inputs

  # Fused expansion conv: a single kernel_size x kernel_size convolution
  # (with strides) that expands channels in place of the separate 1x1
  # expansion and depthwise convs of a regular MBConv block.
  inputs = nn_ops.conv2d_fixed_padding(
      inputs=inputs,
      filters=in_filters * expand_ratio,
      kernel_size=kernel_size,
      strides=strides,
      data_format=data_format)
  inputs = batch_norm_activation(inputs, is_training=is_training)
  inputs = dropblock(inputs, is_training=is_training)

  # Squeeze and excitation.
  if se_ratio is not None and 0 < se_ratio <= 1:
    inputs = nn_ops.squeeze_excitation(
        inputs,
        in_filters,
        se_ratio,
        expand_ratio=expand_ratio,
        data_format=data_format)

  # 1x1 projection conv back to `out_filters`.
  inputs = nn_ops.conv2d_fixed_padding(
      inputs=inputs,
      filters=out_filters,
      kernel_size=1,
      strides=1,
      data_format=data_format)
  inputs = batch_norm_activation(inputs, relu=False, is_training=is_training)
  inputs = dropblock(inputs, is_training=is_training)

  # Residual connection only when the input and output shapes match.
  if in_filters == out_filters and strides == 1:
    if drop_connect_rate:
      inputs = nn_ops.drop_connect(inputs, is_training, drop_connect_rate)
    inputs = tf.add(inputs, shortcut)

  return inputs
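# Usage sketch: an identity-shortcut fused MBConv block. The residual add
# only fires when in_filters == out_filters and strides == 1, as here;
# `x` is assumed to have 24 channels to match `in_filters`.
def _example_fused_mbconv(x, is_training):
  """Applies one identity-shortcut fused MBConv block to `x`."""
  return fused_mbconv_block(
      x, in_filters=24, out_filters=24, expand_ratio=4, strides=1,
      se_ratio=0.25, is_training=is_training)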
def bottleneck_block(inputs,
                     filters,
                     strides,
                     use_projection,
                     activation=tf.nn.relu,
                     batch_norm_activation=nn_ops.BatchNormActivation(),
                     dropblock=nn_ops.Dropblock(),
                     drop_connect_rate=None,
                     data_format='channels_last',
                     is_training=False):
  """The bottleneck block with BN and DropBlock after convolutions.

  Args:
    inputs: a `Tensor` of size `[batch, channels, height, width]`.
    filters: an `int` number of filters for the first two convolutions.
      Note that the third and final convolution will use 4 times as many
      filters.
    strides: an `int` block stride. If greater than 1, this block will
      ultimately downsample the input.
    use_projection: a `bool` for whether this block should use a projection
      shortcut (versus the default identity shortcut). This is usually
      `True` for the first block of a block group, which may change the
      number of filters and the resolution.
    activation: activation function. Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    dropblock: a drop block layer that is added after convolutions. Note
      that the default implementation does not apply any drop block.
    drop_connect_rate: a 'float' number that specifies the drop connection
      rate of the block. Note that the default `None` means no drop
      connection is applied.
    data_format: a `str` that specifies the data format.
    is_training: a `bool` if True, the model is in training mode.

  Returns:
    The output `Tensor` of the block.
  """
  logging.info('-----> Building bottleneck block.')
  shortcut = inputs
  if use_projection:
    out_filters = 4 * filters
    shortcut = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=out_filters,
        kernel_size=1,
        strides=strides,
        data_format=data_format)
    shortcut = batch_norm_activation(
        shortcut, relu=False, is_training=is_training)
    shortcut = dropblock(shortcut, is_training=is_training)

  inputs = nn_ops.conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=1,
      strides=1,
      data_format=data_format)
  inputs = batch_norm_activation(inputs, is_training=is_training)
  inputs = dropblock(inputs, is_training=is_training)

  inputs = nn_ops.conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=strides,
      data_format=data_format)
  inputs = batch_norm_activation(inputs, is_training=is_training)
  inputs = dropblock(inputs, is_training=is_training)

  inputs = nn_ops.conv2d_fixed_padding(
      inputs=inputs,
      filters=4 * filters,
      kernel_size=1,
      strides=1,
      data_format=data_format)
  inputs = batch_norm_activation(inputs, relu=False, is_training=is_training)
  inputs = dropblock(inputs, is_training=is_training)

  if drop_connect_rate:
    inputs = nn_ops.drop_connect(inputs, is_training, drop_connect_rate)

  return activation(inputs + shortcut)
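# Usage sketch (TF1 graph mode, shapes illustrative): a stride-2 bottleneck
# block with a projection shortcut maps [8, 64, 64, 256] to [8, 32, 32, 512],
# since the final conv uses 4 * filters output channels.
def _example_bottleneck_shapes():
  """Builds one stride-2 bottleneck block on a placeholder input."""
  x = tf.placeholder(tf.float32, [8, 64, 64, 256])
  return bottleneck_block(x, filters=128, strides=2, use_projection=True)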
def __init__(self,
             min_level=3,
             max_level=7,
             block_specs=build_block_specs(),
             endpoints_num_filters=256,
             resample_alpha=0.5,
             use_native_resize_op=False,
             block_repeats=1,
             filter_size_scale=1.0,
             activation='swish',
             batch_norm_activation=nn_ops.BatchNormActivation(
                 activation='swish'),
             init_drop_connect_rate=None,
             data_format='channels_last'):
  """SpineNet initialization function.

  Args:
    min_level: an `int` representing the minimum level in SpineNet
      endpoints.
    max_level: an `int` representing the maximum level in SpineNet
      endpoints.
    block_specs: a list of BlockSpec objects that specifies the SpineNet
      network topology. By default, the previously discovered architecture
      is used.
    endpoints_num_filters: an `int` representing the final feature
      dimension of endpoints before the shared conv layers in head.
    resample_alpha: a `float` representing the scaling factor to scale
      feature dimension before resolution resampling.
    use_native_resize_op: whether to use the native
      tf.image.nearest_neighbor_resize or the broadcast implementation to
      do upsampling.
    block_repeats: an `int` representing the number of repeats per block
      group.
    filter_size_scale: a `float` representing the scaling factor to
      uniformly scale feature dimension in SpineNet.
    activation: activation function. Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    init_drop_connect_rate: a 'float' number that specifies the initial
      drop connection rate. Note that the default `None` means no drop
      connection is applied.
    data_format: An optional string from: "channels_last",
      "channels_first". Defaults to "channels_last".
  """
  self._min_level = min_level
  self._max_level = max_level
  self._block_specs = block_specs
  self._endpoints_num_filters = endpoints_num_filters
  self._use_native_resize_op = use_native_resize_op
  self._resample_alpha = resample_alpha
  self._block_repeats = block_repeats
  self._filter_size_scale = filter_size_scale
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._batch_norm_activation = batch_norm_activation
  self._init_drop_connect_rate = init_drop_connect_rate
  self._data_format = data_format
  self._dropblock = nn_ops.Dropblock()  # Hard-coded to not use DropBlock.
  self._init_block_fn = 'bottleneck'
  self._num_init_blocks = 2
def __init__(self,
             min_level=3,
             max_level=7,
             block_specs=build_block_specs(),
             fpn_feat_dims=256,
             num_repeats=7,
             use_separable_conv=False,
             dropblock=nn_ops.Dropblock(),
             block_fn='conv',
             block_repeats=1,
             activation='relu',
             batch_norm_activation=nn_ops.BatchNormActivation(
                 activation='relu'),
             init_drop_connect_rate=None,
             data_format='channels_last',
             use_sum_for_combination=False):
  """NAS-FPN initialization function.

  Args:
    min_level: `int` minimum level in NAS-FPN output feature maps.
    max_level: `int` maximum level in NAS-FPN output feature maps.
    block_specs: a list of BlockSpec objects that specifies the SpineNet
      network topology. By default, the previously discovered architecture
      is used.
    fpn_feat_dims: `int` number of filters in FPN layers.
    num_repeats: number of repeats for feature pyramid network.
    use_separable_conv: `bool`, if True use separable convolution for
      convolution in NAS-FPN layers.
    dropblock: a Dropblock layer.
    block_fn: `string` type of the block group; supported values are
      'conv' and 'bottleneck'.
    block_repeats: `int` representing the number of repeats per block group
      when block group is bottleneck.
    activation: activation function. Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    init_drop_connect_rate: a 'float' number that specifies the initial
      drop connection rate. Note that the default `None` means no drop
      connection is applied.
    data_format: An optional string from: "channels_last",
      "channels_first". Defaults to "channels_last".
    use_sum_for_combination: `bool`, if True only 'sum' is used for
      combining two nodes.
  """
  self._min_level = min_level
  self._max_level = max_level
  self._block_specs = block_specs
  self._fpn_feat_dims = fpn_feat_dims
  self._num_repeats = num_repeats
  self._block_fn = block_fn
  self._block_repeats = block_repeats
  if use_separable_conv:
    self._conv2d_op = functools.partial(
        tf.layers.separable_conv2d, depth_multiplier=1)
  else:
    self._conv2d_op = tf.layers.conv2d
  self._dropblock = dropblock
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._batch_norm_activation = batch_norm_activation
  self._init_drop_connect_rate = init_drop_connect_rate
  self._data_format = data_format
  self._resample_feature_map = functools.partial(
      resample_feature_map,
      target_feat_dims=fpn_feat_dims,
      conv2d_op=self._conv2d_op,
      batch_norm_activation=batch_norm_activation,
      data_format=self._data_format)
  self._use_sum_for_combination = use_sum_for_combination
def __init__(self,
             min_level=3,
             max_level=7,
             fpn_feat_dims=256,
             num_repeats=7,
             use_separable_conv=False,
             dropblock=nn_ops.Dropblock(),
             block_fn='conv',
             block_repeats=1,
             activation='swish',
             batch_norm_activation=nn_ops.BatchNormActivation(),
             init_drop_connect_rate=None):
  """NAS-FPN initialization function.

  Args:
    min_level: `int` minimum level in NAS-FPN output feature maps.
    max_level: `int` maximum level in NAS-FPN output feature maps.
    fpn_feat_dims: `int` number of filters in FPN layers.
    num_repeats: number of repeats for feature pyramid network.
    use_separable_conv: `bool`, if True use separable convolution for
      convolution in NAS-FPN layers.
    dropblock: a Dropblock layer.
    block_fn: `string` type of the block group; supported values are
      'conv' and 'bottleneck'.
    block_repeats: `int` representing the number of repeats per block group
      when block group is bottleneck.
    activation: activation function. Support 'relu' and 'swish'.
    batch_norm_activation: an operation that includes a batch normalization
      layer followed by an optional activation layer.
    init_drop_connect_rate: a 'float' number that specifies the initial
      drop connection rate. Note that the default `None` means no drop
      connection is applied.
  """
  self._min_level = min_level
  self._max_level = max_level
  if min_level == 3 and max_level == 7:
    model_config = [
        3, 1, 1, 3,
        3, 0, 1, 5,
        4, 0, 0, 6,  # Output to level 3.
        3, 0, 6, 7,  # Output to level 4.
        2, 1, 7, 8,  # Output to level 5.
        0, 1, 6, 9,  # Output to level 7.
        1, 1, 9, 10,  # Output to level 6.
    ]
  else:
    raise ValueError('The NAS-FPN with min level {} and max level {} '
                     'is not supported.'.format(min_level, max_level))
  self._config = Config(model_config, self._min_level, self._max_level)
  self._num_repeats = num_repeats
  self._fpn_feat_dims = fpn_feat_dims
  self._block_fn = block_fn
  self._block_repeats = block_repeats
  if use_separable_conv:
    self._conv2d_op = functools.partial(
        tf.layers.separable_conv2d, depth_multiplier=1)
  else:
    self._conv2d_op = tf.layers.conv2d
  self._dropblock = dropblock
  if activation == 'relu':
    self._activation = tf.nn.relu
  elif activation == 'swish':
    self._activation = tf.nn.swish
  else:
    raise ValueError('Activation {} not implemented.'.format(activation))
  self._batch_norm_activation = batch_norm_activation
  self._init_drop_connect_rate = init_drop_connect_rate
  self._resample_feature_map = functools.partial(
      resample_feature_map,
      target_feat_dims=fpn_feat_dims,
      conv2d_op=self._conv2d_op,
      batch_norm_activation=batch_norm_activation)