def residual_block(self, inputs, filters, strides, use_projection=False,
                   is_training=False):
    """Two-convolution residual block with batch norm after each convolution.

    Args:
      inputs: input `Tensor`. Its layout is whatever `self._data_format`
        describes, since that value is forwarded to every convolution.
      filters: `int` number of filters used by both 3x3 convolutions and, when
        enabled, by the projection shortcut.
      strides: `int` stride of the first 3x3 convolution and of the projection
        shortcut. If greater than 1, this block downsamples the input.
      use_projection: `bool`. When True the shortcut is a 1x1 convolution
        followed by batch norm without ReLU; otherwise the shortcut is the
        unmodified input. This is usually `True` for the first block of a
        block group, which may change the number of filters and resolution.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    data_format = self._data_format

    def conv(tensor, kernel, stride):
        # All convolutions in this block share `filters` and the data format.
        return nn_ops.conv2d_fixed_padding(
            inputs=tensor,
            filters=filters,
            kernel_size=kernel,
            strides=stride,
            data_format=data_format)

    if use_projection:
        # Projection shortcut to match the filters and strides of the
        # residual branch.
        shortcut = self._batch_norm_relu(
            conv(inputs, 1, strides), relu=False, is_training=is_training)
    else:
        shortcut = inputs

    residual = conv(inputs, 3, strides)
    residual = self._batch_norm_relu(residual, is_training=is_training)

    residual = conv(residual, 3, 1)
    # The last batch norm is created with `init_zero=True` and without ReLU;
    # the activation is applied after the shortcut has been added.
    residual = self._batch_norm_relu(
        residual, relu=False, init_zero=True, is_training=is_training)

    return tf.nn.relu(residual + shortcut)
def model(inputs, is_training=False):
    """Builds the stem and the four block groups of the model graph.

    Args:
      inputs: the input image `Tensor`.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      A `dict` mapping the levels 2, 3, 4 and 5 to the output `Tensor` of
      block groups 1 to 4 respectively.
    """
    net = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=64,
        kernel_size=7,
        strides=2,
        data_format=self._data_format)
    net = tf.identity(net, 'initial_conv')
    net = self._batch_norm_relu(net, is_training=is_training)

    net = tf.layers.max_pooling2d(
        inputs=net,
        pool_size=3,
        strides=2,
        padding='SAME',
        data_format=self._data_format)
    net = tf.identity(net, 'initial_max_pool')

    # Group `idx` feeds on the output of group `idx - 1`; only the first
    # group keeps the resolution (stride 1), the others use stride 2.
    endpoints = {}
    for idx, num_filters in enumerate((64, 128, 256, 512)):
        net = block_group(
            inputs=net,
            filters=num_filters,
            strides=1 if idx == 0 else 2,
            use_projection=True,
            block_fn=block_fn,
            block_repeats=layers[idx],
            batch_norm_relu=self._batch_norm_relu,
            dropblock=self._dropblock,
            name='block_group{}'.format(idx + 1),
            is_training=is_training)
        endpoints[idx + 2] = net
    return endpoints
def resample_with_sepconv(feat,
                          target_width,
                          target_num_filters,
                          use_native_resize_op=False,
                          batch_norm_activation=nn_ops.BatchNormActivation(),
                          data_format='channels_last',
                          name=None,
                          is_training=False):
    """Match resolution and feature dimension to the target block.

    Down-sampling is done with repeated stride-2 depthwise convolutions, each
    followed by `batch_norm_activation`; up-sampling uses nearest-neighbor
    interpolation. A final 1x1 convolution followed by batch norm (without
    activation) produces `target_num_filters` channels.

    Args:
      feat: a 4-D `Tensor`. Its static shape is unpacked as
        `[batch, height, width, num_filters]`.
      target_width: `int` width of the output feature map.
      target_num_filters: `int` number of filters of the output feature map.
      use_native_resize_op: `bool`. When up-sampling, use
        `tf.image.resize_nearest_neighbor` instead of
        `spatial_transform_ops.nearest_upsampling`.
      batch_norm_activation: an operation applied after the convolutions.
      data_format: a `str` data format forwarded to the convolutions.
      name: suffix of the variable scope name.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The resampled `Tensor`.

    Raises:
      ValueError: if the width or the number of filters of `feat` is not
        statically known, or if the larger of `width` and `target_width` is
        not divisible by the smaller one.
    """
    # NOTE(review): the shape is unpacked in NHWC order regardless of
    # `data_format` -- confirm callers only pass 'channels_last' tensors.
    _, height, width, num_filters = feat.get_shape().as_list()
    if width is None or num_filters is None:
        raise ValueError('Shape of feat is None (shape:{}).'.format(
            feat.shape))

    with tf.variable_scope('resample_with_sepconv_{}'.format(name)):
        # Down-sample.
        if width > target_width:
            if width % target_width != 0:
                raise ValueError('width ({}) is not divisible by '
                                 'target_width ({}).'.format(
                                     width, target_width))

            while width > target_width:
                feat = nn_ops.depthwise_conv2d_fixed_padding(
                    inputs=feat,
                    kernel_size=3,
                    strides=2,
                    data_format=data_format)
                feat = batch_norm_activation(feat, is_training=is_training)
                # Integer halving keeps `width` an int under Python 3.
                width //= 2

        # Up-sample with NN interpolation.
        elif width < target_width:
            if target_width % width != 0:
                raise ValueError('target_width ({}) is not divisible by '
                                 'width ({}).'.format(target_width, width))
            scale = target_width // width
            if use_native_resize_op:
                feat = tf.image.resize_nearest_neighbor(
                    feat, [height * scale, width * scale])
            else:
                feat = spatial_transform_ops.nearest_upsampling(
                    feat, scale=scale)

        # Match feature dimension to the target block.
        feat = nn_ops.conv2d_fixed_padding(
            inputs=feat,
            filters=target_num_filters,
            kernel_size=1,
            strides=1,
            data_format=data_format)
        feat = batch_norm_activation(
            feat, relu=False, is_training=is_training)

    return feat
def _build_endpoints(self, features, is_training):
    """Match filter size for endpoints before sharing conv layers.

    Args:
      features: a mapping from level to feature `Tensor`. Every level in
        `[self._min_level, self._max_level]` is looked up.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      A `dict` keyed by level whose values are the corresponding features
      after a 1x1 convolution with `self._endpoints_num_filters` filters
      followed by `self._batch_norm_relu`.
    """

    def project(level):
        projected = nn_ops.conv2d_fixed_padding(
            inputs=features[level],
            filters=self._endpoints_num_filters,
            kernel_size=1,
            strides=1,
            data_format=self._data_format)
        return self._batch_norm_relu(projected, is_training=is_training)

    levels = range(self._min_level, self._max_level + 1)
    # Levels are processed in increasing order, so layers are created in the
    # same order as an explicit loop would create them.
    return {level: project(level) for level in levels}
def __call__(self, features, is_training):
    """Generate logits for classification.

    It takes a dict of multiscale feature maps and produces the final logits
    used for classification.

    Args:
      features: a dict of Tensors representing the multiscale feature maps
        with keys being level and values being the feature maps.
      is_training: a bool indicating whether it's in training mode.

    Returns:
      logits: a Tensor of shape [batch_size, num_classes] representing the
        prediction logits.

    Raises:
      ValueError: if `self._aggregation` is not `'top'`.
    """
    with tf.variable_scope('classification_head'):
        if self._aggregation != 'top':
            raise ValueError(
                'Un-supported aggregation type: `{}`!'.format(
                    self._aggregation))
        # 'top' aggregation: use the feature map of the highest level only.
        bottleneck = features[max(features)]

        # Optionally project to an embedding space of different dimensions.
        if self._endpoints_num_filters > 0:
            bottleneck = nn_ops.conv2d_fixed_padding(
                inputs=bottleneck,
                filters=self._endpoints_num_filters,
                kernel_size=1,
                strides=1,
                data_format=self._data_format)
            bottleneck = self._batch_norm_relu(
                bottleneck, is_training=is_training)

        # Global average pooling over the two spatial axes.
        spatial_axes = (
            [1, 2] if self._data_format == 'channels_last' else [2, 3])
        bottleneck = tf.reduce_mean(bottleneck, axis=spatial_axes)
        bottleneck = tf.identity(bottleneck, 'final_avg_pool')

        # Dropout layer. The rate must be passed by keyword: in the TF1 API
        # the second positional argument of `tf.nn.dropout` is `keep_prob`,
        # which would invert the meaning of `self._dropout_rate`.
        if is_training and self._dropout_rate > 0.0:
            bottleneck = tf.nn.dropout(bottleneck, rate=self._dropout_rate)

        # Prediction layer.
        logits = tf.layers.dense(
            inputs=bottleneck,
            units=self._num_classes,
            kernel_initializer=tf.random_normal_initializer(stddev=0.01))
        logits = tf.identity(logits, 'logits')
    return logits
def _build_stem_network(self, inputs, is_training):
    """Build the stem network.

    The stem is a stride-2 3x3 convolution followed by two block groups.

    Args:
      inputs: the input `Tensor`.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      A two-element list with the outputs of `stem_block_0` and
      `stem_block_1`, in that order.
    """

    def scaled_filters(index):
        # Entry `index` of FILTER_SIZE_MAP, scaled by the filter size scale.
        return nn_ops.round_filters(FILTER_SIZE_MAP[index],
                                    self._filter_size_scale)

    def stem_block(block_inputs, index, block_strides):
        # Block group `index` maps filter size `index` to `index + 1`.
        return block_group(
            inputs=block_inputs,
            in_filters=scaled_filters(index),
            out_filters=scaled_filters(index + 1),
            expand_ratio=DEFAULT_EXPAND_RATIO,
            block_repeats=self._block_repeats,
            strides=block_strides,
            se_ratio=self._se_ratio,
            batch_norm_activation=self._batch_norm_activation,
            dropblock=self._dropblock,
            data_format=self._data_format,
            name='stem_block_{}'.format(index),
            is_training=is_training)

    # Build the first conv layer.
    net = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=scaled_filters(0),
        kernel_size=3,
        strides=2,
        data_format=self._data_format)
    net = tf.identity(net, 'initial_conv')
    net = self._batch_norm_activation(net, is_training=is_training)

    # Build the initial L1 block and L2 block.
    first = stem_block(net, 0, 1)
    second = stem_block(first, 1, 2)
    return [first, second]
def _build_stem_network(self, inputs, is_training):
    """Build the stem network.

    The stem is a stride-2 7x7 convolution, a stride-2 3x3 max pooling and
    `self._num_init_blocks` block groups chained one after another.

    Args:
      inputs: the input `Tensor`.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      A list with the output of every initial block group, in build order.
    """
    data_format = self._data_format

    # Build the first conv and maxpooling layers.
    features = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=64,
        kernel_size=7,
        strides=2,
        data_format=data_format)
    features = tf.identity(features, 'initial_conv')
    features = self._batch_norm_activation(features, is_training=is_training)
    features = tf.layers.max_pooling2d(
        inputs=features,
        pool_size=3,
        strides=2,
        padding='SAME',
        data_format=data_format)
    features = tf.identity(features, 'initial_max_pool')

    # Build the initial level 2 blocks; block names are 1-based.
    outputs = []
    for block_number in range(1, self._num_init_blocks + 1):
        features = block_group(
            inputs=features,
            filters=int(FILTER_SIZE_MAP[2] * self._filter_size_scale),
            strides=1,
            block_fn_cand=self._init_block_fn,
            block_repeats=self._block_repeats,
            activation=self._activation,
            batch_norm_activation=self._batch_norm_activation,
            dropblock=self._dropblock,
            data_format=data_format,
            name='stem_block_{}'.format(block_number),
            is_training=is_training)
        outputs.append(features)

    return outputs
def resample_with_alpha(feat,
                        input_block_fn,
                        target_width,
                        target_num_filters,
                        target_block_fn,
                        alpha=1.0,
                        use_native_resize_op=False,
                        batch_norm_activation=nn_ops.BatchNormActivation(),
                        data_format='channels_last',
                        name=None,
                        is_training=False):
    """Match resolution and feature dimension to the target block.

    The feature is first reduced to `alpha` times its (bottleneck-adjusted)
    number of filters with a 1x1 convolution, then resized to `target_width`
    and finally projected to the target number of filters.

    Args:
      feat: a 4-D `Tensor`. Its static shape is unpacked as
        `[batch, height, width, num_filters]`.
      input_block_fn: `str` name of the block that produced `feat`. For
        `'bottleneck'` the number of filters is divided by 4 before `alpha`
        is applied.
      target_width: `int` width of the output feature map.
      target_num_filters: `int` number of filters of the target block. It is
        multiplied by 4 when `target_block_fn` is `'bottleneck'`.
      target_block_fn: `str` name of the target block.
      alpha: `float` scaling factor of the intermediate feature dimension.
      use_native_resize_op: `bool`. When up-sampling, always use
        `spatial_transform_ops.native_resize`.
      batch_norm_activation: an operation applied after the convolutions.
      data_format: a `str` data format forwarded to the conv/pool layers.
      name: suffix of the variable scope name.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      The resampled `Tensor`.

    Raises:
      ValueError: if the width or the number of filters of `feat` is not
        statically known.
    """
    # NOTE(review): the shape is unpacked in NHWC order regardless of
    # `data_format` -- confirm callers only pass 'channels_last' tensors.
    _, height, width, num_filters = feat.get_shape().as_list()
    if width is None or num_filters is None:
        raise ValueError('Shape of feat is None (shape:{}).'.format(
            feat.shape))

    if input_block_fn == 'bottleneck':
        num_filters /= 4
    new_num_filters = int(num_filters * alpha)

    def resize_to_target(tensor):
        # Scale the height by target_width / width using integer arithmetic.
        # The float form `int(target_width / width * height)` can truncate
        # one too low when the exact result is an integer (for example
        # 1 / 49 * 49 evaluates to 0.9999999999999999).
        target_height = target_width * height // width
        return spatial_transform_ops.native_resize(
            tensor, [target_height, target_width])

    with tf.variable_scope('resample_with_alpha_{}'.format(name)):
        # First 1x1 conv to reduce feature dimension to alpha*.
        feat = nn_ops.conv2d_fixed_padding(
            inputs=feat,
            filters=new_num_filters,
            kernel_size=1,
            strides=1,
            data_format=data_format)
        feat = batch_norm_activation(feat, is_training=is_training)

        # Down-sample.
        if width > target_width:
            # Apply stride-2 conv to reduce feature map size to 1/2.
            feat = nn_ops.conv2d_fixed_padding(
                inputs=feat,
                filters=new_num_filters,
                kernel_size=3,
                strides=2,
                data_format=data_format)
            feat = batch_norm_activation(feat, is_training=is_training)

            # Apply maxpool to further reduce feature map size if necessary.
            if width // target_width > 2:
                if width % target_width != 0:
                    stride_size = 2
                else:
                    stride_size = width // target_width // 2
                feat = tf.layers.max_pooling2d(
                    inputs=feat,
                    pool_size=3 if width / target_width <= 4 else 5,
                    strides=stride_size,
                    padding='SAME',
                    data_format=data_format)

            # Use NN interpolation to resize if necessary. This could happen
            # in cases where `width` is not divisible by `target_width`.
            if feat.get_shape().as_list()[2] != target_width:
                feat = resize_to_target(feat)

        # Up-sample with NN interpolation.
        elif width < target_width:
            if target_width % width != 0 or use_native_resize_op:
                feat = resize_to_target(feat)
            else:
                scale = target_width // width
                feat = spatial_transform_ops.nearest_upsampling(
                    feat, scale=scale)

        # Match feature dimension to the target block.
        if target_block_fn == 'bottleneck':
            target_num_filters *= 4
        feat = nn_ops.conv2d_fixed_padding(
            inputs=feat,
            filters=target_num_filters,
            kernel_size=1,
            strides=1,
            data_format=data_format)
        feat = batch_norm_activation(
            feat, relu=False, is_training=is_training)

    return feat
def bottleneck_block(inputs,
                     filters,
                     strides,
                     use_projection,
                     activation=tf.nn.relu,
                     batch_norm_relu=nn_ops.BatchNormRelu(),
                     dropblock=nn_ops.Dropblock(),
                     drop_connect_rate=None,
                     data_format='channels_last',
                     is_training=False):
    """The bottleneck block with BN and DropBlock after convolutions.

    The residual branch is 1x1 conv -> 3x3 conv -> 1x1 conv; each convolution
    is followed by `batch_norm_relu` and `dropblock`.

    Args:
      inputs: the input `Tensor`, laid out as described by `data_format`.
      filters: a `int` number of filters for the first two convolutions. Note
        that the third and final convolution will use 4 times as many filters.
      strides: an `int` block stride, applied by the 3x3 convolution and by
        the projection shortcut. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: a `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      activation: a callable applied to the sum of the residual branch and
        the shortcut.
      batch_norm_relu: an operation that is added after convolutions,
        including a batch norm layer and an optional relu activation.
      dropblock: a drop block layer that is added after convolutions. Note
        that the default implementation does not apply any drop block.
      drop_connect_rate: a 'float' number that specifies the drop connection
        rate of the block. Note that the default `None` means no drop
        connection is applied.
      data_format: a `str` that specifies the data format.
      is_training: a `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    logging.info('-----> Building bottleneck block.')

    def conv_bn_drop(tensor, num_filters, kernel, stride, **bn_kwargs):
        # Convolution followed by batch norm (+ optional relu) and dropblock.
        tensor = nn_ops.conv2d_fixed_padding(
            inputs=tensor,
            filters=num_filters,
            kernel_size=kernel,
            strides=stride,
            data_format=data_format)
        tensor = batch_norm_relu(
            tensor, is_training=is_training, **bn_kwargs)
        return dropblock(tensor, is_training=is_training)

    if use_projection:
        # The projection must produce as many filters as the last 1x1 conv.
        shortcut = conv_bn_drop(inputs, 4 * filters, 1, strides, relu=False)
    else:
        shortcut = inputs

    residual = conv_bn_drop(inputs, filters, 1, 1)
    residual = conv_bn_drop(residual, filters, 3, strides)
    residual = conv_bn_drop(residual, 4 * filters, 1, 1, relu=False)

    if drop_connect_rate:
        residual = nn_ops.drop_connect(
            residual, is_training, drop_connect_rate)

    return activation(residual + shortcut)
def mbconv_block(inputs,
                 in_filters,
                 out_filters,
                 expand_ratio,
                 strides,
                 use_projection,
                 kernel_size=3,
                 se_ratio=None,
                 batch_norm_relu=nn_ops.BatchNormRelu(),
                 dropblock=nn_ops.Dropblock(),
                 drop_connect_rate=None,
                 data_format='channels_last',
                 is_training=False):
    """The inverted bottleneck block with BN and DropBlock after convolutions.

    The residual branch is a 1x1 expansion conv, a depthwise conv, an optional
    squeeze-and-excitation step and a 1x1 projection conv.

    Args:
      inputs: the input `Tensor`, laid out as described by `data_format`.
      in_filters: a `int` number of filters for the input feature map.
      out_filters: a `int` number of filters for the output feature map.
      expand_ratio: a `int` number as the feature dimension expansion ratio.
      strides: a `int` block stride, applied by the depthwise convolution and
        by the projection shortcut. If greater than 1, this block will
        ultimately downsample the input.
      use_projection: a `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually
        `True` for the first block of a block group, which may change the
        number of filters and the resolution.
      kernel_size: kernel size for the depthwise convolution.
      se_ratio: squeeze and excitation ratio. Squeeze-and-excitation is only
        applied when the ratio is in `(0, 1]`.
      batch_norm_relu: an operation that is added after convolutions,
        including a batch norm layer and an optional relu activation.
      dropblock: a drop block layer that is added after convolutions. Note
        that the default implementation does not apply any drop block.
      drop_connect_rate: a 'float' number that specifies the drop connection
        rate of the block. Note that the default `None` means no drop
        connection is applied.
      data_format: a `str` that specifies the data format.
      is_training: a `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    tf.logging.info('-----> Building mbconv block.')

    shortcut = inputs
    if use_projection:
        shortcut = nn_ops.conv2d_fixed_padding(
            inputs=inputs,
            filters=out_filters,
            kernel_size=1,
            strides=strides,
            data_format=data_format)
        # No activation on the projection shortcut, as in the other residual
        # blocks: the shortcut only matches filters and resolution.
        shortcut = batch_norm_relu(
            shortcut, relu=False, is_training=is_training)
        shortcut = dropblock(shortcut, is_training=is_training)

    # First 1x1 conv for channel expansion.
    inputs = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=in_filters * expand_ratio,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    inputs = batch_norm_relu(inputs, is_training=is_training)
    inputs = dropblock(inputs, is_training=is_training)

    # Second depthwise conv.
    inputs = nn_ops.depthwise_conv2d_fixed_padding(
        inputs=inputs,
        kernel_size=kernel_size,
        strides=strides,
        data_format=data_format)
    inputs = batch_norm_relu(inputs, is_training=is_training)
    inputs = dropblock(inputs, is_training=is_training)

    # Squeeze and excitation.
    if se_ratio is not None and 0 < se_ratio <= 1:
        inputs = nn_ops.squeeze_excitation(
            inputs,
            in_filters,
            se_ratio,
            expand_ratio=expand_ratio,
            data_format=data_format)

    # Third 1x1 conv for reversed bottleneck. The projection is kept linear
    # (batch norm without relu): an activation here would clip the residual
    # to non-negative values right before it is added to the shortcut.
    inputs = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=out_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    inputs = batch_norm_relu(inputs, relu=False, is_training=is_training)
    inputs = dropblock(inputs, is_training=is_training)

    if drop_connect_rate:
        inputs = nn_ops.drop_connect(inputs, is_training, drop_connect_rate)

    return tf.add(inputs, shortcut)
def model(inputs, is_training=False):
    """Builds the stem and the four block groups of the model graph.

    Args:
      inputs: the input image `Tensor`.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      A `dict` mapping the levels 2, 3, 4 and 5 to the output `Tensor` of
      block groups 1 to 4 respectively.
    """
    stem_kwargs = dict(
        inputs=inputs,
        filters=64,
        kernel_size=7,
        strides=2,
        data_format=self._data_format)
    if space_to_depth_block_size > 1:
        # conv0 uses space-to-depth transform for TPU performance.
        net = nn_ops.conv0_space_to_depth(
            space_to_depth_block_size=space_to_depth_block_size,
            **stem_kwargs)
    else:
        net = nn_ops.conv2d_fixed_padding(**stem_kwargs)

    net = tf.identity(net, 'initial_conv')
    net = self._batch_norm_activation(net, is_training=is_training)

    net = tf.layers.max_pooling2d(
        inputs=net,
        pool_size=3,
        strides=2,
        padding='SAME',
        data_format=self._data_format)
    net = tf.identity(net, 'initial_max_pool')

    # Block group `idx` produces level `idx + 2`; each group consumes the
    # output of the previous one. Only the first group uses stride 1.
    endpoints = {}
    for idx, num_filters in enumerate((64, 128, 256, 512)):
        level = idx + 2
        net = block_group(
            inputs=net,
            filters=num_filters,
            strides=1 if idx == 0 else 2,
            use_projection=True,
            block_fn=block_fn,
            block_repeats=layers[idx],
            activation=self._activation,
            batch_norm_activation=self._batch_norm_activation,
            dropblock=self._dropblock,
            drop_connect_rate=get_drop_connect_rate(
                self._init_drop_connect_rate, level, 5),
            name='block_group{}'.format(idx + 1),
            is_training=is_training)
        endpoints[level] = net
    return endpoints
def fused_mbconv_block(inputs,
                       in_filters,
                       out_filters,
                       expand_ratio,
                       strides,
                       kernel_size=3,
                       se_ratio=None,
                       batch_norm_activation=nn_ops.BatchNormActivation(),
                       dropblock=nn_ops.Dropblock(),
                       drop_connect_rate=None,
                       data_format='channels_last',
                       is_training=False):
    """The fused bottleneck block with BN and DropBlock after convolutions.

    A single regular convolution performs both the channel expansion and the
    spatial filtering; it is followed by an optional squeeze-and-excitation
    step and a 1x1 projection convolution.

    Args:
      inputs: the input `Tensor`, laid out as described by `data_format`.
      in_filters: a `int` number of filters for the input feature map.
      out_filters: a `int` number of filters for the output feature map.
      expand_ratio: a `int` number as the feature dimension expansion ratio.
      strides: a `int` block stride, applied by the expansion convolution. If
        greater than 1, this block will ultimately downsample the input.
      kernel_size: kernel size for the expansion convolution.
      se_ratio: squeeze and excitation ratio. Squeeze-and-excitation is only
        applied when the ratio is in `(0, 1]`.
      batch_norm_activation: an operation that includes a batch normalization
        layer followed by an optional activation layer.
      dropblock: a drop block layer that is added after convolutions. Note
        that the default implementation does not apply any drop block.
      drop_connect_rate: a 'float' number that specifies the drop connection
        rate of the block. Note that the default `None` means no drop
        connection is applied.
      data_format: a `str` that specifies the data format.
      is_training: a `bool` if True, the model is in training mode.

    Returns:
      The output `Tensor` of the block.
    """
    tf.logging.info('-----> Building fused mbconv block.')

    # Fused expansion conv: expands the channels and applies the stride.
    net = nn_ops.conv2d_fixed_padding(
        inputs=inputs,
        filters=in_filters * expand_ratio,
        kernel_size=kernel_size,
        strides=strides,
        data_format=data_format)
    net = batch_norm_activation(net, is_training=is_training)
    net = dropblock(net, is_training=is_training)

    # Squeeze and excitation.
    if se_ratio is not None and 0 < se_ratio <= 1:
        net = nn_ops.squeeze_excitation(
            net,
            in_filters,
            se_ratio,
            expand_ratio=expand_ratio,
            data_format=data_format)

    # 1x1 projection conv, followed by batch norm without activation.
    net = nn_ops.conv2d_fixed_padding(
        inputs=net,
        filters=out_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    net = batch_norm_activation(net, relu=False, is_training=is_training)
    net = dropblock(net, is_training=is_training)

    # The identity shortcut is only added when the filter count and the
    # stride leave the input and output shapes compatible.
    if in_filters != out_filters or strides != 1:
        return net

    if drop_connect_rate:
        net = nn_ops.drop_connect(net, is_training, drop_connect_rate)
    return tf.add(net, inputs)
def __call__(self, images, is_training=False):
    """Generate a multiscale feature pyramid.

    Args:
      images: The input image tensor.
      is_training: `bool` if True, the model is in training mode.

    Returns:
      a `dict` with the `int` keys 2, 3, 4 and 5. The values are the outputs
      of the block specs at index 1, 2, 4 and 6 respectively, so
      `self._block_specs` must contain at least 7 entries.

    Raises:
      ValueError: if a block spec has an unsupported `block_fn`.
    """

    def num_input_filters(tensor):
        # The channel axis is 1 for 'channels_first' and last otherwise.
        shape = tensor.get_shape().as_list()
        if self._data_format == 'channels_first':
            return shape[1]
        return shape[-1]

    x = images
    with tf.variable_scope('efficientnet'):
        x = nn_ops.conv2d_fixed_padding(
            inputs=x,
            filters=32,
            kernel_size=3,
            strides=2,
            data_format=self._data_format)
        x = tf.identity(x, 'initial_conv')
        x = self._batch_norm_activation(x, is_training=is_training)

        endpoints = []
        for i, block_spec in enumerate(self._block_specs):
            bn_act = nn_ops.BatchNormActivation(
                activation=block_spec.act_fn)
            with tf.variable_scope('block_{}'.format(i)):
                for j in range(block_spec.num_repeats):
                    # Only the first repeat of a block applies its stride.
                    strides = (
                        1 if j > 0 else
                        efficientnet_constants.EFFICIENTNET_STRIDES[i])

                    if block_spec.block_fn == 'conv':
                        x = nn_ops.conv2d_fixed_padding(
                            inputs=x,
                            filters=block_spec.output_filters,
                            kernel_size=block_spec.kernel_size,
                            strides=strides,
                            data_format=self._data_format)
                        x = bn_act(x, is_training=is_training)
                    elif block_spec.block_fn == 'mbconv':
                        x = nn_blocks.mbconv_block(
                            inputs=x,
                            in_filters=num_input_filters(x),
                            out_filters=block_spec.output_filters,
                            expand_ratio=block_spec.expand_ratio,
                            strides=strides,
                            kernel_size=block_spec.kernel_size,
                            se_ratio=block_spec.se_ratio,
                            batch_norm_activation=bn_act,
                            data_format=self._data_format,
                            is_training=is_training)
                    elif block_spec.block_fn == 'fused_mbconv':
                        x = nn_blocks.fused_mbconv_block(
                            inputs=x,
                            in_filters=num_input_filters(x),
                            out_filters=block_spec.output_filters,
                            expand_ratio=block_spec.expand_ratio,
                            strides=strides,
                            kernel_size=block_spec.kernel_size,
                            se_ratio=block_spec.se_ratio,
                            batch_norm_activation=bn_act,
                            data_format=self._data_format,
                            is_training=is_training)
                    else:
                        raise ValueError(
                            'Un-supported block_fn `{}`!'.format(
                                block_spec.block_fn))
                # One endpoint per block spec, taken after its last repeat.
                x = tf.identity(x, 'endpoints')
                endpoints.append(x)

    return {
        2: endpoints[1],
        3: endpoints[2],
        4: endpoints[4],
        5: endpoints[6]
    }