def _resample_feature_map(self, inputs, input_level, target_level,
                          target_num_filters=256):
    """Projects and resizes a feature map to a target level and filter count.

    When the size of the last dimension of `inputs` differs from
    `target_num_filters`, a 1x1 convolution followed by normalization is
    applied first. The result is then max-pooled (target level above the
    input level) or nearest-neighbor upsampled (target level below the input
    level) by a factor of 2 raised to the level difference.

    Args:
      inputs: A rank-4 tensor; the size of its last dimension is compared with
        `target_num_filters`.
      input_level: Level of `inputs`.
      target_level: Level the output should be resampled to.
      target_num_filters: Number of filters the output should have.

    Returns:
      The resampled tensor. When both levels are equal, the tensor is cast to
      the compute dtype of the default Keras layer dtype policy if its dtype
      differs from it.
    """
    features = inputs
    _, _, _, current_num_filters = features.get_shape().as_list()

    # Match the filter count before touching the spatial resolution.
    if current_num_filters != target_num_filters:
        projection = self._conv_op(
            filters=target_num_filters,
            kernel_size=1,
            padding='same',
            **self._conv_kwargs)
        features = projection(features)
        features = self._norm_op(**self._norm_kwargs)(features)

    level_gap = target_level - input_level
    if level_gap > 0:
        # Going to a higher level: shrink with max pooling.
        stride = int(2 ** level_gap)
        pooling = tf.keras.layers.MaxPool2D(
            pool_size=stride, strides=stride, padding='same')
        return pooling(features)
    if level_gap < 0:
        # Going to a lower level: enlarge with nearest-neighbor upsampling.
        scale = int(2 ** (-level_gap))
        return spatial_transform_ops.nearest_upsampling(features, scale=scale)

    # Same level: the tensor may pass through untouched (float32 by default)
    # while sibling inputs were processed under the mixed precision policy
    # (e.g. bfloat16 on TPU). Cast to the policy's compute dtype so that all
    # outputs agree.
    compute_dtype = tf.keras.layers.Layer().dtype_policy.compute_dtype
    needs_cast = (compute_dtype is not None
                  and features.dtype != compute_dtype)
    return tf.cast(features, dtype=compute_dtype) if needs_cast else features
def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                             Union[tf.Tensor, Mapping[str, tf.Tensor]]]):
    """Forward pass of the segmentation head.

    `inputs[0]` holds the backbone endpoints and `inputs[1]` the decoder
    endpoints. Each of them is either a single feature map tensor or a
    dictionary holding several levels of feature maps, keyed by level.

    Args:
      inputs: A tuple of 2 feature map tensors of shape
        [batch, height_l, width_l, channels] or 2 dictionaries of tensors:
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors, whose shape is
          [batch, height_l, width_l, channels].
        The first is backbone endpoints, and the second is decoder endpoints.

    Returns:
      segmentation prediction mask: A `tf.Tensor` of the segmentation mask
      scores predicted from input features.

    Raises:
      ValueError: If `feature_fusion` is `pyramid_fusion` and the decoder
        endpoints are not a dictionary.
    """

    def _select_level(endpoints, level_key):
        # Only `dict` endpoints are indexed (by the stringified configured
        # level); anything else is taken to be the feature map itself.
        if isinstance(endpoints, dict):
            return endpoints[str(self._config_dict[level_key])]
        return endpoints

    backbone_output = inputs[0]
    decoder_output = inputs[1]
    fusion = self._config_dict['feature_fusion']

    if fusion == 'deeplabv3plus':
        # deeplabv3+ feature fusion: concatenate the decoder features with
        # projected low-level backbone features.
        x = _select_level(decoder_output, 'level')
        y = _select_level(backbone_output, 'low_level')
        y = self._activation(self._dlv3p_norm(self._dlv3p_conv(y)))
        # Bring the decoder features to the spatial size and dtype of `y`.
        x = tf.image.resize(
            x, tf.shape(y)[1:3], method=tf.image.ResizeMethod.BILINEAR)
        x = tf.cast(x, dtype=y.dtype)
        x = tf.concat([x, y], axis=self._bn_axis)
    elif fusion == 'pyramid_fusion':
        if not isinstance(decoder_output, dict):
            raise ValueError('Only support dictionary decoder_output.')
        x = nn_layers.pyramid_feature_fusion(
            decoder_output, self._config_dict['level'])
    elif fusion == 'panoptic_fpn_fusion':
        x = self._panoptic_fpn_fusion(decoder_output)
    else:
        x = _select_level(decoder_output, 'level')

    # Conv -> norm -> activation stack.
    for conv, norm in zip(self._convs, self._norms):
        x = self._activation(norm(conv(x)))

    upsample_factor = self._config_dict['upsample_factor']
    if upsample_factor > 1:
        x = spatial_transform_ops.nearest_upsampling(x, scale=upsample_factor)

    return self._classifier(x)
def _resample_with_sepconv(self, inputs, input_width, target_width,
                           target_num_filters):
    """Matches resolution and feature dimension.

    The input is first resampled spatially: while the tracked width is larger
    than `target_width`, a stride-2 3x3 depthwise convolution, normalization
    and activation are applied and the tracked width is halved; when it is
    smaller, nearest-neighbor upsampling by `target_width // input_width` is
    applied once. A final 1x1 convolution plus normalization produces
    `target_num_filters` filters.

    Args:
      inputs: The feature map to resample.
      input_width: Width associated with `inputs`.
      target_width: Width to resample to.
      target_num_filters: Number of filters of the final 1x1 convolution.

    Returns:
      The resampled and projected feature map.
    """

    def _make_norm():
        # A fresh normalization layer is built for every use.
        return self._norm(
            axis=self._bn_axis,
            momentum=self._norm_momentum,
            epsilon=self._norm_epsilon)

    x = inputs
    width = input_width

    # Spatial resampling.
    if width < target_width:
        scale = target_width // width
        x = spatial_transform_ops.nearest_upsampling(
            x, scale=scale, use_keras_layer=self._use_keras_upsampling_2d)
    else:
        # Does nothing when the widths already match.
        while width > target_width:
            # NOTE(review): Keras documents `depthwise_initializer` /
            # `depthwise_regularizer` for the DepthwiseConv2D kernel; confirm
            # that the `kernel_*` arguments below actually take effect.
            downsample = layers.DepthwiseConv2D(
                kernel_size=3,
                strides=2,
                padding='SAME',
                use_bias=False,
                kernel_initializer=self._kernel_initializer,
                kernel_regularizer=self._kernel_regularizer,
                bias_regularizer=self._bias_regularizer)
            x = downsample(x)
            x = _make_norm()(x)
            x = tf_utils.get_activation(
                self._activation, use_keras_layer=True)(x)
            width /= 2

    # Last 1x1 conv to match filter size.
    projection = layers.Conv2D(
        filters=target_num_filters,
        kernel_size=1,
        strides=1,
        use_bias=False,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer)
    x = projection(x)
    return _make_norm()(x)
def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                             Union[tf.Tensor, Mapping[str, tf.Tensor]]],
         training=None):
    """Forward pass of the head.

    The input is a tuple of backbone endpoints followed by decoder endpoints.
    Each is either a single-level feature map tensor or a dictionary with
    several levels of feature maps, keyed by the level index.

    Args:
      inputs: A tuple of 2 feature map tensors of shape
        [batch, height_l, width_l, channels] or 2 dictionaries of tensors:
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors, whose shape is
          [batch, height_l, width_l, channels].
      training: A bool, runs the model in training/eval mode. When `None`, the
        Keras backend learning phase is used.

    Returns:
      A `tf.Tensor` of the fused backbone and decoder features.
    """
    if training is None:
        training = tf.keras.backend.learning_phase()

    fused = self._panoptic_deeplab_fusion(inputs, training=training)

    # Conv -> norm -> activation stack; only the norm layers receive the
    # training flag.
    for conv, norm in zip(self._convs, self._norms):
        fused = self._activation(norm(conv(fused), training=training))

    upsample_factor = self._config_dict['upsample_factor']
    if upsample_factor > 1:
        fused = spatial_transform_ops.nearest_upsampling(
            fused, scale=upsample_factor)
    return fused
def __init__(self,
             input_specs: Mapping[str, tf.TensorShape],
             min_level: int = 3,
             max_level: int = 7,
             num_filters: int = 256,
             fusion_type: str = 'sum',
             use_separable_conv: bool = False,
             use_keras_layer: bool = False,
             activation: str = 'relu',
             use_sync_bn: bool = False,
             norm_momentum: float = 0.99,
             norm_epsilon: float = 0.001,
             kernel_initializer: str = 'VarianceScaling',
             kernel_regularizer: Optional[
                 tf.keras.regularizers.Regularizer] = None,
             bias_regularizer: Optional[
                 tf.keras.regularizers.Regularizer] = None,
             **kwargs):
    """Initializes a Feature Pyramid Network (FPN).

    Args:
      input_specs: A `dict` of input specifications. A dictionary consists of
        {level: TensorShape} from a backbone.
      min_level: An `int` of minimum level in FPN output feature maps.
      max_level: An `int` of maximum level in FPN output feature maps.
      num_filters: An `int` number of filters in FPN layers.
      fusion_type: A `str` of `sum` or `concat`. Whether performing sum or
        concat for feature fusion.
      use_separable_conv: A `bool`. If True use separable convolution for
        convolution in FPN layers.
      use_keras_layer: A `bool`. If True use keras layers as many as possible.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool`. If True, use synchronized batch normalization.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_initializer: A `str` name of kernel_initializer for convolutional
        layers.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.

    Raises:
      ValueError: If `fusion_type` is neither `sum` nor `concat`; raised while
        building the top-down path.
    """
    self._config_dict = {
        'input_specs': input_specs,
        'min_level': min_level,
        'max_level': max_level,
        'num_filters': num_filters,
        'fusion_type': fusion_type,
        'use_separable_conv': use_separable_conv,
        'use_keras_layer': use_keras_layer,
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'kernel_initializer': kernel_initializer,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
    }

    if use_separable_conv:
        conv2d = tf.keras.layers.SeparableConv2D
    else:
        conv2d = tf.keras.layers.Conv2D
    if use_sync_bn:
        norm = tf.keras.layers.experimental.SyncBatchNormalization
    else:
        norm = tf.keras.layers.BatchNormalization
    activation_fn = tf_utils.get_activation(activation, use_keras_layer=True)

    # Arguments shared by every convolution built below.
    conv_kwargs = dict(
        filters=num_filters,
        padding='same',
        kernel_initializer=kernel_initializer,
        kernel_regularizer=kernel_regularizer,
        bias_regularizer=bias_regularizer)

    # Build input feature pyramid.
    if tf.keras.backend.image_data_format() == 'channels_last':
        bn_axis = -1
    else:
        bn_axis = 1

    # Get input feature pyramid from backbone.
    logging.info('FPN input_specs: %s', input_specs)
    inputs = self._build_input_pyramid(input_specs, min_level)
    # Level keys are strings, so they must be compared as integers: a plain
    # string max() is lexicographic and would rank '9' above '10'.
    backbone_max_level = min(
        max(int(level) for level in inputs.keys()), max_level)

    # Build lateral connections.
    feats_lateral = {}
    for level in range(min_level, backbone_max_level + 1):
        feats_lateral[str(level)] = conv2d(
            kernel_size=1, **conv_kwargs)(inputs[str(level)])

    # Build top-down path, starting from the coarsest backbone level.
    feats = {
        str(backbone_max_level): feats_lateral[str(backbone_max_level)]
    }
    for level in range(backbone_max_level - 1, min_level - 1, -1):
        feat_a = spatial_transform_ops.nearest_upsampling(
            feats[str(level + 1)], 2, use_keras_layer=use_keras_layer)
        feat_b = feats_lateral[str(level)]

        if fusion_type == 'sum':
            if use_keras_layer:
                feats[str(level)] = tf.keras.layers.Add()([feat_a, feat_b])
            else:
                feats[str(level)] = feat_a + feat_b
        elif fusion_type == 'concat':
            if use_keras_layer:
                feats[str(level)] = tf.keras.layers.Concatenate(axis=-1)(
                    [feat_a, feat_b])
            else:
                feats[str(level)] = tf.concat([feat_a, feat_b], axis=-1)
        else:
            raise ValueError(
                'Fusion type {} not supported.'.format(fusion_type))

    # TODO(xianzhi): consider to remove bias in conv2d.
    # Build post-hoc 3x3 convolution kernel.
    for level in range(min_level, backbone_max_level + 1):
        feats[str(level)] = conv2d(
            strides=1, kernel_size=3, **conv_kwargs)(feats[str(level)])

    # TODO(xianzhi): consider to remove bias in conv2d.
    # Build coarser FPN levels introduced for RetinaNet.
    for level in range(backbone_max_level + 1, max_level + 1):
        feats_in = feats[str(level - 1)]
        # The first extra level consumes the previous level directly; later
        # ones apply the activation first.
        if level > backbone_max_level + 1:
            feats_in = activation_fn(feats_in)
        feats[str(level)] = conv2d(
            strides=2, kernel_size=3, **conv_kwargs)(feats_in)

    # Apply batch norm layers.
    for level in range(min_level, max_level + 1):
        feats[str(level)] = norm(
            axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
                feats[str(level)])

    self._output_specs = {
        str(level): feats[str(level)].get_shape()
        for level in range(min_level, max_level + 1)
    }

    super(FPN, self).__init__(inputs=inputs, outputs=feats, **kwargs)
def _resample_with_alpha(self, inputs, input_width, input_block_fn,
                         target_width, target_num_filters, target_block_fn,
                         alpha=0.5):
    """Matches resolution and feature dimension.

    The input is first projected by a 1x1 convolution to
    `int(input_filters * alpha)` filters, where `input_filters` is the size of
    the last dimension of `inputs` (divided by 4 when `input_block_fn` is
    'bottleneck'), followed by normalization and activation. It is then
    resampled spatially and finally projected by another 1x1 convolution plus
    normalization to `target_num_filters` filters (multiplied by 4 when
    `target_block_fn` is 'bottleneck').

    Args:
      inputs: A rank-4 feature map tensor.
      input_width: Width associated with `inputs`.
      input_block_fn: Name of the block function that produced `inputs`.
      target_width: Width to resample to.
      target_num_filters: Base number of output filters.
      target_block_fn: Name of the block function consuming the output.
      alpha: Scaling factor applied to the intermediate filter count.

    Returns:
      The resampled and projected feature map.
    """

    def _conv(filters, kernel_size, strides, **extra):
        # All convolutions here are bias-free and share the same
        # initializer/regularizer settings.
        return layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            use_bias=False,
            kernel_initializer=self._kernel_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            **extra)

    def _make_norm():
        return self._norm(
            axis=self._bn_axis,
            momentum=self._norm_momentum,
            epsilon=self._norm_epsilon)

    _, _, _, base_filters = inputs.get_shape().as_list()
    if input_block_fn == 'bottleneck':
        base_filters = base_filters / 4
    new_num_filters = int(base_filters * alpha)

    # Initial 1x1 projection to the alpha-scaled filter count.
    x = _conv(new_num_filters, kernel_size=1, strides=1)(inputs)
    x = _make_norm()(x)
    x = tf_utils.get_activation(self._activation_fn)(x)

    # Spatial resampling.
    width = input_width
    if width < target_width:
        scale = target_width // width
        x = spatial_transform_ops.nearest_upsampling(x, scale=scale)
    elif width > target_width:
        # The first halving uses a strided 3x3 convolution; any remaining
        # halvings use max pooling.
        x = _conv(new_num_filters, kernel_size=3, strides=2,
                  padding='SAME')(x)
        x = _make_norm()(x)
        x = tf_utils.get_activation(self._activation_fn)(x)
        width /= 2
        while width > target_width:
            x = layers.MaxPool2D(pool_size=3, strides=2, padding='SAME')(x)
            width /= 2

    # Last 1x1 conv to match filter size.
    if target_block_fn == 'bottleneck':
        output_filters = target_num_filters * 4
    else:
        output_filters = target_num_filters
    x = _conv(output_filters, kernel_size=1, strides=1)(x)
    return _make_norm()(x)