def __init__(
    self,
    input_shape: ShapeSpec,
    *,
    conv_dims: List[int],
    fc_dims: List[int],
    conv_norm="",
    box_head_depthwise_convs=False,
    box_head_depthwise_double_activation=False,
):
    """
    Build a head made of 3x3 conv layers followed by fully connected layers.

    NOTE: this interface is experimental.

    Args:
        input_shape (ShapeSpec): shape of the input feature.
        conv_dims (list[int]): the output dimensions of the conv layers
        fc_dims (list[int]): the output dimensions of the fc layers
        conv_norm (str or callable): normalization for the conv layers.
            See :func:`detectron2.layers.get_norm` for supported types.
        box_head_depthwise_convs (bool): if True, every conv layer is a
            DepthwiseSeparableConv2d instead of a Conv2d, and the
            c2_msra_fill pass over the conv layers is skipped.
        box_head_depthwise_double_activation (bool): only consulted when
            ``box_head_depthwise_convs`` is True. If True, ``conv_norm`` and
            a ReLU are also passed as ``norm1`` / ``activation1`` of each
            DepthwiseSeparableConv2d; otherwise both are None.
    """
    super().__init__()
    assert len(conv_dims) + len(fc_dims) > 0

    # Running output shape: (C, H, W) while stacking convs, a plain int once
    # the first fc layer has been added.
    self._output_size = (input_shape.channels, input_shape.height, input_shape.width)

    self.conv_norm_relus = []
    for position, out_channels in enumerate(conv_dims, start=1):
        in_channels = self._output_size[0]
        if box_head_depthwise_convs:
            if box_head_depthwise_double_activation:
                first_norm = conv_norm
                first_activation = nn.ReLU()
            else:
                first_norm = None
                first_activation = None
            conv = DepthwiseSeparableConv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                padding=1,
                norm1=first_norm,
                activation1=first_activation,
                norm2=conv_norm,
                activation2=nn.ReLU(),
            )
        else:
            conv = Conv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                padding=1,
                bias=not conv_norm,
                norm=get_norm(conv_norm, out_channels),
                activation=nn.ReLU(),
            )
        self.add_module("conv{}".format(position), conv)
        self.conv_norm_relus.append(conv)
        # 3x3 kernel with padding 1: only the channel count changes.
        self._output_size = (out_channels, self._output_size[1], self._output_size[2])

    self.fcs = []
    for position, out_features in enumerate(fc_dims, start=1):
        if position == 1:
            # Registered once, right before the first fc layer.
            self.add_module("flatten", nn.Flatten())
        in_features = int(np.prod(self._output_size))
        fc = nn.Linear(in_features, out_features)
        self.add_module("fc{}".format(position), fc)
        self.add_module("fc_relu{}".format(position), nn.ReLU())
        self.fcs.append(fc)
        self._output_size = out_features

    # Depthwise-separable layers keep whatever initialization their own
    # constructor applied; only plain Conv2d layers get the MSRA fill.
    if not box_head_depthwise_convs:
        for conv in self.conv_norm_relus:
            weight_init.c2_msra_fill(conv)
    for fc in self.fcs:
        weight_init.c2_xavier_fill(fc)
def __init__(self, cfg: CfgNode, input_channels: int):
    """
    Initialize DensePose fully convolutional head

    Stacks ``NUM_STACKED_CONVS`` conv layers. When
    ``DEPTHWISE.DEPTHWISE_ON`` is set each layer is a
    DepthwiseSeparableConv2d, otherwise a Conv2d.

    Args:
        cfg (CfgNode): configuration options
        input_channels (int): number of input channels
    """
    super(DensePoseV1ConvXHead, self).__init__()

    head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
    depthwise_cfg = head_cfg.DEPTHWISE

    hidden_dim = head_cfg.CONV_HEAD_DIM
    kernel_size = head_cfg.CONV_HEAD_KERNEL
    self.n_stacked_convs = head_cfg.NUM_STACKED_CONVS
    self.depthwise_on = depthwise_cfg.DEPTHWISE_ON
    depthwise_norms = depthwise_cfg.NORMS
    depthwise_activations = depthwise_cfg.ACTIVATIONS

    padding = kernel_size // 2
    in_channels = input_channels
    for layer_index in range(self.n_stacked_convs):
        if self.depthwise_on:
            # Element 0 configures the first stage (norm1/activation1),
            # element 1 the second stage (norm2/activation2).
            first_activation = nn.ReLU() if depthwise_activations[0] else None
            second_activation = nn.ReLU() if depthwise_activations[1] else None
            layer = DepthwiseSeparableConv2d(
                in_channels,
                hidden_dim,
                kernel_size=kernel_size,
                padding=padding,
                norm1=depthwise_norms[0],
                activation1=first_activation,
                norm2=depthwise_norms[1],
                activation2=second_activation,
            )
        else:
            layer = Conv2d(in_channels, hidden_dim, kernel_size, stride=1, padding=padding)
        self.add_module(self._get_layer_name(layer_index), layer)  # pyre-ignore[16]
        in_channels = hidden_dim

    # Equals input_channels when no conv layer was stacked.
    self.n_out_channels = in_channels

    # The shared parameter initialization only runs for the plain Conv2d stack.
    if not self.depthwise_on:
        initialize_module_params(self)
def __init__(
    self,
    input_shape: Dict[str, ShapeSpec],
    *,
    decoder_channels: List[int],
    norm: Union[str, Callable],
    head_channels: int,
    center_loss_weight: float,
    offset_loss_weight: float,
    **kwargs,
):
    """
    Build the center-prediction and offset-prediction branches.

    NOTE: this interface is experimental.

    Args:
        input_shape (dict[str, ShapeSpec]): shape of the input features.
        decoder_channels (list[int]): a list of output channels of each
            decoder stage. It should have the same length as "in_features"
            (each element in "in_features" corresponds to one decoder stage).
        norm (str or callable): normalization for all conv layers.
        head_channels (int): the output channels of extra convolutions
            between decoder and predictor.
        center_loss_weight (float): loss weight for center point prediction.
        offset_loss_weight (float): loss weight for center offset prediction.
        **kwargs: forwarded unchanged to the base class constructor.
    """
    super().__init__(input_shape, decoder_channels=decoder_channels, norm=norm, **kwargs)
    assert self.decoder_only

    self.center_loss_weight = center_loss_weight
    self.offset_loss_weight = offset_loss_weight

    use_bias = norm == ""
    stem_channels = decoder_channels[0]

    def conv3x3(in_channels, out_channels):
        # 3x3 conv + norm + ReLU used by both heads.
        return Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            bias=use_bias,
            norm=get_norm(norm, out_channels),
            activation=F.relu,
        )

    # Center branch: the `head` is an additional transform before the
    # predictor. It is always built from two plain 3x3 convs.
    self.center_head = nn.Sequential(
        conv3x3(stem_channels, stem_channels),
        conv3x3(stem_channels, head_channels),
    )
    for conv in self.center_head:
        weight_init.c2_xavier_fill(conv)
    self.center_predictor = Conv2d(head_channels, 1, kernel_size=1)
    nn.init.normal_(self.center_predictor.weight, 0, 0.001)
    nn.init.constant_(self.center_predictor.bias, 0)

    # Offset branch: same layout, optionally depthwise separable.
    if self.use_depthwise_separable_conv:
        # A single 5x5 DepthwiseSeparableConv2d replaces the two 3x3 Conv2d
        # since they have the same receptive field.
        self.offset_head = DepthwiseSeparableConv2d(
            stem_channels,
            head_channels,
            kernel_size=5,
            padding=2,
            norm1=norm,
            activation1=F.relu,
            norm2=norm,
            activation2=F.relu,
        )
    else:
        self.offset_head = nn.Sequential(
            conv3x3(stem_channels, stem_channels),
            conv3x3(stem_channels, head_channels),
        )
        for conv in self.offset_head:
            weight_init.c2_xavier_fill(conv)
    self.offset_predictor = Conv2d(head_channels, 2, kernel_size=1)
    nn.init.normal_(self.offset_predictor.weight, 0, 0.001)
    nn.init.constant_(self.offset_predictor.bias, 0)

    # Both losses are unreduced (reduction="none").
    self.center_loss = nn.MSELoss(reduction="none")
    self.offset_loss = nn.L1Loss(reduction="none")
def __init__(
    self,
    input_shape: Dict[str, ShapeSpec],
    *,
    decoder_channels: List[int],
    norm: Union[str, Callable],
    head_channels: int,
    loss_weight: float,
    loss_type: str,
    loss_top_k: float,
    ignore_value: int,
    num_classes: int,
    **kwargs,
):
    """
    Build the extra head, the 1x1 class predictor and the training loss.

    NOTE: this interface is experimental.

    Args:
        input_shape (dict[str, ShapeSpec]): shape of the input features.
        decoder_channels (list[int]): a list of output channels of each
            decoder stage. It should have the same length as "in_features"
            (each element in "in_features" corresponds to one decoder stage).
        norm (str or callable): normalization for all conv layers.
        head_channels (int): the output channels of extra convolutions
            between decoder and predictor.
        loss_weight (float): loss weight.
        loss_type (str): "cross_entropy" or "hard_pixel_mining".
        loss_top_k (float): setting the top k% hardest pixels for
            "hard_pixel_mining" loss.
        ignore_value (int): forwarded to the base class and used as the
            ignored label of the loss.
        num_classes (int): number of output channels of the predictor.
        **kwargs: forwarded unchanged to the base class constructor.

    Raises:
        ValueError: if ``loss_type`` is not one of the two supported values.
    """
    super().__init__(
        input_shape,
        decoder_channels=decoder_channels,
        norm=norm,
        ignore_value=ignore_value,
        **kwargs,
    )
    assert self.decoder_only

    self.loss_weight = loss_weight
    use_bias = norm == ""
    stem_channels = decoder_channels[0]

    # `head` is an additional transform before the predictor.
    if self.use_depthwise_separable_conv:
        # A single 5x5 DepthwiseSeparableConv2d replaces the two 3x3 Conv2d
        # since they have the same receptive field.
        self.head = DepthwiseSeparableConv2d(
            stem_channels,
            head_channels,
            kernel_size=5,
            padding=2,
            norm1=norm,
            activation1=F.relu,
            norm2=norm,
            activation2=F.relu,
        )
    else:
        first_conv = Conv2d(
            stem_channels,
            stem_channels,
            kernel_size=3,
            padding=1,
            bias=use_bias,
            norm=get_norm(norm, stem_channels),
            activation=F.relu,
        )
        second_conv = Conv2d(
            stem_channels,
            head_channels,
            kernel_size=3,
            padding=1,
            bias=use_bias,
            norm=get_norm(norm, head_channels),
            activation=F.relu,
        )
        self.head = nn.Sequential(first_conv, second_conv)
        for conv in self.head:
            weight_init.c2_xavier_fill(conv)

    self.predictor = Conv2d(head_channels, num_classes, kernel_size=1)
    nn.init.normal_(self.predictor.weight, 0, 0.001)
    nn.init.constant_(self.predictor.bias, 0)

    if loss_type == "cross_entropy":
        self.loss = nn.CrossEntropyLoss(reduction="mean", ignore_index=ignore_value)
    elif loss_type == "hard_pixel_mining":
        self.loss = DeepLabCE(ignore_label=ignore_value, top_k_percent_pixels=loss_top_k)
    else:
        raise ValueError("Unexpected loss type: %s" % loss_type)
def __init__(
    self,
    input_shape: Dict[str, ShapeSpec],
    *,
    in_features: List[str],
    project_channels: List[int],
    aspp_dilations: List[int],
    aspp_dropout: float,
    decoder_channels: List[int],
    common_stride: int,
    norm: Union[str, Callable],
    train_size: Optional[Tuple],
    loss_weight: float = 1.0,
    loss_type: str = "cross_entropy",
    ignore_value: int = -1,
    num_classes: Optional[int] = None,
    use_depthwise_separable_conv: bool = False,
):
    """
    Build one decoder stage per input feature, plus an optional predictor.

    NOTE: this interface is experimental.

    Args:
        input_shape (dict[str, ShapeSpec]): shape of the input features.
        in_features (list[str]): a list of input feature names, the last
            name of "in_features" is used as the input to the decoder (i.e.
            the ASPP module) and the rest of "in_features" are low-level
            features for the intermediate levels of the decoder.
            "in_features" should be ordered from highest resolution to
            lowest resolution. For example: ["res2", "res3", "res4", "res5"].
        project_channels (list[int]): a list of low-level feature channels.
            The length should be len(in_features) - 1.
        aspp_dilations (list(int)): a list of 3 dilations in ASPP.
        aspp_dropout (float): apply dropout on the output of ASPP.
        decoder_channels (list[int]): a list of output channels of each
            decoder stage. It should have the same length as "in_features"
            (each element in "in_features" corresponds to one decoder stage).
        common_stride (int): output stride of decoder.
        norm (str or callable): normalization for all conv layers.
        train_size (tuple): (height, width) of training images.
        loss_weight (float): loss weight.
        loss_type (str): type of loss function, 2 options:
            (1) "cross_entropy" is the standard cross entropy loss.
            (2) "hard_pixel_mining" is the loss in DeepLab that samples
                top k% hardest pixels.
        ignore_value (int): category to be ignored during training.
        num_classes (int): number of classes, if set to None, the decoder
            will not construct a predictor.
        use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
            in ASPP and decoder.

    Raises:
        ValueError: if ``train_size`` is not divisible by the stride of the
            last input feature, or if a predictor is built and ``loss_type``
            is not one of the two supported values.
    """
    super().__init__()

    self.in_features = in_features  # starting from "res2" to "res5"
    self.ignore_value = ignore_value
    self.common_stride = common_stride  # output stride
    self.loss_weight = loss_weight
    self.loss_type = loss_type
    self.decoder_only = num_classes is None
    self.use_depthwise_separable_conv = use_depthwise_separable_conv

    in_channels = [input_shape[name].channels for name in self.in_features]
    aspp_channels = decoder_channels[-1]
    last_stage = len(self.in_features) - 1

    assert len(project_channels) == last_stage, "Expected {} project_channels, got {}".format(
        len(self.in_features) - 1, len(project_channels)
    )
    assert len(decoder_channels) == len(
        self.in_features
    ), "Expected {} decoder_channels, got {}".format(
        len(self.in_features), len(decoder_channels)
    )

    self.decoder = nn.ModuleDict()
    use_bias = norm == ""

    for idx, in_channel in enumerate(in_channels):
        stage = nn.ModuleDict()

        if idx == last_stage:
            # Last input feature: ASPP module, nothing to fuse with.
            if train_size is None:
                pool_kernel_size = None
            else:
                train_h, train_w = train_size
                encoder_stride = input_shape[self.in_features[-1]].stride
                if train_h % encoder_stride or train_w % encoder_stride:
                    raise ValueError("Crop size need to be divisible by encoder stride.")
                pool_kernel_size = (train_h // encoder_stride, train_w // encoder_stride)
            project_conv = ASPP(
                in_channel,
                aspp_channels,
                aspp_dilations,
                norm=norm,
                activation=F.relu,
                pool_kernel_size=pool_kernel_size,
                dropout=aspp_dropout,
                use_depthwise_separable_conv=use_depthwise_separable_conv,
            )
            fuse_conv = None
        else:
            # Low-level feature: 1x1 projection, then a fuse block.
            low_level_channels = project_channels[idx]
            stage_channels = decoder_channels[idx]
            fuse_in_channels = low_level_channels + decoder_channels[idx + 1]

            project_conv = Conv2d(
                in_channel,
                low_level_channels,
                kernel_size=1,
                bias=use_bias,
                norm=get_norm(norm, low_level_channels),
                activation=F.relu,
            )
            weight_init.c2_xavier_fill(project_conv)

            if use_depthwise_separable_conv:
                # A single 5x5 DepthwiseSeparableConv2d replaces the two
                # 3x3 Conv2d since they have the same receptive field,
                # proposed in :paper:`Panoptic-DeepLab`.
                fuse_conv = DepthwiseSeparableConv2d(
                    fuse_in_channels,
                    stage_channels,
                    kernel_size=5,
                    padding=2,
                    norm1=norm,
                    activation1=F.relu,
                    norm2=norm,
                    activation2=F.relu,
                )
            else:
                fuse_conv = nn.Sequential(
                    Conv2d(
                        fuse_in_channels,
                        stage_channels,
                        kernel_size=3,
                        padding=1,
                        bias=use_bias,
                        norm=get_norm(norm, stage_channels),
                        activation=F.relu,
                    ),
                    Conv2d(
                        stage_channels,
                        stage_channels,
                        kernel_size=3,
                        padding=1,
                        bias=use_bias,
                        norm=get_norm(norm, stage_channels),
                        activation=F.relu,
                    ),
                )
                for conv in fuse_conv:
                    weight_init.c2_xavier_fill(conv)

        stage["project_conv"] = project_conv
        stage["fuse_conv"] = fuse_conv
        self.decoder[self.in_features[idx]] = stage

    if self.decoder_only:
        return

    # Predictor and loss are only built when num_classes was given.
    self.predictor = Conv2d(
        decoder_channels[0], num_classes, kernel_size=1, stride=1, padding=0
    )
    nn.init.normal_(self.predictor.weight, 0, 0.001)
    nn.init.constant_(self.predictor.bias, 0)

    if self.loss_type == "cross_entropy":
        self.loss = nn.CrossEntropyLoss(reduction="mean", ignore_index=self.ignore_value)
    elif self.loss_type == "hard_pixel_mining":
        self.loss = DeepLabCE(ignore_label=self.ignore_value, top_k_percent_pixels=0.2)
    else:
        raise ValueError("Unexpected loss type: %s" % self.loss_type)
def test_separable_conv(self):
    """Smoke test: build a DepthwiseSeparableConv2d with BN and PReLU on its first stage.

    There are no assertions; the test passes as long as construction does
    not raise.
    """
    first_activation = nn.PReLU()
    DepthwiseSeparableConv2d(3, 10, norm1="BN", activation1=first_activation)
def __init__(
    self,
    *,
    in_channels: int,
    num_anchors: int,
    box_dim: int = 4,
    rpn_kernel_size: int = 3,
    depthwise_rpn: bool = False,
    depthwise_rpn_double_activation: bool = False,
    depthwise_rpn_norm: str = '',
):
    """
    Build the hidden conv and the two 1x1 prediction convs of the RPN head.

    NOTE: this interface is experimental.

    Args:
        in_channels (int): number of input feature channels. When using multiple
            input features, they must have the same number of channels.
        num_anchors (int): number of anchors to predict for *each spatial position*
            on the feature map. The total number of anchors for each
            feature map will be `num_anchors * H * W`.
        box_dim (int): dimension of a box, which is also the number of box regression
            predictions to make for each anchor. An axis aligned box has
            box_dim=4, while a rotated box has box_dim=5.
        rpn_kernel_size (int): kernel size of the hidden conv, for both the
            plain and the depthwise-separable variant. The padding is
            ``rpn_kernel_size // 2``.
        depthwise_rpn (bool): if True, the hidden conv is a
            DepthwiseSeparableConv2d instead of an ``nn.Conv2d``.
        depthwise_rpn_double_activation (bool): only consulted when
            ``depthwise_rpn`` is True. If True, ``depthwise_rpn_norm`` and a
            ReLU are also passed as ``norm1`` / ``activation1``; otherwise
            both are None.
        depthwise_rpn_norm (str): normalization passed as ``norm2`` (and as
            ``norm1`` with double activation) to the DepthwiseSeparableConv2d.
    """
    super().__init__()

    # Conv for the hidden representation. The kernel size is honoured in
    # both branches so that `rpn_kernel_size` is never silently ignored.
    hidden_padding = rpn_kernel_size // 2
    if depthwise_rpn:
        self.conv = DepthwiseSeparableConv2d(
            in_channels,
            in_channels,
            kernel_size=rpn_kernel_size,
            padding=hidden_padding,
            norm1=depthwise_rpn_norm if depthwise_rpn_double_activation else None,
            activation1=nn.ReLU() if depthwise_rpn_double_activation else None,
            norm2=depthwise_rpn_norm,
        )
    else:
        self.conv = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=rpn_kernel_size,
            stride=1,
            padding=hidden_padding,
        )
    # 1x1 conv for predicting objectness logits
    self.objectness_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
    # 1x1 conv for predicting box2box transform deltas
    self.anchor_deltas = nn.Conv2d(in_channels, num_anchors * box_dim, kernel_size=1, stride=1)

    for layer in (self.objectness_logits, self.anchor_deltas):
        nn.init.normal_(layer.weight, std=0.01)
        nn.init.constant_(layer.bias, 0)

    # NOTE(review): the nesting of the guards below was reconstructed from
    # whitespace-collapsed text -- confirm against the upstream file.
    if depthwise_rpn:
        # With double activation the depthwise stage keeps the
        # initialization applied by its own constructor.
        if not depthwise_rpn_double_activation:
            nn.init.normal_(self.conv.depthwise.weight, std=0.01)
        nn.init.normal_(self.conv.pointwise.weight, std=0.01)
        # Biases are only touched when no norm is configured (presumably
        # the pointwise conv has no bias otherwise -- TODO confirm).
        if depthwise_rpn_norm == '':
            if not depthwise_rpn_double_activation:
                nn.init.constant_(self.conv.depthwise.bias, 0)
            nn.init.constant_(self.conv.pointwise.bias, 0)
    else:
        nn.init.normal_(self.conv.weight, std=0.01)
        nn.init.constant_(self.conv.bias, 0)