def _init_layers(self): """A helper function to take a config setting and turn it into a network.""" # Possible patterns: # ( 256, 3) -> conv # ( 256,-2) -> deconv # (None,-2) -> bilinear interpolate in_channels = self.in_channels protonets = ModuleList() for num_channels, kernel_size in zip(self.proto_channels, self.proto_kernel_sizes): if kernel_size > 0: layer = nn.Conv2d(in_channels, num_channels, kernel_size, padding=kernel_size // 2) else: if num_channels is None: layer = InterpolateModule(scale_factor=-kernel_size, mode='bilinear', align_corners=False) else: layer = nn.ConvTranspose2d(in_channels, num_channels, -kernel_size, padding=kernel_size // 2) protonets.append(layer) protonets.append(nn.ReLU(inplace=True)) in_channels = num_channels if num_channels is not None \ else in_channels if not self.include_last_relu: protonets = protonets[:-1] return nn.Sequential(*protonets)
def _add_fc_branch(self): """Add the fc branch which consists of a sequential of fc layers.""" branch_fcs = ModuleList() for i in range(self.num_fcs): fc_in_channels = (self.in_channels * self.roi_feat_area if i == 0 else self.fc_out_channels) branch_fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels)) return branch_fcs
def _add_conv_branch(self): """Add the fc branch which consists of a sequential of conv layers.""" branch_convs = ModuleList() for i in range(self.num_convs): branch_convs.append( Bottleneck(inplanes=self.conv_out_channels, planes=self.conv_out_channels // 4, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) return branch_convs
class FPNF(BaseModule): """FPN-like fusion module in Shape Robust Text Detection with Progressive Scale Expansion Network.""" def __init__(self, in_channels=[256, 512, 1024, 2048], out_channels=256, fusion_type='concat', upsample_ratio=1, init_cfg=dict(type='Xavier', layer='Conv2d', distribution='uniform')): super().__init__(init_cfg=init_cfg) conv_cfg = None norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') self.in_channels = in_channels self.out_channels = out_channels self.lateral_convs = ModuleList() self.fpn_convs = ModuleList() self.backbone_end_level = len(in_channels) for i in range(self.backbone_end_level): l_conv = ConvModule(in_channels[i], out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.lateral_convs.append(l_conv) if i < self.backbone_end_level - 1: fpn_conv = ConvModule(out_channels, out_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.fpn_convs.append(fpn_conv) self.fusion_type = fusion_type if self.fusion_type == 'concat': feature_channels = 1024 elif self.fusion_type == 'add': feature_channels = 256 else: raise NotImplementedError self.output_convs = ConvModule(feature_channels, out_channels, 3, padding=1, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.upsample_ratio = upsample_ratio @auto_fp16() def forward(self, inputs): assert len(inputs) == len(self.in_channels) # build laterals laterals = [ lateral_conv(inputs[i]) for i, lateral_conv in enumerate(self.lateral_convs) ] # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): # step 1: upsample to level i-1 size and add level i-1 prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] += F.interpolate(laterals[i], size=prev_shape, mode='nearest') # step 2: smooth level i-1 laterals[i - 1] = self.fpn_convs[i - 1](laterals[i - 1]) # upsample and cont bottom_shape = laterals[0].shape[2:] for i in range(1, used_backbone_levels): laterals[i] = F.interpolate(laterals[i], size=bottom_shape, mode='nearest') if self.fusion_type == 'concat': out = torch.cat(laterals, 1) elif self.fusion_type == 'add': out = laterals[0] for i in range(1, used_backbone_levels): out += laterals[i] else: raise NotImplementedError out = self.output_convs(out) return out
class PCPVT(BaseModule): """The backbone of Twins-PCPVT. This backbone is the implementation of `Twins: Revisiting the Design of Spatial Attention in Vision Transformers <https://arxiv.org/abs/1512.03385>`_. Args: in_channels (int): Number of input channels. Default: 3. embed_dims (list): Embedding dimension. Default: [64, 128, 256, 512]. patch_sizes (list): The patch sizes. Default: [4, 2, 2, 2]. strides (list): The strides. Default: [4, 2, 2, 2]. num_heads (int): Number of attention heads. Default: [1, 2, 4, 8]. mlp_ratios (int): Ratio of mlp hidden dim to embedding dim. Default: [4, 4, 4, 4]. out_indices (tuple[int]): Output from which stages. Default: (0, 1, 2, 3). qkv_bias (bool): Enable bias for qkv if True. Default: False. drop_rate (float): Probability of an element to be zeroed. Default 0. attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 drop_path_rate (float): Stochastic depth rate. Default 0.0 norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN') depths (list): Depths of each stage. Default [3, 4, 6, 3] sr_ratios (list): Kernel_size of conv in each Attn module in Transformer encoder layer. Default: [8, 4, 2, 1]. norm_after_stage(bool): Add extra norm. Default False. init_cfg (dict, optional): The Config for initialization. Defaults to None. """ def __init__(self, in_channels=3, embed_dims=[64, 128, 256, 512], patch_sizes=[4, 2, 2, 2], strides=[4, 2, 2, 2], num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], out_indices=(0, 1, 2, 3), qkv_bias=False, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_cfg=dict(type='LN'), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], norm_after_stage=False, pretrained=None, init_cfg=None): super(PCPVT, self).__init__(init_cfg=init_cfg) assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be set at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) elif pretrained is not None: raise TypeError('pretrained must be a str or None') self.depths = depths # patch_embed self.patch_embeds = ModuleList() self.position_encoding_drops = ModuleList() self.layers = ModuleList() for i in range(len(depths)): self.patch_embeds.append( PatchEmbed( in_channels=in_channels if i == 0 else embed_dims[i - 1], embed_dims=embed_dims[i], conv_type='Conv2d', kernel_size=patch_sizes[i], stride=strides[i], padding='corner', norm_cfg=norm_cfg)) self.position_encoding_drops.append(nn.Dropout(p=drop_rate)) self.position_encodings = ModuleList([ ConditionalPositionEncoding(embed_dim, embed_dim) for embed_dim in embed_dims ]) # transformer encoder dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule cur = 0 for k in range(len(depths)): _block = ModuleList([ GSAEncoderLayer( embed_dims=embed_dims[k], num_heads=num_heads[k], feedforward_channels=mlp_ratios[k] * embed_dims[k], attn_drop_rate=attn_drop_rate, drop_rate=drop_rate, drop_path_rate=dpr[cur + i], num_fcs=2, qkv_bias=qkv_bias, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), sr_ratio=sr_ratios[k]) for i in range(depths[k]) ]) self.layers.append(_block) cur += depths[k] self.norm_name, norm = build_norm_layer(norm_cfg, embed_dims[-1], postfix=1) self.out_indices = out_indices self.norm_after_stage = norm_after_stage if self.norm_after_stage: self.norm_list = ModuleList() for dim in embed_dims: self.norm_list.append(build_norm_layer(norm_cfg, dim)[1]) def init_weights(self): if self.init_cfg is not None: super(PCPVT, self).init_weights() else: for m in self.modules(): if isinstance(m, nn.Linear): trunc_normal_init(m, std=.02, bias=0.) elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): constant_init(m, val=1.0, bias=0.) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[ 1] * m.out_channels fan_out //= m.groups normal_init(m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) def forward(self, x): outputs = list() b = x.shape[0] for i in range(len(self.depths)): x, hw_shape = self.patch_embeds[i](x) h, w = hw_shape x = self.position_encoding_drops[i](x) for j, blk in enumerate(self.layers[i]): x = blk(x, hw_shape) if j == 0: x = self.position_encodings[i](x, hw_shape) if self.norm_after_stage: x = self.norm_list[i](x) x = x.reshape(b, h, w, -1).permute(0, 3, 1, 2).contiguous() if i in self.out_indices: outputs.append(x) return tuple(outputs)
class NASFPN(BaseModule): """NAS-FPN. Implementation of `NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection <https://arxiv.org/abs/1904.07392>`_ Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) num_outs (int): Number of output scales. stack_times (int): The number of times the pyramid architecture will be stacked. start_level (int): Index of the start input backbone level used to build the feature pyramid. Default: 0. end_level (int): Index of the end input backbone level (exclusive) to build the feature pyramid. Default: -1, which means the last level. add_extra_convs (bool): It decides whether to add conv layers on top of the original feature maps. Default to False. If True, its actual mode is specified by `extra_convs_on_inputs`. init_cfg (dict or list[dict], optional): Initialization config dict. """ def __init__(self, in_channels, out_channels, num_outs, stack_times, start_level=0, end_level=-1, add_extra_convs=False, norm_cfg=None, init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')): super(NASFPN, self).__init__(init_cfg) assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels self.num_ins = len(in_channels) # num of input feature levels self.num_outs = num_outs # num of output feature levels self.stack_times = stack_times self.norm_cfg = norm_cfg if end_level == -1: self.backbone_end_level = self.num_ins assert num_outs >= self.num_ins - start_level else: # if end_level < inputs, no extra level is allowed self.backbone_end_level = end_level assert end_level <= len(in_channels) assert num_outs == end_level - start_level self.start_level = start_level self.end_level = end_level self.add_extra_convs = add_extra_convs # add lateral connections self.lateral_convs = nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): l_conv = ConvModule( in_channels[i], out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) self.lateral_convs.append(l_conv) # add extra downsample layers (stride-2 pooling or conv) extra_levels = num_outs - self.backbone_end_level + self.start_level self.extra_downsamples = nn.ModuleList() for i in range(extra_levels): extra_conv = ConvModule( out_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) self.extra_downsamples.append( nn.Sequential(extra_conv, nn.MaxPool2d(2, 2))) # add NAS FPN connections self.fpn_stages = ModuleList() for _ in range(self.stack_times): stage = nn.ModuleDict() # gp(p6, p4) -> p4_1 stage['gp_64_4'] = GlobalPoolingCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) # sum(p4_1, p4) -> p4_2 stage['sum_44_4'] = SumCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) # sum(p4_2, p3) -> p3_out stage['sum_43_3'] = SumCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) # sum(p3_out, p4_2) -> p4_out stage['sum_34_4'] = SumCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) # sum(p5, gp(p4_out, p3_out)) -> p5_out stage['gp_43_5'] = GlobalPoolingCell(with_out_conv=False) stage['sum_55_5'] = SumCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) # sum(p7, gp(p5_out, p4_2)) -> p7_out stage['gp_54_7'] = GlobalPoolingCell(with_out_conv=False) stage['sum_77_7'] = SumCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) # gp(p7_out, p5_out) -> p6_out stage['gp_75_6'] = GlobalPoolingCell( in_channels=out_channels, out_channels=out_channels, out_norm_cfg=norm_cfg) self.fpn_stages.append(stage) def forward(self, inputs): """Forward function.""" # build P3-P5 feats = [ lateral_conv(inputs[i + self.start_level]) for i, lateral_conv in enumerate(self.lateral_convs) ] # build P6-P7 on top of P5 for downsample in self.extra_downsamples: feats.append(downsample(feats[-1])) p3, p4, p5, p6, p7 = feats for stage in self.fpn_stages: # gp(p6, p4) -> p4_1 p4_1 = stage['gp_64_4'](p6, p4, out_size=p4.shape[-2:]) # sum(p4_1, p4) -> p4_2 p4_2 = stage['sum_44_4'](p4_1, p4, out_size=p4.shape[-2:]) # sum(p4_2, p3) -> p3_out p3 = stage['sum_43_3'](p4_2, p3, out_size=p3.shape[-2:]) # sum(p3_out, p4_2) -> p4_out p4 = stage['sum_34_4'](p3, p4_2, out_size=p4.shape[-2:]) # sum(p5, gp(p4_out, p3_out)) -> p5_out p5_tmp = stage['gp_43_5'](p4, p3, out_size=p5.shape[-2:]) p5 = stage['sum_55_5'](p5, p5_tmp, out_size=p5.shape[-2:]) # sum(p7, gp(p5_out, p4_2)) -> p7_out p7_tmp = stage['gp_54_7'](p5, p4_2, out_size=p7.shape[-2:]) p7 = stage['sum_77_7'](p7, p7_tmp, out_size=p7.shape[-2:]) # gp(p7_out, p5_out) -> p6_out p6 = stage['gp_75_6'](p7, p5, out_size=p6.shape[-2:]) return p3, p4, p5, p6, p7
class CascadeRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin): """Cascade roi head including one bbox head and one mask head. https://arxiv.org/abs/1712.00726 """ def __init__(self, num_stages, stage_loss_weights, bbox_roi_extractor=None, bbox_head=None, mask_roi_extractor=None, mask_head=None, shared_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): assert bbox_roi_extractor is not None assert bbox_head is not None assert shared_head is None, \ 'Shared head is not supported in Cascade RCNN anymore' self.num_stages = num_stages self.stage_loss_weights = stage_loss_weights super(CascadeRoIHead, self).__init__(bbox_roi_extractor=bbox_roi_extractor, bbox_head=bbox_head, mask_roi_extractor=mask_roi_extractor, mask_head=mask_head, shared_head=shared_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) def init_bbox_head(self, bbox_roi_extractor, bbox_head): """Initialize box head and box roi extractor. Args: bbox_roi_extractor (dict): Config of box roi extractor. bbox_head (dict): Config of box in box head. """ self.bbox_roi_extractor = ModuleList() self.bbox_head = ModuleList() if not isinstance(bbox_roi_extractor, list): bbox_roi_extractor = [ bbox_roi_extractor for _ in range(self.num_stages) ] if not isinstance(bbox_head, list): bbox_head = [bbox_head for _ in range(self.num_stages)] assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages for roi_extractor, head in zip(bbox_roi_extractor, bbox_head): self.bbox_roi_extractor.append(build_roi_extractor(roi_extractor)) self.bbox_head.append(build_head(head)) def init_mask_head(self, mask_roi_extractor, mask_head): """Initialize mask head and mask roi extractor. Args: mask_roi_extractor (dict): Config of mask roi extractor. mask_head (dict): Config of mask in mask head. """ self.mask_head = nn.ModuleList() if not isinstance(mask_head, list): mask_head = [mask_head for _ in range(self.num_stages)] assert len(mask_head) == self.num_stages for head in mask_head: self.mask_head.append(build_head(head)) if mask_roi_extractor is not None: self.share_roi_extractor = False self.mask_roi_extractor = ModuleList() if not isinstance(mask_roi_extractor, list): mask_roi_extractor = [ mask_roi_extractor for _ in range(self.num_stages) ] assert len(mask_roi_extractor) == self.num_stages for roi_extractor in mask_roi_extractor: self.mask_roi_extractor.append( build_roi_extractor(roi_extractor)) else: self.share_roi_extractor = True self.mask_roi_extractor = self.bbox_roi_extractor def init_assigner_sampler(self): """Initialize assigner and sampler for each stage.""" self.bbox_assigner = [] self.bbox_sampler = [] if self.train_cfg is not None: for idx, rcnn_train_cfg in enumerate(self.train_cfg): self.bbox_assigner.append( build_assigner(rcnn_train_cfg.assigner)) self.current_stage = idx self.bbox_sampler.append( build_sampler(rcnn_train_cfg.sampler, context=self)) def forward_dummy(self, x, proposals): """Dummy forward function.""" # bbox head outs = () rois = bbox2roi([proposals]) if self.with_bbox: for i in range(self.num_stages): bbox_results = self._bbox_forward(i, x, rois) outs = outs + (bbox_results['cls_score'], bbox_results['bbox_pred']) # mask heads if self.with_mask: mask_rois = rois[:100] for i in range(self.num_stages): mask_results = self._mask_forward(i, x, mask_rois) outs = outs + (mask_results['mask_pred'], ) return outs def _bbox_forward(self, stage, x, rois): """Box head forward function used in both training and testing.""" bbox_roi_extractor = self.bbox_roi_extractor[stage] bbox_head = self.bbox_head[stage] bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], rois) # do not support caffe_c4 model anymore cls_score, bbox_pred = bbox_head(bbox_feats) bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) return bbox_results def _bbox_forward_train(self, stage, x, sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg): """Run forward function and calculate loss for box head in training.""" rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_results = self._bbox_forward(stage, x, rois) bbox_targets = self.bbox_head[stage].get_targets( sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) loss_bbox = self.bbox_head[stage].loss(bbox_results['cls_score'], bbox_results['bbox_pred'], rois, *bbox_targets) bbox_results.update(loss_bbox=loss_bbox, rois=rois, bbox_targets=bbox_targets) return bbox_results def _mask_forward(self, stage, x, rois): """Mask head forward function used in both training and testing.""" mask_roi_extractor = self.mask_roi_extractor[stage] mask_head = self.mask_head[stage] mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], rois) # do not support caffe_c4 model anymore mask_pred = mask_head(mask_feats) mask_results = dict(mask_pred=mask_pred) return mask_results def _mask_forward_train(self, stage, x, sampling_results, gt_masks, rcnn_train_cfg, bbox_feats=None): """Run forward function and calculate loss for mask head in training.""" pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) mask_results = self._mask_forward(stage, x, pos_rois) mask_targets = self.mask_head[stage].get_targets( sampling_results, gt_masks, rcnn_train_cfg) pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head[stage].loss(mask_results['mask_pred'], mask_targets, pos_labels) mask_results.update(loss_mask=loss_mask) return mask_results def forward_train(self, x, img_metas, proposal_list, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None): """ Args: x (list[Tensor]): list of multi-level img features. img_metas (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. proposals (list[Tensors]): list of region proposals. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. Returns: dict[str, Tensor]: a dictionary of loss components """ losses = dict() for i in range(self.num_stages): self.current_stage = i rcnn_train_cfg = self.train_cfg[i] lw = self.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] if self.with_bbox or self.with_mask: bbox_assigner = self.bbox_assigner[i] bbox_sampler = self.bbox_sampler[i] num_imgs = len(img_metas) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss bbox_results = self._bbox_forward_train(i, x, sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) for name, value in bbox_results['loss_bbox'].items(): losses[f's{i}.{name}'] = (value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: mask_results = self._mask_forward_train( i, x, sampling_results, gt_masks, rcnn_train_cfg, bbox_results['bbox_feats']) for name, value in mask_results['loss_mask'].items(): losses[f's{i}.{name}'] = (value * lw if 'loss' in name else value) # refine bboxes if i < self.num_stages - 1: pos_is_gts = [res.pos_is_gt for res in sampling_results] # bbox_targets is a tuple roi_labels = bbox_results['bbox_targets'][0] with torch.no_grad(): roi_labels = torch.where( roi_labels == self.bbox_head[i].num_classes, bbox_results['cls_score'][:, :-1].argmax(1), roi_labels) proposal_list = self.bbox_head[i].refine_bboxes( bbox_results['rois'], roi_labels, bbox_results['bbox_pred'], pos_is_gts, img_metas) return losses def simple_test(self, x, proposal_list, img_metas, rescale=False): """Test without augmentation.""" assert self.with_bbox, 'Bbox head must be implemented.' num_imgs = len(proposal_list) img_shapes = tuple(meta['img_shape'] for meta in img_metas) ori_shapes = tuple(meta['ori_shape'] for meta in img_metas) scale_factors = tuple(meta['scale_factor'] for meta in img_metas) # "ms" in variable names means multi-stage ms_bbox_result = {} ms_segm_result = {} ms_scores = [] rcnn_test_cfg = self.test_cfg rois = bbox2roi(proposal_list) for i in range(self.num_stages): bbox_results = self._bbox_forward(i, x, rois) # split batch bbox prediction back to each image cls_score = bbox_results['cls_score'] bbox_pred = bbox_results['bbox_pred'] num_proposals_per_img = tuple( len(proposals) for proposals in proposal_list) rois = rois.split(num_proposals_per_img, 0) cls_score = cls_score.split(num_proposals_per_img, 0) if isinstance(bbox_pred, torch.Tensor): bbox_pred = bbox_pred.split(num_proposals_per_img, 0) else: bbox_pred = self.bbox_head[i].bbox_pred_split( bbox_pred, num_proposals_per_img) ms_scores.append(cls_score) if i < self.num_stages - 1: bbox_label = [s[:, :-1].argmax(dim=1) for s in cls_score] rois = torch.cat([ self.bbox_head[i].regress_by_class(rois[j], bbox_label[j], bbox_pred[j], img_metas[j]) for j in range(num_imgs) ]) # average scores of each image by stages cls_score = [ sum([score[i] for score in ms_scores]) / float(len(ms_scores)) for i in range(num_imgs) ] # apply bbox post-processing to each image individually det_bboxes = [] det_labels = [] for i in range(num_imgs): det_bbox, det_label = self.bbox_head[-1].get_bboxes( rois[i], cls_score[i], bbox_pred[i], img_shapes[i], scale_factors[i], rescale=rescale, cfg=rcnn_test_cfg) det_bboxes.append(det_bbox) det_labels.append(det_label) if torch.onnx.is_in_onnx_export(): return det_bboxes, det_labels bbox_results = [ bbox2result(det_bboxes[i], det_labels[i], self.bbox_head[-1].num_classes) for i in range(num_imgs) ] ms_bbox_result['ensemble'] = bbox_results if self.with_mask: if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes): mask_classes = self.mask_head[-1].num_classes segm_results = [[[] for _ in range(mask_classes)] for _ in range(num_imgs)] else: if rescale and not isinstance(scale_factors[0], float): scale_factors = [ torch.from_numpy(scale_factor).to(det_bboxes[0].device) for scale_factor in scale_factors ] _bboxes = [ det_bboxes[i][:, :4] * scale_factors[i] if rescale else det_bboxes[i][:, :4] for i in range(len(det_bboxes)) ] mask_rois = bbox2roi(_bboxes) num_mask_rois_per_img = tuple( _bbox.size(0) for _bbox in _bboxes) aug_masks = [] for i in range(self.num_stages): mask_results = self._mask_forward(i, x, mask_rois) mask_pred = mask_results['mask_pred'] # split batch mask prediction back to each image mask_pred = mask_pred.split(num_mask_rois_per_img, 0) aug_masks.append( [m.sigmoid().cpu().numpy() for m in mask_pred]) # apply mask post-processing to each image individually segm_results = [] for i in range(num_imgs): if det_bboxes[i].shape[0] == 0: segm_results.append( [[] for _ in range(self.mask_head[-1].num_classes)]) else: aug_mask = [mask[i] for mask in aug_masks] merged_masks = merge_aug_masks( aug_mask, [[img_metas[i]]] * self.num_stages, rcnn_test_cfg) segm_result = self.mask_head[-1].get_seg_masks( merged_masks, _bboxes[i], det_labels[i], rcnn_test_cfg, ori_shapes[i], scale_factors[i], rescale) segm_results.append(segm_result) ms_segm_result['ensemble'] = segm_results if self.with_mask: results = list( zip(ms_bbox_result['ensemble'], ms_segm_result['ensemble'])) else: results = ms_bbox_result['ensemble'] return results def aug_test(self, features, proposal_list, img_metas, rescale=False): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. """ rcnn_test_cfg = self.test_cfg aug_bboxes = [] aug_scores = [] for x, img_meta in zip(features, img_metas): # only one image in the batch img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] flip_direction = img_meta[0]['flip_direction'] proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, scale_factor, flip, flip_direction) # "ms" in variable names means multi-stage ms_scores = [] rois = bbox2roi([proposals]) for i in range(self.num_stages): bbox_results = self._bbox_forward(i, x, rois) ms_scores.append(bbox_results['cls_score']) if i < self.num_stages - 1: bbox_label = bbox_results['cls_score'][:, :-1].argmax( dim=1) rois = self.bbox_head[i].regress_by_class( rois, bbox_label, bbox_results['bbox_pred'], img_meta[0]) cls_score = sum(ms_scores) / float(len(ms_scores)) bboxes, scores = self.bbox_head[-1].get_bboxes( rois, cls_score, bbox_results['bbox_pred'], img_shape, scale_factor, rescale=False, cfg=None) aug_bboxes.append(bboxes) aug_scores.append(scores) # after merging, bboxes will be rescaled to the original image size merged_bboxes, merged_scores = merge_aug_bboxes( aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, rcnn_test_cfg.score_thr, rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img) bbox_result = bbox2result(det_bboxes, det_labels, self.bbox_head[-1].num_classes) if self.with_mask: if det_bboxes.shape[0] == 0: segm_result = [[] for _ in range(self.mask_head[-1].num_classes)] else: aug_masks = [] aug_img_metas = [] for x, img_meta in zip(features, img_metas): img_shape = img_meta[0]['img_shape'] scale_factor = img_meta[0]['scale_factor'] flip = img_meta[0]['flip'] flip_direction = img_meta[0]['flip_direction'] _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, scale_factor, flip, flip_direction) mask_rois = bbox2roi([_bboxes]) for i in range(self.num_stages): mask_results = self._mask_forward(i, x, mask_rois) aug_masks.append( mask_results['mask_pred'].sigmoid().cpu().numpy()) aug_img_metas.append(img_meta) merged_masks = merge_aug_masks(aug_masks, aug_img_metas, self.test_cfg) ori_shape = img_metas[0][0]['ori_shape'] dummy_scale_factor = np.ones(4) segm_result = self.mask_head[-1].get_seg_masks( merged_masks, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor=dummy_scale_factor, rescale=False) return [(bbox_result, segm_result)] else: return [bbox_result]
class CascadeRPNHead(BaseDenseHead): """The CascadeRPNHead will predict more accurate region proposals, which is required for two-stage detectors (such as Fast/Faster R-CNN). CascadeRPN consists of a sequence of RPNStage to progressively improve the accuracy of the detected proposals. More details can be found in ``https://arxiv.org/abs/1909.06720``. Args: num_stages (int): number of CascadeRPN stages. stages (list[dict]): list of configs to build the stages. train_cfg (list[dict]): list of configs at training time each stage. test_cfg (dict): config at testing time. """ def __init__(self, num_stages, stages, train_cfg, test_cfg, init_cfg=None): super(CascadeRPNHead, self).__init__(init_cfg) assert num_stages == len(stages) self.num_stages = num_stages # Be careful! Pretrained weights cannot be loaded when use # nn.ModuleList self.stages = ModuleList() for i in range(len(stages)): train_cfg_i = train_cfg[i] if train_cfg is not None else None stages[i].update(train_cfg=train_cfg_i) stages[i].update(test_cfg=test_cfg) self.stages.append(build_head(stages[i])) self.train_cfg = train_cfg self.test_cfg = test_cfg def loss(self): """loss() is implemented in StageCascadeRPNHead.""" pass def get_bboxes(self): """get_bboxes() is implemented in StageCascadeRPNHead.""" pass def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None): """Forward train function.""" assert gt_labels is None, 'RPN does not require gt_labels' featmap_sizes = [featmap.size()[-2:] for featmap in x] device = x[0].device anchor_list, valid_flag_list = self.stages[0].get_anchors( featmap_sizes, img_metas, device=device) losses = dict() for i in range(self.num_stages): stage = self.stages[i] if stage.adapt_cfg['type'] == 'offset': offset_list = stage.anchor_offset(anchor_list, stage.anchor_strides, featmap_sizes) else: offset_list = None x, cls_score, bbox_pred = stage(x, offset_list) rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred, gt_bboxes, img_metas) stage_loss = stage.loss(*rpn_loss_inputs) for name, value in stage_loss.items(): losses['s{}.{}'.format(i, name)] = value # refine boxes if i < self.num_stages - 1: anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, img_metas) if proposal_cfg is None: return losses else: proposal_list = self.stages[-1].get_bboxes(anchor_list, cls_score, bbox_pred, img_metas, self.test_cfg) return losses, proposal_list def simple_test_rpn(self, x, img_metas): """Simple forward test function.""" featmap_sizes = [featmap.size()[-2:] for featmap in x] device = x[0].device anchor_list, _ = self.stages[0].get_anchors(featmap_sizes, img_metas, device=device) for i in range(self.num_stages): stage = self.stages[i] if stage.adapt_cfg['type'] == 'offset': offset_list = stage.anchor_offset(anchor_list, stage.anchor_strides, featmap_sizes) else: offset_list = None x, cls_score, bbox_pred = stage(x, offset_list) if i < self.num_stages - 1: anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, img_metas) proposal_list = self.stages[-1].get_bboxes(anchor_list, cls_score, bbox_pred, img_metas, self.test_cfg) return proposal_list def aug_test_rpn(self, x, img_metas): """Augmented forward test function.""" raise NotImplementedError
class FPNOCR(BaseModule): """FPN-like Network for segmentation based text recognition. Args: in_channels (list[int]): Number of input channels :math:`C_i` for each scale. out_channels (int): Number of output channels :math:`C_{out}` for each scale. last_stage_only (bool): If True, output last stage only. init_cfg (dict or list[dict], optional): Initialization configs. """ def __init__(self, in_channels, out_channels, last_stage_only=True, init_cfg=None): super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.num_ins = len(in_channels) self.last_stage_only = last_stage_only self.lateral_convs = ModuleList() self.smooth_convs_1x1 = ModuleList() self.smooth_convs_3x3 = ModuleList() for i in range(self.num_ins): l_conv = ConvModule(in_channels[i], out_channels, 1, norm_cfg=dict(type='BN')) self.lateral_convs.append(l_conv) for i in range(self.num_ins - 1): s_conv_1x1 = ConvModule(out_channels * 2, out_channels, 1, norm_cfg=dict(type='BN')) s_conv_3x3 = ConvModule(out_channels, out_channels, 3, padding=1, norm_cfg=dict(type='BN')) self.smooth_convs_1x1.append(s_conv_1x1) self.smooth_convs_3x3.append(s_conv_3x3) def _upsample_x2(self, x): return F.interpolate(x, scale_factor=2, mode='bilinear') def forward(self, inputs): """ Args: inputs (list[Tensor]): A list of n tensors. Each tensor has the shape of :math:`(N, C_i, H_i, W_i)`. It usually expects 4 tensors (C2-C5 features) from ResNet. Returns: tuple(Tensor): A tuple of n-1 tensors. Each has the of shape :math:`(N, C_{out}, H_{n-2-i}, W_{n-2-i})`. If ``last_stage_only=True`` (default), the size of the tuple is 1 and only the last element will be returned. """ lateral_features = [ l_conv(inputs[i]) for i, l_conv in enumerate(self.lateral_convs) ] outs = [] for i in range(len(self.smooth_convs_3x3), 0, -1): # 3, 2, 1 last_out = lateral_features[-1] if len(outs) == 0 else outs[-1] upsample = self._upsample_x2(last_out) upsample_cat = torch.cat((upsample, lateral_features[i - 1]), dim=1) smooth_1x1 = self.smooth_convs_1x1[i - 1](upsample_cat) smooth_3x3 = self.smooth_convs_3x3[i - 1](smooth_1x1) outs.append(smooth_3x3) return tuple(outs[-1:]) if self.last_stage_only else tuple(outs)
class SwinTransformer(BaseModule): """ Swin Transformer A PyTorch implement of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/abs/2103.14030 Inspiration from https://github.com/microsoft/Swin-Transformer Args: pretrain_img_size (int | tuple[int]): The size of input image when pretrain. Defaults: 224. in_channels (int): The num of input channels. Defaults: 3. embed_dims (int): The feature dimension. Default: 96. patch_size (int | tuple[int]): Patch size. Default: 4. window_size (int): Window size. Default: 7. mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. Default: 4. depths (tuple[int]): Depths of each Swin Transformer stage. Default: (2, 2, 6, 2). num_heads (tuple[int]): Parallel attention heads of each Swin Transformer stage. Default: (3, 6, 12, 24). strides (tuple[int]): The patch merging or patch embedding stride of each Swin Transformer stage. (In swin, we set kernel size equal to stride.) Default: (4, 2, 2, 2). out_indices (tuple[int]): Output from which stages. Default: (0, 1, 2, 3). qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. patch_norm (bool): If add a norm layer for patch embed and patch merging. Default: True. drop_rate (float): Dropout rate. Defaults: 0. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. use_abs_pos_embed (bool): If True, add absolute position embedding to the patch embedding. Defaults: False. act_cfg (dict): Config dict for activation layer. Default: dict(type='LN'). norm_cfg (dict): Config dict for normalization layer at output of backone. Defaults: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. pretrained (str, optional): model pretrained path. Default: None. convert_weights (bool): The flag indicates whether the pre-trained model is from the original repo. We may need to convert some keys to make it compatible. Default: False. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. init_cfg (dict, optional): The Config for initialization. Defaults to None. """ def __init__(self, pretrain_img_size=224, in_channels=3, embed_dims=96, patch_size=4, window_size=7, mlp_ratio=4, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), strides=(4, 2, 2, 2), out_indices=(0, 1, 2, 3), qkv_bias=True, qk_scale=None, patch_norm=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, use_abs_pos_embed=False, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), with_cp=False, pretrained=None, convert_weights=False, frozen_stages=-1, init_cfg=None): self.convert_weights = convert_weights self.frozen_stages = frozen_stages if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif isinstance(pretrain_img_size, tuple): if len(pretrain_img_size) == 1: pretrain_img_size = to_2tuple(pretrain_img_size[0]) assert len(pretrain_img_size) == 2, \ f'The size of image should have length 1 or 2, ' \ f'but got {len(pretrain_img_size)}' assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be specified at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) elif pretrained is None: self.init_cfg = init_cfg else: raise TypeError('pretrained must be a str or None') super(SwinTransformer, self).__init__(init_cfg=init_cfg) num_layers = len(depths) self.out_indices = out_indices self.use_abs_pos_embed = use_abs_pos_embed assert strides[0] == patch_size, 'Use non-overlapping patch embed.' self.patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims, conv_type='Conv2d', kernel_size=patch_size, stride=strides[0], norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) if self.use_abs_pos_embed: patch_row = pretrain_img_size[0] // patch_size patch_col = pretrain_img_size[1] // patch_size num_patches = patch_row * patch_col self.absolute_pos_embed = nn.Parameter( torch.zeros((1, num_patches, embed_dims))) self.drop_after_pos = nn.Dropout(p=drop_rate) # set stochastic depth decay rule total_depth = sum(depths) dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, total_depth) ] self.stages = ModuleList() in_channels = embed_dims for i in range(num_layers): if i < num_layers - 1: downsample = PatchMerging( in_channels=in_channels, out_channels=2 * in_channels, stride=strides[i + 1], norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) else: downsample = None stage = SwinBlockSequence( embed_dims=in_channels, num_heads=num_heads[i], feedforward_channels=mlp_ratio * in_channels, depth=depths[i], window_size=window_size, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], downsample=downsample, act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None) self.stages.append(stage) if downsample: in_channels = downsample.out_channels self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] # Add a norm layer for each output for i in out_indices: layer = build_norm_layer(norm_cfg, self.num_features[i])[1] layer_name = f'norm{i}' self.add_module(layer_name, layer) def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.use_abs_pos_embed: self.absolute_pos_embed.requires_grad = False self.drop_after_pos.eval() for i in range(1, self.frozen_stages + 1): if (i - 1) in self.out_indices: norm_layer = getattr(self, f'norm{i-1}') norm_layer.eval() for param in norm_layer.parameters(): param.requires_grad = False m = self.stages[i - 1] m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self): logger = get_root_logger() if self.init_cfg is None: logger.warn(f'No pre-trained weights for ' f'{self.__class__.__name__}, ' f'training start from scratch') if self.use_abs_pos_embed: trunc_normal_(self.absolute_pos_embed, std=0.02) for m in self.modules(): if isinstance(m, nn.Linear): trunc_normal_init(m, std=.02, bias=0.) elif isinstance(m, nn.LayerNorm): constant_init(m, 1.0) else: assert 'checkpoint' in self.init_cfg, f'Only support ' \ f'specify `Pretrained` in ' \ f'`init_cfg` in ' \ f'{self.__class__.__name__} ' ckpt = _load_checkpoint(self.init_cfg.checkpoint, logger=logger, map_location='cpu') if 'state_dict' in ckpt: _state_dict = ckpt['state_dict'] elif 'model' in ckpt: _state_dict = ckpt['model'] else: _state_dict = ckpt if self.convert_weights: # supported loading weight from original repo, _state_dict = swin_converter(_state_dict) state_dict = OrderedDict() for k, v in _state_dict.items(): if k.startswith('backbone.'): state_dict[k[9:]] = v # strip prefix of state_dict if list(state_dict.keys())[0].startswith('module.'): state_dict = {k[7:]: v for k, v in state_dict.items()} # reshape absolute position embedding if state_dict.get('absolute_pos_embed') is not None: absolute_pos_embed = state_dict['absolute_pos_embed'] N1, L, C1 = absolute_pos_embed.size() N2, C2, H, W = self.absolute_pos_embed.size() if N1 != N2 or C1 != C2 or L != H * W: logger.warning('Error in loading absolute_pos_embed, pass') else: state_dict['absolute_pos_embed'] = absolute_pos_embed.view( N2, H, W, C2).permute(0, 3, 1, 2).contiguous() # interpolate position bias table if needed relative_position_bias_table_keys = [ k for k in state_dict.keys() if 'relative_position_bias_table' in k ] for table_key in relative_position_bias_table_keys: table_pretrained = state_dict[table_key] table_current = self.state_dict()[table_key] L1, nH1 = table_pretrained.size() L2, nH2 = table_current.size() if nH1 != nH2: logger.warning(f'Error in loading {table_key}, pass') elif L1 != L2: S1 = int(L1**0.5) S2 = int(L2**0.5) table_pretrained_resized = F.interpolate( table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), size=(S2, S2), mode='bicubic') state_dict[table_key] = table_pretrained_resized.view( nH2, L2).permute(1, 0).contiguous() # load state_dict self.load_state_dict(state_dict, False) def forward(self, x): x, hw_shape = self.patch_embed(x) if self.use_abs_pos_embed: x = x + self.absolute_pos_embed x = self.drop_after_pos(x) outs = [] for i, stage in enumerate(self.stages): x, hw_shape, out, out_hw_shape = stage(x, hw_shape) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') out = norm_layer(out) out = out.view(-1, *out_hw_shape, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs.append(out) return outs
class Bottle2neck(_Bottleneck): expansion = 4 def __init__(self, in_channels, out_channels, scales=4, base_width=26, base_channels=64, stage_type='normal', **kwargs): """Bottle2neck block for Res2Net.""" super(Bottle2neck, self).__init__(in_channels, out_channels, **kwargs) assert scales > 1, 'Res2Net degenerates to ResNet when scales = 1.' mid_channels = out_channels // self.expansion width = int(math.floor(mid_channels * (base_width / base_channels))) self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, width * scales, postfix=1) self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.out_channels, postfix=3) self.conv1 = build_conv_layer(self.conv_cfg, self.in_channels, width * scales, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) if stage_type == 'stage': self.pool = nn.AvgPool2d(kernel_size=3, stride=self.conv2_stride, padding=1) self.convs = ModuleList() self.bns = ModuleList() for i in range(scales - 1): self.convs.append( build_conv_layer(self.conv_cfg, width, width, kernel_size=3, stride=self.conv2_stride, padding=self.dilation, dilation=self.dilation, bias=False)) self.bns.append( build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) self.conv3 = build_conv_layer(self.conv_cfg, width * scales, self.out_channels, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) self.stage_type = stage_type self.scales = scales self.width = width delattr(self, 'conv2') delattr(self, self.norm2_name) def forward(self, x): """Forward function.""" def _inner_forward(x): identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) spx = torch.split(out, self.width, 1) sp = self.convs[0](spx[0].contiguous()) sp = self.relu(self.bns[0](sp)) out = sp for i in range(1, self.scales - 1): if self.stage_type == 'stage': sp = spx[i] else: sp = sp + spx[i] sp = self.convs[i](sp.contiguous()) sp = self.relu(self.bns[i](sp)) out = torch.cat((out, sp), 1) if self.stage_type == 'normal' and self.scales != 1: out = torch.cat((out, spx[self.scales - 1]), 1) elif self.stage_type == 'stage' and self.scales != 1: out = torch.cat((out, self.pool(spx[self.scales - 1])), 1) out = self.conv3(out) out = self.norm3(out) if self.downsample is not None: identity = self.downsample(x) out += identity return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) out = self.relu(out) return out
class FPEM_FFM(BaseModule): """This code is from https://github.com/WenmuZhou/PAN.pytorch.""" def __init__(self, in_channels, conv_out=128, fpem_repeat=2, align_corners=False, init_cfg=dict(type='Xavier', layer='Conv2d', distribution='uniform')): super().__init__(init_cfg=init_cfg) # reduce layers self.reduce_conv_c2 = nn.Sequential( nn.Conv2d(in_channels=in_channels[0], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.reduce_conv_c3 = nn.Sequential( nn.Conv2d(in_channels=in_channels[1], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.reduce_conv_c4 = nn.Sequential( nn.Conv2d(in_channels=in_channels[2], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.reduce_conv_c5 = nn.Sequential( nn.Conv2d(in_channels=in_channels[3], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.align_corners = align_corners self.fpems = ModuleList() for _ in range(fpem_repeat): self.fpems.append(FPEM(conv_out)) def forward(self, x): c2, c3, c4, c5 = x # reduce channel c2 = self.reduce_conv_c2(c2) c3 = self.reduce_conv_c3(c3) c4 = self.reduce_conv_c4(c4) c5 = self.reduce_conv_c5(c5) # FPEM for i, fpem in enumerate(self.fpems): c2, c3, c4, c5 = fpem(c2, c3, c4, c5) if i == 0: c2_ffm = c2 c3_ffm = c3 c4_ffm = c4 c5_ffm = c5 else: c2_ffm += c2 c3_ffm += c3 c4_ffm += c4 c5_ffm += c5 # FFM c5 = F.interpolate(c5_ffm, c2_ffm.size()[-2:], mode='bilinear', align_corners=self.align_corners) c4 = F.interpolate(c4_ffm, c2_ffm.size()[-2:], mode='bilinear', align_corners=self.align_corners) c3 = F.interpolate(c3_ffm, c2_ffm.size()[-2:], mode='bilinear', align_corners=self.align_corners) outs = [c2_ffm, c3, c4, c5] return tuple(outs)
class CoarseMaskHead(FCNMaskHead): """Coarse mask head used in PointRend. Compared with standard ``FCNMaskHead``, ``CoarseMaskHead`` will downsample the input feature map instead of upsample it. Args: num_convs (int): Number of conv layers in the head. Default: 0. num_fcs (int): Number of fc layers in the head. Default: 2. fc_out_channels (int): Number of output channels of fc layer. Default: 1024. downsample_factor (int): The factor that feature map is downsampled by. Default: 2. init_cfg (dict or list[dict], optional): Initialization config dict. """ def __init__(self, num_convs=0, num_fcs=2, fc_out_channels=1024, downsample_factor=2, init_cfg=dict(type='Xavier', override=[ dict(name='fcs'), dict(type='Constant', val=0.001, name='fc_logits') ]), *arg, **kwarg): super(CoarseMaskHead, self).__init__(*arg, num_convs=num_convs, upsample_cfg=dict(type=None), init_cfg=None, **kwarg) self.init_cfg = init_cfg self.num_fcs = num_fcs assert self.num_fcs > 0 self.fc_out_channels = fc_out_channels self.downsample_factor = downsample_factor assert self.downsample_factor >= 1 # remove conv_logit delattr(self, 'conv_logits') if downsample_factor > 1: downsample_in_channels = (self.conv_out_channels if self.num_convs > 0 else self.in_channels) self.downsample_conv = ConvModule(downsample_in_channels, self.conv_out_channels, kernel_size=downsample_factor, stride=downsample_factor, padding=0, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg) else: self.downsample_conv = None self.output_size = (self.roi_feat_size[0] // downsample_factor, self.roi_feat_size[1] // downsample_factor) self.output_area = self.output_size[0] * self.output_size[1] last_layer_dim = self.conv_out_channels * self.output_area self.fcs = ModuleList() for i in range(num_fcs): fc_in_channels = (last_layer_dim if i == 0 else self.fc_out_channels) self.fcs.append(Linear(fc_in_channels, self.fc_out_channels)) last_layer_dim = self.fc_out_channels output_channels = self.num_classes * self.output_area self.fc_logits = Linear(last_layer_dim, output_channels) def init_weights(self): super(FCNMaskHead, self).init_weights() @auto_fp16() def forward(self, x): for conv in self.convs: x = conv(x) if self.downsample_conv is not None: x = self.downsample_conv(x) x = x.flatten(1) for fc in self.fcs: x = self.relu(fc(x)) mask_pred = self.fc_logits(x).view(x.size(0), self.num_classes, *self.output_size) return mask_pred
class ConvMLP(BaseModule): """ConvMLP backbone. https://arxiv.org/abs/2109.04454 """ def __init__(self, blocks, dims, mlp_ratios, in_channels=3, stem_channels=64, num_conv_blocks=3, out_indices=(0, 1, 2, 3), init_cfg=None): super().__init__(init_cfg=init_cfg) if out_indices != (0, 1, 2, 3): raise NotImplementedError assert len(blocks) == len(dims) == len(mlp_ratios), \ 'blocks, dims and mlp_ratios must agree in size, ' \ f'{len(blocks)}, {len(dims)} and {len(mlp_ratios)} passed.' self.tokenizer = ConvTokenizer( in_dim=in_channels, embed_dim=stem_channels) self.conv_stages = ConvStage( num_conv_blocks, embed_dim_in=stem_channels, hidden_dim=dims[0], embed_dim_out=dims[0]) self.stages = ModuleList() for i in range(0, len(blocks)): is_last_stage = i == len(blocks) - 1 stage = ConvMLPStage( num_blocks=blocks[i], embed_dims=dims[i:i + 2], mlp_ratio=mlp_ratios[i], drop_path_rate=0.1, downsample=(not is_last_stage)) self.stages.append(stage) def forward(self, x): """Forward function.""" outs = [] x = self.tokenizer(x) outs.append(x) # feature map F1 x = self.conv_stages(x) outs.append(x) # feature map F2 x = x.permute(0, 2, 3, 1) for i, stage in enumerate(self.stages): x = stage(x) # skip second last stage whose resolution is the same as last stage if i == len(self.stages) - 2: continue outs.append(x.permute(0, 3, 1, 2).contiguous()) # feat map F3, F4 return tuple(outs) @staticmethod def _init_weights(m): if isinstance(m, (nn.Linear, nn.Conv1d)): nn.init.trunc_normal_(m.weight, std=.02) if isinstance(m, (nn.Linear, nn.Conv1d)) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) elif isinstance(m, nn.Conv2d): nn.init.kaiming_normal_( m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1.) nn.init.constant_(m.bias, 0.) def init_weights(self): """Initialize the weights in backbone.""" logger = get_root_logger() if self.init_cfg is None: logger.warn(f'No pre-trained weights for ' f'{self.__class__.__name__}, ' f'training start from scratch') self.apply(self._init_weights) else: assert 'checkpoint' in self.init_cfg, f'Only support ' \ f'specify `Pretrained` in ' \ f'`init_cfg` in ' \ f'{self.__class__.__name__} ' if not isinstance(self.init_cfg.checkpoint, str): raise TypeError('init_cfg.checkpoint must be str') load_checkpoint( self, self.init_cfg.checkpoint, logger=logger, map_location='cpu')
class FPNC(BaseModule): """FPN-like fusion module in Real-time Scene Text Detection with Differentiable Binarization. This was partially adapted from https://github.com/MhLiao/DB and https://github.com/WenmuZhou/DBNet.pytorch """ def __init__(self, in_channels, lateral_channels=256, out_channels=64, bias_on_lateral=False, bn_re_on_lateral=False, bias_on_smooth=False, bn_re_on_smooth=False, conv_after_concat=False, init_cfg=None): super().__init__(init_cfg=init_cfg) assert isinstance(in_channels, list) self.in_channels = in_channels self.lateral_channels = lateral_channels self.out_channels = out_channels self.num_ins = len(in_channels) self.bn_re_on_lateral = bn_re_on_lateral self.bn_re_on_smooth = bn_re_on_smooth self.conv_after_concat = conv_after_concat self.lateral_convs = ModuleList() self.smooth_convs = ModuleList() self.num_outs = self.num_ins for i in range(self.num_ins): norm_cfg = None act_cfg = None if self.bn_re_on_lateral: norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') l_conv = ConvModule(in_channels[i], lateral_channels, 1, bias=bias_on_lateral, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) norm_cfg = None act_cfg = None if self.bn_re_on_smooth: norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') smooth_conv = ConvModule(lateral_channels, out_channels, 3, bias=bias_on_smooth, padding=1, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.lateral_convs.append(l_conv) self.smooth_convs.append(smooth_conv) if self.conv_after_concat: norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') self.out_conv = ConvModule(out_channels * self.num_outs, out_channels * self.num_outs, 3, padding=1, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) @auto_fp16() def forward(self, inputs): assert len(inputs) == len(self.in_channels) # build laterals laterals = [ lateral_conv(inputs[i]) for i, lateral_conv in enumerate(self.lateral_convs) ] used_backbone_levels = len(laterals) # build top-down path for i in range(used_backbone_levels - 1, 0, -1): prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] += F.interpolate(laterals[i], size=prev_shape, mode='nearest') # build outputs # part 1: from original levels outs = [ self.smooth_convs[i](laterals[i]) for i in range(used_backbone_levels) ] for i, out in enumerate(outs): outs[i] = F.interpolate(outs[i], size=outs[0].shape[2:], mode='nearest') out = torch.cat(outs, dim=1) if self.conv_after_concat: out = self.out_conv(out) return out
class StackedLinearClsHead(ClsHead): """Classifier head with several hidden fc layer and a output fc layer. Args: num_classes (int): Number of categories. in_channels (int): Number of channels in the input feature map. mid_channels (Sequence): Number of channels in the hidden fc layers. dropout_rate (float): Dropout rate after each hidden fc layer, except the last layer. Defaults to 0. norm_cfg (dict, optional): Config dict of normalization layer after each hidden fc layer, except the last layer. Defaults to None. act_cfg (dict, optional): Config dict of activation function after each hidden layer, except the last layer. Defaults to use "ReLU". """ def __init__(self, num_classes: int, in_channels: int, mid_channels: Sequence, dropout_rate: float = 0., norm_cfg: Dict = None, act_cfg: Dict = dict(type='ReLU'), **kwargs): super(StackedLinearClsHead, self).__init__(**kwargs) assert num_classes > 0, \ f'`num_classes` of StackedLinearClsHead must be a positive ' \ f'integer, got {num_classes} instead.' self.num_classes = num_classes self.in_channels = in_channels assert isinstance(mid_channels, Sequence), \ f'`mid_channels` of StackedLinearClsHead should be a sequence, ' \ f'instead of {type(mid_channels)}' self.mid_channels = mid_channels self.dropout_rate = dropout_rate self.norm_cfg = norm_cfg self.act_cfg = act_cfg self._init_layers() def _init_layers(self): self.layers = ModuleList() in_channels = self.in_channels for hidden_channels in self.mid_channels: self.layers.append( LinearBlock(in_channels, hidden_channels, dropout_rate=self.dropout_rate, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) in_channels = hidden_channels self.layers.append( LinearBlock(self.mid_channels[-1], self.num_classes, dropout_rate=0., norm_cfg=None, act_cfg=None)) def init_weights(self): self.layers.init_weights() def pre_logits(self, x): if isinstance(x, tuple): x = x[-1] for layer in self.layers[:-1]: x = layer(x) return x @property def fc(self): return self.layers[-1] def simple_test(self, x, softmax=True, post_process=True): """Inference without augmentation. Args: x (tuple[Tensor]): The input features. Multi-stage inputs are acceptable but only the last stage will be used to classify. The shape of every item should be ``(num_samples, in_channels)``. softmax (bool): Whether to softmax the classification score. post_process (bool): Whether to do post processing the inference results. It will convert the output to a list. Returns: Tensor | list: The inference results. - If no post processing, the output is a tensor with shape ``(num_samples, num_classes)``. - If post processing, the output is a multi-dimentional list of float and the dimensions are ``(num_samples, num_classes)``. """ x = self.pre_logits(x) cls_score = self.fc(x) if softmax: pred = (F.softmax(cls_score, dim=1) if cls_score is not None else None) else: pred = cls_score if post_process: return self.post_process(pred) else: return pred def forward_train(self, x, gt_label, **kwargs): x = self.pre_logits(x) cls_score = self.fc(x) losses = self.loss(cls_score, gt_label, **kwargs) return losses
class SwinBlockSequence(BaseModule): """Implements one stage in Swin Transformer. Args: embed_dims (int): The feature dimension. num_heads (int): Parallel attention heads. feedforward_channels (int): The hidden dimension for FFNs. depth (int): The number of blocks in this stage. window_size (int, optional): The local window scale. Default: 7. qkv_bias (bool, optional): enable bias for qkv if True. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. drop_path_rate (float | list[float], optional): Stochastic depth rate. Default: 0. downsample (BaseModule | None, optional): The downsample operation module. Default: None. act_cfg (dict, optional): The config dict of activation function. Default: dict(type='GELU'). norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. init_cfg (dict | list | None, optional): The init config. Default: None. """ def __init__(self, embed_dims, num_heads, feedforward_channels, depth, window_size=7, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., downsample=None, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), with_cp=False, init_cfg=None): super().__init__(init_cfg=init_cfg) if isinstance(drop_path_rate, list): drop_path_rates = drop_path_rate assert len(drop_path_rates) == depth else: drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] self.blocks = ModuleList() for i in range(depth): block = SwinBlock(embed_dims=embed_dims, num_heads=num_heads, feedforward_channels=feedforward_channels, window_size=window_size, shift=False if i % 2 == 0 else True, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rates[i], act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None) self.blocks.append(block) self.downsample = downsample def forward(self, x, hw_shape): for block in self.blocks: x = block(x, hw_shape) if self.downsample: x_down, down_hw_shape = self.downsample(x, hw_shape) return x_down, down_hw_shape, x, hw_shape else: return x, hw_shape, x, hw_shape
class Mask2FormerHead(MaskFormerHead): """Implements the Mask2Former head. See `Masked-attention Mask Transformer for Universal Image Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details. Args: in_channels (list[int]): Number of channels in the input feature map. feat_channels (int): Number of channels for features. out_channels (int): Number of channels for output. num_things_classes (int): Number of things. num_stuff_classes (int): Number of stuff. num_queries (int): Number of query in Transformer decoder. pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel decoder. Defaults to None. enforce_decoder_input_project (bool, optional): Whether to add a layer to change the embed_dim of tranformer encoder in pixel decoder to the embed_dim of transformer decoder. Defaults to False. transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer decoder. Defaults to None. positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer decoder position encoding. Defaults to None. loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification loss. Defaults to None. loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. Defaults to None. loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. Defaults to None. train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of Mask2Former head. test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of Mask2Former head. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. """ def __init__(self, in_channels, feat_channels, out_channels, num_things_classes=80, num_stuff_classes=53, num_queries=100, num_transformer_feat_level=3, pixel_decoder=None, enforce_decoder_input_project=False, transformer_decoder=None, positional_encoding=None, loss_cls=None, loss_mask=None, loss_dice=None, train_cfg=None, test_cfg=None, init_cfg=None, **kwargs): super(AnchorFreeHead, self).__init__(init_cfg) self.num_things_classes = num_things_classes self.num_stuff_classes = num_stuff_classes self.num_classes = self.num_things_classes + self.num_stuff_classes self.num_queries = num_queries self.num_transformer_feat_level = num_transformer_feat_level self.num_heads = transformer_decoder.transformerlayers.\ attn_cfgs.num_heads self.num_transformer_decoder_layers = transformer_decoder.num_layers assert pixel_decoder.encoder.transformerlayers.\ attn_cfgs.num_levels == num_transformer_feat_level pixel_decoder_ = copy.deepcopy(pixel_decoder) pixel_decoder_.update(in_channels=in_channels, feat_channels=feat_channels, out_channels=out_channels) self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1] self.transformer_decoder = build_transformer_layer_sequence( transformer_decoder) self.decoder_embed_dims = self.transformer_decoder.embed_dims self.decoder_input_projs = ModuleList() # from low resolution to high resolution for _ in range(num_transformer_feat_level): if (self.decoder_embed_dims != feat_channels or enforce_decoder_input_project): self.decoder_input_projs.append( Conv2d(feat_channels, self.decoder_embed_dims, kernel_size=1)) else: self.decoder_input_projs.append(nn.Identity()) self.decoder_positional_encoding = build_positional_encoding( positional_encoding) self.query_embed = nn.Embedding(self.num_queries, feat_channels) self.query_feat = nn.Embedding(self.num_queries, feat_channels) # from low resolution to high resolution self.level_embed = nn.Embedding(self.num_transformer_feat_level, feat_channels) self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) self.mask_embed = nn.Sequential( nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, out_channels)) self.test_cfg = test_cfg self.train_cfg = train_cfg if train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) self.sampler = build_sampler(self.train_cfg.sampler, context=self) self.num_points = self.train_cfg.get('num_points', 12544) self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) self.importance_sample_ratio = self.train_cfg.get( 'importance_sample_ratio', 0.75) self.class_weight = loss_cls.class_weight self.loss_cls = build_loss(loss_cls) self.loss_mask = build_loss(loss_mask) self.loss_dice = build_loss(loss_dice) def init_weights(self): for m in self.decoder_input_projs: if isinstance(m, Conv2d): caffe2_xavier_init(m, bias=0) self.pixel_decoder.init_weights() for p in self.transformer_decoder.parameters(): if p.dim() > 1: nn.init.xavier_normal_(p) def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, img_metas): """Compute classification and mask targets for one image. Args: cls_score (Tensor): Mask score logits from a single decoder layer for one image. Shape (num_queries, cls_out_channels). mask_pred (Tensor): Mask logits for a single decoder layer for one image. Shape (num_queries, h, w). gt_labels (Tensor): Ground truth class indices for one image with shape (num_gts, ). gt_masks (Tensor): Ground truth mask for each image, each with shape (num_gts, h, w). img_metas (dict): Image informtation. Returns: tuple[Tensor]: A tuple containing the following for one image. - labels (Tensor): Labels of each image. \ shape (num_queries, ). - label_weights (Tensor): Label weights of each image. \ shape (num_queries, ). - mask_targets (Tensor): Mask targets of each image. \ shape (num_queries, h, w). - mask_weights (Tensor): Mask weights of each image. \ shape (num_queries, ). - pos_inds (Tensor): Sampled positive indices for each \ image. - neg_inds (Tensor): Sampled negative indices for each \ image. """ # sample points num_queries = cls_score.shape[0] num_gts = gt_labels.shape[0] point_coords = torch.rand((1, self.num_points, 2), device=cls_score.device) # shape (num_queries, num_points) mask_points_pred = point_sample(mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, 1)).squeeze(1) # shape (num_gts, num_points) gt_points_masks = point_sample( gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, 1)).squeeze(1) # assign and sample assign_result = self.assigner.assign(cls_score, mask_points_pred, gt_labels, gt_points_masks, img_metas) sampling_result = self.sampler.sample(assign_result, mask_pred, gt_masks) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds # label target labels = gt_labels.new_full((self.num_queries, ), self.num_classes, dtype=torch.long) labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] label_weights = gt_labels.new_ones((self.num_queries, )) # mask target mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] mask_weights = mask_pred.new_zeros((self.num_queries, )) mask_weights[pos_inds] = 1.0 return (labels, label_weights, mask_targets, mask_weights, pos_inds, neg_inds) def loss_single(self, cls_scores, mask_preds, gt_labels_list, gt_masks_list, img_metas): """Loss function for outputs from a single decoder layer. Args: cls_scores (Tensor): Mask score logits from a single decoder layer for all images. Shape (batch_size, num_queries, cls_out_channels). Note `cls_out_channels` should includes background. mask_preds (Tensor): Mask logits for a pixel decoder for all images. Shape (batch_size, num_queries, h, w). gt_labels_list (list[Tensor]): Ground truth class indices for each image, each with shape (num_gts, ). gt_masks_list (list[Tensor]): Ground truth mask for each image, each with shape (num_gts, h, w). img_metas (list[dict]): List of image meta information. Returns: tuple[Tensor]: Loss components for outputs from a single \ decoder layer. """ num_imgs = cls_scores.size(0) cls_scores_list = [cls_scores[i] for i in range(num_imgs)] mask_preds_list = [mask_preds[i] for i in range(num_imgs)] (labels_list, label_weights_list, mask_targets_list, mask_weights_list, num_total_pos, num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list, gt_labels_list, gt_masks_list, img_metas) # shape (batch_size, num_queries) labels = torch.stack(labels_list, dim=0) # shape (batch_size, num_queries) label_weights = torch.stack(label_weights_list, dim=0) # shape (num_total_gts, h, w) mask_targets = torch.cat(mask_targets_list, dim=0) # shape (batch_size, num_queries) mask_weights = torch.stack(mask_weights_list, dim=0) # classfication loss # shape (batch_size * num_queries, ) cls_scores = cls_scores.flatten(0, 1) labels = labels.flatten(0, 1) label_weights = label_weights.flatten(0, 1) class_weight = cls_scores.new_tensor(self.class_weight) loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=class_weight[labels].sum()) num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos])) num_total_masks = max(num_total_masks, 1) # extract positive ones # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) mask_preds = mask_preds[mask_weights > 0] if mask_targets.shape[0] == 0: # zero match loss_dice = mask_preds.sum() loss_mask = mask_preds.sum() return loss_cls, loss_mask, loss_dice with torch.no_grad(): points_coords = get_uncertain_point_coords_with_randomness( mask_preds.unsqueeze(1), None, self.num_points, self.oversample_ratio, self.importance_sample_ratio) # shape (num_total_gts, h, w) -> (num_total_gts, num_points) mask_point_targets = point_sample( mask_targets.unsqueeze(1).float(), points_coords).squeeze(1) # shape (num_queries, h, w) -> (num_queries, num_points) mask_point_preds = point_sample(mask_preds.unsqueeze(1), points_coords).squeeze(1) # dice loss loss_dice = self.loss_dice(mask_point_preds, mask_point_targets, avg_factor=num_total_masks) # mask loss # shape (num_queries, num_points) -> (num_queries * num_points, ) mask_point_preds = mask_point_preds.reshape(-1) # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) mask_point_targets = mask_point_targets.reshape(-1) loss_mask = self.loss_mask(mask_point_preds, mask_point_targets, avg_factor=num_total_masks * self.num_points) return loss_cls, loss_mask, loss_dice def forward_head(self, decoder_out, mask_feature, attn_mask_target_size): """Forward for head part which is called after every decoder layer. Args: decoder_out (Tensor): in shape (num_queries, batch_size, c). mask_feature (Tensor): in shape (batch_size, c, h, w). attn_mask_target_size (tuple[int, int]): target attention mask size. Returns: tuple: A tuple contain three elements. - cls_pred (Tensor): Classification scores in shape \ (batch_size, num_queries, cls_out_channels). \ Note `cls_out_channels` should includes background. - mask_pred (Tensor): Mask scores in shape \ (batch_size, num_queries,h, w). - attn_mask (Tensor): Attention mask in shape \ (batch_size * num_heads, num_queries, h, w). """ decoder_out = self.transformer_decoder.post_norm(decoder_out) decoder_out = decoder_out.transpose(0, 1) # shape (num_queries, batch_size, c) cls_pred = self.cls_embed(decoder_out) # shape (num_queries, batch_size, c) mask_embed = self.mask_embed(decoder_out) # shape (num_queries, batch_size, h, w) mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature) attn_mask = F.interpolate(mask_pred, attn_mask_target_size, mode='bilinear', align_corners=False) # shape (num_queries, batch_size, h, w) -> # (batch_size * num_head, num_queries, h, w) attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( (1, self.num_heads, 1, 1)).flatten(0, 1) attn_mask = attn_mask.sigmoid() < 0.5 attn_mask = attn_mask.detach() return cls_pred, mask_pred, attn_mask def forward(self, feats, img_metas): """Forward function. Args: feats (list[Tensor]): Multi scale Features from the upstream network, each is a 4D-tensor. img_metas (list[dict]): List of image information. Returns: tuple: A tuple contains two elements. - cls_pred_list (list[Tensor)]: Classification logits \ for each decoder layer. Each is a 3D-tensor with shape \ (batch_size, num_queries, cls_out_channels). \ Note `cls_out_channels` should includes background. - mask_pred_list (list[Tensor]): Mask logits for each \ decoder layer. Each with shape (batch_size, num_queries, \ h, w). """ batch_size = len(img_metas) mask_features, multi_scale_memorys = self.pixel_decoder(feats) # multi_scale_memorys (from low resolution to high resolution) decoder_inputs = [] decoder_positional_encodings = [] for i in range(self.num_transformer_feat_level): decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) # shape (batch_size, c, h, w) -> (h*w, batch_size, c) decoder_input = decoder_input.flatten(2).permute(2, 0, 1) level_embed = self.level_embed.weight[i].view(1, 1, -1) decoder_input = decoder_input + level_embed # shape (batch_size, c, h, w) -> (h*w, batch_size, c) mask = decoder_input.new_zeros( (batch_size, ) + multi_scale_memorys[i].shape[-2:], dtype=torch.bool) decoder_positional_encoding = self.decoder_positional_encoding( mask) decoder_positional_encoding = decoder_positional_encoding.flatten( 2).permute(2, 0, 1) decoder_inputs.append(decoder_input) decoder_positional_encodings.append(decoder_positional_encoding) # shape (num_queries, c) -> (num_queries, batch_size, c) query_feat = self.query_feat.weight.unsqueeze(1).repeat( (1, batch_size, 1)) query_embed = self.query_embed.weight.unsqueeze(1).repeat( (1, batch_size, 1)) cls_pred_list = [] mask_pred_list = [] cls_pred, mask_pred, attn_mask = self.forward_head( query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) cls_pred_list.append(cls_pred) mask_pred_list.append(mask_pred) for i in range(self.num_transformer_decoder_layers): level_idx = i % self.num_transformer_feat_level # if a mask is all True(all background), then set it all False. attn_mask[torch.where( attn_mask.sum(-1) == attn_mask.shape[-1])] = False # cross_attn + self_attn layer = self.transformer_decoder.layers[i] attn_masks = [attn_mask, None] query_feat = layer( query=query_feat, key=decoder_inputs[level_idx], value=decoder_inputs[level_idx], query_pos=query_embed, key_pos=decoder_positional_encodings[level_idx], attn_masks=attn_masks, query_key_padding_mask=None, # here we do not apply masking on padded region key_padding_mask=None) cls_pred, mask_pred, attn_mask = self.forward_head( query_feat, mask_features, multi_scale_memorys[ (i + 1) % self.num_transformer_feat_level].shape[-2:]) cls_pred_list.append(cls_pred) mask_pred_list.append(mask_pred) return cls_pred_list, mask_pred_list
class ConvUpsample(BaseModule): """ConvUpsample performs 2x upsampling after Conv. There are several `ConvModule` layers. In the first few layers, upsampling will be applied after each layer of convolution. The number of upsampling must be no more than the number of ConvModule layers. Args: in_channels (int): Number of channels in the input feature map. inner_channels (int): Number of channels produced by the convolution. num_layers (int): Number of convolution layers. num_upsample (int | optional): Number of upsampling layer. Must be no more than num_layers. Upsampling will be applied after the first ``num_upsample`` layers of convolution. Default: ``num_layers``. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (dict): Config dict for initialization. Default: None. kwargs (key word augments): Other augments used in ConvModule. """ def __init__(self, in_channels, inner_channels, num_layers=1, num_upsample=None, conv_cfg=None, norm_cfg=None, init_cfg=None, **kwargs): super(ConvUpsample, self).__init__(init_cfg) if num_upsample is None: num_upsample = num_layers assert num_upsample <= num_layers, \ f'num_upsample({num_upsample})must be no more than ' \ f'num_layers({num_layers})' self.num_layers = num_layers self.num_upsample = num_upsample self.conv = ModuleList() for i in range(num_layers): self.conv.append( ConvModule(in_channels, inner_channels, 3, padding=1, stride=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)) in_channels = inner_channels def forward(self, x): num_upsample = self.num_upsample for i in range(self.num_layers): x = self.conv[i](x) if num_upsample > 0: num_upsample -= 1 x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False) return x
class PyramidVisionTransformer(BaseModule): """Pyramid Vision Transformer (PVT) Implementation of `Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions <https://arxiv.org/pdf/2102.12122.pdf>`_. Args: pretrain_img_size (int | tuple[int]): The size of input image when pretrain. Defaults: 224. in_channels (int): Number of input channels. Default: 3. embed_dims (int): Embedding dimension. Default: 64. num_stags (int): The num of stages. Default: 4. num_layers (Sequence[int]): The layer number of each transformer encode layer. Default: [3, 4, 6, 3]. num_heads (Sequence[int]): The attention heads of each transformer encode layer. Default: [1, 2, 5, 8]. patch_sizes (Sequence[int]): The patch_size of each patch embedding. Default: [4, 2, 2, 2]. strides (Sequence[int]): The stride of each patch embedding. Default: [4, 2, 2, 2]. paddings (Sequence[int]): The padding of each patch embedding. Default: [0, 0, 0, 0]. sr_ratios (Sequence[int]): The spatial reduction rate of each transformer encode layer. Default: [8, 4, 2, 1]. out_indices (Sequence[int] | int): Output from which stages. Default: (0, 1, 2, 3). mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the embedding dim of each transformer encode layer. Default: [8, 8, 4, 4]. qkv_bias (bool): Enable bias for qkv if True. Default: True. drop_rate (float): Probability of an element to be zeroed. Default 0.0. attn_drop_rate (float): The drop out rate for attention layer. Default 0.0. drop_path_rate (float): stochastic depth rate. Default 0.1. use_abs_pos_embed (bool): If True, add absolute position embedding to the patch embedding. Defaults: True. use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. Default: False. act_cfg (dict): The activation config for FFNs. Default: dict(type='GELU'). norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). pretrained (str, optional): model pretrained path. Default: None. convert_weights (bool): The flag indicates whether the pre-trained model is from the original repo. We may need to convert some keys to make it compatible. Default: True. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. """ def __init__(self, pretrain_img_size=224, in_channels=3, embed_dims=64, num_stages=4, num_layers=[3, 4, 6, 3], num_heads=[1, 2, 5, 8], patch_sizes=[4, 2, 2, 2], strides=[4, 2, 2, 2], paddings=[0, 0, 0, 0], sr_ratios=[8, 4, 2, 1], out_indices=(0, 1, 2, 3), mlp_ratios=[8, 8, 4, 4], qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, use_abs_pos_embed=True, norm_after_stage=False, use_conv_ffn=False, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), pretrained=None, convert_weights=True, init_cfg=None): super().__init__(init_cfg=init_cfg) self.convert_weights = convert_weights if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif isinstance(pretrain_img_size, tuple): if len(pretrain_img_size) == 1: pretrain_img_size = to_2tuple(pretrain_img_size[0]) assert len(pretrain_img_size) == 2, \ f'The size of image should have length 1 or 2, ' \ f'but got {len(pretrain_img_size)}' assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) elif pretrained is None: self.init_cfg = init_cfg else: raise TypeError('pretrained must be a str or None') self.embed_dims = embed_dims self.num_stages = num_stages self.num_layers = num_layers self.num_heads = num_heads self.patch_sizes = patch_sizes self.strides = strides self.sr_ratios = sr_ratios assert num_stages == len(num_layers) == len(num_heads) \ == len(patch_sizes) == len(strides) == len(sr_ratios) self.out_indices = out_indices assert max(out_indices) < self.num_stages self.pretrained = pretrained # transformer encoder dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(num_layers)) ] # stochastic num_layer decay rule cur = 0 self.layers = ModuleList() for i, num_layer in enumerate(num_layers): embed_dims_i = embed_dims * num_heads[i] patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims_i, kernel_size=patch_sizes[i], stride=strides[i], padding=paddings[i], bias=True, norm_cfg=norm_cfg) layers = ModuleList() if use_abs_pos_embed: pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) pos_embed = AbsolutePositionEmbedding( pos_shape=pos_shape, pos_dim=embed_dims_i, drop_rate=drop_rate) layers.append(pos_embed) layers.extend([ PVTEncoderLayer( embed_dims=embed_dims_i, num_heads=num_heads[i], feedforward_channels=mlp_ratios[i] * embed_dims_i, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[cur + idx], qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, sr_ratio=sr_ratios[i], use_conv_ffn=use_conv_ffn) for idx in range(num_layer) ]) in_channels = embed_dims_i # The ret[0] of build_norm_layer is norm name. if norm_after_stage: norm = build_norm_layer(norm_cfg, embed_dims_i)[1] else: norm = nn.Identity() self.layers.append(ModuleList([patch_embed, layers, norm])) cur += num_layer def init_weights(self): logger = get_root_logger() if self.init_cfg is None: logger.warn(f'No pre-trained weights for ' f'{self.__class__.__name__}, ' f'training start from scratch') for m in self.modules(): if isinstance(m, nn.Linear): trunc_normal_init(m.weight, std=.02) if m.bias is not None: constant_init(m.bias, 0) elif isinstance(m, nn.LayerNorm): constant_init(m.bias, 0) constant_init(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[ 1] * m.out_channels fan_out //= m.groups normal_init(m.weight, 0, math.sqrt(2.0 / fan_out)) if m.bias is not None: constant_init(m.bias, 0) elif isinstance(m, AbsolutePositionEmbedding): m.init_weights() else: assert 'checkpoint' in self.init_cfg, f'Only support ' \ f'specify `Pretrained` in ' \ f'`init_cfg` in ' \ f'{self.__class__.__name__} ' checkpoint = _load_checkpoint( self.init_cfg.checkpoint, logger=logger, map_location='cpu') logger.warn(f'Load pre-trained model for ' f'{self.__class__.__name__} from original repo') if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] elif 'model' in checkpoint: state_dict = checkpoint['model'] else: state_dict = checkpoint if self.convert_weights: # Because pvt backbones are not supported by mmcls, # so we need to convert pre-trained weights to match this # implementation. state_dict = pvt_convert(state_dict) load_state_dict(self, state_dict, strict=False, logger=logger) def forward(self, x): outs = [] for i, layer in enumerate(self.layers): x, hw_shape = layer[0](x) for block in layer[1]: x = block(x, hw_shape) x = layer[2](x) x = nlc_to_nchw(x, hw_shape) if i in self.out_indices: outs.append(x) return outs
class MAE(BEiT): """VisionTransformer with support for patch. Args: img_size (int | tuple): Input image size. Default: 224. patch_size (int): The patch size. Default: 16. in_channels (int): Number of input channels. Default: 3. embed_dims (int): embedding dimension. Default: 768. num_layers (int): depth of transformer. Default: 12. num_heads (int): number of attention heads. Default: 12. mlp_ratio (int): ratio of mlp hidden dim to embedding dim. Default: 4. out_indices (list | tuple | int): Output from which stages. Default: -1. attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 drop_path_rate (float): stochastic depth rate. Default 0.0. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN') act_cfg (dict): The activation config for FFNs. Default: dict(type='GELU'). patch_norm (bool): Whether to add a norm in PatchEmbed Block. Default: False. final_norm (bool): Whether to add a additional layer to normalize final feature map. Default: False. num_fcs (int): The number of fully-connected layers for FFNs. Default: 2. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. pretrained (str, optional): model pretrained path. Default: None. init_values (float): Initialize the values of Attention and FFN with learnable scaling. Defaults to 0.1. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. """ def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dims=768, num_layers=12, num_heads=12, mlp_ratio=4, out_indices=-1, attn_drop_rate=0., drop_path_rate=0., norm_cfg=dict(type='LN'), act_cfg=dict(type='GELU'), patch_norm=False, final_norm=False, num_fcs=2, norm_eval=False, pretrained=None, init_values=0.1, init_cfg=None): super(MAE, self).__init__( img_size=img_size, patch_size=patch_size, in_channels=in_channels, embed_dims=embed_dims, num_layers=num_layers, num_heads=num_heads, mlp_ratio=mlp_ratio, out_indices=out_indices, qv_bias=False, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate, norm_cfg=norm_cfg, act_cfg=act_cfg, patch_norm=patch_norm, final_norm=final_norm, num_fcs=num_fcs, norm_eval=norm_eval, pretrained=pretrained, init_values=init_values, init_cfg=init_cfg) self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) self.num_patches = self.patch_shape[0] * self.patch_shape[1] self.pos_embed = nn.Parameter( torch.zeros(1, self.num_patches + 1, embed_dims)) def _build_layers(self): dpr = [ x.item() for x in torch.linspace(0, self.drop_path_rate, self.num_layers) ] self.layers = ModuleList() for i in range(self.num_layers): self.layers.append( MAETransformerEncoderLayer( embed_dims=self.embed_dims, num_heads=self.num_heads, feedforward_channels=self.mlp_ratio * self.embed_dims, attn_drop_rate=self.attn_drop_rate, drop_path_rate=dpr[i], num_fcs=self.num_fcs, bias=True, act_cfg=self.act_cfg, norm_cfg=self.norm_cfg, window_size=self.patch_shape, init_values=self.init_values)) def fix_init_weight(self): """Rescale the initialization according to layer id. This function is copied from https://github.com/microsoft/unilm/blob/master/beit/modeling_pretrain.py. # noqa: E501 Copyright (c) Microsoft Corporation Licensed under the MIT License """ def rescale(param, layer_id): param.div_(math.sqrt(2.0 * layer_id)) for layer_id, layer in enumerate(self.layers): rescale(layer.attn.proj.weight.data, layer_id + 1) rescale(layer.ffn.layers[1].weight.data, layer_id + 1) def init_weights(self): def _init_weights(m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) self.apply(_init_weights) self.fix_init_weight() if (isinstance(self.init_cfg, dict) and self.init_cfg.get('type') == 'Pretrained'): logger = get_root_logger() checkpoint = _load_checkpoint( self.init_cfg['checkpoint'], logger=logger, map_location='cpu') state_dict = self.resize_rel_pos_embed(checkpoint) state_dict = self.resize_abs_pos_embed(state_dict) self.load_state_dict(state_dict, False) elif self.init_cfg is not None: super(MAE, self).init_weights() else: # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 # Copyright 2019 Ross Wightman # Licensed under the Apache License, Version 2.0 (the "License") trunc_normal_(self.cls_token, std=.02) for n, m in self.named_modules(): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if m.bias is not None: if 'ffn' in n: nn.init.normal_(m.bias, mean=0., std=1e-6) else: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Conv2d): kaiming_init(m, mode='fan_in', bias=0.) elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): constant_init(m, val=1.0, bias=0.) def resize_abs_pos_embed(self, state_dict): if 'pos_embed' in state_dict: pos_embed_checkpoint = state_dict['pos_embed'] embedding_size = pos_embed_checkpoint.shape[-1] num_extra_tokens = self.pos_embed.shape[-2] - self.num_patches # height (== width) for the checkpoint position embedding orig_size = int( (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5) # height (== width) for the new position embedding new_size = int(self.num_patches**0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute( 0, 3, 1, 2) pos_tokens = torch.nn.functional.interpolate( pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) state_dict['pos_embed'] = new_pos_embed return state_dict def forward(self, inputs): B = inputs.shape[0] x, hw_shape = self.patch_embed(inputs) # stole cls_tokens impl from Phil Wang, thanks cls_tokens = self.cls_token.expand(B, -1, -1) x = torch.cat((cls_tokens, x), dim=1) x = x + self.pos_embed outs = [] for i, layer in enumerate(self.layers): x = layer(x) if i == len(self.layers) - 1: if self.final_norm: x = self.norm1(x) if i in self.out_indices: out = x[:, 1:] B, _, C = out.shape out = out.reshape(B, hw_shape[0], hw_shape[1], C).permute(0, 3, 1, 2).contiguous() outs.append(out) return tuple(outs)
def __init__(self, pretrain_img_size=224, in_channels=3, embed_dims=64, num_stages=4, num_layers=[3, 4, 6, 3], num_heads=[1, 2, 5, 8], patch_sizes=[4, 2, 2, 2], strides=[4, 2, 2, 2], paddings=[0, 0, 0, 0], sr_ratios=[8, 4, 2, 1], out_indices=(0, 1, 2, 3), mlp_ratios=[8, 8, 4, 4], qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, use_abs_pos_embed=True, norm_after_stage=False, use_conv_ffn=False, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), pretrained=None, convert_weights=True, init_cfg=None): super().__init__(init_cfg=init_cfg) self.convert_weights = convert_weights if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif isinstance(pretrain_img_size, tuple): if len(pretrain_img_size) == 1: pretrain_img_size = to_2tuple(pretrain_img_size[0]) assert len(pretrain_img_size) == 2, \ f'The size of image should have length 1 or 2, ' \ f'but got {len(pretrain_img_size)}' assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) elif pretrained is None: self.init_cfg = init_cfg else: raise TypeError('pretrained must be a str or None') self.embed_dims = embed_dims self.num_stages = num_stages self.num_layers = num_layers self.num_heads = num_heads self.patch_sizes = patch_sizes self.strides = strides self.sr_ratios = sr_ratios assert num_stages == len(num_layers) == len(num_heads) \ == len(patch_sizes) == len(strides) == len(sr_ratios) self.out_indices = out_indices assert max(out_indices) < self.num_stages self.pretrained = pretrained # transformer encoder dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(num_layers)) ] # stochastic num_layer decay rule cur = 0 self.layers = ModuleList() for i, num_layer in enumerate(num_layers): embed_dims_i = embed_dims * num_heads[i] patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims_i, kernel_size=patch_sizes[i], stride=strides[i], padding=paddings[i], bias=True, norm_cfg=norm_cfg) layers = ModuleList() if use_abs_pos_embed: pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) pos_embed = AbsolutePositionEmbedding( pos_shape=pos_shape, pos_dim=embed_dims_i, drop_rate=drop_rate) layers.append(pos_embed) layers.extend([ PVTEncoderLayer( embed_dims=embed_dims_i, num_heads=num_heads[i], feedforward_channels=mlp_ratios[i] * embed_dims_i, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[cur + idx], qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, sr_ratio=sr_ratios[i], use_conv_ffn=use_conv_ffn) for idx in range(num_layer) ]) in_channels = embed_dims_i # The ret[0] of build_norm_layer is norm name. if norm_after_stage: norm = build_norm_layer(norm_cfg, embed_dims_i)[1] else: norm = nn.Identity() self.layers.append(ModuleList([patch_embed, layers, norm])) cur += num_layer
class YOLACTHead(AnchorHead): """YOLACT box head used in https://arxiv.org/abs/1904.02689. Note that YOLACT head is a light version of RetinaNet head. Four differences are described as follows: 1. YOLACT box head has three-times fewer anchors. 2. YOLACT box head shares the convs for box and cls branches. 3. YOLACT box head uses OHEM instead of Focal loss. 4. YOLACT box head predicts a set of mask coefficients for each box. Args: num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. anchor_generator (dict): Config dict for anchor generator loss_cls (dict): Config of classification loss. loss_bbox (dict): Config of localization loss. num_head_convs (int): Number of the conv layers shared by box and cls branches. num_protos (int): Number of the mask coefficients. use_ohem (bool): If true, ``loss_single_OHEM`` will be used for cls loss calculation. If false, ``loss_single`` will be used. conv_cfg (dict): Dictionary to construct and config conv layer. norm_cfg (dict): Dictionary to construct and config norm layer. init_cfg (dict or list[dict], optional): Initialization config dict. """ def __init__(self, num_classes, in_channels, anchor_generator=dict( type='AnchorGenerator', octave_base_scale=3, scales_per_octave=1, ratios=[0.5, 1.0, 2.0], strides=[8, 16, 32, 64, 128]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, reduction='none', loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0, loss_weight=1.5), num_head_convs=1, num_protos=32, use_ohem=True, conv_cfg=None, norm_cfg=None, init_cfg=dict( type='Xavier', distribution='uniform', bias=0, layer='Conv2d'), **kwargs): self.num_head_convs = num_head_convs self.num_protos = num_protos self.use_ohem = use_ohem self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg super(YOLACTHead, self).__init__( num_classes, in_channels, loss_cls=loss_cls, loss_bbox=loss_bbox, anchor_generator=anchor_generator, init_cfg=init_cfg, **kwargs) if self.use_ohem: sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.sampling = False def _init_layers(self): """Initialize layers of the head.""" self.relu = nn.ReLU(inplace=True) self.head_convs = ModuleList() for i in range(self.num_head_convs): chn = self.in_channels if i == 0 else self.feat_channels self.head_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) self.conv_cls = nn.Conv2d( self.feat_channels, self.num_base_priors * self.cls_out_channels, 3, padding=1) self.conv_reg = nn.Conv2d( self.feat_channels, self.num_base_priors * 4, 3, padding=1) self.conv_coeff = nn.Conv2d( self.feat_channels, self.num_base_priors * self.num_protos, 3, padding=1) def forward_single(self, x): """Forward feature of a single scale level. Args: x (Tensor): Features of a single scale level. Returns: tuple: cls_score (Tensor): Cls scores for a single scale level \ the channels number is num_anchors * num_classes. bbox_pred (Tensor): Box energies / deltas for a single scale \ level, the channels number is num_anchors * 4. coeff_pred (Tensor): Mask coefficients for a single scale \ level, the channels number is num_anchors * num_protos. """ for head_conv in self.head_convs: x = head_conv(x) cls_score = self.conv_cls(x) bbox_pred = self.conv_reg(x) coeff_pred = self.conv_coeff(x).tanh() return cls_score, bbox_pred, coeff_pred @force_fp32(apply_to=('cls_scores', 'bbox_preds')) def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None): """A combination of the func:``AnchorHead.loss`` and func:``SSDHead.loss``. When ``self.use_ohem == True``, it functions like ``SSDHead.loss``, otherwise, it follows ``AnchorHead.loss``. Besides, it additionally returns ``sampling_results``. Args: cls_scores (list[Tensor]): Box scores for each scale level Has shape (N, num_anchors * num_classes, H, W) bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_anchors * 4, H, W) gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes_ignore (None | list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Default: None Returns: tuple: dict[str, Tensor]: A dictionary of loss components. List[:obj:``SamplingResult``]: Sampler results for each image. """ featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] assert len(featmap_sizes) == self.prior_generator.num_levels device = cls_scores[0].device anchor_list, valid_flag_list = self.get_anchors( featmap_sizes, img_metas, device=device) label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 cls_reg_targets = self.get_targets( anchor_list, valid_flag_list, gt_bboxes, img_metas, gt_bboxes_ignore_list=gt_bboxes_ignore, gt_labels_list=gt_labels, label_channels=label_channels, unmap_outputs=not self.use_ohem, return_sampling_results=True) if cls_reg_targets is None: return None (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg, sampling_results) = cls_reg_targets if self.use_ohem: num_images = len(img_metas) all_cls_scores = torch.cat([ s.permute(0, 2, 3, 1).reshape( num_images, -1, self.cls_out_channels) for s in cls_scores ], 1) all_labels = torch.cat(labels_list, -1).view(num_images, -1) all_label_weights = torch.cat(label_weights_list, -1).view(num_images, -1) all_bbox_preds = torch.cat([ b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) for b in bbox_preds ], -2) all_bbox_targets = torch.cat(bbox_targets_list, -2).view(num_images, -1, 4) all_bbox_weights = torch.cat(bbox_weights_list, -2).view(num_images, -1, 4) # concat all level anchors to a single tensor all_anchors = [] for i in range(num_images): all_anchors.append(torch.cat(anchor_list[i])) # check NaN and Inf assert torch.isfinite(all_cls_scores).all().item(), \ 'classification scores become infinite or NaN!' assert torch.isfinite(all_bbox_preds).all().item(), \ 'bbox predications become infinite or NaN!' losses_cls, losses_bbox = multi_apply( self.loss_single_OHEM, all_cls_scores, all_bbox_preds, all_anchors, all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, num_total_samples=num_total_pos) else: num_total_samples = ( num_total_pos + num_total_neg if self.sampling else num_total_pos) # anchor number of multi levels num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] # concat all level anchors and flags to a single tensor concat_anchor_list = [] for i in range(len(anchor_list)): concat_anchor_list.append(torch.cat(anchor_list[i])) all_anchor_list = images_to_levels(concat_anchor_list, num_level_anchors) losses_cls, losses_bbox = multi_apply( self.loss_single, cls_scores, bbox_preds, all_anchor_list, labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_samples=num_total_samples) return dict( loss_cls=losses_cls, loss_bbox=losses_bbox), sampling_results def loss_single_OHEM(self, cls_score, bbox_pred, anchors, labels, label_weights, bbox_targets, bbox_weights, num_total_samples): """"See func:``SSDHead.loss``.""" loss_cls_all = self.loss_cls(cls_score, labels, label_weights) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero( as_tuple=False).reshape(-1) neg_inds = (labels == self.num_classes).nonzero( as_tuple=False).view(-1) num_pos_samples = pos_inds.size(0) if num_pos_samples == 0: num_neg_samples = neg_inds.size(0) else: num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples if num_neg_samples > neg_inds.size(0): num_neg_samples = neg_inds.size(0) topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) loss_cls_pos = loss_cls_all[pos_inds].sum() loss_cls_neg = topk_loss_cls_neg.sum() loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples if self.reg_decoded_bbox: # When the regression loss (e.g. `IouLoss`, `GIouLoss`) # is applied directly on the decoded bounding boxes, it # decodes the already encoded coordinates to absolute format. bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) loss_bbox = self.loss_bbox( bbox_pred, bbox_targets, bbox_weights, avg_factor=num_total_samples) return loss_cls[None], loss_bbox @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'coeff_preds')) def get_bboxes(self, cls_scores, bbox_preds, coeff_preds, img_metas, cfg=None, rescale=False): """"Similar to func:``AnchorHead.get_bboxes``, but additionally processes coeff_preds. Args: cls_scores (list[Tensor]): Box scores for each scale level with shape (N, num_anchors * num_classes, H, W) bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_anchors * 4, H, W) coeff_preds (list[Tensor]): Mask coefficients for each scale level with shape (N, num_anchors * num_protos, H, W) img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. cfg (mmcv.Config | None): Test / postprocessing configuration, if None, test_cfg would be used rescale (bool): If True, return boxes in original image space. Default: False. Returns: list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is a 3-tuple. The first item is an (n, 5) tensor, where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between 0 and 1. The second item is an (n,) tensor where each item is the predicted class label of the corresponding box. The third item is an (n, num_protos) tensor where each item is the predicted mask coefficients of instance inside the corresponding box. """ assert len(cls_scores) == len(bbox_preds) num_levels = len(cls_scores) device = cls_scores[0].device featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] mlvl_anchors = self.prior_generator.grid_priors( featmap_sizes, device=device) det_bboxes = [] det_labels = [] det_coeffs = [] for img_id in range(len(img_metas)): cls_score_list = select_single_mlvl(cls_scores, img_id) bbox_pred_list = select_single_mlvl(bbox_preds, img_id) coeff_pred_list = select_single_mlvl(coeff_preds, img_id) img_shape = img_metas[img_id]['img_shape'] scale_factor = img_metas[img_id]['scale_factor'] bbox_res = self._get_bboxes_single(cls_score_list, bbox_pred_list, coeff_pred_list, mlvl_anchors, img_shape, scale_factor, cfg, rescale) det_bboxes.append(bbox_res[0]) det_labels.append(bbox_res[1]) det_coeffs.append(bbox_res[2]) return det_bboxes, det_labels, det_coeffs def _get_bboxes_single(self, cls_score_list, bbox_pred_list, coeff_preds_list, mlvl_anchors, img_shape, scale_factor, cfg, rescale=False): """"Similar to func:``AnchorHead._get_bboxes_single``, but additionally processes coeff_preds_list and uses fast NMS instead of traditional NMS. Args: cls_score_list (list[Tensor]): Box scores for a single scale level Has shape (num_anchors * num_classes, H, W). bbox_pred_list (list[Tensor]): Box energies / deltas for a single scale level with shape (num_anchors * 4, H, W). coeff_preds_list (list[Tensor]): Mask coefficients for a single scale level with shape (num_anchors * num_protos, H, W). mlvl_anchors (list[Tensor]): Box reference for a single scale level with shape (num_total_anchors, 4). img_shape (tuple[int]): Shape of the input image, (height, width, 3). scale_factor (ndarray): Scale factor of the image arange as (w_scale, h_scale, w_scale, h_scale). cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. rescale (bool): If True, return boxes in original image space. Returns: tuple[Tensor, Tensor, Tensor]: The first item is an (n, 5) tensor, where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between 0 and 1. The second item is an (n,) tensor where each item is the predicted class label of the corresponding box. The third item is an (n, num_protos) tensor where each item is the predicted mask coefficients of instance inside the corresponding box. """ cfg = self.test_cfg if cfg is None else cfg assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors) nms_pre = cfg.get('nms_pre', -1) mlvl_bboxes = [] mlvl_scores = [] mlvl_coeffs = [] for cls_score, bbox_pred, coeff_pred, anchors in \ zip(cls_score_list, bbox_pred_list, coeff_preds_list, mlvl_anchors): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels) if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) coeff_pred = coeff_pred.permute(1, 2, 0).reshape(-1, self.num_protos) if 0 < nms_pre < scores.shape[0]: # Get maximum scores for foreground classes. if self.use_sigmoid_cls: max_scores, _ = scores.max(dim=1) else: # remind that we set FG labels to [0, num_class-1] # since mmdet v2.0 # BG cat_id: num_class max_scores, _ = scores[:, :-1].max(dim=1) _, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] coeff_pred = coeff_pred[topk_inds, :] bboxes = self.bbox_coder.decode( anchors, bbox_pred, max_shape=img_shape) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_coeffs.append(coeff_pred) mlvl_bboxes = torch.cat(mlvl_bboxes) if rescale: mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) mlvl_scores = torch.cat(mlvl_scores) mlvl_coeffs = torch.cat(mlvl_coeffs) if self.use_sigmoid_cls: # Add a dummy background class to the backend when using sigmoid # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 # BG cat_id: num_class padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) det_bboxes, det_labels, det_coeffs = fast_nms(mlvl_bboxes, mlvl_scores, mlvl_coeffs, cfg.score_thr, cfg.iou_thr, cfg.top_k, cfg.max_per_img) return det_bboxes, det_labels, det_coeffs
class PanopticFPNHead(BaseSemanticHead): """PanopticFPNHead used in Panoptic FPN. In this head, the number of output channels is ``num_stuff_classes + 1``, including all stuff classes and one thing class. The stuff classes will be reset from ``0`` to ``num_stuff_classes - 1``, the thing classes will be merged to ``num_stuff_classes``-th channel. Arg: num_things_classes (int): Number of thing classes. Default: 80. num_stuff_classes (int): Number of stuff classes. Default: 53. num_classes (int): Number of classes, including all stuff classes and one thing class. This argument is deprecated, please use ``num_things_classes`` and ``num_stuff_classes``. The module will automatically infer the num_classes by ``num_stuff_classes + 1``. in_channels (int): Number of channels in the input feature map. inner_channels (int): Number of channels in inner features. start_level (int): The start level of the input features used in PanopticFPN. end_level (int): The end level of the used features, the ``end_level``-th layer will not be used. fg_range (tuple): Range of the foreground classes. It starts from ``0`` to ``num_things_classes-1``. Deprecated, please use ``num_things_classes`` directly. bg_range (tuple): Range of the background classes. It starts from ``num_things_classes`` to ``num_things_classes + num_stuff_classes - 1``. Deprecated, please use ``num_stuff_classes`` and ``num_things_classes`` directly. conv_cfg (dict): Dictionary to construct and config conv layer. Default: None. norm_cfg (dict): Dictionary to construct and config norm layer. Use ``GN`` by default. init_cfg (dict or list[dict], optional): Initialization config dict. loss_seg (dict): the loss of the semantic head. """ def __init__(self, num_things_classes=80, num_stuff_classes=53, num_classes=None, in_channels=256, inner_channels=128, start_level=0, end_level=4, fg_range=None, bg_range=None, conv_cfg=None, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), init_cfg=None, loss_seg=dict(type='CrossEntropyLoss', ignore_index=-1, loss_weight=1.0)): if num_classes is not None: warnings.warn( '`num_classes` is deprecated now, please set ' '`num_stuff_classes` directly, the `num_classes` will be ' 'set to `num_stuff_classes + 1`') # num_classes = num_stuff_classes + 1 for PanopticFPN. assert num_classes == num_stuff_classes + 1 super(PanopticFPNHead, self).__init__(num_stuff_classes + 1, init_cfg, loss_seg) self.num_things_classes = num_things_classes self.num_stuff_classes = num_stuff_classes if fg_range is not None and bg_range is not None: self.fg_range = fg_range self.bg_range = bg_range self.num_things_classes = fg_range[1] - fg_range[0] + 1 self.num_stuff_classes = bg_range[1] - bg_range[0] + 1 warnings.warn( '`fg_range` and `bg_range` are deprecated now, ' f'please use `num_things_classes`={self.num_things_classes} ' f'and `num_stuff_classes`={self.num_stuff_classes} instead.') # Used feature layers are [start_level, end_level) self.start_level = start_level self.end_level = end_level self.num_stages = end_level - start_level self.inner_channels = inner_channels self.conv_upsample_layers = ModuleList() for i in range(start_level, end_level): self.conv_upsample_layers.append( ConvUpsample( in_channels, inner_channels, num_layers=i if i > 0 else 1, num_upsample=i if i > 0 else 0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, )) self.conv_logits = nn.Conv2d(inner_channels, self.num_classes, 1) def _set_things_to_void(self, gt_semantic_seg): """Merge thing classes to one class. In PanopticFPN, the background labels will be reset from `0` to `self.num_stuff_classes-1`, the foreground labels will be merged to `self.num_stuff_classes`-th channel. """ gt_semantic_seg = gt_semantic_seg.int() fg_mask = gt_semantic_seg < self.num_things_classes bg_mask = (gt_semantic_seg >= self.num_things_classes) * ( gt_semantic_seg < self.num_things_classes + self.num_stuff_classes) new_gt_seg = torch.clone(gt_semantic_seg) new_gt_seg = torch.where(bg_mask, gt_semantic_seg - self.num_things_classes, new_gt_seg) new_gt_seg = torch.where(fg_mask, fg_mask.int() * self.num_stuff_classes, new_gt_seg) return new_gt_seg def loss(self, seg_preds, gt_semantic_seg): """The loss of PanopticFPN head. Things classes will be merged to one class in PanopticFPN. """ gt_semantic_seg = self._set_things_to_void(gt_semantic_seg) return super().loss(seg_preds, gt_semantic_seg) def init_weights(self): super().init_weights() nn.init.normal_(self.conv_logits.weight.data, 0, 0.01) self.conv_logits.bias.data.zero_() def forward(self, x): # the number of subnets must be not more than # the length of features. assert self.num_stages <= len(x) feats = [] for i, layer in enumerate(self.conv_upsample_layers): f = layer(x[self.start_level + i]) feats.append(f) feats = torch.sum(torch.stack(feats, dim=0), dim=0) seg_preds = self.conv_logits(feats) out = dict(seg_preds=seg_preds, feats=feats) return out
class FCNMaskHead(BaseModule): def __init__(self, num_convs=4, roi_feat_size=14, in_channels=256, conv_kernel_size=3, conv_out_channels=256, num_classes=80, class_agnostic=False, upsample_cfg=dict(type='deconv', scale_factor=2), conv_cfg=None, norm_cfg=None, predictor_cfg=dict(type='Conv'), loss_mask=dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), init_cfg=None): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(FCNMaskHead, self).__init__(init_cfg) self.upsample_cfg = upsample_cfg.copy() if self.upsample_cfg['type'] not in [ None, 'deconv', 'nearest', 'bilinear', 'carafe' ]: raise ValueError( f'Invalid upsample method {self.upsample_cfg["type"]}, ' 'accepted methods are "deconv", "nearest", "bilinear", ' '"carafe"') self.num_convs = num_convs # WARN: roi_feat_size is reserved and not used self.roi_feat_size = _pair(roi_feat_size) self.in_channels = in_channels self.conv_kernel_size = conv_kernel_size self.conv_out_channels = conv_out_channels self.upsample_method = self.upsample_cfg.get('type') self.scale_factor = self.upsample_cfg.pop('scale_factor', None) self.num_classes = num_classes self.class_agnostic = class_agnostic self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.predictor_cfg = predictor_cfg self.fp16_enabled = False self.loss_mask = build_loss(loss_mask) self.convs = ModuleList() for i in range(self.num_convs): in_channels = (self.in_channels if i == 0 else self.conv_out_channels) padding = (self.conv_kernel_size - 1) // 2 self.convs.append( ConvModule(in_channels, self.conv_out_channels, self.conv_kernel_size, padding=padding, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) upsample_in_channels = (self.conv_out_channels if self.num_convs > 0 else in_channels) upsample_cfg_ = self.upsample_cfg.copy() if self.upsample_method is None: self.upsample = None elif self.upsample_method == 'deconv': upsample_cfg_.update(in_channels=upsample_in_channels, out_channels=self.conv_out_channels, kernel_size=self.scale_factor, stride=self.scale_factor) self.upsample = build_upsample_layer(upsample_cfg_) elif self.upsample_method == 'carafe': upsample_cfg_.update(channels=upsample_in_channels, scale_factor=self.scale_factor) self.upsample = build_upsample_layer(upsample_cfg_) else: # suppress warnings align_corners = (None if self.upsample_method == 'nearest' else False) upsample_cfg_.update(scale_factor=self.scale_factor, mode=self.upsample_method, align_corners=align_corners) self.upsample = build_upsample_layer(upsample_cfg_) out_channels = 1 if self.class_agnostic else self.num_classes logits_in_channel = (self.conv_out_channels if self.upsample_method == 'deconv' else upsample_in_channels) self.conv_logits = build_conv_layer(self.predictor_cfg, logits_in_channel, out_channels, 1) self.relu = nn.ReLU(inplace=True) self.debug_imgs = None def init_weights(self): super(FCNMaskHead, self).init_weights() for m in [self.upsample, self.conv_logits]: if m is None: continue elif isinstance(m, CARAFEPack): m.init_weights() else: nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') nn.init.constant_(m.bias, 0) @auto_fp16() def forward(self, x): for conv in self.convs: x = conv(x) if self.upsample is not None: x = self.upsample(x) if self.upsample_method == 'deconv': x = self.relu(x) mask_pred = self.conv_logits(x) return mask_pred def get_targets(self, sampling_results, gt_masks, rcnn_train_cfg): pos_proposals = [res.pos_bboxes for res in sampling_results] pos_assigned_gt_inds = [ res.pos_assigned_gt_inds for res in sampling_results ] mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds, gt_masks, rcnn_train_cfg) return mask_targets @force_fp32(apply_to=('mask_pred', )) def loss(self, mask_pred, mask_targets, labels): """ Example: >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import * # NOQA >>> N = 7 # N = number of extracted ROIs >>> C, H, W = 11, 32, 32 >>> # Create example instance of FCN Mask Head. >>> # There are lots of variations depending on the configuration >>> self = FCNMaskHead(num_classes=C, num_convs=1) >>> inputs = torch.rand(N, self.in_channels, H, W) >>> mask_pred = self.forward(inputs) >>> sf = self.scale_factor >>> labels = torch.randint(0, C, size=(N,)) >>> # With the default properties the mask targets should indicate >>> # a (potentially soft) single-class label >>> mask_targets = torch.rand(N, H * sf, W * sf) >>> loss = self.loss(mask_pred, mask_targets, labels) >>> print('loss = {!r}'.format(loss)) """ loss = dict() if mask_pred.size(0) == 0: loss_mask = mask_pred.sum() else: if self.class_agnostic: loss_mask = self.loss_mask(mask_pred, mask_targets, torch.zeros_like(labels)) else: loss_mask = self.loss_mask(mask_pred, mask_targets, labels) loss['loss_mask'] = loss_mask return loss def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, scale_factor, rescale): """Get segmentation masks from mask_pred and bboxes. Args: mask_pred (Tensor or ndarray): shape (n, #class, h, w). For single-scale testing, mask_pred is the direct output of model, whose type is Tensor, while for multi-scale testing, it will be converted to numpy array outside of this method. det_bboxes (Tensor): shape (n, 4/5) det_labels (Tensor): shape (n, ) rcnn_test_cfg (dict): rcnn testing config ori_shape (Tuple): original image height and width, shape (2,) scale_factor(ndarray | Tensor): If ``rescale is True``, box coordinates are divided by this scale factor to fit ``ori_shape``. rescale (bool): If True, the resulting masks will be rescaled to ``ori_shape``. Returns: list[list]: encoded masks. The c-th item in the outer list corresponds to the c-th class. Given the c-th outer list, the i-th item in that inner list is the mask for the i-th box with class label c. Example: >>> import mmcv >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import * # NOQA >>> N = 7 # N = number of extracted ROIs >>> C, H, W = 11, 32, 32 >>> # Create example instance of FCN Mask Head. >>> self = FCNMaskHead(num_classes=C, num_convs=0) >>> inputs = torch.rand(N, self.in_channels, H, W) >>> mask_pred = self.forward(inputs) >>> # Each input is associated with some bounding box >>> det_bboxes = torch.Tensor([[1, 1, 42, 42 ]] * N) >>> det_labels = torch.randint(0, C, size=(N,)) >>> rcnn_test_cfg = mmcv.Config({'mask_thr_binary': 0, }) >>> ori_shape = (H * 4, W * 4) >>> scale_factor = torch.FloatTensor((1, 1)) >>> rescale = False >>> # Encoded masks are a list for each category. >>> encoded_masks = self.get_seg_masks( >>> mask_pred, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, >>> scale_factor, rescale >>> ) >>> assert len(encoded_masks) == C >>> assert sum(list(map(len, encoded_masks))) == N """ if isinstance(mask_pred, torch.Tensor): mask_pred = mask_pred.sigmoid() else: # In AugTest, has been activated before mask_pred = det_bboxes.new_tensor(mask_pred) device = mask_pred.device cls_segms = [[] for _ in range(self.num_classes) ] # BG is not included in num_classes bboxes = det_bboxes[:, :4] labels = det_labels # In most cases, scale_factor should have been # converted to Tensor when rescale the bbox if not isinstance(scale_factor, torch.Tensor): if isinstance(scale_factor, float): scale_factor = np.array([scale_factor] * 4) warn('Scale_factor should be a Tensor or ndarray ' 'with shape (4,), float would be deprecated. ') assert isinstance(scale_factor, np.ndarray) scale_factor = torch.Tensor(scale_factor) if rescale: img_h, img_w = ori_shape[:2] bboxes = bboxes / scale_factor else: w_scale, h_scale = scale_factor[0], scale_factor[1] img_h = np.round(ori_shape[0] * h_scale.item()).astype(np.int32) img_w = np.round(ori_shape[1] * w_scale.item()).astype(np.int32) N = len(mask_pred) # The actual implementation split the input into chunks, # and paste them chunk by chunk. if device.type == 'cpu': # CPU is most efficient when they are pasted one by one with # skip_empty=True, so that it performs minimal number of # operations. num_chunks = N else: # GPU benefits from parallelism for larger chunks, # but may have memory issue # the types of img_w and img_h are np.int32, # when the image resolution is large, # the calculation of num_chunks will overflow. # so we neet to change the types of img_w and img_h to int. # See https://github.com/open-mmlab/mmdetection/pull/5191 num_chunks = int( np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT)) assert (num_chunks <= N), 'Default GPU_MEM_LIMIT is too small; try increasing it' chunks = torch.chunk(torch.arange(N, device=device), num_chunks) threshold = rcnn_test_cfg.mask_thr_binary im_mask = torch.zeros( N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8) if not self.class_agnostic: mask_pred = mask_pred[range(N), labels][:, None] for inds in chunks: masks_chunk, spatial_inds = _do_paste_mask( mask_pred[inds], bboxes[inds], img_h, img_w, skip_empty=device.type == 'cpu') if threshold >= 0: masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) else: # for visualization and debugging masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) im_mask[(inds, ) + spatial_inds] = masks_chunk for i in range(N): cls_segms[labels[i]].append(im_mask[i].detach().cpu().numpy()) return cls_segms def onnx_export(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg, ori_shape, **kwargs): """Get segmentation masks from mask_pred and bboxes. Args: mask_pred (Tensor): shape (n, #class, h, w). det_bboxes (Tensor): shape (n, 4/5) det_labels (Tensor): shape (n, ) rcnn_test_cfg (dict): rcnn testing config ori_shape (Tuple): original image height and width, shape (2,) Returns: Tensor: a mask of shape (N, img_h, img_w). """ mask_pred = mask_pred.sigmoid() bboxes = det_bboxes[:, :4] labels = det_labels # No need to consider rescale and scale_factor while exporting to ONNX img_h, img_w = ori_shape[:2] threshold = rcnn_test_cfg.mask_thr_binary if not self.class_agnostic: box_inds = torch.arange(mask_pred.shape[0]) mask_pred = mask_pred[box_inds, labels][:, None] masks, _ = _do_paste_mask(mask_pred, bboxes, img_h, img_w, skip_empty=False) if threshold >= 0: # should convert to float to avoid problems in TRT masks = (masks >= threshold).to(dtype=torch.float) return masks
class FPEM_FFM(BaseModule): """This code is from https://github.com/WenmuZhou/PAN.pytorch. Args: in_channels (list[int]): A list of 4 numbers of input channels. conv_out (int): Number of output channels. fpem_repeat (int): Number of FPEM layers before FFM operations. align_corners (bool): The interpolation behaviour in FFM operation, used in :func:`torch.nn.functional.interpolate`. init_cfg (dict or list[dict], optional): Initialization configs. """ def __init__(self, in_channels, conv_out=128, fpem_repeat=2, align_corners=False, init_cfg=dict( type='Xavier', layer='Conv2d', distribution='uniform')): super().__init__(init_cfg=init_cfg) # reduce layers self.reduce_conv_c2 = nn.Sequential( nn.Conv2d( in_channels=in_channels[0], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.reduce_conv_c3 = nn.Sequential( nn.Conv2d( in_channels=in_channels[1], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.reduce_conv_c4 = nn.Sequential( nn.Conv2d( in_channels=in_channels[2], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.reduce_conv_c5 = nn.Sequential( nn.Conv2d( in_channels=in_channels[3], out_channels=conv_out, kernel_size=1), nn.BatchNorm2d(conv_out), nn.ReLU()) self.align_corners = align_corners self.fpems = ModuleList() for _ in range(fpem_repeat): self.fpems.append(FPEM(conv_out)) def forward(self, x): """ Args: x (list[Tensor]): A list of four tensors of shape :math:`(N, C_i, H_i, W_i)`, representing C2, C3, C4, C5 features respectively. :math:`C_i` should matches the number in ``in_channels``. Returns: list[Tensor]: Four tensors of shape :math:`(N, C_{out}, H_0, W_0)` where :math:`C_{out}` is ``conv_out``. """ c2, c3, c4, c5 = x # reduce channel c2 = self.reduce_conv_c2(c2) c3 = self.reduce_conv_c3(c3) c4 = self.reduce_conv_c4(c4) c5 = self.reduce_conv_c5(c5) # FPEM for i, fpem in enumerate(self.fpems): c2, c3, c4, c5 = fpem(c2, c3, c4, c5) if i == 0: c2_ffm = c2 c3_ffm = c3 c4_ffm = c4 c5_ffm = c5 else: c2_ffm += c2 c3_ffm += c3 c4_ffm += c4 c5_ffm += c5 # FFM c5 = F.interpolate( c5_ffm, c2_ffm.size()[-2:], mode='bilinear', align_corners=self.align_corners) c4 = F.interpolate( c4_ffm, c2_ffm.size()[-2:], mode='bilinear', align_corners=self.align_corners) c3 = F.interpolate( c3_ffm, c2_ffm.size()[-2:], mode='bilinear', align_corners=self.align_corners) outs = [c2_ffm, c3, c4, c5] return tuple(outs)
class VisionTransformer(BaseModule): """Vision Transformer. A PyTorch implement of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 Args: img_size (int | tuple): Input image size. Default: 224. patch_size (int): The patch size. Default: 16. in_channels (int): Number of input channels. Default: 3. embed_dims (int): embedding dimension. Default: 768. num_layers (int): depth of transformer. Default: 12. num_heads (int): number of attention heads. Default: 12. mlp_ratio (int): ratio of mlp hidden dim to embedding dim. Default: 4. out_indices (list | tuple | int): Output from which stages. Default: -1. qkv_bias (bool): enable bias for qkv if True. Default: True. drop_rate (float): Probability of an element to be zeroed. Default 0.0 attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 drop_path_rate (float): stochastic depth rate. Default 0.0 with_cls_token (bool): Whether concatenating class token into image tokens as transformer input. Default: True. output_cls_token (bool): Whether output the cls_token. If set True, `with_cls_token` must be True. Default: False. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN') act_cfg (dict): The activation config for FFNs. Defalut: dict(type='GELU'). patch_norm (bool): Whether to add a norm in PatchEmbed Block. Default: False. final_norm (bool): Whether to add a additional layer to normalize final feature map. Default: False. interpolate_mode (str): Select the interpolate mode for position embeding vector resize. Default: bicubic. num_fcs (int): The number of fully-connected layers for FFNs. Default: 2. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. pretrain_style (str): Choose to use timm or mmcls pretrain weights. Default: timm. pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. """ def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dims=768, num_layers=12, num_heads=12, mlp_ratio=4, out_indices=-1, qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., with_cls_token=True, output_cls_token=False, norm_cfg=dict(type='LN'), act_cfg=dict(type='GELU'), patch_norm=False, final_norm=False, interpolate_mode='bicubic', num_fcs=2, norm_eval=False, with_cp=False, pretrain_style='timm', pretrained=None, init_cfg=None): super(VisionTransformer, self).__init__() if isinstance(img_size, int): img_size = to_2tuple(img_size) elif isinstance(img_size, tuple): if len(img_size) == 1: img_size = to_2tuple(img_size[0]) assert len(img_size) == 2, \ f'The size of image should have length 1 or 2, ' \ f'but got {len(img_size)}' assert pretrain_style in ['timm', 'mmcls'] if output_cls_token: assert with_cls_token is True, f'with_cls_token must be True if' \ f'set output_cls_token to True, but got {with_cls_token}' if isinstance(pretrained, str) or pretrained is None: warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') else: raise TypeError('pretrained must be a str or None') self.img_size = img_size self.patch_size = patch_size self.interpolate_mode = interpolate_mode self.norm_eval = norm_eval self.with_cp = with_cp self.pretrain_style = pretrain_style self.pretrained = pretrained self.init_cfg = init_cfg self.patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims, conv_type='Conv2d', kernel_size=patch_size, stride=patch_size, pad_to_patch_size=True, norm_cfg=norm_cfg if patch_norm else None, init_cfg=None, ) num_patches = (img_size[0] // patch_size) * \ (img_size[1] // patch_size) self.with_cls_token = with_cls_token self.output_cls_token = output_cls_token self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) self.pos_embed = nn.Parameter( torch.zeros(1, num_patches + 1, embed_dims)) self.drop_after_pos = nn.Dropout(p=drop_rate) if isinstance(out_indices, int): if out_indices == -1: out_indices = num_layers - 1 self.out_indices = [out_indices] elif isinstance(out_indices, list) or isinstance(out_indices, tuple): self.out_indices = out_indices else: raise TypeError('out_indices must be type of int, list or tuple') dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, num_layers) ] # stochastic depth decay rule self.layers = ModuleList() for i in range(num_layers): self.layers.append( TransformerEncoderLayer(embed_dims=embed_dims, num_heads=num_heads, feedforward_channels=mlp_ratio * embed_dims, attn_drop_rate=attn_drop_rate, drop_rate=drop_rate, drop_path_rate=dpr[i], num_fcs=num_fcs, qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, batch_first=True)) self.final_norm = final_norm if final_norm: self.norm1_name, norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1) self.add_module(self.norm1_name, norm1) @property def norm1(self): return getattr(self, self.norm1_name) def init_weights(self): if isinstance(self.pretrained, str): logger = get_root_logger() checkpoint = _load_checkpoint(self.pretrained, logger=logger, map_location='cpu') if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] elif 'model' in checkpoint: state_dict = checkpoint['model'] else: state_dict = checkpoint if self.pretrain_style == 'timm': # Because the refactor of vit is blocked by mmcls, # so we firstly use timm pretrain weights to train # downstream model. state_dict = vit_convert(state_dict) if 'pos_embed' in state_dict.keys(): if self.pos_embed.shape != state_dict['pos_embed'].shape: logger.info(msg=f'Resize the pos_embed shape from ' f'{state_dict["pos_embed"].shape} to ' f'{self.pos_embed.shape}') h, w = self.img_size pos_size = int( math.sqrt(state_dict['pos_embed'].shape[1] - 1)) state_dict['pos_embed'] = self.resize_pos_embed( state_dict['pos_embed'], (h // self.patch_size, w // self.patch_size), (pos_size, pos_size), self.interpolate_mode) self.load_state_dict(state_dict, False) elif self.pretrained is None: super(VisionTransformer, self).init_weights() # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 trunc_normal_init(self.pos_embed, std=.02) trunc_normal_init(self.cls_token, std=.02) for n, m in self.named_modules(): if isinstance(m, nn.Linear): trunc_normal_init(m.weight, std=.02) if m.bias is not None: if 'ffn' in n: normal_init(m.bias, std=1e-6) else: constant_init(m.bias, 0) elif isinstance(m, nn.Conv2d): kaiming_init(m.weight, mode='fan_in') if m.bias is not None: constant_init(m.bias, 0) elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): constant_init(m.bias, 0) constant_init(m.weight, 1.0) def _pos_embeding(self, patched_img, hw_shape, pos_embed): """Positiong embeding method. Resize the pos_embed, if the input image size doesn't match the training size. Args: patched_img (torch.Tensor): The patched image, it should be shape of [B, L1, C]. hw_shape (tuple): The downsampled image resolution. pos_embed (torch.Tensor): The pos_embed weighs, it should be shape of [B, L2, c]. Return: torch.Tensor: The pos encoded image feature. """ assert patched_img.ndim == 3 and pos_embed.ndim == 3, \ 'the shapes of patched_img and pos_embed must be [B, L, C]' x_len, pos_len = patched_img.shape[1], pos_embed.shape[1] if x_len != pos_len: if pos_len == (self.img_size[0] // self.patch_size) * ( self.img_size[1] // self.patch_size) + 1: pos_h = self.img_size[0] // self.patch_size pos_w = self.img_size[1] // self.patch_size else: raise ValueError( 'Unexpected shape of pos_embed, got {}.'.format( pos_embed.shape)) pos_embed = self.resize_pos_embed(pos_embed, hw_shape, (pos_h, pos_w), self.interpolate_mode) return self.drop_after_pos(patched_img + pos_embed) @staticmethod def resize_pos_embed(pos_embed, input_shpae, pos_shape, mode): """Resize pos_embed weights. Resize pos_embed using bicubic interpolate method. Args: pos_embed (torch.Tensor): Position embedding weights. input_shpae (tuple): Tuple for (downsampled input image height, downsampled input image width). pos_shape (tuple): The resolution of downsampled origin training image. mode (str): Algorithm used for upsampling: ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | ``'trilinear'``. Default: ``'nearest'`` Return: torch.Tensor: The resized pos_embed of shape [B, L_new, C] """ assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' pos_h, pos_w = pos_shape cls_token_weight = pos_embed[:, 0] pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] pos_embed_weight = pos_embed_weight.reshape( 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) pos_embed_weight = F.interpolate(pos_embed_weight, size=input_shpae, align_corners=False, mode=mode) cls_token_weight = cls_token_weight.unsqueeze(1) pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) return pos_embed def forward(self, inputs): B = inputs.shape[0] x, hw_shape = self.patch_embed(inputs), (self.patch_embed.DH, self.patch_embed.DW) # stole cls_tokens impl from Phil Wang, thanks cls_tokens = self.cls_token.expand(B, -1, -1) x = torch.cat((cls_tokens, x), dim=1) x = self._pos_embeding(x, hw_shape, self.pos_embed) if not self.with_cls_token: # Remove class token for transformer encoder input x = x[:, 1:] outs = [] for i, layer in enumerate(self.layers): x = layer(x) if i == len(self.layers) - 1: if self.final_norm: x = self.norm1(x) if i in self.out_indices: if self.with_cls_token: # Remove class token and reshape token for decoder head out = x[:, 1:] else: out = x B, _, C = out.shape out = out.reshape(B, hw_shape[0], hw_shape[1], C).permute(0, 3, 1, 2) if self.output_cls_token: out = [out, x[:, 0]] outs.append(out) return tuple(outs) def train(self, mode=True): super(VisionTransformer, self).train(mode) if mode and self.norm_eval: for m in self.modules(): if isinstance(m, nn.LayerNorm): m.eval()
class MixVisionTransformer(BaseModule): """The backbone of Segformer. This backbone is the implementation of `SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers <https://arxiv.org/abs/2105.15203>`_. Args: in_channels (int): Number of input channels. Default: 3. embed_dims (int): Embedding dimension. Default: 768. num_stags (int): The num of stages. Default: 4. num_layers (Sequence[int]): The layer number of each transformer encode layer. Default: [3, 4, 6, 3]. num_heads (Sequence[int]): The attention heads of each transformer encode layer. Default: [1, 2, 4, 8]. patch_sizes (Sequence[int]): The patch_size of each overlapped patch embedding. Default: [7, 3, 3, 3]. strides (Sequence[int]): The stride of each overlapped patch embedding. Default: [4, 2, 2, 2]. sr_ratios (Sequence[int]): The spatial reduction rate of each transformer encode layer. Default: [8, 4, 2, 1]. out_indices (Sequence[int] | int): Output from which stages. Default: (0, 1, 2, 3). mlp_ratio (int): ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): Enable bias for qkv if True. Default: True. drop_rate (float): Probability of an element to be zeroed. Default 0.0 attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 drop_path_rate (float): stochastic depth rate. Default 0.0 norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN') act_cfg (dict): The activation config for FFNs. Defalut: dict(type='GELU'). pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. """ def __init__(self, in_channels=3, embed_dims=64, num_stages=4, num_layers=[3, 4, 6, 3], num_heads=[1, 2, 4, 8], patch_sizes=[7, 3, 3, 3], strides=[4, 2, 2, 2], sr_ratios=[8, 4, 2, 1], out_indices=(0, 1, 2, 3), mlp_ratio=4, qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), pretrained=None, init_cfg=None): super().__init__() if isinstance(pretrained, str) or pretrained is None: warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') else: raise TypeError('pretrained must be a str or None') self.embed_dims = embed_dims self.num_stages = num_stages self.num_layers = num_layers self.num_heads = num_heads self.patch_sizes = patch_sizes self.strides = strides self.sr_ratios = sr_ratios assert num_stages == len(num_layers) == len(num_heads) \ == len(patch_sizes) == len(strides) == len(sr_ratios) self.out_indices = out_indices assert max(out_indices) < self.num_stages self.pretrained = pretrained self.init_cfg = init_cfg # transformer encoder dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(num_layers)) ] # stochastic num_layer decay rule cur = 0 self.layers = ModuleList() for i, num_layer in enumerate(num_layers): embed_dims_i = embed_dims * num_heads[i] patch_embed = PatchEmbed(in_channels=in_channels, embed_dims=embed_dims_i, kernel_size=patch_sizes[i], stride=strides[i], padding=patch_sizes[i] // 2, norm_cfg=norm_cfg) layer = ModuleList([ TransformerEncoderLayer( embed_dims=embed_dims_i, num_heads=num_heads[i], feedforward_channels=mlp_ratio * embed_dims_i, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[cur + idx], qkv_bias=qkv_bias, act_cfg=act_cfg, norm_cfg=norm_cfg, sr_ratio=sr_ratios[i]) for idx in range(num_layer) ]) in_channels = embed_dims_i # The ret[0] of build_norm_layer is norm name. norm = build_norm_layer(norm_cfg, embed_dims_i)[1] self.layers.append(ModuleList([patch_embed, layer, norm])) cur += num_layer def init_weights(self): if self.pretrained is None: for m in self.modules(): if isinstance(m, nn.Linear): trunc_normal_init(m.weight, std=.02) if m.bias is not None: constant_init(m.bias, 0) elif isinstance(m, nn.LayerNorm): constant_init(m.bias, 0) constant_init(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[ 1] * m.out_channels fan_out //= m.groups normal_init(m.weight, 0, math.sqrt(2.0 / fan_out)) if m.bias is not None: constant_init(m.bias, 0) elif isinstance(self.pretrained, str): logger = get_root_logger() checkpoint = _load_checkpoint(self.pretrained, logger=logger, map_location='cpu') if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] else: state_dict = checkpoint self.load_state_dict(state_dict, False) def forward(self, x): outs = [] for i, layer in enumerate(self.layers): x, hw_shape = layer[0](x) for block in layer[1]: x = block(x, hw_shape) x = layer[2](x) x = nlc_to_nchw(x, hw_shape) if i in self.out_indices: outs.append(x) return outs
class FPNC(BaseModule): """FPN-like fusion module in Real-time Scene Text Detection with Differentiable Binarization. This was partially adapted from https://github.com/MhLiao/DB and https://github.com/WenmuZhou/DBNet.pytorch. Args: in_channels (list[int]): A list of numbers of input channels. lateral_channels (int): Number of channels for lateral layers. out_channels (int): Number of output channels. bias_on_lateral (bool): Whether to use bias on lateral convolutional layers. bn_re_on_lateral (bool): Whether to use BatchNorm and ReLU on lateral convolutional layers. bias_on_smooth (bool): Whether to use bias on smoothing layer. bn_re_on_smooth (bool): Whether to use BatchNorm and ReLU on smoothing layer. conv_after_concat (bool): Whether to add a convolution layer after the concatenation of predictions. init_cfg (dict or list[dict], optional): Initialization configs. """ def __init__(self, in_channels, lateral_channels=256, out_channels=64, bias_on_lateral=False, bn_re_on_lateral=False, bias_on_smooth=False, bn_re_on_smooth=False, conv_after_concat=False, init_cfg=None): super().__init__(init_cfg=init_cfg) assert isinstance(in_channels, list) self.in_channels = in_channels self.lateral_channels = lateral_channels self.out_channels = out_channels self.num_ins = len(in_channels) self.bn_re_on_lateral = bn_re_on_lateral self.bn_re_on_smooth = bn_re_on_smooth self.conv_after_concat = conv_after_concat self.lateral_convs = ModuleList() self.smooth_convs = ModuleList() self.num_outs = self.num_ins for i in range(self.num_ins): norm_cfg = None act_cfg = None if self.bn_re_on_lateral: norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') l_conv = ConvModule( in_channels[i], lateral_channels, 1, bias=bias_on_lateral, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) norm_cfg = None act_cfg = None if self.bn_re_on_smooth: norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') smooth_conv = ConvModule( lateral_channels, out_channels, 3, bias=bias_on_smooth, padding=1, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.lateral_convs.append(l_conv) self.smooth_convs.append(smooth_conv) if self.conv_after_concat: norm_cfg = dict(type='BN') act_cfg = dict(type='ReLU') self.out_conv = ConvModule( out_channels * self.num_outs, out_channels * self.num_outs, 3, padding=1, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) @auto_fp16() def forward(self, inputs): """ Args: inputs (list[Tensor]): Each tensor has the shape of :math:`(N, C_i, H_i, W_i)`. It usually expects 4 tensors (C2-C5 features) from ResNet. Returns: Tensor: A tensor of shape :math:`(N, C_{out}, H_0, W_0)` where :math:`C_{out}` is ``out_channels``. """ assert len(inputs) == len(self.in_channels) # build laterals laterals = [ lateral_conv(inputs[i]) for i, lateral_conv in enumerate(self.lateral_convs) ] used_backbone_levels = len(laterals) # build top-down path for i in range(used_backbone_levels - 1, 0, -1): prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] += F.interpolate( laterals[i], size=prev_shape, mode='nearest') # build outputs # part 1: from original levels outs = [ self.smooth_convs[i](laterals[i]) for i in range(used_backbone_levels) ] for i, out in enumerate(outs): outs[i] = F.interpolate( outs[i], size=outs[0].shape[2:], mode='nearest') out = torch.cat(outs, dim=1) if self.conv_after_concat: out = self.out_conv(out) return out
class StackedLinearClsHead(ClsHead): """Classifier head with several hidden fc layer and a output fc layer. Args: num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. mid_channels (Sequence): Number of channels in the hidden fc layers. dropout_rate (float): Dropout rate after each hidden fc layer, except the last layer. Defaults to 0. norm_cfg (dict, optional): Config dict of normalization layer after each hidden fc layer, except the last layer. Defaults to None. act_cfg (dict, optional): Config dict of activation function after each hidden layer, except the last layer. Defaults to use "ReLU". """ def __init__(self, num_classes: int, in_channels: int, mid_channels: Sequence, dropout_rate: float = 0., norm_cfg: Dict = None, act_cfg: Dict = dict(type='ReLU'), **kwargs): super(StackedLinearClsHead, self).__init__(**kwargs) assert num_classes > 0, \ f'`num_classes` of StackedLinearClsHead must be a positive ' \ f'integer, got {num_classes} instead.' self.num_classes = num_classes self.in_channels = in_channels assert isinstance(mid_channels, Sequence), \ f'`mid_channels` of StackedLinearClsHead should be a sequence, ' \ f'instead of {type(mid_channels)}' self.mid_channels = mid_channels self.dropout_rate = dropout_rate self.norm_cfg = norm_cfg self.act_cfg = act_cfg self._init_layers() def _init_layers(self): self.layers = ModuleList( init_cfg=dict( type='Normal', layer='Linear', mean=0., std=0.01, bias=0.)) in_channels = self.in_channels for hidden_channels in self.mid_channels: self.layers.append( LinearBlock( in_channels, hidden_channels, dropout_rate=self.dropout_rate, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) in_channels = hidden_channels self.layers.append( LinearBlock( self.mid_channels[-1], self.num_classes, dropout_rate=0., norm_cfg=None, act_cfg=None)) def init_weights(self): self.layers.init_weights() def simple_test(self, x): """Test without augmentation.""" if isinstance(x, tuple): x = x[-1] cls_score = x for layer in self.layers: cls_score = layer(cls_score) if isinstance(cls_score, list): cls_score = sum(cls_score) / float(len(cls_score)) pred = F.softmax(cls_score, dim=1) if cls_score is not None else None return self.post_process(pred) def forward_train(self, x, gt_label, **kwargs): if isinstance(x, tuple): x = x[-1] cls_score = x for layer in self.layers: cls_score = layer(cls_score) losses = self.loss(cls_score, gt_label, **kwargs) return losses