def __init__(self, num_query=1, transformer=None, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), bbox_head=None, cls_head=None, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, ), loss_bbox=dict(type='L1Loss', loss_weight=5.0), loss_iou=dict(type='GIoULoss', loss_weight=2.0), train_cfg=None, test_cfg=None, init_cfg=None, frozen_modules=None, **kwargs): super(StarkHead, self).__init__(init_cfg=init_cfg) self.transformer = build_transformer(transformer) self.positional_encoding = build_positional_encoding( positional_encoding) assert bbox_head is not None self.bbox_head = build_head(bbox_head) if cls_head is None: # the stage-1 training self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) self.cls_head = None else: # the stage-2 training self.cls_head = build_head(cls_head) self.loss_cls = build_loss(loss_cls) self.embed_dims = self.transformer.embed_dims self.num_query = num_query self.query_embedding = nn.Embedding(self.num_query, self.embed_dims) self.train_cfg = train_cfg self.test_cfg = test_cfg self.fp16_enabled = False if frozen_modules is not None: assert isinstance(frozen_modules, list) for module in frozen_modules: m = getattr(self, module) # TODO: Study the influence of freezing BN running_mean and # running_variance of `frozen_modules` in the 2nd stage train. # The official code doesn't freeze these. for param in m.parameters(): param.requires_grad = False
def __init__(self, num_convs=4, roi_feat_size=14, in_channels=256, conv_kernel_size=3, conv_out_channels=256, num_classes=80, class_agnostic=False, upsample_cfg=dict(type='deconv', scale_factor=2), conv_cfg=None, norm_cfg=None, dynamic_conv_cfg=dict( type='DynamicConv', in_channels=256, feat_channels=64, out_channels=256, input_feat_shape=14, with_proj=False, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_mask=dict(type='DiceLoss', loss_weight=8.0), **kwargs): super(DynamicMaskHead, self).__init__( num_convs=num_convs, roi_feat_size=roi_feat_size, in_channels=in_channels, conv_kernel_size=conv_kernel_size, conv_out_channels=conv_out_channels, num_classes=num_classes, class_agnostic=class_agnostic, upsample_cfg=upsample_cfg, conv_cfg=conv_cfg, norm_cfg=norm_cfg, loss_mask=loss_mask, **kwargs) assert class_agnostic is False, \ 'DynamicMaskHead only support class_agnostic=False' self.fp16_enabled = False self.instance_interactive_conv = build_transformer(dynamic_conv_cfg)
def __init__(self, num_classes=80, num_ffn_fcs=2, num_heads=8, num_cls_fcs=1, num_reg_fcs=3, feedforward_channels=2048, in_channels=256, dropout=0.0, ffn_act_cfg=dict(type='ReLU', inplace=True), dynamic_conv_cfg=dict(type='DynamicConv', in_channels=256, feat_channels=64, out_channels=256, input_feat_shape=7, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN')), loss_iou=dict(type='GIoULoss', loss_weight=2.0), init_cfg=None, **kwargs): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(DIIHead, self).__init__(num_classes=num_classes, reg_decoded_bbox=True, reg_class_agnostic=True, init_cfg=init_cfg, **kwargs) self.loss_iou = build_loss(loss_iou) self.in_channels = in_channels self.fp16_enabled = False self.attention = MultiheadAttention(in_channels, num_heads, dropout) self.attention_norm = build_norm_layer(dict(type='LN'), in_channels)[1] self.instance_interactive_conv = build_transformer(dynamic_conv_cfg) self.instance_interactive_conv_dropout = nn.Dropout(dropout) self.instance_interactive_conv_norm = build_norm_layer( dict(type='LN'), in_channels)[1] self.ffn = FFN(in_channels, feedforward_channels, num_ffn_fcs, act_cfg=ffn_act_cfg, dropout=dropout) self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1] self.cls_fcs = nn.ModuleList() for _ in range(num_cls_fcs): self.cls_fcs.append(nn.Linear(in_channels, in_channels, bias=False)) self.cls_fcs.append( build_norm_layer(dict(type='LN'), in_channels)[1]) self.cls_fcs.append( build_activation_layer(dict(type='ReLU', inplace=True))) # over load the self.fc_cls in BBoxHead if self.loss_cls.use_sigmoid: self.fc_cls = nn.Linear(in_channels, self.num_classes) else: self.fc_cls = nn.Linear(in_channels, self.num_classes + 1) self.reg_fcs = nn.ModuleList() for _ in range(num_reg_fcs): self.reg_fcs.append(nn.Linear(in_channels, in_channels, bias=False)) self.reg_fcs.append( build_norm_layer(dict(type='LN'), in_channels)[1]) self.reg_fcs.append( build_activation_layer(dict(type='ReLU', inplace=True))) # over load the self.fc_cls in BBoxHead self.fc_reg = nn.Linear(in_channels, 4) assert self.reg_class_agnostic, 'DIIHead only ' \ 'suppport `reg_class_agnostic=True` ' assert self.reg_decoded_bbox, 'DIIHead only ' \ 'suppport `reg_decoded_bbox=True`'
def __init__(self, num_classes, in_channels, num_fcs=2, transformer=dict( type='Transformer', embed_dims=256, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, feedforward_channels=2048, dropout=0.1, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN'), num_fcs=2, pre_norm=False, return_intermediate_dec=True), positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), loss_cls=dict( type='CrossEntropyLoss', bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=5.0), loss_iou=dict(type='GIoULoss', loss_weight=2.0), train_cfg=dict( assigner=dict( type='HungarianAssigner', cls_weight=1., bbox_weight=5., iou_weight=2., iou_calculator=dict(type='BboxOverlaps2D'), iou_mode='giou')), test_cfg=dict(max_per_img=100), **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. super(AnchorFreeHead, self).__init__() use_sigmoid_cls = loss_cls.get('use_sigmoid', False) assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \ 'not supported in DETR, since background is needed for the ' \ 'matching process.' assert 'embed_dims' in transformer \ and 'num_feats' in positional_encoding num_feats = positional_encoding['num_feats'] embed_dims = transformer['embed_dims'] assert num_feats * 2 == embed_dims, 'embed_dims should' \ f' be exactly 2 times of num_feats. Found {embed_dims}' \ f' and {num_feats}.' assert test_cfg is not None and 'max_per_img' in test_cfg class_weight = loss_cls.get('class_weight', None) if class_weight is not None: assert isinstance(class_weight, float), 'Expected ' \ 'class_weight to have type float. Found ' \ f'{type(class_weight)}.' # NOTE following the official DETR rep0, bg_cls_weight means # relative classification weight of the no-object class. bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) assert isinstance(bg_cls_weight, float), 'Expected ' \ 'bg_cls_weight to have type float. Found ' \ f'{type(bg_cls_weight)}.' class_weight = torch.ones(num_classes + 1) * class_weight # set background class as the last indice class_weight[num_classes] = bg_cls_weight loss_cls.update({'class_weight': class_weight}) if 'bg_cls_weight' in loss_cls: loss_cls.pop('bg_cls_weight') self.bg_cls_weight = bg_cls_weight if train_cfg: assert 'assigner' in train_cfg, 'assigner should be provided '\ 'when train_cfg is set.' assigner = train_cfg['assigner'] assert loss_cls['loss_weight'] == assigner['cls_weight'], \ 'The classification weight for loss and matcher should be' \ 'exactly the same.' assert loss_bbox['loss_weight'] == assigner['bbox_weight'], \ 'The regression L1 weight for loss and matcher should be' \ 'exactly the same.' assert loss_iou['loss_weight'] == assigner['iou_weight'], \ 'The regression iou weight for loss and matcher should be' \ 'exactly the same.' self.assigner = build_assigner(assigner) # DETR sampling=False, so use PseudoSampler sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.num_classes = num_classes self.cls_out_channels = num_classes + 1 self.in_channels = in_channels self.num_fcs = num_fcs self.train_cfg = train_cfg self.test_cfg = test_cfg self.use_sigmoid_cls = use_sigmoid_cls self.embed_dims = embed_dims self.num_query = test_cfg['max_per_img'] self.fp16_enabled = False self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True)) self.activate = build_activation_layer(self.act_cfg) self.positional_encoding = build_positional_encoding( positional_encoding) self.transformer = build_transformer(transformer) self._init_layers()
def __init__( self, num_classes, in_channels, num_query=100, num_reg_fcs=2, transformer=None, sync_cls_avg_factor=False, positional_encoding=dict(type='SinePositionalEncoding', num_feats=128, normalize=True), loss_cls=dict(type='CrossEntropyLoss', bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=5.0), loss_iou=dict(type='GIoULoss', loss_weight=2.0), train_cfg=dict(assigner=dict( type='HungarianAssigner', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='BBoxL1Cost', weight=5.0), iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), test_cfg=dict(max_per_img=100), init_cfg=None, **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. super(AnchorFreeHead, self).__init__(init_cfg) self.bg_cls_weight = 0 self.sync_cls_avg_factor = sync_cls_avg_factor class_weight = loss_cls.get('class_weight', None) if class_weight is not None and (self.__class__ is DETRHead): assert isinstance(class_weight, float), 'Expected ' \ 'class_weight to have type float. Found ' \ f'{type(class_weight)}.' # NOTE following the official DETR rep0, bg_cls_weight means # relative classification weight of the no-object class. bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) assert isinstance(bg_cls_weight, float), 'Expected ' \ 'bg_cls_weight to have type float. Found ' \ f'{type(bg_cls_weight)}.' class_weight = torch.ones(num_classes + 1) * class_weight # set background class as the last indice class_weight[num_classes] = bg_cls_weight loss_cls.update({'class_weight': class_weight}) if 'bg_cls_weight' in loss_cls: loss_cls.pop('bg_cls_weight') self.bg_cls_weight = bg_cls_weight if train_cfg: assert 'assigner' in train_cfg, 'assigner should be provided '\ 'when train_cfg is set.' assigner = train_cfg['assigner'] assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \ 'The classification weight for loss and matcher should be' \ 'exactly the same.' assert loss_bbox['loss_weight'] == assigner['reg_cost'][ 'weight'], 'The regression L1 weight for loss and matcher ' \ 'should be exactly the same.' assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \ 'The regression iou weight for loss and matcher should be' \ 'exactly the same.' self.assigner = build_assigner(assigner) # DETR sampling=False, so use PseudoSampler sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.num_query = num_query self.num_classes = num_classes self.in_channels = in_channels self.num_reg_fcs = num_reg_fcs self.train_cfg = train_cfg self.test_cfg = test_cfg self.fp16_enabled = False self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True)) self.activate = build_activation_layer(self.act_cfg) self.positional_encoding = build_positional_encoding( positional_encoding) self.transformer = build_transformer(transformer) self.embed_dims = self.transformer.embed_dims assert 'num_feats' in positional_encoding num_feats = positional_encoding['num_feats'] assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ f' and {num_feats}.' self._init_layers()