def __init__(self, cfg, input_shape: List[ShapeSpec]):
    """
    SOD Base Head.
    """
    super().__init__()
    # fmt: off
    self.instance_in_features = cfg.MODEL.SOD.INSTANCE_IN_FEATURES
    self.num_in_channels = cfg.MODEL.SOD.INSTANCE_IN_CHANNELS  # = fpn.
    self.num_channels = cfg.MODEL.SOD.BASE_CHANNELS
    self.num_conv = cfg.MODEL.SOD.NUM_BASE_CONVS
    self.norm = cfg.MODEL.SOD.NORM
    self.with_coord = cfg.MODEL.SOD.WITH_COORD
    self.num_levels = len(input_shape)
    assert self.num_levels == len(self.instance_in_features), \
        "Input shape should match the features."
    # fmt: on

    head_configs = {
        "base": (self.num_conv, self.with_coord, False),  # leave for DCN.
    }

    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, \
        "Each level must have the same channel!"
    in_channels = in_channels[0]
    assert in_channels == self.num_in_channels, \
        "In channels should equal tower in channels!"

    for head in head_configs:
        tower = []
        num_convs, use_coord, use_deformable = head_configs[head]
        for i in range(num_convs):
            # with coord or not.
            if i == 0:
                if use_coord:
                    chn = self.num_in_channels + 2
                else:
                    chn = self.num_in_channels
            else:
                chn = self.num_channels
            # use deformable conv or not.
            if use_deformable and i == num_convs - 1:
                raise NotImplementedError
            else:
                conv_func = nn.Conv2d
            tower.append(
                conv_func(chn, self.num_channels,
                          kernel_size=3, stride=1,
                          padding=1, bias=self.norm is None))
            if self.norm == "GN":
                tower.append(nn.GroupNorm(32, self.num_channels))
            tower.append(nn.ReLU(inplace=True))
        self.add_module('{}_tower'.format(head), nn.Sequential(*tower))

    # init.
    for l in self.base_tower:
        if isinstance(l, nn.Conv2d):
            nn.init.normal_(l.weight, std=0.01)
            if l.bias is not None:
                nn.init.constant_(l.bias, 0)
def gn_helper(planes):
    return nn.GroupNorm(group_norm, planes)
def __init__(self, in_dim, cout, nf=64, activation=nn.Tanh,
             requires_grad=True):
    super(FaceModelNet, self).__init__()
    prenet = [nn.Linear(in_dim, nf), nn.ReLU(inplace=True)]
    self.prenet = nn.Sequential(*prenet)
    network = [
        nn.ConvTranspose2d(nf, nf * 8, kernel_size=4, stride=1, padding=0,
                           bias=False),  # 1x1 -> 4x4
        nn.ReLU(inplace=True),
        nn.Conv2d(nf * 8, nf * 8, kernel_size=3, stride=1, padding=1,
                  bias=False),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 8, nf * 4, kernel_size=4, stride=2,
                           padding=1, bias=False),  # 4x4 -> 8x8
        nn.GroupNorm(16 * 4, nf * 4),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf * 4, nf * 4, kernel_size=3, stride=1, padding=1,
                  bias=False),
        nn.GroupNorm(16 * 4, nf * 4),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 4, nf * 2, kernel_size=4, stride=2,
                           padding=1, bias=False),  # 8x8 -> 16x16
        nn.GroupNorm(16 * 2, nf * 2),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf * 2, nf * 2, kernel_size=3, stride=1, padding=1,
                  bias=False),
        nn.GroupNorm(16 * 2, nf * 2),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 2, nf, kernel_size=4, stride=2, padding=1,
                           bias=False),  # 16x16 -> 32x32
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1, bias=False),
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Upsample(scale_factor=2, mode='nearest'),  # 32x32 -> 64x64
        nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1, bias=False),
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, nf, kernel_size=5, stride=1, padding=2, bias=False),
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, cout, kernel_size=5, stride=1, padding=2, bias=False)
    ]
    if activation is not None:
        network += [activation()]
    self.network = nn.Sequential(*network)
    if not requires_grad:
        for param in self.parameters():
            param.requires_grad = False
def __init__(self, num_classes, num_queries, num_feature_levels):
    """Initializes the model.

    Args:
        num_classes (int): number of object classes
        num_queries (int): number of object queries
        num_feature_levels (int): number of feature levels
    """
    super().__init__()
    # create ResNet50 backbone
    position_embedding = PositionEmbeddingSine(HIDDEN_DIM // 2,
                                               normalize=True)
    backbone = Joniner(Backbone(), position_embedding)
    # create deformable transformer
    transformer = DeformableTransformer(HIDDEN_DIM, NHEADS, ENC_LAYERS,
                                        DEC_LAYERS, DIM_FEEDFORWARD,
                                        DROPOUT, True, NUM_FEATURE_LEVELS,
                                        DEC_N_POINTS, ENC_N_POINTS)
    self.num_queries = num_queries
    self.transformer = transformer
    hidden_dim = transformer.d_model
    self.class_embed = nn.Linear(hidden_dim, num_classes)
    self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
    # number of backbone feature levels to use; if it exceeds the number of
    # stages the backbone provides, extra conv layers extend the pyramid
    self.num_feature_levels = num_feature_levels
    self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
    num_backbone_outs = len(backbone.strides)
    input_proj_list = []
    for _ in range(num_backbone_outs):
        in_channels = backbone.num_channels[_]
        input_proj_list.append(
            nn.Sequential(
                nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                nn.GroupNorm(32, hidden_dim),
            ))  # project the outputs of different stages to the same width
    # in_channels starts from the channel count of the backbone's last
    # output layer
    for _ in range(num_feature_levels - num_backbone_outs):
        input_proj_list.append(
            nn.Sequential(
                nn.Conv2d(in_channels, hidden_dim, kernel_size=3,
                          stride=2, padding=1),
                nn.GroupNorm(32, hidden_dim),
            ))
        in_channels = hidden_dim
        # build each additional pyramid level with a single conv layer
    self.input_proj = nn.ModuleList(input_proj_list)
    self.backbone = backbone

    prior_prob = 0.01
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    self.class_embed.bias.data = torch.ones(num_classes) * bias_value
    nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
    nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
    for proj in self.input_proj:
        nn.init.xavier_uniform_(proj[0].weight, gain=1)
        nn.init.constant_(proj[0].bias, 0)
    num_pred = transformer.decoder.num_layers
    nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
    self.class_embed = nn.ModuleList(
        [self.class_embed for _ in range(num_pred)])
    self.bbox_embed = nn.ModuleList(
        [self.bbox_embed for _ in range(num_pred)])
    self.transformer.decoder.bbox_embed = None
def __init__(self, in_nfeat=3, num_stack=4, norm_type='group',
             hg_down='ave_pool', num_hourglass=2, hourglass_dim=256):
    super(HGFilter, self).__init__()
    self.num_modules = num_stack
    self.norm_type = norm_type
    self.hg_down = hg_down
    self.num_hourglass = num_hourglass
    self.hourglass_dim = hourglass_dim

    # Base part
    self.conv1 = nn.Conv2d(in_nfeat, 64, kernel_size=7, stride=2,
                           padding=3)

    if self.norm_type == 'batch':
        self.bn1 = nn.BatchNorm2d(64)
    elif self.norm_type == 'group':
        self.bn1 = nn.GroupNorm(32, 64)

    if self.hg_down == 'conv64':
        self.conv2 = ConvBlock(64, 64, self.norm_type)
        self.down_conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=2,
                                    padding=1)
    elif self.hg_down == 'conv128':
        self.conv2 = ConvBlock(64, 128, self.norm_type)
        self.down_conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=2,
                                    padding=1)
    elif self.hg_down == 'ave_pool':
        self.conv2 = ConvBlock(64, 128, self.norm_type)
    else:
        raise NameError('Unknown Fan Filter setting!')

    self.conv3 = ConvBlock(128, 128, self.norm_type)
    self.conv4 = ConvBlock(128, 256, self.norm_type)

    # Stacking part
    for hg_module in range(self.num_modules):
        self.add_module(
            'm' + str(hg_module),
            HourGlass(1, self.num_hourglass, 256, self.norm_type))
        self.add_module('top_m_' + str(hg_module),
                        ConvBlock(256, 256, self.norm_type))
        self.add_module(
            'conv_last' + str(hg_module),
            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
        if self.norm_type == 'batch':
            self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256))
        elif self.norm_type == 'group':
            self.add_module('bn_end' + str(hg_module),
                            nn.GroupNorm(32, 256))
        self.add_module(
            'l' + str(hg_module),
            nn.Conv2d(256, self.hourglass_dim, kernel_size=1, stride=1,
                      padding=0))
        if hg_module < self.num_modules - 1:
            self.add_module(
                'bl' + str(hg_module),
                nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
            self.add_module(
                'al' + str(hg_module),
                nn.Conv2d(self.hourglass_dim, 256, kernel_size=1, stride=1,
                          padding=0))
def Norm(planes):
    return nn.GroupNorm(32, planes)
def custom_cnn(input_channels, specification, input_name='input',
               output_name='output', default_nonlin='relu',
               batch_norm=False):
    """
    Creates a CNN for the given number of input channels, with an
    architecture defined as a comma-separated string of layer definitions.
    Supported layer definitions are (with variables in <>, and optional
    parts in []):
    - pad1d:<method>@<size>
    - pad2d:<method>@<size>
    - crop1d:<size>
    - crop2d:<size>
    - conv1d:<channels>@<size>[s<stride>][p<pad>][d<dilation>][g<groups>]
    - conv2d:<channels>@<size0>x<size1>[s<stride>][p<pad>][d<dilation>][g<groups>]
    - pool1d:<method>@<size>[s<stride>][p<pad>][d<dilation>]
    - pool2d:<method>@<size0>x<size1>[s<stride>][p<pad>][d<dilation>]
    - globalpool1d:<method>
    - globalpool2d:<method>
    - globallmepool:<alpha>[t<trainable>][c<channelwise>][e<exponentiated>]
    - bn1d
    - bn2d
    - groupnorm:<groups>
    - dropout:<drop_probability>
    - relu
    - lrelu
    - sigm
    - swish
    - mish
    - bipol:<nonlin>
    - shift:<amount>
    - bypass (does nothing)
    - squeeze:<dim>
    - cat[layers1|layers2|...] (apply stacks to same input, then concat)
    - add[layers1|layers2|...] (apply stacks to same input, then add)
    - shake[layers1|layers2|...] (apply stacks to same input, then shake-shake)
    If there is a batch normalization one or two layers after a
    convolution, the convolution will not have a bias term.
    """
    def read_layers(s):
        """
        Yields all layer definitions (as separated by , | [ or ]) as tuples
        of the definition string and the following delimiter.
        """
        pos = 0
        for match in re.finditer(r'[,|[\]]', s):
            yield s[pos:match.start()], s[match.start():match.end()]
            pos = match.end()
        yield s[pos:], None

    def read_size(s, t=int, expect_remainder=True):
        """
        Read and parse a size (e.g., 1, 1x1, 1x1x1) at the beginning of
        `s`, with elements of type `t`. If `expect_remainder`, returns the
        remainder, otherwise tries to parse the complete `s` as a size.
        """
        if expect_remainder:
            # yes, we could use a precompiled regular expression...
            p = next((i for i, c in enumerate(s) if c not in '0123456789x'),
                     len(s))
            remainder = s[p:]
            s = s[:p]
        size = tuple(map(t, s.split('x')))
        if len(size) == 1:
            size = size[0]
        if expect_remainder:
            return size, remainder
        else:
            return size

    def size_string(size):
        """
        Convert a size integer or tuple back into its string form.
        """
        try:
            return 'x'.join(map(str, size))
        except TypeError:
            return str(size)

    def read_extra_sizes(s, prefixes, t=int):
        """
        Read and parse any extra size definitions prefixed by any of the
        allowed prefixes, and returns them as a dictionary. If `prefixes`
        is a dictionary, the prefixes (keys) will be translated to the
        expanded names (values) in the returned dictionary. Values will be
        converted from strings to `t`.
        """
        if not isinstance(prefixes, dict):
            prefixes = {prefix: prefix for prefix in prefixes}
        result = {}
        while s:
            for prefix, return_key in prefixes.items():
                if s.startswith(prefix):
                    size, s = read_size(s[len(prefix):], t)
                    result[return_key] = size
                    break
            else:
                raise ValueError("unrecognized part in layer definition: "
                                 "%r" % s)
        return result

    stack = []
    layers = []
    if input_name:
        layers = [PickDictKey(input_name)]
    # track receptive field for the full network
    receptive_field = ReceptiveField()
    # split specification string into definition, delimiter tuples
    specification = list(read_layers(specification))
    # iterate over it (in a way that allows us to expand macro definitions)
    while specification:
        layer_def, delim = specification.pop(0)
        layer_def = layer_def.split(':')
        kind = layer_def[0]
        if kind in ('pad1d', 'pad2d'):
            method, size = layer_def[1].split('@')
            size = read_size(size, expect_remainder=False)
            cls = {'reflectpad1d': nn.ReflectionPad1d,
                   'reflectpad2d': nn.ReflectionPad2d}[method + kind]
            layers.append(cls(size))
            receptive_field *= ReceptiveField(padding=size)
        elif kind in ('crop1d', 'crop2d'):
            size = int(layer_def[1])
            dimensionality = int(kind[-2])
            layers.append(Crop(dimensionality, size))
            receptive_field *= ReceptiveField(padding=-size)
        elif kind in ('conv1d', 'conv2d'):
            channels, remainder = layer_def[1].split('@')
            channels = int(channels)
            size, remainder = read_size(remainder)
            params = dict(stride=1, padding=0, dilation=1, groups=1)
            params.update(read_extra_sizes(
                remainder,
                dict(s='stride', p='padding', d='dilation', g='groups')))
            cls = {'conv1d': nn.Conv1d, 'conv2d': nn.Conv2d}[kind]
            layers.append(cls(input_channels, channels, size, **params))
            input_channels = channels
            # effective kernel size:
            size = (np.array(size) - 1) * params['dilation'] + 1
            receptive_field *= ReceptiveField(size, params['stride'],
                                              params['padding'])
        elif kind in ('pool1d', 'pool2d'):
            method, size = layer_def[1].split('@')
            size, remainder = read_size(size)
            params = dict(stride=None, padding=0, dilation=1)
            params.update(read_extra_sizes(
                remainder, dict(s='stride', p='padding', d='dilation')))
            cls = {'maxpool1d': nn.MaxPool1d, 'meanpool1d': nn.AvgPool1d,
                   'maxpool2d': nn.MaxPool2d,
                   'meanpool2d': nn.AvgPool2d}[method + kind]
            layers.append(cls(size, **params))
            # effective kernel size:
            size = (np.array(size) - 1) * params['dilation'] + 1
            if params['stride'] is None:
                params['stride'] = size
            receptive_field *= ReceptiveField(size, params['stride'],
                                              params['padding'])
        elif kind in ('globalpool1d', 'globalpool2d'):
            method = layer_def[1]
            cls = {'maxglobalpool1d': nn.AdaptiveMaxPool1d,
                   'meanglobalpool1d': nn.AdaptiveAvgPool1d,
                   'maxglobalpool2d': nn.AdaptiveMaxPool2d,
                   'meanglobalpool2d': nn.AdaptiveAvgPool2d}[method + kind]
            layers.append(cls(output_size=1))
            # we do not adjust the receptive field; it spans the whole input
        elif kind == 'globallmepool':
            alpha, remainder = read_size(layer_def[1], float)
            params = read_extra_sizes(
                remainder,
                dict(t='trainable', c='per_channel', e='exp'),
                t=lambda s: bool(int(s)))
            layers.append(SpatialLogMeanExp(alpha,
                                            in_channels=input_channels,
                                            keepdim=True, **params))
            # we do not adjust the receptive field; it spans the whole input
        elif kind == 'bn1d':
            if len(layers) >= 1 and hasattr(layers[-1], 'bias'):
                layers[-1].register_parameter('bias', None)
            elif len(layers) >= 2 and hasattr(layers[-2], 'bias'):
                layers[-2].register_parameter('bias', None)
            layers.append(nn.BatchNorm1d(input_channels))
        elif kind == 'bn2d':
            if len(layers) >= 1 and hasattr(layers[-1], 'bias'):
                layers[-1].register_parameter('bias', None)
            elif len(layers) >= 2 and hasattr(layers[-2], 'bias'):
                layers[-2].register_parameter('bias', None)
            layers.append(nn.BatchNorm2d(input_channels))
        elif kind == 'groupnorm':
            groups = int(layer_def[1])
            layers.append(nn.GroupNorm(groups, input_channels))
        elif kind == 'dropout':
            p = float(layer_def[1])
            layers.append(nn.Dropout(p))
        elif kind == 'squeeze':
            dim = int(layer_def[1])
            layers.append(Squeeze(dim))
        elif kind == 'shift':
            amount = float(layer_def[1])
            layers.append(Shift(amount))
        elif kind == 'bypass':
            layers.append(nn.Identity())
        elif kind == 'cat':
            stack.append((layers, input_channels, receptive_field))
            stack.append((Cat(), input_channels, receptive_field))
            layers = []
            receptive_field = ReceptiveField()
        elif kind == 'add':
            stack.append((layers, input_channels, receptive_field))
            stack.append((Add(), input_channels, receptive_field))
            layers = []
            receptive_field = ReceptiveField()
        elif kind == 'mul':
            stack.append((layers, input_channels, receptive_field))
            stack.append((Mul(), input_channels, receptive_field))
            layers = []
            receptive_field = ReceptiveField()
        elif kind == 'shake':
            stack.append((layers, input_channels, receptive_field))
            stack.append((ShakeShake(), input_channels, receptive_field))
            layers = []
            receptive_field = ReceptiveField()
        elif kind == '':
            pass
        elif kind == 'mbconv2d':
            # mobile inverted bottleneck convolution layer from MobileNetV2
            channels, remainder = layer_def[1].split('@')
            channels = int(channels)
            size, remainder = read_size(remainder)
            params = dict(stride=1, dilation=1, groups=1, expansion=1,
                          size=size, channels=channels)
            params.update(read_extra_sizes(
                remainder,
                dict(s="stride", d="dilation", g="groups", e="expansion")))
            hidden_channels = int(input_channels * params['expansion'])
            # define layers
            macro = []
            # 1x1 channel expansion
            if hidden_channels != input_channels:
                macro.append('conv2d:%d@1x1g%d' %
                             (hidden_channels, params['groups']))
                if batch_norm:
                    macro.append('bn2d')
                macro.append(default_nonlin)
            # channelwise convolution
            macro.append('conv2d:%d@%ss%sd%sg%d' %
                         (hidden_channels, size_string(size),
                          size_string(params['stride']),
                          size_string(params['dilation']),
                          hidden_channels))
            if batch_norm:
                macro.append('bn2d')
            macro.append(default_nonlin)
            # linear projection
            macro.append('conv2d:%d@1x1g%d' % (channels, params['groups']))
            # residual shortcut, if applicable
            macro = ','.join(macro)
            if params['stride'] == 1 and channels == input_channels:
                crop = ((np.array(size) - 1) * params['dilation'] + 1) // 2
                macro = 'add[%s|%s]' % ('crop2d:%d' % crop[0], macro)
            # push to beginning of remaining layer specifications
            specification[:0] = read_layers(macro)
        elif kind == 'bipol':
            layers.append(nonlinearity('bipol:' + layer_def[1]))
        else:
            try:
                layers.append(nonlinearity(kind))
            except KeyError:
                raise ValueError('Unknown layer type "%s"' % kind)
        if delim is not None and delim in '|]':
            if isinstance(layers, list):
                layers = (nn.Sequential(*layers)
                          if len(layers) > 1 else layers[0])
            layers.receptive_field = receptive_field
            layers.out_channels = input_channels
            # append layers to Cat() or Add()
            stack[-1][0].append(layers)
            if delim == '|':
                # reset input_channels to match input of Cat() or Add()
                input_channels = stack[-1][1]
                # we expect another set of layers
                layers = []
                receptive_field = ReceptiveField()
            elif delim == ']':
                # take the Cat() or Add() from the stack
                layers, _, receptive_field = stack.pop()
                # append it to what we were building before
                stack[-1][0].append(layers)
                # and continue there
                if isinstance(layers, Cat):
                    input_channels = sum(path.out_channels
                                         for path in layers)
                receptive_field *= sum(path.receptive_field
                                       for path in layers)
                layers, _, _ = stack.pop()
    if stack:
        raise ValueError('There seems to be a missing "]" bracket.')
    if output_name:
        layers.append(PutDictKey(output_name))
    if isinstance(layers, list):
        layers = nn.Sequential(*layers)
    layers.receptive_field = receptive_field
    layers.out_channels = input_channels
    return layers
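# Usage sketch (not from the original code): builds a small CNN from the
# specification mini-language above. Assumes the helpers custom_cnn relies
# on (PickDictKey, ReceptiveField, nonlinearity, ...) are importable; the
# spec string itself is illustrative.
net = custom_cnn(
    input_channels=1,
    specification='conv2d:64@3x3p1,bn2d,relu,pool2d:max@2x2,'
                  'conv2d:128@3x3p1,groupnorm:32,relu,globalpool2d:mean',
    input_name=None,    # feed plain tensors rather than a dict
    output_name=None)
print(net.out_channels)     # 128
print(net.receptive_field)  # tracked across the conv/pool layers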
def __init__(self, config):
    super(Segtran2d, self).__init__(config)
    self.config = config
    self.device = config.device
    self.trans_in_dim = config.trans_in_dim
    self.trans_out_dim = config.trans_out_dim
    self.num_translayers = config.num_translayers
    self.bb_feat_upsize = config.bb_feat_upsize
    self.G = config.G
    self.use_global_bias = config.use_global_bias
    if not self.use_global_bias:
        self.voxel_fusion = SegtranFusionEncoder(config, 'Fusion')
        self.vfeat_bias = None
        self.vfeat_bias_norm_layer = nn.Identity()
    else:
        self.vfeat_bias = Parameter(torch.randn(1, 1, self.trans_out_dim))
        self.vfeat_bias_norm_layer = nn.LayerNorm(
            self.trans_out_dim, elementwise_affine=True)

    self.backbone_type = config.backbone_type
    self.use_pretrained = config.use_pretrained
    self.pos_embed_every_layer = config.pos_embed_every_layer
    if self.backbone_type.startswith('resnet'):
        self.backbone = resnet.__dict__[self.backbone_type](
            pretrained=self.use_pretrained,
            do_pool1=not self.bb_feat_upsize)
        print("%s created" % self.backbone_type)
    elif self.backbone_type.startswith('resibn'):
        mat = re.search(r"resibn(\d+)", self.backbone_type)
        backbone_type = 'resnet{}_ibn_a'.format(mat.group(1))
        self.backbone = resnet_ibn.__dict__[backbone_type](
            pretrained=self.use_pretrained,
            do_pool1=not self.bb_feat_upsize)
        print("%s created" % backbone_type)
    elif self.backbone_type.startswith('eff'):
        backbone_type = self.backbone_type.replace("eff", "efficientnet")
        stem_stride = 1 if self.bb_feat_upsize else 2
        advprop = True
        if self.use_pretrained:
            self.backbone = EfficientNet.from_pretrained(
                backbone_type, advprop=advprop,
                ignore_missing_keys=True, stem_stride=stem_stride)
        else:
            self.backbone = EfficientNet.from_name(
                backbone_type, stem_stride=stem_stride)
        print("{} created (stem_stride={}, advprop={})".format(
            backbone_type, stem_stride, advprop))

    self.in_fpn_use_bn = config.in_fpn_use_bn
    self.in_fpn_layers = config.in_fpn_layers
    self.in_fpn_scheme = config.in_fpn_scheme

    # FPN output resolution is determined by the smallest number
    # (lowest layer).
    pool_stride = 2**np.min(self.in_fpn_layers)
    if not self.bb_feat_upsize:
        pool_stride *= 2
    self.mask_pool = nn.AvgPool2d((pool_stride, pool_stride))

    self.bb_feat_dims = config.bb_feat_dims
    self.in_fpn23_conv = nn.Conv2d(self.bb_feat_dims[2],
                                   self.bb_feat_dims[3], 1)
    self.in_fpn34_conv = nn.Conv2d(self.bb_feat_dims[3],
                                   self.bb_feat_dims[4], 1)
    # Default in_fpn_layers: 34. last_in_fpn_layer_idx: 4.
    last_in_fpn_layer_idx = self.in_fpn_layers[-1]
    if self.bb_feat_dims[last_in_fpn_layer_idx] != self.trans_in_dim:
        self.in_fpn_bridgeconv = nn.Conv2d(
            self.bb_feat_dims[last_in_fpn_layer_idx], self.trans_in_dim, 1)
    else:
        self.in_fpn_bridgeconv = nn.Identity()

    # in_bn4b/in_gn4b normalizes in_fpn43_conv(layer 4 features),
    # so the feature dim = dim of layer 3.
    # in_bn3b/in_gn3b normalizes in_fpn32_conv(layer 3 features),
    # so the feature dim = dim of layer 2.
    if self.in_fpn_use_bn:
        self.in_bn3b = nn.BatchNorm2d(self.bb_feat_dims[3])
        self.in_bn4b = nn.BatchNorm2d(self.bb_feat_dims[4])
        self.in_fpn_norms = [None, None, None, self.in_bn3b, self.in_bn4b]
    else:
        self.in_gn3b = nn.GroupNorm(self.G, self.bb_feat_dims[3])
        self.in_gn4b = nn.GroupNorm(self.G, self.bb_feat_dims[4])
        self.in_fpn_norms = [None, None, None, self.in_gn3b, self.in_gn4b]
    self.in_fpn_convs = [None, None, self.in_fpn23_conv, self.in_fpn34_conv]

    self.num_classes = config.num_classes
    self.num_modalities = config.num_modalities
    if self.num_modalities > 0:
        self.mod_fuse_conv = nn.Conv2d(self.num_modalities, 1, 1)

    self.out_fpn_use_bn = config.out_fpn_use_bn
    self.out_fpn_layers = config.out_fpn_layers
    self.out_fpn_scheme = config.out_fpn_scheme
    self.out_fpn_do_dropout = config.out_fpn_do_dropout
    self.posttrans_use_bn = config.posttrans_use_bn

    if self.out_fpn_layers != self.in_fpn_layers:
        self.do_out_fpn = True
        self.out_fpn12_conv = nn.Conv2d(self.bb_feat_dims[1],
                                        self.bb_feat_dims[2], 1)
        self.out_fpn23_conv = nn.Conv2d(self.bb_feat_dims[2],
                                        self.bb_feat_dims[3], 1)
        self.out_fpn34_conv = nn.Conv2d(self.bb_feat_dims[3],
                                        self.bb_feat_dims[4], 1)
        # Default in_fpn_layers: 34, out_fpn_layers: 1234.
        # last_out_fpn_layer_idx: 3.
        last_out_fpn_layer_idx = \
            self.out_fpn_layers[-len(self.in_fpn_layers)]
        if self.bb_feat_dims[last_out_fpn_layer_idx] != self.trans_out_dim:
            self.out_fpn_bridgeconv = nn.Conv2d(
                self.bb_feat_dims[last_out_fpn_layer_idx],
                self.trans_out_dim, 1)
        else:
            self.out_fpn_bridgeconv = nn.Identity()

        # out_bn3b/out_gn3b normalizes out_fpn23_conv(layer 3 features),
        # so the feature dim = dim of layer 2.
        # out_bn2b/out_gn2b normalizes out_fpn12_conv(layer 2 features),
        # so the feature dim = dim of layer 1.
        if self.out_fpn_use_bn:
            self.out_bn2b = nn.BatchNorm2d(self.bb_feat_dims[2])
            self.out_bn3b = nn.BatchNorm2d(self.bb_feat_dims[3])
            self.out_bn4b = nn.BatchNorm2d(self.bb_feat_dims[4])
            self.out_fpn_norms = [None, None, self.out_bn2b,
                                  self.out_bn3b, self.out_bn4b]
        else:
            self.out_gn2b = nn.GroupNorm(self.G, self.bb_feat_dims[2])
            self.out_gn3b = nn.GroupNorm(self.G, self.bb_feat_dims[3])
            self.out_gn4b = nn.GroupNorm(self.G, self.bb_feat_dims[4])
            self.out_fpn_norms = [None, None, self.out_gn2b,
                                  self.out_gn3b, self.out_gn4b]
        self.out_fpn_convs = [None, self.out_fpn12_conv,
                              self.out_fpn23_conv, self.out_fpn34_conv]
        self.out_conv = nn.Conv2d(self.trans_out_dim, self.num_classes, 1)
        self.out_fpn_dropout = nn.Dropout(config.hidden_dropout_prob)
    # out_fpn_layers = in_fpn_layers, no need to do fpn at the output end.
    # Output class scores directly.
    else:
        self.do_out_fpn = False
        if '2' in self.in_fpn_layers:
            # Output resolution is 1/4 of input already.
            # No need to do upsampling here.
            self.out_conv = nn.Conv2d(config.trans_out_dim,
                                      self.num_classes, 1)
        else:
            # Output resolution is 1/8 of input.
            # Do upsampling to make resolution x 2.
            self.out_conv = nn.ConvTranspose2d(config.trans_out_dim,
                                               self.num_classes, 2, 2)

    self.apply(self.init_weights)
    # tie_qk() has to be executed after weight initialization.
    self.apply(self.tie_qk)
    self.apply(self.add_identity_bias)
    # Initialize mod_fuse_conv weights and bias.
    # Set all modalities to have equal weights.
    if self.num_modalities > 0:
        self.mod_fuse_conv.weight.data.fill_(1 / self.num_modalities)
        self.mod_fuse_conv.bias.data.zero_()
    self.scales_printed = False
    self.translayer_dims = config.translayer_dims
    if not self.use_global_bias:
        self.num_vis_layers = 1 + 2 * self.num_translayers
    else:
        self.num_vis_layers = 1
def __init__(self, backbone, transformer, num_classes, num_queries,
             num_feature_levels, aux_loss=True, with_box_refine=False,
             two_stage=False):
    """ Initializes the model.
    Parameters:
        backbone: torch module of the backbone to be used. See backbone.py
        transformer: torch module of the transformer architecture.
            See transformer.py
        num_classes: number of object classes
        num_queries: number of object queries, ie detection slot. This is
            the maximal number of objects DETR can detect in a single
            image. For COCO, we recommend 100 queries.
        aux_loss: True if auxiliary decoding losses (loss at each decoder
            layer) are to be used.
        with_box_refine: iterative bounding box refinement
        two_stage: two-stage Deformable DETR
    """
    super().__init__()
    self.num_queries = num_queries
    self.transformer = transformer
    hidden_dim = transformer.d_model
    self.class_embed = nn.Linear(hidden_dim, num_classes)
    self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
    self.num_feature_levels = num_feature_levels
    if not two_stage:
        self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
    if num_feature_levels > 1:
        num_backbone_outs = len(backbone.strides)
        input_proj_list = []
        for _ in range(num_backbone_outs):
            in_channels = backbone.num_channels[_]
            input_proj_list.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
        for _ in range(num_feature_levels - num_backbone_outs):
            input_proj_list.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3,
                              stride=2, padding=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
            in_channels = hidden_dim
        self.input_proj = nn.ModuleList(input_proj_list)
    else:
        self.input_proj = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(backbone.num_channels[0], hidden_dim,
                          kernel_size=1),
                nn.GroupNorm(32, hidden_dim),
            )
        ])
    self.backbone = backbone
    self.aux_loss = aux_loss
    self.with_box_refine = with_box_refine
    self.two_stage = two_stage

    prior_prob = 0.01
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    self.class_embed.bias.data = torch.ones(num_classes) * bias_value
    nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
    nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
    for proj in self.input_proj:
        nn.init.xavier_uniform_(proj[0].weight, gain=1)
        nn.init.constant_(proj[0].bias, 0)

    # if two-stage, the last class_embed and bbox_embed is for
    # region proposal generation
    num_pred = (transformer.decoder.num_layers + 1) if two_stage \
        else transformer.decoder.num_layers
    if with_box_refine:
        self.class_embed = _get_clones(self.class_embed, num_pred)
        self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
        nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:],
                          -2.0)
        # hack implementation for iterative bounding box refinement
        self.transformer.decoder.bbox_embed = self.bbox_embed
    else:
        nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
        self.class_embed = nn.ModuleList(
            [self.class_embed for _ in range(num_pred)])
        self.bbox_embed = nn.ModuleList(
            [self.bbox_embed for _ in range(num_pred)])
        self.transformer.decoder.bbox_embed = None
    if two_stage:
        # hack implementation for two-stage
        self.transformer.decoder.class_embed = self.class_embed
        for box_embed in self.bbox_embed:
            nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
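# Side note (a minimal sketch, not from the original code): repeating the
# same module in an nn.ModuleList, as the no-refinement branch above does,
# shares one set of weights across all decoder layers, whereas _get_clones
# (presumably deepcopy-based) gives each layer its own weights.
import copy
import torch.nn as nn

embed = nn.Linear(256, 91)
shared = nn.ModuleList([embed for _ in range(6)])
cloned = nn.ModuleList([copy.deepcopy(embed) for _ in range(6)])
# nn.Module.parameters() deduplicates shared tensors:
print(sum(p.numel() for p in shared.parameters()))  # counted once
print(sum(p.numel() for p in cloned.parameters()))  # six times larger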
def __init__(self, input_dim, output_dim, kernel_size, stride, padding=0,
             norm='none', activation='relu', pad_type='zero'):
    super(Conv2dBlock, self).__init__()
    self.use_bias = True
    # initialize padding
    if pad_type == 'reflect':
        self.pad = nn.ReflectionPad2d(padding)
    elif pad_type == 'replicate':
        self.pad = nn.ReplicationPad2d(padding)
    elif pad_type == 'zero':
        self.pad = nn.ZeroPad2d(padding)
    else:
        assert 0, "Unsupported padding type: {}".format(pad_type)

    # initialize normalization
    norm_dim = output_dim
    if norm == 'bn':
        self.norm = nn.BatchNorm2d(norm_dim)
    elif norm == 'gn':
        self.norm = nn.GroupNorm(2, norm_dim)
    elif norm == 'in':
        # self.norm = nn.InstanceNorm2d(norm_dim, track_running_stats=True)
        self.norm = nn.InstanceNorm2d(norm_dim)
    elif norm == 'ln':
        self.norm = LayerNorm(norm_dim)
    elif norm == 'adain':
        self.norm = AdaptiveInstanceNorm2d(norm_dim)
    elif norm == 'none':
        self.norm = None
    else:
        assert 0, "Unsupported normalization: {}".format(norm)

    # initialize activation
    if activation == 'relu':
        self.activation = nn.ReLU(inplace=True)
    elif activation == 'lrelu':
        self.activation = nn.LeakyReLU(0.2, inplace=True)
    elif activation == 'prelu':
        self.activation = nn.PReLU()
    elif activation == 'selu':
        self.activation = nn.SELU(inplace=True)
    elif activation == 'tanh':
        self.activation = nn.Tanh()
    elif activation == 'none':
        self.activation = None
    else:
        assert 0, "Unsupported activation: {}".format(activation)

    # initialize convolution
    self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride,
                          bias=self.use_bias)
def _get_conv_bn_layer(
    self,
    in_channels,
    out_channels,
    kernel_size=11,
    stride=1,
    dilation=1,
    padding=0,
    bias=False,
    groups=1,
    heads=-1,
    separable=False,
    normalization="batch",
    norm_groups=1,
):
    if norm_groups == -1:
        norm_groups = out_channels

    if separable:
        layers = [
            self._get_conv(
                in_channels,
                in_channels,
                kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding,
                bias=bias,
                groups=in_channels,
                heads=heads,
            ),
            self._get_conv(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=1,
                dilation=1,
                padding=0,
                bias=bias,
                groups=groups,
            ),
        ]
    else:
        layers = [
            self._get_conv(
                in_channels,
                out_channels,
                kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding,
                bias=bias,
                groups=groups,
            )
        ]

    if normalization == "group":
        layers.append(nn.GroupNorm(num_groups=norm_groups,
                                   num_channels=out_channels))
    elif normalization == "instance":
        layers.append(nn.GroupNorm(num_groups=out_channels,
                                   num_channels=out_channels))
    elif normalization == "layer":
        layers.append(nn.GroupNorm(num_groups=1,
                                   num_channels=out_channels))
    elif normalization == "batch":
        layers.append(nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1))
    else:
        raise ValueError(
            f"Normalization method ({normalization}) does not match"
            f" one of [batch, layer, group, instance].")

    if groups > 1:
        layers.append(GroupShuffle(groups, out_channels))
    return layers
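# Sanity-check sketch (not from the original code): the "instance" and
# "layer" branches above rely on GroupNorm special cases. With
# num_groups=1, GroupNorm normalizes over all channels (layer-norm style);
# with num_groups == num_channels it normalizes each channel separately
# (instance-norm style), matching nn.InstanceNorm1d at initialization.
import torch
import torch.nn as nn

x = torch.randn(4, 8, 50)  # (batch, channels, time)
gn_as_instance = nn.GroupNorm(num_groups=8, num_channels=8)
ref = nn.InstanceNorm1d(8, affine=True)
print(torch.allclose(gn_as_instance(x), ref(x), atol=1e-5))  # True at init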
def __init__(self, layers, num_groups=32):
    super().__init__(layers,
                     norm_layer=lambda x: nn.GroupNorm(num_groups, x))
def groupNorm(num_channels, eps=1e-5, momentum=0.1, affine=True):
    # momentum is accepted for BatchNorm signature compatibility but is
    # unused: GroupNorm keeps no running statistics.
    return nn.GroupNorm(min(32, num_channels), num_channels, eps=eps,
                        affine=affine)
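# Hypothetical drop-in use (not from the original code): helpers with this
# BatchNorm2d-compatible signature can be passed wherever a norm_layer
# callable is expected, e.g. torchvision-style ResNets.
from torchvision.models import resnet50

model = resnet50(norm_layer=groupNorm)  # every BN slot becomes GroupNorm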
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    """
    SOD Instance Head.
    """
    super().__init__()
    # fmt: off
    self.num_classes = cfg.MODEL.SOD.NUM_CLASSES  # without background.
    self.num_kernels = cfg.MODEL.SOD.NUM_KERNELS
    self.instance_in_features = cfg.MODEL.SOD.INSTANCE_IN_FEATURES
    self.num_in_channels = cfg.MODEL.SOD.INSTANCE_IN_CHANNELS  # = fpn.
    self.num_channels = cfg.MODEL.SOD.INSTANCE_CHANNELS
    self.num_grids = cfg.MODEL.SOD.NUM_GRIDS
    self.strides = cfg.MODEL.SOD.FPN_INSTANCE_STRIDES
    self.fc_dim = cfg.MODEL.SOD.FC_DIM
    self.with_coord = cfg.MODEL.SOD.WITH_COORD
    self.type_att = cfg.MODEL.SOD.TYPE_ATTENTION
    self.norm = cfg.MODEL.SOD.NORM
    self.center_symmetry = cfg.MODEL.SOD.CENTER_SYMMETRY
    self.pe_on = cfg.MODEL.SOD.PE_ON  # use positional encoding or not.
    self.use_base = cfg.MODEL.SOD.USE_BASE  # use dense2sparse or not.
    self.num_conv_before = cfg.MODEL.SOD.NUM_INSTANCE_CONVS_BEFORE
    self.num_conv_after = cfg.MODEL.SOD.NUM_INSTANCE_CONVS_AFTER
    self.rescale_first = cfg.MODEL.SOD.RESCALE_FIRST
    self.max_pool = cfg.MODEL.SOD.MAX_POOL
    # Convolutions to use in the towers
    self.num_levels = len(self.instance_in_features)
    assert self.num_levels == len(self.strides), \
        "Strides should match the features."
    assert len(set(self.num_grids)) == 1, \
        "The grid among different stages should be the same."
    # fmt: on

    if self.pe_on:
        num_ins = torch.tensor(self.num_grids).pow(2).sum()
        self.ins_embed = nn.Embedding(num_ins, self.num_in_channels)

    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, \
        "Each level must have the same channel!"
    in_channels = in_channels[0]
    assert in_channels == self.num_in_channels, \
        "In channels should equal tower in channels!"

    head_configs = {
        "ins_before": (self.num_conv_before, self.with_coord, False),
        # leave for DCN.
        "ins_after": (self.num_conv_after, self.with_coord, False)
    }
    # shared conv.
    for head in head_configs:
        tower = []
        num_convs, use_coord, use_deformable = head_configs[head]
        for i in range(num_convs):
            # with coord or not.
            if i == 0:
                if use_coord:
                    chn = self.num_in_channels + 2
                else:
                    chn = self.num_in_channels
            else:
                chn = self.num_channels
            # use deformable conv or not.
            if use_deformable and i == num_convs - 1:
                raise NotImplementedError
            else:
                conv_func = nn.Conv2d
            tower.append(
                conv_func(chn, self.num_channels,
                          kernel_size=3, stride=1,
                          padding=1, bias=self.norm is None))
            if self.norm == "GN":
                tower.append(nn.GroupNorm(32, self.num_channels))
            tower.append(nn.ReLU(inplace=True))
        self.add_module('{}_tower'.format(head), nn.Sequential(*tower))

    # att conv.
    if self.use_base:
        self.base_att = nn.Conv2d(self.num_channels, self.num_kernels,
                                  kernel_size=1, stride=1, padding=0)
        self.ins_att = nn.Conv2d(self.num_channels, self.num_kernels,
                                 kernel_size=1, stride=1, padding=0)

    # individual fc.
    cls_tower = []
    bbox_tower = []
    self._output_size = self.num_channels
    for k, fc_dim in enumerate(self.fc_dim):
        cls_tower.append(nn.Linear(self._output_size, fc_dim))
        cls_tower.append(nn.ReLU(inplace=True))
        bbox_tower.append(nn.Linear(self._output_size, fc_dim))
        bbox_tower.append(nn.ReLU(inplace=True))
        self._output_size = fc_dim
    self.add_module('cls_tower', nn.Sequential(*cls_tower))
    self.add_module('bbox_tower', nn.Sequential(*bbox_tower))

    # pred layer.
    self.cls_pred = nn.Linear(self._output_size, self.num_classes + 1)
    self.bbox_pred = nn.Linear(self._output_size, 4)

    # init.
    conv_modules = [self.ins_before_tower, self.ins_after_tower]
    if self.use_base:
        conv_modules += [self.base_att, self.ins_att]
    for modules in conv_modules:
        for l in modules.modules():
            if isinstance(l, nn.Conv2d):
                nn.init.normal_(l.weight, std=0.01)
                if l.bias is not None:
                    nn.init.constant_(l.bias, 0)
    for modules in [self.cls_tower, self.bbox_tower]:
        for l in modules.modules():
            if isinstance(l, nn.Linear):
                weight_init.c2_xavier_fill(l)
    nn.init.normal_(self.cls_pred.weight, std=0.01)
    nn.init.normal_(self.bbox_pred.weight, std=0.001)
    for l in [self.cls_pred, self.bbox_pred]:
        if l.bias is not None:
            nn.init.constant_(l.bias, 0)
    # initialize the bias for scale.
    prior_prob = cfg.MODEL.SOD.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    nn.init.constant_(self.bbox_pred.bias[2:], bias_value)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    """
    Arguments:
        in_channels (int): number of channels of the input feature
    """
    super(FCOSRepPointsHead, self).__init__()
    # TODO: Implement the sigmoid version first.
    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.FCOS.NUM_CLASSES
    self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
    self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS
    self.centerness_on_reg = cfg.MODEL.FCOS.CENTERNESS_ON_REG
    self.use_dcn_in_tower = cfg.MODEL.FCOS.USE_DCN_IN_TOWER
    self.use_dcn_v2 = cfg.MODEL.FCOS.USE_DCN_V2
    # fmt: on

    cls_tower = []
    bbox_tower = []
    for i in range(cfg.MODEL.FCOS.NUM_CONVS):
        use_dcn = False
        use_v2 = True
        if self.use_dcn_in_tower and i == cfg.MODEL.FCOS.NUM_CONVS - 1:
            conv_func = DFConv2d
            bias = False
            use_dcn = True
            if not self.use_dcn_v2:
                use_v2 = False
        else:
            conv_func = nn.Conv2d
            bias = True
        if use_dcn and not use_v2:
            cls_tower.append(
                conv_func(in_channels, in_channels,
                          with_modulated_dcn=False, kernel_size=3,
                          stride=1, padding=1, bias=bias))
        else:
            cls_tower.append(
                conv_func(in_channels, in_channels, kernel_size=3,
                          stride=1, padding=1, bias=bias))
        cls_tower.append(nn.GroupNorm(32, in_channels))
        cls_tower.append(nn.ReLU())
        if use_dcn and not use_v2:
            bbox_tower.append(
                conv_func(in_channels, in_channels,
                          with_modulated_dcn=False, kernel_size=3,
                          stride=1, padding=1, bias=bias))
        else:
            bbox_tower.append(
                conv_func(in_channels, in_channels, kernel_size=3,
                          stride=1, padding=1, bias=bias))
        bbox_tower.append(nn.GroupNorm(32, in_channels))
        bbox_tower.append(nn.ReLU())

    self.add_module('cls_tower', nn.Sequential(*cls_tower))
    self.add_module('bbox_tower', nn.Sequential(*bbox_tower))

    # rep part
    self.point_feat_channels = in_channels
    self.num_points = 9
    self.dcn_kernel = int(np.sqrt(self.num_points))
    self.dcn_pad = int((self.dcn_kernel - 1) / 2)
    self.cls_out_channels = num_classes
    self.gradient_mul = 0.1
    dcn_base = np.arange(-self.dcn_pad,
                         self.dcn_pad + 1).astype(np.float64)
    dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
    dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
    dcn_base_offset = np.stack([dcn_base_y, dcn_base_x],
                               axis=1).reshape((-1))
    dcn_base_offset = torch.tensor(dcn_base_offset,
                                   dtype=torch.float32).view(1, -1, 1, 1)
    self.register_buffer("dcn_base_offset", dcn_base_offset)
    self.deform_cls_conv = DeformConv(self.point_feat_channels,
                                      self.point_feat_channels,
                                      self.dcn_kernel, 1, self.dcn_pad)
    self.deform_reg_conv = DeformConv(self.point_feat_channels,
                                      self.point_feat_channels,
                                      self.dcn_kernel, 1, self.dcn_pad)
    points_out_dim = 2 * self.num_points
    self.offsets_init = nn.Sequential(
        nn.Conv2d(self.point_feat_channels, self.point_feat_channels,
                  3, 1, 1),
        nn.ReLU(inplace=True),
        nn.Conv2d(self.point_feat_channels, points_out_dim, 1, 1, 0))
    self.offsets_refine = nn.Sequential(
        nn.ReLU(),
        nn.Conv2d(self.point_feat_channels, points_out_dim, 1, 1, 0))
    self.logits = nn.Sequential(
        nn.ReLU(),
        nn.Conv2d(self.point_feat_channels, self.cls_out_channels,
                  1, 1, 0))
    # self.cls_logits = nn.Conv2d(in_channels, num_classes, kernel_size=3,
    #                             stride=1, padding=1)
    # self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1,
    #                            padding=1)
    self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1,
                                padding=1)

    # initialization
    for modules in [
            self.cls_tower, self.bbox_tower,
            # self.cls_logits, self.bbox_pred,
            self.offsets_init, self.offsets_refine,
            self.deform_cls_conv, self.deform_reg_conv,
            self.centerness
    ]:
        for l in modules.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.normal_(l.weight, std=0.01)
                torch.nn.init.constant_(l.bias, 0)

    # initialize the bias for focal loss
    prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    # torch.nn.init.constant_(self.cls_logits.bias, bias_value)
    for module in self.logits.modules():
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.constant_(module.bias, bias_value)

    self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)])
def gnnorm2d(num_channels, num_groups=32):
    if num_groups > 0:
        return nn.GroupNorm(num_groups, num_channels)
    else:
        return nn.BatchNorm2d(num_channels)
def get_gn(num_channels):
    return nn.GroupNorm(32, num_channels)
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
             padding=0, dilation=1, groups=1, bias=True,
             padding_mode='zeros', norm='BN', groups_size=16,
             conv_last=False):
    super(ConvNorm, self).__init__()
    if norm not in [None, 'BN', 'IN', 'GN', 'LN', 'WN', 'SN', 'MSN',
                    'MSNTReLU', 'WNTReLU']:
        raise ValueError(
            "Undefined norm value. Must be one of "
            "[None, 'BN', 'IN', 'GN', 'LN', 'WN', 'SN', 'MSN', "
            "'MSNTReLU', 'WNTReLU']")
    layers = []
    if norm in ['MSN', 'MSNTReLU']:
        conv2d = MeanSpectralNormConv2d(in_channels, out_channels,
                                        kernel_size, stride, padding,
                                        dilation, groups, bias,
                                        padding_mode)
        layers += [conv2d]
    elif norm == 'SN':
        conv2d = SpectralNormConv2d(in_channels, out_channels, kernel_size,
                                    stride, padding, dilation, groups,
                                    bias, padding_mode)
        layers += [conv2d]
    elif norm in ['WN', 'WNTReLU']:
        conv2d = MeanWeightNormConv2d(in_channels, out_channels,
                                      kernel_size, stride, padding,
                                      dilation, groups, bias, padding_mode)
        layers += [conv2d]
    elif norm == 'IN':
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                           padding, dilation, groups, bias, padding_mode)
        layers += [conv2d, nn.InstanceNorm2d(out_channels)]
    elif norm == 'GN':
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                           padding, dilation, groups, bias, padding_mode)
        layers += [conv2d, nn.GroupNorm(groups_size, out_channels)]
    elif norm == 'LN':
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                           padding, dilation, groups, bias, padding_mode)
        layers += [conv2d, nn.LayerNorm(out_channels)]
    elif norm == 'BN':
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                           padding, dilation, groups, bias, padding_mode)
        layers += [conv2d, nn.BatchNorm2d(out_channels)]
    else:
        conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                           padding, dilation, groups, bias, padding_mode)
        layers += [conv2d]

    # conv_last is a flag to change the order of operations from
    # Conv2D + BN to BN + Conv2D. This is frequently used in DenseNet &
    # ResNet architectures. To change the order, we simply rotate the
    # array by 1 to the left and re-create the norm layer with
    # num_features set to the in_channels size.
    if conv_last and norm not in [None, 'MSN', 'SN', 'WN', 'WNTReLU',
                                  'MSNTReLU']:
        layers = layers[1:] + layers[:1]
        # Reinitialize the norm layer or its variants; GroupNorm also
        # needs its group count as the first argument.
        if isinstance(layers[0], nn.GroupNorm):
            layers[0].__init__(groups_size, in_channels)
        else:
            layers[0].__init__(in_channels)
    self.layers = nn.Sequential(*layers)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()
    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.FCOS.NUM_CLASSES
    num_convs = cfg.MODEL.FCOS.NUM_CONVS
    prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
    self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
    self.centerness_on_reg = cfg.MODEL.FCOS.CENTERNESS_ON_REG
    self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS
    # fmt: on

    cls_subnet = []
    bbox_subnet = []
    for _ in range(num_convs):
        cls_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1))
        cls_subnet.append(nn.GroupNorm(32, in_channels))
        cls_subnet.append(nn.ReLU())
        bbox_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1))
        bbox_subnet.append(nn.GroupNorm(32, in_channels))
        bbox_subnet.append(nn.ReLU())

    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)

    self.cls_score = nn.Conv2d(in_channels, num_classes, kernel_size=3,
                               stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1,
                               padding=1)
    self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1,
                                padding=1)

    self.add_module("border_cls_subnet", BorderBranch(in_channels, 256))
    self.add_module("border_bbox_subnet", BorderBranch(in_channels, 128))

    self.border_cls_score = nn.Conv2d(in_channels, num_classes,
                                      kernel_size=1, stride=1)
    self.border_bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=1,
                                      stride=1)

    # Initialization
    for modules in [
            self.cls_subnet, self.bbox_subnet, self.cls_score,
            self.bbox_pred, self.centerness, self.border_cls_subnet,
            self.border_bbox_subnet, self.border_cls_score,
            self.border_bbox_pred
    ]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)
            if isinstance(layer, nn.GroupNorm):
                torch.nn.init.constant_(layer.weight, 1)
                torch.nn.init.constant_(layer.bias, 0)

    # Use prior in model initialization to improve stability
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_score.bias, bias_value)
    torch.nn.init.constant_(self.border_cls_score.bias, bias_value)

    self.scales = nn.ModuleList(
        [Scale(init_value=1.0) for _ in range(len(self.fpn_strides))])
def __init__(self, cfg, in_channels):
    """
    Arguments:
        in_channels (int): number of channels of the input feature
    """
    super(FCOSHead, self).__init__()
    # TODO: Implement the sigmoid version first.
    num_classes = cfg.MODEL.NUM_CLASSES - 1
    self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
    self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS
    self.centerness_on_reg = cfg.MODEL.FCOS.CENTERNESS_ON_REG
    self.use_dcn_in_tower = cfg.MODEL.FCOS.USE_DCN_IN_TOWER

    cls_tower = []
    bbox_tower = []
    for i in range(cfg.MODEL.FCOS.NUM_CONVS):
        if self.use_dcn_in_tower and \
                i == cfg.MODEL.FCOS.NUM_CONVS - 1:
            conv_func = DFConv2d
        else:
            conv_func = nn.Conv2d
        cls_tower.append(
            conv_func(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, bias=True))
        cls_tower.append(nn.GroupNorm(32, in_channels))
        cls_tower.append(nn.ReLU(True))
        bbox_tower.append(
            conv_func(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, bias=True))
        bbox_tower.append(nn.GroupNorm(32, in_channels))
        bbox_tower.append(nn.ReLU(True))

    self.add_module('cls_tower', nn.Sequential(*cls_tower))
    self.add_module('bbox_tower', nn.Sequential(*bbox_tower))
    self.cls_logits = nn.Conv2d(in_channels, num_classes, kernel_size=3,
                                stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1,
                               padding=1)
    self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1,
                                padding=1)

    # initialization
    for modules in [self.cls_tower, self.bbox_tower, self.cls_logits,
                    self.bbox_pred, self.centerness]:
        for l in modules.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.normal_(l.weight, std=0.01)
                torch.nn.init.constant_(l.bias, 0)

    # initialize the bias for focal loss
    prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_logits.bias, bias_value)

    self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(4)])
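# Worked check (a sketch, not from the original code): the focal-loss bias
# initialization used by the heads above solves sigmoid(b) = prior_prob
# for b, so every location starts out predicting ~1% foreground and easy
# negatives do not swamp the loss early in training.
import math

prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
print(bias_value)                       # ~ -4.595
print(1 / (1 + math.exp(-bias_value)))  # ~ 0.01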
def norm(dim):
    return nn.GroupNorm(min(32, dim), dim)
def __init__(self, cfg, in_channels):
    super(ATSSHead, self).__init__()
    self.cfg = cfg
    num_classes = cfg.MODEL.ATSS.NUM_CLASSES
    num_anchors = len(cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS[0]) * len(
        cfg.MODEL.ANCHOR_GENERATOR.SIZES[0])

    head_configs = {
        "cls": (cfg.MODEL.ATSS.NUM_CONVS, False),
        "bbox": (cfg.MODEL.ATSS.NUM_CONVS,
                 cfg.MODEL.ATSS.USE_DCN_IN_TOWER),
    }
    norm = None if cfg.MODEL.ATSS.NORM == "none" else cfg.MODEL.ATSS.NORM

    for head in head_configs:
        tower = []
        num_convs, use_deformable = head_configs[head]
        if use_deformable:
            conv_func = DFConv2d
        else:
            conv_func = nn.Conv2d
        for i in range(num_convs):
            tower.append(
                conv_func(in_channels, in_channels, kernel_size=3,
                          stride=1, padding=1, bias=True))
            if norm == "GN":
                tower.append(nn.GroupNorm(32, in_channels))
            elif norm is not None:
                tower.append(get_norm(norm, in_channels))
            tower.append(nn.ReLU())
        self.add_module('{}_tower'.format(head), nn.Sequential(*tower))

    self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes,
                                kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4,
                               kernel_size=3, stride=1, padding=1)

    # initialization
    for modules in [self.cls_tower, self.bbox_tower, self.cls_logits,
                    self.bbox_pred]:
        for l in modules.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.normal_(l.weight, std=0.01)
                torch.nn.init.constant_(l.bias, 0)

    # initialize the bias for focal loss
    prior_prob = cfg.MODEL.ATSS.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_logits.bias, bias_value)

    if self.cfg.MODEL.ATSS.REGRESSION_TYPE == 'POINT':
        assert num_anchors == 1, \
            "regressing from a point only support num_anchors == 1"
        torch.nn.init.constant_(self.bbox_pred.bias, 4)

    self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)])
def BatchNorm2d(num_features):
    return nn.GroupNorm(num_channels=num_features, num_groups=32)
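# Usage sketch (not from the original code): aliasing GroupNorm under the
# BatchNorm2d name lets model code written against the
# BatchNorm2d(num_features) signature switch normalization without edits,
# which is handy when batches are too small for reliable BN statistics.
norm = BatchNorm2d(256)  # actually nn.GroupNorm(32, 256)
assert isinstance(norm, nn.GroupNorm)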
def create_conv(in_channels, out_channels, kernel_size, order, num_groups,
                padding=1):
    """
    Create a list of modules which together constitute a single conv layer
    with non-linearity and optional batchnorm/groupnorm.

    Args:
        in_channels (int): number of input channels
        out_channels (int): number of output channels
        order (string): order of things, e.g.
            'cr' -> conv + ReLU
            'crg' -> conv + ReLU + groupnorm
            'cl' -> conv + LeakyReLU
            'ce' -> conv + ELU
        num_groups (int): number of groups for the GroupNorm
        padding (int): add zero-padding to the input

    Return:
        list of tuple (name, module)
    """
    assert 'c' in order, "Conv layer MUST be present"
    assert order[0] not in 'rle', \
        'Non-linearity cannot be the first operation in the layer'

    modules = []
    for i, char in enumerate(order):
        if char == 'r':
            modules.append(('ReLU', nn.ReLU(inplace=True)))
        elif char == 'l':
            modules.append(
                ('LeakyReLU',
                 nn.LeakyReLU(negative_slope=0.1, inplace=True)))
        elif char == 'e':
            modules.append(('ELU', nn.ELU(inplace=True)))
        elif char == 'c':
            # add learnable bias only in the absence of batchnorm/groupnorm
            bias = not ('g' in order or 'b' in order)
            modules.append(('conv',
                            conv3d(in_channels, out_channels, kernel_size,
                                   bias, padding=padding)))
        elif char == 'g':
            is_before_conv = i < order.index('c')
            assert not is_before_conv, 'GroupNorm MUST go after the Conv3d'
            # number of groups must be less or equal the number of channels
            if out_channels < num_groups:
                num_groups = out_channels
            modules.append(('groupnorm',
                            nn.GroupNorm(num_groups=num_groups,
                                         num_channels=out_channels)))
        elif char == 'b':
            is_before_conv = i < order.index('c')
            if is_before_conv:
                modules.append(('batchnorm', nn.BatchNorm3d(in_channels)))
            else:
                modules.append(('batchnorm', nn.BatchNorm3d(out_channels)))
        else:
            raise ValueError(
                f"Unsupported layer type '{char}'. MUST be one of "
                f"['b', 'g', 'r', 'l', 'e', 'c']")

    return modules
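# Usage sketch (not from the original code): assembling the (name, module)
# tuples returned by create_conv into an nn.Sequential. With order='crg'
# the conv gets no bias, since a GroupNorm follows it.
from collections import OrderedDict
import torch.nn as nn

modules = create_conv(in_channels=32, out_channels=64, kernel_size=3,
                      order='crg', num_groups=8, padding=1)
layer = nn.Sequential(OrderedDict(modules))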
def __init__(self, conv_body_func, fpn_level_info, P2only=False):
    super().__init__()
    self.fpn_level_info = fpn_level_info
    self.P2only = P2only

    self.dim_out = fpn_dim = cfg.FPN.DIM
    min_level, max_level = get_min_max_levels()
    self.num_backbone_stages = len(fpn_level_info.blobs) - (
        min_level - LOWEST_BACKBONE_LVL)
    fpn_dim_lateral = fpn_level_info.dims
    self.spatial_scale = []  # a list of scales for FPN outputs

    #
    # Step 1: recursively build down starting from the coarsest backbone
    # level
    #
    # For the coarsest backbone level: 1x1 conv only seeds recursion
    if cfg.FPN.USE_GN:
        self.conv_top = nn.Sequential(
            nn.Conv2d(fpn_dim_lateral[0], fpn_dim, 1, 1, 0, bias=False),
            nn.GroupNorm(net_utils.get_group_gn(fpn_dim), fpn_dim,
                         eps=cfg.GROUP_NORM.EPSILON))
    else:
        self.conv_top = nn.Conv2d(fpn_dim_lateral[0], fpn_dim, 1, 1, 0)
    self.topdown_lateral_modules = nn.ModuleList()
    self.posthoc_modules = nn.ModuleList()

    # For other levels add top-down and lateral connections
    for i in range(self.num_backbone_stages - 1):
        self.topdown_lateral_modules.append(
            topdown_lateral_module(fpn_dim, fpn_dim_lateral[i + 1]))

    # Post-hoc scale-specific 3x3 convs
    for i in range(self.num_backbone_stages):
        if cfg.FPN.USE_GN:
            self.posthoc_modules.append(
                nn.Sequential(
                    nn.Conv2d(fpn_dim, fpn_dim, 3, 1, 1, bias=False),
                    nn.GroupNorm(net_utils.get_group_gn(fpn_dim), fpn_dim,
                                 eps=cfg.GROUP_NORM.EPSILON)))
        else:
            self.posthoc_modules.append(
                nn.Conv2d(fpn_dim, fpn_dim, 3, 1, 1))
        self.spatial_scale.append(fpn_level_info.spatial_scales[i])

    #
    # Step 2: build up starting from the coarsest backbone level
    #
    # Check if we need the P6 feature map
    if not cfg.FPN.EXTRA_CONV_LEVELS and \
            max_level == HIGHEST_BACKBONE_LVL + 1:
        # Original FPN P6 level implementation from our CVPR'17 FPN paper
        # Use max pooling to simulate stride 2 subsampling
        self.maxpool_p6 = nn.MaxPool2d(kernel_size=1, stride=2, padding=0)
        self.spatial_scale.insert(0, self.spatial_scale[0] * 0.5)

    # Coarser FPN levels introduced for RetinaNet
    if cfg.FPN.EXTRA_CONV_LEVELS and max_level > HIGHEST_BACKBONE_LVL:
        self.extra_pyramid_modules = nn.ModuleList()
        dim_in = fpn_level_info.dims[0]
        for i in range(HIGHEST_BACKBONE_LVL + 1, max_level + 1):
            self.extra_pyramid_modules.append(
                nn.Conv2d(dim_in, fpn_dim, 3, 2, 1))
            dim_in = fpn_dim
            self.spatial_scale.insert(0, self.spatial_scale[0] * 0.5)

    if self.P2only:
        # use only the finest level
        self.spatial_scale = self.spatial_scale[-1]

    self._init_weights()

    # Deliberately add conv_body after _init_weights.
    # conv_body has its own _init_weights function
    self.conv_body = conv_body_func()  # e.g. resnet
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    """
    Arguments:
        in_channels (int): number of channels of the input feature
    """
    super().__init__()
    # TODO: Implement the sigmoid version first.
    self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
    self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
    head_configs = {
        "cls": (cfg.MODEL.FCOS.NUM_CLS_CONVS,
                cfg.MODEL.FCOS.USE_DEFORMABLE),
        "bbox": (cfg.MODEL.FCOS.NUM_BOX_CONVS,
                 cfg.MODEL.FCOS.USE_DEFORMABLE),
        "share": (cfg.MODEL.FCOS.NUM_SHARE_CONVS, False)
    }
    norm = None if cfg.MODEL.FCOS.NORM == "none" else cfg.MODEL.FCOS.NORM
    self.num_levels = len(input_shape)

    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, \
        "Each level must have the same channel!"
    in_channels = in_channels[0]

    for head in head_configs:
        tower = []
        num_convs, use_deformable = head_configs[head]
        for i in range(num_convs):
            if use_deformable and i == num_convs - 1:
                conv_func = DFConv2d
            else:
                conv_func = nn.Conv2d
            tower.append(
                conv_func(in_channels, in_channels, kernel_size=3,
                          stride=1, padding=1, bias=True))
            if norm == "GN":
                tower.append(nn.GroupNorm(32, in_channels))
            elif norm == "NaiveGN":
                tower.append(NaiveGroupNorm(32, in_channels))
            elif norm == "BN":
                tower.append(
                    ModuleListDial([nn.BatchNorm2d(in_channels)
                                    for _ in range(self.num_levels)]))
            elif norm == "SyncBN":
                tower.append(
                    ModuleListDial([NaiveSyncBatchNorm(in_channels)
                                    for _ in range(self.num_levels)]))
            tower.append(nn.ReLU())
        self.add_module('{}_tower'.format(head), nn.Sequential(*tower))

    self.cls_logits = nn.Conv2d(in_channels, self.num_classes,
                                kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1,
                               padding=1)
    self.ctrness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1,
                             padding=1)

    if cfg.MODEL.FCOS.USE_SCALE:
        self.scales = nn.ModuleList(
            [Scale(init_value=1.0) for _ in range(self.num_levels)])
    else:
        self.scales = None

    for modules in [self.cls_tower, self.bbox_tower, self.share_tower,
                    self.cls_logits, self.bbox_pred, self.ctrness]:
        for l in modules.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.normal_(l.weight, std=0.01)
                torch.nn.init.constant_(l.bias, 0)

    # initialize the bias for focal loss
    prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_logits.bias, bias_value)
def __init__(self, cin, cout, zdim=128, nf=64):
    super(ConfNet, self).__init__()
    ## downsampling
    network = [
        nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 64x64 -> 32x32
        nn.GroupNorm(16, nf),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf, nf * 2, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 32x32 -> 16x16
        nn.GroupNorm(16 * 2, nf * 2),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf * 2, nf * 4, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 16x16 -> 8x8
        nn.GroupNorm(16 * 4, nf * 4),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf * 4, nf * 8, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 8x8 -> 4x4
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf * 8, zdim, kernel_size=4, stride=1, padding=0,
                  bias=False),  # 4x4 -> 1x1
        nn.ReLU(inplace=True)
    ]
    ## upsampling
    network += [
        nn.ConvTranspose2d(zdim, nf * 8, kernel_size=4, padding=0,
                           bias=False),  # 1x1 -> 4x4
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 8, nf * 4, kernel_size=4, stride=2,
                           padding=1, bias=False),  # 4x4 -> 8x8
        nn.GroupNorm(16 * 4, nf * 4),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 4, nf * 2, kernel_size=4, stride=2,
                           padding=1, bias=False),  # 8x8 -> 16x16
        nn.GroupNorm(16 * 2, nf * 2),
        nn.ReLU(inplace=True)
    ]
    self.network = nn.Sequential(*network)

    out_net1 = [
        nn.ConvTranspose2d(nf * 2, nf, kernel_size=4, stride=2, padding=1,
                           bias=False),  # 16x16 -> 32x32
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf, nf, kernel_size=4, stride=2, padding=1,
                           bias=False),  # 32x32 -> 64x64
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, 2, kernel_size=5, stride=1, padding=2,
                  bias=False),  # 64x64
        nn.Softplus()
    ]
    self.out_net1 = nn.Sequential(*out_net1)

    out_net2 = [
        nn.Conv2d(nf * 2, 2, kernel_size=3, stride=1, padding=1,
                  bias=False),  # 16x16
        nn.Softplus()
    ]
    self.out_net2 = nn.Sequential(*out_net2)
def gn_helper(planes):
    return nn.GroupNorm(8, planes)
def __init__(self, cin, cout, zdim=128, nf=64, activation=nn.Tanh):
    super(EDDeconv, self).__init__()
    ## downsampling
    network = [
        nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 64x64 -> 32x32
        nn.GroupNorm(16, nf),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf, nf * 2, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 32x32 -> 16x16
        nn.GroupNorm(16 * 2, nf * 2),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf * 2, nf * 4, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 16x16 -> 8x8
        nn.GroupNorm(16 * 4, nf * 4),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf * 4, nf * 8, kernel_size=4, stride=2, padding=1,
                  bias=False),  # 8x8 -> 4x4
        nn.LeakyReLU(0.2, inplace=True),
        nn.Conv2d(nf * 8, zdim, kernel_size=4, stride=1, padding=0,
                  bias=False),  # 4x4 -> 1x1
        nn.ReLU(inplace=True)
    ]
    ## upsampling
    network += [
        nn.ConvTranspose2d(zdim, nf * 8, kernel_size=4, stride=1,
                           padding=0, bias=False),  # 1x1 -> 4x4
        nn.ReLU(inplace=True),
        nn.Conv2d(nf * 8, nf * 8, kernel_size=3, stride=1, padding=1,
                  bias=False),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 8, nf * 4, kernel_size=4, stride=2,
                           padding=1, bias=False),  # 4x4 -> 8x8
        nn.GroupNorm(16 * 4, nf * 4),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf * 4, nf * 4, kernel_size=3, stride=1, padding=1,
                  bias=False),
        nn.GroupNorm(16 * 4, nf * 4),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 4, nf * 2, kernel_size=4, stride=2,
                           padding=1, bias=False),  # 8x8 -> 16x16
        nn.GroupNorm(16 * 2, nf * 2),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf * 2, nf * 2, kernel_size=3, stride=1, padding=1,
                  bias=False),
        nn.GroupNorm(16 * 2, nf * 2),
        nn.ReLU(inplace=True),
        nn.ConvTranspose2d(nf * 2, nf, kernel_size=4, stride=2, padding=1,
                           bias=False),  # 16x16 -> 32x32
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1, bias=False),
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Upsample(scale_factor=2, mode='nearest'),  # 32x32 -> 64x64
        nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1, bias=False),
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, nf, kernel_size=5, stride=1, padding=2, bias=False),
        nn.GroupNorm(16, nf),
        nn.ReLU(inplace=True),
        nn.Conv2d(nf, cout, kernel_size=5, stride=1, padding=2, bias=False)
    ]
    if activation is not None:
        network += [activation()]
    self.network = nn.Sequential(*network)
def test_groupnorm(self):
    self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10))
    self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10, 9))
    self._check_one_layer(nn.GroupNorm(4, 16),
                          torch.randn(64, 16, 10, 9, 8))