def __init__(self, mlp: List[int], n_points=None, radius=None, n_samples=None, bn=True, use_xyz=True):
    """Create the sampler, the grouper and the MLP stack of this module.

    Args:
        mlp: Channel specification forwarded to ``self.build_mlps``.
        n_points: Value handed to ``FurthestPointSampler``. When ``None`` no
            sampler is created and a ``GroupAll`` grouper is used instead.
        radius: Forwarded to ``BallQueryGrouper`` (only when ``n_points`` is set).
        n_samples: Forwarded to ``BallQueryGrouper`` (only when ``n_points`` is set).
        bn: Accepted but not read by this constructor.
        use_xyz: Forwarded to the grouper and to ``self.build_mlps``.
    """
    super().__init__()
    self.n_points = n_points
    self.groupers = nn.ModuleList()

    if self.n_points is None:
        # No sampling requested: a single grouper covering everything.
        chosen_grouper = GroupAll(use_xyz)
    else:
        self.sampler = FurthestPointSampler(n_points)
        chosen_grouper = BallQueryGrouper(radius, n_samples, use_xyz)
    self.groupers.append(chosen_grouper)

    self.mlps = nn.ModuleList()
    self.mlps.append(self.build_mlps(mlp, use_xyz))
def __init__(self, npoint, nsample, in_channel, mlp, bandwidth, group_all):
    """Build the per-point conv/BN stack plus the weight and density sub-nets.

    Args:
        npoint: Stored as ``self.npoint``.
        nsample: Stored as ``self.nsample``.
        in_channel: Input width of the first 1x1 convolution.
        mlp: Sequence of output widths; one ``nn.Conv``/``nn.BatchNorm`` pair
            is created per entry, and ``mlp[-1]`` sizes the final linear layer.
        bandwidth: Stored as ``self.bandwidth``.
        group_all: Stored as ``self.group_all``.
    """
    super(PointConvDensitySetAbstraction, self).__init__()
    self.npoint = npoint
    self.nsample = nsample

    self.mlp_convs = nn.ModuleList()
    self.mlp_bns = nn.ModuleList()
    # Pair every layer's output width with the width that feeds it.
    feed_widths = [in_channel] + list(mlp[:-1])
    for width_in, width_out in zip(feed_widths, mlp):
        self.mlp_convs.append(nn.Conv(width_in, width_out, 1))
        self.mlp_bns.append(nn.BatchNorm(width_out))

    self.weightnet = WeightNet(3, 16)
    self.densitynet = DensityNet()

    final_width = mlp[-1]
    self.linear = nn.Linear(16 * final_width, final_width)
    self.bn_linear = nn.BatchNorm1d(final_width)

    self.group_all = group_all
    self.bandwidth = bandwidth
    self.relu = nn.ReLU()
def __init__(self, cfg, in_channels):
    """
    Build a single shared conv tower followed by the three prediction convs.

    Arguments:
        cfg: config object; the ``cfg.MODEL.FCOS`` fields NUM_CLASSES,
            RESIDUAL_CONNECTION, NUM_CONVS, DENSE_POINTS and PRIOR_PROB are read
        in_channels (int): number of channels of the input feature
    """
    super(FCOSSharedHead, self).__init__()
    # TODO: Implement the sigmoid version first.
    fcos_cfg = cfg.MODEL.FCOS
    num_classes = fcos_cfg.NUM_CLASSES - 1
    self.identity = fcos_cfg.RESIDUAL_CONNECTION

    # Shared tower: NUM_CONVS repetitions of conv -> GroupNorm -> ReLU.
    # NOTE(review): the tower uses nn.Conv2d while the init loop below matches
    # nn.Conv -- confirm both names refer to the same class in this framework.
    tower_layers = []
    for _ in range(fcos_cfg.NUM_CONVS):
        tower_layers.extend([
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1),
            nn.GroupNorm(32, in_channels),
            nn.ReLU(),
        ])
    self.shared_tower = nn.Sequential(*tower_layers)

    self.dense_points = fcos_cfg.DENSE_POINTS

    def make_pred_conv(out_channels):
        # All prediction convs share the same 3x3 / stride 1 / pad 1 geometry.
        return nn.Conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

    self.cls_logits = make_pred_conv(num_classes * self.dense_points)
    self.bbox_pred = make_pred_conv(4 * self.dense_points)
    self.centerness = make_pred_conv(1 * self.dense_points)

    # initialization
    to_initialize = (self.shared_tower, self.cls_logits, self.bbox_pred, self.centerness)
    for root in to_initialize:
        for layer in root.modules():
            if not isinstance(layer, nn.Conv):
                continue
            nn.init.gauss_(layer.weight, std=0.01)
            nn.init.constant_(layer.bias, 0)

    # initialize the bias for focal loss
    prior_prob = fcos_cfg.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    nn.init.constant_(self.cls_logits.bias, bias_value)

    self.scales = nn.ModuleList(*[Scale(init_value=1.0) for _ in range(5)])
def _compile(self, C, op_names, indices, concat, reduction):
    """Instantiate the operations of this cell from its description.

    Args:
        C: First argument passed to every ``OPS[name]`` factory.
        op_names: Keys into ``OPS``, one per operation.
        indices: One index per operation; must be as long as ``op_names``.
        concat: Stored as ``self._concat``; its length becomes ``self.multiplier``.
        reduction: When truthy, operations whose index is below 2 get stride 2.
    """
    assert len(op_names) == len(indices)

    self._steps = len(op_names) // 2
    self._concat = concat
    self.multiplier = len(concat)

    self._ops = nn.ModuleList()
    for op_name, src_index in zip(op_names, indices):
        if reduction and src_index < 2:
            op_stride = 2
        else:
            op_stride = 1
        self._ops.append(OPS[op_name](C, op_stride, True))

    self._indices = indices
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, baseWidth=26, scale = 4, stype='normal'):
    """ Constructor
    Args:
        inplanes: input channel dimensionality
        planes: output channel dimensionality
        stride: conv stride. Replaces pooling layer.
        dilation: dilation (and padding) of the 3x3 convolutions
        downsample: None when stride = 1
        baseWidth: basic width of conv3x3
        scale: number of scale.
        stype: 'normal': normal set. 'stage': first block of a new stage.
    """
    super(Bottle2neck, self).__init__()

    width = int(math.floor(planes * (baseWidth/64.0)))
    total_width = width * scale

    self.conv1 = nn.Conv(inplanes, total_width, kernel_size=1, bias=False)
    self.bn1 = nn.BatchNorm(total_width)

    assert scale > 1, 'Res2Net degenerates to ResNet when scales = 1.'
    self.nums = 1 if scale == 1 else scale - 1

    if stype == 'stage':
        self.pool = nn.Pool(kernel_size=3, stride = stride, padding=1, op='mean')

    # One 3x3 conv + BN pair per processed split.
    self.convs = nn.ModuleList()
    self.bns = nn.ModuleList()
    for _ in range(self.nums):
        self.convs.append(
            nn.Conv(width, width, kernel_size=3, stride = stride,
                    dilation=dilation, padding=dilation, bias=False))
        self.bns.append(nn.BatchNorm(width))

    out_width = planes * self.expansion
    self.conv3 = nn.Conv(total_width, out_width, kernel_size=1, bias=False)
    self.bn3 = nn.BatchNorm(planes * self.expansion)

    self.relu = nn.ReLU()
    self.downsample = downsample
    self.stype = stype
    self.scale = scale
    self.width = width
    self.stride = stride
    self.dilation = dilation
def __init__(self, light_mode='surface', intensity_ambient=0.5, color_ambient=[1,1,1], intensity_directionals=0.5, color_directionals=[1,1,1], directions=[0,1,0]):
    """Create one ambient light and a list holding one directional light.

    Args:
        light_mode: Either 'surface' or 'vertex'.
        intensity_ambient: Forwarded to ``AmbientLighting``.
        color_ambient: Forwarded to ``AmbientLighting``.
        intensity_directionals: Forwarded to ``DirectionalLighting``.
        color_directionals: Forwarded to ``DirectionalLighting``.
        directions: Forwarded to ``DirectionalLighting``.

    Raises:
        ValueError: If ``light_mode`` is not one of the two supported modes.
    """
    super(Lighting, self).__init__()

    supported_modes = ('surface', 'vertex')
    if light_mode not in supported_modes:
        raise ValueError('Lighting mode only support surface and vertex')
    self.light_mode = light_mode

    self.ambient = AmbientLighting(intensity_ambient, color_ambient)
    directional = DirectionalLighting(intensity_directionals,
                                      color_directionals,
                                      directions)
    self.directionals = nn.ModuleList([directional])
def __init__(self, D=8, W=256, input_ch=3, input_ch_views=3, output_ch=4, skips=[4], use_viewdirs=False):
    """Create the linear layers of the network.

    Args:
        D: Number of layers in the point branch (``self.pts_linears``).
        W: Width of the hidden layers.
        input_ch: Width of the point input.
        input_ch_views: Width of the view-direction input.
        output_ch: Width of the output layer used when ``use_viewdirs`` is False.
        skips: Indices ``i`` in ``range(D - 1)`` whose layer takes
            ``W + input_ch`` inputs instead of ``W``.
        use_viewdirs: Selects the feature/alpha/rgb heads instead of the
            single output layer.
    """
    super(NeRF, self).__init__()
    self.D = D
    self.W = W
    self.input_ch = input_ch
    self.input_ch_views = input_ch_views
    self.skips = skips
    self.use_viewdirs = use_viewdirs

    # Point branch: first layer maps the input to W, the remaining D - 1
    # layers are W -> W except at skip indices, which also take input_ch.
    point_layers = [nn.Linear(input_ch, W)]
    for i in range(D - 1):
        fan_in = W + input_ch if i in self.skips else W
        point_layers.append(nn.Linear(fan_in, W))
    self.pts_linears = nn.ModuleList(point_layers)

    ### Implementation according to the official code release (https://github.com/bmild/nerf/blob/master/run_nerf_helpers.py#L104-L105)
    view_layer = nn.Linear(input_ch_views + W, W // 2)
    self.views_linears = nn.ModuleList([view_layer])

    ### Implementation according to the paper
    # self.views_linears = nn.ModuleList(
    #     [nn.Linear(input_ch_views + W, W//2)] + [nn.Linear(W//2, W//2) for i in range(D//2)])

    if not use_viewdirs:
        self.output_linear = nn.Linear(W, output_ch)
    else:
        self.feature_linear = nn.Linear(W, W)
        self.alpha_linear = nn.Linear(W, 1)
        self.rgb_linear = nn.Linear(W // 2, 3)
def __init__(self, in_channels, out_channels, normalize=None, pooling='AVG', share_conv=False, conv_stride=1, num_level=5, with_checkpoint=False):
    """Create the 1x1 reduction conv and the 3x3 output conv(s).

    Args:
        in_channels: List of input widths; their sum feeds the reduction conv.
        out_channels: Width of the reduction conv and of every output conv.
        normalize: Only used to derive ``self.with_bias`` (True when None).
        pooling: 'MAX' selects the 'maximum' pooling op, anything else 'mean'.
        share_conv: When truthy a single 3x3 conv is created; otherwise one
            per level.
        conv_stride: Stride of the 3x3 conv(s).
        num_level: Number of per-level convs created when not sharing.
        with_checkpoint: Stored as ``self.with_checkpoint``.
    """
    super(HRFPN, self).__init__()
    assert isinstance(in_channels, list)

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.num_ins = len(in_channels)
    self.with_bias = normalize is None
    self.share_conv = share_conv
    self.num_level = num_level

    self.reduction_conv = nn.Sequential(
        nn.Conv(in_channels=sum(in_channels),
                out_channels=out_channels,
                kernel_size=1),
    )

    # Every 3x3 output conv is built with the same keyword arguments.
    out_conv_kwargs = dict(
        in_channels=out_channels,
        out_channels=out_channels,
        kernel_size=3,
        stride=conv_stride,
        padding=1,
    )
    if not self.share_conv:
        self.fpn_conv = nn.ModuleList()
        for _ in range(self.num_level):
            self.fpn_conv.append(nn.Conv(**out_conv_kwargs))
    else:
        self.fpn_conv = nn.Conv(**out_conv_kwargs)

    self.pooling = 'maximum' if pooling == 'MAX' else 'mean'
    self.with_checkpoint = with_checkpoint
def build_model(self):
    """Create the three ``PointnetModule`` stages and the classifier head.

    Reads ``self.use_xyz`` and ``self.n_classes``; writes
    ``self.pointnet_modules`` and ``self.fc_layer``.
    """
    # Keyword arguments of each stage, in construction order. The last stage
    # passes no sampling arguments.
    stage_specs = (
        dict(n_points=512, radius=0.2, n_samples=64, mlp=[3, 64, 64, 128]),
        dict(n_points=128, radius=0.4, n_samples=64, mlp=[128, 128, 128, 256]),
        dict(mlp=[256, 256, 512, 1024]),
    )
    self.pointnet_modules = nn.ModuleList()
    for spec in stage_specs:
        self.pointnet_modules.append(
            PointnetModule(use_xyz=self.use_xyz, **spec))

    head_layers = [
        nn.Linear(1024, 512, bias=False),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Linear(512, 256, bias=False),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, self.n_classes),
    ]
    self.fc_layer = nn.Sequential(*head_layers)
def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
    """Build one transition entry per branch of the current stage.

    Args:
        num_channels_pre_layer: Channel count of each branch of the previous stage.
        num_channels_cur_layer: Channel count of each branch of the current stage.

    Returns:
        ``nn.ModuleList`` with one entry per current branch: ``None`` when an
        existing branch keeps its width, a stride-1 conv block when its width
        changes, and a chain of stride-2 conv blocks for each new branch.
    """
    n_pre = len(num_channels_pre_layer)
    n_cur = len(num_channels_cur_layer)

    entries = []
    for branch in range(n_cur):
        if branch >= n_pre:
            # New branch: every step starts from the last previous branch's
            # width; only the final step switches to the target width.
            final_step = branch - n_pre
            down_steps = []
            for step in range(final_step + 1):
                c_in = num_channels_pre_layer[-1]
                if step == final_step:
                    c_out = num_channels_cur_layer[branch]
                else:
                    c_out = c_in
                down_steps.append(
                    nn.Sequential(
                        nn.Conv(c_in, c_out, 3, 2, 1, bias=False),
                        BatchNorm2d(c_out, momentum=BN_MOMENTUM),
                        nn.ReLU()))
            entries.append(nn.Sequential(*down_steps))
            continue

        if num_channels_cur_layer[branch] == num_channels_pre_layer[branch]:
            entries.append(None)
            continue

        entries.append(
            nn.Sequential(
                nn.Conv(num_channels_pre_layer[branch],
                        num_channels_cur_layer[branch],
                        3, 1, 1, bias=False),
                BatchNorm2d(num_channels_cur_layer[branch],
                            momentum=BN_MOMENTUM),
                nn.ReLU()))

    return nn.ModuleList(entries)
def __init__(self, cfg, n_class=1000, input_size=224, width_mult=1.):
    """Build the feature extractor as a flat list of blocks.

    Args:
        cfg: Config object; ``cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT`` is
            passed to ``self._freeze_backbone``.
        n_class: Accepted but not read by this constructor.
        input_size: Must be a multiple of 32.
        width_mult: Multiplier applied to every channel count.
    """
    super(MobileNetV2, self).__init__()

    # Per stage: (expand ratio t, channels c, repeats n, first stride s)
    stage_table = (
        (1, 16, 1, 1),
        (6, 24, 2, 2),
        (6, 32, 3, 2),
        (6, 64, 4, 2),
        (6, 96, 3, 1),
        (6, 160, 3, 2),
        (6, 320, 1, 1),
    )

    # building first layer
    assert input_size % 32 == 0
    current_width = int(32 * width_mult)
    self.return_features_indices = [3, 6, 13, 17]
    self.return_features_num_channels = []
    self.features = nn.ModuleList(conv_bn(3, current_width, 2))

    # building inverted residual blocks
    for t, c, n, s in stage_table:
        stage_width = int(c * width_mult)
        for repeat in range(n):
            # Only the first block of a stage applies the stage stride.
            block_stride = s if repeat == 0 else 1
            self.features.append(
                InvertedResidual(current_width, stage_width, block_stride,
                                 expand_ratio=t))
            current_width = stage_width
            # Record the width of every block whose index is to be returned.
            if len(self.features) - 1 in self.return_features_indices:
                self.return_features_num_channels.append(stage_width)

    self._initialize_weights()
    self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT)
def __init__(self, layers=[1, 2, 8, 8, 4], block=DarkNetBlock):
    """Build the stem conv and five stages via ``self._make_layer``.

    Args:
        layers: Per-stage count passed to ``_make_layer``; entries 0..4 are read.
        block: Block class passed to ``_make_layer``.
    """
    super().__init__()

    # These will be populated by _make_layer
    self.num_base_layers = len(layers)
    self.layers = nn.ModuleList()
    self.channels = []

    self._preconv = darknetconvlayer(3, 32, kernel_size=3, padding=1)
    self.in_channels = 32

    stage_widths = (32, 64, 128, 256, 512)
    for stage, width in enumerate(stage_widths):
        self._make_layer(block, width, layers[stage])

    # This contains every module that should be initialized by loading in pretrained weights.
    # Any extra layers added onto this that won't be initialized by init_backbone will not be
    # in this list. That way, Yolact::init_weights knows which backbone weights to initialize
    # with xavier, and which ones to leave alone.
    conv_modules = []
    for module in self.modules():
        if isinstance(module, nn.Conv):
            conv_modules.append(module)
    self.backbone_modules = conv_modules
def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True):
    """Build one convolution per kernel size in ``k`` plus a shared BN/activation.

    Args:
        c1: Input channels of every branch.
        c2: Total number of output channels to distribute over the branches;
            also the width of the trailing ``nn.BatchNorm``.
        k: Kernel size of each branch; ``len(k)`` is the number of branches.
        s: Stride of every branch.
        equal_ch: If True, assign the ``c2`` output indices evenly to the
            branches. Otherwise solve a least-squares system so that the
            branches end up with roughly equal weight counts.
    """
    super(MixConv2d, self).__init__()
    groups = len(k)
    if equal_ch:  # equal c_ per group
        # Map each of the c2 output indices to a branch id in [0, groups).
        # NumPy arrays are converted with ``astype``; ``as_type`` does not
        # exist on ndarray and raised AttributeError here.
        i = jt.array(
            np.linspace(0, groups - 1E-6, c2).astype(np.float32)).floor()  # c2 indices
        c_ = [(i == g).sum() for g in range(groups)]  # intermediate channels
    else:  # equal weight.numel() per group
        b = [c2] + [0] * groups
        a = np.eye(groups + 1, groups, k=-1)
        a -= np.roll(a, 1, axis=1)
        a *= np.array(k)**2
        a[0] = 1
        c_ = np.linalg.lstsq(a, b, rcond=None)[0].round(
        )  # solve for equal weight indices, ax = b

    self.m = nn.ModuleList([
        nn.Conv(c1, int(c_[g]), k[g], s, k[g] // 2, bias=False)
        for g in range(groups)
    ])
    self.bn = nn.BatchNorm(c2)
    self.act = nn.LeakyReLU(0.1)
def __init__(self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, dcn_config={}):
    """Create the stacked 3x3 convs, the 1x1 aggregation conv and the eSE module.

    Args:
        in_ch: Input channels of the first 3x3 conv.
        stage_ch: Output channels of every 3x3 conv.
        concat_ch: Output channels of the 1x1 aggregation conv.
        layer_per_block: Number of loop iterations (3x3 convs without DCN).
        module_name: Forwarded to the ``conv3x3`` / ``conv1x1`` helpers.
        SE: Accepted but not read by this constructor.
        identity: Stored as ``self.identity``.
        dcn_config: Mapping queried for "stage_with_dcn", "deformable_groups"
            and "with_modulated_dcn".
    """
    super(_OSA_module, self).__init__()

    self.identity = identity
    self.layers = nn.ModuleList()

    uses_dcn = dcn_config.get("stage_with_dcn", False)
    width_in = in_ch
    for layer_idx in range(layer_per_block):
        if not uses_dcn:
            self.layers.append(
                nn.Sequential(
                    OrderedDict(
                        conv3x3(width_in, stage_ch, module_name, layer_idx))))
        else:
            # The deformable-conv layer is commented out, so this branch only
            # reads its settings and appends nothing.
            deformable_groups = dcn_config.get("deformable_groups", 1)
            with_modulated_dcn = dcn_config.get("with_modulated_dcn", False)
            #self.layers.append(nn.Sequential(OrderedDict(DFConv3x3(in_channel, stage_ch, module_name, i,
            #    with_modulated_dcn=with_modulated_dcn, deformable_groups=deformable_groups))))
        width_in = stage_ch

    # feature aggregation
    aggregated_width = in_ch + layer_per_block * stage_ch
    self.concat = nn.Sequential(
        OrderedDict(conv1x1(aggregated_width, concat_ch, module_name, 'concat')))

    self.ese = eSEModule(concat_ch)
def __init__(self, dlatent_size=512, num_channels=3, resolution=1024, fmap_base=8192, fmap_decay=1.0, fmap_max=512, use_styles=True, const_input_layer=True, use_noise=True, nonlinearity='lrelu', use_wscale=True, use_pixel_norm=False, use_instance_norm=True, blur_filter=None, structure='linear', **kwargs):
    """
    Synthesis network used in the StyleGAN paper.

    Builds the input block, one synthesis block per resolution doubling and
    one ToRGB converter per resolution.

    :param dlatent_size: Disentangled latent (W) dimensionality.
    :param num_channels: Number of output color channels.
    :param resolution: Output resolution. Must be a power of two and >= 4.
    :param fmap_base: Overall multiplier for the number of feature maps.
    :param fmap_decay: log2 feature map reduction when doubling the resolution.
    :param fmap_max: Maximum number of feature maps in any layer.
    :param use_styles: Enable style inputs?
    :param const_input_layer: First layer is a learned constant?
    :param use_noise: Enable noise inputs?
    # :param randomize_noise: True = randomize noise inputs every time (non-deterministic),
                              False = read noise inputs from variables.
    :param nonlinearity: Activation function: 'relu', 'lrelu'
    :param use_wscale: Enable equalized learning rate?
    :param use_pixel_norm: Enable pixel_wise feature vector normalization?
    :param use_instance_norm: Enable instance normalization?
    :param blur_filter: Low-pass filter to apply when resampling activations. None = no filtering.
    :param structure: 'fixed' = no progressive growing, 'linear' = human-readable
    :param kwargs: Ignore unrecognized keyword args.
    """
    super().__init__()

    # if blur_filter is None:
    #     blur_filter = [1, 2, 1]

    def nf(stage):
        # Feature-map count for a stage: fmap_base halved per stage (scaled by
        # fmap_decay), capped at fmap_max.
        return min(int(fmap_base / (2.0**(stage * fmap_decay))), fmap_max)

    self.structure = structure

    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4
    self.depth = resolution_log2 - 1

    self.num_layers = resolution_log2 * 2 - 2
    self.num_styles = self.num_layers if use_styles else 1

    # Pick the activation module and its gain; a KeyError is raised for any
    # other nonlinearity name.
    act, gain = {
        'relu': (nn.ReLU(), np.sqrt(2)),
        'lrelu': (nn.LeakyReLU(scale=0.2), np.sqrt(2))
    }[nonlinearity]

    # Early layers.
    self.init_block = InputBlock(nf(1), dlatent_size, const_input_layer, gain, use_wscale, use_noise, use_pixel_norm, use_instance_norm, use_styles, act)
    # create the ToRGB layers for various outputs
    rgb_converters = [
        EqualizedConv2d(nf(1), num_channels, 1, gain=1, use_wscale=use_wscale)
    ]

    # Building blocks for remaining layers.
    # One block and one ToRGB converter per res in 3..resolution_log2.
    blocks = []
    for res in range(3, resolution_log2 + 1):
        last_channels = nf(res - 2)
        channels = nf(res - 1)
        # name = '{s}x{s}'.format(s=2 ** res)
        blocks.append(
            GSynthesisBlock(last_channels, channels, blur_filter, dlatent_size, gain, use_wscale, use_noise, use_pixel_norm, use_instance_norm, use_styles, act))
        rgb_converters.append(
            EqualizedConv2d(channels, num_channels, 1, gain=1, use_wscale=use_wscale))

    self.blocks = nn.ModuleList(blocks)
    self.to_rgb = nn.ModuleList(rgb_converters)

    # register the temporary upsampler
    # self.temporaryUpsampler = lambda x: interpolate(x, scale_factor=2)
    self.temporaryUpsampler = lambda x: nn.interpolate(
        x, scale_factor=2, mode='nearest')
def __init__(self, resolution, num_channels=3, fmap_base=8192, fmap_decay=1.0, fmap_max=512, nonlinearity='lrelu', use_wscale=True, mbstd_group_size=4, mbstd_num_features=1, blur_filter=None, structure='linear', **kwargs):
    """
    Discriminator used in the StyleGAN paper.

    Builds one block and one fromRGB converter per resolution halving, plus
    the final block and its fromRGB converter.

    :param num_channels: Number of input color channels. Overridden based on dataset.
    :param resolution: Input resolution. Overridden based on dataset.
                       Must be a power of two and >= 4.
    # label_size=0,  # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
    :param fmap_base: Overall multiplier for the number of feature maps.
    :param fmap_decay: log2 feature map reduction when doubling the resolution.
    :param fmap_max: Maximum number of feature maps in any layer.
    :param nonlinearity: Activation function: 'relu', 'lrelu'
    :param use_wscale: Enable equalized learning rate?
    :param mbstd_group_size: Group size for the mini_batch standard deviation layer, 0 = disable.
    :param mbstd_num_features: Number of features for the mini_batch standard deviation layer.
    :param blur_filter: Low-pass filter to apply when resampling activations. None = no filtering.
    :param structure: 'fixed' = no progressive growing, 'linear' = human-readable
    :param kwargs: Ignore unrecognized keyword args.
    """
    super(Discriminator, self).__init__()

    def nf(stage):
        # Feature-map count for a stage: fmap_base halved per stage (scaled by
        # fmap_decay), capped at fmap_max.
        return min(int(fmap_base / (2.0**(stage * fmap_decay))), fmap_max)

    self.mbstd_num_features = mbstd_num_features
    self.mbstd_group_size = mbstd_group_size
    self.structure = structure
    # if blur_filter is None:
    #     blur_filter = [1, 2, 1]

    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4
    self.depth = resolution_log2 - 1

    # Pick the activation module and its gain; a KeyError is raised for any
    # other nonlinearity name.
    act, gain = {
        'relu': (nn.ReLU(), np.sqrt(2)),
        'lrelu': (nn.LeakyReLU(scale=0.2), np.sqrt(2))
    }[nonlinearity]

    # create the remaining layers
    # Iterates from the highest resolution down to res == 3.
    blocks = []
    from_rgb = []
    for res in range(resolution_log2, 2, -1):
        # name = '{s}x{s}'.format(s=2 ** res)
        blocks.append(
            DiscriminatorBlock(nf(res - 1), nf(res - 2), gain=gain, use_wscale=use_wscale, activation_layer=act, blur_kernel=blur_filter))
        # create the fromRGB layers for various inputs:
        from_rgb.append(
            EqualizedConv2d(num_channels, nf(res - 1), kernel_size=1, gain=gain, use_wscale=use_wscale))
    self.blocks = nn.ModuleList(blocks)

    # Building the final block.
    self.final_block = DiscriminatorTop(self.mbstd_group_size, self.mbstd_num_features, in_channels=nf(2), intermediate_channels=nf(2), gain=gain, use_wscale=use_wscale, activation_layer=act)
    # fromRGB converter feeding the final block; appended last.
    from_rgb.append(
        EqualizedConv2d(num_channels, nf(2), kernel_size=1, gain=gain, use_wscale=use_wscale))
    self.from_rgb = nn.ModuleList(from_rgb)

    # register the temporary downSampler
    # self.temporaryDownsampler = nn.AvgPool2d(2)
    self.temporaryDownsampler = nn.Pool(kernel_size=2, op='mean')
def __init__(self, cfg, in_channels):
    """
    Build separate classification and box towers plus the prediction convs.

    Arguments:
        cfg: config object; the ``cfg.MODEL.FCOS`` fields NUM_CLASSES,
            FPN_STRIDES, NORM_REG_TARGETS, CENTERNESS_ON_REG,
            USE_DCN_IN_TOWER, NUM_CONVS, DENSE_POINTS and PRIOR_PROB are read
        in_channels (int): number of channels of the input feature
    """
    super(FCOSHead, self).__init__()
    # TODO: Implement the sigmoid version first.
    fcos_cfg = cfg.MODEL.FCOS
    num_classes = fcos_cfg.NUM_CLASSES - 1
    self.fpn_strides = fcos_cfg.FPN_STRIDES
    self.norm_reg_targets = fcos_cfg.NORM_REG_TARGETS
    self.centerness_on_reg = fcos_cfg.CENTERNESS_ON_REG
    self.use_dcn_in_tower = fcos_cfg.USE_DCN_IN_TOWER

    def make_conv(out_channels):
        # Every conv in this head is 3x3 / stride 1 / pad 1 from in_channels.
        return nn.Conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

    # Both towers grow in lock-step: conv -> GroupNorm -> ReLU per step.
    cls_layers, bbox_layers = [], []
    for _ in range(fcos_cfg.NUM_CONVS):
        cls_layers.extend([
            make_conv(in_channels),
            nn.GroupNorm(32, in_channels),
            nn.ReLU(),
        ])
        bbox_layers.extend([
            make_conv(in_channels),
            nn.GroupNorm(32, in_channels),
            nn.ReLU(),
        ])
    self.cls_tower = nn.Sequential(*cls_layers)
    self.bbox_tower = nn.Sequential(*bbox_layers)

    self.dense_points = fcos_cfg.DENSE_POINTS
    self.cls_logits = make_conv(num_classes * self.dense_points)
    self.bbox_pred = make_conv(4 * self.dense_points)
    self.centerness = make_conv(1 * self.dense_points)

    # initialization
    to_initialize = (self.cls_tower, self.bbox_tower, self.cls_logits,
                     self.bbox_pred, self.centerness)
    for root in to_initialize:
        for layer in root.modules():
            if not isinstance(layer, nn.Conv):
                continue
            nn.init.gauss_(layer.weight, std=0.01)
            nn.init.constant_(layer.bias, 0)

    # initialize the bias for focal loss
    prior_prob = fcos_cfg.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    nn.init.constant_(self.cls_logits.bias, bias_value)

    self.cfg = cfg
    self.scales = nn.ModuleList(*[Scale(init_value=1.0) for _ in range(5)])
def __init__(self):
    """Assemble the backbone, the optional proto-net / FPN and the prediction heads.

    Everything is driven by the module-level ``cfg`` object, which this
    constructor also writes to (``cfg.mask_dim`` and ``cfg.num_heads``).
    """
    super().__init__()

    self.backbone = construct_backbone(cfg.backbone)

    if cfg.freeze_bn:
        self.freeze_bn()

    # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early!
    if cfg.mask_type == mask_type.direct:
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        if cfg.mask_proto_use_grid:
            self.grid = jt.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.shape[0]
        else:
            self.num_grids = 0

        self.proto_src = cfg.mask_proto_src

        # Input width of the proto-net: 3 when no source layer is set,
        # otherwise the FPN or backbone width, plus the number of grids.
        if self.proto_src is None:
            in_channels = 3
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids

        # The include_last_relu=false here is because we might want to change it to another function
        self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False)

        if cfg.mask_proto_bias:
            cfg.mask_dim += 1

    self.selected_layers = cfg.backbone.selected_layers
    src_channels = self.backbone.channels

    if cfg.use_maskiou:
        self.maskiou_net = FastMaskIoUNet()

    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        # After this, selected_layers indexes FPN outputs (including the
        # extra downsampled ones) and all of them have cfg.fpn.num_features
        # channels.
        self.fpn = FPN([src_channels[i] for i in self.selected_layers])
        self.selected_layers = list(
            range(len(self.selected_layers) + cfg.fpn.num_downsample))
        src_channels = [cfg.fpn.num_features] * len(self.selected_layers)

    self.prediction_layers = nn.ModuleList()
    cfg.num_heads = len(self.selected_layers)

    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent = None
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        pred = PredictionModule(
            src_channels[layer_idx],
            src_channels[layer_idx],
            aspect_ratios=cfg.backbone.pred_aspect_ratios[idx],
            scales=cfg.backbone.pred_scales[idx],
            parent=parent,
            index=idx)
        self.prediction_layers.append(pred)

    # Extra parameters for the extra losses
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    if cfg.use_semantic_segmentation_loss:
        self.semantic_seg_conv = nn.Conv(src_channels[0], cfg.num_classes - 1, kernel_size=1)

    # For use in evaluation
    self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=cfg.nms_top_k, conf_thresh=cfg.nms_conf_thresh, nms_thresh=cfg.nms_thresh)
def __init__(self, nets, extra_params):
    """Store ``extra_params`` and wrap ``nets`` in an ``nn.ModuleList``.

    Args:
        nets: Modules to hold in ``self.nets``.
        extra_params: Stored unchanged as ``self.extra_params``.
    """
    super().__init__()

    self.extra_params = extra_params
    wrapped_nets = nn.ModuleList(nets)
    self.nets = wrapped_nets
def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm):
    """Build the stem and four stages via ``self._make_layer``.

    Args:
        layers: Per-stage count passed to ``_make_layer``; entries 0..3 are read.
        dcn_layers: Per-stage ``dcn_layers`` argument for ``_make_layer``.
        dcn_interval: Passed unchanged to every ``_make_layer`` call.
        atrous_layers: Stored as ``self.atrous_layers``.
        block: Block class passed to ``_make_layer``.
        norm_layer: Normalisation class; used for ``bn1`` and stored.
    """
    super().__init__()

    # These will be populated by _make_layer
    self.num_base_layers = len(layers)
    self.layers = nn.ModuleList()
    self.channels = []
    self.norm_layer = norm_layer
    self.dilation = 1
    self.atrous_layers = atrous_layers

    # From torchvision.models.resnet.Resnet
    self.inplanes = 64

    self.conv1 = nn.Conv(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = norm_layer(64)
    self.relu = nn.ReLU()
    self.maxpool = nn.Pool(kernel_size=3, stride=2, padding=1, op='maximum')

    # The first stage relies on _make_layer's default stride; the remaining
    # three pass stride=2 explicitly.
    self._make_layer(block, 64, layers[0],
                     dcn_layers=dcn_layers[0], dcn_interval=dcn_interval)
    for stage, planes in enumerate((128, 256, 512), start=1):
        self._make_layer(block, planes, layers[stage], stride=2,
                         dcn_layers=dcn_layers[stage],
                         dcn_interval=dcn_interval)

    # This contains every module that should be initialized by loading in pretrained weights.
    # Any extra layers added onto this that won't be initialized by init_backbone will not be
    # in this list. That way, Yolact::init_weights knows which backbone weights to initialize
    # with xavier, and which ones to leave alone.
    conv_modules = []
    for module in self.modules():
        if isinstance(module, nn.Conv):
            conv_modules.append(module)
    self.backbone_modules = conv_modules
def __init__(self, cfg, in_channels):
    """
    Build the detection towers/heads and the embedding (mask) heads.

    Arguments:
        cfg: config object; reads ``cfg.MODEL.EMBED_MASK`` (FPN_STRIDES,
            NORM_REG_TARGETS, CENTERNESS_ON_REG, USE_DCN_IN_TOWER,
            NUM_CLASSES, EMBED_DIM, PRIOR_MARGIN, PRIOR_PROB) and
            ``cfg.MODEL.FCOS.NUM_CONVS``
        in_channels (int): number of channels of the input feature
    """
    super(EmbedMaskHead, self).__init__()
    # TODO: Implement the sigmoid version first.
    self.fpn_strides = cfg.MODEL.EMBED_MASK.FPN_STRIDES
    self.norm_reg_targets = cfg.MODEL.EMBED_MASK.NORM_REG_TARGETS
    self.centerness_on_reg = cfg.MODEL.EMBED_MASK.CENTERNESS_ON_REG
    self.use_dcn_in_tower = cfg.MODEL.EMBED_MASK.USE_DCN_IN_TOWER

    num_classes = cfg.MODEL.EMBED_MASK.NUM_CLASSES - 1
    embed_dim = cfg.MODEL.EMBED_MASK.EMBED_DIM
    prior_margin = cfg.MODEL.EMBED_MASK.PRIOR_MARGIN
    self.init_sigma_bias = math.log(-math.log(0.5) / (prior_margin**2))

    def _init_conv_layers(roots):
        # Gaussian weights (std 0.01) and zero bias for every nn.Conv found
        # under each of the given modules.
        for root in roots:
            for l in root.modules():
                if isinstance(l, nn.Conv):
                    nn.init.gauss_(l.weight, std=0.01)
                    nn.init.constant_(l.bias, 0)

    cls_tower = []
    bbox_tower = []
    mask_tower = []
    for i in range(cfg.MODEL.FCOS.NUM_CONVS):
        # The deformable conv (DFConv2d) is commented out in this file, so
        # the last tower layer also uses nn.Conv when use_dcn_in_tower is set.
        # Binding conv_func unconditionally avoids the UnboundLocalError the
        # old "pass" branch caused when NUM_CONVS == 1.
        conv_func = nn.Conv

        cls_tower.append(
            conv_func(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, bias=True))
        cls_tower.append(nn.GroupNorm(32, in_channels))
        cls_tower.append(nn.ReLU())
        bbox_tower.append(
            conv_func(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, bias=True))
        bbox_tower.append(nn.GroupNorm(32, in_channels))
        bbox_tower.append(nn.ReLU())
        mask_tower.append(
            conv_func(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, bias=True))
        mask_tower.append(nn.GroupNorm(32, in_channels))
        mask_tower.append(nn.ReLU())

    setattr(self, 'cls_tower', nn.Sequential(*cls_tower))
    setattr(self, 'bbox_tower', nn.Sequential(*bbox_tower))
    self.cls_logits = nn.Conv(in_channels, num_classes, kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv(in_channels, 4, kernel_size=3, stride=1, padding=1)
    self.centerness = nn.Conv(in_channels, 1, kernel_size=3, stride=1, padding=1)

    # initialization
    _init_conv_layers([
        self.cls_tower, self.bbox_tower, self.cls_logits, self.bbox_pred,
        self.centerness
    ])

    # initialize the bias for focal loss
    prior_prob = cfg.MODEL.EMBED_MASK.PRIOR_PROB
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    nn.init.constant_(self.cls_logits.bias, bias_value)

    self.scales = nn.ModuleList(*[Scale(init_value=1.0) for _ in range(5)])

    ########### Mask Predictions ############
    # proposal embedding
    self.proposal_spatial_embed_pred = nn.Conv(in_channels, 2, kernel_size=3, stride=1, padding=1, bias=True)
    self.proposal_other_embed_pred = nn.Conv(in_channels, embed_dim - 2, kernel_size=3, stride=1, padding=1, bias=True)
    _init_conv_layers([
        self.proposal_spatial_embed_pred, self.proposal_other_embed_pred
    ])

    # proposal margin
    self.proposal_margin_pred = nn.Conv(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True)
    nn.init.gauss_(self.proposal_margin_pred.weight, std=0.01)
    # The margin bias starts at init_sigma_bias (derived from PRIOR_MARGIN).
    nn.init.constant_(self.proposal_margin_pred.bias, self.init_sigma_bias)

    # pixel embedding
    # The mask tower layers were created in the loop above and are only
    # registered here.
    setattr(self, 'mask_tower', nn.Sequential(*mask_tower))
    self.pixel_spatial_embed_pred = nn.Conv(in_channels, 2, kernel_size=3, stride=1, padding=1, bias=True)
    self.pixel_other_embed_pred = nn.Conv(in_channels, embed_dim - 2, kernel_size=3, stride=1, padding=1, bias=True)
    _init_conv_layers([
        self.mask_tower, self.pixel_spatial_embed_pred,
        self.pixel_other_embed_pred
    ])

    self.position_scale = Scale(init_value=1.0)