def __init__(self, cfg):
    """Assemble the DVIS modules described by ``cfg``.

    Note: ``make_net`` reports the proto net's output channel count, which
    is written back into ``cfg.mask_dim`` — so this constructor must run
    before anything that reads ``cfg.mask_dim``.
    """
    super().__init__()
    self.cfg = cfg

    self.backbone = construct_backbone(cfg.backbone, cfg.net_in_channels)
    if cfg.freeze_bn:
        self.freeze_bn()

    # The proto net consumes FPN features when an FPN is configured,
    # otherwise the first backbone stage.
    proto_in = cfg.fpn.num_features if cfg.fpn is not None else self.backbone.channels[0]

    src_channels = self.backbone.channels
    self.selected_layers = cfg.backbone.selected_layers
    if cfg.fpn is not None:
        # Rewire the head inputs through the FPN: every selected level
        # (plus the extra downsampled ones) carries fpn.num_features channels.
        self.fpn = FPN(cfg, [src_channels[i] for i in self.selected_layers])
        num_levels = len(self.selected_layers) + cfg.fpn.num_downsample
        self.selected_layers = list(range(num_levels))
        src_channels = [cfg.fpn.num_features] * num_levels

    # include_last_relu=False so the final activation can be swapped later.
    self.proto_net, cfg.mask_dim = make_net(proto_in, cfg.mask_proto_net, include_last_relu=False)
def __init__(self):
    """Build the Yolact network: backbone, optional FPN, proto net,
    per-level prediction heads, auxiliary loss layers, and the eval-time
    Detect module.

    Side effect: writes ``cfg.mask_dim`` back into the global config, so
    this constructor must run before any code that reads ``cfg.mask_dim``.
    """
    super().__init__()
    self.backbone = construct_backbone(cfg.backbone)

    if cfg.freeze_bn:
        self.freeze_bn()

    # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early!
    if cfg.mask_type == mask_type.direct:
        # Direct masks: predict a flattened mask_size x mask_size patch per box.
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        # Lincomb masks: predict coefficients over a shared set of prototypes.
        if cfg.mask_proto_use_grid:
            # Optional fixed coordinate grids concatenated to the proto input.
            self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.size(0)
        else:
            self.num_grids = 0

        # Which feature map feeds the proto net: None -> raw image (3 ch),
        # otherwise an FPN/backbone level.
        self.proto_src = cfg.mask_proto_src
        if self.proto_src is None:
            in_channels = 3
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids

        # The include_last_relu=false here is because we might want to change it to another function
        self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False)

        if cfg.mask_proto_bias:
            # Reserve one extra coefficient for a constant bias prototype.
            cfg.mask_dim += 1

    self.selected_layers = cfg.backbone.selected_layers
    src_channels = self.backbone.channels

    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        # After this, the heads index FPN outputs 0..N-1 instead of backbone
        # stages, and every level has fpn.num_features channels.
        self.fpn = FPN([src_channels[i] for i in self.selected_layers])
        self.selected_layers = list(
            range(len(self.selected_layers) + cfg.fpn.num_downsample))
        src_channels = [cfg.fpn.num_features] * len(self.selected_layers)

    self.prediction_layers = nn.ModuleList()

    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent = None
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        pred = PredictionModule(
            src_channels[layer_idx],
            src_channels[layer_idx],
            aspect_ratios=cfg.backbone.pred_aspect_ratios[idx],
            scales=cfg.backbone.pred_scales[idx],
            parent=parent)
        self.prediction_layers.append(pred)

    # Extra parameters for the extra losses
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    if cfg.use_semantic_segmentation_loss:
        self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes - 1, kernel_size=1)

    # For use in evaluation
    self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=200, conf_thresh=0.05, nms_thresh=0.5)
def __init__(self):
    """Build the video/tracking Yolact variant: backbone, optional FPN,
    proto net, FC prediction heads, the temporal-correlation net, and the
    frame-level tracking / detection modules used at eval time.

    Side effects: writes ``cfg.mask_dim`` and ``cfg.num_heads`` back into
    the global config; run this constructor before anything reading those.
    """
    super().__init__()
    self.backbone = construct_backbone(cfg.backbone)

    if cfg.freeze_bn:
        self.freeze_bn()

    # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early!
    if cfg.mask_type == mask_type.direct:
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        if cfg.mask_proto_use_grid:
            self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.size(0)
        else:
            self.num_grids = 0

        self.proto_src = cfg.mask_proto_src
        self.interpolation_mode = cfg.fpn.interpolation_mode

        if self.proto_src is None:
            in_channels = 3
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids

        # The include_last_relu=false here is because we might want to change it to another function
        self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False)

        if cfg.mask_proto_bias:
            cfg.mask_dim += 1

    self.selected_layers = cfg.backbone.selected_layers
    self.pred_scales = cfg.backbone.pred_scales
    self.pred_aspect_ratios = cfg.backbone.pred_aspect_ratios
    # Number of priors per location is driven by the first level's scales.
    self.num_priors = len(self.pred_scales[0])
    src_channels = self.backbone.channels

    if cfg.use_maskiou:
        self.maskiou_net = FastMaskIoUNet()

    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        self.fpn = FPN([src_channels[i] for i in self.selected_layers])
        if cfg.backbone_C2_as_features:
            # Keep C2 as an extra head input: head levels start at index 1
            # and one additional channel entry is appended.
            self.selected_layers = list(
                range(1, len(self.selected_layers) + cfg.fpn.num_downsample))
            src_channels = [cfg.fpn.num_features
                            ] * (len(self.selected_layers) + 1)
        else:
            self.selected_layers = list(
                range(len(self.selected_layers) + cfg.fpn.num_downsample))
            src_channels = [cfg.fpn.num_features] * len(
                self.selected_layers)

    # prediction layers for loc, conf, mask
    self.prediction_layers = nn.ModuleList()
    cfg.num_heads = len(self.selected_layers)  # yolact++

    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent, parent_t = None, None
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        pred = PredictionModule_FC(
            src_channels[layer_idx],
            src_channels[layer_idx],
            deform_groups=1,
            pred_aspect_ratios=self.pred_aspect_ratios[idx],
            pred_scales=self.pred_scales[idx],
            parent=parent)
        self.prediction_layers.append(pred)

    # parameters in temporal correlation net
    if cfg.temporal_fusion_module:
        # NOTE(review): in_channels is only bound in the lincomb branch
        # above; with mask_type.direct this line would raise NameError —
        # presumably temporal fusion is only used with lincomb masks. Confirm.
        corr_channels = 2 * in_channels + cfg.correlation_patch_size**2
        self.TemporalNet = TemporalNet(corr_channels, cfg.mask_proto_n)
        self.correlation_selected_layer = cfg.correlation_selected_layer

    # evaluation for frame-level tracking
    self.Detect_TF = Detect_TF(cfg.num_classes,
                               bkg_label=0,
                               top_k=cfg.nms_top_k,
                               conf_thresh=cfg.nms_conf_thresh,
                               nms_thresh=cfg.nms_thresh)
    self.Track_TF = Track_TF()

    # Extra parameters for the extra losses
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    if cfg.use_semantic_segmentation_loss:
        self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes - 1, kernel_size=1)

    # For use in evaluation
    self.detect = Detect(cfg.num_classes,
                         bkg_label=0,
                         top_k=cfg.nms_top_k,
                         conf_thresh=cfg.nms_conf_thresh,
                         nms_thresh=cfg.nms_thresh)
    self.Track = Track()
def __init__(self):
    """Build the YOLACT++ network: backbone + FPN + proto net + shared
    prediction heads + eval-time Detect.

    Side effects: writes ``cfg.mask_dim`` and ``cfg.num_heads`` back into
    the global config; make sure this constructor runs early.
    """
    super().__init__()
    self.backbone = construct_backbone(
        cfg.backbone)  # e.g. resnet101_dcn_inter3_backbone

    if cfg.freeze_bn:
        self.freeze_bn()

    # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early!
    if cfg.mask_type == mask_type.direct:
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        if cfg.mask_proto_use_grid:  # False in the stock config
            self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.size(0)
        else:
            self.num_grids = 0  # yolact_plus default: 0

        self.proto_src = cfg.mask_proto_src
        if self.proto_src is None:
            in_channels = 3  # note: 0 != None, so src 0 still selects a feature map
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features  # fpn default: 'num_features': 256
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids  # (256 + 0)

        # Protonet part of Fig. 3 in the paper.
        # The include_last_relu=false here is because we might want to change it to another function
        # 'mask_proto_net': [(256, 3, {'padding': 1})] * 3 + [(None, -2, {}), (256, 3, {'padding': 1})] + [(32, 1, {})],
        self.proto_net, cfg.mask_dim = make_net(
            in_channels, cfg.mask_proto_net,
            include_last_relu=False)  # 256 in; six convs plus a bilinear upsample
        # make_net returns, as its second value, the channel count produced
        # when in_channels flows through cfg.mask_proto_net — i.e. the final
        # output channel count becomes cfg.mask_dim.

        if cfg.mask_proto_bias:  # False
            cfg.mask_dim += 1

    # cfg.mask_dim = 32 at this point for the stock config
    self.selected_layers = cfg.backbone.selected_layers  # yolact++: [1, 2, 3]
    src_channels = self.backbone.channels
    # src_channels = [256, 512, 1024, 2048]

    # True in the stock config
    if cfg.use_maskiou:
        self.maskiou_net = FastMaskIoUNet()

    # 'fpn': fpn_base.copy({
    #     'use_conv_downsample': True,
    #     'num_downsample': 2,
    # }),
    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        self.fpn = FPN([src_channels[i] for i in self.selected_layers
                        ])  # passes [512, 1024, 2048]
        self.selected_layers = list(
            range(len(self.selected_layers) + cfg.fpn.num_downsample))  # range(3 + 2)
        src_channels = [cfg.fpn.num_features] * len(self.selected_layers)
        # src_channels = [256, 256, 256, 256, 256]
        # selected_layers: [0, 1, 2, 3, 4]

    self.prediction_layers = nn.ModuleList()
    cfg.num_heads = len(self.selected_layers)  # 5; read inside PredictionModule

    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent = None
        # share_prediction_module is True in the stock config
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        # src_channels holds the channel count of each head input level;
        # one PredictionModule is built per selected layer and appended to
        # prediction_layers (so len == len(selected_layers)).
        pred = PredictionModule(
            src_channels[layer_idx],
            src_channels[layer_idx],
            aspect_ratios=cfg.backbone.pred_aspect_ratios[idx],
            scales=cfg.backbone.pred_scales[idx],
            parent=parent,
            index=idx)
        self.prediction_layers.append(pred)

    # False in the stock config
    # Extra parameters for the extra losses
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    # True in the stock config
    if cfg.use_semantic_segmentation_loss:
        self.semantic_seg_conv = nn.Conv2d(src_channels[0],
                                           cfg.num_classes - 1,
                                           kernel_size=1)

    # For use in evaluation
    self.detect = Detect(cfg.num_classes,
                         bkg_label=0,
                         top_k=cfg.nms_top_k,
                         conf_thresh=cfg.nms_conf_thresh,
                         nms_thresh=cfg.nms_thresh)
def __init__(self):
    """Build the panoptic/fusion Yolact variant: backbone, FPN with an
    optional multi-level fusion module, coord-conv augmentation, proto
    net, prediction heads, and several semantic-segmentation head variants.

    Side effect: writes ``cfg.mask_dim`` back into the global config.
    """
    super().__init__()
    self.backbone = construct_backbone(cfg.backbone)

    if cfg.freeze_bn:
        self.freeze_bn()

    # Fusion FPN
    self.fusion_layers = cfg.fusion_layers
    self.fusion_dim = cfg.fusion_dim

    # Compute mask_dim here and add it back to the config.
    if cfg.mask_type == mask_type.direct:
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        if cfg.mask_proto_use_grid:
            self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.size(0)
        else:
            self.num_grids = 0

        self.proto_src = cfg.mask_proto_src
        if self.proto_src is None:
            in_channels = 3
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids

        # The include_last_relu=false here is because we might want to change it to another function
        # NOTE(review): proto_coordconv and fpn_fusion are mutually exclusive
        # here (elif) — with both enabled only the +2 coord channels apply.
        # Confirm that is intended.
        if cfg.proto_coordconv:
            in_channels += 2
        elif cfg.fpn_fusion:
            in_channels = self.fusion_dim

        self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False)

        if cfg.mask_proto_bias:
            cfg.mask_dim += 1

    self.selected_layers = cfg.backbone.selected_layers
    src_channels = self.backbone.channels

    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        self.fpn = FPN([src_channels[i] for i in self.selected_layers])
        self.selected_layers = list(
            range(len(self.selected_layers) + cfg.fpn.num_downsample))
        src_channels = [cfg.fpn.num_features] * len(self.selected_layers)

    if cfg.fpn_fusion is True:
        # Fuses the first `fusion_layers` FPN levels into one map of
        # fusion_dim channels (Panoptic-FPN style).
        self.fusion_module = FusionModule(src_channels[0],
                                          self.fusion_layers,
                                          out_dim=self.fusion_dim)

    if cfg.ins_coordconv or cfg.sem_coordconv or cfg.proto_coordconv:
        # Shared layer that appends normalized x/y coordinate channels.
        self.addcoords = AddCoords()

    self.prediction_layers = nn.ModuleList()

    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent = None
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        # +2 input channels when coord-conv is enabled for the instance heads.
        pred_in_ch = src_channels[
            layer_idx] + 2 if cfg.ins_coordconv else src_channels[layer_idx]
        pred = PredictionModule(
            pred_in_ch,
            src_channels[layer_idx],
            aspect_ratios=cfg.backbone.pred_aspect_ratios[idx],
            scales=cfg.backbone.pred_scales[idx],
            parent=parent)
        self.prediction_layers.append(pred)

    # Extra parameters for the extra losses
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    if cfg.cross_attention_fusion:
        self.CALayer = CAModule(src_channels[0], share_conv=False)

    if cfg.use_semantic_segmentation_loss:
        # Pick the input channel count for the chosen semantic-seg head.
        sem_in_ch = None
        if cfg.sem_src_fusion is True:
            sem_in_ch = self.fusion_dim
        elif cfg.sem_lincomb is True:
            sem_in_ch = src_channels[0]
        else:
            # normal semantic segmentation head
            sem_in_ch = src_channels[-1]
        if cfg.sem_coordconv:
            sem_in_ch += 2

        # Panoptic FPN Fusion Version
        if cfg.sem_src_fusion is True:
            self.semantic_seg_conv = nn.Sequential(
                nn.Conv2d(sem_in_ch, cfg.stuff_num_classes, kernel_size=(1, 1)))
        elif cfg.sem_lincomb is True:
            # Predicts per-stuff-class coefficients over the mask prototypes.
            self.semantic_seg_conv = nn.Sequential(
                nn.Conv2d(sem_in_ch, 256, kernel_size=3),
                # nn.BatchNorm2d(256),
                nn.GroupNorm(32, 256),
                nn.ReLU(True),
                nn.Conv2d(256, (cfg.stuff_num_classes) * cfg.mask_dim, kernel_size=1),
                nn.Tanh())
        else:
            self.semantic_seg_conv = nn.Sequential(
                nn.Conv2d(sem_in_ch, cfg.stuff_num_classes, kernel_size=(1, 1)))

    # For use in evaluation
    self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=200, conf_thresh=0.05, nms_thresh=0.5)
def __init__(self):
    """Build the YOLACT++ network (backbone + FPN + proto net + shared
    prediction heads + eval-time Detect), with reviewer notes on which
    config values each step consumes.

    Side effects: writes ``cfg.mask_dim`` and ``cfg.num_heads`` back into
    the global config; Yolact's constructor must be called early.
    """
    super().__init__()

    # yolact++ cfg.backbone =
    # 'backbone': resnet101_dcn_inter3_backbone.copy({
    #     'selected_layers': list(range(1, 4)),
    #
    #     'pred_aspect_ratios': [[[1, 1 / 2, 2]]] * 5,
    #     'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]],
    #     'use_pixel_scales': True,
    #     'preapply_sqrt': False,
    #     'use_square_anchors': False,
    # })
    self.backbone = construct_backbone(cfg.backbone)

    if cfg.freeze_bn:
        self.freeze_bn()

    # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early!
    if cfg.mask_type == mask_type.direct:
        # direct masks are a flattened mask_size x mask_size patch (16^2 = 256)
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        # mask_proto_use_grid is False in the stock configs
        if cfg.mask_proto_use_grid:
            self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.size(0)
        else:
            self.num_grids = 0  # yolact uses 0

        self.proto_src = cfg.mask_proto_src
        if self.proto_src is None:
            in_channels = 3
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids

        # The include_last_relu=false here is because we might want to change it to another function
        # yolact++ proto net:
        # 'mask_proto_net': [(256, 3, {'padding': 1})] * 3
        #                   + [(None, -2, {}), (256, 3, {'padding': 1})]
        #                   + [(32, 1, {})],
        self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False)

        if cfg.mask_proto_bias:
            cfg.mask_dim += 1
    ## end of mask type if/else ______________________________________________

    self.selected_layers = cfg.backbone.selected_layers
    src_channels = self.backbone.channels

    if cfg.use_maskiou:
        self.maskiou_net = FastMaskIoUNet()

    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        self.fpn = FPN(
            # yolact++ resnet101: selected layers = 1, 2, 3
            # 2nd stage 128x4, 3rd stage 256x4, 4th stage 512x4 channels
            [src_channels[i] for i in self.selected_layers]
        )
        self.selected_layers = list(
            # selected_layers becomes 0..4: the original levels plus
            # cfg.fpn.num_downsample (2) extra downsampled levels
            range(
                len(self.selected_layers) + cfg.fpn.num_downsample)
        )
        # num_features = 256 for each of the 5 levels
        src_channels = [cfg.fpn.num_features] * len(self.selected_layers)

    self.prediction_layers = nn.ModuleList()
    cfg.num_heads = len(self.selected_layers)  # 5 heads, one per pyramid level

    # Build one (weight-shared) prediction module per pyramid level.
    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent = None
        # yolact++ sets share_prediction_module to True
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        pred = PredictionModule(
            # in_channels=
            src_channels[layer_idx],
            # out_channels=
            src_channels[layer_idx],
            # 'pred_scales': [[1]] * 6
            # 'pred_aspect_ratios': [[[0.66685089, 1.7073535, 0.87508774, 1.16524493,
            #                          0.49059086]]] * 6
            aspect_ratios = cfg.backbone.pred_aspect_ratios[idx],
            scales = cfg.backbone.pred_scales[idx],
            parent = parent,
            index = idx)
        self.prediction_layers.append(pred)

    # Extra parameters for the extra losses
    # use_class_existence_loss is False in the stock configs
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    # use_semantic_segmentation_loss is True in the stock configs
    if cfg.use_semantic_segmentation_loss:
        self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes-1, kernel_size=1)

    # For use in evaluation
    self.detect = Detect(cfg.num_classes,
                         bkg_label=0,
                         top_k=cfg.nms_top_k,              # 'nms_top_k': 200
                         conf_thresh=cfg.nms_conf_thresh,  # 'nms_conf_thresh': 0.05
                         nms_thresh=cfg.nms_thresh         # 'nms_thresh': 0.5
                         )
def __init__(self):
    """Build the Yolact network; annotated variant that also records the
    constructed proto_net / fpn module structure in comments.

    Side effect: writes ``cfg.mask_dim`` back into the global config, so
    this constructor must run before anything that reads ``cfg.mask_dim``.
    """
    # super() invokes the base class' __init__
    super().__init__()
    print('net initial...\n')

    self.backbone = construct_backbone(cfg.backbone)

    if cfg.freeze_bn:
        self.freeze_bn()

    ## builds: self.proto_net, cfg.mask_dim
    # Compute mask_dim here and add it back to the config. Make sure Yolact's constructor is called early!
    if cfg.mask_type == mask_type.direct:
        cfg.mask_dim = cfg.mask_size**2
    elif cfg.mask_type == mask_type.lincomb:
        if cfg.mask_proto_use_grid:
            # cfg.mask_proto_grid_file: data/grid.npy (.npy is a numpy data file)
            self.grid = torch.Tensor(np.load(cfg.mask_proto_grid_file))
            self.num_grids = self.grid.size(0)
        else:
            self.num_grids = 0  # 0

        self.proto_src = cfg.mask_proto_src
        if self.proto_src is None:
            in_channels = 3
        # cfg.fpn is an object here
        elif cfg.fpn is not None:
            in_channels = cfg.fpn.num_features
        else:
            in_channels = self.backbone.channels[self.proto_src]
        in_channels += self.num_grids

        # The include_last_relu=false here is because we might want to change it to another function
        self.proto_net, cfg.mask_dim = make_net(in_channels, cfg.mask_proto_net, include_last_relu=False)

        if cfg.mask_proto_bias:
            cfg.mask_dim += 1

    # self.proto_net
    # Sequential(
    #   (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #   (1): ReLU(inplace)
    #   (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #   (3): ReLU(inplace)
    #   (4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #   (5): ReLU(inplace)
    #   (6): InterpolateModule()
    #   (7): ReLU(inplace)
    #   (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #   (9): ReLU(inplace)
    #   (10): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1))
    # )
    #
    # self.fpn
    # FPN(
    #   (lat_layers): _ConstModuleList(
    #     (0): WeakScriptModuleProxy()
    #     (1): WeakScriptModuleProxy()
    #     (2): WeakScriptModuleProxy()
    #   )
    #   (pred_layers): _ConstModuleList(
    #     (0): WeakScriptModuleProxy()
    #     (1): WeakScriptModuleProxy()
    #     (2): WeakScriptModuleProxy()
    #   )
    #   (downsample_layers): _ConstModuleList(
    #     (0): WeakScriptModuleProxy()
    #     (1): WeakScriptModuleProxy()
    #   )
    # )
    self.selected_layers = cfg.backbone.selected_layers
    src_channels = self.backbone.channels

    if cfg.fpn is not None:
        # Some hacky rewiring to accomodate the FPN
        self.fpn = FPN([src_channels[i] for i in self.selected_layers])
        self.selected_layers = list(range(len(self.selected_layers) + cfg.fpn.num_downsample))
        src_channels = [cfg.fpn.num_features] * len(self.selected_layers)

    self.prediction_layers = nn.ModuleList()

    for idx, layer_idx in enumerate(self.selected_layers):
        # If we're sharing prediction module weights, have every module's parent be the first one
        parent = None
        if cfg.share_prediction_module and idx > 0:
            parent = self.prediction_layers[0]

        pred = PredictionModule(src_channels[layer_idx], src_channels[layer_idx],
                                aspect_ratios = cfg.backbone.pred_aspect_ratios[idx],
                                scales = cfg.backbone.pred_scales[idx],
                                parent = parent)
        self.prediction_layers.append(pred)

    # False in this configuration
    # Extra parameters for the extra losses
    if cfg.use_class_existence_loss:
        # This comes from the smallest layer selected
        # Also note that cfg.num_classes includes background
        self.class_existence_fc = nn.Linear(src_channels[-1], cfg.num_classes - 1)

    if cfg.use_semantic_segmentation_loss:
        self.semantic_seg_conv = nn.Conv2d(src_channels[0], cfg.num_classes-1, kernel_size=1)

    # For use in evaluation
    # note: conf_thresh here is 0.2, higher than the usual 0.05
    self.detect = Detect(cfg.num_classes, bkg_label=0, top_k=200, conf_thresh=0.2, nms_thresh=0.5)
    self.tmp = 1