def __init__(self, cfg, din, classes=2):
    """Create the convolutional heads and the proposal layer of the RPN.

    Args:
        cfg: Configuration object; ``cfg.FEAT_STRIDE[0]`` is read here and
            the whole object is forwarded to ``ProposalLayer``.
        din: Depth of the input feature map, e.g. 512.
        classes: Accepted for interface compatibility; not used in this
            constructor.
    """
    super(_RPN, self).__init__()

    # Depth of the input feature map, e.g. 512.
    self.din = din

    self.feat_stride = cfg.FEAT_STRIDE[0]
    # NOTE(review): this takes len() of cfg.FEAT_STRIDE[0], so that entry
    # must itself be a sequence -- confirm against the config in use.
    self.num_anchors = len(self.feat_stride)

    # NOTE(review): feat_channels, cls_out_channels, anchor_scales and
    # anchor_ratios are read from ``self`` but not assigned in this method;
    # presumably they are class-level attributes -- verify.
    channels = self.feat_channels
    anchors = self.num_anchors

    # All heads below are 1x1 convolutions except feature_adaption (3x3).
    self.conv_loc = nn.Conv2d(channels, 1, kernel_size=1)
    self.conv_shape = nn.Conv2d(channels, anchors * 2, kernel_size=1)
    self.feature_adaption = nn.Conv2d(channels, channels, kernel_size=3)
    self.conv_cls = nn.Conv2d(
        channels, anchors * self.cls_out_channels, kernel_size=1)
    self.conv_reg = nn.Conv2d(channels, anchors * 4, kernel_size=1)

    # Proposal layer.
    self.RPN_proposal = ProposalLayer(
        cfg,
        self.feat_stride,
        self.anchor_scales,
        self.anchor_ratios,
    )

    # Loss accumulators start at zero.
    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0
def __init__(self, use_xyz=True, mode='TRAIN'):
    """Build the RPN: backbone, classification and regression heads,
    classification loss and proposal layer.

    Args:
        use_xyz: Forwarded to the backbone module's ``get_model``.
        mode: ``'TRAIN'`` sets ``self.training_mode`` to True; the value is
            also forwarded to ``ProposalLayer``.

    Raises:
        NotImplementedError: If ``cfg.RPN.LOSS_CLS`` is not one of
            ``'DiceLoss'``, ``'SigmoidFocalLoss'`` or
            ``'BinaryCrossEntropy'``.
    """
    super().__init__()
    self.training_mode = (mode == 'TRAIN')

    rpn_cfg = cfg.RPN

    # The backbone lives in the module named by the config.
    backbone_module = importlib.import_module(rpn_cfg.BACKBONE)
    self.backbone_net = backbone_module.get_model(
        input_channels=int(rpn_cfg.USE_INTENSITY), use_xyz=use_xyz)

    def build_head(hidden_widths, out_channels):
        # Stack of Conv1d layers: one per configured hidden width, then a
        # final projection without activation. Dropout is inserted at
        # index 1 whenever the configured ratio is non-negative.
        stack = []
        in_channels = rpn_cfg.FP_MLPS[0][-1]
        for width in hidden_widths:
            stack.append(
                pt_utils.Conv1d(in_channels, width, bn=rpn_cfg.USE_BN))
            in_channels = width
        stack.append(
            pt_utils.Conv1d(in_channels, out_channels, activation=None))
        if rpn_cfg.DP_RATIO >= 0:
            stack.insert(1, nn.Dropout(rpn_cfg.DP_RATIO))
        return nn.Sequential(*stack)

    # Classification branch: a single output channel.
    self.rpn_cls_layer = build_head(rpn_cfg.CLS_FC, 1)

    # Regression branch: the channel count depends on the bin configuration.
    per_loc_bin_num = int(rpn_cfg.LOC_SCOPE / rpn_cfg.LOC_BIN_SIZE) * 2
    loc_factor = 4 if rpn_cfg.LOC_XZ_FINE else 2
    # The trailing "+ 1" is the extra channel for regressing y.
    reg_channel = (
        per_loc_bin_num * loc_factor + rpn_cfg.NUM_HEAD_BIN * 2 + 3 + 1)
    self.rpn_reg_layer = build_head(rpn_cfg.REG_FC, reg_channel)

    # Classification loss, selected by name. Factories are lazy so only the
    # config entries of the chosen loss are read.
    loss_factories = {
        'DiceLoss': lambda: loss_utils.DiceLoss(ignore_target=-1),
        'SigmoidFocalLoss': lambda: (
            loss_utils.SigmoidFocalClassificationLoss(
                alpha=rpn_cfg.FOCAL_ALPHA[0], gamma=rpn_cfg.FOCAL_GAMMA)),
        'BinaryCrossEntropy': lambda: F.binary_cross_entropy,
    }
    make_loss = loss_factories.get(rpn_cfg.LOSS_CLS)
    if make_loss is None:
        raise NotImplementedError
    self.rpn_cls_loss_func = make_loss()

    self.proposal_layer = ProposalLayer(mode=mode)
    self.init_weights()
def __init__(self, din, rpn_din):
    """Assemble the RPN: shared 3x3 conv, score and box heads, and the
    proposal / anchor-target layers.

    Args:
        din: Depth of the input feature map, e.g. 512.
        rpn_din: Output channel count of the shared RPN convolution, which
            is also the input channel count of both 1x1 heads.
    """
    super(_RPN, self).__init__()

    # Depth of the input feature map, e.g. 512.
    self.din = din
    self.anchor_scales = cfg.ANCHOR_SCALES
    self.anchor_ratios = cfg.ANCHOR_RATIOS
    self.feat_stride = cfg.FEAT_STRIDE[0]
    self.rpn_din = rpn_din

    # One anchor per (scale, ratio) pair at every location,
    # e.g. 9 for 3 scales x 3 ratios.
    anchors_per_location = len(self.anchor_scales) * len(self.anchor_ratios)

    # Shared convolution applied to the input feature map.
    self.RPN_Conv = nn.Conv2d(
        self.din, self.rpn_din,
        kernel_size=3, stride=1, padding=1, bias=True)

    # Background/foreground classification scores: 2 values per anchor.
    self.nc_score_out = anchors_per_location * 2
    self.RPN_cls_score = nn.Conv2d(
        self.rpn_din, self.nc_score_out,
        kernel_size=1, stride=1, padding=0)

    # Anchor box offset predictions: 4 coordinates per anchor.
    self.nc_bbox_out = anchors_per_location * 4
    self.RPN_bbox_pred = nn.Conv2d(
        self.rpn_din, self.nc_bbox_out,
        kernel_size=1, stride=1, padding=0)

    # Proposal layer.
    self.RPN_proposal = ProposalLayer(
        self.feat_stride, self.anchor_scales, self.anchor_ratios)

    # Anchor target layer.
    self.RPN_anchor_target = AnchorTargetLayer(cfg)

    # Loss accumulators start at zero.
    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0
def __init__(self, cfg, classes, pretrained=False, align=False):
    """Set up the RPN regression module, proposal layers, RoI extraction
    and the fully connected classification / box heads.

    Args:
        cfg: Configuration object; ``FEAT_STRIDE``, ``ANCHOR_SCALES``,
            ``ANCHOR_RATIOS`` and ``POOLING_SIZE`` are read here.
        classes: Stored as ``self.classes``; used as the output width of
            the classifier (and times 4 for the box head) and passed to
            ``ProposalTargetLayer``.
        pretrained: Accepted for interface compatibility; not used in this
            constructor.
        align: Selects the RoI extraction layer (see the note in the body).
    """
    super(faster_rcnn, self).__init__()

    self.classes = classes
    self.rpn_cls_loss = 0
    self.rpn_bbox_loss = 0

    # NOTE(review): self.rpn_inchannels is not assigned in this method;
    # presumably a class-level attribute -- verify.
    self.rpn_regression = rpn_regression(self.rpn_inchannels)

    self.proposallayer = ProposalLayer(
        cfg, cfg.FEAT_STRIDE[0], cfg.ANCHOR_SCALES, cfg.ANCHOR_RATIOS)
    self.proposaltargetlayer = ProposalTargetLayer(self.classes)

    # RoI extraction. RoI pooling is built first and replaced by RoI align
    # when ``align`` is falsy.
    # NOTE(review): the condition looks inverted relative to the parameter
    # name (align=False -> ROIAlignLayer, align=True -> ROIPoolingLayer).
    # Behaviour is kept as-is; confirm the intent before changing it.
    pooled_size = (cfg.POOLING_SIZE, cfg.POOLING_SIZE)
    spatial_scale = 1.0 / 16.0
    self.roi_extraction = ROIPoolingLayer(pooled_size, spatial_scale)
    if not align:
        self.roi_extraction = ROIAlignLayer(pooled_size, spatial_scale, 0)

    self.regressionDim = 512
    self.ROIDim = 256

    # Two-layer fully connected trunk applied to the flattened RoI features.
    flattened_roi = self.ROIDim * cfg.POOLING_SIZE * cfg.POOLING_SIZE
    trunk = OrderedDict()
    trunk['fc6'] = nn.Linear(flattened_roi, self.regressionDim)
    trunk['fc6_relu'] = nn.ReLU(inplace=True)
    trunk['fc7'] = nn.Linear(
        self.regressionDim, self.regressionDim, bias=True)
    trunk['fc7_relu'] = nn.ReLU(inplace=True)
    self.Regression = nn.Sequential(trunk)

    # Per-class score head.
    cls_head = OrderedDict()
    cls_head['fc_cls'] = nn.Linear(self.regressionDim, self.classes)
    self.cls_predict = nn.Sequential(cls_head)

    # Per-class box head: 4 values for each class.
    bbox_head = OrderedDict()
    bbox_head['fc_bbox'] = nn.Linear(self.regressionDim, self.classes * 4)
    self.bbox_predict = nn.Sequential(bbox_head)

    self.out_sigmoid = nn.Sigmoid()
def __init__(self, use_xyz=True, mode='TRAIN'):
    """Build the RPN: backbone, per-point classification head, 9-value box
    regression head, classification loss and proposal layer.

    Args:
        use_xyz: Forwarded to the backbone module's ``get_model``.
        mode: ``'TRAIN'`` sets ``self.training_mode`` to True; the value is
            also forwarded to ``ProposalLayer``.

    Raises:
        NotImplementedError: If ``cfg.RPN.LOSS_CLS`` is not one of
            ``'DiceLoss'``, ``'SigmoidFocalLoss'`` or
            ``'BinaryCrossEntropy'``.
    """
    super().__init__()
    self.training_mode = (mode == 'TRAIN')

    rpn_cfg = cfg.RPN

    # The backbone lives in the module named by the config.
    backbone_module = importlib.import_module(rpn_cfg.BACKBONE)
    self.backbone_net = backbone_module.get_model(
        input_channels=int(rpn_cfg.USE_INTENSITY), use_xyz=use_xyz)

    # Notes carried over from the original author (assumptions about
    # pt_utils.Conv1d and the default config -- TODO confirm):
    #   * pt_utils.Conv1d is almost the same as torch's Conv1d, see
    #     https://pytorch.org/docs/stable/nn.html#conv1d
    #   * Conv1d is used so that one call covers two levels of batching:
    #     the scenes and the points within each scene.
    #   * Both heads take a (B, C, N) tensor, where C is the number of
    #     per-point feature channels (apparently 128) and N is the number
    #     of points in one scene.
    #   * Outputs are (B, 1, N) for classification and (B, 9, N) for
    #     regression, i.e. results for all points, not just one.
    #   * With kernel_size 1 each output is a linear combination of the
    #     channels plus a bias, like a simple linear regression; each of
    #     the 9 regression outputs has its own weights and bias.

    def build_head(hidden_widths, out_channels):
        # Stack of Conv1d layers: one per configured hidden width, then a
        # final projection without activation. With the default config
        # this is presumably 128 -> 128 followed by 128 -> out_channels.
        stack = []
        in_channels = rpn_cfg.FP_MLPS[0][-1]  # apparently 128
        for width in hidden_widths:
            # bn toggles batch normalization.
            stack.append(
                pt_utils.Conv1d(in_channels, width, bn=rpn_cfg.USE_BN))
            in_channels = width
        stack.append(
            pt_utils.Conv1d(in_channels, out_channels, activation=None))
        # Dropout goes at index 1 whenever the configured ratio is
        # non-negative (the original notes mention a ratio of 0.5).
        if rpn_cfg.DP_RATIO >= 0:
            stack.insert(1, nn.Dropout(rpn_cfg.DP_RATIO))
        return nn.Sequential(*stack)

    # Classification branch: one output channel. No activation here; per
    # the original notes the sigmoid is applied in the loss function.
    self.rpn_cls_layer = build_head(rpn_cfg.CLS_FC, 1)

    # Regression branch: plain regression of all 9 box parameters
    # (x, y, z, w, h, l, rx, ry, rz).
    # TODO (from the original author): with binning and classification the
    # activation of the last layer is applied in the loss instead; see
    # "get_reg_loss" in /lib/utils/loss_utils.py, which the author says
    # still needs to be changed for this plain-regression setup.
    reg_channel = 9
    self.rpn_reg_layer = build_head(rpn_cfg.REG_FC, reg_channel)

    # Classification loss, selected by name from the config.
    loss_kind = rpn_cfg.LOSS_CLS
    if loss_kind == 'DiceLoss':
        cls_loss = loss_utils.DiceLoss(ignore_target=-1)
    elif loss_kind == 'SigmoidFocalLoss':
        cls_loss = loss_utils.SigmoidFocalClassificationLoss(
            alpha=rpn_cfg.FOCAL_ALPHA[0], gamma=rpn_cfg.FOCAL_GAMMA)
    elif loss_kind == 'BinaryCrossEntropy':
        cls_loss = F.binary_cross_entropy
    else:
        raise NotImplementedError
    self.rpn_cls_loss_func = cls_loss

    # Per the original notes, the proposal layer is only used in RCNN and
    # not in RPN.
    self.proposal_layer = ProposalLayer(mode=mode)
    self.init_weights()