def __init__(self, level, norm, dims=(512, 256, 256), rfb=False):
    """Adaptively Spatial Feature Fusion (ASFF) module for one pyramid level.

    Args:
        level: which of the three pyramid levels this module fuses into
            (0 = coarsest, 2 = finest).
        norm: normalization spec forwarded to ``conv_with_kaiming_uniform``.
        dims: channel counts of the three input feature levels; adjust to
            match the actual backbone outputs.
        rfb: if True, use a smaller compressed width for the fusion-weight
            branches (8 instead of 16).
    """
    super(ASFF, self).__init__()
    conv_bn_relu = conv_with_kaiming_uniform(norm, activation=True)
    self.level = level
    # BUGFIX (idiom): the default was a mutable list ``[512, 256, 256]``
    # shared across instances; a tuple default plus list() copy is safe and
    # backward-compatible (indexing behavior unchanged).
    self.dim = list(dims)
    self.inter_dim = self.dim[self.level]
    # All three per-level branches must emit the same channel count so they
    # can be weighted and summed in the forward pass.
    if level == 0:
        self.stride_level_1 = conv_bn_relu(self.dim[1], self.inter_dim, 3, 2)
        self.stride_level_2 = conv_bn_relu(self.dim[2], self.inter_dim, 3, 2)
        self.expand = conv_bn_relu(self.inter_dim, 1024, 3, 1)
    elif level == 1:
        self.compress_level_0 = conv_bn_relu(self.dim[0], self.inter_dim, 1, 1)
        self.stride_level_2 = conv_bn_relu(self.dim[2], self.inter_dim, 3, 2)
        self.expand = conv_bn_relu(self.inter_dim, 512, 3, 1)
    elif level == 2:
        self.compress_level_0 = conv_bn_relu(self.dim[0], self.inter_dim, 1, 1)
        if self.dim[1] != self.dim[2]:
            self.compress_level_1 = conv_bn_relu(self.dim[1], self.inter_dim, 1, 1)
        # NOTE(review): every other branch uses conv_bn_relu; ``add_conv``
        # looks like a leftover from the reference ASFF implementation.
        # Confirm it is defined/imported, otherwise level==2 raises NameError.
        self.expand = add_conv(self.inter_dim, 256, 3, 1)
    # Compressed width of the three fusion-weight predictor branches.
    compress_c = 8 if rfb else 16
    self.weight_level_0 = conv_bn_relu(self.inter_dim, compress_c, 1, 1)
    self.weight_level_1 = conv_bn_relu(self.inter_dim, compress_c, 1, 1)
    self.weight_level_2 = conv_bn_relu(self.inter_dim, compress_c, 1, 1)
    # 1x1 conv producing the 3 per-pixel fusion weights.
    self.weight_levels = nn.Conv2d(compress_c * 3, 3, 1, 1, 0)
def __init__(self, cfg, disable_rel_coords=False):
    """Build the IUV prediction tower from the CONDINST IUVHead config."""
    super().__init__()
    head_cfg = cfg.MODEL.CONDINST.IUVHead
    self.num_outputs = head_cfg.OUT_CHANNELS
    norm = head_cfg.NORM
    num_convs = head_cfg.NUM_CONVS
    channels = head_cfg.CHANNELS
    # Per-level sizes of interest, extended with one extra (doubled) level.
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    # Two extra input channels for the coordinate maps.
    self.in_channels = channels + 2
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.disable_rel_coords = disable_rel_coords
    make_conv = conv_with_kaiming_uniform(norm, activation=True)
    # Stem conv absorbs the coordinate channels, then (num_convs - 1)
    # 3x3 convs, then a 1x1 projection to the outputs.
    layers = [make_conv(self.in_channels, channels, 3, 1)]
    layers.extend(make_conv(channels, channels, 3, 1) for _ in range(1, num_convs))
    layers.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
    self.add_module('tower', nn.Sequential(*layers))
def __init__(self, cfg):
    """Plain conv tower mapping head features to the IUV output channels."""
    super().__init__()
    head_cfg = cfg.MODEL.CONDINST.IUVHead
    self.num_outputs = head_cfg.OUT_CHANNELS
    norm = head_cfg.NORM
    num_convs = head_cfg.NUM_CONVS
    channels = head_cfg.CHANNELS
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    make_conv = conv_with_kaiming_uniform(norm, activation=True)
    # num_convs 3x3 convs followed by a 1x1 projection.
    layers = [make_conv(channels, channels, 3, 1) for _ in range(num_convs)]
    layers.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
    self.add_module('tower', nn.Sequential(*layers))
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
    """Mask branch: per-level refine convs, a fusion tower, and an optional
    auxiliary semantic-segmentation head supervised with focal loss."""
    super().__init__()
    branch_cfg = cfg.MODEL.CONDINST.MASK_BRANCH
    self.in_features = branch_cfg.IN_FEATURES
    self.sem_loss_on = branch_cfg.SEMANTIC_LOSS_ON
    self.num_outputs = branch_cfg.OUT_CHANNELS
    norm = branch_cfg.NORM
    num_convs = branch_cfg.NUM_CONVS
    channels = branch_cfg.CHANNELS
    # Output stride follows the first (highest-resolution) input feature.
    self.out_stride = input_shape[self.in_features[0]].stride
    feature_channels = {name: spec.channels for name, spec in input_shape.items()}
    make_conv = conv_with_kaiming_uniform(norm, activation=True)
    # One 3x3 refine conv per input level, mapping each to a common width.
    self.refine = nn.ModuleList(
        make_conv(feature_channels[name], channels, 3, 1)
        for name in self.in_features
    )
    layers = [make_conv(channels, channels, 3, 1) for _ in range(num_convs)]
    layers.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
    self.add_module('tower', nn.Sequential(*layers))
    if self.sem_loss_on:
        num_classes = cfg.MODEL.FCOS.NUM_CLASSES
        self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
        self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA
        in_channels = feature_channels[self.in_features[0]]
        self.seg_head = nn.Sequential(
            make_conv(in_channels, channels, kernel_size=3, stride=1),
            make_conv(channels, channels, kernel_size=3, stride=1),
        )
        self.logits = nn.Conv2d(channels, num_classes, kernel_size=1, stride=1)
        # Bias init so the initial predicted probability equals PRIOR_PROB.
        prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
        torch.nn.init.constant_(
            self.logits.bias, -math.log((1 - prior_prob) / prior_prob))
def __init__(self, cfg, use_rel_coords=True):
    """IUV head tower with optional partial-conv stem, lambda layers,
    deformable convs, and a down/up-sampling (stride-2 + deconv) variant.

    Args:
        cfg: detectron2-style config node.
        use_rel_coords: unused here; the effective switch is read from
            cfg.MODEL.CONDINST.IUVHead.REL_COORDS.
    """
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    num_dcn_layer = cfg.MODEL.CONDINST.IUVHead.NUM_DCN_LAYER
    assert num_lambda_layer<=num_convs
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    # FCOS sizes of interest, extended with one extra (doubled) top level.
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
    self.use_down_up_sampling = cfg.MODEL.CONDINST.IUVHead.DOWN_UP_SAMPLING
    self.use_partial_conv = cfg.MODEL.CONDINST.IUVHead.PARTIAL_CONV
    self.use_partial_norm = cfg.MODEL.CONDINST.IUVHead.PARTIAL_NORM
    # pdb.set_trace()
    # if self.use_rel_coords:
    #     self.in_channels = channels + 2
    # else:
    # Coordinate input enters either as a sinusoidal position embedding
    # (POSE_EMBEDDING_NUM_FREQS > 0) or as 2 raw (x, y) channels.
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs>0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(multires=self.pos_emb_num_freqs, input_dims=2)
        self.in_channels = agg_channels + self.position_emb_dim
    else:
        self.in_channels = agg_channels + 2
    if self.use_abs_coords:
        # Absolute coordinates add a second coordinate block of the same width.
        if self.use_pos_emb:
            self.in_channels += self.position_emb_dim
        else:
            self.in_channels += 2
    conv_block = conv_with_kaiming_uniform(norm, activation=True)
    partial_conv_block = conv_with_kaiming_uniform(norm, activation=True, use_partial_conv=True)
    deform_conv_block = conv_with_kaiming_uniform(norm, activation=True, use_deformable=True)
    tower = []
    if self.use_partial_conv:
        # pdb.set_trace()
        # Partial-conv stem absorbs the coordinate channels first; the next
        # layer then sees `channels` inputs (self.in_channels is rebound).
        layer = partial_conv_block(self.in_channels, channels, 3, 1)
        tower.append(layer)
        self.in_channels = channels
    if num_lambda_layer>0:
        layer = LambdaLayer(
            dim = self.in_channels,
            dim_out = channels,
            r = lambda_layer_r, # the receptive field for relative positional encoding (23 x 23)
            dim_k = 16,
            heads = 4,
            dim_u = 4
        )
        tower.append(layer)
    else:
        tower.append(conv_block(
            self.in_channels, channels, 3, 1
        ))
    if num_dcn_layer>0:
        # Optional deformable conv after the stem.
        tower.append(deform_conv_block(
            channels, channels, 3, 1
        ))
    if self.use_down_up_sampling:
        # Encoder-decoder variant: stride-2 conv down at i==1 (doubling the
        # width), then 3x3 convs, then a deconv back up to the outputs.
        for i in range(1,num_convs):
            if i==1:
                tower.append(conv_block(
                    channels, channels*2, 3, 2
                ))
            else:
                tower.append(conv_block(
                    channels*2, channels*2, 3, 1
                ))
        tower.append(ConvTranspose2d(
            channels*2, self.num_outputs, 4, stride=2, padding=int(4 / 2 - 1)
        ))
    else:
        # Flat variant: 3x3 convs followed by a 1x1 output projection.
        for i in range(1,num_convs):
            tower.append(conv_block(
                channels, channels, 3, 1
            ))
        tower.append(nn.Conv2d(
            channels, max(self.num_outputs, 1), 1
        ))
    self.add_module('tower', nn.Sequential(*tower))
def __init__(self, cfg, use_rel_coords=True):
    """IUV head whose layers are registered individually (``layer_0`` ...
    ``layer_N``) rather than as one Sequential, so the forward pass can
    re-concatenate the extra coordinate channels before every layer.

    Args:
        cfg: detectron2-style config node.
        use_rel_coords: unused here; the effective switch is read from
            cfg.MODEL.CONDINST.IUVHead.REL_COORDS.
    """
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    assert num_lambda_layer <= num_convs
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
    # Extra per-layer input channels: a sinusoidal position embedding when
    # POSE_EMBEDDING_NUM_FREQS > 0, otherwise 2 raw (x, y) channels;
    # doubled when absolute coordinates are also concatenated.
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    extra_channels = 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        extra_channels += self.position_emb_dim
    else:
        extra_channels += 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            extra_channels += self.position_emb_dim
        else:
            extra_channels += 2
    conv_block = conv_with_kaiming_uniform(norm, activation=True)
    cnt = 0
    self.layers = []
    # First layer consumes the aggregated mask-branch features plus coords.
    if num_lambda_layer > 0:
        layer = LambdaLayer(
            dim=agg_channels + extra_channels,
            dim_out=channels,
            r=lambda_layer_r,  # receptive field for relative positional encoding (23 x 23)
            dim_k=16,
            heads=4,
            dim_u=4)
    else:
        # BUGFIX: was `conv_block(channels + extra_channels, ...)`, which
        # mismatches the actual input width whenever agg_channels != channels;
        # now mirrors the LambdaLayer branch above.
        layer = conv_block(agg_channels + extra_channels, channels, 3, 1)
    setattr(self, 'layer_{}'.format(cnt), layer)
    self.layers.append(layer)
    cnt += 1
    # Subsequent layers each see `channels` features re-joined with coords.
    for i in range(1, num_convs):
        if i < num_lambda_layer:
            layer = LambdaLayer(
                dim=channels + extra_channels,
                dim_out=channels,
                r=lambda_layer_r,  # receptive field for relative positional encoding (23 x 23)
                dim_k=16,
                heads=4,
                dim_u=4)
        else:
            layer = conv_block(channels + extra_channels, channels, 3, 1)
        setattr(self, 'layer_{}'.format(cnt), layer)
        self.layers.append(layer)
        cnt += 1
    # Final 1x1 projection to the output channels.
    layer = nn.Conv2d(channels + extra_channels, max(self.num_outputs, 1), 1)
    setattr(self, 'layer_{}'.format(cnt), layer)
    self.layers.append(layer)
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
    """Mask branch with optional ASPP / SAN / spatial-channel attention /
    skeleton-feature variants.

    Builds one refine conv per input level (deeper levels are upsampled by
    2**idx back to the first level's resolution), the optional attention
    sub-modules selected by cfg, and a conv tower producing the mask
    features.
    """
    super().__init__()
    self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
    self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
    self.num_outputs = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
    num_convs = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS
    # Output stride follows the first configured input feature.
    self.out_stride = input_shape[
        cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES[0]].stride
    self.use_aspp = cfg.MODEL.CONDINST.MASK_BRANCH.USE_ASPP
    self.use_san = cfg.MODEL.CONDINST.MASK_BRANCH.USE_SAN
    self.san_type = cfg.MODEL.CONDINST.SAN_TYPE
    self.use_attn = cfg.MODEL.CONDINST.MASK_BRANCH.USE_ATTN
    self.attn_type = cfg.MODEL.CONDINST.ATTN_TYPE
    self.checkpoint_grad_num = cfg.MODEL.CONDINST.CHECKPOINT_GRAD_NUM
    self.v2 = cfg.MODEL.CONDINST.v2
    self.use_res_input = cfg.MODEL.CONDINST.MASK_BRANCH.RESIDUAL_INPUT
    self.use_res_after_relu = cfg.MODEL.CONDINST.MASK_BRANCH.RESIDUAL_SKIP_AFTER_RELU
    self.use_agg_feat = cfg.MODEL.CONDINST.IUVHead.USE_AGG_FEATURES
    if self.use_agg_feat:
        # Aggregated-feature mode takes its input levels from ROI_HEADS.
        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
    self.use_weight_std = cfg.MODEL.CONDINST.IUVHead.WEIGHT_STANDARDIZATION
    self.use_eca = cfg.MODEL.CONDINST.IUVHead.Efficient_Channel_Attention
    self.tf_embed_dim = cfg.MODEL.CONDINST.MASK_BRANCH.TREE_FILTER_EMBED_DIM
    self.tf_group_num = cfg.MODEL.CONDINST.MASK_BRANCH.TREE_FILTER_GROUP_NUM
    self.add_skeleton_feat = cfg.MODEL.CONDINST.IUVHead.SKELETON_FEATURES
    feature_channels = {k: v.channels for k, v in input_shape.items()}
    conv_block_no_act = conv_with_kaiming_uniform(
        norm, activation=False, use_weight_std=self.use_weight_std)
    conv_block = conv_with_kaiming_uniform(
        norm, activation=True, use_weight_std=self.use_weight_std)
    self.use_decoder = False
    self.refine = nn.ModuleList()
    self.tf = nn.ModuleList()
    for idx, in_feature in enumerate(self.in_features):
        if idx > 0 and in_feature not in ["p6", "p7"]:
            # Deeper levels: refine then upsample by 2**idx back to the
            # first level's resolution. No activation when skeleton
            # features are concatenated afterwards.
            refine_conv = (conv_block_no_act if self.add_skeleton_feat
                           else conv_block)
            self.refine.append(
                nn.Sequential(*[
                    refine_conv(feature_channels[in_feature],
                                agg_channels, 3, 1),
                    nn.Upsample(scale_factor=2**idx)
                ]))
        else:
            self.refine.append(
                conv_block(feature_channels[in_feature], agg_channels, 3, 1))
    if self.add_skeleton_feat:
        # 55 extra channels carry the skeleton feature maps.
        self.conv_skeleton = conv_block(agg_channels + 55, agg_channels, 3, 1)
    if self.use_eca:
        self.eca = eca_layer(agg_channels, k_size=3)
    if self.use_aspp:
        self.ASPP = ASPP_share_attn(agg_channels, [1, 2, 3],
                                    agg_channels)  # 6, 12, 56
        self.add_module("ASPP", self.ASPP)
    if self.use_san:
        sa_type = 1  # 0: pairwise; 1: patchwise
        if self.san_type == "SAN_BottleneckGN":
            san_func = SAN_BottleneckGN
        elif self.san_type == "SAN_BottleneckGN_GatedEarly":
            san_func = SAN_BottleneckGN_GatedEarly
        elif self.san_type == "SAN_BottleneckGN_Gated":
            # BUGFIX: the assignment was missing here — the class name was
            # evaluated and discarded, leaving `san_func` unbound for this
            # san_type and crashing with NameError below.
            san_func = SAN_BottleneckGN_Gated
        else:
            raise ValueError(
                "Unknown SAN_TYPE: {}".format(self.san_type))
        self.san_blks = []
        for idx in range(len(self.in_features)):
            san_blk = san_func(sa_type, agg_channels, agg_channels // 16,
                               agg_channels // 4, agg_channels, 8,
                               kernel_size=7, stride=1)
            self.add_module("san_blk_{}".format(idx), san_blk)
            self.san_blks.append(san_blk)
    if self.use_attn:
        ks = 7
        if self.attn_type == "Spatial_Attn":
            # Attention over the concatenation of all input levels.
            ch_in = sum([feature_channels[k] for k in self.in_features])
            ch_out = len(self.in_features)
            self.attn_blk = nn.Sequential(*[
                nn.Conv2d(ch_in, ch_out, kernel_size=ks, stride=1,
                          padding=ks // 2, bias=False),
                nn.Softmax(dim=1)
            ])
        elif self.attn_type == "SpatialMaxAvg_Attn":
            # Attention over per-level max+avg pooled maps.
            ch_in = len(self.in_features) * 2
            ch_out = len(self.in_features)
            self.attn_blk = nn.Sequential(*[
                nn.Conv2d(ch_in, ch_out, kernel_size=ks, stride=1,
                          padding=ks // 2, bias=False),
                nn.Softmax(dim=1)
            ])
        elif self.attn_type == "SpatialMaxAvg_ChannelMaxAvg_Attn":
            ch_in = len(self.in_features) * 2
            ch_out = len(self.in_features)
            self.attn_blk = nn.Sequential(*[
                nn.Conv2d(ch_in, ch_out, kernel_size=ks, stride=1,
                          padding=ks // 2, bias=False),
                nn.Softmax(dim=1)
            ])
            # TODO: channel attention (was a bare no-op string literal).
            # Squeeze-excite style MLPs per input level, one pair for
            # max-pooled and one for avg-pooled descriptors.
            self.ch_attn_max_list = []
            self.ch_attn_avg_list = []
            reduct_ratio = 16
            for idx, key in enumerate(self.in_features):
                width = feature_channels[key]
                ch_attn_max = nn.Sequential(*[
                    nn.Linear(width, width // reduct_ratio),
                    nn.ReLU(inplace=True),
                    nn.Linear(width // reduct_ratio, width),
                ])
                self.add_module("ch_attn_max_{}".format(idx), ch_attn_max)
                self.ch_attn_max_list.append(ch_attn_max)
                ch_attn_avg = nn.Sequential(*[
                    nn.Linear(width, width // reduct_ratio),
                    nn.ReLU(inplace=True),
                    nn.Linear(width // reduct_ratio, width),
                ])
                self.add_module("ch_attn_avg_{}".format(idx), ch_attn_avg)
                self.ch_attn_avg_list.append(ch_attn_avg)
    # "p2" input starts one stride finer, so the first tower conv
    # downsamples (stride 2); otherwise keep resolution.
    if "p2" == self.in_features[0]:
        tower = [conv_block(agg_channels, channels, 3, 2, 1)]
    else:
        tower = [conv_block(agg_channels, channels, 3, 1)]
    for i in range(1, num_convs):
        tower.append(conv_block(channels, channels, 3, 1))
    tower.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
    if self.use_res_input or self.use_res_after_relu:
        # Residual variants need per-layer access in forward, so register
        # each layer individually and keep the plain list.
        for idx, layer in enumerate(tower):
            self.add_module('tower_layer{}'.format(idx), layer)
        self.tower = tower
    else:
        self.add_module('tower', nn.Sequential(*tower))
def __init__(self, cfg, use_rel_coords=True):
    """Three-branch IUV head: towers ``tower0``/``tower1``/``tower2`` with
    identical structure, fused by ``tower_out`` (1x1 conv on the channel
    concatenation, then 3x3 convs).

    Args:
        cfg: detectron2-style config node.
        use_rel_coords: unused here; the effective switch is read from
            cfg.MODEL.CONDINST.IUVHead.REL_COORDS.
    """
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
    assert num_lambda_layer <= num_convs
    agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
    self.use_partial_conv = cfg.MODEL.CONDINST.IUVHead.PARTIAL_CONV
    self.use_partial_norm = cfg.MODEL.CONDINST.IUVHead.PARTIAL_NORM
    # Coordinate input: sinusoidal position embedding if enabled, else 2 raw
    # (x, y) channels; doubled when absolute coordinates are also used.
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        self.in_channels = agg_channels + self.position_emb_dim
    else:
        self.in_channels = agg_channels + 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            self.in_channels += self.position_emb_dim
        else:
            self.in_channels += 2
    if self.use_partial_conv:
        conv_block = conv_with_kaiming_uniform(norm, activation=True,
                                               use_partial_conv=True)
    else:
        conv_block = conv_with_kaiming_uniform(norm, activation=True)
    # NOTE: dropped the unused BN conv builder (conv_block_bn) that fed only
    # a commented-out attention tower.
    num_layer = 3

    def build_branch_tower():
        # One identical branch: a lambda (or conv) input layer absorbing the
        # coordinate channels, followed by num_layer 3x3 convs.
        # The construction was previously triplicated verbatim for
        # tower0/tower1/tower2.
        branch = []
        if num_lambda_layer > 0:
            branch.append(LambdaLayer(
                dim=self.in_channels,
                dim_out=channels,
                r=lambda_layer_r,  # receptive field for relative positional encoding (23 x 23)
                dim_k=8,
                heads=4,
                dim_u=4))
        else:
            branch.append(conv_block(self.in_channels, channels, 3, 1))
        for _ in range(num_layer):
            branch.append(conv_block(channels, channels, 3, 1))
        return nn.Sequential(*branch)

    # Same registration order and module names (tower0/1/2) as before.
    for branch_idx in range(3):
        self.add_module('tower{}'.format(branch_idx), build_branch_tower())
    tower_out = []
    for i in range(num_convs - num_layer - 1):
        if i == 0:
            # Fuse the three concatenated branches with a 1x1 conv.
            tower_out.append(conv_block(channels * 3, channels, 1, 1))
        else:
            tower_out.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower_out', nn.Sequential(*tower_out))
def __init__(self, cfg, use_rel_coords=True):
    """IUV head with a shared tower plus three output resolutions:
    mid (same stride), low (stride-2 conv), and high (deconv upsample),
    each with its own 1x1 output projection.

    Args:
        cfg: detectron2-style config node.
        use_rel_coords: unused here; the effective switch is read from
            cfg.MODEL.CONDINST.IUVHead.REL_COORDS.
    """
    super().__init__()
    self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
    norm = cfg.MODEL.CONDINST.IUVHead.NORM
    num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
    num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
    assert num_lambda_layer <= num_convs
    channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
    # FCOS sizes of interest, extended with one extra (doubled) top level.
    soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
    self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
    self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
    self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
    # pdb.set_trace()
    # if self.use_rel_coords:
    #     self.in_channels = channels + 2
    # else:
    # Coordinate input: sinusoidal position embedding if enabled, else 2 raw
    # (x, y) channels; doubled when absolute coordinates are also used.
    self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
    self.use_pos_emb = self.pos_emb_num_freqs > 0
    if self.use_pos_emb:
        self.position_embedder, self.position_emb_dim = get_embedder(
            multires=self.pos_emb_num_freqs, input_dims=2)
        self.in_channels = channels + self.position_emb_dim
    else:
        self.in_channels = channels + 2
    if self.use_abs_coords:
        if self.use_pos_emb:
            self.in_channels += self.position_emb_dim
        else:
            self.in_channels += 2
    conv_block = conv_with_kaiming_uniform(norm, activation=True)
    tower = []
    if num_lambda_layer > 0:
        layer = LambdaLayer(
            dim=self.in_channels,
            dim_out=channels,
            r=23,  # the receptive field for relative positional encoding (23 x 23)
            dim_k=16,
            heads=4,
            dim_u=4)
        tower.append(layer)
    else:
        tower.append(conv_block(self.in_channels, channels, 3, 1))
    # Shared trunk: stops at num_convs - 1 because the final stage is split
    # into the three per-resolution convs below.
    for i in range(1, num_convs - 1):
        if i < num_lambda_layer:
            layer = LambdaLayer(
                dim=channels,
                dim_out=channels,
                r=23,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=16,
                heads=4,
                dim_u=4)
            tower.append(layer)
        else:
            tower.append(conv_block(channels, channels, 3, 1))
    self.add_module('tower', nn.Sequential(*tower))
    # Mid resolution: plain 3x3 conv at the trunk's stride.
    self.mid_res_conv = conv_block(channels, channels, 3, 1)
    self.mid_res_out = nn.Conv2d(channels, self.num_outputs, 1)
    # Low resolution: stride-2 conv halves the spatial size.
    self.low_res_conv = conv_block(channels, channels, 3, 2)
    self.low_res_out = nn.Conv2d(channels, self.num_outputs, 1)
    # High resolution: transposed conv doubles the spatial size.
    deconv_block = conv_with_kaiming_uniform(norm, activation=True,
                                             use_deconv=True)
    self.high_res_conv = deconv_block(channels, channels, 3, 2)
    self.high_res_out = nn.Conv2d(channels, self.num_outputs, 1)