def __init__(self, out_planes, is_training):
    """Build a ResNet-18 backbone with FCN-style refinement and upsampling.

    ``BN2D`` and ``config`` are not parameters of this constructor; they are
    resolved from the enclosing module.

    Args:
        out_planes: Channel count produced by every refinement layer and
            every transposed convolution.
        is_training: Stored unchanged as ``self.is_training``.
    """
    super(Network_UNet, self).__init__()
    self.is_training = is_training
    self.layers = []

    # Base model: ResNet-18 from resnet.py, built without pretrained weights.
    self.resnet = resnet18(
        pretrained_model=None,
        norm_layer=BN2D,
        bn_eps=config.bn_eps,
        bn_momentum=config.bn_momentum,
        deep_stem=False,
        stem_width=64,
    )

    # Tail refinement on the FCN-style contraction path: 7x7 conv to 1024
    # channels, 1x1 conv to out_planes, dropout after each, then global
    # average pooling down to a 1x1 map.
    self.refine_512 = nn.Sequential(
        ConvBnRelu(512, 1024, 7, 1, 3,
                   has_bn=False, has_relu=True, has_bias=False,
                   norm_layer=BN2D),
        nn.Dropout2d(),
        ConvBnRelu(1024, out_planes, 1, 1, 0,
                   has_bn=False, has_relu=True, has_bias=False,
                   norm_layer=BN2D),
        nn.Dropout2d(),
        nn.AdaptiveAvgPool2d(1),
    )

    # Upscale the tail output using a transposed convolution.
    self.up_512 = nn.ConvTranspose2d(out_planes, out_planes,
                                     kernel_size=4, stride=2, padding=2,
                                     output_padding=1)

    # 1x1 projections (no BN, no ReLU, no bias) of the intermediate
    # 256/128/64-channel features to out_planes, FCN style.
    self.refine_256 = ConvBnRelu(256, out_planes, 1, 1, 0,
                                 has_bn=False, has_relu=False,
                                 has_bias=False, norm_layer=BN2D)
    self.refine_128 = ConvBnRelu(128, out_planes, 1, 1, 0,
                                 has_bn=False, has_relu=False,
                                 has_bias=False, norm_layer=BN2D)
    self.refine_64 = ConvBnRelu(64, out_planes, 1, 1, 0,
                                has_bn=False, has_relu=False,
                                has_bias=False, norm_layer=BN2D)

    # Upscale using transposed convolutions (stride 2, 2, then 4).
    self.up_256 = nn.ConvTranspose2d(out_planes, out_planes,
                                     kernel_size=4, stride=2, padding=1)
    self.up_128 = nn.ConvTranspose2d(out_planes, out_planes,
                                     kernel_size=4, stride=2, padding=1)
    self.up_final = nn.ConvTranspose2d(out_planes, out_planes,
                                       kernel_size=8, stride=4, padding=2)

    # Plain-list view of the sub-modules; the order is kept exactly as the
    # original appended them in case callers index into it.
    self.layers.extend([
        self.resnet,
        self.refine_512,
        self.refine_256,
        self.refine_128,
        self.refine_64,
        self.up_512,
        self.up_256,
        self.up_128,
        self.up_final,
    ])

    # Mean-reduced cross entropy; targets equal to 255 are ignored.
    self.loss = nn.CrossEntropyLoss(reduction='mean', ignore_index=255)
def __init__(self, out_planes, is_training, criterion,
             pretrained_model=None, norm_layer=nn.BatchNorm2d):
    """Wire up the ResNet-18 encoder and the task-specific heads.

    Args:
        out_planes: Output channel count passed to each ``RefineOutput``.
        is_training: Stored as ``self.is_training``; when truthy the
            criterion is kept on the instance as well.
        criterion: Loss object, only stored when ``is_training`` is truthy.
        pretrained_model: Forwarded as the first argument of ``resnet18``.
        norm_layer: Normalisation layer class handed to the encoder and to
            ``spatial_conv``.
    """
    super(conf, self).__init__()
    self.is_training = is_training
    self.business_layer = []
    if is_training:
        self.criterion = criterion

    # Encoder: ResNet-18. It is deliberately not part of business_layer.
    encoder_options = dict(
        norm_layer=norm_layer,
        bn_eps=config.bn_eps,
        bn_momentum=config.bn_momentum,
        deep_stem=False,
        stem_width=64,
    )
    self.encoder = resnet18(pretrained_model, **encoder_options)

    # Heads, created in the same order as they are registered on the module.
    self.context_ff = AttentionFusion(256, 512, 128)
    self.spatial_conv = ConvBnRelu(
        64, 128, 1, 1, 0,
        dilation=1,
        has_bn=True,
        norm_layer=norm_layer,
        has_relu=True,
        has_bias=False,
    )
    self.loc_conf = LocationConfidence(128 + 128, 1)
    self.refine_block = RefineOutput(128, out_planes, 4)
    self.spatial_refine_block = RefineOutput(128, out_planes, 4)
    self.context_refine_block = RefineOutput(128, out_planes, 16)

    # Every non-encoder sub-module, collected in a plain Python list.
    self.business_layer.extend((
        self.context_ff,
        self.spatial_conv,
        self.loc_conf,
        self.refine_block,
        self.spatial_refine_block,
        self.context_refine_block,
    ))
# Training/evaluation script fragment. `parser` is created before this point,
# and the final `torch.optim.SGD(` call continues past the end of this line.
#
# Steps visible here:
#   1. Register the --epoch, --learning_rate, --momentum, --weight_decay,
#      --stage and --evaluate options, then parse the command line. Other
#      attributes read below (train_batch, val_batch, num_workers, path,
#      pretrained) are presumably registered earlier -- TODO confirm.
#   2. Build the train loader (shuffled, DataAllocate collate) and the test
#      loader (unshuffled, TestAllocate collate).
#   3. Create the vision backbone (pretrained=True) and the audio backbone.
#   4. If --evaluate is non-zero: wrap both in Location, load the checkpoint
#      at args.path, run test() and exit.
#   5. Otherwise build MTask when --stage == 1, else Align; when
#      args.pretrained is set, load args.path with strict=False.
#   6. Start constructing an SGD optimizer over all network parameters.
#
# NOTE(review): torch.load unpickles the checkpoint file -- only point
# args.path at trusted files.
parser.add_argument('--epoch', type=int, default=20) parser.add_argument('--learning_rate', type=float, default=0.001) parser.add_argument('--momentum', type=float, default=0.9) parser.add_argument('--weight_decay', type=float, default=5e-4) parser.add_argument('--stage', type=int, default=1) parser.add_argument('--evaluate', type=int, default=0) args = parser.parse_args() trainset = AudioVisualData() testset = TestData() trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, collate_fn=DataAllocate, num_workers=args.num_workers) testloader = data.DataLoader(testset, batch_size=args.val_batch, shuffle=False, collate_fn=TestAllocate, num_workers=args.num_workers) vision_net = resnet18(modal='vision', pretrained=True) audio_net = resnet18(modal='audio') if args.evaluate: net = Location(vision_net, audio_net).cuda() net.load_state_dict(torch.load(args.path)) test(net, testloader) exit() net = MTask(vision_net, audio_net).cuda() if args.stage == 1 \ else Align(vision_net, audio_net).cuda() if args.pretrained: net.load_state_dict(torch.load(args.path), strict=False) params = list(net.parameters()) optimizer = torch.optim.SGD(params=params, lr=args.learning_rate,
def __init__(self, out_planes, is_training, criterion,
             pretrained_model=None, norm_layer=nn.BatchNorm2d):
    """Construct the context path, spatial path, attention/refine stages and heads.

    Args:
        out_planes: Output channel count passed to every ``BiSeNetHead``.
        is_training: Stored as ``self.is_training``; when truthy the
            criterion is stored too.
        criterion: Loss object, only kept when ``is_training`` is truthy.
        pretrained_model: Forwarded as the first argument of ``resnet18``.
        norm_layer: Normalisation layer class shared by all sub-modules.
    """
    super(BiSeNet, self).__init__()

    # Context path: ResNet-18 backbone (not added to business_layer).
    self.context_path = resnet18(
        pretrained_model,
        norm_layer=norm_layer,
        bn_eps=config.bn_eps,
        bn_momentum=config.bn_momentum,
        deep_stem=False,
        stem_width=64,
    )

    self.business_layer = []
    self.is_training = is_training

    self.spatial_path = SpatialPath(3, 128, norm_layer)

    conv_channel = 128

    # Global context: pool to 1x1, then a 1x1 conv from 512 to conv_channel.
    self.global_context = nn.Sequential(
        nn.AdaptiveAvgPool2d(1),
        ConvBnRelu(512, conv_channel, 1, 1, 0,
                   has_bn=True, has_relu=True, has_bias=False,
                   norm_layer=norm_layer),
    )

    # stage = [512, 256, 128, 64]
    # Attention refinement for the 512- and 256-channel stages.
    arm_modules = [
        AttentionRefinement(stage_channels, conv_channel, norm_layer)
        for stage_channels in (512, 256)
    ]

    # One 3x3 refinement conv per attention stage.
    refine_modules = [
        ConvBnRelu(conv_channel, conv_channel, 3, 1, 1,
                   has_bn=True, norm_layer=norm_layer,
                   has_relu=True, has_bias=False)
        for _ in range(2)
    ]

    # (in_channels, scale, flag) for each head, in creation order.
    head_specs = (
        (conv_channel, 16, True),
        (conv_channel, 8, True),
        (conv_channel * 2, 8, False),
    )
    head_modules = [
        BiSeNetHead(in_channels, out_planes, scale, flag, norm_layer)
        for in_channels, scale, flag in head_specs
    ]

    # ffm is registered before the ModuleLists, as in the original.
    self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2,
                             1, norm_layer)

    self.arms = nn.ModuleList(arm_modules)
    self.refines = nn.ModuleList(refine_modules)
    self.heads = nn.ModuleList(head_modules)

    # Everything except the backbone, collected in a plain Python list.
    self.business_layer.extend((
        self.spatial_path,
        self.global_context,
        self.arms,
        self.refines,
        self.heads,
        self.ffm,
    ))

    if is_training:
        self.criterion = criterion
def __init__(self, out_planes, is_training, BN2D=BatchNorm2d):
    """Set up the ResNet-18 context network, its refinement stages and classifier.

    Args:
        out_planes: Channel count of the final 1x1 classification conv.
        is_training: Stored unchanged as ``self.is_training``.
        BN2D: Normalisation layer class handed to every sub-module.
    """
    super(Network_Res18, self).__init__()
    self.layers = []
    self.is_training = is_training

    conv_channel = 128
    half_channel = conv_channel // 2

    # Base model: ResNet-18 from resnet.py, built without pretrained weights.
    self.context = resnet18(
        pretrained_model=None,
        norm_layer=BN2D,
        bn_eps=config.bn_eps,
        bn_momentum=config.bn_momentum,
        deep_stem=False,
        stem_width=64,
    )

    # Pool to 1x1, then a 1x1 conv from 512 to conv_channel.
    self.context_refine = nn.Sequential(
        nn.AdaptiveAvgPool2d(1),
        ConvBnRelu(512, conv_channel, 1, 1, 0,
                   has_bn=True, has_relu=True, has_bias=False,
                   norm_layer=BN2D),
    )

    # ARM for ResBlock 2, 3, 4 of the resnet output (512/256/128 channels).
    arm_modules = [
        AttentionRefinement(block_channels, conv_channel, norm_layer=BN2D)
        for block_channels in (512, 256, 128)
    ]

    # One 3x3 refinement conv for each ARM output.
    refine_modules = [
        ConvBnRelu(conv_channel, conv_channel, 3, 1, 1,
                   has_bn=True, norm_layer=BN2D,
                   has_relu=True, has_bias=False)
        for _ in range(3)
    ]

    self.arms = nn.ModuleList(arm_modules)
    self.refines = nn.ModuleList(refine_modules)

    # Refinement on the first (64-channel) layer of the resnet output.
    self.res_top_refine = ConvBnRelu(64, half_channel, 3, 1, 1,
                                     has_bn=True, norm_layer=BN2D,
                                     has_relu=True, has_bias=False)

    # NOTE(review): 192 presumably equals conv_channel + conv_channel // 2
    # (refined context plus res_top_refine output) -- confirm in forward().
    self.ffm = FeatureFusion(192, conv_channel, 1, BN2D)

    # Classifier for the final output: 3x3 conv block, then a 1x1 conv.
    self.class_refine = nn.Sequential(
        ConvBnRelu(conv_channel, half_channel, 3, 1, 1,
                   has_bn=True, has_relu=True, has_bias=False,
                   norm_layer=BN2D),
        nn.Conv2d(half_channel, out_planes,
                  kernel_size=1, stride=1, padding=0),
    )

    # Plain-list view of the sub-modules, in the original append order.
    for sub_module in (
        self.context,
        self.class_refine,
        self.context_refine,
        self.arms,
        self.ffm,
        self.refines,
        self.res_top_refine,
    ):
        self.layers.append(sub_module)

    # Mean-reduced cross entropy; targets equal to 255 are ignored.
    self.loss = nn.CrossEntropyLoss(reduction='mean', ignore_index=255)