class SSD(nn.Module):
    """SSD detection head on top of a MobileNet backbone.

    Six feature maps feed the detection heads: two taken from the backbone
    (``base_net.base_net``) and one after each of the four additional
    feature-extractor stages. Every feature map has its own 3x3 conv for box
    offsets (``loc_regressor``) and for class scores (``classifier``).
    """

    def __init__(self, num_classes):
        """Build the backbone, extra stages and heads, then load backbone weights.

        :param num_classes: number of class scores predicted per prior box
            (also forwarded to the ``MobileNet`` constructor).
        """
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Backbone layer indices at which feature maps are tapped.
        self.base_output_layer_indices = (11, 13)

        def extra_stage(in_ch, mid_ch, out_ch):
            # 1x1 bottleneck followed by a stride-2 3x3 conv (halves H and W).
            return nn.Sequential(
                nn.Conv2d(in_channels=in_ch, out_channels=mid_ch, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=mid_ch, out_channels=out_ch, kernel_size=3,
                          stride=2, padding=1),
                nn.ReLU())

        # Additional feature extractor stages appended after the backbone.
        self.additional_feat_extractor = nn.ModuleList([
            extra_stage(1024, 256, 512),  # Conv8_2
            extra_stage(512, 128, 256),   # Conv9_2
            extra_stage(256, 128, 256),
            extra_stage(256, 128, 256),
        ])

        num_prior_bbox = 6  # num of prior bounding boxes per feature-map cell
        # Input channels of the six tapped feature maps, in forward order.
        head_in_channels = (512, 1024, 512, 256, 256, 256)

        # Bounding box offset regressor: 4 values per prior.
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=ch, out_channels=num_prior_bbox * 4,
                      kernel_size=3, padding=1)
            for ch in head_in_channels
        ])

        # Bounding box classification confidence: num_classes values per prior.
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=ch, out_channels=num_prior_bbox * num_classes,
                      kernel_size=3, padding=1)
            for ch in head_in_channels
        ])

        # Load pre-trained backbone weights. map_location='cpu' lets a
        # checkpoint that was saved on a GPU be read on a CPU-only machine;
        # load_state_dict copies the values into the existing parameters.
        temp_state = torch.load('pretrained/mobienetv2.pth', map_location='cpu')
        cur_dict = self.base_net.state_dict()
        # Keep only entries whose name AND shape match the current backbone.
        input_state = {
            k: v for k, v in temp_state.items()
            if k in cur_dict and v.size() == cur_dict[k].size()
        }
        cur_dict.update(input_state)
        self.base_net.load_state_dict(cur_dict)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be feed in
        :return: confidence and location, with dim:(N, num_priors, num_classes)
                 and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)
        num_batch = conf.shape[0]

        # (N, num_prior_bbox * n_classes, H, W) -> (N, H*W*num_prior_bbox, n_classes).
        # Channels go last so each group of n_classes values stays contiguous;
        # -1 lets view() derive num_priors with exact integer arithmetic.
        conf = conf.permute(0, 2, 3, 1).contiguous()
        conf = conf.view(num_batch, -1, self.num_classes)

        # (N, num_prior_bbox * 4, H, W) -> (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, -1, 4)

        return conf, loc

    def forward(self, input):
        """Run the detector.

        :param input: image batch fed to the backbone.
        :return: (confidences, locations) with dim (N, num_priors, num_classes)
                 and (N, num_priors, 4). Confidences are softmax-normalised
                 over the class axis only when the module is not in training
                 mode.
        """
        confidence_list = []
        loc_list = []
        first_tap, second_tap = self.base_output_layer_indices

        # Backbone up to the first tap.
        y = module_util.forward_from(self.base_net.base_net, 0, first_tap + 1, input)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Backbone from the first tap to the second tap.
        # NOTE(review): the previous call ended at first_tap + 1 while this one
        # starts at first_tap; if forward_from treats the end index as
        # exclusive, layer `first_tap` runs twice -- confirm against module_util.
        y = module_util.forward_from(self.base_net.base_net, first_tap, second_tap + 1, y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Additional stages: heads 0 and 1 belong to the backbone taps, so the
        # head for extra stage `idx` lives at position idx + 2.
        for idx, stage in enumerate(self.additional_feat_extractor):
            y = module_util.forward_from(stage, 0, 4, y)
            confidence, loc = self.feature_to_bbbox(
                self.loc_regressor[idx + 2], self.classifier[idx + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3  # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
class SSD(nn.Module):
    """SSD detection head on top of a MobileNet backbone (``conv_layers``).

    The backbone is tapped twice and the additional feature-extractor stages
    are run after it; each tapped feature map has its own 3x3 conv for box
    offsets (``loc_regressor``) and class scores (``classifier``).
    """

    def __init__(self, num_classes=4):
        """Build the backbone, extra stages and heads, then load backbone weights.

        :param num_classes: number of class scores predicted per prior box
            (also forwarded to the ``MobileNet`` constructor).
        """
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Backbone layer indices at which feature maps are tapped.
        self.base_output_layer_indices = (11, 13)

        def extra_stage(in_ch, mid_ch, out_ch, stride):
            # 1x1 bottleneck followed by a padded 3x3 conv.
            return nn.Sequential(
                nn.Conv2d(in_channels=in_ch, out_channels=mid_ch, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=mid_ch, out_channels=out_ch, kernel_size=3,
                          stride=stride, padding=1),
                nn.ReLU())

        # Additional feature extractor stages appended after the backbone.
        self.additional_feat_extractor = nn.ModuleList([
            extra_stage(1024, 256, 512, stride=2),  # Conv8_2
            extra_stage(512, 128, 256, stride=2),   # Conv9_2
            extra_stage(256, 128, 256, stride=1),   # Conv10_2
            extra_stage(256, 128, 256, stride=1),   # Conv11_2
        ])

        num_prior_bbox = 6  # num of prior bounding boxes per feature-map cell
        # Input channels expected by the six head pairs, in order.
        head_in_channels = (512, 1024, 512, 256, 256, 256)

        # Bounding box offset regressor: 4 values per prior.
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=ch, out_channels=num_prior_bbox * 4,
                      kernel_size=3, padding=1)
            for ch in head_in_channels
        ])

        # Bounding box classification confidence: num_classes values per prior.
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=ch, out_channels=num_prior_bbox * num_classes,
                      kernel_size=3, padding=1)
            for ch in head_in_channels
        ])

        # Load pre-trained backbone weights. map_location='cpu' lets a
        # checkpoint that was saved on a GPU be read on a CPU-only machine.
        pretrained_model = torch.load("./pretrained/mobienetv2.pth", map_location='cpu')
        my_model = self.base_net.state_dict()
        # 1. keep only entries the backbone has AND whose shape matches;
        #    a same-named tensor of a different shape would make
        #    load_state_dict raise.
        pretrained_model = {
            k: v for k, v in pretrained_model.items()
            if k in my_model and v.size() == my_model[k].size()
        }
        # 2. overwrite entries in the existing state dict
        my_model.update(pretrained_model)
        # 3. load the new state dict
        self.base_net.load_state_dict(my_model)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be feed in
        :return: confidence and location, with dim:(N, num_priors, num_classes)
                 and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)
        num_batch = conf.shape[0]

        # (N, num_prior_bbox * n_classes, H, W) -> (N, H*W*num_prior_bbox, n_classes).
        # -1 lets view() derive num_priors with exact integer arithmetic.
        conf = conf.permute(0, 2, 3, 1).contiguous()
        conf = conf.view(num_batch, -1, self.num_classes)

        # (N, num_prior_bbox * 4, H, W) -> (N, num_priors, 4); the prior count
        # is taken from conf so a mismatched head pair fails here.
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, conf.shape[1], 4)

        return conf, loc

    def forward(self, input):
        """Run the detector.

        :param input: image batch fed to the backbone.
        :return: (confidences, locations) with dim (N, num_priors, num_classes)
                 and (N, num_priors, 4). Confidences are softmax-normalised
                 over the class axis only when the module is not in training
                 mode.
        """
        confidence_list = []
        loc_list = []
        first_tap, second_tap = self.base_output_layer_indices

        # Backbone from the start to the first tap.
        y = module_util.forward_from(self.base_net.conv_layers, 0, first_tap, input)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Backbone from the first tap to the second tap.
        y = module_util.forward_from(self.base_net.conv_layers, first_tap, second_tap, y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Additional stages; heads 0 and 1 belong to the backbone taps.
        # NOTE(review): only the first three of the four extra stages are run,
        # so the fourth stage and the sixth head pair are built but unused.
        # Kept as-is because running it would change num_priors -- confirm
        # whether this is intentional.
        for i in range(0, 3):
            y = module_util.forward_from(self.additional_feat_extractor, i, i + 1, y)
            confidence, loc = self.feature_to_bbbox(
                self.loc_regressor[i + 2], self.classifier[i + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3  # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
def _load_model_weights(model, state_dict):
    """Load ``state_dict`` into ``model`` whether or not either side is DataParallel-wrapped."""
    # Always load into the underlying network, not the DataParallel wrapper.
    target = model.module if isinstance(model, nn.DataParallel) else model
    try:
        target.load_state_dict(state_dict)
    except RuntimeError:
        # Key mismatch: the checkpoint was presumably saved from a
        # DataParallel model, so retry without the `module.` prefix.
        stripped = OrderedDict(
            (key[7:] if key[:7] == 'module.' else key, value)
            for key, value in state_dict.items())
        target.load_state_dict(stripped)


def _restore_checkpoint(options, ckpt_file, model, optimizer):
    """Restore model and optimizer state from ``ckpt_file``; return the stored epoch."""
    print("==> Loading Checkpoint '{}'".format(options.ckpt))
    if options.cuda:
        checkpoint = torch.load(
            ckpt_file,
            map_location=lambda storage, loc: storage.cuda(options.gpuids[0]))
    else:
        checkpoint = torch.load(
            ckpt_file, map_location=lambda storage, loc: storage)
    _load_model_weights(model, checkpoint['model'])
    epoch = checkpoint['epoch']
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("==> Loaded Checkpoint '{}' (epoch {})".format(options.ckpt, epoch))
    return epoch


def main():
    """Train, resume or evaluate MobileNet on CIFAR-10 according to ``cfg``.

    Reads its options from the module-level ``cfg`` object and publishes them
    as the global ``opt``. With ``opt.resume`` the checkpoint is restored
    before training; with ``opt.eval`` the checkpoint is restored, validated
    once, and the function returns. Otherwise it trains from ``start_epoch``
    to ``opt.epochs``, validating and saving a checkpoint after every epoch,
    and finally prints timing statistics.
    """
    global opt, start_epoch, best_prec1
    opt = cfg
    opt.gpuids = list(map(int, opt.gpuids))

    if opt.cuda and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without --cuda")

    model = MobileNet()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay,
                          nesterov=True)
    start_epoch = 0
    ckpt_file = join("model", opt.ckpt)

    if opt.cuda:
        torch.cuda.set_device(opt.gpuids[0])
        with torch.cuda.device(opt.gpuids[0]):
            model = model.cuda()
            criterion = criterion.cuda()
        model = nn.DataParallel(model, device_ids=opt.gpuids,
                                output_device=opt.gpuids[0])
        cudnn.benchmark = True

    # for resuming training
    if opt.resume:
        if not isfile(ckpt_file):
            print("==> no checkpoint found at '{}'".format(opt.ckpt))
            return
        start_epoch = _restore_checkpoint(opt, ckpt_file, model, optimizer)

    # Download & Load Dataset
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True,
                                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.workers)

    valset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                          download=True,
                                          transform=transform_val)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=opt.test_batch_size,
                                             shuffle=False,
                                             num_workers=opt.workers)

    # for evaluation
    if opt.eval:
        if not isfile(ckpt_file):
            print("==> no checkpoint found at '{}'".format(opt.ckpt))
            return
        start_epoch = _restore_checkpoint(opt, ckpt_file, model, optimizer)

        # evaluate on validation set
        print("\n===> [ Evaluation ]")
        start_time = time.time()
        validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        print("====> {:.2f} seconds to evaluate this model\n".format(
            elapsed_time))
        return

    # train...
    train_time = 0.0
    validate_time = 0.0
    for epoch in range(start_epoch, opt.epochs):
        adjust_learning_rate(optimizer, epoch)
        print('\n==> Epoch: {}, lr = {}'.format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        print("===> [ Training ]")
        start_time = time.time()
        train(train_loader, model, criterion, optimizer, epoch)
        elapsed_time = time.time() - start_time
        train_time += elapsed_time
        print(
            "====> {:.2f} seconds to train this epoch\n".format(elapsed_time))

        # evaluate on validation set
        print("===> [ Validation ]")
        start_time = time.time()
        prec1 = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        validate_time += elapsed_time
        print("====> {:.2f} seconds to validate this epoch\n".format(
            elapsed_time))

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        state = {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_model(state, epoch, is_best)

    # Average over the epochs actually run in this session: a resumed run
    # starts at start_epoch, and there may be no epochs left to run at all.
    epochs_run = max(opt.epochs - start_epoch, 0)
    avg_train_time = train_time / epochs_run if epochs_run else 0.0
    avg_valid_time = validate_time / epochs_run if epochs_run else 0.0
    total_train_time = train_time + validate_time
    print("====> average training time per epoch: {}m {:.2f}s".format(
        int(avg_train_time // 60), avg_train_time % 60))
    print("====> average validation time per epoch: {}m {:.2f}s".format(
        int(avg_valid_time // 60), avg_valid_time % 60))
    print("====> training time: {}m {:.2f}s".format(int(train_time // 60),
                                                    train_time % 60))
    print("====> validation time: {}m {:.2f}s".format(int(validate_time // 60),
                                                      validate_time % 60))
    print("====> total training time: {}m {:.2f}s".format(
        int(total_train_time // 60), total_train_time % 60))
class SSD(nn.Module):
    """SSD detection head on top of a MobileNet backbone (``conv_layers``).

    Six feature maps feed the detection heads: two taken from the backbone
    and one after each of the four additional feature-extractor stages. Every
    feature map has its own 3x3 conv for box offsets (``loc_regressor``) and
    for class scores (``classifier``).
    """

    def __init__(self, num_classes):
        """Build the backbone, extra stages and heads, then load backbone weights.

        :param num_classes: number of class scores predicted per prior box
            (also forwarded to the ``MobileNet`` constructor).
        """
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Backbone layer indices (6 and 11) at which feature maps are tapped.
        self.base_output_layer_indices = (6, 11)

        def extra_stage(in_ch, mid_ch, out_ch, padding=1):
            # 1x1 bottleneck followed by a stride-2 3x3 conv.
            return nn.Sequential(
                nn.Conv2d(in_channels=in_ch, out_channels=mid_ch, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=mid_ch, out_channels=out_ch, kernel_size=3,
                          stride=2, padding=padding),
                nn.ReLU())

        # Additional feature extractor stages appended after the backbone.
        self.additional_feat_extractor = nn.ModuleList([
            extra_stage(512, 256, 512),             # Conv8_2
            extra_stage(512, 128, 256),             # Conv9_2
            extra_stage(256, 128, 256),             # Conv10_2
            extra_stage(256, 128, 256, padding=0),  # Conv11_2 (unpadded 3x3)
        ])

        num_prior_bbox = 6  # num of prior bounding boxes per feature-map cell
        # Input channels of the six tapped feature maps, in forward order.
        head_in_channels = (256, 512, 512, 256, 256, 256)

        # Bounding box offset regressor: 4 values per prior.
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=ch, out_channels=num_prior_bbox * 4,
                      kernel_size=3, padding=1)
            for ch in head_in_channels
        ])

        # Bounding box classification confidence: num_classes values per prior.
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=ch, out_channels=num_prior_bbox * num_classes,
                      kernel_size=3, padding=1)
            for ch in head_in_channels
        ])

        # Load pre-trained backbone weights. map_location='cpu' lets a
        # checkpoint that was saved on a GPU be read on a CPU-only machine.
        pretrained_dict = torch.load('./pretrained/mobienetv2.pth', map_location='cpu')
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'base_net' in k}
        model_dict = self.base_net.state_dict()

        # Checkpoint and backbone use different parameter names, so tensors
        # are matched by position: walk the backbone's entries in order and
        # consume the next pretrained tensor whenever the shapes agree.
        keys = list(pretrained_dict)
        i = 0
        for k, v in model_dict.items():
            # Checked before indexing so an empty or exhausted key list
            # cannot raise IndexError.
            if i == len(keys):
                break
            if v.size() == pretrained_dict[keys[i]].size():
                model_dict[k] = pretrained_dict[keys[i]]
                i += 1

        self.base_net.load_state_dict(model_dict)
        # Puts only the backbone in eval mode; a later .train() call on this
        # SSD module switches its children, including base_net, back.
        self.base_net.eval()

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be feed in
        :return: confidence and location, with dim:(N, num_priors, num_classes)
                 and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)
        num_batch = conf.shape[0]

        # (N, num_prior_bbox * n_classes, H, W) -> (N, H*W*num_prior_bbox, n_classes).
        # -1 lets view() derive num_priors with exact integer arithmetic.
        conf = conf.permute(0, 2, 3, 1).contiguous()
        conf = conf.view(num_batch, -1, self.num_classes)

        # (N, num_prior_bbox * 4, H, W) -> (N, num_priors, 4); the prior count
        # is taken from conf so a mismatched head pair fails here.
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, conf.shape[1], 4)

        return conf, loc

    def forward(self, input):
        """Run the detector.

        :param input: image batch fed to the backbone.
        :return: (confidences, locations) with dim (N, num_priors, num_classes)
                 and (N, num_priors, 4). Confidences are softmax-normalised
                 over the class axis only when the module is not in training
                 mode.
        """
        confidence_list = []
        loc_list = []
        first_tap, second_tap = self.base_output_layer_indices

        # Backbone from the start to the first tap.
        y = module_util.forward_from(self.base_net.conv_layers, 0, first_tap, input)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Backbone from the first tap to the second tap.
        y = module_util.forward_from(self.base_net.conv_layers, first_tap, second_tap, y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Additional stages; heads 0 and 1 belong to the backbone taps, so the
        # head for extra stage `i` lives at position i + 2.
        for i in range(4):
            y = module_util.forward_from(self.additional_feat_extractor, i, i + 1, y)
            confidence, loc = self.feature_to_bbbox(
                self.loc_regressor[i + 2], self.classifier[i + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3  # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
class SSD(nn.Module):
    """SSD detection head on top of a MobileNet backbone (``base_net.base_net``).

    Six feature maps feed the detection heads: two taken from the backbone
    and one after each of the four additional feature-extractor stages. The
    first three heads predict 8 priors per cell, the last three predict 4.
    """

    def __init__(self, num_classes):
        """Build the backbone, extra stages and heads, then load backbone weights.

        :param num_classes: number of class scores predicted per prior box.
        """
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net).
        self.base_net = MobileNet()

        # Section boundaries inside base_net.base_net: a feature map is taken
        # at the end of each consecutive [start, end) pair.
        self.base_output_sequence_indices = (0, 12, len(self.base_net.base_net))

        # Number of prior bounding boxes (used by the first three heads; the
        # last three heads below use 4).
        self.num_prior_bbox = 8

        def extra_stage(in_ch, mid_ch, out_ch):
            # 1x1 bottleneck followed by a stride-2 3x3 conv (halves H and W).
            return nn.Sequential(
                nn.Conv2d(in_channels=in_ch, out_channels=mid_ch, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=mid_ch, out_channels=out_ch, kernel_size=3,
                          stride=2, padding=1),
                nn.ReLU())

        # Define the additional feature extractor.
        self.additional_feature_extractor = nn.ModuleList([
            extra_stage(1024, 256, 512),  # 1024 -> 512 channels
            extra_stage(512, 128, 256),   # 512 -> 256 channels
            extra_stage(256, 128, 256),   # 256 -> 256 channels
            extra_stage(256, 128, 256),   # 256 -> 256 channels
        ])

        # (input channels, priors per cell) of the six tapped feature maps:
        # two backbone taps followed by the four extra stages.
        head_specs = ((512, 8), (1024, 8), (512, 8), (256, 4), (256, 4), (256, 4))

        # Bounding box offset regressor: 4 values per prior.
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(ch, priors * 4, kernel_size=3, padding=1)
            for ch, priors in head_specs
        ])

        # Bounding box classification confidence: num_classes values per prior.
        self.classifier = nn.ModuleList([
            nn.Conv2d(ch, priors * num_classes, kernel_size=3, padding=1)
            for ch, priors in head_specs
        ])

        # Load pretrained model. map_location='cpu' lets a checkpoint that was
        # saved on a GPU be read on a CPU-only machine.
        pretrained_state = torch.load('pretrained/mobienetv2.pth', map_location='cpu')
        model_dict = self.base_net.state_dict()
        # Keep only entries the backbone has AND whose shape matches; a
        # same-named tensor of a different shape would make load_state_dict
        # raise.
        pretrained_state = {
            k: v for k, v in pretrained_state.items()
            if k in model_dict and v.size() == model_dict[k].size()
        }
        # Overwrite entries in the existing state dict.
        model_dict.update(pretrained_state)
        # Load the new state dict.
        self.base_net.load_state_dict(model_dict)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feature_extractor.apply(init_with_xavier)

    def feature_to_bbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset.
        :param loc_regress_layer: offset regressor layer to run forward.
        :param confidence_layer: confidence layer to run forward.
        :param input_feature: feature map to be feed in forward.
        :return: confidence and location, with dim:(N, num_priors, num_classes)
                 and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)
        num_batch = conf.shape[0]

        # (N, num_prior_bbox * n_classes, H, W) -> (N, H*W*num_prior_bbox, n_classes).
        # -1 lets view() derive num_priors with exact integer arithmetic.
        conf = conf.permute(0, 2, 3, 1).contiguous()
        conf = conf.view(num_batch, -1, self.num_classes)

        # (N, num_prior_bbox * 4, H, W) -> (N, num_priors, 4); the prior count
        # is taken from conf so a mismatched head pair fails here.
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, conf.shape[1], 4)

        return conf, loc

    def forward(self, inp):
        """Run the detector.

        :param inp: image batch fed to the backbone.
        :return: (confidences, locations) with dim (N, num_priors, num_classes)
                 and (N, num_priors, 4). Confidences are softmax-normalised
                 over the class axis only when the module is not in training
                 mode.
        """
        confidence_list = []
        loc_list = []
        result = inp

        # Forward the 'result' through each backbone section and run its heads.
        boundaries = self.base_output_sequence_indices
        num_base_taps = len(boundaries) - 1
        for index in range(num_base_taps):
            result = module_util.forward_from(
                self.base_net.base_net, boundaries[index], boundaries[index + 1], result)
            confidence, loc = self.feature_to_bbox(
                self.loc_regressor[index], self.classifier[index], result)
            confidence_list.append(confidence)
            loc_list.append(loc)

        # Forward the 'result' to additional layers for extracting coarse
        # features; their heads follow the backbone heads in the head lists.
        for index in range(len(self.additional_feature_extractor)):
            result = module_util.forward_from(
                self.additional_feature_extractor, index, index + 1, result)
            head = index + num_base_taps
            confidence, loc = self.feature_to_bbox(
                self.loc_regressor[head], self.classifier[head], result)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] Check the output.
        assert confidences.dim() == 3  # Should be (N, num_priors, num_classes).
        assert confidences.shape[2] == self.num_classes
        assert locations.dim() == 3  # Should be (N, num_priors, 4).
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax.
            confidences = f.softmax(confidences, dim=2)

        return confidences, locations