Example No. 1
class SSD(nn.Module):
    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Set up the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Feature maps will be extracted from layer[11] and layer[13] of base_net
        self.base_output_layer_indices = (11, 13)

        # Define the additional feature extractors
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256,
                          out_channels=512,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
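            # Conv10_2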
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
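            # Conv11_2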
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU())
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # number of prior bounding boxes per feature-map cell
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=1024,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1)
        ])

        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=1024,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1)
        ])

        # Load the pre-trained weights for self.base_net; fine-tuning from them
        # improves accuracy. Only parameters whose names and shapes match the
        # current model are copied over.
        temp_state = torch.load('pretrained/mobienetv2.pth')

        cur_dict = self.base_net.state_dict()
        input_state = {
            k: v
            for k, v in temp_state.items()
            if k in cur_dict and v.size() == cur_dict[k].size()
        }
        cur_dict.update(input_state)
        self.base_net.load_state_dict(cur_dict)
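        # Optional sanity check (not in the original snippet): report how many
        # pretrained tensors were actually copied into the backbone.
        print('Loaded {}/{} backbone tensors from the pretrained checkpoint'
              .format(len(input_state), len(cur_dict)))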

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbox(self, loc_regress_layer, confidence_layer,
                        input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be fed in
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to (N, H*W*num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        #    where H*W*num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1] * conf.shape[2] * conf.shape[3] /
                         self.num_classes)

        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing
        # 1: (N, num_prior_bbox*4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        l_channels = int(loc.shape[1] * loc.shape[2] * loc.shape[3] / 4)
        loc = loc.view(num_batch, l_channels, 4)

        return conf, loc

    def forward(self, input):

        confidence_list = []
        loc_list = []

        # Run the backbone network through layers [0, 11] and fetch the bbox
        # class confidence as well as position and size
        y = module_util.forward_from(self.base_net.base_net, 0,
                                     self.base_output_layer_indices[0] + 1,
                                     input)
        confidence, loc = self.feature_to_bbox(self.loc_regressor[0],
                                               self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Run the backbone network from layer 11 to layer 13 and compute the
        # corresponding bbox loc and confidence
        y = module_util.forward_from(self.base_net.base_net,
                                     self.base_output_layer_indices[0],
                                     self.base_output_layer_indices[1] + 1, y)
        confidence, loc = self.feature_to_bbox(self.loc_regressor[1],
                                               self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Forward 'y' through the additional layers to extract coarse features
        for idx in range(len(self.additional_feat_extractor)):
            y = module_util.forward_from(self.additional_feat_extractor[idx],
                                         0, 4, y)
            confidence, loc = self.feature_to_bbox(
                self.loc_regressor[idx + 2], self.classifier[idx + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3  # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
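
All five examples delegate slice-wise execution of a network to module_util.forward_from, whose implementation is not included in these snippets. Inferred from the call sites, a minimal sketch would run the input through layers[start:end] (end-exclusive) and return the result; the actual helper may differ in its details:

def forward_from(layers, start_index, end_index, x):
    # Run x through layers[start_index:end_index] (end-exclusive) and return
    # the final output; works for both nn.Sequential and nn.ModuleList, since
    # both support slicing.
    for layer in layers[start_index:end_index]:
        x = layer(x)
    return x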
Example No. 2
class SSD(nn.Module):
    
    def __init__(self, num_classes = 4):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Set up the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Feature maps will be extracted from layer[11] and layer[13] of base_net
        self.base_output_layer_indices = (11, 13)

        # Define the additional feature extractors
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv10_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
                nn.ReLU()
            ),
            # Conv11_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
                nn.ReLU()
            ),
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # number of prior bounding boxes per feature-map cell
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=1024, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1)
        ])

        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=1024, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1)
        ])


        # Load the pre-trained weights for self.base_net; fine-tuning from them
        # improves accuracy.
        pretrained_model = torch.load("./pretrained/mobienetv2.pth")
        my_model = self.base_net.state_dict()

        # 1. filter out unnecessary keys
        pretrained_model = {k: v for k, v in pretrained_model.items() if k in my_model}
        # 2. overwrite entries in the existing state dict
        my_model.update(pretrained_model)
        # 3. load the new state dict
        self.base_net.load_state_dict(my_model)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be fed in
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to (N, H*W*num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        #    where H*W*num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1]*conf.shape[2]*conf.shape[3] / self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing
        # 1: (N, num_prior_bbox*4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        l_channels = int(loc.shape[1] * loc.shape[2] * loc.shape[3] / 4)
        loc = loc.view(num_batch, l_channels, 4)

        return conf, loc

    def forward(self, input):

        confidence_list = []
        loc_list = []

        # Run the backbone network over layers [0, 11) and fetch the bbox class
        # confidence as well as position and size
        y = module_util.forward_from(self.base_net.conv_layers, 0, self.base_output_layer_indices[0], input)
        confidence, loc = self.feature_to_bbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Run the backbone network from layer 11 to layer 13 and compute the
        # corresponding bbox loc and confidence
        y = module_util.forward_from(self.base_net.conv_layers, self.base_output_layer_indices[0], self.base_output_layer_indices[1], y)
        confidence, loc = self.feature_to_bbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Forward 'y' through the additional layers to extract coarse features;
        # iterate over all four additional blocks so that every
        # regressor/classifier head defined above is used
        for i in range(len(self.additional_feat_extractor)):
            y = module_util.forward_from(self.additional_feat_extractor, i, i + 1, y)
            confidence, loc = self.feature_to_bbox(self.loc_regressor[i + 2], self.classifier[i + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)
        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)
        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3   # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)
        return confidences, locations
Example No. 3
def main():
    global opt, start_epoch, best_prec1
    opt = cfg
    opt.gpuids = list(map(int, opt.gpuids))

    if opt.cuda and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without --cuda")

    model = MobileNet()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay,
                          nesterov=True)
    start_epoch = 0

    ckpt_file = join("model", opt.ckpt)

    if opt.cuda:
        torch.cuda.set_device(opt.gpuids[0])
        with torch.cuda.device(opt.gpuids[0]):
            model = model.cuda()
            criterion = criterion.cuda()
        model = nn.DataParallel(model,
                                device_ids=opt.gpuids,
                                output_device=opt.gpuids[0])
        cudnn.benchmark = True

    # for resuming training
    if opt.resume:
        if isfile(ckpt_file):
            print("==> Loading Checkpoint '{}'".format(opt.ckpt))
            if opt.cuda:
                checkpoint = torch.load(ckpt_file,
                                        map_location=lambda storage, loc:
                                        storage.cuda(opt.gpuids[0]))
                try:
                    model.module.load_state_dict(checkpoint['model'])
                except Exception:
                    model.load_state_dict(checkpoint['model'])
            else:
                checkpoint = torch.load(
                    ckpt_file, map_location=lambda storage, loc: storage)
                try:
                    model.load_state_dict(checkpoint['model'])
                except Exception:
                    # create new OrderedDict that does not contain `module.`
                    new_state_dict = OrderedDict()
                    for k, v in checkpoint['model'].items():
                        if k[:7] == 'module.':
                            name = k[7:]  # remove `module.`
                        else:
                            name = k[:]
                        new_state_dict[name] = v

                    model.load_state_dict(new_state_dict)

            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])

            print("==> Loaded Checkpoint '{}' (epoch {})".format(
                opt.ckpt, start_epoch))
        else:
            print("==> no checkpoint found at '{}'".format(opt.ckpt))
            return

    # Download & Load Dataset
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='./data',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.workers)

    valset = torchvision.datasets.CIFAR10(root='./data',
                                          train=False,
                                          download=True,
                                          transform=transform_val)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=opt.test_batch_size,
                                             shuffle=False,
                                             num_workers=opt.workers)

    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # for evaluation
    if opt.eval:
        if isfile(ckpt_file):
            print("==> Loading Checkpoint '{}'".format(opt.ckpt))
            if opt.cuda:
                checkpoint = torch.load(ckpt_file,
                                        map_location=lambda storage, loc:
                                        storage.cuda(opt.gpuids[0]))
                try:
                    model.module.load_state_dict(checkpoint['model'])
                except Exception:
                    model.load_state_dict(checkpoint['model'])
            else:
                checkpoint = torch.load(
                    ckpt_file, map_location=lambda storage, loc: storage)
                try:
                    model.load_state_dict(checkpoint['model'])
                except Exception:
                    # create new OrderedDict that does not contain `module.`
                    new_state_dict = OrderedDict()
                    for k, v in checkpoint['model'].items():
                        if k[:7] == 'module.':
                            name = k[7:]  # remove `module.`
                        else:
                            name = k[:]
                        new_state_dict[name] = v

                    model.load_state_dict(new_state_dict)

            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])

            print("==> Loaded Checkpoint '{}' (epoch {})".format(
                opt.ckpt, start_epoch))

            # evaluate on validation set
            print("\n===> [ Evaluation ]")
            start_time = time.time()
            prec1 = validate(val_loader, model, criterion)
            elapsed_time = time.time() - start_time
            print("====> {:.2f} seconds to evaluate this model\n".format(
                elapsed_time))
            return
        else:
            print("==> no checkpoint found at '{}'".format(opt.ckpt))
            return

    # train...
    train_time = 0.0
    validate_time = 0.0
    for epoch in range(start_epoch, opt.epochs):
        adjust_learning_rate(optimizer, epoch)

        print('\n==> Epoch: {}, lr = {}'.format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        print("===> [ Training ]")
        start_time = time.time()
        train(train_loader, model, criterion, optimizer, epoch)
        elapsed_time = time.time() - start_time
        train_time += elapsed_time
        print(
            "====> {:.2f} seconds to train this epoch\n".format(elapsed_time))

        # evaluate on validation set
        print("===> [ Validation ]")
        start_time = time.time()
        prec1 = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        validate_time += elapsed_time
        print("====> {:.2f} seconds to validate this epoch\n".format(
            elapsed_time))

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        state = {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_model(state, epoch, is_best)

    avg_train_time = train_time / opt.epochs
    avg_valid_time = validate_time / opt.epochs
    total_train_time = train_time + validate_time
    print("====> average training time per epoch: {}m {:.2f}s".format(
        int(avg_train_time // 60), avg_train_time % 60))
    print("====> average validation time per epoch: {}m {:.2f}s".format(
        int(avg_valid_time // 60), avg_valid_time % 60))
    print("====> training time: {}m {:.2f}s".format(int(train_time // 60),
                                                    train_time % 60))
    print("====> validation time: {}m {:.2f}s".format(int(validate_time // 60),
                                                      validate_time % 60))
    print("====> total training time: {}m {:.2f}s".format(
        int(total_train_time // 60), total_train_time % 60))
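
The helpers train, validate, save_model, and adjust_learning_rate are defined elsewhere in this repository and are not shown. Purely as an illustration of the schedule this loop expects, a common step-decay adjust_learning_rate could look like the sketch below; the decay factor and interval are assumptions, not taken from the source:

def adjust_learning_rate(optimizer, epoch):
    # Hypothetical schedule: decay the base learning rate 10x every 30 epochs.
    lr = opt.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr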
Example No. 4
class SSD(nn.Module):
    
    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Set up the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Feature maps will be extracted from layer[6] and layer[11] of base_net
        self.base_output_layer_indices = (6, 11)

        # Define the additional feature extractors
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv10_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv11_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
                nn.ReLU()
            )
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # number of prior bounding boxes per feature-map cell
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1)
        ])
        
        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
        ])

        # Load the pre-trained weights for self.base_net; fine-tuning from them
        # improves accuracy. Tensors are matched positionally: each backbone
        # parameter is filled from the next pretrained tensor whose shape matches.
        pretrained_dict = torch.load('./pretrained/mobienetv2.pth')
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'base_net' in k}
        model_dict = self.base_net.state_dict()

        keys = list(pretrained_dict.keys())

        i = 0
        for k, v in model_dict.items():
            if v.size() == pretrained_dict[keys[i]].size():
                model_dict[k] = pretrained_dict[keys[i]]
                i += 1
                if i == len(keys):
                    break

        self.base_net.load_state_dict(model_dict)
        self.base_net.eval()

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be fed in
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to (N, H*W*num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        #    where H*W*num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1]*conf.shape[2]*conf.shape[3] / self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing
        # 1: (N, num_prior_bbox*4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        l_channels = int(loc.shape[1] * loc.shape[2] * loc.shape[3] / 4)
        loc = loc.view(num_batch, l_channels, 4)

        return conf, loc

    def forward(self, input):

        confidence_list = []
        loc_list = []

        # Run the backbone network up to the first output layer and fetch the
        # bbox class confidence as well as position and size
        y = module_util.forward_from(self.base_net.conv_layers, 0, self.base_output_layer_indices[0], input)
        confidence, loc = self.feature_to_bbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Run the backbone network between the two output layers and compute
        # the corresponding bbox loc and confidence
        y = module_util.forward_from(self.base_net.conv_layers, self.base_output_layer_indices[0], self.base_output_layer_indices[1], y)
        confidence, loc = self.feature_to_bbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Forward 'y' through the additional layers to extract coarse features
        for i in range(4):
            y = module_util.forward_from(self.additional_feat_extractor, i, i + 1, y)
            confidence, loc = self.feature_to_bbox(self.loc_regressor[i + 2], self.classifier[i + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3   # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
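
To make the reshaping inside feature_to_bbox concrete, here is a standalone illustration; the numbers are chosen for the example and do not come from any of the snippets. A head output of shape (N, num_prior_bbox * num_classes, H, W) becomes (N, num_priors, num_classes), where num_priors = H * W * num_prior_bbox:

import torch

num_classes, num_prior_bbox = 21, 6
conf = torch.randn(2, num_prior_bbox * num_classes, 10, 10)  # (N, 126, 10, 10)
conf = conf.permute(0, 2, 3, 1).contiguous()                 # (N, 10, 10, 126)
conf = conf.view(2, -1, num_classes)                         # (N, 600, 21)
assert conf.shape == (2, 10 * 10 * num_prior_bbox, num_classes)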
Example No. 5
class SSD(nn.Module):
    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Set up the backbone network (base_net).
        self.base_net = MobileNet()

        # Feature maps will be extracted at the end of each of the following layer sections of base_net.
        self.base_output_sequence_indices = (0, 12, len(self.base_net.base_net))

        # Number of prior bounding boxes.
        self.num_prior_bbox = 8

        # Define the additional feature extractor.
        self.additional_feature_extractor = nn.ModuleList([
            # Layer 28 - 29 5x5x512
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Layer 30 - 31 3x3x256
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Layer 32 - 33 2x2x256
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Layer 34 - 35 1x1x256
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            )
        ])

        # Bounding box offset regressor.
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(512, 8 * 4, kernel_size=3, padding=1),  # Layer 22
            nn.Conv2d(1024, 8 * 4, kernel_size=3, padding=1),  # Layer 27
            nn.Conv2d(512, 8 * 4, kernel_size=3, padding=1),  # Layer 29
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),  # Layer 31
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),  # Layer 33
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),  # Layer 35
        ])

        # Bounding box classification confidence for each label.
        self.classifier = nn.ModuleList([
            nn.Conv2d(512, 8 * num_classes, kernel_size=3, padding=1),  # Layer 13
            nn.Conv2d(1024, 8 * num_classes, kernel_size=3, padding=1),  # Layer 25
            nn.Conv2d(512, 8 * num_classes, kernel_size=3, padding=1),  # Layer 29
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),  # Layer 31
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),  # Layer 33
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),  # Layer 35
        ])

        # Load pretrained model.
        pretrained_state = torch.load('pretrained/mobienetv2.pth')
        model_dict = self.base_net.state_dict()

        # Filter out unnecessary keys.
        pretrained_state = {k: v for k, v in pretrained_state.items() if k in model_dict}

        # Overwrite entries in the existing state dict.
        model_dict.update(pretrained_state)

        # Load the new state dict.
        self.base_net.load_state_dict(model_dict)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feature_extractor.apply(init_with_xavier)

    def feature_to_bbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset.
        :param loc_regress_layer: offset regressor layer to run forward.
        :param confidence_layer: confidence layer to run forward.
        :param input_feature: feature map to be fed in forward.
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to
        # (N, H * W * num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        # where H * W * num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1] * conf.shape[2] * conf.shape[3] / self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing.
        # 1: (N, num_prior_bbox * 4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        l_channels = int(loc.shape[1] * loc.shape[2] * loc.shape[3] / 4)
        loc = loc.view(num_batch, l_channels, 4)

        return conf, loc

    def forward(self, inp):

        confidence_list = []
        loc_list = []
        result = inp
        # Forward the 'result' to base net for regressor & classifier.
        for index in range(0, len(self.base_output_sequence_indices) - 1):
            result = module_util.forward_from(
                self.base_net.base_net,
                self.base_output_sequence_indices[index], self.base_output_sequence_indices[index + 1], result)
            confidence, loc = self.feature_to_bbox(self.loc_regressor[index], self.classifier[index], result)
            confidence_list.append(confidence)
            loc_list.append(loc)

        # Forward the 'result' to additional layers for extracting coarse features.
        for index in range(0, len(self.additional_feature_extractor)):
            result = module_util.forward_from(
                self.additional_feature_extractor,
                index, index + 1, result)
            confidence, loc = self.feature_to_bbox(self.loc_regressor[index + 2], self.classifier[index + 2], result)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] Check the output.
        assert confidences.dim() == 3  # Should be (N, num_priors, num_classes).
        assert confidences.shape[2] == self.num_classes  # Should be (N, num_priors, num_classes).
        assert locations.dim() == 3  # Should be (N, num_priors, 4).
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax.
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
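
Assuming MobileNet, module_util, and the pretrained/mobienetv2.pth checkpoint are available as the snippets above expect, a quick smoke test of any of these SSD variants might look like the following. The 300x300 input resolution is the conventional SSD300 size and is an assumption here, since the snippets never state the input shape:

import torch

model = SSD(num_classes=21)  # e.g. 20 object classes + background (hypothetical)
model.eval()                 # eval mode, so the softmax branch is taken
with torch.no_grad():
    confidences, locations = model(torch.randn(1, 3, 300, 300))
print(confidences.shape)  # expected: (1, num_priors, 21)
print(locations.shape)    # expected: (1, num_priors, 4)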