Example #1
import torch
import torch.nn as nn

import resnet  # repo-local module exposing torchvision-style resnet constructors

def get_model_and_checkpoint(model, dataset, checkpoint_path, n_gpu=1):
    if dataset == 'imagenet':
        n_class = 1000
    elif dataset == 'cifar10':
        n_class = 10
    else:
        raise ValueError('unsupported dataset')

    if model == 'mobilenet':
        from mobilenet import MobileNet
        net = MobileNet(n_class=n_class)
    elif model == 'mobilenetv2':
        from mobilenet_v2 import MobileNetV2
        net = MobileNetV2(n_class=n_class)
    elif model.startswith('resnet'):
        net = resnet.__dict__[model](pretrained=True)
        in_features = net.fc.in_features
        net.fc = nn.Linear(in_features, n_class)
    else:
        raise NotImplementedError
    if checkpoint_path:
        print('loading {}...'.format(checkpoint_path))
        sd = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        if 'state_dict' in sd:  # a full checkpoint dict, not a bare state_dict
            sd = sd['state_dict']
        sd = {k.replace('module.', ''): v for k, v in sd.items()}
        net.load_state_dict(sd)

    if torch.cuda.is_available() and n_gpu > 0:
        net = net.cuda()
        if n_gpu > 1:
            net = torch.nn.DataParallel(net, range(n_gpu))

    return net
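
A note on the loader above: checkpoints saved from an nn.DataParallel-wrapped model prefix every key with `module.`, which the dict comprehension strips before load_state_dict. A minimal usage sketch, assuming the repo-local mobilenet module and an illustrative checkpoint path:

net = get_model_and_checkpoint(model='mobilenet',
                               dataset='cifar10',
                               checkpoint_path='checkpoints/mobilenet_cifar10.pth',  # hypothetical path
                               n_gpu=1)
net.eval()  # switch to inference mode before evaluation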
Example #2
import torch

def get_model_and_checkpoint(model, dataset, checkpoint_path, n_gpu=1):
    if model == 'mobilenet' and dataset == 'imagenet':
        from mobilenet import MobileNet
        net = MobileNet(n_class=1000)
    elif model == 'mobilenetv2' and dataset == 'imagenet':
        from mobilenet_v2 import MobileNetV2
        net = MobileNetV2(n_class=1000)
    elif model == 'mobilenet' and dataset == 'cifar10':
        from mobilenet import MobileNet
        net = MobileNet(n_class=10)
    elif model == 'mobilenetv2' and dataset == 'cifar10':
        from mobilenet_v2 import MobileNetV2
        net = MobileNetV2(n_class=10)
    else:
        raise NotImplementedError
    if checkpoint_path:
        print('loading {}...'.format(checkpoint_path))
        sd = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        if 'state_dict' in sd:  # a full checkpoint dict, not a bare state_dict
            sd = sd['state_dict']
        sd = {k.replace('module.', ''): v for k, v in sd.items()}
        net.load_state_dict(sd)

    if torch.cuda.is_available() and n_gpu > 0:
        net = net.cuda()
        if n_gpu > 1:
            net = torch.nn.DataParallel(net, range(n_gpu))

    return net
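
Example #2 unrolls every (model, dataset) pair into its own branch. A sketch of the same selection split into a class-count lookup plus two model branches, purely illustrative:

N_CLASSES = {'imagenet': 1000, 'cifar10': 10}

def build_net(model, dataset):
    n_class = N_CLASSES[dataset]  # raises KeyError for unsupported datasets
    if model == 'mobilenet':
        from mobilenet import MobileNet
        return MobileNet(n_class=n_class)
    if model == 'mobilenetv2':
        from mobilenet_v2 import MobileNetV2
        return MobileNetV2(n_class=n_class)
    raise NotImplementedError(model)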
Example #3
import torch
import torch.nn as nn

import resnet  # repo-local module exposing torchvision-style resnet constructors
from mobilenet import MobileNet
from mobilenet_v2 import MobileNetV2
from nni.compression.pytorch import ModelSpeedup  # NNI speedup utility (import path assumed)

def get_model(args):
    print('=> Building model..')

    if args.dataset == 'imagenet':
        n_class = 1000
    elif args.dataset == 'cifar10':
        n_class = 10
    else:
        raise NotImplementedError

    if args.model_type == 'mobilenet':
        net = MobileNet(n_class=n_class)
    elif args.model_type == 'mobilenetv2':
        net = MobileNetV2(n_class=n_class)
    elif args.model_type.startswith('resnet'):
        net = resnet.__dict__[args.model_type](pretrained=True)
        in_features = net.fc.in_features
        net.fc = nn.Linear(in_features, n_class)
    else:
        raise NotImplementedError

    if args.ckpt_path is not None:
        # the checkpoint can be a state_dict exported by amc_search.py or one saved by amc_train.py
        print('=> Loading checkpoint {} ..'.format(args.ckpt_path))
        net.load_state_dict(torch.load(args.ckpt_path, map_location=torch.device('cpu')))
        if args.mask_path is not None:
            SZ = 224 if args.dataset == 'imagenet' else 32
            data = torch.randn(2, 3, SZ, SZ)
            ms = ModelSpeedup(net, data, args.mask_path, torch.device('cpu'))
            ms.speedup_model()

    net.to(args.device)
    if torch.cuda.is_available() and args.n_gpu > 1:
        net = torch.nn.DataParallel(net, list(range(args.n_gpu)))
    return net
Example #4
import torch
from mobilenet import MobileNet          # repo-local MobileNet v1
from mobilenet_v2 import MobileNetV2     # repo-local MobileNet v2

def create_model(model_type=None,
                 n_classes=120,
                 input_size=224,
                 checkpoint=None,
                 pretrained=False,
                 width_mult=1.):
    if model_type == 'mobilenet_v1':
        model = MobileNet(n_class=n_classes, profile='normal')
    elif model_type == 'mobilenet_v2':
        model = MobileNetV2(n_class=n_classes,
                            input_size=input_size,
                            width_mult=width_mult)
    elif model_type == 'mobilenet_v2_torchhub':
        model = torch.hub.load('pytorch/vision:v0.8.1',
                               'mobilenet_v2',
                               pretrained=pretrained)
        # model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=pretrained)
        feature_size = model.classifier[1].weight.data.size()[1]
        replace_classifier = torch.nn.Linear(feature_size, n_classes)
        model.classifier[1] = replace_classifier
    elif model_type is None:
        model = None
    else:
        raise RuntimeError('Unknown model_type.')

    if checkpoint is not None:
        model.load_state_dict(torch.load(checkpoint, map_location='cpu'))

    return model
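
The torchhub branch above swaps the classifier head so the pretrained backbone can be fine-tuned on a different label set. A quick shape check for that pattern (hypothetical 120-class setup; torch.hub.load needs network access on first run):

import torch

model = create_model(model_type='mobilenet_v2_torchhub', n_classes=120, pretrained=False)
out = model(torch.randn(1, 3, 224, 224))
assert out.shape == (1, 120)  # the new nn.Linear head maps features to n_classes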
Example #5
import torch
from mobilenet import MobileNet
from mobilenet_v2 import MobileNetV2

def get_model(args):
    print('=> Building model..')

    if args.dataset == 'imagenet':
        n_class = 1000
    elif args.dataset == 'cifar10':
        n_class = 10
    else:
        raise NotImplementedError

    # Device placement is handled below via net.to(args.device);
    # calling .cuda() here would crash on CPU-only machines.
    if args.model_type == 'mobilenet':
        net = MobileNet(n_class=n_class)
    elif args.model_type == 'mobilenetv2':
        net = MobileNetV2(n_class=n_class)
    else:
        raise NotImplementedError

    if args.ckpt_path is not None:
        # the checkpoint can be a saved whole model object exported by amc_search.py, or a state_dict
        print('=> Loading checkpoint {} ..'.format(args.ckpt_path))
        ckpt = torch.load(args.ckpt_path, map_location=torch.device('cpu'))
        if isinstance(ckpt, dict):
            # either a checkpoint dict wrapping a 'state_dict' entry or a bare state_dict
            net.load_state_dict(ckpt.get('state_dict', ckpt))
        else:
            net = ckpt  # a whole pickled model object

    net.to(args.device)
    if torch.cuda.is_available() and args.n_gpu > 1:
        net = torch.nn.DataParallel(net, list(range(args.n_gpu)))
    return net
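
The isinstance branch exists because a checkpoint file may hold either a dict or a whole pickled model. A sketch of the two save formats it accepts, given any nn.Module `net` (paths are illustrative):

import torch

# Format 1: a dict wrapping the state_dict (restored via net.load_state_dict).
torch.save({'state_dict': net.state_dict()}, 'ckpt_dict.pth')

# Format 2: the whole model object, pickled as-is (restored directly as `net`).
torch.save(net, 'ckpt_model.pth')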
Example #6
import os
import warnings

import torch

# myalexnet, alexnet, erfnet, MobileNet and imagenet_pretrained_mbnet_path are
# assumed to come from the surrounding repository.

def get_net_model(net='alexnet',
                  pretrained_dataset='imagenet',
                  dropout=False,
                  pretrained=True):
    if net == 'alexnet':
        model = myalexnet(pretrained=(pretrained_dataset == 'imagenet')
                          and pretrained,
                          dropout=dropout)
        teacher_model = alexnet(pretrained=(pretrained_dataset == 'imagenet'))
    elif net == 'mobilenet-imagenet':
        model = MobileNet(num_classes=1001, dropout=dropout)
        if pretrained and pretrained_dataset == 'imagenet':
            model.load_state_dict(torch.load(imagenet_pretrained_mbnet_path))
        teacher_model = MobileNet(num_classes=1001)
        if os.path.isfile(imagenet_pretrained_mbnet_path):
            teacher_model.load_state_dict(
                torch.load(imagenet_pretrained_mbnet_path))
        else:
            warnings.warn('failed to load teacher model weights!')
    elif net == 'erfnet-cityscapes':
        model = erfnet(pretrained=(pretrained_dataset == 'cityscapes')
                       and pretrained,
                       num_classes=20,
                       dropout=dropout)
        teacher_model = erfnet(pretrained=(pretrained_dataset == 'cityscapes'),
                               num_classes=20)
    else:
        raise NotImplementedError

    for p in teacher_model.parameters():
        p.requires_grad = False
    teacher_model.eval()

    return model, teacher_model
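
Freezing the teacher's parameters and switching it to eval() prepares it for knowledge distillation: the student is trained against the teacher's soft outputs. A minimal distillation-loss sketch along those lines (the function is ours, not from the source repo):

import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=4.0):
    # Soften both distributions, then match them with KL divergence.
    log_p_student = F.log_softmax(student_logits / temperature, dim=1)
    p_teacher = F.softmax(teacher_logits / temperature, dim=1)
    # The T**2 factor keeps gradient magnitudes comparable across temperatures.
    return F.kl_div(log_p_student, p_teacher, reduction='batchmean') * temperature ** 2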
Example #7
import torch
import torch.nn as nn
import torch.nn.functional as F

import module_util               # repo-local helper providing forward_from()
from mobilenet import MobileNet  # repo-local backbone (import path assumed)

class SSD(nn.Module):
    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes
        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)
        # Feature maps are extracted from layers [11] and [13] of base_net
        self.base_output_layer_indices = (11, 13)
        # Define the Additional feature extractor
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256,
                          out_channels=512,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
            # Conv10_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.ReLU(),
            ),
            # Conv11_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=1),
                nn.ReLU(),
            ),
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # num of prior bounding boxes
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),  #Cov5_3
            nn.Conv2d(in_channels=1024,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),  #FC7
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),  #Conv8_2
            # TODO: implement remaining layers.
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),  #Conv9_2
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),  #Conv10_2
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),  #Conv11_2
        ])

        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=1024,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
        ])

        # Load the pre-trained weights for self.base_net; fine-tuning from them improves accuracy.
        basenet_state = torch.load('pretrained/mobienetv2.pth',
                                   map_location='cpu')
        base_net_1 = {
            key: value
            for key, value in basenet_state.items() if 'base_net' in key
        }
        self.base_net.load_state_dict(base_net_1)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer,
                         input_feature):
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1] * conf.shape[2] * conf.shape[3] /
                         self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, c_channels, 4)
        return conf, loc

    def forward(self, input):
        confidence_list = []
        loc_list = []
        y = module_util.forward_from(self.base_net.base_net, 0,
                                     self.base_output_layer_indices[0] + 1,
                                     input)  # layers [0, 11] of the backbone
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0],
                                                self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)
        y = module_util.forward_from(self.base_net.base_net,
                                     self.base_output_layer_indices[0] + 1,
                                     self.base_output_layer_indices[1] + 1, y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1],
                                                self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)
        for i in range(len(self.additional_feat_extractor)):
            y = module_util.forward_from(self.additional_feat_extractor, i,
                                         i + 1, y)
            confidence, loc = self.feature_to_bbbox(self.loc_regressor[i + 2],
                                                    self.classifier[i + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)
        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)
        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3  # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4
        if not self.training:
            confidences = F.softmax(confidences, dim=2)
        return confidences, locations
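
The permute/view sequence in feature_to_bbbox turns a (N, num_prior_bbox * num_classes, H, W) conv output into (N, H * W * num_prior_bbox, num_classes). A standalone shape check with dummy tensors (all sizes illustrative):

import torch

N, num_prior_bbox, num_classes, H, W = 2, 6, 21, 10, 10
conf = torch.randn(N, num_prior_bbox * num_classes, H, W)

conf = conf.permute(0, 2, 3, 1).contiguous()  # (N, H, W, priors * classes)
num_priors = H * W * num_prior_bbox
conf = conf.view(N, num_priors, num_classes)  # (N, num_priors, num_classes)
assert conf.shape == (2, 600, 21)             # 10 * 10 * 6 = 600 priors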
Example #8
import torch
import torch.nn as nn
import torch.nn.functional as F

import module_util               # repo-local helper providing forward_from()
from mobilenet import MobileNet  # repo-local backbone (import path assumed)

class SSD(nn.Module):
    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Feature maps are extracted from layers [11] and [13] of base_net
        self.base_output_layer_indices = (11, 13)

        # Define the Additional feature extractor
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256,
                          out_channels=512,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU()),
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128,
                          out_channels=256,
                          kernel_size=3,
                          stride=2,
                          padding=1), nn.ReLU())
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # num of prior bounding boxes
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=1024,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * 4,
                      kernel_size=3,
                      padding=1)
        ])

        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=1024,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=512,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1),
            nn.Conv2d(in_channels=256,
                      out_channels=num_prior_bbox * num_classes,
                      kernel_size=3,
                      padding=1)
        ])

        # Load the pre-trained weights for self.base_net; fine-tuning from them improves accuracy.
        temp_state = torch.load('pretrained/mobienetv2.pth', map_location='cpu')

        cur_dict = self.base_net.state_dict()
        input_state = {
            k: v
            for k, v in temp_state.items()
            if k in cur_dict and v.size() == cur_dict[k].size()
        }
        cur_dict.update(input_state)
        self.base_net.load_state_dict(cur_dict)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer,
                         input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be feed in
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to (N, H*W*num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        #    where H*W*num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1] * conf.shape[2] * conf.shape[3] /
                         self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing
        # 1: (N, num_prior_bbox*4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        l_channels = int(loc.shape[1] * loc.shape[2] * loc.shape[3] / 4)
        loc = loc.view(num_batch, l_channels, 4)

        return conf, loc

    def forward(self, input):

        confidence_list = []
        loc_list = []

        # Run the backbone network over layers [0, 11] and fetch the bbox class confidence
        # as well as position and size
        y = module_util.forward_from(self.base_net.base_net, 0,
                                     self.base_output_layer_indices[0] + 1,
                                     input)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0],
                                                self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Run the backbone network over layers [12, 13] and compute the corresponding bbox loc and confidence
        # (starting at index [0] would re-run layer 11; compare Example #7)
        y = module_util.forward_from(self.base_net.base_net,
                                     self.base_output_layer_indices[0] + 1,
                                     self.base_output_layer_indices[1] + 1, y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1],
                                                self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Forward 'y' through the additional layers to extract coarser features
        for idx in range(0, len(self.additional_feat_extractor)):
            # Each extractor is a 4-module nn.Sequential; run it end to end.
            y = module_util.forward_from(self.additional_feat_extractor[idx],
                                         0, 4, y)
            confidence, loc = self.feature_to_bbbox(
                self.loc_regressor[idx + 2], self.classifier[idx + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3  # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
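
The filter-update-load sequence in Example #8's constructor is a general recipe for restoring only the weights that still fit the current model. As a reusable sketch (the helper name is ours):

import torch

def load_matching_weights(model, checkpoint_path):
    # Keep only entries whose key and tensor shape match the target model.
    pretrained = torch.load(checkpoint_path, map_location='cpu')
    own_state = model.state_dict()
    compatible = {k: v for k, v in pretrained.items()
                  if k in own_state and v.size() == own_state[k].size()}
    own_state.update(compatible)
    model.load_state_dict(own_state)
    return len(compatible)  # number of tensors actually restored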
Example #9
import torch
import torch.nn as nn
import torch.nn.functional as F

import module_util               # repo-local helper providing forward_from()
from mobilenet import MobileNet  # repo-local backbone (import path assumed)

class SSD(nn.Module):

    def __init__(self, num_classes=4):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Feature maps are extracted from layers [11] and [13] of base_net
        self.base_output_layer_indices = (11, 13)

        # Define the Additional feature extractor
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv10_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
                nn.ReLU()
            ),
            # Conv11_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
                nn.ReLU()
            ),
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # num of prior bounding boxes
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=1024, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1)
        ])

        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=1024, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1)
        ])


        # Load the pre-trained weights for self.base_net; fine-tuning from them improves accuracy.
        pretrained_model = torch.load("./pretrained/mobienetv2.pth", map_location='cpu')
        my_model = self.base_net.state_dict()

        # 1. filter out unnecessary keys
        pretrained_model = {k: v for k, v in pretrained_model.items() if k in my_model}
        # 2. overwrite entries in the existing state dict
        my_model.update(pretrained_model)
        # 3. load the new state dict
        self.base_net.load_state_dict(my_model)


        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be feed in
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to (N, H*W*num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        #    where H*W*num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1]*conf.shape[2]*conf.shape[3] / self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing
        # 1: (N, num_prior_bbox*4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, c_channels, 4)

        return conf, loc

    def forward(self, input):

        confidence_list = []
        loc_list = []

        # Run the backbone network from [0 to 11, and fetch the bbox class confidence
        # as well as position and size
        y = module_util.forward_from(self.base_net.conv_layers, 0, self.base_output_layer_indices[0], input)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Run the backbone network from [11 to 13] and compute the corresponding bbox loc and confidence
        y = module_util.forward_from(self.base_net.conv_layers, self.base_output_layer_indices[0], self.base_output_layer_indices[1], y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Forward 'y' through the additional layers to extract coarser features.
        # (range(0, 3) would skip the fourth extractor and leave the last head unused.)
        for i in range(len(self.additional_feat_extractor)):
            y = module_util.forward_from(self.additional_feat_extractor, i, i + 1, y)
            confidence, loc = self.feature_to_bbbox(self.loc_regressor[i + 2], self.classifier[i + 2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)
        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)
        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3   # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
Example #10
import torch
import torch.nn as nn
import torch.nn.functional as F

import module_util               # repo-local helper providing forward_from()
from mobilenet import MobileNet  # repo-local backbone (import path assumed)

class SSD(nn.Module):

    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net)
        self.base_net = MobileNet(num_classes)

        # Feature maps are extracted after the layer sections at these indices in (base_net)
        self.base_output_layer_indices = (6, 11)

        # Define the Additional feature extractor
        self.additional_feat_extractor = nn.ModuleList([
            # Conv8_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv9_2
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv10_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Conv11_2
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
                nn.ReLU()
            )
        ])

        # Bounding box offset regressor
        num_prior_bbox = 6  # num of prior bounding boxes
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * 4, kernel_size=3, padding=1)
        ])
        
        # Bounding box classification confidence for each label
        self.classifier = nn.ModuleList([
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=512, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
            nn.Conv2d(in_channels=256, out_channels=num_prior_bbox * num_classes, kernel_size=3, padding=1),
        ])

        # Load the pre-trained weights for self.base_net; fine-tuning from them improves accuracy.
        pretrained_dict = torch.load('./pretrained/mobienetv2.pth', map_location='cpu')
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'base_net' in k}
        model_dict = self.base_net.state_dict()

        keys = []
        for k,v in pretrained_dict.items():
            keys.append(k)

        # Copy pretrained tensors into the model positionally, matching by shape.
        i = 0
        for k, v in model_dict.items():
            if v.size() == pretrained_dict[keys[i]].size():
                model_dict[k] = pretrained_dict[keys[i]]
                i += 1
                if i == len(keys):
                    break
        
        self.base_net.load_state_dict(model_dict)
        self.base_net.eval()

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feat_extractor.apply(init_with_xavier)

    def feature_to_bbbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset
        :param loc_regress_layer: offset regressor layer to run forward
        :param confidence_layer: confidence layer to run forward
        :param input_feature: feature map to be feed in
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to (N, H*W*num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        #    where H*W*num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1]*conf.shape[2]*conf.shape[3] / self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing
        # 1: (N, num_prior_bbox*4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, c_channels, 4)

        return conf, loc

    def forward(self, input):

        confidence_list = []
        loc_list = []

        # Run the backbone network up to the first feature index and fetch the bbox class confidence
        # as well as position and size
        y = module_util.forward_from(self.base_net.conv_layers, 0, self.base_output_layer_indices[0], input)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[0], self.classifier[0], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # Run the backbone network between the two feature indices and compute the corresponding bbox loc and confidence
        y = module_util.forward_from(self.base_net.conv_layers, self.base_output_layer_indices[0], self.base_output_layer_indices[1], y)
        confidence, loc = self.feature_to_bbbox(self.loc_regressor[1], self.classifier[1], y)
        confidence_list.append(confidence)
        loc_list.append(loc)

        # forward the 'y' to additional layers for extracting coarse features
        for i in range(4):
            y = module_util.forward_from(self.additional_feat_extractor, i, i+1, y)
            confidence, loc = self.feature_to_bbbox(self.loc_regressor[i+2], self.classifier[i+2], y)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] check the output
        assert confidences.dim() == 3  # should be (N, num_priors, num_classes)
        assert locations.dim() == 3   # should be (N, num_priors, 4)
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
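
Each of the six detection heads contributes H * W * num_prior_bbox priors, so the dim-1 size of the concatenated outputs is the sum over feature maps. A quick arithmetic sketch with illustrative square feature-map sizes:

feature_sizes = [19, 10, 5, 3, 2, 1]  # hypothetical H (== W) per head
num_prior_bbox = 6
num_priors = sum(s * s * num_prior_bbox for s in feature_sizes)
print(num_priors)  # 3000: confidences is (N, 3000, num_classes), locations is (N, 3000, 4)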
Example #11
import time
from collections import OrderedDict
from os.path import isfile, join

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms

# MobileNet, cfg, train, validate, adjust_learning_rate and save_model are
# assumed to come from the surrounding repository.
best_prec1 = 0  # module-level best top-1 accuracy (assumed initialization)

def main():
    global opt, start_epoch, best_prec1
    opt = cfg
    opt.gpuids = list(map(int, opt.gpuids))

    if opt.cuda and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without --cuda")

    model = MobileNet()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay,
                          nesterov=True)
    start_epoch = 0

    ckpt_file = join("model", opt.ckpt)

    if opt.cuda:
        torch.cuda.set_device(opt.gpuids[0])
        with torch.cuda.device(opt.gpuids[0]):
            model = model.cuda()
            criterion = criterion.cuda()
        model = nn.DataParallel(model,
                                device_ids=opt.gpuids,
                                output_device=opt.gpuids[0])
        cudnn.benchmark = True

    # for resuming training
    if opt.resume:
        if isfile(ckpt_file):
            print("==> Loading Checkpoint '{}'".format(opt.ckpt))
            if opt.cuda:
                checkpoint = torch.load(ckpt_file,
                                        map_location=lambda storage, loc:
                                        storage.cuda(opt.gpuids[0]))
                try:
                    model.module.load_state_dict(checkpoint['model'])
                except RuntimeError:
                    model.load_state_dict(checkpoint['model'])
            else:
                checkpoint = torch.load(
                    ckpt_file, map_location=lambda storage, loc: storage)
                try:
                    model.load_state_dict(checkpoint['model'])
                except RuntimeError:
                    # create new OrderedDict that does not contain `module.`
                    new_state_dict = OrderedDict()
                    for k, v in checkpoint['model'].items():
                        if k.startswith('module.'):
                            name = k[len('module.'):]  # strip the `module.` prefix
                        else:
                            name = k
                        new_state_dict[name] = v

                    model.load_state_dict(new_state_dict)

            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])

            print("==> Loaded Checkpoint '{}' (epoch {})".format(
                opt.ckpt, start_epoch))
        else:
            print("==> no checkpoint found at '{}'".format(opt.ckpt))
            return

    # Download & Load Dataset
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='./data',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.workers)

    valset = torchvision.datasets.CIFAR10(root='./data',
                                          train=False,
                                          download=True,
                                          transform=transform_val)
    val_loader = torch.utils.data.DataLoader(valset,
                                             batch_size=opt.test_batch_size,
                                             shuffle=False,
                                             num_workers=opt.workers)

    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # for evaluation
    if opt.eval:
        if isfile(ckpt_file):
            print("==> Loading Checkpoint '{}'".format(opt.ckpt))
            if opt.cuda:
                checkpoint = torch.load(ckpt_file,
                                        map_location=lambda storage, loc:
                                        storage.cuda(opt.gpuids[0]))
                try:
                    model.module.load_state_dict(checkpoint['model'])
                except RuntimeError:
                    model.load_state_dict(checkpoint['model'])
            else:
                checkpoint = torch.load(
                    ckpt_file, map_location=lambda storage, loc: storage)
                try:
                    model.load_state_dict(checkpoint['model'])
                except RuntimeError:
                    # create new OrderedDict that does not contain `module.`
                    new_state_dict = OrderedDict()
                    for k, v in checkpoint['model'].items():
                        if k.startswith('module.'):
                            name = k[len('module.'):]  # strip the `module.` prefix
                        else:
                            name = k
                        new_state_dict[name] = v

                    model.load_state_dict(new_state_dict)

            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])

            print("==> Loaded Checkpoint '{}' (epoch {})".format(
                opt.ckpt, start_epoch))

            # evaluate on validation set
            print("\n===> [ Evaluation ]")
            start_time = time.time()
            prec1 = validate(val_loader, model, criterion)
            elapsed_time = time.time() - start_time
            print("====> {:.2f} seconds to evaluate this model\n".format(
                elapsed_time))
            return
        else:
            print("==> no checkpoint found at '{}'".format(opt.ckpt))
            return

    # train...
    train_time = 0.0
    validate_time = 0.0
    for epoch in range(start_epoch, opt.epochs):
        adjust_learning_rate(optimizer, epoch)

        print('\n==> Epoch: {}, lr = {}'.format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        print("===> [ Training ]")
        start_time = time.time()
        train(train_loader, model, criterion, optimizer, epoch)
        elapsed_time = time.time() - start_time
        train_time += elapsed_time
        print(
            "====> {:.2f} seconds to train this epoch\n".format(elapsed_time))

        # evaluate on validation set
        print("===> [ Validation ]")
        start_time = time.time()
        prec1 = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        validate_time += elapsed_time
        print("====> {:.2f} seconds to validate this epoch\n".format(
            elapsed_time))

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        state = {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_model(state, epoch, is_best)

    avg_train_time = train_time / opt.epochs
    avg_valid_time = validate_time / opt.epochs
    total_train_time = train_time + validate_time
    print("====> average training time per epoch: {}m {:.2f}s".format(
        int(avg_train_time // 60), avg_train_time % 60))
    print("====> average validation time per epoch: {}m {:.2f}s".format(
        int(avg_valid_time // 60), avg_valid_time % 60))
    print("====> training time: {}m {:.2f}s".format(int(train_time // 60),
                                                    train_time % 60))
    print("====> validation time: {}m {:.2f}s".format(int(validate_time // 60),
                                                      validate_time % 60))
    print("====> total training time: {}m {:.2f}s".format(
        int(total_train_time // 60), total_train_time % 60))
Example #12
import torch
import torch.nn as nn
import torch.nn.functional as F

import module_util               # repo-local helper providing forward_from()
from mobilenet import MobileNet  # repo-local backbone (import path assumed)

class SSD(nn.Module):
    def __init__(self, num_classes):
        super(SSD, self).__init__()
        self.num_classes = num_classes

        # Setup the backbone network (base_net).
        self.base_net = MobileNet()

        # Feature maps are extracted at the end of the following layer sections of (base_net).
        self.base_output_sequence_indices = (0, 12, len(self.base_net.base_net))

        # Number of prior bounding box.
        self.num_prior_bbox = 8

        # Define the additional feature extractor.
        self.additional_feature_extractor = nn.ModuleList([
            # Layer 28 - 29 5x5x512
            nn.Sequential(
                nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Layer 30 - 31 3x3x256
            nn.Sequential(
                nn.Conv2d(in_channels=512, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Layer 32 - 33 2x2x256
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            ),
            # Layer 34 - 35 1x1x256
            nn.Sequential(
                nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1),
                nn.ReLU(),
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1),
                nn.ReLU()
            )
        ])

        # Bounding box offset regressor.
        self.loc_regressor = nn.ModuleList([
            nn.Conv2d(512, 8 * 4, kernel_size=3, padding=1),  # Layer 22
            nn.Conv2d(1024, 8 * 4, kernel_size=3, padding=1),  # Layer 27
            nn.Conv2d(512, 8 * 4, kernel_size=3, padding=1),  # Layer 29
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),  # Layer 31
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),  # Layer 33
            nn.Conv2d(256, 4 * 4, kernel_size=3, padding=1),  # Layer 35
        ])

        # Bounding box classification confidence for each label.
        self.classifier = nn.ModuleList([
            nn.Conv2d(512, 8 * num_classes, kernel_size=3, padding=1),  # Layer 13
            nn.Conv2d(1024, 8 * num_classes, kernel_size=3, padding=1),  # Layer 25
            nn.Conv2d(512, 8 * num_classes, kernel_size=3, padding=1),  # Layer 29
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),  # Layer 31
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),  # Layer 33
            nn.Conv2d(256, 4 * num_classes, kernel_size=3, padding=1),  # Layer 35
        ])

        # Load the pre-trained weights for the backbone.
        pretrained_state = torch.load('pretrained/mobienetv2.pth', map_location='cpu')
        model_dict = self.base_net.state_dict()

        # Filter out unnecessary keys.
        pretrained_state = {k: v for k, v in pretrained_state.items() if k in model_dict}

        # Overwrite entries in the existing state dict.
        model_dict.update(pretrained_state)

        # Load the new state dict.
        self.base_net.load_state_dict(model_dict)

        def init_with_xavier(m):
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)

        self.loc_regressor.apply(init_with_xavier)
        self.classifier.apply(init_with_xavier)
        self.additional_feature_extractor.apply(init_with_xavier)

    def feature_to_bbox(self, loc_regress_layer, confidence_layer, input_feature):
        """
        Compute the bounding box class scores and the bounding box offset.
        :param loc_regress_layer: offset regressor layer to run forward.
        :param confidence_layer: confidence layer to run forward.
        :param input_feature: feature map to be feed in forward.
        :return: confidence and location, with dim:(N, num_priors, num_classes) and dim:(N, num_priors, 4) respectively.
        """
        conf = confidence_layer(input_feature)
        loc = loc_regress_layer(input_feature)

        # Confidence post-processing:
        # 1: (N, num_prior_bbox * n_classes, H, W) to
        # (N, H * W * num_prior_bbox, n_classes) = (N, num_priors, num_classes)
        # where H * W * num_prior_bbox = num_priors
        conf = conf.permute(0, 2, 3, 1).contiguous()
        num_batch = conf.shape[0]
        c_channels = int(conf.shape[1] * conf.shape[2] * conf.shape[3] / self.num_classes)
        conf = conf.view(num_batch, c_channels, self.num_classes)

        # Bounding Box loc and size post-processing.
        # 1: (N, num_prior_bbox * 4, H, W) to (N, num_priors, 4)
        loc = loc.permute(0, 2, 3, 1).contiguous()
        loc = loc.view(num_batch, c_channels, 4)

        return conf, loc

    def forward(self, inp):

        confidence_list = []
        loc_list = []
        result = inp
        # Forward the 'result' to base net for regressor & classifier.
        for index in range(0, len(self.base_output_sequence_indices) - 1):
            result = module_util.forward_from(
                self.base_net.base_net,
                self.base_output_sequence_indices[index], self.base_output_sequence_indices[index + 1], result)
            confidence, loc = self.feature_to_bbox(self.loc_regressor[index], self.classifier[index], result)
            confidence_list.append(confidence)
            loc_list.append(loc)

        # Forward the 'result' to additional layers for extracting coarse features.
        for index in range(0, len(self.additional_feature_extractor)):
            result = module_util.forward_from(
                self.additional_feature_extractor,
                index, index + 1, result)
            confidence, loc = self.feature_to_bbox(self.loc_regressor[index + 2], self.classifier[index + 2], result)
            confidence_list.append(confidence)
            loc_list.append(loc)

        confidences = torch.cat(confidence_list, 1)
        locations = torch.cat(loc_list, 1)

        # [Debug] Check the output.
        assert confidences.dim() == 3  # Should be (N, num_priors, num_classes).
        assert confidences.shape[2] == self.num_classes  # Should be (N, num_priors, num_classes).
        assert locations.dim() == 3  # Should be (N, num_priors, 4).
        assert confidences.shape[1] == locations.shape[1]
        assert locations.shape[2] == 4

        if not self.training:
            # If in testing/evaluating mode, normalize the output with Softmax.
            confidences = F.softmax(confidences, dim=2)

        return confidences, locations
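
Because forward() applies softmax only outside training mode, evaluation code can rely on each prior's class scores forming a probability distribution. A hedged smoke-test sketch (assumes the pretrained file referenced in __init__ is on disk; the class count and input size are illustrative):

import torch

model = SSD(num_classes=21)
model.eval()  # enables the softmax branch in forward()
with torch.no_grad():
    confidences, locations = model(torch.randn(1, 3, 300, 300))
per_prior = confidences.sum(dim=2)
assert torch.allclose(per_prior, torch.ones_like(per_prior), atol=1e-5)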