Example #1
def im_detect_ratio(net, im, target_size1, target_size2):
    device = net.arm_conf[0].weight.device
    h, w, _ = im.shape
    scale = torch.Tensor([w, h, w, h])
    scale = scale.to(device)
    im_orig = im.astype(np.float32, copy=True)
    if im_orig.shape[0] < im_orig.shape[1]:
        target_size1, target_size2 = target_size2, target_size1
    im = cv2.resize(im_orig,
                    None,
                    None,
                    fx=float(target_size2) / float(w),
                    fy=float(target_size1) / float(h),
                    interpolation=cv2.INTER_LINEAR)
    x = (im - MEANS).astype(np.float32)
    x = x[:, :, (2, 1, 0)]  # to rgb
    x = x.transpose(2, 0, 1)
    x = torch.from_numpy(x).unsqueeze(0)
    x = x.to(device)

    arm_loc, arm_conf, adm_loc, adm_conf, feat_sizes = net(x)
    priorbox = PriorBox(net.cfg,
                        feat_sizes, (target_size1, target_size2),
                        phase='test')
    priors = priorbox.forward()
    priors = priors.to(device)
    det = detect.forward(arm_loc, arm_conf, adm_loc, adm_conf, priors, scale)
    return det
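A minimal usage sketch for the function above, assuming a trained RefineDet-style net together with the module-level MEANS and detect objects it references (the file name and target sizes are illustrative):

im = cv2.imread('demo.jpg')  # BGR uint8 array of shape (H, W, 3)
dets = im_detect_ratio(net, im, target_size1=512, target_size2=1024)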
Example #2
    def __init__(self,
                 config,
                 phase,
                 base,
                 extras,
                 head,
                 num_classes,
                 top_k=200):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # TODO: implement __call__ in PriorBox
        self.priorbox = PriorBox(config)
        self.priors = Variable(self.priorbox.forward(), volatile=True)

        # SSD network
        self.vgg = nn.ModuleList(base)
        # Layer learns to scale the l2 normalized features from conv4_3
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)

        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes,
                                 0,
                                 top_k,
                                 0.01,
                                 0.45,
                                 variance=config['variance'])
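For reference, the positional arguments to Detect above correspond, in the usual ssd.pytorch ordering, to the following (an assumption to check against your Detect implementation):

# Detect(num_classes, bkg_label=0, top_k=200, conf_thresh=0.01,
#        nms_thresh=0.45, variance=config['variance'])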
Example #3
def im_detect(net, im, target_size):
    try:
        device = net.arm_conf[0].weight.device
    except AttributeError:  # fall back when the net has no ARM head
        device = net.odm_conf[0].weight.device
    h, w, _ = im.shape
    scale = torch.Tensor([w, h, w, h])
    scale = scale.to(device)
    im_orig = im.astype(np.float32, copy=True)
    im = cv2.resize(im_orig, (target_size, target_size),
                    interpolation=cv2.INTER_LINEAR)
    x = (im - MEANS).astype(np.float32)
    x = x[:, :, (2, 1, 0)]  # to rgb
    x = x.transpose(2, 0, 1)
    x = torch.from_numpy(x).unsqueeze(0)
    x = x.to(device)

    if args.wo_refined_anchor:
        adm_loc, adm_conf, feat_sizes = net(x)
    else:
        arm_loc, arm_conf, adm_loc, adm_conf, feat_sizes = net(x)
    priorbox = PriorBox(net.cfg,
                        feat_sizes, (target_size, target_size),
                        phase='test')
    priors = priorbox.forward()
    priors = priors.to(device)
    if args.wo_refined_anchor:
        det = detect.forward(adm_loc, adm_conf, priors, scale)
    else:
        det = detect.forward(arm_loc, arm_conf, adm_loc, adm_conf, priors,
                             scale)
    return det
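Compared with im_detect_ratio in Example #1, this variant resizes to a square input; a hedged call (the file name and input size are illustrative):

im = cv2.imread('demo.jpg')
dets = im_detect(net, im, target_size=512)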
Example #4
    def __init__(self, phase, size, base, extras, head, num_classes):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        self.cfg = (coco, voc)[num_classes == 21]
        self.priorbox = PriorBox(self.cfg)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = size

        # SSD network
        self.vgg = nn.ModuleList(base)
        # Layer learns to scale the l2 normalized features from conv4_3
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)

        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
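In the upstream ssd.pytorch code this constructor is normally reached through a build_ssd helper; a sketch of that wiring, assuming the repository's vgg, add_extras, and multibox builders and their 300-pixel config tables:

base_, extras_, head_ = multibox(vgg(base['300'], 3),
                                 add_extras(extras['300'], 1024),
                                 mbox['300'], num_classes)
net = SSD('test', 300, base_, extras_, head_, num_classes)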
Example #5
    def __init__(self, num_classes, phase, pretrain=False, finetune=None):
        super(SSD300, self).__init__()
        self.num_classes = num_classes
        self.phase = phase
        
        self.base_net = self._base_net()
        self.extra_net = self._extra_net()
        self.loc_pred, self.cls_pred = self._predict_net()

        self.L2Norm = L2Norm(512, 20)
        self.priorbox = PriorBox(v2)
        self.priors = Variable(self.priorbox.forward(), volatile=True)

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

        self._init_weight()
        if pretrain:
            self._load_weight()
        if finetune is not None:
            self._finetune(finetune)
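A minimal construction sketch for this wrapper (the 21-class VOC count and the finetune checkpoint path are assumptions):

net = SSD300(num_classes=21, phase='train', pretrain=True)
# or warm-start a 2-class model from an existing checkpoint:
# net = SSD300(num_classes=2, phase='train', finetune='ssd300_voc.pth')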
Example #6
    def __init__(self, phase, size, base, extras, head, num_classes):
        super(TBPP, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        self.cfg = {
            'num_classes': 2,
            'lr_steps': (80000, 100000, 120000),
            'max_iter': 120000,
            'feature_maps': [64, 32, 16, 8, 4, 2, 1],
            'min_dim': 512,
            'steps': [8, 16, 32, 64, 128, 256, 512],
            'min_sizes': [20, 51, 133, 215, 296, 378, 460],
            'max_sizes': [51, 133, 215, 296, 378, 460, 542],
            'aspect_ratios': [[2, 3], [2, 3, 5], [2, 3, 5], [2, 3, 5],
                              [2, 3, 5], [2, 3], [2, 3]],  # TODO
            'variance': [0.1, 0.2],
            'clip': True,
            'name': 'MINE'
        }
        # compute the sizes of the prior boxes, i.e., the default boxes
        self.priorbox = PriorBox(self.cfg)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = size

        # TBPP network
        self.vgg = nn.ModuleList(base)
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)
        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
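Assuming the common ssd.pytorch PriorBox convention of 2 + 2 * len(aspect_ratios) boxes per feature-map cell, the cfg above fixes the total prior count, which can be sanity-checked against self.priors:

feature_maps = [64, 32, 16, 8, 4, 2, 1]
aspect_ratios = [[2, 3], [2, 3, 5], [2, 3, 5], [2, 3, 5],
                 [2, 3, 5], [2, 3], [2, 3]]
num_priors = sum(f * f * (2 + 2 * len(ar))
                 for f, ar in zip(feature_maps, aspect_ratios))
print(num_priors)  # 35486 under this convention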
Example #7
    def __init__(self, num_classes):
        super(DSOD_64_16_1x1, self).__init__()
        self.num_classes = num_classes
        self.extractor = DenseNet_64_16_DSSD_s_Pred_D()
        self.loc_layers = nn.ModuleList()
        self.cls_layers = nn.ModuleList()
        self.cfg = cfg_320_64_16
        self.priorbox = PriorBox(self.cfg)
        self.priors = self.priorbox.forward()

        # in_channels = (768, 768, 768, 256, 256, 256) #pred C
        in_channels = (256, 256, 256, 256, 256, 256)  # pred D
        num_anchors = (4, 6, 6, 6, 4, 4)
        for inC, num_anchor in zip(in_channels, num_anchors):
            # self.loc_layers += [nn.Conv2d(inC, num_anchor*4, kernel_size=3, padding=1)]
            # self.cls_layers += [nn.Conv2d(inC, num_anchor* num_classes, kernel_size=3, padding=1)
            #                                   ]
            self.loc_layers += [
                nn.Sequential(
                    nn.Conv2d(inC,
                              num_anchor * 4,
                              kernel_size=1,
                              padding=0,
                              bias=False), nn.BatchNorm2d(num_anchor * 4))
            ]
            self.cls_layers += [
                nn.Sequential(
                    nn.Conv2d(inC,
                              num_anchor * num_classes,
                              kernel_size=1,
                              padding=0,
                              bias=False),
                    nn.BatchNorm2d(num_anchor * num_classes))
            ]
        self.normalize = nn.ModuleList(
            [L2Norm(chan, 20) for chan in in_channels])

        self.reset_parameters()
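A quick shape check of one of these 1x1 prediction heads (the 40x40 feature map and the 4-anchor count are illustrative):

import torch
import torch.nn as nn

head = nn.Sequential(
    nn.Conv2d(256, 4 * 4, kernel_size=1, padding=0, bias=False),
    nn.BatchNorm2d(4 * 4))
y = head(torch.randn(2, 256, 40, 40))
print(y.shape)  # torch.Size([2, 16, 40, 40]): 4 anchors * 4 offsets per cell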
Example #8
    def __init__(self, num_classes):
        super(DSOD_64_16_GN, self).__init__()
        self.num_classes = num_classes
        self.extractor = DSSD_s_GN()
        self.loc_layers = nn.ModuleList()
        self.cls_layers = nn.ModuleList()
        self.cfg = cfg_320_64_16
        self.priorbox = PriorBox(self.cfg)
        self.priors = self.priorbox.forward()

        in_channels = channel_dict['DSSD']
        num_anchors = (4, 6, 6, 6, 4, 4)
        for inC, num_anchor in zip(in_channels, num_anchors):
            # self.loc_layers += [nn.Conv2d(inC, num_anchor*4, kernel_size=3, padding=1)]
            # self.cls_layers += [nn.Conv2d(inC, num_anchor* num_classes, kernel_size=3, padding=1)
            #                                   ]
            self.loc_layers += [
                nn.Sequential(
                    nn.Conv2d(inC,
                              num_anchor * 4,
                              kernel_size=3,
                              padding=1,
                              bias=False), nn.GroupNorm(4, num_anchor * 4))
            ]
            self.cls_layers += [
                nn.Sequential(
                    nn.Conv2d(inC,
                              num_anchor * num_classes,
                              kernel_size=3,
                              padding=1,
                              bias=False),
                    nn.GroupNorm(num_classes, num_anchor * num_classes))
            ]
        self.normalize = nn.ModuleList(
            [L2Norm(chan, 20) for chan in in_channels])
        self.reset_parameters()
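Note the GroupNorm arguments: nn.GroupNorm(num_groups, num_channels) requires num_channels to be divisible by num_groups, which holds here because num_anchor * 4 is divisible by 4 and num_anchor * num_classes by num_classes. A tiny check:

import torch
import torch.nn as nn

gn = nn.GroupNorm(4, 6 * 4)  # 24 channels split into 4 groups
print(gn(torch.randn(2, 24, 10, 10)).shape)  # torch.Size([2, 24, 10, 10])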
Example #9
    def forward(self, x):
        img_size = x.size()[2:]
        source = []

        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(torch.cat((F.relu(x), F.relu(-x)), 1))

        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(torch.cat((F.relu(x), F.relu(-x)), 1))

        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        x = self.inception1(x)
        x = self.inception2(x)
        x = self.inception3(x)
        source.append(x)

        x = self.conv3_1(x)
        x = self.conv3_2(x)
        source.append(x)

        x = self.conv4_1(x)
        x = self.conv4_2(x)
        source.append(x)

        feature_maps = []
        for feat in source:
            feature_maps.append([feat.size(2), feat.size(3)])

        self.priors = Variable(PriorBox(img_size, feature_maps, cfg).forward())

        loc_preds, conf_preds = self.multilbox(source)

        if self.phase == 'test':
            output = self.test_det(loc_preds, self.softmax(conf_preds),
                                   self.priors)
        else:
            output = (loc_preds, conf_preds, self.priors)
        return output
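Because the priors are rebuilt from the observed feature-map sizes on every call, this forward tolerates different input resolutions; a hedged smoke test (construction of the module itself is outside this snippet):

import torch

net.phase = 'train'
loc_a, conf_a, priors_a = net(torch.randn(1, 3, 1024, 1024))
loc_b, conf_b, priors_b = net(torch.randn(1, 3, 512, 512))
# priors_a and priors_b differ in length, matching each input's feature maps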
Example #10
def train():
    if args.visdom:
        import visdom
        viz = visdom.Visdom()

    print('Loading the dataset...')
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCOroot):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because " +
                  "--dataset_root was not specified.")
            args.dataset_root = COCOroot
        cfg = coco_refinedet[args.input_size]
        train_sets = ['train2017']
        # train_sets = [('train2017', 'val2017')]
        dataset = COCODetection(COCOroot, train_sets,
                                SSDAugmentation(cfg['min_dim'], MEANS))
    elif args.dataset == 'VOC':
        '''if args.dataset_root == COCO_ROOT:
            parser.error('Must specify dataset if specifying dataset_root')'''
        cfg = voc_refinedet[args.input_size]
        dataset = VOCDetection(root=VOC_ROOT,
                               transform=SSDAugmentation(
                                   cfg['min_dim'], MEANS))
    print('Training RefineDet on:', dataset.name)
    print('Using the specified args:')
    print(args)

    refinedet_net = build_refinedet('train', int(args.input_size),
                                    cfg['num_classes'], backbone_dict)
    net = refinedet_net
    print(net)

    device = torch.device('cuda:0' if args.cuda else 'cpu')
    if args.ngpu > 1 and args.cuda:
        net = torch.nn.DataParallel(refinedet_net,
                                    device_ids=list(range(args.ngpu)))
    cudnn.benchmark = True
    net = net.to(device)

    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        state_dict = torch.load(args.resume)
        # create new OrderedDict that does not contain `module.`
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            head = k[:7]
            if head == 'module.':
                name = k[7:]  # remove `module.`
            else:
                name = k
            new_state_dict[name] = v
        refinedet_net.load_state_dict(new_state_dict)
    else:
        print('Initializing weights...')
        refinedet_net.init_weights(pretrained=pretrained)

    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    arm_criterion = RefineDetMultiBoxLoss(2, 0.5, True, 0, True, negpos_ratio,
                                          0.5, False, args.cuda)
    odm_criterion = RefineDetMultiBoxLoss(cfg['num_classes'],
                                          0.5,
                                          True,
                                          0,
                                          True,
                                          negpos_ratio,
                                          0.5,
                                          False,
                                          args.cuda,
                                          use_ARM=True)
    priorbox = PriorBox(cfg)
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.to(device)

    net.train()
    # loss counters
    arm_loc_loss = 0
    arm_conf_loss = 0
    odm_loc_loss = 0
    odm_conf_loss = 0
    epoch = 0 + args.resume_epoch

    epoch_size = math.ceil(len(dataset) / args.batch_size)
    max_iter = args.max_epoch * epoch_size

    stepvalues = (args.max_epoch * 2 // 3 * epoch_size,
                  args.max_epoch * 8 // 9 * epoch_size,
                  args.max_epoch * epoch_size)
    if args.dataset == 'VOC':
        stepvalues = (args.max_epoch * 2 // 3 * epoch_size,
                      args.max_epoch * 5 // 6 * epoch_size,
                      args.max_epoch * epoch_size)
    step_index = 0

    if args.resume_epoch > 0:
        start_iter = args.resume_epoch * epoch_size
        for step in stepvalues:
            if step < start_iter:
                step_index += 1
    else:
        start_iter = 0

    if args.visdom:
        vis_title = 'RefineDet.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot(viz, 'Iteration', 'Loss', vis_title,
                                    vis_legend)
        epoch_plot = create_vis_plot(viz, 'Epoch', 'Loss', vis_title,
                                     vis_legend)

    data_loader = data.DataLoader(dataset,
                                  args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)
    for iteration in range(start_iter, max_iter):
        if iteration % epoch_size == 0:
            if args.visdom and iteration != 0:
                update_vis_plot(viz, epoch, arm_loc_loss, arm_conf_loss,
                                epoch_plot, None, 'append', epoch_size)
                # reset epoch loss counters
                arm_loc_loss = 0
                arm_conf_loss = 0
                odm_loc_loss = 0
                odm_conf_loss = 0
            # create batch iterator
            batch_iterator = iter(data_loader)
            if (epoch % 10 == 0 and epoch > 0) or (epoch % 5 == 0 and epoch >
                                                   (args.max_epoch * 2 // 3)):
                torch.save(
                    net.state_dict(),
                    args.save_folder + 'RefineDet' + args.input_size + '_' +
                    args.dataset + '_epoches_' + repr(epoch) + '.pth')
            epoch += 1

        t0 = time.time()
        if iteration in stepvalues:
            step_index += 1
        lr = adjust_learning_rate(optimizer, args.gamma, epoch, step_index,
                                  iteration, epoch_size)

        # load train data
        images, targets = next(batch_iterator)
        images = images.to(device)
        targets = [ann.to(device) for ann in targets]
        # for an in targets:
        #     for instance in an:
        #         for cor in instance[:-1]:
        #             if cor < 0 or cor > 1:
        #                 raise StopIteration

        # forward
        out = net(images)

        # backprop
        optimizer.zero_grad()

        arm_loss_l, arm_loss_c = arm_criterion(out, priors, targets)
        odm_loss_l, odm_loss_c = odm_criterion(out, priors, targets)
        arm_loss = arm_loss_l + arm_loss_c
        odm_loss = odm_loss_l + odm_loss_c
        loss = arm_loss + odm_loss

        loss.backward()
        optimizer.step()

        arm_loc_loss += arm_loss_l.item()
        arm_conf_loss += arm_loss_c.item()
        odm_loc_loss += odm_loss_l.item()
        odm_conf_loss += odm_loss_c.item()
        t1 = time.time()
        batch_time = t1 - t0
        eta = int(batch_time * (max_iter - iteration))
        print('Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || ARM_L Loss: {:.4f} ARM_C Loss: {:.4f} ODM_L Loss: {:.4f} ODM_C Loss: {:.4f} loss: {:.4f} || LR: {:.8f} || Batchtime: {:.4f} s || ETA: {}'.\
            format(epoch, args.max_epoch, (iteration % epoch_size) + 1, epoch_size, iteration + 1, max_iter, arm_loss_l.item(), arm_loss_c.item(), odm_loss_l.item(), odm_loss_c.item(), loss.item(), lr, batch_time, str(datetime.timedelta(seconds=eta))))

        if args.visdom:
            update_vis_plot(viz, iteration, arm_loss_l.item(),
                            arm_loss_c.item(), iter_plot, epoch_plot, 'append')

    torch.save(
        refinedet_net.state_dict(), args.save_folder +
        '/RefineDet{}_{}_final.pth'.format(args.input_size, args.dataset))
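adjust_learning_rate is not shown here; one plausible implementation consistent with how it is called above, modeled on RefineDet.PyTorch (linear warm-up for the first epochs, then step decay by gamma; treat it as an assumption):

def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration,
                         epoch_size):
    # hypothetical: linear warm-up over the first 5 epochs, then step decay
    if epoch < 5:
        lr = 1e-6 + (args.lr - 1e-6) * iteration / (epoch_size * 5)
    else:
        lr = args.lr * (gamma ** step_index)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr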
Example #11
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    ## DATA loading code
    if args.dataset == 'COCO':
        train_sets = [('2014', 'train'), ('2014', 'valminusminival')]
        cfg = (COCO_300, COCO_512)[args.size == '512']
    elif args.dataset == 'VOC':
        train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
        cfg = (VOC_300, VOC_512)[args.size == '512']

    # other important parameters
    img_dim = (300, 512)[args.size == '512']
    rgb_means = ((104, 117, 123), (103.94, 116.78,
                                   123.68))[args.version == 'RFB_mobile']
    p = (0.6, 0.2)[args.version == 'RFB_mobile']
    num_classes = (21, 81)[args.dataset == 'COCO']

    if args.dataset == 'COCO':
        dataset = COCODetection(root=cfg['coco_root'],
                                image_sets=train_sets,
                                preproc=preproc(img_dim, rgb_means, p))
    elif args.dataset == 'VOC':
        dataset = VOCDetection(root=cfg['voc_root'],
                               image_sets=train_sets,
                               preproc=preproc(img_dim, rgb_means, p),
                               target_transform=AnnotationTransform())

    print('Training SSD on:', dataset.name)
    print('Loading the dataset...')
    train_loader = data.DataLoader(dataset,
                                   args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=detection_collate,
                                   pin_memory=True)

    print("Build RFB network")
    if args.version == 'RFB_vgg':
        model = RFB_Net_vgg('train', img_dim, num_classes)
    elif args.version == 'RFB_E_vgg':
        model = RFB_Net_E_vgg('train', img_dim, num_classes)
    elif args.version == 'RFB_mobile':
        model = RFB_Net_mobile('train', img_dim, num_classes)
    else:
        print('Unknown version!')

    if args.pretrained:
        base_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        model.base.load_state_dict(base_weights)

    model = model.cuda()
    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False)

    ## get the priorbox of ssd
    priorbox = PriorBox(cfg)
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['best_prec1']  # key written by save_checkpoint below
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        print('Initializing weights...')
        # initialize newly added layers' weights with xavier method
        model.extras.apply(weights_init)
        model.loc.apply(weights_init)
        model.conf.apply(weights_init)
        model.Norm.apply(weights_init)
        if args.version == 'RFB_E_vgg':
            model.reduce.apply(weights_init)
            model.up_reduce.apply(weights_init)

    print('Using the specified args:')
    print(args)
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        loss = train(train_loader, model, priors, criterion, optimizer, epoch)
        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = loss < minmum_loss
            minmum_loss = min(loss, minmum_loss)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': minmum_loss,
                    'optimizer': optimizer.state_dict(),
                }, is_best, epoch)
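Since the distributed branch initializes NCCL with the env:// method and reads args.local_rank, the script is meant to be started through the torch.distributed launcher; an illustrative invocation (the script name and flag values are assumptions):

# python -m torch.distributed.launch --nproc_per_node=4 train.py \
#     --dataset VOC --size 300 --version RFB_vgg --batch_size 32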
Example #12
def SSD300(input_shape, num_classes=21):
    """SSD300 architecture.

    # Arguments
        input_shape: Shape of the input image,
            expected to be either (300, 300, 3) or (3, 300, 300) (not tested).
        num_classes: Number of classes including background.

    # References
        https://arxiv.org/abs/1512.02325
    """
    print('begin building networks')
    kernel_size = (3, 3)

    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Conv2D(64,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv1_1')(net['input'])
    net['conv1_2'] = Conv2D(64,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2),
                                strides=(2, 2),
                                padding='same',
                                name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Conv2D(128,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv2_1')(net['pool1'])
    net['conv2_2'] = Conv2D(128,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2),
                                strides=(2, 2),
                                padding='same',
                                name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Conv2D(256,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv3_1')(net['pool2'])
    net['conv3_2'] = Conv2D(256,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Conv2D(256,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2),
                                strides=(2, 2),
                                padding='same',
                                name='pool3')(net['conv3_3'])
    # Block 4
    net['conv4_1'] = Conv2D(512,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv4_1')(net['pool3'])
    net['conv4_2'] = Conv2D(512,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv4_2')(net['conv4_1'])
    net['conv4_3'] = Conv2D(512,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv4_3')(net['conv4_2'])
    net['pool4'] = MaxPooling2D((2, 2),
                                strides=(2, 2),
                                padding='same',
                                name='pool4')(net['conv4_3'])
    # Block 5
    net['conv5_1'] = Conv2D(512,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv5_1')(net['pool4'])
    net['conv5_2'] = Conv2D(512,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv5_2')(net['conv5_1'])
    net['conv5_3'] = Conv2D(512,
                            kernel_size,
                            activation='relu',
                            padding='same',
                            name='conv5_3')(net['conv5_2'])
    net['pool5'] = MaxPooling2D((3, 3),
                                strides=(1, 1),
                                padding='same',
                                name='pool5')(net['conv5_3'])
    # FC6
    net['fc6'] = Conv2D(1024,
                        kernel_size,
                        dilation_rate=(6, 6),
                        activation='relu',
                        padding='same',
                        name='fc6')(net['pool5'])
    # x = Dropout(0.5, name='drop6')(x)
    # FC7
    net['fc7'] = Conv2D(1024, (1, 1),
                        activation='relu',
                        padding='same',
                        name='fc7')(net['fc6'])
    # x = Dropout(0.5, name='drop7')(x)
    # Block 6
    net['conv6_1'] = Conv2D(256, (1, 1),
                            activation='relu',
                            padding='same',
                            name='conv6_1')(net['fc7'])
    net['conv6_2'] = Conv2D(512,
                            kernel_size,
                            strides=(2, 2),
                            activation='relu',
                            padding='same',
                            name='conv6_2')(net['conv6_1'])
    # Block 7
    net['conv7_1'] = Conv2D(128, (1, 1),
                            activation='relu',
                            padding='same',
                            name='conv7_1')(net['conv6_2'])
    net['conv7_2'] = ZeroPadding2D()(net['conv7_1'])
    net['conv7_2'] = Conv2D(256,
                            kernel_size,
                            strides=(2, 2),
                            activation='relu',
                            padding='valid',
                            name='conv7_2')(net['conv7_2'])
    # Block 8
    net['conv8_1'] = Conv2D(128, (1, 1),
                            activation='relu',
                            padding='same',
                            name='conv8_1')(net['conv7_2'])
    net['conv8_2'] = Conv2D(256,
                            kernel_size,
                            strides=(2, 2),
                            activation='relu',
                            padding='same',
                            name='conv8_2')(net['conv8_1'])
    # Last Pool
    net['pool6'] = GlobalAveragePooling2D(name='pool6')(net['conv8_2'])
    print('base network built')

    # Prediction from conv4_3
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
    num_priors = 3
    x = Conv2D(num_priors * 4,
               kernel_size,
               padding='same',
               name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_loc'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    name = 'conv4_3_norm_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes,
               kernel_size,
               padding='same',
               name=name)(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_conf'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    priorbox = PriorBox(img_size,
                        30.0,
                        aspect_ratios=[2],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv4_3_norm_mbox_priorbox')
    net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])
    print('conv4_3_norm_mbox_priorbox built')
    # Prediction from fc7
    num_priors = 6
    net['fc7_mbox_loc'] = Conv2D(num_priors * 4,
                                 kernel_size,
                                 padding='same',
                                 name='fc7_mbox_loc')(net['fc7'])
    flatten = Flatten(name='fc7_mbox_loc_flat')
    net['fc7_mbox_loc_flat'] = flatten(net['fc7_mbox_loc'])
    name = 'fc7_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    net['fc7_mbox_conf'] = Conv2D(num_priors * num_classes, (3, 3),
                                  padding='same',
                                  name=name)(net['fc7'])
    flatten = Flatten(name='fc7_mbox_conf_flat')
    net['fc7_mbox_conf_flat'] = flatten(net['fc7_mbox_conf'])
    priorbox = PriorBox(img_size,
                        60.0,
                        max_size=114.0,
                        aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='fc7_mbox_priorbox')
    net['fc7_mbox_priorbox'] = priorbox(net['fc7'])
    print('fc7_mbox_priorbox built')
    # Prediction from conv6_2
    num_priors = 6
    x = Conv2D(num_priors * 4,
               kernel_size,
               padding='same',
               name='conv6_2_mbox_loc')(net['conv6_2'])
    net['conv6_2_mbox_loc'] = x
    flatten = Flatten(name='conv6_2_mbox_loc_flat')
    net['conv6_2_mbox_loc_flat'] = flatten(net['conv6_2_mbox_loc'])
    name = 'conv6_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes,
               kernel_size,
               padding='same',
               name=name)(net['conv6_2'])
    net['conv6_2_mbox_conf'] = x
    flatten = Flatten(name='conv6_2_mbox_conf_flat')
    net['conv6_2_mbox_conf_flat'] = flatten(net['conv6_2_mbox_conf'])
    priorbox = PriorBox(img_size,
                        114.0,
                        max_size=168.0,
                        aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv6_2_mbox_priorbox')
    net['conv6_2_mbox_priorbox'] = priorbox(net['conv6_2'])
    print('conv6_2_mbox_priorbox built')
    # Prediction from conv7_2
    num_priors = 6
    x = Conv2D(num_priors * 4,
               kernel_size,
               padding='same',
               name='conv7_2_mbox_loc')(net['conv7_2'])
    net['conv7_2_mbox_loc'] = x
    flatten = Flatten(name='conv7_2_mbox_loc_flat')
    net['conv7_2_mbox_loc_flat'] = flatten(net['conv7_2_mbox_loc'])
    name = 'conv7_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes,
               kernel_size,
               padding='same',
               name=name)(net['conv7_2'])
    net['conv7_2_mbox_conf'] = x
    flatten = Flatten(name='conv7_2_mbox_conf_flat')
    net['conv7_2_mbox_conf_flat'] = flatten(net['conv7_2_mbox_conf'])
    priorbox = PriorBox(img_size,
                        168.0,
                        max_size=222.0,
                        aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv7_2_mbox_priorbox')
    net['conv7_2_mbox_priorbox'] = priorbox(net['conv7_2'])
    print('conv7_2_mbox_priorbox built')
    # Prediction from conv8_2
    num_priors = 6
    x = Conv2D(num_priors * 4,
               kernel_size,
               padding='same',
               name='conv8_2_mbox_loc')(net['conv8_2'])
    net['conv8_2_mbox_loc'] = x
    flatten = Flatten(name='conv8_2_mbox_loc_flat')
    net['conv8_2_mbox_loc_flat'] = flatten(net['conv8_2_mbox_loc'])
    name = 'conv8_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes,
               kernel_size,
               padding='same',
               name=name)(net['conv8_2'])
    net['conv8_2_mbox_conf'] = x
    flatten = Flatten(name='conv8_2_mbox_conf_flat')
    net['conv8_2_mbox_conf_flat'] = flatten(net['conv8_2_mbox_conf'])
    priorbox = PriorBox(img_size,
                        222.0,
                        max_size=276.0,
                        aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv8_2_mbox_priorbox')
    net['conv8_2_mbox_priorbox'] = priorbox(net['conv8_2'])
    print('conv8_2_mbox_priorbox built')
    # Prediction from pool6
    num_priors = 6
    x = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(net['pool6'])
    net['pool6_mbox_loc_flat'] = x
    name = 'pool6_mbox_conf_flat'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Dense(num_priors * num_classes, name=name)(net['pool6'])
    net['pool6_mbox_conf_flat'] = x
    priorbox = PriorBox(img_size,
                        276.0,
                        max_size=330.0,
                        aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='pool6_mbox_priorbox')
    if K.image_dim_ordering() == 'tf':
        target_shape = (1, 1, 256)
    else:
        target_shape = (256, 1, 1)
    net['pool6_reshaped'] = Reshape(target_shape,
                                    name='pool6_reshaped')(net['pool6'])
    net['pool6_mbox_priorbox'] = priorbox(net['pool6_reshaped'])
    print('pool6_mbox_priorbox built')
    # Gather all predictions
    net['mbox_loc'] = concatenate([
        net['conv4_3_norm_mbox_loc_flat'], net['fc7_mbox_loc_flat'],
        net['conv6_2_mbox_loc_flat'], net['conv7_2_mbox_loc_flat'],
        net['conv8_2_mbox_loc_flat'], net['pool6_mbox_loc_flat']
    ],
                                  axis=1,
                                  name='mbox_loc')
    net['mbox_conf'] = concatenate([
        net['conv4_3_norm_mbox_conf_flat'], net['fc7_mbox_conf_flat'],
        net['conv6_2_mbox_conf_flat'], net['conv7_2_mbox_conf_flat'],
        net['conv8_2_mbox_conf_flat'], net['pool6_mbox_conf_flat']
    ],
                                   axis=1,
                                   name='mbox_conf')
    net['mbox_priorbox'] = concatenate([
        net['conv4_3_norm_mbox_priorbox'], net['fc7_mbox_priorbox'],
        net['conv6_2_mbox_priorbox'], net['conv7_2_mbox_priorbox'],
        net['conv8_2_mbox_priorbox'], net['pool6_mbox_priorbox']
    ],
                                       axis=1,
                                       name='mbox_priorbox')
    print('gathering all prediction layers built')
    if hasattr(net['mbox_loc'], '_keras_shape'):
        # divide by 4 coordinates: [xmin, ymin, xmax, ymax]
        num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    elif hasattr(K, 'int_shape'):
        num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4),
                              name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes),
                               name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax',
                                  name='mbox_conf_final')(net['mbox_conf'])
    net['predictions'] = concatenate(
        [net['mbox_loc'], net['mbox_conf'], net['mbox_priorbox']],
        axis=2,
        name='predictions')
    print('prediction layers built')
    model = Model(net['input'], net['predictions'])
    return model
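A minimal construction sketch (the weight file name is illustrative):

model = SSD300((300, 300, 3), num_classes=21)
model.load_weights('weights_SSD300.hdf5', by_name=True)  # hypothetical file
model.summary()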
Example #13
class TBPP(nn.Module):
    def __init__(self, phase, size, base, extras, head, num_classes):
        super(TBPP, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        self.cfg = {
            'num_classes': 2,
            'lr_steps': (80000, 100000, 120000),
            'max_iter': 120000,
            'feature_maps': [64, 32, 16, 8, 4, 2, 1],
            'min_dim': 512,
            'steps': [8, 16, 32, 64, 128, 256, 512],
            'min_sizes': [20, 51, 133, 215, 296, 378, 460],
            'max_sizes': [51, 133, 215, 296, 378, 460, 542],
            'aspect_ratios': [[2, 3], [2, 3, 5], [2, 3, 5], [2, 3, 5],
                              [2, 3, 5], [2, 3], [2, 3]],  # TODO
            'variance': [0.1, 0.2],
            'clip': True,
            'name': 'MINE'
        }
        # compute the sizes of the prior boxes, i.e., the default boxes
        self.priorbox = PriorBox(self.cfg)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = size

        # TBPP network
        self.vgg = nn.ModuleList(base)
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)
        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

    def forward(self, x):
        sources = list()
        loc = list()
        conf = list()

        # apply vgg up to conv4_3 relu
        for k in range(23):
            x = self.vgg[k](x)
        s = self.L2Norm(x)
        sources.append(s)

        # apply vgg up to fc7
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),  # loc predictions
                self.softmax(conf.view(conf.size(0), -1,
                                       self.num_classes)),  # conf predictions
                self.priors.type(type(
                    x.data)))  # prior boxes, i.e. default boxes
        else:
            output = (loc.view(loc.size(0), -1, 4),
                      conf.view(conf.size(0), -1, self.num_classes),
                      self.priors)

        return output

    def load_weights(self, base_file):
        other, ext = os.path.splitext(base_file)
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict ...')
            self.load_state_dict(
                torch.load(base_file,
                           map_location=lambda storage, loc: storage))
            print('Loaded!')
        else:
            print('Sorry, only .pth and .pkl files are supported.')
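A hedged smoke test of the test phase (the base/extras/head builders come from elsewhere in the project):

import torch

net = TBPP('test', 512, base, extras, head, num_classes=2)
with torch.no_grad():
    detections = net(torch.randn(1, 3, 512, 512))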
Example #14
class SSD(nn.Module):
    """Single Shot Multibox Architecture
    The network is composed of a base VGG network followed by the
    added multibox conv layers.  Each multibox layer branches into
        1) conv2d for class conf scores
        2) conv2d for localization predictions
        3) associated priorbox layer to produce default bounding
           boxes specific to the layer's feature map size.
    See: https://arxiv.org/pdf/1512.02325.pdf for more details.

    Args:
        phase: (string) Can be "test" or "train"
        base: VGG16 layers for input, size of either 300 or 500
        extras: extra layers that feed to multibox loc and conf layers
        head: "multibox head" consists of loc and conf conv layers
    """
    def __init__(self,
                 config,
                 phase,
                 base,
                 extras,
                 head,
                 num_classes,
                 top_k=200):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # TODO: implement __call__ in PriorBox
        self.priorbox = PriorBox(config)
        self.priors = Variable(self.priorbox.forward(), volatile=True)

        # SSD network
        self.vgg = nn.ModuleList(base)
        # Layer learns to scale the l2 normalized features from conv4_3
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)

        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes,
                                 0,
                                 top_k,
                                 0.01,
                                 0.45,
                                 variance=config['variance'])

    def forward(self, x):
        """Applies network layers and ops on input image(s) x.

        Args:
            x: input image or batch of images. Shape: [batch,3,300,300].

        Return:
            Depending on phase:
            test:
                Variable(tensor) of output class label predictions,
                confidence score, and corresponding location predictions for
                each object detected. Shape: [batch,topk,7]

            train:
                list of concat outputs from:
                    1: confidence layers, Shape: [batch*num_priors,num_classes]
                    2: localization layers, Shape: [batch,num_priors*4]
                    3: priorbox layers, Shape: [2,num_priors*4]
        """
        sources = list()
        loc = list()
        conf = list()

        # apply vgg up to conv4_3 relu
        for k in range(23):
            x = self.vgg[k](x)

        s = self.L2Norm(x)
        sources.append(s)

        # apply vgg up to fc7
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),  # loc preds
                self.softmax(conf.view(-1, self.num_classes)),  # conf preds
                self.priors.type(type(x.data))  # default boxes
            )
        else:
            output = (loc.view(loc.size(0), -1, 4),
                      conf.view(conf.size(0), -1, self.num_classes),
                      self.priors)
        return output

    def load_weights(self, base_file):
        other, ext = os.path.splitext(base_file)
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict...')
            self.load_state_dict(
                torch.load(base_file,
                           map_location=lambda storage, loc: storage))
            print('Finished!')
        else:
            print('Sorry only .pth and .pkl files supported.')
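An illustrative end-to-end call for the test phase; the weight path and the output shape follow common ssd.pytorch usage and are assumptions:

net = SSD(config, 'test', base, extras, head, num_classes)
net.load_weights('ssd300_voc.pth')             # illustrative checkpoint
detections = net(torch.randn(1, 3, 300, 300))  # [batch, num_classes, top_k, 5]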
Example #15
def simple_SSD(input_shape, num_classes, min_size, num_priors, max_size,
               aspect_ratios, variances):
    input_tensor = Input(shape=input_shape)

    body = Convolution2D(16, 7, 7)(input_tensor)
    body = Activation('relu')(body)
    body = MaxPooling2D(2, 2, border_mode='valid')(body)

    body = Convolution2D(32, 5, 5)(body)
    body = Activation('relu')(body)
    branch_1 = MaxPooling2D(2, 2, border_mode='valid')(body)

    body = Convolution2D(64, 3, 3)(branch_1)
    body = Activation('relu')(body)
    branch_2 = MaxPooling2D(2, 2, border_mode='valid')(body)

    # first branch
    norm_1 = Normalize(20)(branch_1)
    localization_1 = Convolution2D(num_priors * 4, 3, 3,
                                   border_mode='same')(norm_1)
    localization_1 = Flatten()(localization_1)
    classification_1 = Convolution2D(num_priors * num_classes,
                                     3,
                                     3,
                                     border_mode='same')(norm_1)
    classification_1 = Flatten()(classification_1)
    prior_boxes_1 = PriorBox(input_shape[0:2], min_size, max_size,
                             aspect_ratios)(norm_1)

    # second branch
    norm_2 = Normalize(20)(branch_2)
    localization_2 = Convolution2D(num_priors * 4, 3, 3,
                                   border_mode='same')(norm_2)
    localization_2 = Flatten()(localization_2)
    classification_2 = Convolution2D(num_priors * num_classes,
                                     3,
                                     3,
                                     border_mode='same')(norm_2)
    classification_2 = Flatten()(classification_2)
    prior_boxes_2 = PriorBox(input_shape[0:2], min_size, max_size,
                             aspect_ratios)(norm_2)

    localization_head = merge([localization_1, localization_2],
                              mode='concat',
                              concat_axis=1)

    classification_head = merge([classification_1, classification_2],
                                mode='concat',
                                concat_axis=1)

    prior_boxes_head = merge([prior_boxes_1, prior_boxes_2],
                             mode='concat',
                             concat_axis=1)

    if hasattr(localization_head, '_keras_shape'):
        num_boxes = localization_head._keras_shape[-1] // 4
    elif hasattr(K, 'int_shape'):
        num_boxes = K.int_shape(localization_head)[-1] // 4

    localization_head = Reshape((num_boxes, 4))(localization_head)
    classification_head = Reshape(
        (num_boxes, num_classes))(classification_head)
    classification_head = Activation('softmax')(classification_head)
    predictions = merge([localization_head, classification_head,
                         prior_boxes_head],
                        mode='concat',
                        concat_axis=2)

    model = Model(input_tensor, predictions)

    return model
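If this is ported to Keras 2, the merge utilities used above no longer exist; the equivalent concatenation (an assumption about the target Keras version) is:

from keras.layers import concatenate
predictions = concatenate(
    [localization_head, classification_head, prior_boxes_head], axis=2)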
Example #16
class SSD300(nn.Module):

    def __init__(self, num_classes, phase, pretrain=False, finetune=None):
        super(SSD300, self).__init__()
        self.num_classes = num_classes
        self.phase = phase
        
        self.base_net = self._base_net()
        self.extra_net = self._extra_net()
        self.loc_pred, self.cls_pred = self._predict_net()

        self.L2Norm = L2Norm(512, 20)
        self.priorbox = PriorBox(v2)
        self.priors = Variable(self.priorbox.forward(), volatile=True)

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

        self._init_weight()
        if pretrain:
            self._load_weight()
        if finetune is not None:
            self._finetune(finetune)

    def _base_net(self):
        """Use vgg16 as basenet. 
        Refer https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py.

        Returns:
            basenet: (ModuleList)
        """
        def make_layers(cfg, batch_norm=False):
            layers = []
            in_channels = 3
            for v in cfg:
                if v == 'M':
                    layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
                elif v == 'C':
                    layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
                else:
                    conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
                    if batch_norm:
                        layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                    else:
                        layers += [conv2d, nn.ReLU(inplace=True)]
                    in_channels = v
            pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
            conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
            conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
            layers += [pool5, conv6, nn.ReLU(inplace=True),
                              conv7, nn.ReLU(inplace=True)]
            return layers

        cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 
               512, 512, 512, 'M', 512, 512, 512]
        return nn.ModuleList(make_layers(cfg))

    def _extra_net(self):
        """Extra layers in SSD300, conv8,9,10,11
        Refer https://arxiv.org/pdf/1512.02325.pdf

        Returns:
            extra_net: (ModuleList)
        """
        def make_layers(cfg, batch_norm=False):
            layers = []
            in_channels = 1024
            flag = False

            for i, v in enumerate(cfg):
                if in_channels == 'S':
                    in_channels = v
                    continue
                _kernel_size = (1, 3)[flag]
                if v == 'S':
                    conv = nn.Conv2d(in_channels, cfg[i+1],
                                     _kernel_size, stride=2, padding=1)
                else:
                    conv = nn.Conv2d(in_channels, v, _kernel_size)
                layers += [conv]
                in_channels = v
                flag = not flag
            return layers

        cfg = [256, 'S', 512, 
               128, 'S', 256, 
               128, 256, 
               128, 256]
        return nn.ModuleList(make_layers(cfg))

    def _predict_net(self):
        """Predict layer, cls and loc

        Returns:
            loc_layers:  [list], len=6
            conf_layers: [list], len=6

        """
        loc_layers = []
        conf_layers = []
        in_channels = [512, 1024, 512, 256, 256, 256]
        mboxes = [4, 6, 6, 6, 4, 4] # number of boxes per feature map location
        for (in_channels, mbox) in zip(in_channels, mboxes):
            loc_layers += [nn.Conv2d(in_channels, mbox*4, kernel_size=3, padding=1)]
            conf_layers += [nn.Conv2d(in_channels, mbox*self.num_classes, kernel_size=3, padding=1)]
        return nn.ModuleList(loc_layers), nn.ModuleList(conf_layers)

    def forward(self, x):
        """Apply network layers and ops on input image(s) x.

        Args:
            x (tensor): input image or batch of image.
                Shape: [batch, 3, 300, 300]

        Returns: 
            Depending on phase;
            train:
                list of concat outputs from:
                    1: confidence layers, Shape: [batch*num_priors, num_classes]
                    2: localization layers, Shape: [batch, num_priors*4]
                    3: priorbox layers, Shape: [2, num_priors*4]
            test:
                Variable(tensor) of output class label predictions,
                confidence scores, and corresponding location predictions
                for each object detected. Shape: [batch,topk,7]
        """
        sources = [] # feature maps where to make predictions 
        conf = []
        loc = []

        # apply vgg, without BatchNorm
        pred_index = [22,] # conv4_3 relu
        for k, v in enumerate(self.base_net):
            x = v(x)
            if k in pred_index:
                sources.append(self.L2Norm(x))
        sources.append(x)

        # apply extra_net and cache source layer outputs
        pred_index = [1,3,5,7]
        for k, v in enumerate(self.extra_net):
            x = v(x) 
            if k in pred_index:
                sources.append(x) 
        
        # apply predict_net to source layers
        for (x, l, c) in zip(sources, self.loc_pred, self.cls_pred):
            loc.append(l(x).permute(0,2,3,1).contiguous()) # [B,C,H,W] -> [B,H,W,C]
            conf.append(c(x).permute(0,2,3,1).contiguous())

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) # to concat pred from many layers
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        if self.phase == 'train':
            output = (
                    loc.view(loc.size(0), -1, 4),
                    conf.view(loc.size(0), -1, self.num_classes),
                    self.priors
            )
        else:
            output = self.detect(
                    loc.view(loc.size(0), -1, 4),
                    self.softmax(conf.view(-1, self.num_classes)),
                    self.priors.type(type(x.data))
                )
        return output

    def _init_weight(self):
        def weight_init(m):
            if isinstance(m, nn.Conv2d):
                init.xavier_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()
        self.apply(weight_init)

    def _fetch_weight(self):
        """Fetch pretrain model using torchvision.
        Returns: 
            weight_file: (str) pretrained weight file path

        """
        print('Fetching pretrained model...')
        vgg16 = models.vgg16(pretrained=True)  # downloads weights into the torchvision cache
        model_file = os.path.join(os.environ['HOME'], '.torch/models', 'vgg16-*.pth')
        return glob.glob(model_file)[0]

    def _load_weight(self, weight_file=None):
        """Load pretrained weights.

        source: features.[0-28].[weight,bias], classifier.[0,3,6].[weight,bias]
        target: base_net.[0-28].[weight,bias], base_net.[31,33].[weight,bias] -> loaded from pretrained model
                extra_net.[0-7].*, loc_pred.[0-5].*, cls_pred.[0-5].* -> kept as initialized

        Kwargs:
            weight_file (str): *.pth file path

        Returns: None

        """

        if weight_file is None:
            weight_file = self._fetch_weight()

        _, ext = os.path.splitext(weight_file)

        def downsample(fc, layer):
            """Subsample fc6/fc7 weights and biases into conv6/conv7 kernels.

            w: fc6 [4096, 512*7*7] -> [1024, 512, 3, 3]
               fc7 [4096, 4096]    -> [1024, 1024, 1, 1]
            b: [4096] -> [1024]
            """
            fc = fc.view(4, 1024, -1)[0]  # keep the first 1024 of 4096 output channels
            if fc.size(1) > 1:  # weight
                if layer == 'fc6':
                    fc = fc.view(1024, 512, 7, 7)[:, :, 0::3, 0::3]  # 7x7 -> 3x3 spatial subsample
                elif layer == 'fc7':
                    fc = fc.view(4, 1024, 1024, 1, 1)[0]
            else:  # bias
                fc = fc[:, 0]
            return fc

        if ext in ('.pkl', '.pth'):
            source_dict = torch.load(weight_file)
            # remap: features.* -> base_net.*, classifier.{0,3} -> base_net.{31,33}
            target_dict = {}
            for key in source_dict.keys():
                if 'features' in key:  # conv1-5
                    target_dict['base_net' + key[8:]] = source_dict[key]
                elif 'classifier.0' in key:  # conv6
                    target_dict['base_net.31' + key[12:]] = downsample(source_dict[key], 'fc6')
                elif 'classifier.3' in key:  # conv7
                    target_dict['base_net.33' + key[12:]] = downsample(source_dict[key], 'fc7')
            # keep the current values for every key not covered above
            for key, value in self.state_dict().items():
                if key not in target_dict:
                    target_dict[key] = value
            self.load_state_dict(target_dict)
            print('Loaded imagenet weights successfully!')
        else:
            print('Sorry, only .pth and .pkl files are supported')
            
    def _finetune(self, weight_file):
        _, ext = os.path.splitext(weight_file)
        if ext in ('.pkl', '.pth'):
            source_dict = torch.load(weight_file)

            # drop layers whose shape depends on num_classes
            target_dict = {}
            for key in source_dict.keys():
                if 'cls_pred' not in key:
                    target_dict[key] = source_dict[key]
            # keep the current values for every key not covered above
            for key, value in self.state_dict().items():
                if key not in target_dict:
                    target_dict[key] = value
            self.load_state_dict(target_dict)
            print('Loaded finetune weights successfully!')
        else:
            print('Sorry, only .pth and .pkl files are supported')
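
# A quick, standalone shape check of the fc->conv downsampling used in
# _load_weight above: VGG16's fc6 weight [4096, 512*7*7] is cut to the first
# quarter of its output channels via view(4, 1024, -1)[0], then spatially
# subsampled with stride 3 into a [1024, 512, 3, 3] conv6 kernel; fc7
# [4096, 4096] becomes a [1024, 1024, 1, 1] conv7 kernel the same way.
import torch

fc6_w = torch.randn(4096, 512 * 7 * 7)
fc6_w = fc6_w.view(4, 1024, -1)[0]                     # keep 1024 of 4096 output channels
fc6_w = fc6_w.view(1024, 512, 7, 7)[:, :, 0::3, 0::3]  # 7x7 -> 3x3 subsample
assert fc6_w.shape == (1024, 512, 3, 3)

fc7_w = torch.randn(4096, 4096)
fc7_w = fc7_w.view(4, 1024, -1)[0].view(4, 1024, 1024, 1, 1)[0]
assert fc7_w.shape == (1024, 1024, 1, 1)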
Example #17
class DSOD_64_16(nn.Module):
    def __init__(self, num_classes):
        super(DSOD_64_16, self).__init__()
        self.num_classes = num_classes
        self.extractor = DenseNet_64_16()
        self.loc_layers = nn.ModuleList()
        self.cls_layers = nn.ModuleList()
        self.cfg = cfg_320_64_16
        self.priorbox = PriorBox(self.cfg)
        self.priors = self.priorbox.forward()

        in_channels = channel_dict['6416']
        num_anchors = (4, 6, 6, 6, 4, 4)
        for inC, num_anchor in zip(in_channels, num_anchors):
            self.loc_layers += [
                nn.Sequential(
                    nn.Conv2d(inC,
                              num_anchor * 4,
                              kernel_size=3,
                              padding=1,
                              bias=False), nn.BatchNorm2d(num_anchor * 4))
            ]
            self.cls_layers += [
                nn.Sequential(
                    nn.Conv2d(inC,
                              num_anchor * num_classes,
                              kernel_size=3,
                              padding=1,
                              bias=False),
                    nn.BatchNorm2d(num_anchor * num_classes))
            ]
        self.normalize = nn.ModuleList(
            [L2Norm(chan, 20) for chan in in_channels])

        self.reset_parameters()

    def forward(self, x):
        loc_preds = []
        cls_preds = []
        xs = self.extractor(x)
        for i, x in enumerate(xs):
            x = self.normalize[i](x)
            loc_pred = self.loc_layers[i](x)
            loc_pred = loc_pred.permute(0, 2, 3, 1).contiguous()
            loc_preds.append(loc_pred.view(loc_pred.size(0), -1, 4))

            cls_pred = self.cls_layers[i](x)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous()
            cls_preds.append(
                cls_pred.view(cls_pred.size(0), -1, self.num_classes))

        loc = torch.cat([o.view(o.size(0), -1) for o in loc_preds], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in cls_preds], 1)

        output = (loc.view(loc.size(0), -1, 4),
                  conf.view(conf.size(0), -1, self.num_classes),
                  self.priors)
        return output

    def reset_parameters(self):
        # note: parameters have no .weight attribute, so the init must walk
        # modules rather than named_parameters()
        for m in self.extractor.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight,
                                        gain=nn.init.calculate_gain('relu'))

        for m in self.loc_layers.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.01)

        for m in self.cls_layers.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, std=0.01)
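
# A minimal sketch (with made-up sizes) of the permute/view/cat pattern both
# forward passes above rely on: a head conv emits [B, k*4, H, W]; permuting to
# channels-last before view keeps the 4 offsets of each anchor contiguous, so
# the flattened tensor lines up with the row order of the priors.
import torch

B, k, H, W = 2, 6, 10, 10
loc_map = torch.randn(B, k * 4, H, W)           # raw output of one loc head
loc = loc_map.permute(0, 2, 3, 1).contiguous()  # [B, H, W, k*4]
loc = loc.view(B, -1, 4)                        # [B, H*W*k, 4]
assert loc.shape == (B, H * W * k, 4)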
Example #18
            cv2.rectangle(img, (p1[0] - 2 // 2, p1[1] - 2 - baseline),
                          (p1[0] + text_size[0], p1[1] + text_size[1]),
                          [255, 0, 0], -1)
            cv2.putText(img, conf, (p1[0], p1[1] + baseline),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1, 8)

    t2 = time.time()
    print('detect:{} timer:{}'.format(img_path, t2 - t1))

    cv2.imwrite(os.path.join(args.save_dir, os.path.basename(img_path)), img)


if __name__ == '__main__':
    # load PriorBox
    with torch.no_grad():
        priorbox = PriorBox(input_size=[640, 640], cfg=cfg)
        priors = priorbox.forward()
        priors = priors.cuda()

    net = build_net('test', cfg.NUM_CLASSES)
    net.load_state_dict(torch.load(args.model))
    net.eval()

    if use_cuda:
        net.cuda()
        cudnn.benchmark = True

    img_path = './img'
    img_list = [
        os.path.join(img_path, x) for x in os.listdir(img_path)
        if x.endswith('jpg')
    ]
Example #19
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    # build PyramidBox network
    print("Building net...")
    pyramidbox = build_net('train', cfg.NUM_CLASSES)
    model = pyramidbox

    if args.pretrained:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Load base network....')
        model.vgg.load_state_dict(vgg_weights)

    # for multi gpu
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)

    model = model.cuda()
    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion1 = MultiBoxLoss(cfg, True)
    criterion2 = MultiBoxLoss(cfg, True, use_head_loss=True)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['minmum_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        print('Initializing weights...')
        pyramidbox.extras.apply(pyramidbox.weights_init)
        pyramidbox.lfpn_topdown.apply(pyramidbox.weights_init)
        pyramidbox.lfpn_later.apply(pyramidbox.weights_init)
        pyramidbox.cpm.apply(pyramidbox.weights_init)
        pyramidbox.loc_layers.apply(pyramidbox.weights_init)
        pyramidbox.conf_layers.apply(pyramidbox.weights_init)

    print('Loading wider dataset...')
    train_dataset = WIDERDetection(cfg.FACE.TRAIN_FILE, mode='train')

    val_dataset = WIDERDetection(cfg.FACE.VAL_FILE, mode='val')

    train_loader = data.DataLoader(train_dataset,
                                   args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=detection_collate,
                                   pin_memory=True)
    val_batchsize = args.batch_size // 2
    val_loader = data.DataLoader(val_dataset,
                                 val_batchsize,
                                 num_workers=args.num_workers,
                                 shuffle=False,
                                 collate_fn=detection_collate,
                                 pin_memory=True)

    print('Using the specified args:')
    print(args)

    # load PriorBox
    with torch.no_grad():
        priorbox = PriorBox(input_size=[640, 640], cfg=cfg)
        priors = priorbox.forward()
        priors = priors.cuda()

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        end = time.time()
        train_loss = train(train_loader, model, priors, criterion1, criterion2,
                           optimizer, epoch)
        val_loss = val(val_loader, model, priors, criterion1, criterion2)
        if args.local_rank == 0:
            is_best = val_loss < minmum_loss
            minmum_loss = min(val_loss, minmum_loss)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'minmum_loss': minmum_loss,
                    'optimizer': optimizer.state_dict(),
                }, is_best, epoch)
        epoch_time = time.time() - end
        print('Epoch %s time cost %f' % (epoch, epoch_time))
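
# save_checkpoint is not shown in this snippet; a minimal sketch consistent
# with how it is called above (the file naming and 'weights' directory are
# assumptions, not the repo's actual scheme):
import os
import shutil
import torch

def save_checkpoint(state, is_best, epoch, save_dir='weights'):
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, 'checkpoint_{}.pth'.format(epoch))
    torch.save(state, path)  # state carries epoch, model/optimizer state dicts, loss
    if is_best:
        shutil.copyfile(path, os.path.join(save_dir, 'model_best.pth'))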
Example #20
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    # build RetinaFace network
    print("Building net...")
    model = RetinaFace(cfg=cfg)
    print("Printing net...")

    # for multi gpu
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)

    model = model.cuda()
    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.35, True, 0, True, 7, 0.35,
                             False)
    # positional args: num_classes, overlap_thresh, prior_for_matching,
    # bkg_label, neg_mining, neg_pos (ratio), neg_overlap, encode_target

    ## dataset
    print("loading dataset")
    train_dataset = WiderFaceDetection(
        args.training_dataset, preproc(cfg['image_size'], cfg['rgb_mean']))

    train_loader = data.DataLoader(train_dataset,
                                   args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=detection_collate,
                                   pin_memory=True)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['minmum_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    print('Using the specified args:')
    print(args)
    # load PriorBox
    print("Load priorbox")
    with torch.no_grad():
        priorbox = PriorBox(cfg=cfg,
                            image_size=(cfg['image_size'], cfg['image_size']))
        priors = priorbox.forward()
        priors = priors.cuda()

    print("start traing")
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss = train(train_loader, model, priors, criterion, optimizer,
                           epoch)
        if args.local_rank == 0:
            is_best = train_loss < minmum_loss
            minmum_loss = min(train_loss, minmum_loss)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'minmum_loss': minmum_loss,
                    'optimizer': optimizer.state_dict(),
                }, is_best, epoch)
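
# detection_collate is imported elsewhere; a common implementation for these
# detectors (a sketch, not necessarily the repo's code) stacks images but keeps
# per-image targets in a list, since images carry different numbers of boxes:
import torch

def detection_collate(batch):
    imgs, targets = [], []
    for img, boxes in batch:
        imgs.append(img)
        targets.append(torch.as_tensor(boxes, dtype=torch.float32))
    return torch.stack(imgs, 0), targets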
Example #21
def mini_SSD300(input_shape=(300,300,3), num_classes=21):
    """SSD300 architecture.

    # Arguments
        input_shape: Shape of the input image,
            expected to be either (300, 300, 3) or (3, 300, 300) (not tested).
        num_classes: Number of classes including background.

    # References
        https://arxiv.org/abs/1512.02325
    """
    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Convolution2D(64, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv1_1')(net['input'])
    net['conv1_2'] = Convolution2D(64, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Convolution2D(128, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv2_1')(net['pool1'])
    net['conv2_2'] = Convolution2D(128, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_1')(net['pool2'])
    net['conv3_2'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool3')(net['conv3_3'])
    # Block 4
    net['conv4_1'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv4_1')(net['pool3'])
    net['conv4_2'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv4_2')(net['conv4_1'])
    net['conv4_3'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv4_3')(net['conv4_2'])
    net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool4')(net['conv4_3'])
    # Block 5
    net['conv5_1'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv5_1')(net['pool4'])
    net['conv5_2'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv5_2')(net['conv5_1'])
    net['conv5_3'] = Convolution2D(512, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv5_3')(net['conv5_2'])
    net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode='same',
                                name='pool5')(net['conv5_3'])
    # FC6
    net['fc6'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
                                     activation='relu', border_mode='same',
                                     name='fc6')(net['pool5'])
    # x = Dropout(0.5, name='drop6')(x)
    # FC7
    net['fc7'] = Convolution2D(1024, 1, 1, activation='relu',
                               border_mode='same', name='fc7')(net['fc6'])
    # x = Dropout(0.5, name='drop7')(x)
    # Block 6
    # deleted

    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
    num_priors = 3
    x = Convolution2D(num_priors * 4, 3, 3, border_mode='same',
                      name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_loc'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    name = 'conv4_3_norm_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same',
                      name=name)(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_conf'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv4_3_norm_mbox_priorbox')
    net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])
    # Prediction from fc7
    num_priors = 6
    net['fc7_mbox_loc'] = Convolution2D(num_priors * 4, 3, 3,
                                        border_mode='same',
                                        name='fc7_mbox_loc')(net['fc7'])
    flatten = Flatten(name='fc7_mbox_loc_flat')
    net['fc7_mbox_loc_flat'] = flatten(net['fc7_mbox_loc'])
    name = 'fc7_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    net['fc7_mbox_conf'] = Convolution2D(num_priors * num_classes, 3, 3,
                                         border_mode='same',
                                         name=name)(net['fc7'])
    flatten = Flatten(name='fc7_mbox_conf_flat')
    net['fc7_mbox_conf_flat'] = flatten(net['fc7_mbox_conf'])
    priorbox = PriorBox(img_size, 60.0, max_size=114.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='fc7_mbox_priorbox')
    net['fc7_mbox_priorbox'] = priorbox(net['fc7'])
    # Gather all predictions
    net['mbox_loc'] = merge([net['conv4_3_norm_mbox_loc_flat'],
                             net['fc7_mbox_loc_flat']],
                            mode='concat', concat_axis=1, name='mbox_loc')
    net['mbox_conf'] = merge([net['conv4_3_norm_mbox_conf_flat'],
                              net['fc7_mbox_conf_flat']],
                             mode='concat', concat_axis=1, name='mbox_conf')
    if hasattr(net['mbox_loc'], '_keras_shape'):
        num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    elif hasattr(net['mbox_loc'], 'int_shape'):
        num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4),
                              name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes),
                               name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax',
                                  name='mbox_conf_final')(net['mbox_conf'])
    net['mbox_priorbox'] = merge([net['conv4_3_norm_mbox_priorbox'],
                                  net['fc7_mbox_priorbox']],
                                 mode='concat', concat_axis=1,
                                 name='mbox_priorbox')

    net['predictions'] = merge([net['mbox_loc'],
                               net['mbox_conf'],
                               net['mbox_priorbox']],
                               mode='concat', concat_axis=2,
                               name='predictions')
    model = Model(net['input'], net['predictions'])
    return model
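
# Quick smoke test of the graph above (Keras 1 API, like the rest of this
# snippet). Each PriorBox layer appends 8 values per box (4 coordinates plus
# 4 variances), so the last axis of 'predictions' is 4 + num_classes + 8:
if __name__ == '__main__':
    model = mini_SSD300(input_shape=(300, 300, 3), num_classes=21)
    model.summary()
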
def mini_SSD(num_classes=21):

    base_kernel_size = 4 + num_classes
    aspect_ratios = (1, 2, 1 / 2)
    num_aspect_ratios = len(aspect_ratios)

    base_model = VGG16(weights='imagenet')
    base_model.layers[0].name = 'input_1'
    input_tensor = base_model.input
    for layer in base_model.layers:
        layer.trainable = False
    body = base_model.get_layer('block4_pool').output
    body = Convolution2D((base_kernel_size * num_aspect_ratios),
                         3,
                         3,
                         border_mode='same')(body)
    branch_1 = PriorBox(aspect_ratios)(body)

    body = Convolution2D(32, 3, 3, border_mode='same')(branch_1)
    body = Activation('relu')(body)
    body = MaxPooling2D((2, 2))(body)
    body = Dropout(.5)(body)
    body = Convolution2D((base_kernel_size * num_aspect_ratios),
                         3,
                         3,
                         border_mode='same')(body)
    branch_2 = PriorBox(aspect_ratios)(body)

    body = Convolution2D(64, 3, 3, border_mode='same')(branch_2)
    body = Activation('relu')(body)
    body = MaxPooling2D((3, 3))(body)
    body = Dropout(.5)(body)
    body = Convolution2D((base_kernel_size * num_aspect_ratios),
                         3,
                         3,
                         border_mode='same')(body)
    branch_3 = PriorBox(aspect_ratios)(body)

    branch_1 = Reshape((-1, 4 + num_classes))(branch_1)
    local_1 = Lambda(lambda x: x[:, :, :4])(branch_1)
    class_1 = Lambda(lambda x: K.softmax(x[:, :, 4:]))(branch_1)

    branch_2 = Reshape((-1, 4 + num_classes))(branch_2)
    local_2 = Lambda(lambda x: x[:, :, :4])(branch_2)
    class_2 = Lambda(lambda x: K.softmax(x[:, :, 4:]))(branch_2)

    branch_3 = Reshape((-1, 4 + num_classes))(branch_3)
    local_3 = Lambda(lambda x: x[:, :, :4])(branch_3)
    class_3 = Lambda(lambda x: K.softmax(x[:, :, 4:]))(branch_3)

    classification_tensor = merge([class_1, class_2, class_3],
                                  mode='concat',
                                  concat_axis=1,
                                  name='classes')

    localization_tensor = merge([local_1, local_2, local_3],
                                mode='concat',
                                concat_axis=1,
                                name='encoded_box')
    output_tensor = merge([localization_tensor, classification_tensor],
                          mode='concat',
                          concat_axis=-1,
                          name='predictions')
    model = Model(input_tensor, output_tensor)
    return model
def mini_SSD300(input_shape=(300, 300, 3), num_classes=21):
    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Convolution2D(64, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv1_1')(net['input'])
    net['conv1_2'] = Convolution2D(64, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Convolution2D(128, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv2_1')(net['pool1'])
    net['conv2_2'] = Convolution2D(128, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_1')(net['pool2'])
    net['conv3_2'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Convolution2D(256, 3, 3,
                                   activation='relu',
                                   border_mode='same',
                                   name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
                                name='pool3')(net['conv3_3'])

    # blocks 4-5 are dropped in this mini variant, so the 'conv4_3_norm' branch
    # actually hangs off conv3_3
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv3_3'])
    num_priors = 6
    x = Convolution2D(num_priors * 4, 3, 3, border_mode='same',
                      name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_loc'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    name = 'conv4_3_norm_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same',
                      name=name)(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_conf'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    priorbox = PriorBox(img_size, 30.0, max_size=60, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv4_3_norm_mbox_priorbox')
    net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])

    # Gather predictions (this mini variant predicts from conv4_3_norm only)
    if hasattr(net['conv4_3_norm_mbox_loc_flat'], '_keras_shape'):
        num_boxes = net['conv4_3_norm_mbox_loc_flat']._keras_shape[-1] // 4
    elif hasattr(net['conv4_3_norm_mbox_loc_flat'], 'int_shape'):
        num_boxes = K.int_shape(net['conv4_3_norm_mbox_loc_flat'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4),
                              name='mbox_loc_final')(net['conv4_3_norm_mbox_loc_flat'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes),
                               name='mbox_conf_logits')(net['conv4_3_norm_mbox_conf_flat'])
    net['mbox_conf'] = Activation('softmax',
                                  name='mbox_conf_final')(net['mbox_conf'])

    net['predictions'] = merge([net['mbox_loc'],
                               net['mbox_conf'],
                               net['conv4_3_norm_mbox_priorbox']],
                               mode='concat', concat_axis=2,
                               name='predictions')
    model = Model(net['input'], net['predictions'])
    return model