def forward(self, x, rois, roi_indices):
        """Forward the chain.

        We assume that there are :math:`N` images in the batch.

        Args:
            x (Variable): 4D image variable.
            rois (Tensor): A bounding box array containing coordinates of
                proposal boxes.  This is a concatenation of bounding box
                arrays from multiple images in the batch.
                Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
                RoIs from the :math:`i` th image,
                :math:`R' = \\sum _{i=1} ^ N R_i`.
            roi_indices (Tensor): An array containing indices of the images
                to which the bounding boxes correspond. Its shape is
                :math:`(R',)`.

        """
        # in case roi_indices is an ndarray
        roi_indices = at.totensor(roi_indices).float()
        rois = at.totensor(rois).float()
        indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
        # NOTE: important: convert (y, x) box ordering to (x, y),
        # which is what the RoI pooling op expects
        xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
        indices_and_rois = xy_indices_and_rois.contiguous()

        pool = self.roi(x, indices_and_rois)
        pool = pool.view(pool.size(0), -1)
        fc7 = self.classifier(pool)
        roi_cls_locs = self.cls_loc(fc7)
        roi_scores = self.score(fc7)
        return roi_cls_locs, roi_scores
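
For reference, a minimal usage sketch of the shapes this head expects and of the (y, x) -> (x, y) permutation; the dummy tensors below are assumptions for illustration, not part of the original example:

import torch as t

# two images in the batch, three proposals in total
rois = t.tensor([[10., 20., 50., 60.],   # (y_min, x_min, y_max, x_max)
                 [ 5., 15., 40., 55.],
                 [ 0., 10., 30., 45.]])
roi_indices = t.tensor([0., 0., 1.])     # the first two boxes belong to image 0

indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)  # (R', 5)
# reorder columns [idx, y_min, x_min, y_max, x_max] -> [idx, x_min, y_min, x_max, y_max]
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
print(xy_indices_and_rois.shape)  # torch.Size([3, 5])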
Example No. 2
def train(**kwargs):
    opt._parse(kwargs)

    if not VOC:
        dataset = CsvDataset('/home/artemlyan/data/avito_intro/images/',
                             'labeled_with_classes.csv')
        print('load data')
        dataloader = data_.DataLoader(dataset,
                                      batch_size=1,
                                      shuffle=True,
                                      # pin_memory=True,
                                      num_workers=opt.num_workers)

        test_dataloader = data_.DataLoader(dataset,
                                           batch_size=1,
                                           num_workers=opt.test_num_workers,
                                           shuffle=False,
                                           pin_memory=True)
    else:
        dataset = Dataset(opt)
        print('load data for VOC')
        dataloader = data_.DataLoader(dataset,
                                      batch_size=1,
                                      shuffle=True,
                                      # pin_memory=True,
                                      num_workers=opt.num_workers)
        testset = TestDataset(opt)
        test_dataloader = data_.DataLoader(testset,
                                           batch_size=1,
                                           num_workers=opt.test_num_workers,
                                           shuffle=False,
                                           pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        if not VOC:
            dataset.set_mode('train')
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            print(img.size(), bbox_.size(), label_.size(), scale.size())
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                print('pred', _bboxes, 'gt', bbox_[0])
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())

        if not VOC:
            dataset.set_mode('val')

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        print("eval reuslt:", eval_result)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        trainer.vis.plot('test_map', eval_result['map'])
        log_info = 'lr:{}, map:{},loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if epoch == 30:
            break
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=2,
                                       shuffle=False,
                                       # pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    for epoch in range(7):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale, ori_img) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            losses = trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                # (approximately undo the torchvision normalization for display)
                ori_img_ = (img * 0.225 + 0.45).clamp(min=0, max=1) * 255
                gt_img = visdom_bbox(at.tonumpy(ori_img_)[0],
                                     at.tonumpy(bbox_)[0],
                                     label_[0].numpy())
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(ori_img, visualize=True)
                pred_img = visdom_bbox(at.tonumpy(ori_img[0]),
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        if epoch == 4:
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

    eval_result = eval(test_dataloader, faster_rcnn, test_num=1e100)
    print('eval_result:', eval_result)
    trainer.save(mAP=eval_result['map'])
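
These train(**kwargs) functions are normally exposed on the command line; a minimal sketch of the entry point, assuming the Fire-based launcher used by the reference simple-faster-rcnn-pytorch code (the flag names are those of its Config object):

if __name__ == '__main__':
    import fire
    fire.Fire()

# invoked, for example, as:
#   python train.py train --env='fasterrcnn' --plot-every=100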
Example No. 4
def train(**kwargs):
    # Parse the extra command-line kwargs with opt._parse() from config.py;
    # this resolves the stored data paths, which are then handed to Dataset.
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  num_workers=opt.num_workers)

    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(
        testset,
        batch_size=1,
        num_workers=opt.test_num_workers,
        shuffle=False,
        #pin_memory=True
    )  # pin_memory uses page-locked (pinned) host memory, which speeds up host-to-GPU transfers

    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    # If opt.load_path is set, load the pretrained model from it, then
    # visualize the training labels.
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.dataset.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    # Training loop; the number of epochs (opt.epoch = 14) is predefined in
    # config.py as a hyperparameter.
    for epoch in range(opt.epoch):
        print('epoch {}/{}'.format(epoch, opt.epoch))
        trainer.reset_meters()  # first reset all the meters shown in the visualizer
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = array_tool.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot the losses
                trainer.vis.plot_many(trainer.get_meter_data())
                # plot ground-truth bboxes
                ori_img_ = inverse_normalize(array_tool.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, array_tool.tonumpy(bbox_[0]),
                                     array_tool.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                # call faster_rcnn's predict(); the predictions are kept in
                # the underscore-prefixed variables
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(
                    ori_img_, array_tool.tonumpy(_bboxes[0]),
                    array_tool.tonumpy(_labels[0]).reshape(-1),
                    array_tool.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)
                # show rpn_cm, the RPN network's confusion matrix, via trainer.vis.text
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # show roi_cm, the RoI head's confusion matrix, in the visualizer
                trainer.vis.img(
                    'roi_cm',
                    array_tool.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)  # keep the learning rate, mAP, etc. up to date in the log

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:  # once training reaches epoch 9, multiply the learning rate by lr_decay (0.1)
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        if epoch == 13:
            break
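
The eval(...) helper called after each epoch is not shown in these snippets; a minimal sketch of what it conventionally does, assuming eval_detection_voc from the reference implementation's utils.eval_tool:

from utils.eval_tool import eval_detection_voc  # assumed helper

def eval(dataloader, faster_rcnn, test_num=10000):
    pred_bboxes, pred_labels, pred_scores = [], [], []
    gt_bboxes, gt_labels, gt_difficults = [], [], []
    for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in enumerate(dataloader):
        sizes = [sizes[0][0].item(), sizes[1][0].item()]
        pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
        gt_bboxes += list(gt_bboxes_.numpy())
        gt_labels += list(gt_labels_.numpy())
        gt_difficults += list(gt_difficults_.numpy())
        pred_bboxes += pred_bboxes_
        pred_labels += pred_labels_
        pred_scores += pred_scores_
        if ii == test_num:
            break
    # returns a dict with per-class 'ap' and the mean 'map'
    return eval_detection_voc(pred_bboxes, pred_labels, pred_scores,
                              gt_bboxes, gt_labels, gt_difficults,
                              use_07_metric=True)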
Example No. 5
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=False,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale, human_box, object_box, action) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            human_box, object_box, action = human_box.cuda(), object_box.cuda(), action.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     # at.tonumpy(action[0]),
                                     at.tonumpy(label_[0])
                                     )
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                print(_labels[0])
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        if epoch == 13: 
            break
Example No. 6
def train(**kwargs):
    opt._parse(kwargs)  # read the config settings

    dataset = Dataset(opt)  # build the training dataset with the configured parameters
    print('load data')
    # Build the training DataLoader from the dataset; the code only supports
    # batch_size=1.
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)  # build the test dataset with the configured parameters
    # Build the test DataLoader; again, only batch_size=1 is supported.
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
                                       
    faster_rcnn = FasterRCNNVGG16()  # build the Faster R-CNN network with a VGG16 backbone
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()  # wrap the network in the trainer
    if opt.load_path:  # if pretrained Faster R-CNN weights are given, load them
        trainer.load(opt.load_path)  # the trainer loads the weights
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0  # best mAP so far; decides when to save the model
    lr_ = opt.lr  # the configured initial learning rate
    for epoch in range(opt.epoch):  # train for opt.epoch epochs
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)  # unwrap scale into a plain Python scalar
            # bbox holds the gt box coordinates (ymin, xmin, ymax, xmax)
            # label holds class indices into VOC_BBOX_LABEL_NAMES
            # img is the image; only batch_size=1 training is supported
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()  # train on the GPU
            trainer.train_step(img, bbox, label, scale)  # preprocessing done; run the model

            if (ii + 1) % opt.plot_every == 0:  # visualization block
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)  # evaluate once per epoch
        trainer.vis.plot('test_map', eval_result['map'])  # plot the test mAP
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']  # current learning rate
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),  # log lr, mAP, and losses
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:  # if this epoch's mAP beats the best so far, save the model
            best_map = eval_result['map']  # record the best mAP
            best_path = trainer.save(best_map=best_map)  # save a checkpoint
        if epoch == 9:  # at epoch 9, reload the best model and decay the learning rate
            trainer.load(best_path)  # load the best checkpoint
            trainer.faster_rcnn.scale_lr(opt.lr_decay)  # scale down the learning rate
            lr_ = lr_ * opt.lr_decay  # track the current learning rate

        if epoch == 13:  # stop after 13 epochs
            break
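
inverse_normalize, used before every plot above, undoes the dataset normalization; a sketch consistent with the inline (img * 0.225 + 0.45) * 255 seen in an earlier example, assuming the reference implementation's caffe/torchvision switch and its utils.config.opt object:

import numpy as np
from utils.config import opt  # assumed config object

def inverse_normalize(img):
    # img is a CHW float array produced by the dataset's normalization
    if opt.caffe_pretrain:
        # caffe-style preprocessing subtracted the per-channel BGR mean
        img = img + np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
        return img[::-1, :, :]  # BGR -> RGB
    # torchvision-style: approximately undo mean/std and rescale to 0..255
    return (img * 0.225 + 0.45).clip(min=0, max=1) * 255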
Example No. 7
def train_val():
    print('load data')
    train_loader, val_loader = get_train_val_loader(
        opt.root_dir,
        batch_size=opt.batch_size,
        val_ratio=0.1,
        shuffle=opt.shuffle,
        num_workers=opt.num_workers,
        pin_memory=opt.pin_memory)
    faster_rcnn = FasterRCNNVGG16()
    # faster_rcnn = FasterRCNNResNet50()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()

    # if opt.load_path:
    #     trainer.load(opt.load_path)
    #     print('load pretrained model from %s' % opt.load_path)

    # trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        tqdm.monitor_interval = 0
        for ii, sample in tqdm(enumerate(train_loader)):
            if len(sample.keys()) == 5:
                img_id, img, bbox, scale, label = sample['img_id'], sample['image'], sample['bbox'], sample['scale'], \
                                                    sample['label']
                img, bbox, label = img.cuda().float(), bbox.cuda(), label.cuda(
                )
                img, bbox, label = Variable(img), Variable(bbox), Variable(
                    label)

            else:
                img_id, img, bbox, scale, label = sample['img_id'], sample['image'], np.zeros((1, 0, 4)), \
                                                  sample['scale'], np.zeros((1, 0, 1))
                img = img.cuda().float()
                img = Variable(img)

            # skip samples without ground-truth boxes; bbox is an ndarray only
            # in the no-annotation branch above, where .size == 0
            if bbox.size == 0:
                continue

            scale = at.scalar(scale)
            trainer.train_step(img_id, img, bbox, label, scale)
            if (ii + 1) % opt.plot_every == 0:
                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, img_id[0], at.tonumpy(bbox[0]),
                                     at.tonumpy(label[0]))

                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, img_id[0],
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))

                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())

        mAP = eval_mAP(trainer, val_loader)
        trainer.vis.plot('val_mAP', mAP)
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(
            str(lr_), str(mAP), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if mAP > best_map:
            best_map = mAP
            best_path = trainer.save(best_map=best_map)
        if epoch == opt.epoch - 1:
            best_path = trainer.save()

        if (epoch + 1) % 10 == 0:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
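
The at / array_tool helpers used throughout these examples convert between ndarrays and tensors; a minimal sketch, assuming the reference implementation's utils/array_tool.py:

import numpy as np
import torch as t

def tonumpy(data):
    if isinstance(data, np.ndarray):
        return data
    if isinstance(data, t.Tensor):
        return data.detach().cpu().numpy()

def totensor(data, cuda=True):
    if isinstance(data, np.ndarray):
        tensor = t.from_numpy(data)
    if isinstance(data, t.Tensor):
        tensor = data.detach()
    if cuda:
        tensor = tensor.cuda()
    return tensor

def scalar(data):
    # unwrap a one-element array/tensor into a plain Python number
    if isinstance(data, np.ndarray):
        return data.reshape(1)[0]
    if isinstance(data, t.Tensor):
        return data.item()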
    def forward(self, x, rois, roi_indices, context_rois, context_roi_indices):
        # def forward(self, x, rois, roi_indices):
        """Forward the chain.

        We assume that there are :math:`N` images in the batch.

        Args:
            x (Variable): 4D image variable.
            rois (Tensor): A bounding box array containing coordinates of
                proposal boxes.  This is a concatenation of bounding box
                arrays from multiple images in the batch.
                Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
                RoIs from the :math:`i` th image,
                :math:`R' = \\sum _{i=1} ^ N R_i`.
            roi_indices (Tensor): An array containing indices of the images
                to which the bounding boxes correspond. Its shape is
                :math:`(R',)`.

        """
        # in case roi_indices is an ndarray
        roi_indices = at.totensor(roi_indices).float()
        rois = at.totensor(rois).float()
        indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
        # NOTE: important: convert (y, x) box ordering to (x, y)
        xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
        indices_and_rois = xy_indices_and_rois.contiguous()

        pool = self.roi(x, indices_and_rois)
        pool = pool.view(pool.size(0), -1)
        fc7 = self.classifier(pool)
        roi_cls_locs = self.cls_loc(fc7)
        roi_scores = self.score(fc7)
        #         print(f"score: {roi_scores}")

        # context_roi_scores = 0
        # in case context_roi_indices is an ndarray
        context_roi_indices = at.totensor(context_roi_indices).float()
        context_rois = at.totensor(context_rois).float()
        context_indices_and_rois = t.cat(
            [context_roi_indices[:, None], context_rois], dim=1)
        # NOTE: important: convert (y, x) box ordering to (x, y)
        context_xy_indices_and_rois = context_indices_and_rois[:,
                                                               [0, 2, 1, 4, 3]]
        context_indices_and_rois = context_xy_indices_and_rois.contiguous()

        #         print(f"DEBUG: {context_indices_and_rois.shape}")
        context_pool = self.context_roi(x, context_indices_and_rois)
        context_pool = context_pool.view(context_pool.size(0), -1)
        context_fc7 = self.context_classifier(context_pool)
        # context_roi_cls_locs = self.context_cls_loc(context_fc7)
        #         context_roi_scores = self.context_score(context_fc7)

        # gating: fuse the RoI features with the context features
        ex_feat = self.gating_module(fc7, context_fc7)
        ex_feat = ex_feat.view(ex_feat.size(0), -1)
        ex_scores = self.cls_score(ex_feat)
        #         print(f"cls_score: {ex_scores}")
        roi_scores = ex_scores  # replace the plain RoI scores with the gated scores

        # context relevance score
        #         context_relevance_roi_indices = at.totensor(context_roi_indices).float()
        #         context_relevance_rois = at.totensor(context_rois).float()
        #         context_relevance_indices_and_rois = t.cat(
        #             [context_relevance_roi_indices[:, None], context_relevance_rois], dim=1)
        #         # NOTE: important: yx->xy
        #         context_relevance_xy_indices_and_rois = context_relevance_indices_and_rois[:, [
        #             0, 2, 1, 4, 3]]
        #         context_relevance_indices_and_rois = context_relevance_xy_indices_and_rois.contiguous()

        #         context_relevance_pool = self.context_relevance_roi(x, context_relevance_indices_and_rois)
        #         context_relevance_pool = context_relevance_pool.view(context_relevance_pool.size(0), -1)
        context_relevance_pool = self.avgpool(x)
        context_relevance_pool = t.flatten(context_relevance_pool, 1)
        context_relevance_fc7 = self.context_relevance_classifier(
            context_relevance_pool)
        context_relevance_roi_scores = self.context_relevance_score(
            context_relevance_fc7)

        return roi_cls_locs, roi_scores, context_relevance_roi_scores
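
The gating_module that fuses fc7 with context_fc7 above is not defined in this snippet; one plausible, purely hypothetical implementation is a learned sigmoid gate over the concatenated features:

import torch as t
from torch import nn

class GatingModule(nn.Module):
    """Hypothetical sketch: blend RoI and context features with a learned gate."""

    def __init__(self, feat_dim=4096):  # 4096 matches a VGG16 fc7
        super().__init__()
        self.gate_fc = nn.Linear(2 * feat_dim, feat_dim)

    def forward(self, roi_feat, context_feat):
        # g in (0, 1) decides, per feature, how much context to mix in
        g = t.sigmoid(self.gate_fc(t.cat([roi_feat, context_feat], dim=1)))
        return g * roi_feat + (1 - g) * context_feat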
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    best_ap = np.array([0.] * opt.label_number)
    lr_ = opt.lr
    vis = trainer.vis
    starttime = datetime.datetime.now()
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                roi_cm = at.totensor(trainer.roi_cm.conf, False).float()
                trainer.vis.img('roi_cm', roi_cm)

        eval_result = eval(test_dataloader,
                           faster_rcnn,
                           vis=vis,
                           test_num=opt.test_num)
        best_ap = dict(zip(opt.VOC_BBOX_LABEL_NAMES, eval_result['ap']))
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:
            print('roi_cm=\n', trainer.roi_cm.value())
            plot_confusion_matrix(trainer.roi_cm.value(),
                                  classes=('animal', 'plant', 'rock',
                                           'background'),
                                  normalize=False,
                                  title='Confusion Matrix')
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map, best_ap=best_ap)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        # if epoch == 13:
        #     break
    endtime = datetime.datetime.now()
    train_consum = (endtime - starttime).seconds
    print("train_consum=", train_consum)
Example No. 10
    def forward(self, imgs, bboxes, labels, scale):
        '''
        :param imgs:  (~torch.autograd.Variable)  a batch of images
        :param bboxes: (~torch.autograd.Variable)  (N, R, 4)
        :param labels:  (~torch.autograd.Variable)  (N, R), values in [0, L-1] where L is the number of classes
        :param scale:   (float)  scale factor applied to the raw image during preprocessing
        :return:  namedtuple of 5 losses
        '''

        n = bboxes.shape[0]  # batch size
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        c2_out = self.faster_rcnn.C2(imgs)
        c3_out = self.faster_rcnn.C3(c2_out)
        c4_out = self.faster_rcnn.C4(c3_out)

        p2, p3, p4, p5 = self.faster_rcnn.fpn(c2_out, c3_out, c4_out)
        feature_maps = [p2, p3, p4, p5]
        rcnn_maps = [p2, p3, p4]

        # rpn_locs has shape (hh*ww*9, 4) and rpn_scores (hh*ww*9, 2); rois has
        # shape (2000, 4) and roi_indices is unused here; anchor has shape
        # (hh*ww*9, 4). H and W are the preprocessed image size. The RPN scores
        # roughly (H/16)x(W/16)x9 (~20000) anchors as foreground, keeps the top
        # 12000, and NMS reduces them to 2000 approximate proposal boxes G^.
        # roi therefore has shape (2000, 4).
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
            feature_maps, img_size, scale)

        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]  # (hh*ww*9, 2)
        rpn_loc = rpn_locs[0]      # (hh*ww*9, 4)
        roi = rois                 # (2000, 4)

        # proposal_target_creator samples sample_roi (128, 4), gt_roi_loc
        # (128, 4) and gt_roi_label (128, 1). The RoIHead network takes
        # sample_roi plus the features as input and predicts classification
        # (21 classes) and regression (further bbox refinement); the ground
        # truth for those heads is the gt_roi_label and gt_roi_loc produced
        # by ProposalTargetCreator.
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            array_tool.tonumpy(bbox),
            array_tool.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)

        sample_roi_index = torch.zeros(len(sample_roi))

        roi_cls_loc, roi_score = self.faster_rcnn.head(
            rcnn_maps,
            sample_roi,
            sample_roi_index)


        #------------------RPN loss------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            array_tool.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = array_tool.totensor(gt_rpn_label).long()
        gt_rpn_loc = array_tool.totensor(gt_rpn_loc)
        # RPN smooth L1 regression loss
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)
        # RPN classification cross-entropy loss
        rpn_cls_loss = functional.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _gt_rpn_score = rpn_score[gt_rpn_label > -1]
        _rpn_score = array_tool.tonumpy(rpn_score)[array_tool.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(array_tool.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        #------------------------ROI loss------------------------#
        n_sample = roi_cls_loc.shape[0]  # n_sample is 128; roi_cls_loc is the VGG16RoIHead output (128, 84)
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)  # roi_cls_loc -> (128, 21, 4)
        # pick each sample's own-class loc out of the 21: (128, 21, 4) -> (128, 4)
        roi_loc = roi_cls_loc[torch.arange(0, n_sample).long().cuda(),
                              array_tool.totensor(gt_roi_label).long()]
        gt_roi_label = array_tool.totensor(gt_roi_label).long()
        gt_roi_loc = array_tool.totensor(gt_roi_loc)

        # RoI smooth L1 regression loss
        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())  # RoI cross-entropy loss
        self.roi_cm.add(array_tool.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]  # append the total loss, extending the list to 5 entries

        return LossTuple(*losses)
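
LossTuple is the namedtuple these trainers return; a sketch matching the five losses assembled above (the field names follow the reference implementation):

from collections import namedtuple

LossTuple = namedtuple('LossTuple',
                       ['rpn_loc_loss',
                        'rpn_cls_loss',
                        'roi_loc_loss',
                        'roi_cls_loss',
                        'total_loss'])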
Example No. 11
    def forward(self, imgs, bboxes, labels, scale):
        # batch size
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        # (n,c,hh,ww)
        img_size = (H, W)

        # extract image features with the VGG16 layers up to conv5_3
        features = self.faster_rcnn.extractor(imgs)

        # rpn_locs has shape (hh*ww*9, 4) and rpn_scores (hh*ww*9, 2);
        # rois has shape (2000, 4) and roi_indices is unused; anchor has
        # shape (hh*ww*9, 4). H and W are the preprocessed image size.
        # The RPN scores roughly (H/16)x(W/16)x9 (~20000) anchors as
        # foreground, keeps the top 12000, and NMS reduces them to 2000
        # approximate proposal boxes G^; roi has shape (2000, 4).

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        # bbox has shape (R, 4)
        bbox = bboxes[0]
        # label has shape (R,)
        label = labels[0]
        # (hh*ww*9, 2)
        rpn_score = rpn_scores[0]
        # (hh*ww*9, 4)
        rpn_loc = rpn_locs[0]
        # (2000, 4)
        roi = rois

        # Sample RoIs and forward
        # proposal_target_creator samples sample_roi (128, 4), gt_roi_loc
        # (128, 4) and gt_roi_label (128, 1). The RoIHead network takes
        # sample_roi plus the features as input and predicts classification
        # (21 classes) and regression (further bbox refinement); the ground
        # truth for those heads is the gt_roi_label and gt_roi_loc produced
        # by ProposalTargetCreator.

        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: all zeros because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))
        # the RoI head outputs (128, 84) locs and (128, 21) scores, while the
        # targets are (128, 4) locs and (128, 1) labels
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        # feed the ~20000 anchors and the gt bbox to anchor_target_creator to
        # get each anchor's offset targets and label
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        # the _fast_rcnn_loc_loss function is analyzed below: rpn_loc is the
        # offsets regressed by the RPN (~20000 of them); gt_rpn_loc is the
        # anchor-to-bbox offsets from anchor_target_creator; rpn_sigma = 1.
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        # cross-entropy between the RPN scores (~20000) and the labels
        # produced by anchor_target_creator
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]  # drop anchors labeled -1 (ignored)
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        # roi_cls_loc is the VGG16RoIHead output (128, 84); n_sample = 128
        n_sample = roi_cls_loc.shape[0]
        # roi_cls_loc=(128,21,4)
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        # offsets (dx, dy, dw, dh) between the 128 proposals generated by
        # proposal_target_creator() and the gt boxes
        gt_roi_label = at.totensor(gt_roi_label).long()
        # the 128 labels
        gt_roi_loc = at.totensor(gt_roi_loc)
        # smooth L1 loss
        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(), gt_roi_loc,
                                           gt_roi_label.data, self.roi_sigma)
        # cross-entropy loss
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())
        # sum the four losses
        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
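
_fast_rcnn_loc_loss is the smooth L1 localization loss shared by the RPN and RoI branches; a sketch assuming the reference implementation (only positive boxes contribute, normalized by the number of non-ignored samples):

import torch as t

def _smooth_l1_loss(x, gt, in_weight, sigma):
    sigma2 = sigma ** 2
    diff = in_weight * (x - gt)
    abs_diff = diff.abs()
    flag = (abs_diff.data < (1. / sigma2)).float()
    y = (flag * (sigma2 / 2.) * (diff ** 2) +
         (1 - flag) * (abs_diff - 0.5 / sigma2))
    return y.sum()

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    in_weight = t.zeros(gt_loc.shape).cuda()
    # only positive (foreground) samples contribute to the localization loss
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
    loc_loss /= (gt_label >= 0).sum().float()  # ignore entries labeled -1
    return loc_loss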
Example No. 12
        for i, (img, bbox_, label_, scale) in enumerate(dataloader):
            t1 = time.time()

            optimizer.zero_grad()

            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            rpn_model.cuda()
            _, _, rois, _, _ = rpn_model.forward(img, scale)
            rpn_model.cpu()
            bbox = bbox[0]
            label = label[0]

            sample_roi, gt_roi_loc, gt_roi_label = model.PTC(
                rois, at.tonumpy(bbox), at.tonumpy(label))

            gt_roi_label = at.totensor(gt_roi_label).long()
            gt_roi_loc = at.totensor(gt_roi_loc)

            for roi, roi_label, roi_loc in zip(sample_roi, gt_roi_label,
                                               gt_roi_loc):
                roi_cls_reg_locs, roi_clf_score = model.forward(img, roi)
                cls_reg_loss, cls_loss, reg_loss = model.loss(
                    roi_clf_score, roi_cls_reg_locs, roi_loc, roi_label)
                cls_reg_loss.backward()
                for group in optimizer.param_groups:
                    for p in group['params']:
                        state = optimizer.state[p]
                        # cap the optimizer's per-parameter step counter; an
                        # ad-hoc workaround kept from the source snippet
                        if ('step' in state and state['step'] >= 1024):
                            state['step'] = 1000
                optimizer.step()
                c_loss.append(cls_loss)
Example No. 13
def train():
    #     opt._parse(kwargs)

    best_map = float('-inf')

    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        state_dict = torch.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
        best_map = state_dict['other_info']['best_map']

    trainer.vis.text(dataset.db.label_names, win='labels')
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # RPN confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        print(log_info)
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch % 5 == 4:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
Example No. 14
    def predict(self, imgs, sizes=None, visualize=False):
        '''
        Run prediction on each image.
        Args:
            the input images must be CHW-ordered RGB np.ndarrays
        Return:
            a tuple (bboxes, labels, scores): box coordinates, labels,
            and confidence scores
        '''
        self.eval()
        if visualize:  # visualization preset
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]  # (H, W) of the original image
                # TODO: why does visualization need to re-run preprocessing?
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = []
        scores = []
        for img, size in zip(prepared_imgs, sizes):
            img = at.totensor(img[None]).float()
            scale = img.shape[3] / size[1]
            # calling self(...) invokes forward() via nn.Module.__call__
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # .data detaches the result from the autograd graph
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            mean = t.Tensor(self.loc_normalize_mean).cuda().repeat(
                self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda().repeat(
                self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            # reshape to (R, n_class, 4)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)

            cls_bbox = loc2bbox(
                at.tonumpy(roi).reshape((-1, 4)),
                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor((cls_bbox))
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clamp keeps the boxes within the image bounds
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)
        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
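
A minimal sketch of calling predict (the dummy image and the faster_rcnn instance are assumptions for illustration):

import numpy as np

# one dummy CHW RGB image with values in 0..255
img = np.random.randint(0, 256, size=(3, 600, 800)).astype(np.float32)
bboxes, labels, scores = faster_rcnn.predict([img], visualize=True)
print(bboxes[0].shape, labels[0].shape, scores[0].shape)  # (K, 4) (K,) (K,)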
Example No. 15
def train(opt, faster_rcnn, dataloader,  val_dataloader,
          test_dataloader, trainer, lr_, best_map, start_epoch):
    trainer.train()
    for epoch in range(start_epoch, start_epoch+opt.epoch):
        trainer.reset_meters()
        pbar = tqdm(enumerate(dataloader), total=len(dataloader))
        for ii, (img, bbox_, label_, scale) in pbar:
            # Currently configured to predict (y_min, x_min, y_max, x_max)
#             bbox_tmp = bbox_.clone()
#             bbox_ = transform_bbox(bbox_)
            scale = at.scalar(scale)

            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            losses = trainer.train_step(img, bbox, label, scale)
            if ii % 100 == 0:
                rpnloc = losses[0].cpu().data.numpy()
                rpncls = losses[1].cpu().data.numpy()
                roiloc = losses[2].cpu().data.numpy()
                roicls = losses[3].cpu().data.numpy()
                tot = losses[4].cpu().data.numpy()
                pbar.set_description(f"Epoch: {epoch} | Batch: {ii} | RPNloc Loss: {rpnloc:.4f} | RPNcls Loss: {rpncls:.4f} | ROIloc Loss: {roiloc:.4f} | ROIcls Loss: {roicls:.4f} | Total Loss: {tot:.4f}")
            
            if (ii+1) % 1000 == 0:
                eval_result = eval(val_dataloader, faster_rcnn, test_num=1000)
                trainer.vis.plot('val_map', eval_result['map'])
                lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
                val_log_info = 'lr:{}, map:{},loss:{}'.format(
                    str(lr_), str(eval_result['map']),
                    str(trainer.get_meter_data()))
                trainer.vis.log(val_log_info)
                print("Evaluation Results on Val Set ")
                print(val_log_info)
                print("\n\n")


            if (ii + 1) % 100 == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                print(trainer.get_meter_data())
                try:
                    ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                    gt_img = visdom_bbox(ori_img_,
                                        at.tonumpy(bbox_[0]),
                                        at.tonumpy(label_[0]))
                    trainer.vis.img('gt_img', gt_img)
                    plt.show()

                    # plot predicted bboxes
                    _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                    pred_img = visdom_bbox(ori_img_,
                                        at.tonumpy(_bboxes[0]),
                                        at.tonumpy(_labels[0]).reshape(-1),
                                        at.tonumpy(_scores[0]))
                    plt.show()
                    trainer.vis.img('pred_img', pred_img)

                    # RPN confusion matrix (meter)
                    trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                     win='rpn_cm')
                    # roi confusion matrix
                    trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf,
                                                          False).float())
                except Exception:
                    print("Cannot display images")
            if (ii + 1) % 100 == 0:
                eval_result = eval(val_dataloader, faster_rcnn, test_num=25)
                trainer.vis.plot('val_map', eval_result['map'])
                log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_), str(
                    eval_result['map']), str(trainer.get_meter_data()))
                trainer.vis.log(log_info)


        # Save after every epoch
        epoch_path = trainer.save(epoch, best_map=0)
                
        eval_result = eval(test_dataloader, faster_rcnn, test_num=1000)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        test_log_info = 'lr:{}, map:{},loss:{}'.format(
            str(lr_), str(eval_result['map']),
            str(trainer.get_meter_data()))

        trainer.vis.log(test_log_info)
        print("Evaluation Results on Test Set ")
        print(test_log_info)
        print("\n\n")

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = epoch_path

        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        if epoch == 13: 
            break
Example No. 16
    def forward(self, imgs, bboxes, labels, scale):
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        # feature extraction
        features = self.faster_rcnn.extractor(imgs)

        # RPN network
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)

        # NOTE: all zeros because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))

        # Faster rcnn head
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                           gt_roi_loc.float(),
                                           gt_roi_label.data, self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
Example No. 17
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]  # number of images in the batch
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape  # should be (1,3,H,W)
        img_size = (H, W)

        # need more feature maps here when you are trying to use features of different scale
        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = self.faster_rcnn.rpn(
            features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        # different parameters here:
        #     num_boxes : number of ground truth bounding boxes in an image.
        #     num_anchors : number of anchors in the image (i.e., in the feature map).
        #     num_rois : number of RoIs generated by the RPN, to be used by Fast R-CNN.
        bbox = bboxes[0]  # shape (num_boxes, 4)
        label = labels[0]  # shape (num_boxes,)
        rpn_score = rpn_scores[0]  # shape (num_anchors,)
        rpn_loc = rpn_locs[0]  # shape (num_anchors, 4)
        roi = rois  # shape (num_rois, 4)
        search_region = search_regions  # shape (num_rois, 4)

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois,
        # consider them as constant input
        sample_roi, sample_search_region, (
            Tx, Ty), gt_roi_label = self.proposal_target_creator(
                roi, search_region, at.tonumpy(bbox), at.tonumpy(label))

        # NOTE: all zeros because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))
        (px, py), roi_score = self.faster_rcnn.head(features, sample_roi,
                                                    sample_search_region,
                                                    sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = px.shape[0]
        # (px, py) and (Tx, Ty) are used to calculate the loss roi_loc_loss

        Tx = at.tovariable(Tx).float()
        Ty = at.tovariable(Ty).float()

        print("px is ", px)
        # print("max of px is ", t.max(px))
        # print("min of px is ", t.min(px))
        # print(t.max(Tx))
        # print(t.max(Ty))
        # print(Tx.shape, Ty.shape, px.shape, py.shape)

        roi_loc_loss = _LocNet_loss(Tx, Ty, px, py, gt_roi_label.data,
                                    self.roi_sigma)

        gt_roi_label = at.tovariable(gt_roi_label).long()
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]

        print("losses", losses)

        losses = losses + [sum(losses)]

        return LossTuple(*losses)  # return a namedtuple
    def get_optimizer(self):
        # (method header restored; the code that builds the `params` list is
        # elided in the source snippet)
        if opt.use_adam:
            self.optimizer = torch.optim.Adam(params)
        else:
            self.optimizer = torch.optim.SGD(params, momentum=0.9)
        return self.optimizer

    def scale_lr(self, decay=0.1):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] *= decay
        return self.optimizer


if __name__ == '__main__':
    img = np.ones((3, 5, 5), dtype=np.float32)
    b = array_tool.totensor(img[None]).float()
    loc_normalize_mean = (0., 0., 0., 0.)
    roi_cls_loc = np.ones((1, 84), dtype=np.float32)
    mean = torch.Tensor(loc_normalize_mean).cuda()
    mean = mean.repeat(21)[None]
    loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
    std = torch.Tensor(loc_normalize_std).cuda()
    std = std.repeat(21)[None]
    roi_cls_loc = array_tool.totensor(roi_cls_loc)
    roi_cls_loc = roi_cls_loc.data
    roi_cls_loc = (roi_cls_loc * std + mean)
    print(roi_cls_loc.size())
    roi_cls_loc = roi_cls_loc.view(-1, 21, 4)
    print(roi_cls_loc.size())
    roi = np.zeros((1, 4), dtype=np.float32)
    roi = array_tool.totensor(roi)
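# The demo above stops right before decoding. A sketch of the standard
# loc2bbox decoding that roi_cls_loc would feed into, assuming the usual
# (dy, dx, dh, dw) offsets over (y_min, x_min, y_max, x_max) base boxes:
import numpy as np

def loc2bbox(src_bbox, loc):
    # src_bbox: (R, 4) base boxes; loc: (R, 4) offsets (dy, dx, dh, dw)
    src_h = src_bbox[:, 2] - src_bbox[:, 0]
    src_w = src_bbox[:, 3] - src_bbox[:, 1]
    src_ctr_y = src_bbox[:, 0] + 0.5 * src_h
    src_ctr_x = src_bbox[:, 1] + 0.5 * src_w

    ctr_y = loc[:, 0] * src_h + src_ctr_y  # shift the center
    ctr_x = loc[:, 1] * src_w + src_ctr_x
    h = np.exp(loc[:, 2]) * src_h          # rescale height and width
    w = np.exp(loc[:, 3]) * src_w

    dst_bbox = np.zeros_like(loc)
    dst_bbox[:, 0] = ctr_y - 0.5 * h
    dst_bbox[:, 1] = ctr_x - 0.5 * w
    dst_bbox[:, 2] = ctr_y + 0.5 * h
    dst_bbox[:, 3] = ctr_x + 0.5 * w
    return dst_bbox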
Ejemplo n.º 19
def train(**kwargs):
    # opt._parse(kwargs)

    print('load data')
    dataloader = get_train_loader(opt.root_dir,
                                  batch_size=opt.batch_size,
                                  shuffle=opt.shuffle,
                                  num_workers=opt.num_workers,
                                  pin_memory=opt.pin_memory)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()

    # if opt.load_path:
    #     trainer.load(opt.load_path)
    #     print('load pretrained model from %s' % opt.load_path)

    # trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, sample in tqdm(enumerate(dataloader)):
            if len(sample.keys()) == 5:
                img_id, img, bbox_, scale, label_ = sample['img_id'], sample['image'], sample['bbox'], sample['scale'], \
                                                    sample['label']
                img, bbox, label = img.cuda().float(), bbox_.cuda(
                ), label_.cuda()
                img, bbox, label = Variable(img), Variable(bbox), Variable(
                    label)

            else:
                img_id, img, bbox_, scale, label_ = sample['img_id'], sample['image'], np.zeros((1, 0, 4)), \
                                                    sample['scale'], np.zeros((1, 0, 1))
                # keep bbox_/label_ defined so the plotting below does not raise a NameError
                bbox, label = bbox_, label_
                img = img.cuda().float()
                img = Variable(img)

            # if label.size == 0:
            #     continue

            scale = at.scalar(scale)
            trainer.train_step(img_id, img, bbox, label, scale)
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())

        if epoch % 10 == 0:
            best_path = trainer.save(best_map=best_map)
Ejemplo n.º 20
    def predict(self, imgs, sizes=None, visualize=False):  # prediction entry point
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()  # switch to eval mode (disables BatchNorm updates and Dropout)
        if visualize:  # visualization branch
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()  # final output boxes
        labels = list()  # final output labels
        scores = list()  # final output scores
        for img, size in zip(prepared_imgs, sizes):
            img = at.totensor(img[None]).float()  # add a batch dimension
            scale = img.shape[3] / size[1]  # scale applied during preprocessing
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)  # forward pass
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale  # map rois back to the original image scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]
            # Q: Some write-ups claim ProposalCreator normalizes the coordinates, which is
            #    why they must be mapped back here, but I don't see that in the code.
            # A: I think "ProposalCreator normalizes the coordinates" is wrong. The
            #    de-normalization here is needed because the training locs were normalized
            #    by ProposalTargetCreator, so the predicted locs come out normalized; it has
            #    nothing to do with ProposalCreator.
            roi_cls_loc = (roi_cls_loc * std + mean)  # de-normalize the coordinates
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)  # each box has n_class locs, so expand to the same shape for the refinement below
            
            # refine the boxes a second time to get the final boxes
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])  # clip boxes that exceed the image height
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])  # clip boxes that exceed the image width
            # softmax to get per-class probabilities for each box
            prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)
            # feed the boxes and their class probabilities into the suppression step
            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            
            # collect the coordinates, classes, and class probabilities
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')  # restore the evaluation preset
        self.train()  # switch back to train mode
        return bboxes, labels, scores
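# _suppress is called above but never shown. A self-contained sketch of the
# per-class thresholding and NMS it performs; the default thresholds and the
# treatment of class 0 as background are assumptions based on the
# surrounding code:
import numpy as np

def _nms(boxes, scores, thresh):
    # plain NumPy NMS over (y_min, x_min, y_max, x_max) boxes
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        yy1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        xx1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        yy2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        xx2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        inter = np.clip(yy2 - yy1, 0, None) * np.clip(xx2 - xx1, 0, None)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = ((boxes[order[1:], 2] - boxes[order[1:], 0]) *
                 (boxes[order[1:], 3] - boxes[order[1:], 1]))
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= thresh]
    return np.asarray(keep, dtype=np.int64)

def _suppress(raw_cls_bbox, raw_prob, n_class=21, nms_thresh=0.3, score_thresh=0.7):
    bbox, label, score = [], [], []
    for l in range(1, n_class):  # class 0 is the background, skip it
        cls_bbox_l = raw_cls_bbox.reshape((-1, n_class, 4))[:, l, :]
        prob_l = raw_prob[:, l]
        mask = prob_l > score_thresh
        cls_bbox_l, prob_l = cls_bbox_l[mask], prob_l[mask]
        keep = _nms(cls_bbox_l, prob_l, nms_thresh)
        bbox.append(cls_bbox_l[keep])
        label.append((l - 1) * np.ones((len(keep),), dtype=np.int32))
        score.append(prob_l[keep])
    bbox = np.concatenate(bbox, axis=0).astype(np.float32)
    label = np.concatenate(label, axis=0)
    score = np.concatenate(score, axis=0).astype(np.float32)
    return bbox, label, score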
Ejemplo n.º 21
    def predict(self, imgs,sizes=None,visualize=False):
        """Detect objects from images.
        从图像中检测物体

        This method predicts objects for each image.
          此方法预测每个图像的对象。
        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        # Set the module to evaluation mode. This only affects modules such as
        # Dropout and BatchNorm. (A method inherited from nn.Module.)
        self.eval()
        # visualization branch
        if visualize:
            # the 'visualize' preset sets self.nms_thresh = 0.3 and self.score_thresh = 0.7;
            # evaluation mode and visualization mode use different NMS and score thresholds
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                # print('nei img shape is ', img.shape)
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        # size is e.g. [600, 800]
        # print('sizes is ', sizes)
        for img, size in zip(prepared_imgs, sizes):
            # convert img from [3,600,800] to [1,3,600,800]: add a batch dim, wrap
            # it as a Variable, and mark it as inference-only
            img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
            # scale is 1 here
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
Ejemplo n.º 22
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images
                (batch size 1).
            bboxes (~torch.autograd.Variable): A batch of ground-truth
                bounding boxes. Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """

        n = bboxes.shape[0]
        # only batch size 1 is supported
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')
        # img_size = (height, width) of the preprocessed image
        _, _, H, W = imgs.shape
        img_size = (H, W)
        # extract features with the (pretrained VGG16) extractor network
        features = self.faster_rcnn.extractor(imgs)
        # the RPN (region proposal network) takes the image features and predicts the rois
        # rpn_locs [1,17316,4]   rpn_scores [1,17316,2]   rois [2000,4]
        # roi_indices [2000,] (all zeros)   anchor [17316,4]
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        # (i.e. squeeze the first dimension)
        # bbox becomes [1, 4] (a single ground-truth box in this example)
        bbox = bboxes[0]
        label = labels[0]
        # rpn_score becomes [17316, 2] and rpn_loc becomes [17316, 4]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        # roughly 2000 rois
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois; consider them as constant input
        # proposal_target_creator takes the rois (2000 proposals) and the annotated
        # bboxes and generates training targets; it is only used during training
        # it selects 128 of the 2000 rois:
        # sample_roi [128,4]   gt_roi_loc [128,4]   gt_roi_label [128,] (0 marks the background)
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: all zeros because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))
        # the RoI head classifies and fine-tunes the rois found by the RPN: it decides
        # whether each roi contains an object and refines the box position and coordinates,
        # using the RoI-pooled feature maps
        # inputs: the extracted features and the 128 RoIs
        # outputs: roi_cls_loc [128,84] (regression) and roi_score [128,21] (20 classes + background)
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        # assign the ground-truth bboxes to the anchors and return the
        # corresponding locs and labels
        # gt_rpn_loc [17316,4]   gt_rpn_label [17316,]
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        # convert to Variables; labels as long
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        # RPN localization loss (a scalar)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        # classification uses cross-entropy (F is torch.nn.functional)
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)

        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        # add to the RPN confusion matrix
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        # RoI classification and regression (squeeze the first dimension)
        # n_sample is 128
        n_sample = roi_cls_loc.shape[0]
        # reshape to [128, 21, 4]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        # pick the predicted loc for each RoI's ground-truth class
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        # gt_roi_label: ground-truth labels of the rois
        # gt_roi_loc: ground-truth locs of the rois
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)
        # RoI regression (localization) loss
        roi_loc_loss = _fast_rcnn_loc_loss(
            # contiguous() makes the memory layout contiguous
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)
        # RoI classification loss (cross-entropy)
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
        # add to the RoI confusion matrix
        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())
        # total loss
        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]
        # return a namedtuple: four losses + their sum
        return LossTuple(*losses)
Ejemplo n.º 23
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images

        This method predicts objects for each image

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.
        """

        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]  # [H, W], img.shape: [C, H, W]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()

        for img, size in zip(prepared_imgs, sizes):
            img = at.totensor(img[None]).float()
            # img[None] add a new axis shape: [C, H, W] -> [1, C, H, W]

            scale = img.shape[3] / size[1]  # new_W / ori_W

            roi_cls_loc, roi_scores, rois, _ = self(img,
                                                    scale)  # invokes __call__ on self
            # assuming that batch size is 1
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda().repeat(
                self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda().repeat(
                self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)

            cls_bbox = loc2bbox(
                at.tonumpy(roi).reshape((-1, 4)),
                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)

            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2].clamp(min=0, max=size[0]))
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2].clamp(min=0, max=size[1]))

            prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)
            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
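# preprocess is used by every predict above but not shown. A sketch under
# the usual convention (shorter side scaled to 600, longer side capped at
# 1000, then ImageNet normalization); the exact constants and the use of
# skimage are assumptions:
import numpy as np
from skimage import transform as sktsf

def preprocess(img, min_size=600, max_size=1000):
    # img: CHW, RGB, float values in [0, 255]
    C, H, W = img.shape
    scale = min(min_size / min(H, W), max_size / max(H, W))
    img = sktsf.resize(img / 255., (C, int(H * scale), int(W * scale)),
                       mode='reflect')
    # per-channel ImageNet mean/std (torchvision convention)
    mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    return ((img - mean) / std).astype(np.float32)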
Ejemplo n.º 24
def train(**kwargs):
    opt._parse(kwargs)

    print('dataset = Dataset(opt)')
    transform = transforms.Compose([
        # you can add other transformations in this list
        transforms.ToTensor()
    ])
    dataset = Dataset(opt, transform=transform)
    dataloader = data_.DataLoader(dataset, \
                                  batch_size=1, \
                                  shuffle=True, \
                                  # pin_memory=True,

                                  num_workers=opt.num_workers,
                                  )
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False, \
                                       pin_memory=True
                                       )
    print('faster_rcnn = FasterRCNNVGG16()')
    faster_rcnn = FasterRCNNVGG16()
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s at train.py line 70' %
              opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            print("tqdm(enumerate(dataloader)):")
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)

            print("train.py trainer.train_step(img, bbox, label, scale)")
            print(img.shape)
            print(bbox.shape)
            print(label.shape)

            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                print("trian.py line94")
                print(trainer.get_meter_data())
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        trainer.vis.plot('test_map', eval_result['map'])
        log_info = 'lr:{}, map:{},loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if epoch == 13:
            break
def train(**kwargs):
    opt._parse(kwargs)
    dataset = Dataset(opt)
    # 300w_dataset = FaceLandmarksDataset()
    print('load data')
    dataloader = data_.DataLoader(dataset, \
                                  batch_size=1, \
                                  shuffle=True, \
                                  pin_memory=True,\
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False, \
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    attacker = attacks.DCGAN(train_adv=False)
    if opt.load_attacker:
        attacker.load(opt.load_attacker)
        print('load attacker model from %s' % opt.load_attacker)
    trainer = VictimFasterRCNNTrainer(faster_rcnn, attacker,
                                      attack_mode=True).cuda()
    # trainer = VictimFasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    # eval_result = eval(test_dataloader, faster_rcnn, test_num=2000)
    best_map = 0
    for epoch in range(opt.epoch):
        trainer.reset_meters(adv=True)
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            # ipdb.set_trace()  # unconditional breakpoint left commented out: it would halt every iteration
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            trainer.train_step(img, bbox, label, scale)

            if (ii) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())
                trainer.vis.plot_many(trainer.get_meter_data(adv=True))

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)
                if trainer.attacker is not None:
                    adv_img = trainer.attacker.perturb(img)
                    adv_img_ = inverse_normalize(at.tonumpy(adv_img[0]))
                    _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                        [adv_img_], visualize=True)
                    adv_pred_img = visdom_bbox(
                        adv_img_, at.tonumpy(_bboxes[0]),
                        at.tonumpy(_labels[0]).reshape(-1),
                        at.tonumpy(_scores[0]))
                    trainer.vis.img('adv_img', adv_pred_img)
                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())

                if (ii) % 500 == 0:
                    best_path = trainer.save(epochs=epoch, save_rcnn=True)

        if epoch % 2 == 0:
            best_path = trainer.save(epochs=epoch)
Ejemplo n.º 26
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]  # batch size
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape  # image height and width
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)  # feature map
        # rpn_locs has shape (hh*ww*9, 4), rpn_scores (hh*ww*9, 2),
        # rois (2000, 4); roi_indices is unused; anchor has shape (hh*ww*9, 4).
        # H and W are the post-preprocessing sizes. Compute the probability of
        # each of the (H/16) x (W/16) x 9 (roughly 20000) anchors being
        # foreground, take the top 12000, and apply NMS to get the coordinates
        # of 2000 approximate target boxes G^. roi has shape (2000, 4).
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]  # [R, 4]
        label = labels[0]  # [R,]
        rpn_score = rpn_scores[0]  # [hh*ww*9, 2]
        rpn_loc = rpn_locs[0]  # [hh*ww*9, 4]
        roi = rois  # [2000, 4]

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois,
        # consider them as constant input

        # proposal_target_creator generates sample_roi (128, 4),
        # gt_roi_loc (128, 4) and gt_roi_label (128, 1). The RoIHead network
        # takes sample_roi + features as input and outputs classification
        # (21 classes) and regression (further bbox refinement) predictions;
        # the ground truth for that classification/regression is the
        # gt_roi_label/gt_roi_loc produced by ProposalTargetCreator.
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: all zeros because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        # feed the ~20000 anchors and the ground-truth bboxes to
        # anchor_target_creator to get the anchor offsets and labels
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        # roi_cls_loc is the VGG16RoIHead output (128, 84); n_sample = 128
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        # [128, 21, 4]
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(), gt_roi_loc,
                                           gt_roi_label.data, self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
Ejemplo n.º 27
0
    def forward(self, imgs, bboxes, labels, scale):
        """前向传播过程计算损失
        参数:
            imgs: [N, C, H, W]
            bboxes: [N, R, 4]
            labels: [N, R]
            scale: 单个值就可以
        返回:5个损失"""
        num_batch = bboxes.shape[0]
        if num_batch != 1:
            raise ValueError("仅支持batch_size=1")

        # get the image size H, W
        _, _, H, W = imgs.shape
        img_size = (H, W)
        # get the feature map
        features = self.faster_rcnn.extractor(imgs)
        # run the RPN, which outputs the predicted anchor offsets and scores
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
            features, img_size, scale)
        # since batch size is 1, take the single element of each:
        bbox = bboxes[0]
        label = labels[0]
        rpn_loc = rpn_locs[0]
        rpn_score = rpn_scores[0]
        roi = rois

        # generate the ground-truth offsets and labels for the anchors
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            bbox=tonumpy(data=bbox), anchor=anchor, img_size=img_size)

        # generate the ground-truth offsets and labels for the proposals
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi=roi,
            bbox=tonumpy(bbox),
            label=tonumpy(label),
            loc_normalize_mean=self.loc_normalize_mean,
            loc_normalize_std=self.loc_normalize_std)
        # since batch_size=1, the sample_roi_index entries are all 0
        sample_roi_index = torch.zeros(len(sample_roi))
        # predict offsets and scores for the boxes derived from the proposals
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            x=features, rois=sample_roi, roi_indices=sample_roi_index)

        # ------------------------rpn loss----------------------------------#
        gt_rpn_label = totensor(data=gt_rpn_label).long()
        gt_rpn_loc = totensor(data=gt_rpn_loc)
        rpn_loc_loss = _faster_rcnn_loc_loss(pred_loc=rpn_loc,
                                             gt_loc=gt_rpn_loc,
                                             gt_label=gt_rpn_label.data,
                                             sigma=self.rpn_sigma)
        rpn_cls_loss = F.cross_entropy(input=rpn_score,
                                       target=gt_rpn_label.cuda(),
                                       ignore_index=-1)
        # ground-truth labels excluding those labeled -1
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = tonumpy(data=rpn_score)[tonumpy(data=gt_rpn_label) > -1]
        self.rpn_cm.add(predicted=totensor(data=_rpn_score, cuda=False),
                        target=_gt_rpn_label.data.long())

        # ---------------------roi loss---------------------------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        # take the predicted offsets corresponding to gt_roi_label
        roi_loc = roi_cls_loc[torch.arange(0, n_sample),
                              totensor(data=gt_roi_label).long()]
        gt_roi_loc = totensor(data=gt_roi_loc)
        gt_roi_label = totensor(data=gt_roi_label).long()
        roi_loc_loss = _faster_rcnn_loc_loss(pred_loc=roi_loc.contiguous(),
                                             gt_loc=gt_roi_loc,
                                             gt_label=gt_roi_label.data,
                                             sigma=self.roi_sigma)
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
        self.roi_cm.add(predicted=totensor(roi_score, False),
                        target=gt_roi_label.data.long())
        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]
        return LossTuple(*losses)
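# The at.tonumpy / at.totensor / at.tovariable / at.scalar helpers used
# throughout come from an array_tool module that is not part of these
# snippets. A sketch of what such converters typically do (modern PyTorch
# merged Variable into Tensor, so tovariable is shown as an alias; the
# details are assumptions):
import numpy as np
import torch as t

def tonumpy(data):
    if isinstance(data, np.ndarray):
        return data
    return data.detach().cpu().numpy()  # works for tensors on any device

def totensor(data, cuda=True):
    if isinstance(data, np.ndarray):
        data = t.from_numpy(data)
    data = data.detach()
    return data.cuda() if cuda else data

def tovariable(data):
    # Variable and Tensor are the same thing in modern PyTorch
    return totensor(data)

def scalar(data):
    if isinstance(data, np.ndarray):
        return data.reshape(1)[0]
    return data.view(1)[0].item()  # one-element tensor -> Python number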
Ejemplo n.º 28
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset, \
                                  batch_size=1, \
                                  shuffle=True, \
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False, \
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if epoch == 13: 
            break
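# The eval(...) called at the end of each epoch is not part of these
# snippets. A sketch of what it computes, assuming chainercv's VOC
# evaluation as in this family of training scripts (the dataloader tuple
# layout is an assumption):
from chainercv.evaluations import eval_detection_voc

def eval(dataloader, faster_rcnn, test_num=10000):
    pred_bboxes, pred_labels, pred_scores = [], [], []
    gt_bboxes, gt_labels, gt_difficults = [], [], []
    for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in enumerate(dataloader):
        sizes = [sizes[0][0].item(), sizes[1][0].item()]
        pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
        gt_bboxes += list(gt_bboxes_.numpy())
        gt_labels += list(gt_labels_.numpy())
        gt_difficults += list(gt_difficults_.numpy())
        pred_bboxes += pred_bboxes_
        pred_labels += pred_labels_
        pred_scores += pred_scores_
        if ii == test_num:
            break
    # returns a dict holding the per-class 'ap' array and the overall 'map'
    return eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        use_07_metric=True)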
Ejemplo n.º 29
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in a image. \
               Each bouding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = at.totensor(img[None]).float()
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_scores = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = torch.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = torch.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
Ejemplo n.º 30
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    testset_all = TestDataset_all(opt, 'test2')
    test_all_dataloader = data_.DataLoader(testset_all,
                                           batch_size=1,
                                           num_workers=opt.test_num_workers,
                                           shuffle=False,
                                           pin_memory=True
                                           )

    tsf = Transform(opt.min_size, opt.max_size)
    faster_rcnn = FasterRCNNVGG16()
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    print('model construct completed')

    # to load a trained model, just set the path in config
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    # extract the soft labels needed for knowledge distillation
    if opt.is_distillation == True:
        opt.predict_socre = 0.3
        for ii, (imgs, sizes, gt_bboxes_, gt_labels_, scale, id_) in tqdm(enumerate(dataloader)):
            if len(gt_bboxes_) == 0:
                continue
            sizes = [sizes[0][0].item(), sizes[1][0].item()]
            pred_bboxes_, pred_labels_, pred_scores_, features_ = trainer.faster_rcnn.predict(imgs, [
                sizes])

            img_file = os.path.join(
                opt.voc_data_dir, 'JPEGImages', id_[0] + '.jpg')
            ori_img = read_image(img_file, color=True)
            img, pred_bboxes_, pred_labels_, scale_ = tsf(
                (ori_img, pred_bboxes_[0], pred_labels_[0]))

            # drop soft labels that overlap too much with the ground-truth labels, and drop wrong soft labels
            pred_bboxes_, pred_labels_, pred_scores_ = py_cpu_nms(
                gt_bboxes_[0], gt_labels_[0], pred_bboxes_, pred_labels_, pred_scores_[0])

            # save the soft labels to disk; this keeps GPU memory usage low
            np.save('label/' + str(id_[0]) + '.npy', pred_labels_)
            np.save('bbox/' + str(id_[0]) + '.npy', pred_bboxes_)
            np.save('feature/' + str(id_[0]) + '.npy', features_)
            np.save('score/' + str(id_[0]) + '.npy', pred_scores_)

        opt.predict_socre = 0.05
    t.cuda.empty_cache()

    # show all class label names in visdom
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr

    for epoch in range(opt.epoch):
        print('epoch=%d' % epoch)

        # reset the confusion matrices
        trainer.reset_meters()
        # tqdm adds a progress bar to long loops: wrap any iterator as
        # tqdm(iterator); it is fast and extensible
        for ii, (img, sizes, bbox_, label_, scale, id_) in tqdm(enumerate(dataloader)):
            if len(bbox_) == 0:
                continue
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            # the actual training is this one step; everything below is logging
            # convert to tensors that pytorch can compute with
            if opt.is_distillation == True:
                # load the soft labels
                teacher_pred_labels = np.load(
                    'label/' + str(id_[0]) + '.npy')
                teacher_pred_bboxes = np.load(
                    'bbox/' + str(id_[0]) + '.npy')
                teacher_pred_features_ = np.load(
                    'feature/' + str(id_[0]) + '.npy')
                teacher_pred_scores = np.load(
                    'score/' + str(id_[0]) + '.npy')
                # dtype conversion
                teacher_pred_bboxes = teacher_pred_bboxes.astype(np.float32)
                teacher_pred_labels = teacher_pred_labels.astype(np.int32)
                teacher_pred_scores = teacher_pred_scores.astype(np.float32)
                # convert to pytorch tensors
                teacher_pred_bboxes_ = at.totensor(teacher_pred_bboxes)
                teacher_pred_labels_ = at.totensor(teacher_pred_labels)
                teacher_pred_scores_ = at.totensor(teacher_pred_scores)
                teacher_pred_features_ = at.totensor(teacher_pred_features_)
                # move to GPU
                teacher_pred_bboxes_ = teacher_pred_bboxes_.cuda()
                teacher_pred_labels_ = teacher_pred_labels_.cuda()
                teacher_pred_scores_ = teacher_pred_scores_.cuda()
                teacher_pred_features_ = teacher_pred_features_.cuda()

                # if Transform in dataset.py flips the image, use this check to decide whether the soft labels must be flipped too
                if(teacher_pred_bboxes_[0][1] != bbox[0][0][1]):
                    _, o_C, o_H, o_W = img.shape
                    teacher_pred_bboxes_ = flip_bbox(
                        teacher_pred_bboxes_, (o_H, o_W), x_flip=True)

                losses = trainer.train_step(img, bbox, label, scale, epoch,
                                            teacher_pred_bboxes_, teacher_pred_labels_, teacher_pred_features_, teacher_pred_scores)
            else:
                trainer.train_step(img, bbox, label, scale, epoch)

            # information displayed in visdom
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(teacher_pred_bboxes_),
                                     at.tonumpy(teacher_pred_labels_),
                                     at.tonumpy(teacher_pred_scores_))
                trainer.vis.img('gt_img_all', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores, _ = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # confusion matrices
                # rpn confusion matrix(meter)
                trainer.vis.text(
                    str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.text(
                    str(trainer.roi_cm.value().tolist()), win='roi_cm')
                # trainer.vis.img('roi_cm', at.totensor(
                # trainer.roi_cm.value(), False).float())

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{},ap:{}, map:{},loss:{}'.format(str(lr_),
                                                        str(eval_result['ap']),
                                                        str(eval_result['map']),
                                                        str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        # save the best result and remember its path
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)

        if epoch == 20:
            trainer.save(best_map='20')
            result = eval(test_all_dataloader,
                          trainer.faster_rcnn, test_num=5000)
            print('20result={}'.format(str(result)))
            # trainer.load(best_path)
            # result=eval(test_all_dataloader,trainer.faster_rcnn,test_num=5000)
            # print('bestmapresult={}'.format(str(result)))
            break

        # periodically (when epoch % 20 == 15) reload the best weights so far and decay the learning rate
        if epoch % 20 == 15:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
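# flip_bbox (used above to flip the soft labels whenever the image was
# flipped) follows the chainercv convention. A sketch assuming
# (y_min, x_min, y_max, x_max) boxes in a NumPy array:
def flip_bbox(bbox, size, y_flip=False, x_flip=False):
    # bbox: (R, 4); size: (H, W) of the image the boxes live in
    H, W = size
    bbox = bbox.copy()
    if y_flip:
        y_max = H - bbox[:, 0]
        y_min = H - bbox[:, 2]
        bbox[:, 0] = y_min
        bbox[:, 2] = y_max
    if x_flip:
        x_max = W - bbox[:, 1]
        x_min = W - bbox[:, 3]
        bbox[:, 1] = x_min
        bbox[:, 3] = x_max
    return bbox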
Ejemplo n.º 31
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois,
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: the indices are all zero because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: F.cross_entropy defaults ignore_index to -100, so -1 is passed explicitly here
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(), gt_roi_loc,
                                           gt_roi_label.data, self.roi_sigma)
        #         weight = t.Tensor([10,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]).cuda()
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
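
`_fast_rcnn_loc_loss` is used for both the RPN and the RoI head but is not shown in this example. Below is a minimal sketch under the usual Faster R-CNN convention (smooth L1 loss over positive samples only, normalized by the number of non-ignored samples); the repository's exact implementation may differ in details:

import torch as t

def _smooth_l1_loss(x, gt, in_weight, sigma):
    # Smooth L1: quadratic for |diff| < 1/sigma^2, linear beyond it.
    sigma2 = sigma ** 2
    diff = in_weight * (x - gt)
    abs_diff = diff.abs()
    flag = (abs_diff < (1. / sigma2)).float()
    y = flag * (sigma2 / 2.) * (diff ** 2) + (1 - flag) * (abs_diff - 0.5 / sigma2)
    return y.sum()

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    # Only positive samples (label > 0) contribute to the localization loss.
    in_weight = t.zeros_like(gt_loc)
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight)] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma)
    # Normalize by the number of labeled (non-ignored) samples.
    loc_loss /= float((gt_label >= 0).sum())
    return loc_loss
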
Ejemplo n.º 32
def train(**kwargs):  # *args collects any number of positional arguments into a tuple; **kwargs collects keyword arguments into a dict
    opt._parse(kwargs)  # parse the incoming kwargs dict into the config object

    dataset = Dataset(opt)  # the author's custom Dataset class
    print('loading data...')

    # The DataLoader defines how one batch of data is fetched at a time
    dataloader = data_.DataLoader(dataset, \
                                  batch_size=1, \
                                  shuffle=True, \
                                  # pin_memory=True,

                                  num_workers=opt.num_workers) # PyTorch's built-in DataLoader wraps the dataset in a multi-worker iterator that yields one batch at a time
    testset = TestDataset(opt, split='trainval')

    # test-set loader
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False, \
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()  # network definition
    print('model construction finished!')

    trainer = FasterRCNNTrainer(
        faster_rcnn).cuda()  # build a trainer that computes the losses; .cuda() moves its tensors to the GPU

    if opt.load_path:  # optionally load a pretrained model
        trainer.load(opt.load_path)
        print('loaded pretrained parameters from %s' % opt.load_path)
    else:
        print('no pretrained parameters loaded; network weights are randomly initialized')

    trainer.vis.text(dataset.db.label_names, win='labels')  # show the label names in Visdom
    best_map = 0  # track the best mAP seen so far

    for epoch in range(opt.epoch):  # for each epoch

        trainer.reset_meters()  # reset the loss and confusion-matrix meters

        # for each training sample
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)  # convert to a Python scalar
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda(
            )  # move to the GPU
            img, bbox, label = Variable(img), Variable(bbox), Variable(
                label)  # wrap as Variables so autograd can track them
            # TODO
            trainer.train_step(img, bbox, label, scale)  # one training step

            if (ii + 1) % opt.plot_every == 0:  # every plot_every iterations, visualize progress
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_, at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_, at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img(
                    'roi_cm',
                    at.totensor(trainer.roi_cm.conf, False).float())

        # evaluate the model on the test set (this step runs prediction internally)
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(
                best_map=best_map)  # save the model whenever the mAP improves; checkpoints are written to the checkpoint folder

        if epoch == 9:  # at epoch 9, reload the best model so far and decay the learning rate
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        # if epoch == 13:  # stop training at the 14th epoch
        #     break

    trainer.save(best_map=best_map)
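
The `at.*` helpers used throughout (`tonumpy`, `totensor`, `tovariable`, `scalar`) come from the repository's `utils/array_tool.py`, which is not shown here. A rough sketch of the assumed conversion behavior:

import numpy as np
import torch as t

def tonumpy(data):
    # Accept an ndarray, Tensor, or Variable and return an ndarray.
    if isinstance(data, np.ndarray):
        return data
    if isinstance(data, t.autograd.Variable):
        data = data.data
    return data.cpu().numpy()

def totensor(data, cuda=True):
    # Accept an ndarray, Tensor, or Variable and return a (GPU) Tensor.
    if isinstance(data, np.ndarray):
        data = t.from_numpy(data)
    if isinstance(data, t.autograd.Variable):
        data = data.data
    return data.cuda() if cuda else data

def tovariable(data):
    # Wrap the converted tensor in an autograd Variable (pre-0.4 PyTorch style).
    return t.autograd.Variable(totensor(data))

def scalar(data):
    # Reduce a one-element array or tensor to a Python scalar.
    if isinstance(data, np.ndarray):
        return data.reshape(1)[0]
    return data.view(1)[0]
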
Ejemplo n.º 33
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois, 
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: the indices are all zero because only batch size 1 is supported
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)

        # NOTE: F.cross_entropy defaults ignore_index to -100, so -1 is passed explicitly here
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized as \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in the range :math:`[0, L - 1]`, where :math:`L` is the \
               number of foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
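
`_suppress`, called at the end of `predict`, is not shown in this example. Below is a minimal sketch, assuming score thresholding followed by per-class non-maximum suppression; `self.score_thresh` and `self.nms_thresh` are assumed to be set by `use_preset`, and the plain-NumPy `nms` below stands in for whatever NMS helper the repository actually uses:

import numpy as np

def nms(bboxes, scores, thresh):
    # Greedy NMS on (y_min, x_min, y_max, x_max) boxes, highest score first.
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        tl = np.maximum(bboxes[i, :2], bboxes[order[1:], :2])
        br = np.minimum(bboxes[i, 2:], bboxes[order[1:], 2:])
        inter = np.prod(np.clip(br - tl, 0, None), axis=1)
        area_i = np.prod(bboxes[i, 2:] - bboxes[i, :2])
        areas = np.prod(bboxes[order[1:], 2:] - bboxes[order[1:], :2], axis=1)
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= thresh]
    return np.asarray(keep, dtype=np.int32)

def _suppress(self, raw_cls_bbox, raw_prob):
    bbox, label, score = [], [], []
    # Class 0 is the background; iterate over foreground classes only.
    for l in range(1, self.n_class):
        cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
        prob_l = raw_prob[:, l]
        mask = prob_l > self.score_thresh  # drop low-confidence detections
        cls_bbox_l, prob_l = cls_bbox_l[mask], prob_l[mask]
        keep = nms(cls_bbox_l, prob_l, self.nms_thresh)
        bbox.append(cls_bbox_l[keep])
        label.append((l - 1) * np.ones((len(keep),)))  # shift labels to [0, L-1]
        score.append(prob_l[keep])
    bbox = np.concatenate(bbox, axis=0).astype(np.float32)
    label = np.concatenate(label, axis=0).astype(np.int32)
    score = np.concatenate(score, axis=0).astype(np.float32)
    return bbox, label, score
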