def __init__(self, weights_path):
        self.model = SSD('test')
        self.model.cuda().eval()

        state = torch.load(weights_path, map_location=lambda storage, loc: storage)
        state = {key: value.float() for key, value in state.items()}
        self.model.load_state_dict(state)

        self.transform = GeneralizedRCNNTransform(DETECTOR_MIN_SIZE, DETECTOR_MAX_SIZE, DETECTOR_MEAN, DETECTOR_STD)
        self.transform.eval()
Example #2
    def _init_test_generalized_rcnn_transform(self):
        min_size = 100
        max_size = 200
        image_mean = [0.485, 0.456, 0.406]
        image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
        return transform
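For reference, a minimal sketch (with made-up image sizes and boxes, not taken from the example above) of what calling such a transform returns: an ImageList holding the padded batch plus the resized targets.

import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(100, 200, [0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
transform.eval()

images = [torch.rand(3, 120, 160), torch.rand(3, 90, 140)]
targets = [{"boxes": torch.tensor([[10.0, 10.0, 50.0, 60.0]])},
           {"boxes": torch.tensor([[5.0, 5.0, 30.0, 40.0]])}]

image_list, targets = transform(images, targets)
print(image_list.tensors.shape)  # padded batch tensor, e.g. [2, 3, H, W]
print(image_list.image_sizes)    # per-image sizes after resizing
print(targets[0]["boxes"])       # boxes rescaled to the resized image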
Example #3
    def test_not_float_normalize(self):
        transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3),
                                             torch.ones(3))
        image = [torch.randint(0, 255, (3, 200, 300), dtype=torch.uint8)]
        targets = [{"boxes": torch.rand(3, 4)}]
        with pytest.raises(TypeError):
            out = transform(image, targets)  # noqa: F841
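A minimal sketch of the usual fix for the TypeError exercised above: convert the uint8 image to a float tensor in [0, 1] before handing it to the transform (torchvision's convert_image_dtype performs the same scaling).

import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
image_uint8 = torch.randint(0, 255, (3, 200, 300), dtype=torch.uint8)
image_float = image_uint8.to(torch.float32) / 255.0  # uint8 -> float in [0, 1]
targets = [{"boxes": torch.rand(3, 4)}]
image_list, targets = transform([image_float], targets)  # no TypeError now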
Example #4
    def __init__(self_module):
        super(TransformModule, self_module).__init__()
        min_size = 800
        max_size = 1333
        image_mean = [0.485, 0.456, 0.406]
        image_std = [0.229, 0.224, 0.225]
        self_module.transform = GeneralizedRCNNTransform(
            min_size, max_size, image_mean, image_std)

    def test_transform_copy_targets(self):
        transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
        image = [torch.rand(3, 200, 300), torch.rand(3, 200, 200)]
        targets = [{'boxes': torch.rand(3, 4)}, {'boxes': torch.rand(2, 4)}]
        targets_copy = copy.deepcopy(targets)
        out = transform(image, targets)  # noqa: F841
        self.assertTrue(torch.equal(targets[0]['boxes'], targets_copy[0]['boxes']))
        self.assertTrue(torch.equal(targets[1]['boxes'], targets_copy[1]['boxes']))
Example #6
    def test_transform_copy_targets(self):
        transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
        image = [torch.rand(3, 200, 300), torch.rand(3, 200, 200)]
        targets = [{"boxes": torch.rand(3, 4)}, {"boxes": torch.rand(2, 4)}]
        targets_copy = copy.deepcopy(targets)
        out = transform(image, targets)  # noqa: F841
        assert_equal(targets[0]["boxes"], targets_copy[0]["boxes"])
        assert_equal(targets[1]["boxes"], targets_copy[1]["boxes"])

    def __init__(self, root, img_transform, extra_info=True):
        self.root = root
        self.img_transform = img_transform
        self.extra_info = extra_info
        self.anno = pd.read_csv(
            os.path.join(self.root, 'annotation.csv'))

        self.target_transform = GeneralizedRCNNTransform(
            400, 400, [0., 0., 0.], [1., 1., 1.])
Example #8
class Detector(object):
    def __init__(self, weights_path):
        self.model = SSD('test')
        self.model.cuda().eval()

        state = torch.load(weights_path,
                           map_location=lambda storage, loc: storage)
        state = {key: value.float() for key, value in state.items()}
        self.model.load_state_dict(state)

        self.transform = GeneralizedRCNNTransform(DETECTOR_MIN_SIZE,
                                                  DETECTOR_MAX_SIZE,
                                                  DETECTOR_MEAN, DETECTOR_STD)
        self.transform.eval()

    def detect(self, images):
        images = torch.stack(
            [torch.from_numpy(image).cuda() for image in images])
        images = images.transpose(1, 3).transpose(2, 3).float()
        original_image_sizes = [img.shape[-2:] for img in images]
        images, _ = self.transform(images, None)
        with torch.no_grad():
            detections_batch = self.model(images.tensors).cpu().numpy()
        result = []
        for detections, image_size in zip(detections_batch,
                                          images.image_sizes):
            scores = detections[1, :, 0]
            keep_idxs = scores > DETECTOR_THRESHOLD
            detections = detections[1, keep_idxs, :]
            detections = detections[:, [1, 2, 3, 4, 0]]
            detections[:, 0] *= image_size[1]
            detections[:, 1] *= image_size[0]
            detections[:, 2] *= image_size[1]
            detections[:, 3] *= image_size[0]
            result.append({
                'scores': torch.from_numpy(detections[:, 4]),
                'boxes': torch.from_numpy(detections[:, :4])
            })

        result = self.transform.postprocess(result, images.image_sizes,
                                            original_image_sizes)
        return result
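The detector above relies on two halves of GeneralizedRCNNTransform: resizing the inputs, then mapping the predicted boxes back to the original resolution with postprocess. A self-contained sketch of that round trip, with invented sizes and boxes standing in for the DETECTOR_* constants and the model output:

import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(300, 500, [0.0, 0.0, 0.0], [1.0, 1.0, 1.0])
transform.eval()  # postprocess only rescales boxes in eval mode

images = [torch.rand(3, 240, 320)]
original_sizes = [img.shape[-2:] for img in images]
image_list, _ = transform(images, None)

# stand-in for detector output, expressed in resized-image coordinates
result = [{"boxes": torch.tensor([[10.0, 20.0, 100.0, 150.0]]),
           "scores": torch.tensor([0.9])}]
restored = transform.postprocess(result, image_list.image_sizes, original_sizes)
print(restored[0]["boxes"])  # boxes mapped back to the original 240x320 frame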
    def __init__(self, backbone, num_classes,
        min_size=800, max_size=1333,
        image_mean=None, image_std=None,
        anchor_generator=None, head=None,
        proposal_matcher=None,
        score_thresh=0.05,
        nms_thresh=0.5,
        detections_per_img=300,
        fg_iou_thresh=0.5, bg_iou_thresh=0.4,
        topk_candidates=1000):

        super(RetinaNet, self).__init__()

        if not hasattr(backbone, "out_channels"):
            raise ValueError("backbone should contain an attribute out_channels specifying the number of output channels "
                "assumed be the samefor all the levels")

        self.backbone = backbone

        assert isinstance(anchor_generator, (AnchorGenerator, type(None)))

        if anchor_generator is None:
            anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [32, 64, 128, 256, 512])
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

        self.anchor_generator = anchor_generator

        if head is None:
            head = RetinaNetHead(backbone.out_channels, anchor_generator.num_anchors_per_location()[0], num_classes)
        self.head = head

        if proposal_matcher is None:
            proposal_matcher = det_utils.Matcher(
                fg_iou_thresh,
                bg_iou_thresh,
                allow_low_quality_matches=True,
            )
        self.proposal_matcher = proposal_matcher

        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img
        self.topk_candidates = topk_candidates

        self.has_warned = False
Example #10
def get_features_for_projection(model, imagePath, device):
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    # these transform parameters come from the Mask R-CNN source code
    transform = GeneralizedRCNNTransform(min_size=800, max_size=1333, image_mean=image_mean, image_std=image_std)
    image = Image.open(imagePath)
    image_tensor = TF.to_tensor(image)
    # wrap the tensor in a list (the transform accepts multiple images)
    # TODO make it multiple
    images = [image_tensor]
    images, _ = transform(images)
    features = model.backbone(images.tensors.to(device))
    features_to_be_projected = features['pool']
    return features_to_be_projected
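A hedged usage sketch for the helper above: it assumes the model is torchvision's Mask R-CNN, whose FPN backbone exposes a 'pool' feature level, and the image path is a made-up placeholder.

import torch
from torchvision.models.detection import maskrcnn_resnet50_fpn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = maskrcnn_resnet50_fpn(pretrained=False, pretrained_backbone=False)
model = model.to(device).eval()

with torch.no_grad():
    feats = get_features_for_projection(model, "example.jpg", device)  # hypothetical path
print(feats.shape)  # roughly [1, 256, h, w] for the coarsest ('pool') FPN level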
Example #11
def get_features_for_projection_multi(model, imagePaths, device):
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    # these transform parameters come from the Mask R-CNN source code
    transform = GeneralizedRCNNTransform(min_size=800, max_size=1333, image_mean=image_mean, image_std=image_std)
    images = [Image.open(imagePath) for imagePath in imagePaths]
    image_tensors = [TF.to_tensor(image) for image in images]
    # process each tensor separately, wrapped in a one-element list
    results = []
    with torch.no_grad():
        for tensor in image_tensors:
            images, _ = transform([tensor])
            features = model.backbone(images.tensors.to(device))
            features_to_be_projected = features['pool']
            results.append(features_to_be_projected[0])
    return results
Example #12
    def __init__(self,
                 backbone,
                 rpn,
                 roi_heads,
                 mask_net,
                 transform,
                 input_img_num=6,
                 depth_estimator_path='_depth_net.pth'):
        super(GeneralizedRCNN, self).__init__()
        self.transform = transform
        self.backbone = UNet(4, 1)
        self.backbone_ = UNet(4, 64)
        self.input_img_num = input_img_num
        self.rpn = rpn
        self.roi_heads = roi_heads
        self.mask_net = UnetMask(6, 1)
        self.backbone_out_channels = 64

        self.depth_estimator_path = depth_estimator_path
        self.depth_estimator = VggDepthEstimator()
        self.depth_estimator.load_state_dict(
            torch.load(self.depth_estimator_path))
        self.depth_resize = nn.Upsample(size=(400, 400),
                                        mode='bilinear',
                                        align_corners=True)

        self.img_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((400, 400)),
            transforms.ToTensor(),
            transforms.Normalize(MEAN, STD)
        ])
        self.target_transform = GeneralizedRCNNTransform(
            400, 400, [0., 0., 0.], [1., 1., 1.])

        self.mask_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((800, 800)),
            transforms.ToTensor()
        ])

        self.depth_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((128, 416)),
            transforms.ToTensor(),
            transforms.Normalize(MEAN, STD)
        ])
    def __init__(self, backbone, rpn, roi_heads, transform, input_img_num=6):
        super(DetectionGeneralizedRCNN, self).__init__()
        self.transform = transform
        self.backbone = backbone
        self.input_img_num = input_img_num
        self.rpn = rpn
        self.roi_heads = roi_heads
        self.backbone_out_channels = backbone.out_channels

        self.img_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((400, 400)),
            transforms.ToTensor(),
            transforms.Normalize(MEAN, STD)
        ])
        self.target_transform = GeneralizedRCNNTransform(
            400, 400, [0., 0., 0.], [1., 1., 1.])
Example #14
def store_features_for_projection_multi(imagePaths, model_maskrcnn, outputs,
                                        sceneid):
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    # these transform parameters come from the Mask R-CNN source code
    transform = GeneralizedRCNNTransform(min_size=800,
                                         max_size=1333,
                                         image_mean=image_mean,
                                         image_std=image_std)
    images = [Image.open(imagePath) for imagePath in imagePaths]
    image_tensors = [TF.to_tensor(image) for image in images]
    # the transform takes the whole list of image tensors at once
    images, _ = transform(image_tensors)
    with torch.no_grad():
        body = model_maskrcnn.body
        output = body(images.tensors)
    torch.save(output, outputs + sceneid + ".fea")
Example #15
    def __init__(
            self,
            backbone,
            num_classes=2,
            num_pids=5532,
            num_cq_size=5000,
            # transform parameters
            min_size=900,
            max_size=1500,
            image_mean=None,
            image_std=None,
            # Anchor settings:
            anchor_scales=None,
            anchor_ratios=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=12000,
            rpn_pre_nms_top_n_test=6000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=300,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            rcnn_bbox_bn=True,
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.4,
            box_detections_per_img=300,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.1,
            box_batch_size_per_image=128,
            box_positive_fraction=0.5,
            bbox_reg_weights=None,
            # ReID parameters
            feat_head=None,
            reid_head=None,
            reid_loss=None):
        if rpn_anchor_generator is None:
            anchor_sizes = ((32, 64, 128, 256, 512), )
            aspect_ratios = ((0.5, 1.0, 2.0), )
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                backbone.out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)
        rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction, rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat2rpn'],
                                              output_size=[14, 14],
                                              sampling_ratio=2)
        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 2048
            box_head = GAP_BOX_HEAD(resolution, feat_head, representation_size)
        if box_predictor is None:
            representation_size = 2048
            box_predictor = FastRCNNPredictor(representation_size,
                                              num_classes,
                                              RCNN_bbox_bn=False)
        if reid_head is None:
            reid_head = REID_HEAD(box_head.out_dims, 256)
        if reid_loss is None:
            reid_loss = OIMLoss(256, num_pids, num_cq_size, 0.5, 30)
        roi_heads = OIM_ROI_HEAD(
            reid_head,
            reid_loss,
            # box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img)
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)
        super(FasterRCNN_OIM, self).__init__(backbone, rpn, roi_heads,
                                             transform)
def evaluate_yolo_2017(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)
    transform = GeneralizedRCNNTransform(416, 416, [0, 0, 0], [1, 1, 1])
    transform.eval()
    for image, targets in metric_logger.log_every(data_loader, 100, header):
        image = list(img.to(device) for img in image)

        original_image_sizes = [img.shape[-2:] for img in image]

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        transformed_img = transform(image)
        transformed_shape = transformed_img[0].tensors.shape[-2:]
        inf_out, _ = model(transformed_img[0].tensors)
        # Run NMS
        output = non_max_suppression(inf_out, conf_thres=0.001, iou_thres=0.6)

        # Statistics per image
        predictions = []
        for si, pred in enumerate(output):
            prediction = {'boxes': [], 'labels': [], 'scores': []}
            if pred is None:
                continue
            # Append to text file
            # with open('test.txt', 'a') as file:
            #    [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred]

            # Clip boxes to image bounds
            clip_coords(pred, transformed_shape)
            # Append to pycocotools JSON dictionary
            # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
            image_id = int(targets[si]['image_id'])
            box = pred[:, :4].clone()  # xyxy
            # scale_coords(transformed_shape, box, shapes[si][0], shapes[si][1])  # to original shape
            # box = xyxy2xywh(box)  # xywh
            # box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
            for di, d in enumerate(pred):
                box_T = [floatn(x, 3) for x in box[di]]
                label = coco91class[int(d[5])]
                score = floatn(d[4], 5)
                prediction['boxes'].append(box_T)
                prediction['labels'].append(label)
                prediction['scores'].append(score)
            prediction['boxes'] = torch.tensor(prediction['boxes'])
            prediction['labels'] = torch.tensor(prediction['labels'])
            prediction['scores'] = torch.tensor(prediction['scores'])
            predictions.append(prediction)

        outputs = transform.postprocess(predictions,
                                        transformed_img[0].image_sizes,
                                        original_image_sizes)

        outputs = [{k: v.to(cpu_device)
                    for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {
            target["image_id"].item(): output
            for target, output in zip(targets, outputs)
        }
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time,
                             evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
Example #17
    def __init__(
            self,
            backbone,
            num_classes=None,
            # transform parameters
            min_size=800,
            max_size=800,
            image_mean=None,
            image_std=None,
            # RPN parameters
            anchor_generator=None,
            # Box parameters
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.15,
            box_bg_iou_thresh=0.15,
            box_batch_size_per_image=50,
            box_positive_fraction=0.5,
            bbox_reg_weights=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    "num_classes should be None when box_predictor is specified"
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    "num_classes should not be None when box_predictor "
                    "is not specified")

        out_channels = backbone.out_channels

        if anchor_generator is None:
            # anchor size per every feature map level
            anchor_sizes = ((16, ), (32, ), (64, ), (128, ), (210, ), (320, ))
            aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

        num_anchors_per_location = anchor_generator.num_anchors_per_location(
        )[0]
        ssd_predictor = SSDPredictor(out_channels, num_classes,
                                     num_anchors_per_location)

        ssd_head = SSDHead(
            # Box
            ssd_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights)
        bbox_reg_weights = ssd_head.bbox_reg_weight
        detection_filter = DetectionNmsPostprocessor(box_score_thresh,
                                                     box_nms_thresh,
                                                     bbox_reg_weights,
                                                     box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(SSD, self).__init__(transform, backbone, anchor_generator,
                                  ssd_head, detection_filter, num_classes)
Example #18
    def __init__(self, cfg):
        super(SeqNet, self).__init__()

        backbone, box_head = build_resnet(name="resnet50", pretrained=True)

        anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ),
                                           aspect_ratios=((0.5, 1.0, 2.0), ))
        head = RPNHead(
            in_channels=backbone.out_channels,
            num_anchors=anchor_generator.num_anchors_per_location()[0],
        )
        pre_nms_top_n = dict(training=cfg.MODEL.RPN.PRE_NMS_TOPN_TRAIN,
                             testing=cfg.MODEL.RPN.PRE_NMS_TOPN_TEST)
        post_nms_top_n = dict(training=cfg.MODEL.RPN.POST_NMS_TOPN_TRAIN,
                              testing=cfg.MODEL.RPN.POST_NMS_TOPN_TEST)
        rpn = RegionProposalNetwork(
            anchor_generator=anchor_generator,
            head=head,
            fg_iou_thresh=cfg.MODEL.RPN.POS_THRESH_TRAIN,
            bg_iou_thresh=cfg.MODEL.RPN.NEG_THRESH_TRAIN,
            batch_size_per_image=cfg.MODEL.RPN.BATCH_SIZE_TRAIN,
            positive_fraction=cfg.MODEL.RPN.POS_FRAC_TRAIN,
            pre_nms_top_n=pre_nms_top_n,
            post_nms_top_n=post_nms_top_n,
            nms_thresh=cfg.MODEL.RPN.NMS_THRESH,
        )

        faster_rcnn_predictor = FastRCNNPredictor(2048, 2)
        reid_head = deepcopy(box_head)
        box_roi_pool = MultiScaleRoIAlign(featmap_names=["feat_res4"],
                                          output_size=14,
                                          sampling_ratio=2)
        box_predictor = BBoxRegressor(2048,
                                      num_classes=2,
                                      bn_neck=cfg.MODEL.ROI_HEAD.BN_NECK)
        roi_heads = SeqRoIHeads(
            # OIM
            num_pids=cfg.MODEL.LOSS.LUT_SIZE,
            num_cq_size=cfg.MODEL.LOSS.CQ_SIZE,
            oim_momentum=cfg.MODEL.LOSS.OIM_MOMENTUM,
            oim_scalar=cfg.MODEL.LOSS.OIM_SCALAR,
            # SeqNet
            faster_rcnn_predictor=faster_rcnn_predictor,
            reid_head=reid_head,
            # parent class
            box_roi_pool=box_roi_pool,
            box_head=box_head,
            box_predictor=box_predictor,
            fg_iou_thresh=cfg.MODEL.ROI_HEAD.POS_THRESH_TRAIN,
            bg_iou_thresh=cfg.MODEL.ROI_HEAD.NEG_THRESH_TRAIN,
            batch_size_per_image=cfg.MODEL.ROI_HEAD.BATCH_SIZE_TRAIN,
            positive_fraction=cfg.MODEL.ROI_HEAD.POS_FRAC_TRAIN,
            bbox_reg_weights=None,
            score_thresh=cfg.MODEL.ROI_HEAD.SCORE_THRESH_TEST,
            nms_thresh=cfg.MODEL.ROI_HEAD.NMS_THRESH_TEST,
            detections_per_img=cfg.MODEL.ROI_HEAD.DETECTIONS_PER_IMAGE_TEST,
        )

        transform = GeneralizedRCNNTransform(
            min_size=cfg.INPUT.MIN_SIZE,
            max_size=cfg.INPUT.MAX_SIZE,
            image_mean=[0.485, 0.456, 0.406],
            image_std=[0.229, 0.224, 0.225],
        )

        self.backbone = backbone
        self.rpn = rpn
        self.roi_heads = roi_heads
        self.transform = transform

        # loss weights
        self.lw_rpn_reg = cfg.SOLVER.LW_RPN_REG
        self.lw_rpn_cls = cfg.SOLVER.LW_RPN_CLS
        self.lw_proposal_reg = cfg.SOLVER.LW_PROPOSAL_REG
        self.lw_proposal_cls = cfg.SOLVER.LW_PROPOSAL_CLS
        self.lw_box_reg = cfg.SOLVER.LW_BOX_REG
        self.lw_box_cls = cfg.SOLVER.LW_BOX_CLS
        self.lw_box_reid = cfg.SOLVER.LW_BOX_REID
Example #19
    def __init__(
            self,
            num_classes=2,
            # transform parameters
            backbone_name='resnet50',
            min_size=256,
            max_size=512,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            rpn_score_thresh=0.0,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None,
            # Ellipse regressor
            ellipse_roi_pool=None,
            ellipse_head=None,
            ellipse_predictor=None,
            ellipse_loss_metric="gaussian-angle"):

        backbone = resnet_fpn_backbone(backbone_name,
                                       pretrained=True,
                                       trainable_layers=5)

        # Input image is grayscale -> in_channels = 1 instead of 3 (COCO)
        backbone.body.conv1 = Conv2d(1,
                                     64,
                                     kernel_size=(7, 7),
                                     stride=(2, 2),
                                     padding=(3, 3),
                                     bias=False)

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    "num_classes should be None when box_predictor is specified"
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    "num_classes should not be None when box_predictor "
                    "is not specified")

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((32, ), (64, ), (128, ), (256, ), (512, ))
            aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(rpn_anchor_generator,
                                    rpn_head,
                                    rpn_fg_iou_thresh,
                                    rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction,
                                    rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n,
                                    rpn_nms_thresh,
                                    score_thresh=rpn_score_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=7,
                sampling_ratio=2)

        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)

        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        if ellipse_roi_pool is None:
            ellipse_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=7,
                sampling_ratio=2)

        if ellipse_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            ellipse_head = TwoMLPHead(out_channels * resolution**2,
                                      representation_size)

        if ellipse_predictor is None:
            representation_size = 1024
            ellipse_predictor = EllipseRegressor(representation_size,
                                                 num_classes)

        roi_heads = EllipseRoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            # Ellipse
            ellipse_roi_pool=ellipse_roi_pool,
            ellipse_head=ellipse_head,
            ellipse_predictor=ellipse_predictor,
            ellipse_loss_metric=ellipse_loss_metric)

        if image_mean is None:
            image_mean = [0.156]
        if image_std is None:
            image_std = [0.272]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super().__init__(backbone, rpn, roi_heads, transform)
Example #20
class RetinaNet(nn.Module):
    """
    Implements RetinaNet.

    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
    image, and should be in 0-1 range. Different images can have different sizes.

    The behavior of the model changes depending on whether it is in training or evaluation mode.

    During training, the model expects both the input tensors and a targets argument (a list of dictionaries),
    containing:
        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the class label for each ground-truth box

    The model returns a Dict[Tensor] during training, containing the classification and regression
    losses.

    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
    follows:
        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the predicted labels for each image
        - scores (Tensor[N]): the scores for each prediction

    Args:
        backbone (nn.Module): the network used to compute the features for the model.
            It should contain an out_channels attribute, which indicates the number of output
            channels that each feature map has (and it should be the same for all feature maps).
            The backbone should return a single Tensor or an OrderedDict[Tensor].
        num_classes (int): number of output classes of the model (including the background).
        min_size (int): minimum size of the image to be rescaled before feeding it to the backbone
        max_size (int): maximum size of the image to be rescaled before feeding it to the backbone
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone has been trained.
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone has been trained.
        anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
            maps.
        head (nn.Module): Module run on top of the feature pyramid.
            Defaults to a module containing a classification and regression module.
        score_thresh (float): Score threshold used for postprocessing the detections.
        nms_thresh (float): NMS threshold used for postprocessing the detections.
        detections_per_img (int): Number of best detections to keep after NMS.
        fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
            considered as positive during training.
        bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
            considered as negative during training.
        topk_candidates (int): Number of best detections to keep before NMS.

    Example:

        >>> import torch
        >>> import torchvision
        >>> from torchvision.models.detection import RetinaNet
        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
        >>> # load a pre-trained model for classification and return
        >>> # only the features
        >>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
        >>> # RetinaNet needs to know the number of
        >>> # output channels in a backbone. For mobilenet_v2, it's 1280
        >>> # so we need to add it here
        >>> backbone.out_channels = 1280
        >>>
        >>> # let's make the network generate 5 x 3 anchors per spatial
        >>> # location, with 5 different sizes and 3 different aspect
        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
        >>> # map could potentially have different sizes and
        >>> # aspect ratios
        >>> anchor_generator = AnchorGenerator(
        >>>     sizes=((32, 64, 128, 256, 512),),
        >>>     aspect_ratios=((0.5, 1.0, 2.0),)
        >>> )
        >>>
        >>> # put the pieces together inside a RetinaNet model
        >>> model = RetinaNet(backbone,
        >>>                   num_classes=2,
        >>>                   anchor_generator=anchor_generator)
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
    """
    __annotations__ = {
        'box_coder': det_utils.BoxCoder,
        'proposal_matcher': det_utils.Matcher,
    }

    def __init__(
            self,
            backbone,
            num_classes,
            # transform parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # Anchor parameters
            anchor_generator=None,
            head=None,
            proposal_matcher=None,
            score_thresh=0.05,
            nms_thresh=0.5,
            detections_per_img=300,
            fg_iou_thresh=0.5,
            bg_iou_thresh=0.4,
            topk_candidates=1000):
        super().__init__()

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")
        self.backbone = backbone

        assert isinstance(anchor_generator, (AnchorGenerator, type(None)))

        if anchor_generator is None:
            anchor_sizes = tuple(
                (x, int(x * 2**(1.0 / 3)), int(x * 2**(2.0 / 3)))
                for x in [32, 64, 128, 256, 512])
            aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        self.anchor_generator = anchor_generator

        if head is None:
            head = RetinaNetHead(
                backbone.out_channels,
                anchor_generator.num_anchors_per_location()[0], num_classes)
        self.head = head

        if proposal_matcher is None:
            proposal_matcher = det_utils.Matcher(
                fg_iou_thresh,
                bg_iou_thresh,
                allow_low_quality_matches=True,
            )
        self.proposal_matcher = proposal_matcher

        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                                  image_mean, image_std)

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img
        self.topk_candidates = topk_candidates

        # used only in torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(self, losses, detections):
        # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
        if self.training:
            return losses

        return detections

    def compute_loss(self, targets, head_outputs, anchors):
        # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor]) -> Dict[str, Tensor]
        matched_idxs = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            if targets_per_image['boxes'].numel() == 0:
                matched_idxs.append(
                    torch.full((anchors_per_image.size(0), ),
                               -1,
                               dtype=torch.int64,
                               device=anchors_per_image.device))
                continue

            match_quality_matrix = box_ops.box_iou(targets_per_image['boxes'],
                                                   anchors_per_image)
            matched_idxs.append(self.proposal_matcher(match_quality_matrix))

        return self.head.compute_loss(targets, head_outputs, anchors,
                                      matched_idxs)

    def postprocess_detections(self, head_outputs, anchors, image_shapes):
        # type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
        class_logits = head_outputs['cls_logits']
        box_regression = head_outputs['bbox_regression']

        num_images = len(image_shapes)

        detections: List[Dict[str, Tensor]] = []

        for index in range(num_images):
            box_regression_per_image = [br[index] for br in box_regression]
            logits_per_image = [cl[index] for cl in class_logits]
            anchors_per_image, image_shape = anchors[index], image_shapes[
                index]

            image_boxes = []
            image_scores = []
            image_labels = []

            for box_regression_per_level, logits_per_level, anchors_per_level in \
                    zip(box_regression_per_image, logits_per_image, anchors_per_image):
                num_classes = logits_per_level.shape[-1]

                # remove low scoring boxes
                scores_per_level = torch.sigmoid(logits_per_level).flatten()
                keep_idxs = scores_per_level > self.score_thresh
                scores_per_level = scores_per_level[keep_idxs]
                topk_idxs = torch.where(keep_idxs)[0]

                # keep only topk scoring predictions
                num_topk = min(self.topk_candidates, topk_idxs.size(0))
                scores_per_level, idxs = scores_per_level.topk(num_topk)
                topk_idxs = topk_idxs[idxs]

                anchor_idxs = torch.div(topk_idxs,
                                        num_classes,
                                        rounding_mode='floor')
                labels_per_level = topk_idxs % num_classes

                boxes_per_level = self.box_coder.decode_single(
                    box_regression_per_level[anchor_idxs],
                    anchors_per_level[anchor_idxs])
                boxes_per_level = box_ops.clip_boxes_to_image(
                    boxes_per_level, image_shape)

                image_boxes.append(boxes_per_level)
                image_scores.append(scores_per_level)
                image_labels.append(labels_per_level)

            image_boxes = torch.cat(image_boxes, dim=0)
            image_scores = torch.cat(image_scores, dim=0)
            image_labels = torch.cat(image_labels, dim=0)

            # non-maximum suppression
            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels,
                                       self.nms_thresh)
            keep = keep[:self.detections_per_img]

            detections.append({
                'boxes': image_boxes[keep],
                'scores': image_scores[keep],
                'labels': image_labels[keep],
            })

        return detections

    def forward(self, images, targets=None):
        # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
        """
        Args:
            images (list[Tensor]): images to be processed
            targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns a list[BoxList] that contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).

        """
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")

        if self.training:
            assert targets is not None
            for target in targets:
                boxes = target["boxes"]
                if isinstance(boxes, torch.Tensor):
                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
                        raise ValueError("Expected target boxes to be a tensor"
                                         "of shape [N, 4], got {:}.".format(
                                             boxes.shape))
                else:
                    raise ValueError("Expected target boxes to be of type "
                                     "Tensor, got {:}.".format(type(boxes)))

        # get the original image sizes
        original_image_sizes: List[Tuple[int, int]] = []
        for img in images:
            val = img.shape[-2:]
            assert len(val) == 2
            original_image_sizes.append((val[0], val[1]))

        # transform the input
        images, targets = self.transform(images, targets)

        # Check for degenerate boxes
        # TODO: Move this to a function
        if targets is not None:
            for target_idx, target in enumerate(targets):
                boxes = target["boxes"]
                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
                if degenerate_boxes.any():
                    # print the first degenerate box
                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                    degen_bb: List[float] = boxes[bb_idx].tolist()
                    raise ValueError(
                        "All bounding boxes should have positive height and width."
                        " Found invalid box {} for target at index {}.".format(
                            degen_bb, target_idx))

        # get the features from the backbone
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            features = OrderedDict([('0', features)])

        # TODO: Do we want a list or a dict?
        features = list(features.values())
        for idx in range(len(features)):
            features[idx] = features[idx].to(torch.float32)

        # compute the retinanet heads outputs using the features
        head_outputs = self.head(features)

        for key in head_outputs:
            head_outputs[key] = head_outputs[key].to(torch.float32)

        # create the set of anchors
        anchors = self.anchor_generator(images, features)

        losses = {}
        detections: List[Dict[str, Tensor]] = []
        if self.training:
            assert targets is not None

            # compute the losses
            losses = self.compute_loss(targets, head_outputs, anchors)
        else:
            # recover level sizes
            num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
            HW = 0
            for v in num_anchors_per_level:
                HW += v
            HWA = head_outputs['cls_logits'].size(1)
            A = HWA // HW
            num_anchors_per_level = [hw * A for hw in num_anchors_per_level]

            # split outputs per level
            split_head_outputs: Dict[str, List[Tensor]] = {}
            for k in head_outputs:
                split_head_outputs[k] = list(head_outputs[k].split(
                    num_anchors_per_level, dim=1))
            split_anchors = [
                list(a.split(num_anchors_per_level)) for a in anchors
            ]

            # compute the detections
            detections = self.postprocess_detections(split_head_outputs,
                                                     split_anchors,
                                                     images.image_sizes)
            detections = self.transform.postprocess(detections,
                                                    images.image_sizes,
                                                    original_image_sizes)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn(
                    "RetinaNet always returns a (Losses, Detections) tuple in scripting"
                )
                self._has_warned = True
            return losses, detections
        return self.eager_outputs(losses, detections)
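The docstring's example only covers inference. Below is a minimal training-mode sketch, using torchvision's stock retinanet_resnet50_fpn rather than the class defined above, with invented boxes and labels, to illustrate the targets format the docstring describes.

import torch
from torchvision.models.detection import retinanet_resnet50_fpn

model = retinanet_resnet50_fpn(pretrained=False, pretrained_backbone=False,
                               num_classes=3)
model.train()

images = [torch.rand(3, 300, 400), torch.rand(3, 400, 300)]
targets = [
    {"boxes": torch.tensor([[20.0, 30.0, 120.0, 200.0]]),
     "labels": torch.tensor([1], dtype=torch.int64)},
    {"boxes": torch.tensor([[10.0, 10.0, 80.0, 90.0]]),
     "labels": torch.tensor([2], dtype=torch.int64)},
]

losses = model(images, targets)
print(losses.keys())  # typically 'classification' and 'bbox_regression'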
from torchvision.datasets.coco import CocoDetection
from torchvision.models.detection.transform import GeneralizedRCNNTransform
from torchvision.transforms import transforms
from pdetection import transform
from torch.utils.data.dataloader import DataLoader
import torch

from pdetection import utils

path_data = '/home/peng/Documents/srch/Object Detection/dataset/coco/val2017'
path_anno = '/home/peng/Documents/srch/Object Detection/dataset/coco/annotations_trainval2017/annotations/instances_val2017.json'

coco_dset = CocoDetection(root=path_data, annFile=path_anno)
trans = transform.ODTransformer(800, 1333, [0.485, 0.456, 0.406],
                                [0.229, 0.224, 0.225])
trans_compare = GeneralizedRCNNTransform(800, 1333, [0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
# trans = transform.ODTransformer(800, 1333, [0.4, 0.5, 0.6], [0.5, 1, 2])

cocoloader = DataLoader(coco_dset, batch_size=3, collate_fn=utils.collate_fn)
for data in cocoloader:
    images, targets = data
    inputimgs = []
    totensor = transforms.ToTensor()
    for img in images:
        inputimgs.append(totensor(img))
    src_img_sizes = [img.shape[-2:] for img in inputimgs]
    # print([img.shape[-2:] for img in inputimgs])
    targets1 = utils.totargets(targets)
    targets2 = utils.totargets(targets)
    print(targets2[2]['boxes'])
    inputimgs_bk = [torch.zeros_like(img).copy_(img) for img in inputimgs]
Example #22
    def __init__(
            self,
            backbone,
            num_classes=None,
            num_pids=5532,
            num_cq_size=5000,
            # transform parameters
            min_size=900,
            max_size=1500,
            image_mean=None,
            image_std=None,
            # Anchor settings:
            anchor_scales=None,
            anchor_ratios=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=12000,
            rpn_pre_nms_top_n_test=6000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=300,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            feat_head=None,
            box_predictor=None,
            box_score_thresh=0.0,
            box_nms_thresh=0.4,
            box_detections_per_img=300,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.1,
            box_batch_size_per_image=128,
            box_positive_fraction=0.5,
            bbox_reg_weights=None,
            # ReID parameters
            embedding_head=None,
            reid_loss=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                'backbone should contain an attribute out_channels '
                'specifying the number of output channels (assumed to be the '
                'same for all the levels)')

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    'num_classes should be None when box_predictor is specified'
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    'num_classes should not be None when box_predictor '
                    'is not specified')

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            if anchor_scales is None:
                anchor_scales = ((32, 64, 128, 256, 512), )
            if anchor_ratios is None:
                anchor_ratios = ((0.5, 1.0, 2.0), )
            rpn_anchor_generator = AnchorGenerator(anchor_scales,
                                                   anchor_ratios)

        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = self._set_rpn(rpn_anchor_generator, rpn_head, rpn_fg_iou_thresh,
                            rpn_bg_iou_thresh, rpn_batch_size_per_image,
                            rpn_positive_fraction, rpn_pre_nms_top_n,
                            rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat_res4'],
                                              output_size=14,
                                              sampling_ratio=2)

        if feat_head is None:
            raise ValueError('feat_head should be specified manually.')
            # resolution = box_roi_pool.output_size[0]
            # representation_size = 2048
            # # ConvHead should be part of the backbone
            # # feat_head = TwoMLPHead(
            # #     out_channels * resolution ** 2,
            # #     representation_size)

        if box_predictor is None:
            box_predictor = CoordRegressor(2048, num_classes)

        if embedding_head is None:
            embedding_head = ReIDEmbeddingProj(
                featmap_names=['feat_res4', 'feat_res5'],
                in_channels=[1024, 2048],
                dim=256)

        if reid_loss is None:
            reid_loss = HOIMLoss(256, num_pids, num_cq_size, 0.5, 30.0)

        roi_heads = self._set_roi_heads(
            embedding_head, reid_loss, box_roi_pool, feat_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image,
            box_positive_fraction, bbox_reg_weights, box_score_thresh,
            box_nms_thresh, box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(FasterRCNN_HOIM, self).__init__(backbone, rpn, roi_heads,
                                              transform)
Example #23
    def __init__(
            self,
            backbone,
            n_channel_backbone=5,
            num_classes=None,
            # transform parameters
            min_size=800,
            max_size=1333,
            #min_size=720, max_size=1280,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.5,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            anchor_sizes=[32, 64, 128, 256, 512],
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.4,
            box_detections_per_img=30,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None,
            weight_loss=False,
            use_soft_nms=False,
            use_context=False,
            use_track_branch=False):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    "num_classes should be None when box_predictor is specified"
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    "num_classes should not be None when box_predictor "
                    "is not specified")

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            ratios = ((0.5, 1.0, 2.0), )
            aspect_ratios = ratios * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction, rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n, rpn_nms_thresh,
                                    weight_loss)

        if box_roi_pool is None:

            box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                              output_size=7,
                                              sampling_ratio=2)

            if n_channel_backbone == 6:

                box_roi_pool = MultiScaleRoIAlign(
                    featmap_names=[0, 1, 2, 3, 4],
                    output_size=7,
                    sampling_ratio=2)

        representation_size1 = 1024
        representation_size2 = 1024
        track_embedding_size = 1024

        # Pooled feature resolution is needed both by the box head and by the
        # optional track-embedding head below, so compute it once up front.
        resolution = box_roi_pool.output_size[0]

        if box_head is None:
            if use_context:
                box_head = TwoMLPHead(2 * out_channels * resolution**2,
                                      representation_size1,
                                      representation_size2)
            else:
                box_head = TwoMLPHead(out_channels * resolution**2,
                                      representation_size1,
                                      representation_size2)

        if use_track_branch:
            if use_context:
                track_embedding = TwoMLPHead(2 * out_channels * resolution**2,
                                             representation_size1,
                                             track_embedding_size)
            else:
                track_embedding = TwoMLPHead(out_channels * resolution**2,
                                             representation_size1,
                                             track_embedding_size)
        else:
            track_embedding = None

        if box_predictor is None:
            box_predictor = FastRCNNPredictor(representation_size1,
                                              num_classes)

        # num_classes is None when a box_predictor is supplied, so guard the check.
        if num_classes is not None and num_classes > 2:
            use_soft_nms = False

        roi_heads = RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            weight_loss=weight_loss,
            use_soft_nms=use_soft_nms,
            use_context=use_context)

        if use_track_branch:
            track_heads = TrackHeads(box_roi_pool,
                                     box_head,
                                     box_predictor,
                                     box_fg_iou_thresh,
                                     box_bg_iou_thresh,
                                     box_batch_size_per_image,
                                     box_positive_fraction,
                                     bbox_reg_weights,
                                     weight_loss=False,
                                     use_context=False,
                                     track_embedding=track_embedding)
        else:
            track_heads = None

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, track_heads,
                                         transform, n_channel_backbone)
Exemple #25
                    default=8,
                    help='Index to the dataset for an example')
parser.add_argument('--outdir',
                    type=str,
                    default='examples_detection',
                    help='Folder for output images')

if __name__ == '__main__':
    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    root = args.root
    annfile = args.annfile

    # Load a maskRCNN finetuned on our birds
    network_transform = GeneralizedRCNNTransform(800, 1333, (0, 0, 0),
                                                 (1, 1, 1))
    backbone = resnet_fpn_backbone(backbone_name='resnet101', pretrained=False)
    model = MaskRCNN(backbone, num_classes=2)
    model.transform = network_transform
    model.eval()
    model.load_state_dict(torch.load('models/detector.pth'))
    model.to(device)

    # Load a data split
    normalize = T.Normalize(mean=[102.9801, 115.9465, 122.7717],
                            std=[1., 1., 1.])
    coco = COCO(annfile)

    # Load an image example
    available_Ids = coco.getImgIds()
    imgfile = coco.loadImgs(available_Ids[args.index])[0]['file_name']
Exemple #26
    def __init__(
            self,
            backbone,
            num_classes=None,
            # transform parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    "num_classes should be None when box_predictor is specified"
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    "num_classes should not be None when box_predictor "
                    "is not specified")

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((32, ), (64, ), (128, ), (256, ), (512, ))
            aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction, rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=7,
                sampling_ratio=2)

        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)

        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        roi_heads = RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
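
A minimal construction sketch for the class above, assuming it is importable together with torchvision's resnet_fpn_backbone helper; the backbone choice and the class count are placeholders, not values taken from the snippet.

import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

# The FPN backbone exposes out_channels (256), which is all this constructor
# requires from it; every other argument falls back to the defaults above.
backbone = resnet_fpn_backbone(backbone_name='resnet50', pretrained=False)
model = FasterRCNN(backbone, num_classes=91)
model.eval()

images = [torch.rand(3, 600, 800)]
with torch.no_grad():
    detections = model(images)  # in eval mode this returns per-image detection dicts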
Exemple #27
import torch
import torchvision
from torch.jit.annotations import Tuple, List, Dict, Optional
import numpy as np
import cv2
import dataset

images, targets = dataset.load_data()
to_tensor = torchvision.transforms.ToTensor()
images = [to_tensor(image) for image in images]
targets = [{
    'boxes': item['boxes'],
    'labels': item['labels']
} for item in targets]

min_size = [800, 820, 900]
max_size = 1333
from torchvision.models.detection.transform import GeneralizedRCNNTransform

## transform
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]
transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
transform.train()
print(transform.training)
images, targets = transform(images, targets)
print(images.tensors.shape)
print(images.image_sizes)
print(len(targets))
print(targets[0])
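
Going the other way, the same transform object can map boxes predicted on the resized batch back to the original image coordinates through its postprocess method. A minimal sketch continuing from the variables above; the box tensors and the original sizes are placeholders, and note that postprocess only rescales when the transform is in eval mode.

transform.eval()  # postprocess is a no-op while the transform is in training mode

# Placeholder "detections", one dict per image in the transformed batch.
fake_detections = [{'boxes': torch.rand(5, 4) * 100} for _ in images.image_sizes]

# Original (height, width) of each input image before resizing; hypothetical
# values here, normally these are recorded before calling the transform.
original_sizes = [(1080, 1920) for _ in images.image_sizes]

restored = transform.postprocess(fake_detections, images.image_sizes, original_sizes)
print(restored[0]['boxes'])  # boxes rescaled back to the original image resolution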
Exemple #28
    def __init__(
            self,
            backbone,
            num_classes=None,
            # transform parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.3,
            box_detections_per_img=128,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=64,
            box_positive_fraction=0.25,
            bbox_reg_weights=None):

        print("Using modified Faster RCNN....")
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    "num_classes should be None when box_predictor is specified"
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    "num_classes should not be None when box_predictor "
                    "is not specified")

        out_channels = backbone.out_channels

        # Note: box_roi_pool defaults to None and is never constructed here, so a
        # MultiScaleRoIAlign (or compatible pooler) must be supplied by the caller.
        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)

        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        rpn = None

        roi_heads = RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]

        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super().__init__(backbone, rpn, roi_heads, transform)
Exemple #29
import pickle
import random
import argparse
import numpy as np
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim

from models import get_model
from datasets import get_loader

from helper import compute_ts_road_map, compute_ats_bounding_boxes
from torchvision.models.detection.transform import GeneralizedRCNNTransform
target_transform = GeneralizedRCNNTransform(800, 800, [0., 0., 0.],
                                            [1., 1., 1.])


def get_mask_ts(mask, target):
    # Binarize the predicted road-map logits, then reuse the detection transform
    # above to resize the mask so it matches the ground-truth mask before scoring.
    mask = nn.Sigmoid()(mask) > 0.5

    temp_tensor = torch.zeros(1, 3, 400, 400)
    temp_target = [{'masks': mask, 'boxes': torch.tensor([[1., 1., 1., 1.]])}]
    _, temp_target = target_transform(temp_tensor, temp_target)
    predicted_road_map = temp_target[0]['masks'][0, :1]

    ts_road_map = compute_ts_road_map(predicted_road_map, target[0]['masks'])
    return ts_road_map


def get_detection_ts(detection, target):
Exemple #30
class YOLO(nn.Module):
    def __init__(
        self,
        backbone: nn.Module,
        num_classes: int,
        anchor_grids: List[List[int]],
        # transform parameters
        min_size: int = 320,
        max_size: int = 416,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        # Anchor parameters
        anchor_generator: Optional[nn.Module] = None,
        head: Optional[nn.Module] = None,
        # Training parameter
        compute_loss: Optional[nn.Module] = None,
        fg_iou_thresh: float = 0.5,
        bg_iou_thresh: float = 0.4,
        # Post Process parameter
        postprocess_detections: Optional[nn.Module] = None,
        score_thresh: float = 0.05,
        nms_thresh: float = 0.5,
        detections_per_img: int = 300,
    ):
        super().__init__()
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")
        self.backbone = backbone

        if anchor_generator is None:
            strides: List[int] = [8, 16, 32]
            anchor_generator = AnchorGenerator(strides, anchor_grids)
        self.anchor_generator = anchor_generator

        if compute_loss is None:
            compute_loss = SetCriterion(
                weights=(1.0, 1.0, 1.0, 1.0),
                fg_iou_thresh=fg_iou_thresh,
                bg_iou_thresh=bg_iou_thresh,
            )
        self.compute_loss = compute_loss

        if head is None:
            head = YoloHead(
                backbone.out_channels,
                anchor_generator.num_anchors,
                num_classes,
            )
        self.head = head

        if image_mean is None:
            image_mean = [0., 0., 0.]
        if image_std is None:
            image_std = [1., 1., 1.]

        self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

        if postprocess_detections is None:
            postprocess_detections = PostProcess(score_thresh, nms_thresh, detections_per_img)
        self.postprocess_detections = postprocess_detections

        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(
        self,
        losses: Dict[str, Tensor],
        detections: List[Dict[str, Tensor]],
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        if self.training:
            return losses

        return detections

    def forward(
        self,
        images: List[Tensor],
        targets: Optional[List[Dict[str, Tensor]]] = None,
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        """
        Arguments:
            images (list[Tensor]): images to be processed
            targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During Training, it returns a dict[Tensor] which contains the losses
                TODO, currently this repo doesn't support training.
                During Testing, it returns list[BoxList] contains additional fields
                like `scores` and `labels`.
        """
        # get the original image sizes
        original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])
        for img in images:
            val = img.shape[-2:]
            assert len(val) == 2
            original_image_sizes.append((val[0], val[1]))

        # transform the input
        images, targets = self.transform(images, targets)

        # get the features from the backbone
        features = self.backbone(images.tensors)

        # compute the yolo heads outputs using the features
        head_outputs = self.head(features)

        # create the set of anchors
        anchors_tuple = self.anchor_generator(features)
        losses = {}
        detections = torch.jit.annotate(List[Dict[str, Tensor]], [])

        if self.training:
            assert targets is not None

            # compute the losses
            losses = self.compute_loss(targets, head_outputs, anchors_tuple[0])
        else:
            # compute the detections
            detections = self.postprocess_detections(head_outputs, anchors_tuple, images.image_sizes)
            detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn("YOLO always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        else:
            return self.eager_outputs(losses, detections)
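
A hedged inference sketch for the class above: build_backbone() is a hypothetical helper standing in for whatever feature extractor the surrounding repo provides (it only needs to expose out_channels), and the anchor_grids values are the usual YOLOv5 anchors rather than anything confirmed by this snippet.

import torch

anchor_grids = [
    [10, 13, 16, 30, 33, 23],         # anchors for stride 8
    [30, 61, 62, 45, 59, 119],        # anchors for stride 16
    [116, 90, 156, 198, 373, 326],    # anchors for stride 32
]

model = YOLO(build_backbone(), num_classes=80, anchor_grids=anchor_grids)
model.eval()

images = [torch.rand(3, 416, 416)]
with torch.no_grad():
    detections = model(images)  # eval mode: eager_outputs returns the detection list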