def test_compat_runner_args():
    cfg = ConfigDict(dict(total_epochs=12))
    with pytest.warns(None) as record:
        cfg = compat_runner_args(cfg)
    assert len(record) == 1
    assert 'runner' in record.list[0].message.args[0]
    assert 'runner' in cfg
    assert cfg.runner.type == 'EpochBasedRunner'
    assert cfg.runner.max_epochs == cfg.total_epochs
Example #2
    def __init__(self,
                 backbone,
                 refiner=None,
                 train_cfg=None,
                 test_cfg=None,
                 norm_cfg=None,
                 pretrained=None):
        super().__init__()

        self.train_cfg = train_cfg if train_cfg is not None else ConfigDict()
        self.test_cfg = test_cfg if test_cfg is not None else ConfigDict()
        self.norm_cfg = norm_cfg if norm_cfg is not None else ConfigDict()
        self.backbone = build_backbone(backbone)
        # build refiner if it's not None.
        if refiner is None:
            self.train_cfg['train_refiner'] = False
            self.test_cfg['refine'] = False
        else:
            self.refiner = build_component(refiner)

        # if argument train_cfg is not None, validate if the config is proper.
        if train_cfg is not None:
            assert hasattr(self.train_cfg, 'train_refiner')
            assert hasattr(self.test_cfg, 'refine')
            if self.test_cfg.refine and not self.train_cfg.train_refiner:
                print_log(
                    'You are not training the refiner, but it is used for '
                    'model forwarding.', 'root', logging.WARNING)

            if not self.train_cfg.train_backbone:
                self.freeze_backbone()

        # validate if test config is proper
        if not hasattr(self.test_cfg, 'metrics'):
            raise KeyError('Missing key "metrics" in test_cfg')

        if mmcv.is_list_of(self.test_cfg.metrics, str):
            for metric in self.test_cfg.metrics:
                if metric not in self.allowed_metrics:
                    raise KeyError(f'metric {metric} is not supported')
        elif self.test_cfg.metrics is not None:
            raise TypeError('metrics must be None or a list of str')

        self.init_weights(pretrained)
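        # Config contract enforced above (for reference):
        #   train_cfg: must define `train_refiner` and `train_backbone`
        #   test_cfg:  must define `refine`, and `metrics` must be None or a
        #              list of names from `self.allowed_metrics`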
Example #3
def test_inference_detector():
    from mmcv import ConfigDict

    from mmdet.apis import inference_detector
    from mmdet.models import build_detector

    # small RetinaNet
    num_class = 3
    model_dict = dict(
        type='RetinaNet',
        backbone=dict(
            type='ResNet',
            depth=18,
            num_stages=4,
            out_indices=(3, ),
            norm_cfg=dict(type='BN', requires_grad=False),
            norm_eval=True,
            style='pytorch'),
        neck=None,
        bbox_head=dict(
            type='RetinaHead',
            num_classes=num_class,
            in_channels=512,
            stacked_convs=1,
            feat_channels=256,
            anchor_generator=dict(
                type='AnchorGenerator',
                octave_base_scale=4,
                scales_per_octave=3,
                ratios=[0.5],
                strides=[32]),
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[.0, .0, .0, .0],
                target_stds=[1.0, 1.0, 1.0, 1.0]),
        ),
        test_cfg=dict(
            nms_pre=1000,
            min_bbox_size=0,
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100))

    rng = np.random.RandomState(0)
    img1 = rng.rand(100, 100, 3)
    img2 = rng.rand(100, 100, 3)

    model = build_detector(ConfigDict(model_dict))
    config = _get_config_module('retinanet/retinanet_r50_fpn_1x_coco.py')
    model.cfg = config
    # test single image
    result = inference_detector(model, img1)
    assert len(result) == num_class
    # test multiple images
    result = inference_detector(model, [img1, img2])
    assert len(result) == 2 and len(result[0]) == num_class
Example #4
def test_centernet_head_get_bboxes():
    """Tests center head generating and decoding the heatmap."""
    s = 256
    img_metas = [{
        'img_shape': (s, s, 3),
        'scale_factor': np.array([1., 1., 1., 1.]),
        'pad_shape': (s, s, 3),
        'batch_input_shape': (s, s),
        'border': (0, 0, 0, 0),
        'flip': False
    }]
    test_cfg = ConfigDict(
        dict(topk=100, local_maximum_kernel=3, max_per_img=100))
    gt_bboxes = [
        torch.Tensor([[10, 20, 200, 240], [40, 50, 100, 200],
                      [10, 20, 100, 240]])
    ]
    gt_labels = [torch.LongTensor([1, 1, 2])]

    self = CenterNetHead(num_classes=4,
                         in_channel=1,
                         feat_channel=4,
                         test_cfg=test_cfg)
    self.feat_shape = (1, 1, s // 4, s // 4)
    targets, _ = self.get_targets(gt_bboxes, gt_labels, self.feat_shape,
                                  img_metas[0]['pad_shape'])
    center_target = targets['center_heatmap_target']
    wh_target = targets['wh_target']
    offset_target = targets['offset_target']
    # make sure the targets are assigned correctly
    for i in range(len(gt_bboxes[0])):
        bbox, label = gt_bboxes[0][i] / 4, gt_labels[0][i]
        ctx, cty = sum(bbox[0::2]) / 2, sum(bbox[1::2]) / 2
        int_ctx, int_cty = int(sum(bbox[0::2]) / 2), int(sum(bbox[1::2]) / 2)
        w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
        x_off = ctx - int(ctx)
        y_off = cty - int(cty)
        assert center_target[0, label, int_cty, int_ctx] == 1
        assert wh_target[0, 0, int_cty, int_ctx] == w
        assert wh_target[0, 1, int_cty, int_ctx] == h
        assert offset_target[0, 0, int_cty, int_ctx] == x_off
        assert offset_target[0, 1, int_cty, int_ctx] == y_off
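    # e.g. for the first gt box: [10, 20, 200, 240] / 4 -> [2.5, 5.0, 50.0,
    # 60.0], so ctx = 26.25, cty = 32.5, int_ctx = 26, int_cty = 32,
    # w = 47.5, h = 55.0, x_off = 0.25 and y_off = 0.5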
    # make sure get_bboxes is right
    detections = self.get_bboxes([center_target], [wh_target], [offset_target],
                                 img_metas,
                                 rescale=True,
                                 with_nms=False)
    out_bboxes = detections[0][0][:3]
    out_clses = detections[0][1][:3]
    for bbox, cls in zip(out_bboxes, out_clses):
        flag = False
        for gt_bbox, gt_cls in zip(gt_bboxes[0], gt_labels[0]):
            if (bbox[:4] == gt_bbox[:4]).all():
                flag = True
        assert flag, 'get_bboxes is wrong'
def test_compat_imgs_per_gpu():
    cfg = ConfigDict(
        dict(
            data=dict(
                imgs_per_gpu=1,
                samples_per_gpu=2,
                val=dict(),
                test=dict(),
                train=dict())))
    cfg = compat_imgs_per_gpu(cfg)
    assert cfg.data.samples_per_gpu == cfg.data.imgs_per_gpu
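# A minimal sketch of what a compatibility shim like `compat_imgs_per_gpu`
# could look like; the real mmdet implementation may differ in its warning
# text and edge-case handling.
import warnings


def _compat_imgs_per_gpu_sketch(cfg):
    if 'imgs_per_gpu' in cfg.data:
        warnings.warn(
            '"imgs_per_gpu" is deprecated; copying its value to '
            '"samples_per_gpu" so the two keys stay consistent.')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
    return cfg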
Example #6
def test_transformer_encoder_pixel_decoder():
    base_channels = 64
    pixel_decoder_cfg = ConfigDict(
        dict(type='TransformerEncoderPixelDecoder',
             in_channels=[base_channels * 2**i for i in range(4)],
             feat_channels=base_channels,
             out_channels=base_channels,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU'),
             encoder=dict(
                 type='DetrTransformerEncoder',
                 num_layers=6,
                 transformerlayers=dict(
                     type='BaseTransformerLayer',
                     attn_cfgs=dict(type='MultiheadAttention',
                                    embed_dims=base_channels,
                                    num_heads=8,
                                    attn_drop=0.1,
                                    proj_drop=0.1,
                                    dropout_layer=None,
                                    batch_first=False),
                     ffn_cfgs=dict(embed_dims=base_channels,
                                   feedforward_channels=base_channels * 8,
                                   num_fcs=2,
                                   act_cfg=dict(type='ReLU', inplace=True),
                                   ffn_drop=0.1,
                                   dropout_layer=None,
                                   add_identity=True),
                     operation_order=('self_attn', 'norm', 'ffn', 'norm'),
                     norm_cfg=dict(type='LN'),
                     init_cfg=None,
                     batch_first=False),
                 init_cfg=None),
             positional_encoding=dict(type='SinePositionalEncoding',
                                      num_feats=base_channels // 2,
                                      normalize=True)))
    self = build_plugin_layer(pixel_decoder_cfg)[1]
    img_metas = [{
        'batch_input_shape': (128, 160),
        'img_shape': (120, 160, 3),
    }, {
        'batch_input_shape': (128, 160),
        'img_shape': (125, 160, 3),
    }]
    feats = [
        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
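    # feats[i] shapes: (2, 64, 32, 40), (2, 128, 16, 20), (2, 256, 8, 10),
    # (2, 512, 4, 5), i.e. strides 4 to 32 w.r.t. the 128x160 batch input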
    mask_feature, memory = self(feats, img_metas)

    assert memory.shape[-2:] == feats[-1].shape[-2:]
    assert mask_feature.shape == feats[0].shape
Example #7
def compat_runner_args(cfg):
    if 'runner' not in cfg:
        cfg.runner = ConfigDict({
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        })
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs
    return cfg
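# Hedged usage sketch of compat_runner_args, mirroring
# test_compat_runner_args above: a legacy config that only sets
# `total_epochs` gains an `EpochBasedRunner` entry (and a UserWarning is
# emitted).
legacy_cfg = ConfigDict(dict(total_epochs=24))
migrated_cfg = compat_runner_args(legacy_cfg)
assert migrated_cfg.runner.type == 'EpochBasedRunner'
assert migrated_cfg.runner.max_epochs == 24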
Example #8
def test_msdeformattn_pixel_decoder():
    base_channels = 64
    pixel_decoder_cfg = ConfigDict(
        dict(type='MSDeformAttnPixelDecoder',
             in_channels=[base_channels * 2**i for i in range(4)],
             strides=[4, 8, 16, 32],
             feat_channels=base_channels,
             out_channels=base_channels,
             num_outs=3,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU'),
             encoder=dict(
                 type='DetrTransformerEncoder',
                 num_layers=6,
                 transformerlayers=dict(
                     type='BaseTransformerLayer',
                     attn_cfgs=dict(type='MultiScaleDeformableAttention',
                                    embed_dims=base_channels,
                                    num_heads=8,
                                    num_levels=3,
                                    num_points=4,
                                    im2col_step=64,
                                    dropout=0.0,
                                    batch_first=False,
                                    norm_cfg=None,
                                    init_cfg=None),
                     ffn_cfgs=dict(type='FFN',
                                   embed_dims=base_channels,
                                   feedforward_channels=base_channels * 4,
                                   num_fcs=2,
                                   ffn_drop=0.0,
                                   act_cfg=dict(type='ReLU', inplace=True)),
                     operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                 init_cfg=None),
             positional_encoding=dict(type='SinePositionalEncoding',
                                      num_feats=base_channels // 2,
                                      normalize=True),
             init_cfg=None))
    self = build_plugin_layer(pixel_decoder_cfg)[1]
    feats = [
        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    mask_feature, multi_scale_features = self(feats)

    assert mask_feature.shape == feats[0].shape
    assert len(multi_scale_features) == 3
    multi_scale_features = multi_scale_features[::-1]
    for i in range(3):
        assert multi_scale_features[i].shape[-2:] == feats[i + 1].shape[-2:]
def test_image_classifier_return_tuple():
    model_cfg = ConfigDict(type='ImageClassifier',
                           backbone=dict(type='ResNet_CIFAR',
                                         depth=50,
                                         num_stages=4,
                                         out_indices=(3, ),
                                         style='pytorch',
                                         return_tuple=False),
                           head=dict(type='LinearClsHead',
                                     num_classes=10,
                                     in_channels=2048,
                                     loss=dict(type='CrossEntropyLoss')))

    imgs = torch.randn(16, 3, 32, 32)

    model_cfg_ = deepcopy(model_cfg)
    with pytest.warns(DeprecationWarning):
        model = CLASSIFIERS.build(model_cfg_)

    # test backbone return tensor
    feat = model.extract_feat(imgs)
    assert isinstance(feat, torch.Tensor)

    # test backbone return tuple
    model_cfg_ = deepcopy(model_cfg)
    model_cfg_.backbone.return_tuple = True
    model = CLASSIFIERS.build(model_cfg_)

    feat = model.extract_feat(imgs)
    assert isinstance(feat, tuple)

    # test warning if backbone return tensor
    class ToyBackbone(BaseModule):
        def __init__(self):
            super().__init__()
            self.conv = torch.nn.Conv2d(3, 16, 3)

        def forward(self, x):
            return self.conv(x)

    model_cfg_ = deepcopy(model_cfg)
    model_cfg_.backbone.return_tuple = True
    model = CLASSIFIERS.build(model_cfg_)
    model.backbone = ToyBackbone()

    with pytest.warns(DeprecationWarning):
        model.extract_feat(imgs)
def test_maskformer_fusion_head():
    img_metas = [
        {
            'batch_input_shape': (128, 160),
            'img_shape': (126, 160, 3),
            'ori_shape': (63, 80, 3),
            'pad_shape': (128, 160, 3)
        },
    ]
    num_things_classes = 80
    num_stuff_classes = 53
    num_classes = num_things_classes + num_stuff_classes
    config = ConfigDict(type='MaskFormerFusionHead',
                        num_things_classes=num_things_classes,
                        num_stuff_classes=num_stuff_classes,
                        loss_panoptic=None,
                        test_cfg=dict(panoptic_on=True,
                                      semantic_on=False,
                                      instance_on=True,
                                      max_per_image=100,
                                      object_mask_thr=0.8,
                                      iou_thr=0.8,
                                      filter_low_score=False),
                        init_cfg=None)

    self = MaskFormerFusionHead(**config)

    # test forward_train
    assert self.forward_train() == dict()

    mask_cls_results = torch.rand((1, 100, num_classes + 1))
    mask_pred_results = torch.rand((1, 100, 128, 160))

    # test panoptic_postprocess and instance_postprocess
    results = self.simple_test(mask_cls_results, mask_pred_results, img_metas)
    assert 'ins_results' in results[0] and 'pan_results' in results[0]

    # test semantic_postprocess
    config.test_cfg.semantic_on = True
    with pytest.raises(AssertionError):
        self.simple_test(mask_cls_results, mask_pred_results, img_metas)

    with pytest.raises(NotImplementedError):
        self.semantic_postprocess(mask_cls_results, mask_pred_results)
Example #11
def test_pixel_decoder():
    base_channels = 64
    pixel_decoder_cfg = ConfigDict(
        dict(type='PixelDecoder',
             in_channels=[base_channels * 2**i for i in range(4)],
             feat_channels=base_channels,
             out_channels=base_channels,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU')))
    self = build_plugin_layer(pixel_decoder_cfg)[1]
    img_metas = [{}, {}]
    feats = [
        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    mask_feature, memory = self(feats, img_metas)

    assert (memory == feats[-1]).all()
    assert mask_feature.shape == feats[0].shape
Example #12
def test_maskformer_head_loss():
    """Tests head loss when truth is empty and non-empty."""
    base_channels = 64
    # batch_input_shape = (128, 160)
    img_metas = [{
        'batch_input_shape': (128, 160),
        'pad_shape': (128, 160, 3),
        'img_shape': (126, 160, 3),
        'ori_shape': (63, 80, 3)
    }, {
        'batch_input_shape': (128, 160),
        'pad_shape': (128, 160, 3),
        'img_shape': (120, 160, 3),
        'ori_shape': (60, 80, 3)
    }]
    feats = [
        torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    num_things_classes = 80
    num_stuff_classes = 53
    num_classes = num_things_classes + num_stuff_classes
    config = ConfigDict(
        dict(
            type='MaskFormerHead',
            in_channels=[base_channels * 2**i for i in range(4)],
            feat_channels=base_channels,
            out_channels=base_channels,
            num_things_classes=num_things_classes,
            num_stuff_classes=num_stuff_classes,
            num_queries=100,
            pixel_decoder=dict(
                type='TransformerEncoderPixelDecoder',
                norm_cfg=dict(type='GN', num_groups=32),
                act_cfg=dict(type='ReLU'),
                encoder=dict(
                    type='DetrTransformerEncoder',
                    num_layers=6,
                    transformerlayers=dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=dict(type='MultiheadAttention',
                                       embed_dims=base_channels,
                                       num_heads=8,
                                       attn_drop=0.1,
                                       proj_drop=0.1,
                                       dropout_layer=None,
                                       batch_first=False),
                        ffn_cfgs=dict(embed_dims=base_channels,
                                      feedforward_channels=base_channels * 8,
                                      num_fcs=2,
                                      act_cfg=dict(type='ReLU', inplace=True),
                                      ffn_drop=0.1,
                                      dropout_layer=None,
                                      add_identity=True),
                        operation_order=('self_attn', 'norm', 'ffn', 'norm'),
                        norm_cfg=dict(type='LN'),
                        init_cfg=None,
                        batch_first=False),
                    init_cfg=None),
                positional_encoding=dict(type='SinePositionalEncoding',
                                         num_feats=base_channels // 2,
                                         normalize=True)),
            enforce_decoder_input_project=False,
            positional_encoding=dict(type='SinePositionalEncoding',
                                     num_feats=base_channels // 2,
                                     normalize=True),
            transformer_decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                                   embed_dims=base_channels,
                                   num_heads=8,
                                   attn_drop=0.1,
                                   proj_drop=0.1,
                                   dropout_layer=None,
                                   batch_first=False),
                    ffn_cfgs=dict(embed_dims=base_channels,
                                  feedforward_channels=base_channels * 8,
                                  num_fcs=2,
                                  act_cfg=dict(type='ReLU', inplace=True),
                                  ffn_drop=0.1,
                                  dropout_layer=None,
                                  add_identity=True),
                    # the following parameter is not used;
                    # it is only kept to satisfy the current API
                    feedforward_channels=base_channels * 8,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
                init_cfg=None),
            loss_cls=dict(type='CrossEntropyLoss',
                          use_sigmoid=False,
                          loss_weight=1.0,
                          reduction='mean',
                          class_weight=[1.0] * num_classes + [0.1]),
            loss_mask=dict(type='FocalLoss',
                           use_sigmoid=True,
                           gamma=2.0,
                           alpha=0.25,
                           reduction='mean',
                           loss_weight=20.0),
            loss_dice=dict(type='DiceLoss',
                           use_sigmoid=True,
                           activate=True,
                           reduction='mean',
                           naive_dice=True,
                           eps=1.0,
                           loss_weight=1.0),
            train_cfg=dict(assigner=dict(type='MaskHungarianAssigner',
                                         cls_cost=dict(
                                             type='ClassificationCost',
                                             weight=1.0),
                                         mask_cost=dict(type='FocalLossCost',
                                                        weight=20.0,
                                                        binary_input=True),
                                         dice_cost=dict(type='DiceCost',
                                                        weight=1.0,
                                                        pred_act=True,
                                                        eps=1.0)),
                           sampler=dict(type='MaskPseudoSampler')),
            test_cfg=dict(object_mask_thr=0.8, iou_thr=0.8)))
    self = MaskFormerHead(**config)
    self.init_weights()
    all_cls_scores, all_mask_preds = self.forward(feats, img_metas)
    # Test that empty ground truth encourages the network to predict background
    gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])]
    gt_masks_list = [
        torch.zeros((0, 128, 160)).long(),
        torch.zeros((0, 128, 160)).long()
    ]

    empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
                                gt_masks_list, img_metas)
    # When there is no truth, the cls loss should be nonzero but there should
    # be no mask loss.
    for key, loss in empty_gt_losses.items():
        if 'cls' in key:
            assert loss.item() > 0, 'cls loss should be non-zero'
        elif 'mask' in key:
            assert loss.item(
            ) == 0, 'there should be no mask loss when there are no true masks'
        elif 'dice' in key:
            assert loss.item(
            ) == 0, 'there should be no dice loss when there are no true masks'

    # when truth is non-empty then both cls, mask, dice loss should be nonzero
    # random inputs
    gt_labels_list = [
        torch.tensor([10, 100]).long(),
        torch.tensor([100, 10]).long()
    ]
    mask1 = torch.zeros((2, 128, 160)).long()
    mask1[0, :50] = 1
    mask1[1, 50:] = 1
    mask2 = torch.zeros((2, 128, 160)).long()
    mask2[0, :, :50] = 1
    mask2[1, :, 50:] = 1
    gt_masks_list = [mask1, mask2]
    two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
                              gt_masks_list, img_metas)
    for loss in two_gt_losses.values():
        assert loss.item() > 0, 'all loss should be non-zero'

    # test forward_train
    gt_bboxes = None
    gt_labels = [
        torch.tensor([10]).long(),
        torch.tensor([10]).long(),
    ]
    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
    thing_mask1[0, :50] = 1
    thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32)
    thing_mask2[0, :, 50:] = 1
    gt_masks = [
        BitmapMasks(thing_mask1, 128, 160),
        BitmapMasks(thing_mask2, 128, 160),
    ]
    stuff_mask1 = torch.zeros((1, 128, 160)).long()
    stuff_mask1[0, :50] = 10
    stuff_mask1[0, 50:] = 100
    stuff_mask2 = torch.zeros((1, 128, 160)).long()
    stuff_mask2[0, :, 50:] = 10
    stuff_mask2[0, :, :50] = 100
    gt_semantic_seg = [stuff_mask1, stuff_mask2]

    self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks,
                       gt_semantic_seg)

    # test inference mode
    self.simple_test(feats, img_metas)
def test_encoder_decoder():

    # test 1 decode head, w.o. aux head

    cfg = ConfigDict(type='EncoderDecoder',
                     backbone=dict(type='ExampleBackbone'),
                     decode_head=dict(type='ExampleDecodeHead'),
                     train_cfg=None,
                     test_cfg=dict(mode='whole'))
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)

    # test slide mode
    cfg.test_cfg = ConfigDict(mode='slide', crop_size=(3, 3), stride=(2, 2))
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)

    # test 1 decode head, 1 aux head
    cfg = ConfigDict(type='EncoderDecoder',
                     backbone=dict(type='ExampleBackbone'),
                     decode_head=dict(type='ExampleDecodeHead'),
                     auxiliary_head=dict(type='ExampleDecodeHead'))
    cfg.test_cfg = ConfigDict(mode='whole')
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)

    # test 1 decode head, 2 aux head
    cfg = ConfigDict(type='EncoderDecoder',
                     backbone=dict(type='ExampleBackbone'),
                     decode_head=dict(type='ExampleDecodeHead'),
                     auxiliary_head=[
                         dict(type='ExampleDecodeHead'),
                         dict(type='ExampleDecodeHead')
                     ])
    cfg.test_cfg = ConfigDict(mode='whole')
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)
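# A minimal sketch of what the `_segmentor_forward_train_test` helper used
# above might do; the actual helper in the test suite may differ, and the
# `Example*` components are assumed to be registered test fixtures.
import torch


def _segmentor_forward_train_test_sketch(segmentor):
    num_classes = segmentor.decode_head.num_classes
    imgs = torch.randn(2, 3, 32, 32)
    img_metas = [{
        'img_shape': (32, 32, 3),
        'ori_shape': (32, 32, 3),
        'pad_shape': (32, 32, 3),
        'scale_factor': 1.0,
        'flip': False,
        'flip_direction': 'horizontal'
    } for _ in range(2)]
    gt_semantic_seg = torch.randint(0, num_classes, (2, 1, 32, 32)).long()

    # training forward should return a dict of losses
    losses = segmentor.forward_train(imgs, img_metas, gt_semantic_seg)
    assert isinstance(losses, dict)

    # inference forward on a single image
    segmentor.eval()
    with torch.no_grad():
        segmentor.simple_test(imgs[:1], [img_metas[0]], rescale=True)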
Example #14
    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):

        deprecated_args = dict(feedforward_channels='feedforward_channels',
                               ffn_dropout='ffn_drop',
                               ffn_num_fcs='num_fcs')
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The argument `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated; now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'in a dict named `ffn_cfgs`.')
                ffn_cfgs[new_name] = kwargs[ori_name]
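        # e.g. the deprecated call style `BaseTransformerLayer(...,
        # ffn_dropout=0.1)` ends up as ffn_cfgs['ffn_drop'] = 0.1 before the
        # FFNs are built below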

        super(BaseTransformerLayer, self).__init__(init_cfg)

        self.batch_first = batch_first

        assert set(operation_order) & set(
            ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
            set(operation_order), f'The operation_order of ' \
            f'{self.__class__.__name__} should only contain operations ' \
            f"from {['self_attn', 'norm', 'ffn', 'cross_attn']}"

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length ' \
                f'of attn_cfgs {len(attn_cfgs)} is ' \
                f'not consistent with the number of attention ' \
                f'operations ({num_attn}) in operation_order ' \
                f'{operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index],
                                          dict(type='FFN')))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
Example #15
            'assigner': {
                'type': 'MaxIoUAssigner',
                'pos_iou_thr': 0.7,
                'neg_iou_thr': 0.3,
                'min_pos_iou': 0.3,
                'match_low_quality': True,
                'ignore_iof_thr': -1
            },
            'sampler': {
                'type': 'RandomSampler',
                'num': 256,
                'pos_fraction': 0.5,
                'neg_pos_ub': -1,
                'add_gt_as_proposals': False
            },
            'allowed_border': -1,
            'pos_weight': -1,
            'debug': False
        },
        'test_cfg': {
            'nms_across_levels': False,
            'nms_pre': 1000,
            'nms_post': 1000,
            'max_num': 1000,
            'nms_thr': 0.7,
            'min_bbox_size': 0
        }
    }
    head_cfgs = ConfigDict(head_cfgs)
    rpn_head = RPNHead(**head_cfgs)
    def setup_class(cls):
        cls.data_prefix = osp.join(osp.dirname(osp.dirname(__file__)), 'data')
        cls.frame_ann_file = osp.join(cls.data_prefix, 'frame_test_list.txt')
        cls.frame_ann_file_with_offset = osp.join(
            cls.data_prefix, 'frame_test_list_with_offset.txt')
        cls.frame_ann_file_multi_label = osp.join(
            cls.data_prefix, 'frame_test_list_multi_label.txt')
        cls.video_ann_file = osp.join(cls.data_prefix, 'video_test_list.txt')
        cls.action_ann_file = osp.join(cls.data_prefix,
                                       'action_test_anno.json')
        cls.proposal_ann_file = osp.join(cls.data_prefix,
                                         'proposal_test_list.txt')
        cls.proposal_norm_ann_file = osp.join(cls.data_prefix,
                                              'proposal_normalized_list.txt')

        cls.frame_pipeline = [
            dict(type='SampleFrames',
                 clip_len=32,
                 frame_interval=2,
                 num_clips=1),
            dict(type='RawFrameDecode', io_backend='disk')
        ]
        cls.video_pipeline = [
            dict(type='OpenCVInit'),
            dict(type='SampleFrames',
                 clip_len=32,
                 frame_interval=2,
                 num_clips=1),
            dict(type='OpenCVDecode')
        ]
        cls.action_pipeline = []
        cls.proposal_pipeline = [
            dict(type='SampleProposalFrames',
                 clip_len=1,
                 body_segments=5,
                 aug_segments=(2, 2),
                 aug_ratio=0.5),
            dict(type='FrameSelector', io_backend='disk')
        ]
        cls.proposal_test_pipeline = [
            dict(type='SampleProposalFrames',
                 clip_len=1,
                 body_segments=5,
                 aug_segments=(2, 2),
                 aug_ratio=0.5,
                 mode='test'),
            dict(type='FrameSelector', io_backend='disk')
        ]

        cls.proposal_train_cfg = ConfigDict(
            dict(ssn=dict(assigner=dict(positive_iou_threshold=0.7,
                                        background_iou_threshold=0.01,
                                        incomplete_iou_threshold=0.5,
                                        background_coverage_threshold=0.02,
                                        incomplete_overlap_threshold=0.01),
                          sampler=dict(num_per_video=8,
                                       positive_ratio=1,
                                       background_ratio=1,
                                       incomplete_ratio=6,
                                       add_gt_as_proposals=True),
                          loss_weight=dict(comp_loss_weight=0.1,
                                           reg_loss_weight=0.1),
                          debug=False)))
        cls.proposal_test_cfg = ConfigDict(
            dict(ssn=dict(sampler=dict(test_interval=6, batch_size=16),
                          evaluater=dict(top_k=2000,
                                         nms=0.2,
                                         softmax_before_filter=True,
                                         cls_top_k=2))))
        cls.proposal_test_cfg_topall = ConfigDict(
            dict(ssn=dict(sampler=dict(test_interval=6, batch_size=16),
                          evaluater=dict(top_k=-1,
                                         nms=0.2,
                                         softmax_before_filter=True,
                                         cls_top_k=2))))
Example #17
def merge_aug_proposals(aug_proposals, img_metas, cfg):
    """Merge augmented proposals (multiscale, flip, etc.)

    Args:
        aug_proposals (list[Tensor]): proposals from different testing
            schemes, shape (n, 5). Note that they are not rescaled to the
            original image size.

        img_metas (list[dict]): list of image info dict where each dict has:
            'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
            For details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.

        cfg (dict): rpn test config.

    Returns:
        Tensor: shape (n, 4), proposals corresponding to original image scale.
    """

    cfg = copy.deepcopy(cfg)

    # warn about deprecated arguments
    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
        warnings.warn(
            'In rpn_proposal or test_cfg, '
            'nms_thr has been moved into a dict named nms as '
            'iou_threshold and max_num has been renamed max_per_img; '
            'the original argument names and the old way of specifying '
            'the NMS iou_threshold will be deprecated.')
    if 'nms' not in cfg:
        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
    if 'max_num' in cfg:
        if 'max_per_img' in cfg:
            assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \
                f'max_per_img at the same time, but get {cfg.max_num} ' \
                f'and {cfg.max_per_img} respectively. ' \
                f'Please delete max_num, which will be deprecated.'
        else:
            cfg.max_per_img = cfg.max_num
    if 'nms_thr' in cfg:
        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
            f'iou_threshold in nms and ' \
            f'nms_thr at the same time, but get ' \
            f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
            f' respectively. Please delete the nms_thr ' \
            f'which will be deprecated.'

    recovered_proposals = []
    for proposals, img_info in zip(aug_proposals, img_metas):
        img_shape = img_info['img_shape']
        scale_factor = img_info['scale_factor']
        flip = img_info['flip']
        flip_direction = img_info['flip_direction']
        _proposals = proposals.clone()
        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
                                              scale_factor, flip,
                                              flip_direction)
        recovered_proposals.append(_proposals)
    aug_proposals = torch.cat(recovered_proposals, dim=0)
    merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
                              aug_proposals[:, -1].contiguous(),
                              cfg.nms.iou_threshold)
    scores = merged_proposals[:, 4]
    _, order = scores.sort(0, descending=True)
    num = min(cfg.max_per_img, merged_proposals.shape[0])
    order = order[:num]
    merged_proposals = merged_proposals[order, :]
    return merged_proposals
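# Hedged usage sketch: a new-style rpn test cfg that the function above
# accepts without triggering the deprecation warning; `aug_proposals` and
# `img_metas` are assumed to come from an aug-test pipeline.
rpn_test_cfg = ConfigDict(
    dict(nms=dict(type='nms', iou_threshold=0.7), max_per_img=1000))
# merged = merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg)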
Example #18
    def setup_class(cls):
        # prefix path
        cls.data_prefix = osp.normpath(
            osp.join(osp.dirname(__file__), '../../data'))
        cls.ann_file_prefix = osp.join(cls.data_prefix, 'annotations')

        # annotations path
        cls.action_ann_file = osp.join(cls.ann_file_prefix,
                                       'action_test_anno.json')
        cls.audio_feature_ann_file = osp.join(cls.ann_file_prefix,
                                              'audio_feature_test_list.txt')
        cls.audio_ann_file = osp.join(cls.ann_file_prefix,
                                      'audio_test_list.txt')
        cls.frame_ann_file_multi_label = osp.join(
            cls.ann_file_prefix, 'rawframe_test_list_multi_label.txt')
        cls.frame_ann_file_with_offset = osp.join(
            cls.ann_file_prefix, 'rawframe_test_list_with_offset.txt')
        cls.frame_ann_file = osp.join(cls.ann_file_prefix,
                                      'rawframe_test_list.txt')
        cls.hvu_frame_ann_file = osp.join(cls.ann_file_prefix,
                                          'hvu_frame_test_anno.json')
        cls.hvu_video_ann_file = osp.join(cls.ann_file_prefix,
                                          'hvu_video_test_anno.json')
        cls.hvu_video_eval_ann_file = osp.join(
            cls.ann_file_prefix, 'hvu_video_eval_test_anno.json')
        cls.proposal_ann_file = osp.join(cls.ann_file_prefix,
                                         'proposal_test_list.txt')
        cls.proposal_norm_ann_file = osp.join(cls.ann_file_prefix,
                                              'proposal_normalized_list.txt')
        cls.rawvideo_test_anno_json = osp.join(cls.ann_file_prefix,
                                               'rawvideo_test_anno.json')
        cls.rawvideo_test_anno_txt = osp.join(cls.ann_file_prefix,
                                              'rawvideo_test_anno.txt')
        cls.video_ann_file = osp.join(cls.ann_file_prefix,
                                      'video_test_list.txt')

        # pipeline configuration
        cls.action_pipeline = []
        cls.audio_feature_pipeline = [
            dict(type='LoadAudioFeature'),
            dict(type='SampleFrames',
                 clip_len=32,
                 frame_interval=2,
                 num_clips=1),
            dict(type='AudioFeatureSelector')
        ]
        cls.audio_pipeline = [
            dict(type='AudioDecodeInit'),
            dict(type='SampleFrames',
                 clip_len=32,
                 frame_interval=2,
                 num_clips=1),
            dict(type='AudioDecode')
        ]
        cls.frame_pipeline = [
            dict(type='SampleFrames',
                 clip_len=32,
                 frame_interval=2,
                 num_clips=1),
            dict(type='RawFrameDecode', io_backend='disk')
        ]
        cls.proposal_pipeline = [
            dict(type='SampleProposalFrames',
                 clip_len=1,
                 body_segments=5,
                 aug_segments=(2, 2),
                 aug_ratio=0.5),
            dict(type='RawFrameDecode', io_backend='disk')
        ]
        cls.proposal_test_pipeline = [
            dict(type='SampleProposalFrames',
                 clip_len=1,
                 body_segments=5,
                 aug_segments=(2, 2),
                 aug_ratio=0.5,
                 mode='test'),
            dict(type='RawFrameDecode', io_backend='disk')
        ]
        cls.proposal_train_cfg = ConfigDict(
            dict(ssn=dict(assigner=dict(positive_iou_threshold=0.7,
                                        background_iou_threshold=0.01,
                                        incomplete_iou_threshold=0.5,
                                        background_coverage_threshold=0.02,
                                        incomplete_overlap_threshold=0.01),
                          sampler=dict(num_per_video=8,
                                       positive_ratio=1,
                                       background_ratio=1,
                                       incomplete_ratio=6,
                                       add_gt_as_proposals=True),
                          loss_weight=dict(comp_loss_weight=0.1,
                                           reg_loss_weight=0.1),
                          debug=False)))
        cls.proposal_test_cfg = ConfigDict(
            dict(ssn=dict(sampler=dict(test_interval=6, batch_size=16),
                          evaluater=dict(top_k=2000,
                                         nms=0.2,
                                         softmax_before_filter=True,
                                         cls_top_k=2))))
        cls.proposal_test_cfg_topall = ConfigDict(
            dict(ssn=dict(sampler=dict(test_interval=6, batch_size=16),
                          evaluater=dict(top_k=-1,
                                         nms=0.2,
                                         softmax_before_filter=True,
                                         cls_top_k=2))))
        cls.rawvideo_pipeline = []
        cls.video_pipeline = [
            dict(type='OpenCVInit'),
            dict(type='SampleFrames',
                 clip_len=32,
                 frame_interval=2,
                 num_clips=1),
            dict(type='OpenCVDecode')
        ]

        cls.hvu_categories = [
            'action', 'attribute', 'concept', 'event', 'object', 'scene'
        ]
        cls.hvu_category_nums = [739, 117, 291, 69, 1679, 248]
        cls.hvu_categories_for_eval = ['action', 'scene', 'object']
        cls.hvu_category_nums_for_eval = [3, 3, 3]

        cls.filename_tmpl = 'img_{:05d}.jpg'
def _init_model(num_stuff_classes):
    base_channels = 64
    num_things_classes = 80
    num_classes = num_things_classes + num_stuff_classes
    config = ConfigDict(
        dict(
            type='Mask2FormerHead',
            in_channels=[base_channels * 2**i for i in range(4)],
            feat_channels=base_channels,
            out_channels=base_channels,
            num_things_classes=num_things_classes,
            num_stuff_classes=num_stuff_classes,
            num_queries=100,
            num_transformer_feat_level=3,
            pixel_decoder=dict(
                type='MSDeformAttnPixelDecoder',
                num_outs=3,
                norm_cfg=dict(type='GN', num_groups=32),
                act_cfg=dict(type='ReLU'),
                encoder=dict(
                    type='DetrTransformerEncoder',
                    num_layers=6,
                    transformerlayers=dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=dict(type='MultiScaleDeformableAttention',
                                       embed_dims=base_channels,
                                       num_heads=8,
                                       num_levels=3,
                                       num_points=4,
                                       im2col_step=64,
                                       dropout=0.0,
                                       batch_first=False,
                                       norm_cfg=None,
                                       init_cfg=None),
                        ffn_cfgs=dict(type='FFN',
                                      embed_dims=base_channels,
                                      feedforward_channels=base_channels * 4,
                                      num_fcs=2,
                                      ffn_drop=0.0,
                                      act_cfg=dict(type='ReLU', inplace=True)),
                        feedforward_channels=base_channels * 4,
                        ffn_dropout=0.0,
                        operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                    init_cfg=None),
                positional_encoding=dict(type='SinePositionalEncoding',
                                         num_feats=base_channels // 2,
                                         normalize=True),
                init_cfg=None),
            enforce_decoder_input_project=False,
            positional_encoding=dict(type='SinePositionalEncoding',
                                     num_feats=base_channels // 2,
                                     normalize=True),
            transformer_decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=9,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(type='MultiheadAttention',
                                   embed_dims=base_channels,
                                   num_heads=8,
                                   attn_drop=0.0,
                                   proj_drop=0.0,
                                   dropout_layer=None,
                                   batch_first=False),
                    ffn_cfgs=dict(embed_dims=base_channels,
                                  feedforward_channels=base_channels * 8,
                                  num_fcs=2,
                                  act_cfg=dict(type='ReLU', inplace=True),
                                  ffn_drop=0.0,
                                  dropout_layer=None,
                                  add_identity=True),
                    # the following parameter is not used;
                    # it is only kept to satisfy the current API
                    feedforward_channels=base_channels * 8,
                    operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
                                     'ffn', 'norm')),
                init_cfg=None),
            loss_cls=dict(type='CrossEntropyLoss',
                          use_sigmoid=False,
                          loss_weight=2.0,
                          reduction='mean',
                          class_weight=[1.0] * num_classes + [0.1]),
            loss_mask=dict(type='CrossEntropyLoss',
                           use_sigmoid=True,
                           reduction='mean',
                           loss_weight=5.0),
            loss_dice=dict(type='DiceLoss',
                           use_sigmoid=True,
                           activate=True,
                           reduction='mean',
                           naive_dice=True,
                           eps=1.0,
                           loss_weight=5.0),
            train_cfg=dict(
                num_points=256,
                oversample_ratio=3.0,
                importance_sample_ratio=0.75,
                assigner=dict(type='MaskHungarianAssigner',
                              cls_cost=dict(type='ClassificationCost',
                                            weight=2.0),
                              mask_cost=dict(type='CrossEntropyLossCost',
                                             weight=5.0,
                                             use_sigmoid=True),
                              dice_cost=dict(type='DiceCost',
                                             weight=5.0,
                                             pred_act=True,
                                             eps=1.0)),
                sampler=dict(type='MaskPseudoSampler')),
            test_cfg=dict(panoptic_on=True,
                          semantic_on=False,
                          instance_on=True,
                          max_dets_per_image=100,
                          object_mask_thr=0.8,
                          iou_thr=0.8)))
    self = Mask2FormerHead(**config)
    self.init_weights()

    return self
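# Hedged usage sketch, mirroring the MaskFormerHead test earlier in this
# collection; `img_metas` is assumed to follow the same format as there.
# self = _init_model(num_stuff_classes=53)
# feats = [
#     torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
#     for i in range(4)
# ]
# all_cls_scores, all_mask_preds = self(feats, img_metas)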
Example #20
    def _get_bboxes_single(self,
                           cls_scores,
                           bbox_preds,
                           mlvl_anchors,
                           mlvl_masks,
                           img_shape,
                           scale_factor,
                           cfg,
                           rescale=False):
        cfg = self.test_cfg if cfg is None else cfg

        cfg = copy.deepcopy(cfg)

        # warn about deprecated arguments
        if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
            warnings.warn(
                'In rpn_proposal or test_cfg, '
                'nms_thr has been moved into a dict named nms as '
                'iou_threshold and max_num has been renamed max_per_img; '
                'the original argument names and the old way of specifying '
                'the NMS iou_threshold will be deprecated.')
        if 'nms' not in cfg:
            cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
        if 'max_num' in cfg:
            if 'max_per_img' in cfg:
                assert cfg.max_num == cfg.max_per_img, f'You ' \
                    f'set max_num and max_per_img at the same time, ' \
                    f'but get {cfg.max_num} ' \
                    f'and {cfg.max_per_img} respectively. ' \
                    'Please delete max_num, which will be deprecated.'
            else:
                cfg.max_per_img = cfg.max_num
        if 'nms_thr' in cfg:
            assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
                f'iou_threshold in nms and ' \
                f'nms_thr at the same time, but get ' \
                f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
                f' respectively. Please delete the ' \
                f'nms_thr which will be deprecated.'

        assert cfg.nms.get('type', 'nms') == 'nms', \
            'GARPNHead only supports naive nms.'

        mlvl_proposals = []
        for idx in range(len(cls_scores)):
            rpn_cls_score = cls_scores[idx]
            rpn_bbox_pred = bbox_preds[idx]
            anchors = mlvl_anchors[idx]
            mask = mlvl_masks[idx]
            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
            # if no locations are kept, skip this level
            if mask.sum() == 0:
                continue
            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
            if self.use_sigmoid_cls:
                rpn_cls_score = rpn_cls_score.reshape(-1)
                scores = rpn_cls_score.sigmoid()
            else:
                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
                # note that since mmdet v2.0, FG labels are set to
                # [0, num_class-1] and the BG cat_id is num_class
                scores = rpn_cls_score.softmax(dim=1)[:, :-1]
            # filter scores, bbox_pred w.r.t. mask.
            # anchors are filtered in get_anchors() beforehand.
            scores = scores[mask]
            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1,
                                                                   4)[mask, :]
            if scores.dim() == 0:
                rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0)
                anchors = anchors.unsqueeze(0)
                scores = scores.unsqueeze(0)
            # filter anchors, bbox_pred, scores w.r.t. scores
            if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
                _, topk_inds = scores.topk(cfg.nms_pre)
                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
                anchors = anchors[topk_inds, :]
                scores = scores[topk_inds]
            # get proposals w.r.t. anchors and rpn_bbox_pred
            proposals = self.bbox_coder.decode(
                anchors, rpn_bbox_pred, max_shape=img_shape)
            # filter out too small bboxes
            if cfg.min_bbox_size >= 0:
                w = proposals[:, 2] - proposals[:, 0]
                h = proposals[:, 3] - proposals[:, 1]
                valid_inds = torch.nonzero(
                    (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size),
                    as_tuple=False).squeeze()
                proposals = proposals[valid_inds, :]
                scores = scores[valid_inds]
            # NMS in current level
            proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold)
            proposals = proposals[:cfg.nms_post, :]
            mlvl_proposals.append(proposals)
        proposals = torch.cat(mlvl_proposals, 0)
        if cfg.get('nms_across_levels', False):
            # NMS across multi levels
            proposals, _ = nms(proposals[:, :4], proposals[:, -1],
                               cfg.nms.iou_threshold)
            proposals = proposals[:cfg.max_per_img, :]
        else:
            scores = proposals[:, 4]
            num = min(cfg.max_per_img, proposals.shape[0])
            _, topk_inds = scores.topk(num)
            proposals = proposals[topk_inds, :]
        return proposals
Example #21
    def __init__(self,
                 num_frames,
                 img_size,
                 patch_size,
                 pretrained=None,
                 embed_dims=768,
                 num_heads=12,
                 num_transformer_layers=12,
                 in_channels=3,
                 dropout_ratio=0.,
                 transformer_layers=None,
                 attention_type='divided_space_time',
                 norm_cfg=dict(type='LN', eps=1e-6),
                 **kwargs):
        super().__init__(**kwargs)
        assert attention_type in self.supported_attention_types, (
            f'Unsupported Attention Type {attention_type}!')
        assert transformer_layers is None or isinstance(
            transformer_layers, (dict, list))

        self.num_frames = num_frames
        self.pretrained = pretrained
        self.embed_dims = embed_dims
        self.num_transformer_layers = num_transformer_layers
        self.attention_type = attention_type

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dims=embed_dims)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, embed_dims))
        self.drop_after_pos = nn.Dropout(p=dropout_ratio)
        if self.attention_type != 'space_only':
            self.time_embed = nn.Parameter(
                torch.zeros(1, num_frames, embed_dims))
            self.drop_after_time = nn.Dropout(p=dropout_ratio)

        self.norm = build_norm_layer(norm_cfg, embed_dims)[1]

        if transformer_layers is None:
            # stochastic depth decay rule
            dpr = np.linspace(0, 0.1, num_transformer_layers)
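            # e.g. with 12 layers this yields per-layer drop-path rates
            # 0.0, 0.0091, ..., 0.1, increasing linearly with depth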

            if self.attention_type == 'divided_space_time':
                _transformerlayers_cfg = [
                    dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=[
                            dict(
                                type='DividedTemporalAttentionWithNorm',
                                embed_dims=embed_dims,
                                num_heads=num_heads,
                                num_frames=num_frames,
                                dropout_layer=dict(
                                    type='DropPath', drop_prob=dpr[i]),
                                norm_cfg=dict(type='LN', eps=1e-6)),
                            dict(
                                type='DividedSpatialAttentionWithNorm',
                                embed_dims=embed_dims,
                                num_heads=num_heads,
                                num_frames=num_frames,
                                dropout_layer=dict(
                                    type='DropPath', drop_prob=dpr[i]),
                                norm_cfg=dict(type='LN', eps=1e-6))
                        ],
                        ffn_cfgs=dict(
                            type='FFNWithNorm',
                            embed_dims=embed_dims,
                            feedforward_channels=embed_dims * 4,
                            num_fcs=2,
                            act_cfg=dict(type='GELU'),
                            dropout_layer=dict(
                                type='DropPath', drop_prob=dpr[i]),
                            norm_cfg=dict(type='LN', eps=1e-6)),
                        operation_order=('self_attn', 'self_attn', 'ffn'))
                    for i in range(num_transformer_layers)
                ]
            else:
                # Space Only & Joint Space Time
                _transformerlayers_cfg = [
                    dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=[
                            dict(
                                type='MultiheadAttention',
                                embed_dims=embed_dims,
                                num_heads=num_heads,
                                batch_first=True,
                                dropout_layer=dict(
                                    type='DropPath', drop_prob=dpr[i]))
                        ],
                        ffn_cfgs=dict(
                            type='FFN',
                            embed_dims=embed_dims,
                            feedforward_channels=embed_dims * 4,
                            num_fcs=2,
                            act_cfg=dict(type='GELU'),
                            dropout_layer=dict(
                                type='DropPath', drop_prob=dpr[i])),
                        operation_order=('norm', 'self_attn', 'norm', 'ffn'),
                        norm_cfg=dict(type='LN', eps=1e-6),
                        batch_first=True)
                    for i in range(num_transformer_layers)
                ]

            transformer_layers = ConfigDict(
                dict(
                    type='TransformerLayerSequence',
                    transformerlayers=_transformerlayers_cfg,
                    num_layers=num_transformer_layers))

        self.transformer_layers = build_transformer_layer_sequence(
            transformer_layers)
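# A small worked sketch of the stochastic depth decay rule used above: the
# DropPath probability grows linearly from 0 to 0.1 across the layers.
import numpy as np

dpr_example = np.linspace(0, 0.1, 12)
# dpr_example[0] == 0.0, dpr_example[-1] == 0.1, step ~= 0.0091; layer i would
# use dropout_layer=dict(type='DropPath', drop_prob=dpr_example[i])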
Example #22
0
    def _get_bboxes_single(self,
                           cls_scores,
                           bbox_preds,
                           mlvl_anchors,
                           img_shape,
                           scale_factor,
                           cfg,
                           rescale=False):
        """Transform outputs for a single batch item into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level,
                each with shape (num_anchors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (num_anchors * 4, H, W).
            mlvl_anchors (list[Tensor]): Box reference for each scale level
                with shape (num_total_anchors, 4).
            img_shape (tuple[int]): Shape of the input image,
                (height, width, 3).
            scale_factor (ndarray): Scale factor of the image, arranged as
                (w_scale, h_scale, w_scale, h_scale).
            cfg (mmcv.Config): Test / postprocessing configuration;
                if None, `test_cfg` is used.
            rescale (bool): If True, return boxes in original image space.

        Returns:
            Tensor: Labeled boxes with shape (n, 5), where the first 4
                columns are bounding box positions (tl_x, tl_y, br_x, br_y)
                and the 5-th column is a score between 0 and 1.
        """
        cfg = self.test_cfg if cfg is None else cfg
        cfg = copy.deepcopy(cfg)
        # bboxes from different level should be independent during NMS,
        # level_ids are used as labels for batched NMS to separate them
        level_ids = []
        mlvl_scores = []
        mlvl_bbox_preds = []
        mlvl_valid_anchors = []
        for idx in range(len(cls_scores)):
            rpn_cls_score = cls_scores[idx]
            rpn_bbox_pred = bbox_preds[idx]
            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
            if self.use_sigmoid_cls:
                rpn_cls_score = rpn_cls_score.reshape(-1)
                scores = rpn_cls_score.sigmoid()
            else:
                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
                # Since mmdet v2.5, the RPN head uses FG labels in
                # [0, num_class - 1] and BG label = num_class, matching the
                # convention the other heads have used since mmdet v2.0.
                # From mmdet v2.0 to v2.4, the RPN head kept BG label as 0
                # and FG label as 1.
                scores = rpn_cls_score.softmax(dim=1)[:, 0]
            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            anchors = mlvl_anchors[idx]
            if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
                # sort is faster than topk
                # _, topk_inds = scores.topk(cfg.nms_pre)
                if torch.onnx.is_in_onnx_export():
                    # sort op will be converted to TopK in onnx
                    # and k<=3480 in TensorRT
                    _, topk_inds = scores.topk(cfg.nms_pre)
                    scores = scores[topk_inds]
                else:
                    ranked_scores, rank_inds = scores.sort(descending=True)
                    topk_inds = rank_inds[:cfg.nms_pre]
                    scores = ranked_scores[:cfg.nms_pre]
                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
                anchors = anchors[topk_inds, :]
            mlvl_scores.append(scores)
            mlvl_bbox_preds.append(rpn_bbox_pred)
            mlvl_valid_anchors.append(anchors)
            level_ids.append(
                scores.new_full((scores.size(0), ), idx, dtype=torch.long))

        scores = torch.cat(mlvl_scores)
        anchors = torch.cat(mlvl_valid_anchors)
        rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
        proposals = self.bbox_coder.decode(anchors,
                                           rpn_bbox_pred,
                                           max_shape=img_shape)
        ids = torch.cat(level_ids)

        # Skip nonzero op while exporting to ONNX
        if cfg.min_bbox_size > 0 and (not torch.onnx.is_in_onnx_export()):
            w = proposals[:, 2] - proposals[:, 0]
            h = proposals[:, 3] - proposals[:, 1]
            valid_inds = torch.nonzero((w >= cfg.min_bbox_size)
                                       & (h >= cfg.min_bbox_size),
                                       as_tuple=False).squeeze()
            if valid_inds.sum().item() != len(proposals):
                proposals = proposals[valid_inds, :]
                scores = scores[valid_inds]
                ids = ids[valid_inds]

        # warn about deprecated arguments
        if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
            warnings.warn(
                'In rpn_proposal or test_cfg, '
                'nms_thr has been moved into a dict named nms as '
                'iou_threshold, and max_num has been renamed to max_per_img; '
                'the original argument names and the old way of specifying '
                'the iou_threshold of NMS are deprecated.')
        if 'nms' not in cfg:
            cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
        if 'max_num' in cfg:
            if 'max_per_img' in cfg:
                assert cfg.max_num == cfg.max_per_img, \
                    f'You set max_num and max_per_img at the same time, ' \
                    f'but get {cfg.max_num} and {cfg.max_per_img} ' \
                    'respectively. Please delete max_num, which will be ' \
                    'deprecated.'
            else:
                cfg.max_per_img = cfg.max_num
        if 'nms_thr' in cfg:
            assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set' \
                f' iou_threshold in nms and ' \
                f'nms_thr at the same time, but get' \
                f' {cfg.nms.iou_threshold} and {cfg.nms_thr}' \
                f' respectively. Please delete the nms_thr ' \
                f'which will be deprecated.'

        dets, keep = batched_nms(proposals, scores, ids, cfg.nms)
        return dets[:cfg.max_per_img]
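# A minimal sketch of why level ids are passed to batched_nms above: boxes
# with different `idxs` are never suppressed against each other (assumes
# mmcv is installed; the boxes and threshold below are illustrative).
import torch
from mmcv.ops import batched_nms

_boxes = torch.tensor([[0., 0., 10., 10.], [0., 0., 10., 10.]])
_scores = torch.tensor([0.9, 0.8])
_level_ids = torch.tensor([0, 1])  # identical boxes, but from different levels
_dets, _keep = batched_nms(_boxes, _scores, _level_ids,
                           dict(type='nms', iou_threshold=0.7))
# both boxes survive because their level ids differ; with equal ids the
# lower-scoring one would be suppressed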
Example #23
0
    def _get_bboxes(self,
                    cls_scores,
                    bbox_preds,
                    mlvl_anchors,
                    img_shapes,
                    scale_factors,
                    cfg,
                    rescale=False):
        """Transform outputs for a single batch item into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W).
            mlvl_anchors (list[Tensor]): Box reference for each scale level
                with shape (num_total_anchors, 4).
            img_shapes (list[tuple[int]]): Shape of the input image,
                (height, width, 3).
            scale_factors (list[ndarray]): Scale factor of the image arange as
                (w_scale, h_scale, w_scale, h_scale).
            cfg (mmcv.Config): Test / postprocessing configuration,
                if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.

        Returns:
            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
                The first item is an (n, 5) tensor, where the first 4 columns
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
                5-th column is a score between 0 and 1. The second item is a
                (n,) tensor where each item is the predicted class labelof the
                corresponding box.
        """
        cfg = self.test_cfg if cfg is None else cfg
        cfg = copy.deepcopy(cfg)
        # bboxes from different level should be independent during NMS,
        # level_ids are used as labels for batched NMS to separate them
        level_ids = []
        mlvl_scores = []
        mlvl_bbox_preds = []
        mlvl_valid_anchors = []
        batch_size = cls_scores[0].shape[0]
        nms_pre_tensor = torch.tensor(cfg.nms_pre,
                                      device=cls_scores[0].device,
                                      dtype=torch.long)
        for idx in range(len(cls_scores)):
            rpn_cls_score = cls_scores[idx]
            rpn_bbox_pred = bbox_preds[idx]
            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
            rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1)
            if self.use_sigmoid_cls:
                rpn_cls_score = rpn_cls_score.reshape(batch_size, -1)
                scores = rpn_cls_score.sigmoid()
            else:
                rpn_cls_score = rpn_cls_score.reshape(batch_size, -1, 2)
                # Since mmdet v2.5, the RPN head uses FG labels in
                # [0, num_class - 1] and BG label = num_class, matching the
                # convention the other heads have used since mmdet v2.0.
                # From mmdet v2.0 to v2.4, the RPN head kept BG label as 0
                # and FG label as 1.
                scores = rpn_cls_score.softmax(-1)[..., 0]
            rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).reshape(
                batch_size, -1, 4)
            anchors = mlvl_anchors[idx]
            anchors = anchors.expand_as(rpn_bbox_pred)
            # Get top-k prediction
            from mmdet.core.export import get_k_for_topk
            nms_pre = get_k_for_topk(nms_pre_tensor, rpn_bbox_pred.shape[1])
            if nms_pre > 0:
                _, topk_inds = scores.topk(nms_pre)
                batch_inds = torch.arange(batch_size).view(
                    -1, 1).expand_as(topk_inds)
                # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501
                if torch.onnx.is_in_onnx_export():
                    # Mind k<=3480 in TensorRT for TopK
                    transformed_inds = scores.shape[1] * batch_inds + topk_inds
                    scores = scores.reshape(-1, 1)[transformed_inds].reshape(
                        batch_size, -1)
                    rpn_bbox_pred = rpn_bbox_pred.reshape(
                        -1, 4)[transformed_inds, :].reshape(batch_size, -1, 4)
                    anchors = anchors.reshape(-1,
                                              4)[transformed_inds, :].reshape(
                                                  batch_size, -1, 4)
                else:
                    # sort is faster than topk
                    ranked_scores, rank_inds = scores.sort(descending=True)
                    topk_inds = rank_inds[:, :cfg.nms_pre]
                    scores = ranked_scores[:, :cfg.nms_pre]
                    batch_inds = torch.arange(batch_size).view(
                        -1, 1).expand_as(topk_inds)
                    rpn_bbox_pred = rpn_bbox_pred[batch_inds, topk_inds, :]
                    anchors = anchors[batch_inds, topk_inds, :]

            mlvl_scores.append(scores)
            mlvl_bbox_preds.append(rpn_bbox_pred)
            mlvl_valid_anchors.append(anchors)
            level_ids.append(
                scores.new_full((
                    batch_size,
                    scores.size(1),
                ),
                                idx,
                                dtype=torch.long))

        batch_mlvl_scores = torch.cat(mlvl_scores, dim=1)
        batch_mlvl_anchors = torch.cat(mlvl_valid_anchors, dim=1)
        batch_mlvl_rpn_bbox_pred = torch.cat(mlvl_bbox_preds, dim=1)
        batch_mlvl_proposals = self.bbox_coder.decode(batch_mlvl_anchors,
                                                      batch_mlvl_rpn_bbox_pred,
                                                      max_shape=img_shapes)
        batch_mlvl_ids = torch.cat(level_ids, dim=1)

        # warn about deprecated arguments
        if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
            warnings.warn(
                'In rpn_proposal or test_cfg, '
                'nms_thr has been moved into a dict named nms as '
                'iou_threshold, and max_num has been renamed to max_per_img; '
                'the original argument names and the old way of specifying '
                'the iou_threshold of NMS are deprecated.')
        if 'nms' not in cfg:
            cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
        if 'max_num' in cfg:
            if 'max_per_img' in cfg:
                assert cfg.max_num == cfg.max_per_img, \
                    f'You set max_num and max_per_img at the same time, ' \
                    f'but get {cfg.max_num} and {cfg.max_per_img} ' \
                    'respectively. Please delete max_num, which will be ' \
                    'deprecated.'
            else:
                cfg.max_per_img = cfg.max_num
        if 'nms_thr' in cfg:
            assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set' \
                f' iou_threshold in nms and ' \
                f'nms_thr at the same time, but get' \
                f' {cfg.nms.iou_threshold} and {cfg.nms_thr}' \
                f' respectively. Please delete the nms_thr ' \
                f'which will be deprecated.'

        # Replace batched_nms with ONNX::NonMaxSuppression in deployment
        if torch.onnx.is_in_onnx_export():
            from mmdet.core.export import add_dummy_nms_for_onnx
            batch_mlvl_scores = batch_mlvl_scores.unsqueeze(2)
            score_threshold = cfg.nms.get('score_thr', 0.0)
            nms_pre = cfg.get('deploy_nms_pre', cfg.max_per_img)
            dets, _ = add_dummy_nms_for_onnx(batch_mlvl_proposals,
                                             batch_mlvl_scores,
                                             cfg.max_per_img,
                                             cfg.nms.iou_threshold,
                                             score_threshold, nms_pre,
                                             cfg.max_per_img)
            return dets

        result_list = []
        for (mlvl_proposals, mlvl_scores,
             mlvl_ids) in zip(batch_mlvl_proposals, batch_mlvl_scores,
                              batch_mlvl_ids):
            # Skip nonzero op while exporting to ONNX
            if cfg.min_bbox_size > 0 and (not torch.onnx.is_in_onnx_export()):
                w = mlvl_proposals[:, 2] - mlvl_proposals[:, 0]
                h = mlvl_proposals[:, 3] - mlvl_proposals[:, 1]
                valid_ind = torch.nonzero((w >= cfg.min_bbox_size)
                                          & (h >= cfg.min_bbox_size),
                                          as_tuple=False).squeeze()
                if valid_ind.sum().item() != len(mlvl_proposals):
                    mlvl_proposals = mlvl_proposals[valid_ind, :]
                    mlvl_scores = mlvl_scores[valid_ind]
                    mlvl_ids = mlvl_ids[valid_ind]

            dets, keep = batched_nms(mlvl_proposals, mlvl_scores, mlvl_ids,
                                     cfg.nms)
            result_list.append(dets[:cfg.max_per_img])
        return result_list
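# A short sketch of the config migration performed by the deprecation
# handling above: a legacy cfg written with `nms_thr`/`max_num` ends up with
# the new `nms.iou_threshold`/`max_per_img` keys (values are illustrative).
legacy_cfg = ConfigDict(dict(nms_pre=1000, nms_thr=0.7, max_num=1000,
                             min_bbox_size=0))
if 'nms' not in legacy_cfg:
    legacy_cfg.nms = ConfigDict(dict(type='nms',
                                     iou_threshold=legacy_cfg.nms_thr))
if 'max_num' in legacy_cfg and 'max_per_img' not in legacy_cfg:
    legacy_cfg.max_per_img = legacy_cfg.max_num
# legacy_cfg.nms.iou_threshold == 0.7 and legacy_cfg.max_per_img == 1000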
Example #24
0
def compat_loader_args(cfg):
    """Deprecated sample_per_gpu in cfg.data."""

    cfg = copy.deepcopy(cfg)
    if 'train_dataloader' not in cfg.data:
        cfg.data['train_dataloader'] = ConfigDict()
    if 'val_dataloader' not in cfg.data:
        cfg.data['val_dataloader'] = ConfigDict()
    if 'test_dataloader' not in cfg.data:
        cfg.data['test_dataloader'] = ConfigDict()

    # special process for train_dataloader
    if 'samples_per_gpu' in cfg.data:

        samples_per_gpu = cfg.data.pop('samples_per_gpu')
        assert 'samples_per_gpu' not in \
               cfg.data.train_dataloader, ('`samples_per_gpu` is set in the '
                                           '`data` field and in '
                                           '`data.train_dataloader` at the '
                                           'same time. Please only set it in '
                                           '`data.train_dataloader`.')
        cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu

    if 'persistent_workers' in cfg.data:

        persistent_workers = cfg.data.pop('persistent_workers')
        assert 'persistent_workers' not in \
               cfg.data.train_dataloader, ('`persistent_workers` is set in '
                                           'the `data` field and in '
                                           '`data.train_dataloader` at the '
                                           'same time. Please only set it in '
                                           '`data.train_dataloader`.')
        cfg.data.train_dataloader['persistent_workers'] = persistent_workers

    if 'workers_per_gpu' in cfg.data:

        workers_per_gpu = cfg.data.pop('workers_per_gpu')
        cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu
        cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu
        cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu

    # special process for val_dataloader
    if 'samples_per_gpu' in cfg.data.val:
        # the default `samples_per_gpu` of the val dataloader is kept as 1
        assert 'samples_per_gpu' not in \
               cfg.data.val_dataloader, ('`samples_per_gpu` is set in the '
                                         '`data.val` field and in '
                                         '`data.val_dataloader` at the '
                                         'same time. Please only set it in '
                                         '`data.val_dataloader`.')
        cfg.data.val_dataloader['samples_per_gpu'] = \
            cfg.data.val.pop('samples_per_gpu')
    # special process for test_dataloader

    # in case the test dataset is concatenated
    if isinstance(cfg.data.test, dict):
        if 'samples_per_gpu' in cfg.data.test:
            assert 'samples_per_gpu' not in \
                   cfg.data.test_dataloader, ('`samples_per_gpu` is set in '
                                              'the `data.test` field and in '
                                              '`data.test_dataloader` at the '
                                              'same time. Please only set it '
                                              'in `data.test_dataloader`.')

            cfg.data.test_dataloader['samples_per_gpu'] = \
                cfg.data.test.pop('samples_per_gpu')

    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            if 'samples_per_gpu' in ds_cfg:
                assert 'samples_per_gpu' not in \
                       cfg.data.test_dataloader, ('`samples_per_gpu` is set '
                                                  'in the `data.test` field '
                                                  'and in '
                                                  '`data.test_dataloader` at '
                                                  'the same time. Please only '
                                                  'set it in '
                                                  '`data.test_dataloader`.')
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
        cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu

    return cfg
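# A quick usage sketch of compat_loader_args (values are illustrative): flat
# loader args in `data` are moved into the per-split `*_dataloader` configs.
_cfg = ConfigDict(
    dict(data=dict(samples_per_gpu=2, workers_per_gpu=4,
                   train=dict(), val=dict(samples_per_gpu=1), test=dict())))
_cfg = compat_loader_args(_cfg)
# _cfg.data.train_dataloader.samples_per_gpu == 2
# _cfg.data.{train,val,test}_dataloader.workers_per_gpu == 4
# _cfg.data.val_dataloader.samples_per_gpu == 1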
Example #25
0
            return losses
        else:
            return ds

        return ds


if __name__ == "__main__":
    import time
    from mmcv import ConfigDict

    cfg = dict(pretrained_model='/home/caojia/densenet161.pth',
               model=dict(depth_num_layers=161,
                          input_shape=[608, 960],
                          max_depth=80,
                          fxy=[631.0]),
               data=dict(imgs_per_gpu=2))
    cfg = ConfigDict(cfg)

    net = LPGNet(cfg).cuda().eval()
    x = torch.randn((2, 3, 608, 960)).cuda()
    # camera focal length (fxy); list length must equal the input batch size
    focal = [712.] * x.size(0)
    inputs = dict(leftImage=x, left_gt=x[:, 0, :, :])
    torch.cuda.synchronize()
    s_t = time.time()

    y = net(inputs)
    torch.cuda.synchronize()
    print('inference time is ', time.time() - s_t)
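    # An equivalent timing sketch using CUDA events instead of the
    # synchronize + time.time() pattern above (shown only for illustration).
    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    y = net(inputs)
    end_evt.record()
    torch.cuda.synchronize()
    print('inference time is ', start_evt.elapsed_time(end_evt) / 1000.0, 's')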
Example #26
0
def test_ssn_loss():
    ssn_loss = SSNLoss()

    # test activity_loss
    activity_score = torch.rand((8, 21))
    labels = torch.LongTensor([8] * 8).squeeze()
    activity_indexer = torch.tensor([0, 7])
    output_activity_loss = ssn_loss.activity_loss(activity_score, labels,
                                                  activity_indexer)
    assert torch.equal(
        output_activity_loss,
        F.cross_entropy(activity_score[activity_indexer, :],
                        labels[activity_indexer]))

    # test completeness_loss
    completeness_score = torch.rand((8, 20), requires_grad=True)
    labels = torch.LongTensor([8] * 8).squeeze()
    completeness_indexer = torch.tensor([0, 1, 2, 3, 4, 5, 6])
    positive_per_video = 1
    incomplete_per_video = 6
    output_completeness_loss = ssn_loss.completeness_loss(
        completeness_score, labels, completeness_indexer, positive_per_video,
        incomplete_per_video)

    pred = completeness_score[completeness_indexer, :]
    gt = labels[completeness_indexer]
    pred_dim = pred.size(1)
    pred = pred.view(-1, positive_per_video + incomplete_per_video, pred_dim)
    gt = gt.view(-1, positive_per_video + incomplete_per_video)
    # yapf:disable
    positive_pred = pred[:, :positive_per_video, :].contiguous().view(-1, pred_dim)  # noqa:E501
    incomplete_pred = pred[:, positive_per_video:, :].contiguous().view(-1, pred_dim)  # noqa:E501
    # yapf:enable
    ohem_ratio = 0.17
    positive_loss = OHEMHingeLoss.apply(
        positive_pred, gt[:, :positive_per_video].contiguous().view(-1), 1,
        1.0, positive_per_video)
    incomplete_loss = OHEMHingeLoss.apply(
        incomplete_pred, gt[:, positive_per_video:].contiguous().view(-1), -1,
        ohem_ratio, incomplete_per_video)
    num_positives = positive_pred.size(0)
    num_incompletes = int(incomplete_pred.size(0) * ohem_ratio)
    assert_loss = ((positive_loss + incomplete_loss) /
                   float(num_positives + num_incompletes))
    assert torch.equal(output_completeness_loss, assert_loss)

    # test reg_loss
    bbox_pred = torch.rand((8, 20, 2))
    labels = torch.LongTensor([8] * 8).squeeze()
    bbox_targets = torch.rand((8, 2))
    regression_indexer = torch.tensor([0])
    output_reg_loss = ssn_loss.classwise_regression_loss(
        bbox_pred, labels, bbox_targets, regression_indexer)

    pred = bbox_pred[regression_indexer, :, :]
    gt = labels[regression_indexer]
    reg_target = bbox_targets[regression_indexer, :]
    class_idx = gt.data - 1
    classwise_pred = pred[:, class_idx, :]
    classwise_reg_pred = torch.cat((torch.diag(classwise_pred[:, :, 0]).view(
        -1, 1), torch.diag(classwise_pred[:, :, 1]).view(-1, 1)),
                                   dim=1)
    assert torch.equal(
        output_reg_loss,
        F.smooth_l1_loss(classwise_reg_pred.view(-1), reg_target.view(-1)) * 2)

    # test ssn_loss
    proposal_type = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 2]])
    train_cfg = ConfigDict(
        dict(ssn=dict(sampler=dict(num_per_video=8,
                                   positive_ratio=1,
                                   background_ratio=1,
                                   incomplete_ratio=6,
                                   add_gt_as_proposals=True),
                      loss_weight=dict(comp_loss_weight=0.1,
                                       reg_loss_weight=0.1))))
    output_loss = ssn_loss(activity_score, completeness_score, bbox_pred,
                           proposal_type, labels, bbox_targets, train_cfg)
    assert torch.equal(output_loss['loss_activity'], output_activity_loss)
    assert torch.equal(output_loss['loss_completeness'],
                       output_completeness_loss * 0.1)
    assert torch.equal(output_loss['loss_reg'], output_reg_loss * 0.1)
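    # Worked check of the denominator used above: each video contributes
    # 1 positive and 6 incomplete proposals, and with ohem_ratio = 0.17 only
    # int(6 * 0.17) = 1 incomplete sample is kept, so the summed hinge losses
    # are divided by 1 + 1 = 2.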
Example #27
0
            "abs_rel": abs_rel / len(img_files),
            "sq_rel": sq_rel / len(img_files),
            "rmse": rmse / len(img_files),
            "rmse_log": rmse_log / len(img_files),
            "delta1": delta1 / len(img_files),
            "delta2": delta2 / len(img_files),
            "delta3": delta3 / len(img_files)
        }


if __name__ == "__main__":
    from mmcv import ConfigDict
    import matplotlib.pyplot as plt

    infer_cfg = dict(model_path='./tmp/epoch_16.pth',
                     pretrained_model='/home/caojia/densenet161.pth',
                     data=dict(output_size=(352, 1216), imgs_per_gpu=1),
                     model=dict(type='LPGNet',
                                depth_num_layers=161,
                                input_shape=[352, 1216],
                                max_depth=80,
                                fxy=[721.0]))

    infer_cfg = ConfigDict(infer_cfg)

    evaluator = Evaluator(infer_cfg)
    img_folder = '/home/caojia/kitti_eigen_test/image_02/'
    gt_folder = '/home/caojia/kitti_eigen_test/groundtruth/'

    print(evaluator.eval(img_folder, gt_folder, False))
Example #28
0
def test_compat_loader_args():
    cfg = ConfigDict(dict(data=dict(val=dict(), test=dict(), train=dict())))
    cfg = compat_loader_args(cfg)
    # auto fill loader args
    assert 'val_dataloader' in cfg.data
    assert 'train_dataloader' in cfg.data
    assert 'test_dataloader' in cfg.data
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                train=dict())))

    cfg = compat_loader_args(cfg)

    assert cfg.data.train_dataloader.workers_per_gpu == 1
    assert cfg.data.train_dataloader.samples_per_gpu == 1
    assert cfg.data.train_dataloader.persistent_workers
    assert cfg.data.val_dataloader.workers_per_gpu == 1
    assert cfg.data.val_dataloader.samples_per_gpu == 3
    assert cfg.data.test_dataloader.workers_per_gpu == 1
    assert cfg.data.test_dataloader.samples_per_gpu == 2

    # test test is a list
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=[dict(samples_per_gpu=2),
                      dict(samples_per_gpu=3)],
                train=dict())))

    cfg = compat_loader_args(cfg)
    assert cfg.data.test_dataloader.samples_per_gpu == 3

    # assert can not set args at the same time
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                train=dict(),
                train_dataloader=dict(samples_per_gpu=2))))
    # samples_per_gpu can not be set in `train_dataloader`
    # and data field at the same time
    with pytest.raises(AssertionError):
        compat_loader_args(cfg)
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                train=dict(),
                val_dataloader=dict(samples_per_gpu=2))))
    # samples_per_gpu can not be set in `val_dataloader`
    # and data field at the same time
    with pytest.raises(AssertionError):
        compat_loader_args(cfg)
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                test_dataloader=dict(samples_per_gpu=2))))
    # samples_per_gpu can not be set in `test_dataloader`
    # and data field at the same time
    with pytest.raises(AssertionError):
        compat_loader_args(cfg)
Example #29
0
# ========================================= #

# ============== BUILD MODEL ================ #

class_map = icedata.coco.class_map()

model_name = "mobilenetv3_large_100_aa"
base_config_path = mmdet_configs_path / "retinanet"
config_path = base_config_path / "retinanet_r50_fpn_1x_coco.py"
cfg = Config.fromfile(config_path)

## mmdet >= 2.12 requires `ConfigDict`, not just `dict`
cfg.model.backbone = ConfigDict(
    dict(
        type=f"TIMM_{model_name}",
        pretrained=True,
        out_indices=(1, 2, 3, 4),
    ))
cfg.model.neck.in_channels = [24, 40, 112, 960]
cfg.model.bbox_head.num_classes = len(class_map) - 1  # drop the background entry

model = build_detector(cfg.model)
# print(model)
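# A quick, illustrative check of where `in_channels=[24, 40, 112, 960]` comes
# from, using timm's feature-extraction API directly (assumes timm is
# installed; the plain timm name 'mobilenetv3_large_100' is used here).
import timm

_feat = timm.create_model('mobilenetv3_large_100', features_only=True,
                          out_indices=(1, 2, 3, 4))
print(_feat.feature_info.channels())  # -> [24, 40, 112, 960]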

# ============================================ #

# ============== PL LIGHTNING ADAPTER ================ #


class MobileNetV3Adapter(models.mmdet.retinanet.lightning.ModelAdapter):
    def __init__(self, model, metrics=None):
        # Minimal pass-through constructor sketched here; the parent
        # ModelAdapter's (model, metrics) signature is assumed.
        super().__init__(model=model, metrics=metrics)
Example #30
0
def test_detr_head_loss():
    """Tests transformer head loss when truth is empty and non-empty."""
    s = 256
    img_metas = [{
        'img_shape': (s, s, 3),
        'scale_factor': 1,
        'pad_shape': (s, s, 3),
        'batch_input_shape': (s, s)
    }]
    config = ConfigDict(
        dict(type='DETRHead',
             num_classes=80,
             in_channels=200,
             transformer=dict(
                 type='Transformer',
                 encoder=dict(type='DetrTransformerEncoder',
                              num_layers=6,
                              transformerlayers=dict(
                                  type='BaseTransformerLayer',
                                  attn_cfgs=[
                                      dict(type='MultiheadAttention',
                                           embed_dims=256,
                                           num_heads=8,
                                           dropout=0.1)
                                  ],
                                  feedforward_channels=2048,
                                  ffn_dropout=0.1,
                                  operation_order=('self_attn', 'norm', 'ffn',
                                                   'norm'))),
                 decoder=dict(
                     type='DetrTransformerDecoder',
                     return_intermediate=True,
                     num_layers=6,
                     transformerlayers=dict(
                         type='DetrTransformerDecoderLayer',
                         attn_cfgs=dict(type='MultiheadAttention',
                                        embed_dims=256,
                                        num_heads=8,
                                        dropout=0.1),
                         feedforward_channels=2048,
                         ffn_dropout=0.1,
                         operation_order=('self_attn', 'norm', 'cross_attn',
                                          'norm', 'ffn', 'norm')),
                 )),
             positional_encoding=dict(type='SinePositionalEncoding',
                                      num_feats=128,
                                      normalize=True),
             loss_cls=dict(type='CrossEntropyLoss',
                           bg_cls_weight=0.1,
                           use_sigmoid=False,
                           loss_weight=1.0,
                           class_weight=1.0),
             loss_bbox=dict(type='L1Loss', loss_weight=5.0),
             loss_iou=dict(type='GIoULoss', loss_weight=2.0)))

    self = DETRHead(**config)
    self.init_weights()
    feat = [torch.rand(1, 200, 10, 10)]
    cls_scores, bbox_preds = self.forward(feat, img_metas)
    # Test that empty ground truth encourages the network to predict background
    gt_bboxes = [torch.empty((0, 4))]
    gt_labels = [torch.LongTensor([])]
    gt_bboxes_ignore = None
    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
                                img_metas, gt_bboxes_ignore)
    # When there is no truth, the cls loss should be nonzero but there should
    # be no box loss.
    for key, loss in empty_gt_losses.items():
        if 'cls' in key:
            assert loss.item() > 0, 'cls loss should be non-zero'
        elif 'bbox' in key:
            assert loss.item(
            ) == 0, 'there should be no box loss when there are no true boxes'
        elif 'iou' in key:
            assert loss.item(
            ) == 0, 'there should be no iou loss when there are no true boxes'

    # When truth is non-empty then both cls and box loss should be nonzero for
    # random inputs
    gt_bboxes = [
        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
    ]
    gt_labels = [torch.LongTensor([2])]
    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
                              img_metas, gt_bboxes_ignore)
    for loss in one_gt_losses.values():
        assert loss.item(
        ) > 0, 'cls loss, or box loss, or iou loss should be non-zero'

    # test forward_train
    self.forward_train(feat, img_metas, gt_bboxes, gt_labels)

    # test inference mode
    self.get_bboxes(cls_scores, bbox_preds, img_metas, rescale=True)