Example #1
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out
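Every example on this page funnels its inputs through nested_tensor_from_tensor_list. As a rough illustration of what such a helper produces (pad each [C, H, W] image to the largest H and W in the batch and mark padded pixels with True in the mask, matching the docstring above), here is a simplified, self-contained sketch; it is not the library's exact implementation:

    import torch

    def pad_to_nested(tensor_list):
        """Simplified sketch: pad [C, H, W] tensors to a common size and return
        (batched_images, mask) where mask is True (1) on padded pixels."""
        max_h = max(img.shape[1] for img in tensor_list)
        max_w = max(img.shape[2] for img in tensor_list)
        b, c = len(tensor_list), tensor_list[0].shape[0]
        batched = torch.zeros(b, c, max_h, max_w, dtype=tensor_list[0].dtype)
        mask = torch.ones(b, max_h, max_w, dtype=torch.bool)   # start fully "padded"
        for img, pad_img, m in zip(tensor_list, batched, mask):
            pad_img[:, :img.shape[1], :img.shape[2]].copy_(img)
            m[:img.shape[1], :img.shape[2]] = False            # real pixels are not padded
        return batched, mask

    batched, mask = pad_to_nested([torch.rand(3, 200, 200), torch.rand(3, 200, 250)])
    print(batched.shape, mask.shape)  # torch.Size([2, 3, 200, 250]) torch.Size([2, 200, 250])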
Example #2
    def loss_masks(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the masks: the focal loss and the dice loss.
           targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)
        src_masks = outputs["pred_masks"]
        src_masks = src_masks[src_idx]
        masks = [t["masks"] for t in targets]
        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
        target_masks = target_masks.to(src_masks)
        target_masks = target_masks[tgt_idx]

        # upsample predictions to the target size
        src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
                                mode="bilinear", align_corners=False)
        src_masks = src_masks[:, 0].flatten(1)

        target_masks = target_masks.flatten(1)
        target_masks = target_masks.view(src_masks.shape)
        losses = {
            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
        }
        return losses
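dice_loss and sigmoid_focal_loss are defined elsewhere in the repository; for readers who only have this snippet, here is a minimal sketch of a dice loss consistent with how it is called above (flattened mask logits as inputs, num_boxes as the normalizer). It is an illustration, not necessarily the repo's exact code:

    import torch

    def dice_loss_sketch(inputs, targets, num_boxes):
        """inputs: [num_matched, H*W] raw mask logits; targets: same shape, binary masks."""
        probs = inputs.sigmoid()
        numerator = 2 * (probs * targets).sum(1)
        denominator = probs.sum(1) + targets.sum(1)
        loss = 1 - (numerator + 1) / (denominator + 1)  # +1 smoothing avoids division by zero
        return loss.sum() / num_boxes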
Example #3
    def forward(self, samples: NestedTensor):
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.detr.backbone(samples)

        bs = features[-1].tensors.shape[0]

        src, mask = features[-1].decompose()
        src_proj = self.detr.input_proj(src)
        hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])

        outputs_class = self.detr.class_embed(hs)
        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
        out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
        if self.detr.aux_loss:
            out["aux_outputs"] = [
                {"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
            ]

        # FIXME h_boxes takes the last one computed, keep this in mind
        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

        seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors])
        outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])

        out["pred_masks"] = outputs_seg_masks
        return out
Example #4
 def test_model_script(self):
     model = detr_resnet50(pretrained=False).eval()
     scripted_model = torch.jit.script(model)
     x = nested_tensor_from_tensor_list([torch.rand(3, 200, 200), torch.rand(3, 200, 250)])
     out = model(x)
     out_script = scripted_model(x)
     self.assertTrue(out["pred_logits"].equal(out_script["pred_logits"]))
     self.assertTrue(out["pred_boxes"].equal(out_script["pred_boxes"]))
Example #5
def sample_support_categories(args, targets, support_images, support_class_ids, support_targets):
    """
    This function is used during training. It does the following:
    1. Samples the support categories (total num: args.total_num_support; maximum positive num: args.max_pos_support)
       (Insufficient positive support categories will be replaced with negative support categories.)
    2. Filters ground truths of the query images.
       We only keep ground truths whose labels are sampled as support categories.
    3. Samples and pre-processes support_images, support_class_ids, and support_targets.
    """
    support_images = list(itertools.chain(*support_images))
    support_class_ids = torch.cat(support_class_ids, dim=0).tolist()
    support_targets = list(itertools.chain(*support_targets))

    positive_labels = torch.cat([target['labels'] for target in targets], dim=0).unique()
    num_positive_labels = positive_labels.shape[0]
    positive_labels_list = positive_labels.tolist()
    negative_labels_list = list(set(support_class_ids) - set(positive_labels_list))
    num_negative_labels = len(negative_labels_list)

    positive_label_indexes = [i for i in list(range(len(support_images))) if support_class_ids[i] in positive_labels_list]
    negative_label_indexes = [i for i in list(range(len(support_images))) if support_class_ids[i] in negative_labels_list]

    meta_support_images, meta_support_class_ids, meta_support_targets = list(), list(), list()
    for _ in range(args.episode_num):
        NUM_POS = random.randint(max(0, args.episode_size - num_negative_labels),
                                 min(num_positive_labels, args.episode_size))
        NUM_NEG = args.episode_size - NUM_POS

        # Sample positive support classes: make sure in every episode, there is no repeated category
        while True:
            pos_support_indexes = random.sample(positive_label_indexes, NUM_POS)
            if NUM_POS == len(set([support_class_ids[i] for i in pos_support_indexes])):
                break

        # Sample negative support classes: try our best to ensure in every episode there is no repeated category
        num_trial = 0
        while num_trial < 50:
            neg_support_indexes = random.sample(negative_label_indexes, NUM_NEG)
            if NUM_NEG == len(set([support_class_ids[i] for i in neg_support_indexes])):
                break
            else:
                num_trial += 1

        support_indexes = pos_support_indexes + neg_support_indexes
        random.shuffle(support_indexes)

        selected_support_images = [support_images[i] for i in support_indexes]
        selected_support_class_ids = [support_class_ids[i] for i in support_indexes]
        selected_support_targets = [support_targets[i] for i in support_indexes]

        meta_support_images += selected_support_images
        meta_support_class_ids += selected_support_class_ids
        meta_support_targets += selected_support_targets

    meta_support_images = utils.nested_tensor_from_tensor_list(meta_support_images)
    meta_support_class_ids = torch.tensor(meta_support_class_ids)

    return targets, meta_support_images, meta_support_class_ids, meta_support_targets
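To see what the NUM_POS / NUM_NEG bounds in the sampling loop above amount to, here is a small numeric illustration (the values below are made up, not taken from the repo's defaults):

    import random

    # With episode_size=5, 3 available positive categories and 4 negative ones,
    # NUM_POS is drawn from [max(0, 5 - 4), min(3, 5)] = [1, 3]; NUM_NEG fills the rest.
    episode_size, num_positive_labels, num_negative_labels = 5, 3, 4
    NUM_POS = random.randint(max(0, episode_size - num_negative_labels),
                             min(num_positive_labels, episode_size))
    NUM_NEG = episode_size - NUM_POS
    print(NUM_POS, NUM_NEG)  # e.g. 2 3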
Example #6
    def test_transformer_forward(self):
        backbone = Backbone('resnet50', True, True, False)
        x = nested_tensor_from_tensor_list(
            [torch.rand(3, 200, 200),
             torch.rand(3, 200, 250)])

        out = backbone(x)
        for key, value in out.items():
            print('{} {}'.format(key, value.tensors.shape))
Example #7
    def forward(self, data: dict):
        if self.training:
            self.criterion.initialize_for_single_clip(data['gt_instances'])
        frames = data['imgs']  # list of Tensor.
        outputs = {
            'pred_logits': [],
            'pred_boxes': [],
        }

        track_instances = self._generate_empty_tracks()
        keys = list(track_instances._fields.keys())
        for frame_index, frame in enumerate(frames):
            frame.requires_grad = False
            is_last = frame_index == len(frames) - 1
            if self.use_checkpoint and frame_index < len(frames) - 2:
                def fn(frame, *args):
                    frame = nested_tensor_from_tensor_list([frame])
                    tmp = Instances((1, 1), **dict(zip(keys, args)))
                    frame_res = self._forward_single_image(frame, tmp)
                    return (
                        frame_res['pred_logits'],
                        frame_res['pred_boxes'],
                        frame_res['ref_pts'],
                        frame_res['hs'],
                        *[aux['pred_logits'] for aux in frame_res['aux_outputs']],
                        *[aux['pred_boxes'] for aux in frame_res['aux_outputs']]
                    )

                args = [frame] + [track_instances.get(k) for k in keys]
                params = tuple((p for p in self.parameters() if p.requires_grad))
                tmp = checkpoint.CheckpointFunction.apply(fn, len(args), *args, *params)
                frame_res = {
                    'pred_logits': tmp[0],
                    'pred_boxes': tmp[1],
                    'ref_pts': tmp[2],
                    'hs': tmp[3],
                    'aux_outputs': [{
                        'pred_logits': tmp[4+i],
                        'pred_boxes': tmp[4+5+i],
                    } for i in range(5)],
                }
            else:
                frame = nested_tensor_from_tensor_list([frame])
                frame_res = self._forward_single_image(frame, track_instances)
            frame_res = self._post_process_single_image(frame_res, track_instances, is_last)

            track_instances = frame_res['track_instances']
            outputs['pred_logits'].append(frame_res['pred_logits'])
            outputs['pred_boxes'].append(frame_res['pred_boxes'])

        if not self.training:
            outputs['track_instances'] = track_instances
        else:
            outputs['losses_dict'] = self.criterion.losses_dict
        return outputs
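The fn closure above packs the per-frame outputs into a flat tuple of tensors because the checkpoint function only handles tensor arguments and outputs. As a simpler illustration of the same memory-saving idea, using the standard torch.utils.checkpoint API rather than the custom CheckpointFunction used here:

    import torch
    from torch.utils.checkpoint import checkpoint

    block = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
    x = torch.rand(4, 8, requires_grad=True)
    # Activations inside `block` are recomputed during backward instead of being stored,
    # trading compute for memory (the same motivation as the per-frame checkpointing above).
    # `use_reentrant` requires a recent PyTorch; drop the argument on older versions.
    y = checkpoint(block, x, use_reentrant=False)
    y.sum().backward()
    print(x.grad.shape)  # torch.Size([4, 8])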
Example #8
def get_image(args):
    test_image_raw = Image.open(args.demo_image).convert('RGB')
    transform = T.Compose([
        # T.RandomResize([400], max_size=1333),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    test_image = transform(test_image_raw, target=None)
    test_image = [test_image[0]]
    nested_test_image = nested_tensor_from_tensor_list(test_image)

    return nested_test_image
Example #9
 def fn(frame, *args):
     frame = nested_tensor_from_tensor_list([frame])
     tmp = Instances((1, 1), **dict(zip(keys, args)))
     frame_res = self._forward_single_image(frame, tmp)
     return (
         frame_res['pred_logits'],
         frame_res['pred_boxes'],
         frame_res['ref_pts'],
         frame_res['hs'],
         *[aux['pred_logits'] for aux in frame_res['aux_outputs']],
         *[aux['pred_boxes'] for aux in frame_res['aux_outputs']]
     )
Example #10
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, poses = self.backbone(samples)

        src, mask = features[-1].decompose()

        tensors = []
        masks = []
        for feature in features:
            tensor, mask = feature.decompose()
            tensors.append(tensor)
            masks.append(mask)

        assert mask is not None

        hs, ref_point, _ = self.transformer(tensors, masks, self.query_embed.weight, poses)

        # DL, L, B, d_model -> DL, B, L, d_model
        hs = hs.transpose(1, 2).contiguous()
        # L, B, 2 -> B, L, 2
        ref_point = ref_point.transpose(0, 1).contiguous()

        outputs_class = self.class_embed(hs)

        inversed_ref_point = - torch.log(1 / (ref_point + 1e-10) - 1 + 1e-10)
        outputs_coord = self.bbox_embed(hs)
        outputs_coord[..., 0] = outputs_coord[..., 0] + inversed_ref_point[..., 0]
        outputs_coord[..., 1] = outputs_coord[..., 1] + inversed_ref_point[..., 1]
        outputs_coord = torch.sigmoid(outputs_coord)

        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out
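The expression used for inversed_ref_point above is a numerically stabilized inverse sigmoid (logit), so the box head effectively predicts offsets in logit space relative to the reference point before the final sigmoid. A quick sanity check:

    import torch

    def inverse_sigmoid_sketch(p, eps=1e-10):
        # same form as `- torch.log(1 / (ref_point + 1e-10) - 1 + 1e-10)` above
        return -torch.log(1 / (p + eps) - 1 + eps)

    p = torch.tensor([0.1, 0.5, 0.9])
    print(torch.sigmoid(inverse_sigmoid_sketch(p)))  # tensor([0.1000, 0.5000, 0.9000])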
Example #11
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        # First, convert the samples to a NestedTensor
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        # Feed them through the CNN backbone to extract features
        features, pos = self.backbone(samples)

        # Take the last feature map and its corresponding mask
        src, mask = features[-1].decompose()
        assert mask is not None
        # The query embedding weights are fed into the Transformer forward pass and act as the positional
        # encoding for the object queries. The target embeddings combined with this positional encoding are
        # the objects we want to predict; since we do not yet know what or where they are, they are simply
        # initialized to all zeros with the same shape as the query embedding weights (this can be seen in
        # the Transformer's forward pass). The Transformer returns a tuple (decoder output, encoder output),
        # so indexing [0] selects the decoder output.
        hs = self.transformer(self.input_proj(src), mask,
                              self.query_embed.weight, pos[-1])[0]

        # Project the decoder outputs to the dimensions required by the classification and regression heads,
        # producing the classification and box regression predictions.
        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        # hs contains the outputs of every decoder layer, so index -1 selects the output of the last layer.
        # At test time this still yields num_queries (100 by default) predictions, usually far more than the
        # number of objects actually present in the image. As usual, a confidence threshold is applied and
        # only the query predictions above it are kept and drawn (see the official demo notebook).
        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        # If losses are requested for every decoder layer's predictions, also record those outputs
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)
        return out
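As the comments above note, at inference time only a confidence threshold separates the num_queries predictions from the handful of real objects. A minimal, illustrative post-processing step in the spirit of the official notebook (not the repo's PostProcess class; the shapes and the 0.7 threshold are arbitrary):

    import torch

    logits = torch.randn(1, 100, 92)           # [batch, num_queries, num_classes + 1]
    probas = logits.softmax(-1)[0, :, :-1]     # drop the trailing "no object" class
    scores, labels = probas.max(-1)
    keep = scores > 0.7
    print(int(keep.sum()), "queries kept")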
Example #12
    def forward(self, inputs):
        # P5 (32*32) is the target.
        img_batch = inputs.tensors

        resnest_out = self.resnest(img_batch)

        features = torch.cat([
            self.p0_downsample(resnest_out['0']),
            self.p1_downsample(resnest_out['1']),
            self.p2_downsample(resnest_out['2']), resnest_out['3']
        ], 1)

        features = nested_tensor_from_tensor_list(features)
        pos = self.position_encoding(features).to(features.tensors.dtype)

        return [features], [pos]
Example #13
def benchmark():
    args, _ = get_benckmark_arg_parser().parse_known_args()
    main_args = get_main_args_parser().parse_args(_)
    assert args.warm_iters < args.num_iters and args.num_iters > 0 and args.warm_iters >= 0
    assert args.batch_size > 0
    assert args.resume is None or os.path.exists(args.resume)
    dataset = build_dataset('val', main_args)
    model, _, _ = build_model(main_args)
    model.cuda()
    model.eval()
    if args.resume is not None:
        ckpt = torch.load(args.resume, map_location=lambda storage, loc: storage)
        model.load_state_dict(ckpt['model'])
    inputs = nested_tensor_from_tensor_list([dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)])
    t = measure_average_inference_time(model, inputs, args.num_iters, args.warm_iters)
    return 1.0 / t * args.batch_size
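measure_average_inference_time is defined elsewhere in that repository; here is a sketch of a CUDA-synchronized timing loop with the same signature and intent (an assumption about its behaviour, not a copy of it; it expects the model and inputs to already be on the GPU, as in the benchmark above):

    import time
    import torch

    @torch.no_grad()
    def measure_average_inference_time_sketch(model, inputs, num_iters=100, warm_iters=5):
        timings = []
        for _ in range(num_iters):
            torch.cuda.synchronize()
            start = time.perf_counter()
            model(inputs)
            torch.cuda.synchronize()            # wait for GPU work before stopping the clock
            timings.append(time.perf_counter() - start)
        return sum(timings[warm_iters:]) / len(timings[warm_iters:])  # drop warm-up iterations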
Example #14
 def test_model_detection_different_inputs(self):
     model = detr_resnet50(pretrained=False).eval()
     # support NestedTensor
     x = nested_tensor_from_tensor_list(
         [torch.rand(3, 200, 200),
          torch.rand(3, 200, 250)])
     out = model(x)
     self.assertIn('pred_logits', out)
     # and 4d Tensor
     x = torch.rand(1, 3, 200, 200)
     out = model(x)
     self.assertIn('pred_logits', out)
     # and List[Tensor[C, H, W]]
     x = torch.rand(3, 200, 200)
     out = model([x])
     self.assertIn('pred_logits', out)
Example #15
    def forward(self, samples, postprocessors=None, targets=None, criterion=None):
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)

        num = self.args.layer1_num
        src, mask = features[num].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[num])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.lines_embed(hs).sigmoid()
        out = {'pred_logits': outputs_class[-1], 'pred_lines': outputs_coord[-1]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        return out
Example #16
    def inference_single_image(self, img, ori_img_size, track_instances=None):
        if not isinstance(img, NestedTensor):
            img = nested_tensor_from_tensor_list(img)
        if track_instances is None:
            track_instances = self._generate_empty_tracks()
        res = self._forward_single_image(img, track_instances=track_instances)

        track_instances = res['track_instances']
        track_instances = self.post_process(track_instances, ori_img_size)
        ret = {'track_instances': track_instances}
        if 'ref_pts' in res:
            ref_pts = res['ref_pts']
            img_h, img_w = ori_img_size
            scale_fct = torch.Tensor([img_w, img_h]).to(ref_pts)
            ref_pts = ref_pts * scale_fct[None]
            ret['ref_pts'] = ref_pts
        return ret
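The ref_pts rescaling above multiplies normalized coordinates by the original image size; the predicted boxes go through the same kind of denormalization in post-processing. For reference, a small sketch converting normalized (cx, cy, w, h) boxes to pixel (x0, y0, x1, y1) boxes (illustrative, not the repo's post_process):

    import torch

    def boxes_to_pixels_sketch(boxes_cxcywh, img_h, img_w):
        cx, cy, w, h = boxes_cxcywh.unbind(-1)
        boxes_xyxy = torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                                  cx + 0.5 * w, cy + 0.5 * h], dim=-1)
        scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=boxes_xyxy.dtype)
        return boxes_xyxy * scale

    print(boxes_to_pixels_sketch(torch.tensor([[0.5, 0.5, 0.2, 0.4]]), 480, 640))
    # tensor([[256., 144., 384., 336.]])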
Example #17
    def forward(self, samples: NestedTensor):
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.vistr.backbone(samples)
        bs = features[-1].tensors.shape[0]
        src, mask = features[-1].decompose()
        assert mask is not None
        src_proj = self.vistr.input_proj(src)
        n,c,s_h,s_w = src_proj.shape
        bs_f = bs//self.vistr.num_frames
        src_proj = src_proj.reshape(bs_f, self.vistr.num_frames,c, s_h, s_w).permute(0,2,1,3,4).flatten(-2)
        mask = mask.reshape(bs_f, self.vistr.num_frames, s_h*s_w)
        pos = pos[-1].permute(0,2,1,3,4).flatten(-2)
        hs, memory = self.vistr.transformer(src_proj, mask, self.vistr.query_embed.weight, pos)
        outputs_class = self.vistr.class_embed(hs)
        outputs_coord = self.vistr.bbox_embed(hs).sigmoid()
        out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
        if self.vistr.aux_loss:
            out['aux_outputs'] = self.vistr._set_aux_loss(outputs_class, outputs_coord)
        for i in range(3):
            _,c_f,h,w = features[i].tensors.shape
            features[i].tensors = features[i].tensors.reshape(bs_f, self.vistr.num_frames, c_f, h,w)
        n_f = self.vistr.num_queries//self.vistr.num_frames
        outputs_seg_masks = []
        
        # image level processing using box attention
        for i in range(self.vistr.num_frames):
            hs_f = hs[-1][:,i*n_f:(i+1)*n_f,:]
            memory_f = memory[:,:,i,:].reshape(bs_f, c, s_h,s_w)
            mask_f = mask[:,i,:].reshape(bs_f, s_h,s_w)
            bbox_mask_f = self.bbox_attention(hs_f, memory_f, mask=mask_f)
            seg_masks_f = self.mask_head(memory_f, bbox_mask_f, [features[2].tensors[:,i], features[1].tensors[:,i], features[0].tensors[:,i]])
            outputs_seg_masks_f = seg_masks_f.view(bs_f, n_f, 24, seg_masks_f.shape[-2], seg_masks_f.shape[-1])
            outputs_seg_masks.append(outputs_seg_masks_f)
        frame_masks = torch.cat(outputs_seg_masks,dim=0)
        outputs_seg_masks = []

        # instance level processing using 3D convolution
        for i in range(frame_masks.size(1)):
            mask_ins = frame_masks[:,i].unsqueeze(0)
            mask_ins = mask_ins.permute(0,2,1,3,4)
            outputs_seg_masks.append(self.insmask_head(mask_ins))
        outputs_seg_masks = torch.cat(outputs_seg_masks,1).squeeze(0).permute(1,0,2,3)
        outputs_seg_masks = outputs_seg_masks.reshape(1,360,outputs_seg_masks.size(-2),outputs_seg_masks.size(-1))
        out["pred_masks"] = outputs_seg_masks
        return out
Example #18
    def forward(self, tensor_list: torch.Tensor):
        self.batch_size = tensor_list.shape[0]
        self.sequnce_length = tensor_list.shape[1]
        tensor_list = tensor_list.reshape(
            (self.batch_size * self.sequnce_length, tensor_list.shape[2],
             tensor_list.shape[3], tensor_list.shape[4]))
        tensor_list = nested_tensor_from_tensor_list(tensor_list)

        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out
Example #19
    def forward(self, x):
        """
        x: a batch of images of shape [batch_size, channel_count, height, width]
        """
        l_x = [x[i] for i in range(x.shape[0])]
        sample = nested_tensor_from_tensor_list(l_x)
        output = self.model(sample)
        image_sizes = torch.zeros([len(l_x), 2]).cpu()
        i = 0
        for x in l_x:
            image_sizes[i][0] = x.shape[1]
            image_sizes[i][1] = x.shape[2]
            i += 1

        # converting detr to torchvision detection format
        processed_output = self.pp(output['pred_logits'], output['pred_boxes'],
                                   image_sizes)
        return processed_output
Example #20
    def forward(self, samples: NestedTensor):
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask,
                              self.query_embed.weight, pos[-1])[0]

        outputs_obj_class = self.obj_class_embed(hs)
        outputs_verb_class = self.verb_class_embed(hs)
        outputs_sub_coord = self.sub_bbox_embed(hs).sigmoid()
        outputs_obj_coord = self.obj_bbox_embed(hs).sigmoid()
        out = {
            'pred_obj_logits': outputs_obj_class[-1],
            'pred_verb_logits': outputs_verb_class[-1],
            'pred_sub_boxes': outputs_sub_coord[-1],
            'pred_obj_boxes': outputs_obj_coord[-1]
        }
        return out
Example #21
    def forward(self,
                samples,
                postprocessors=None,
                targets=None,
                criterion=None):
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)

        # backbone
        features, pos = self.letr.backbone(samples)

        # layer 1
        l1_num = self.args.layer1_num
        src1, mask1 = features[l1_num].decompose()
        assert mask1 is not None

        # layer 1 transformer
        hs1, _ = self.letr.transformer(self.letr.input_proj(src1), mask1,
                                       self.letr.query_embed.weight,
                                       pos[l1_num])

        # layer 2
        l2_num = self.args.layer2_num
        src2, mask2 = features[l2_num].decompose()
        src2 = self.input_proj(src2)

        # layer 2 transformer
        hs2, memory, _ = self.transformer(src2, mask2, hs1[-1], pos[l2_num])

        outputs_class = self.class_embed(hs2)
        outputs_coord = self.lines_embed(hs2).sigmoid()
        out = {}
        out["pred_logits"] = outputs_class[-1]
        out["pred_lines"] = outputs_coord[-1]

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        return out, None
Example #22
    def run_model(self,
                  model,
                  inputs_list,
                  tolerate_small_mismatch=False,
                  do_constant_folding=True,
                  dynamic_axes=None,
                  output_names=None,
                  input_names=None):
        model.eval()

        onnx_io = io.BytesIO()
        onnx_path = "detr.onnx"

        # export to onnx with the first input
        torch.onnx.export(model,
                          inputs_list[0],
                          onnx_io,
                          input_names=input_names,
                          output_names=output_names,
                          export_params=True,
                          training=False)
        torch.onnx.export(model,
                          inputs_list[0],
                          onnx_path,
                          input_names=input_names,
                          output_names=output_names,
                          export_params=True,
                          training=False)
        # validate the exported model with onnx runtime
        for test_inputs in inputs_list:
            with torch.no_grad():
                if isinstance(test_inputs, torch.Tensor) or isinstance(
                        test_inputs, list):
                    test_inputs = (
                        nested_tensor_from_tensor_list(test_inputs), )
                test_outputs = model(*test_inputs)
                if isinstance(test_outputs, torch.Tensor):
                    test_outputs = (test_outputs, )
            self.ort_validate(onnx_io, test_inputs, test_outputs,
                              tolerate_small_mismatch)
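ort_validate is not shown here; the following is a sketch of the kind of onnxruntime comparison such a helper presumably performs, assuming plain tensor inputs and outputs (the NestedTensor inputs above would need to be decomposed into tensors first):

    import io
    import numpy as np
    import onnxruntime

    def ort_validate_sketch(onnx_io: io.BytesIO, inputs, outputs, rtol=1e-3, atol=1e-5):
        """Run the exported graph with onnxruntime and compare against the eager outputs."""
        sess = onnxruntime.InferenceSession(onnx_io.getvalue())
        feed = {i.name: t.cpu().numpy() for i, t in zip(sess.get_inputs(), inputs)}
        ort_outputs = sess.run(None, feed)
        for torch_out, ort_out in zip(outputs, ort_outputs):
            np.testing.assert_allclose(torch_out.cpu().numpy(), ort_out, rtol=rtol, atol=atol)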
Example #23
 def loss_masks(self, outputs, targets, indices, num_boxes):
     """Compute the losses related to the masks: the focal loss and the dice loss.
        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
     """
     assert 'pred_masks' in outputs
     src_idx = self._get_src_permutation_idx(indices)
     tgt_idx = self._get_tgt_permutation_idx(indices)
     src_masks = outputs['pred_masks']
     src_masks = src_masks[src_idx]
     masks = [t['masks'] for t in targets]
     target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
     target_masks = target_masks.to(src_masks)
     target_masks = target_masks[tgt_idx]
      src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
                              mode='bilinear', align_corners=False)
      src_masks = src_masks[:, 0].flatten(1)
      target_masks = target_masks.flatten(1)
      target_masks = target_masks.view(src_masks.shape)
      losses = {
          'loss_mask': sigmoid_focal_loss(src_masks, target_masks, num_boxes),
          'loss_dice': dice_loss(src_masks, target_masks, num_boxes),
      }
     return losses
Example #24
    def forward(self, samples: NestedTensor):
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.detr.backbone(samples)

        bs = features[-1].tensors.shape[0]

        src, mask = features[-1].decompose()
        assert mask is not None
        src_proj = self.detr.input_proj(src)
        # to align with UP-DETR, we add self.detr.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) here.
        hs, memory = self.detr.transformer(
            src_proj, mask,
            self.detr.query_embed.weight.unsqueeze(1).repeat(1, bs, 1),
            pos[-1])

        outputs_class = self.detr.class_embed(hs)
        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
        out = {
            "pred_logits": outputs_class[-1],
            "pred_boxes": outputs_coord[-1]
        }
        if self.detr.aux_loss:
            out['aux_outputs'] = self.detr._set_aux_loss(
                outputs_class, outputs_coord)

        # FIXME h_boxes takes the last one computed, keep this in mind
        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

        seg_masks = self.mask_head(
            src_proj, bbox_mask,
            [features[2].tensors, features[1].tensors, features[0].tensors])
        outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries,
                                           seg_masks.shape[-2],
                                           seg_masks.shape[-1])

        out["pred_masks"] = outputs_seg_masks
        return out
Example #25
    def forward(self, data: dict):
        if self.training:
            self.criterion.initialize_for_single_clip(data['gt_instances'])
        frames = data['imgs']  # list of Tensor.
        outputs = {
            'pred_logits': [],
            'pred_boxes': [],
        }

        track_instances = self._generate_empty_tracks()
        for frame in frames:
            if not isinstance(frame, NestedTensor):
                frame = nested_tensor_from_tensor_list([frame])
            frame_res = self._forward_single_image(frame, track_instances)
            track_instances = frame_res['track_instances']
            outputs['pred_logits'].append(frame_res['pred_logits'])
            outputs['pred_boxes'].append(frame_res['pred_boxes'])

        if not self.training:
            outputs['track_instances'] = track_instances
        else:
            outputs['losses_dict'] = self.criterion.losses_dict
        return outputs
Example #26
 def forward(self, inputs: List[Tensor]):
     sample = nested_tensor_from_tensor_list(inputs)
     return self.model(sample)
Example #27
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        src_4x, mask_4x = features[0].decompose()
        src_8x, mask_8x = features[1].decompose()
        src_16x, mask_16x = features[2].decompose()
        src_list = [src_4x, src_8x, src_16x]
        mask_list = [mask_4x, mask_8x, mask_16x]
        pos_embed_list = [pos[0], pos[1], pos[2]]

        if int(os.environ.get("encoder_high_resolution", 0)):
            if int(os.environ.get("encoder_resolution", 8)) == 4:
                _, _, h_4x, w_4x = src_list[0].size()
                feat1 = src_list[0]
                feat2 = F.interpolate(src_list[1],
                                      size=(h_4x, w_4x),
                                      mode="bilinear",
                                      align_corners=True)
                feat3 = F.interpolate(src_list[2],
                                      size=(h_4x, w_4x),
                                      mode="bilinear",
                                      align_corners=True)
                feat4 = F.interpolate(src,
                                      size=(h_4x, w_4x),
                                      mode="bilinear",
                                      align_corners=True)
                feats = torch.cat([feat1, feat2, feat3, feat4], 1)
                hs = self.transformer(self.input_proj(feats), mask_4x,
                                      self.query_embed.weight, pos[0],
                                      src_list, mask_list, pos_embed_list)[0]
            elif int(os.environ.get("encoder_resolution", 8)) == 8:
                _, _, h_8x, w_8x = src_list[1].size()
                feat1 = F.interpolate(src_list[0],
                                      size=(h_8x, w_8x),
                                      mode="bilinear",
                                      align_corners=True)
                feat2 = src_list[1]
                feat3 = F.interpolate(src_list[2],
                                      size=(h_8x, w_8x),
                                      mode="bilinear",
                                      align_corners=True)
                feat4 = F.interpolate(src,
                                      size=(h_8x, w_8x),
                                      mode="bilinear",
                                      align_corners=True)
                feats = torch.cat([feat1, feat2, feat3, feat4], 1)
                hs = self.transformer(self.input_proj(feats), mask_8x,
                                      self.query_embed.weight, pos[1],
                                      src_list, mask_list, pos_embed_list)[0]
            elif int(os.environ.get("encoder_resolution", 8)) == 16:
                _, _, h_16x, w_16x = src_list[2].size()
                feat1 = F.interpolate(src_list[0],
                                      size=(h_16x, w_16x),
                                      mode="bilinear",
                                      align_corners=True)
                feat2 = F.interpolate(src_list[1],
                                      size=(h_16x, w_16x),
                                      mode="bilinear",
                                      align_corners=True)
                feat3 = src_list[2]
                feat4 = F.interpolate(src,
                                      size=(h_16x, w_16x),
                                      mode="bilinear",
                                      align_corners=True)
                feats = torch.cat([feat1, feat2, feat3, feat4], 1)
                hs = self.transformer(self.input_proj(feats), mask_16x,
                                      self.query_embed.weight, pos[2],
                                      src_list, mask_list, pos_embed_list)[0]
            else:
                _, _, h_32x, w_32x = src.size()
                feat1 = F.interpolate(src_list[0],
                                      size=(h_32x, w_32x),
                                      mode="bilinear",
                                      align_corners=True)
                feat2 = F.interpolate(src_list[1],
                                      size=(h_32x, w_32x),
                                      mode="bilinear",
                                      align_corners=True)
                feat3 = F.interpolate(src_list[2],
                                      size=(h_32x, w_32x),
                                      mode="bilinear",
                                      align_corners=True)
                feat4 = src
                feats = torch.cat([feat1, feat2, feat3, feat4], 1)
                hs = self.transformer(self.input_proj(feats), mask,
                                      self.query_embed.weight, pos[-1],
                                      src_list, mask_list, pos_embed_list)[0]
        else:
            hs = self.transformer(self.input_proj(src), mask,
                                  self.query_embed.weight, pos[-1], src_list,
                                  mask_list, pos_embed_list)[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)
        return out
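The branches above differ only in which resolution the other feature maps are interpolated to before channel-wise concatenation. A small, self-contained shape check of that fusion pattern (the feature shapes below are made up for illustration):

    import torch
    import torch.nn.functional as F

    p4 = torch.rand(1, 256, 100, 100)    # hypothetical 4x-stride feature map
    p8 = torch.rand(1, 512, 50, 50)      # hypothetical 8x-stride feature map
    p16 = torch.rand(1, 1024, 25, 25)    # hypothetical 16x-stride feature map
    up8 = F.interpolate(p8, size=p4.shape[-2:], mode="bilinear", align_corners=True)
    up16 = F.interpolate(p16, size=p4.shape[-2:], mode="bilinear", align_corners=True)
    feats = torch.cat([p4, up8, up16], dim=1)
    print(feats.shape)  # torch.Size([1, 1792, 100, 100])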
Example #28
    def forward(self, samples: NestedTensor, pre_embed=None):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        assert not self.training, 'this forward is for inference mode only'
        assert samples.tensors.shape[0] == 1, 'track only supports batch 1'
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        if pre_embed is not None:
            pre_feat = pre_embed['feat']
        else:
            pre_feat = features

        srcs = []
        masks = []

        for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
            src, mask = feat.decompose()
            src2, _ = feat2.decompose()
            srcs.append(
                self.combine(
                    torch.cat(
                        [self.input_proj[l](src), self.input_proj[l](src2)],
                        dim=1)))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.combine(
                        torch.cat([
                            self.input_proj[l](features[-1].tensors),
                            self.input_proj[l](pre_feat[-1].tensors)
                        ],
                                  dim=1))
                else:
                    src = self.input_proj[l](srcs[-1])

                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        # detection mode
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, memory = self.transformer(
            srcs, masks, pos, query_embeds)
        cur_hs = hs
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        cur_class = outputs_class[-1]
        cur_box = outputs_coord[-1]
        cur_reference = cur_box
        cur_tgt = cur_hs[-1]

        if pre_embed is not None:
            # track mode
            pre_reference, pre_tgt = pre_embed['reference'], pre_embed['tgt']

            hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = self.transformer(
                srcs, masks, pos, query_embeds, pre_reference, pre_tgt, memory)
            outputs_classes = []
            outputs_coords = []
            for lvl in range(hs.shape[0]):
                if lvl == 0:
                    reference = init_reference
                else:
                    reference = inter_references[lvl - 1]
                reference = inverse_sigmoid(reference)
                outputs_class = self.class_embed[lvl](hs[lvl])
                tmp = self.bbox_embed[lvl](hs[lvl])
                if reference.shape[-1] == 4:
                    tmp += reference
                else:
                    assert reference.shape[-1] == 2
                    tmp[..., :2] += reference
                outputs_coord = tmp.sigmoid()
                outputs_classes.append(outputs_class)
                outputs_coords.append(outputs_coord)
            outputs_class = torch.stack(outputs_classes)
            outputs_coord = torch.stack(outputs_coords)

            pre_class, pre_box = outputs_class[-1], outputs_coord[-1]

        else:
            pre_class, pre_box = cur_class, cur_box

        out = {
            'pred_logits': cur_class,
            'pred_boxes': cur_box,
            'tracking_logits': pre_class,
            'tracking_boxes': pre_box
        }

        pre_embed = {
            'reference': cur_reference,
            'tgt': cur_tgt,
            'feat': features
        }

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out, pre_embed
Example #29
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)
        # print('sample: ', samples.tensors.shape)
        # print(len(features))
        # for ft in features:
        #     print(ft.tensors.shape)
        # exit()
        fpn_input = [features[i] for i in [0, 1, 3]]
        # for ft in fpn_input:
        #     print(ft.tensors.shape)
        # exit()
        # fpn_features = self.fpn(features)
        fpn_features = self.fpn(fpn_input)
        # print("fpn:")
        # for ft in fpn_features:
        #     print('[{}]: {}'.format(ft, fpn_features[ft].shape))
        # exit()

        src, mask = features[-1].decompose()
        # print("flatten features:")
        # print(src.shape)
        # exit()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask,
                              self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()

        # roi_input = [
        #     {'p5': features[0].tensors},
        #     outputs_coord[-1]
        #     ]
        roi_input = [{ft: fpn_features[ft]
                      for ft in fpn_features}, outputs_coord[-1]]
        # for ft in roi_input[0]:
        #     print('[{}]: {}'.format(ft, fpn_features[ft].shape))
        # exit()
        # roi_output = self.roi_head(*roi_input)
        roi_output = nn.Softmax(dim=2)(self.roi_head(*roi_input))

        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        # print("detr: ", torch.max(out['pred_logits'][0], dim=1))
        # print("roi: ", torch.max(roi_output, dim=2))
        # out['pred_logits'] = nn.Softmax(dim=2)(out['pred_logits'] * roi_output)
        out['pred_logits'] = nn.Softmax(dim=2)(out['pred_logits'] +
                                               self.roi_weight * roi_output)

        return out
Example #30
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape = [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": the normalized box coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding boxes.
               - "aux_outputs": optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        valid_ratio = None
        if self.accurate_ratio:
            valid_ratio = self._get_valid_ratio(samples.mask)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds, valid_ratio=valid_ratio)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)
        if not self.two_stage:
            ref_pts = torch.cat(
                [init_reference[None], inter_references[:, :, :, :2]])
            out = {
                'pred_logits': outputs_class[-1],
                'pred_boxes': outputs_coord[-1],
                'ref_pts': ref_pts,
                'logits_all': outputs_class,
                'boxes_all': outputs_coord
            }
        else:
            out = {
                'pred_logits': outputs_class[-1],
                'pred_boxes': outputs_coord[-1]
            }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out