Code example #1
    def support_encoding_net(self, x, return_interm_layers=False):
        out: Dict[str, NestedTensor] = {}
        m = x.mask
        # x = self.meta_conv(x.tensors)
        x = self.backbone.conv1(x.tensors)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        if return_interm_layers:
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]
            out['0'] = NestedTensor(x, mask)

        x = self.backbone.layer3(x)
        if return_interm_layers:
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]
            out['1'] = NestedTensor(x, mask)

        x = self.backbone.layer4(x)
        if return_interm_layers:
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]
            out['2'] = NestedTensor(x, mask)

        if return_interm_layers:
            return out
        else:
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]
            out['0'] = NestedTensor(x, mask)
            return out
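All of these snippets revolve around the same small container. As a reference point, here is a condensed sketch of NestedTensor in the spirit of util/misc.py from the DETR-family repositories (a paraphrase, not the verbatim source):

from typing import Optional
import torch

class NestedTensor:
    """A padded image batch together with the boolean padding mask that belongs to it."""

    def __init__(self, tensors: torch.Tensor, mask: Optional[torch.Tensor]):
        self.tensors = tensors   # [B, C, H, W], images padded to a common size
        self.mask = mask         # [B, H, W] bool, True on padded pixels

    def decompose(self):
        return self.tensors, self.mask

    def to(self, device):
        mask = self.mask.to(device) if self.mask is not None else None
        return NestedTensor(self.tensors.to(device), mask)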
Code example #2
File: backbone.py  Project: qgh1223/Deformable-DETR
    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        m = tensor_list.mask
        assert m is not None

        for name, x in xs.items():

            # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]

            if name == '0':
                scale_map = self.c3_conv(x)
            elif name == '1':
                scale_map = self.c4_conv(x)
            else:
                scale_map = self.c5_conv(x)
            mask = F.interpolate(m[None].float(),
                                 size=scale_map.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(scale_map, mask)

        c6 = self.c6_conv(xs['2'])
        mask = F.interpolate(m[None].float(),
                             size=c6.shape[-2:]).to(torch.bool)[0]
        out['3'] = NestedTensor(c6, mask)

        return out
Code example #3
 def forward(self, tensor_list):
     xs = self.body(tensor_list.tensors)
     out = OrderedDict()
     for name, x in xs.items():
         mask = F.interpolate(tensor_list.mask[None].float(), size=x.shape[-2:]).bool()[0]
         out[name] = NestedTensor(x, mask)
     return out
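Code examples #2, #3 and most of the backbone wrappers below repeat the same idiom for shrinking the padding mask to a feature map's resolution. A minimal, self-contained sketch of that step (the tensor names are illustrative, not taken from any of the projects):

import torch
import torch.nn.functional as F

padding_mask = torch.zeros(2, 64, 64, dtype=torch.bool)   # True marks padded pixels
padding_mask[:, :, 48:] = True                            # pretend the right strip is padding
feature = torch.randn(2, 256, 8, 8)                       # a backbone feature map

# [B, H, W] -> [1, B, H, W] so F.interpolate sees a channel dimension, then back to [B, h, w]
mask = F.interpolate(padding_mask[None].float(), size=feature.shape[-2:]).to(torch.bool)[0]
print(mask.shape)   # torch.Size([2, 8, 8])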
Code example #4
File: detr.py  Project: xieenze/detr
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
        """
        if not isinstance(samples, NestedTensor):
            samples = NestedTensor.from_tensor_list(samples)
        features, pos = self.backbone(samples)

        src, mask = features[-1].decompose()
        hs = self.transformer(self.input_proj(src), mask,
                              self.query_embed.weight, pos[-1])[0]

        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = [{
                'pred_logits': a,
                'pred_boxes': b
            } for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
        return out
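Several later snippets call self._set_aux_loss instead of spelling out the list comprehension used above. In the reference DETR code that helper is essentially the same comprehension; a sketch:

import torch

@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_coord):
    # every decoder layer except the last, which is already reported as the main output
    return [{'pred_logits': a, 'pred_boxes': b}
            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]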
Code example #5
    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():

            if 'layer' + name not in self.return_layers:
                continue

            #print(name, ", ", x.shape)
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]

            # TODO: workaround to avoid NaN of attention calculation because of a full "True" mask
            invalid_indices = (torch.logical_not(mask).sum(
                dim=[1, 2]) == 0).nonzero().squeeze(-1)
            if len(invalid_indices) > 0:
                #print("workaround to avoid NaN for {}".format(invalid_indices))
                mask[invalid_indices] = torch.zeros(x.shape[-2:],
                                                    dtype=torch.bool,
                                                    device=mask.device)

            out[name] = NestedTensor(x, mask)
        return out, xs
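The workaround in code example #5 exists because a padding mask that is True everywhere leaves the attention with no valid key: every logit becomes -inf and the softmax over that row is NaN. A minimal illustration (not taken from the project):

import torch
import torch.nn.functional as F

logits = torch.zeros(1, 4)
key_padding_mask = torch.ones(1, 4, dtype=torch.bool)        # every key marked as padding
masked = logits.masked_fill(key_padding_mask, float('-inf'))
print(F.softmax(masked, dim=-1))                             # tensor([[nan, nan, nan, nan]])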
Code example #6
File: position_embedding.py  Project: yoshilab/detr
def main():
    embedding = PositionEmbeddingSine(num_pos_feats=128)
    images = torch.rand((1, 3, 64, 64)).type(torch.FloatTensor)
    masks = torch.rand((1, 64, 64)).type(torch.LongTensor)
    inputs = NestedTensor(images, masks)
    pos = embedding(inputs)
    print(pos.size())
Code example #7
File: detr.py  Project: xieenze/detr
    def loss_masks(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the masks: the focal loss and the dice loss.
           targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)

        src_masks = outputs["pred_masks"]

        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, valid = NestedTensor.from_tensor_list(
            [t["masks"] for t in targets]).decompose()
        target_masks = target_masks.to(src_masks)

        src_masks = src_masks[src_idx]
        # upsample predictions to the target size
        src_masks = interpolate(src_masks[:, None],
                                size=target_masks.shape[-2:],
                                mode="bilinear",
                                align_corners=False)
        src_masks = src_masks[:, 0].flatten(1)

        target_masks = target_masks[tgt_idx].flatten(1)

        losses = {
            "loss_mask": sigmoid_focal_loss(src_masks, target_masks,
                                            num_boxes),
            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
        }
        return losses
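Code example #7 delegates to sigmoid_focal_loss and dice_loss. For context, a sketch of the dice loss in the style of the DETR segmentation code (inputs are raw logits, one row per predicted mask):

def dice_loss(inputs, targets, num_boxes):
    # inputs: [N, HW] logits, targets: [N, HW] binary masks
    inputs = inputs.sigmoid()
    inputs = inputs.flatten(1)                 # no-op if already flattened, as in example #7
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes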
Code example #8
File: backbone.py  Project: askintution/detr
 def forward(self, tensor_list: NestedTensor):
     xs = self.body(tensor_list.tensors)
     out: Dict[str, NestedTensor] = {}
     for name, x in xs.items():
         m = tensor_list.mask
         assert m is not None
         mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
         out[name] = NestedTensor(x, mask)
     return out
Code example #9
    def _forward_single_image(self, samples, track_instances: Instances):
        features, pos = self.backbone(samples)
        src, mask = features[-1].decompose()
        assert mask is not None

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(srcs, masks, pos, track_instances.query_pos, ref_pts=track_instances.ref_pts)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        ref_pts_all = torch.cat([init_reference[None], inter_references[:, :, :, :2]], dim=0)
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'ref_pts': ref_pts_all[5]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        out['hs'] = hs[-1]
        return out
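inverse_sigmoid, used above to bring the reference points back into logit space before adding the predicted box deltas, is a small clamped logit function in the Deformable-DETR utilities; roughly:

import torch

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)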
Code example #10
File: detr.py  Project: huaxiangwangman/detr_paddle
 def forward(self, images):
     features = self.backbone(images.tensor)
     masks = self.mask_out_padding([
         features_per_level.shape
         for features_per_level in features.values()
     ], images.image_sizes, images.tensor.device)
     assert len(features) == len(masks)
     for i, k in enumerate(features.keys()):
         features[k] = NestedTensor(features[k], masks[i])
     return features
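mask_out_padding is not shown in code example #10. In the detectron2-style DETR wrappers it builds one boolean mask per feature level, marking everything outside the true image extent as padding. A rough sketch under that assumption (self.feature_strides, holding one stride per level, is a hypothetical attribute here):

import math
import torch

def mask_out_padding(self, feature_shapes, image_sizes, device):
    masks = []
    for level, (N, _, H, W) in enumerate(feature_shapes):
        stride = self.feature_strides[level]              # hypothetical: stride of this level
        mask = torch.ones((N, H, W), dtype=torch.bool, device=device)
        for img_idx, (h, w) in enumerate(image_sizes):
            # pixels covered by the real (unpadded) image become False, i.e. valid
            mask[img_idx, :math.ceil(h / stride), :math.ceil(w / stride)] = False
        masks.append(mask)
    return masks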
Code example #11
 def forward(self, tensor_list):
     xs = self.body(tensor_list.tensors)
     if not self.interm:
         xs = [xs[self.main_layer]]
     out = OrderedDict()
     for i, x in enumerate(xs):
         mask = F.interpolate(tensor_list.mask[None].float(),
                              size=x.shape[-2:]).bool()[0]
         out[f"layer{i}"] = NestedTensor(x, mask)
     return out
Code example #12
File: backbone.py  Project: umd-fire-coml/detr
 def forward(self, tensor_list: NestedTensor):
     xs = self.body(tensor_list.tensors)  # get the tensor (image features) from intermediate layers
     out: Dict[str, NestedTensor] = {} 
     for name, x in xs.items():  # for each intermediate layer tensor
         m = tensor_list.mask 
         assert m is not None
         # scale the mask to the size of the intermediate layer tensor
         mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]  
         out[name] = NestedTensor(x, mask)
     return out
Code example #13
File: backbone.py  Project: wuyuebupt/detr
 def forward(self, tensor_list):
     xs = self.body(tensor_list.tensors)
     out = OrderedDict()
     # print ("backbone part")
     for name, x in xs.items():
         mask = F.interpolate(tensor_list.mask[None].float(),
                              size=x.shape[-2:]).bool()[0]
         out[name] = NestedTensor(x, mask)
         # print ("backbone feature shape: ", x.size())
         # print ("mask:", mask.size())
     return out
Code example #14
    def forward(self, samples: NestedTensor, pre_embed=None):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
        """
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)

        # detection only.
        if self.track_on and pre_embed is None:
            samples.tensor = samples.tensor[:, 3:, :, :]  # det on pre frame.

        # backbone features.
        features, pos = self.backbone(samples)
        src, mask = features[self.index_feedforward].decompose()
        assert mask is not None

        # embedding features.
        hs = self.transformer(self.input_proj(src),
                              mask,
                              self.query_embed.weight,
                              pos[self.index_feedforward],
                              tgt=pre_embed)[0]

        # individual branch.
        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        outputs_track = None
        if self.track_on:
            outputs_track = self.track_embed(hs)
            out['pred_tracks'] = outputs_track[-1]
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord,
                                                    outputs_track)

        # pre embed.
        pre_embed = hs[-1].detach()

        return out, pre_embed
Code example #15
 def forward(self, tensor_list: NestedTensor):
     xs = self.backbone.forward_features(
         tensor_list.tensors
     )  # NOTE tensor_list.tensors is merged tensors (padded).
     out: Dict[str, NestedTensor] = {}
     for name, x in xs.items():
         m = tensor_list.mask
         assert m is not None
         mask = F.interpolate(m[None].float(),
                              size=x.shape[-2:]).to(torch.bool)[0]
         out[name] = NestedTensor(x, mask)
     return out  # Returns a dict of NestedTensors, containing the features and corresponding (interpolated) masks.
Code example #16
File: backbone.py  Project: GUOShuxuan/detr
 def forward(self, tensor_list: NestedTensor):
     xs = self.body(tensor_list.tensors)  #torch.Size([2, 256, 38, 60])
     # IPython.embed()
     xs = {'0': xs}
     out: Dict[str, NestedTensor] = {}
     for name, x in xs.items():
         m = tensor_list.mask
         assert m is not None
         mask = F.interpolate(m[None].float(),
                              size=x.shape[-2:]).to(torch.bool)[0]
         out[name] = NestedTensor(x, mask)  # torch.Size([2, 256, 38, 60])
     # IPython.embed()
     return out
Code example #17
 def forward(self, tensor_list):
     """supports both NestedTensor and torch.Tensor
     """
     if isinstance(tensor_list, NestedTensor):
         xs = self.body(tensor_list.tensors)
         out: Dict[str, NestedTensor] = {}
         for name, x in xs.items():
             m = tensor_list.mask
             assert m is not None
             mask = F.interpolate(m[None].float(),
                                  size=x.shape[-2:]).to(torch.bool)[0]
             out[name] = NestedTensor(x, mask)
     else:
         out = self.body(tensor_list)
     return out
Code example #18
File: backbone.py  Project: GUOShuxuan/detr
 def forward(self, tensor_list: NestedTensor):
     xs = self.body(tensor_list.tensors)
     # input:  torch.Size([2, 3, 604, 960])
     #xs['0'].size(): torch.Size([2, 2048, 19, 30]) 'orderdict'
     # IPython.embed()
     out: Dict[str, NestedTensor] = {}
     for name, x in xs.items():
         m = tensor_list.mask
         assert m is not None
         mask = F.interpolate(m[None].float(),
                              size=x.shape[-2:]).to(torch.bool)[0]
         out[name] = NestedTensor(
             x, mask
         )  #x.size():torch.Size([2, 2048, 19, 30]) mask.size():[2, 19, 30])
     # IPython.embed()
     return out
Code example #19
    def forward(self, tensor_list: torch.Tensor):
        self.batch_size = tensor_list.shape[0]
        self.sequnce_length = tensor_list.shape[1]
        tensor_list = tensor_list.reshape(
            (self.batch_size * self.sequnce_length, tensor_list.shape[2],
             tensor_list.shape[3], tensor_list.shape[4]))
        tensor_list = nested_tensor_from_tensor_list(tensor_list)

        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(),
                                 size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out
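nested_tensor_from_tensor_list, used here to rebuild a NestedTensor after flattening the sequence dimension, pads a list of images to a common size and records which pixels are padding. A condensed sketch of the helper as it appears in the DETR util code (the ONNX export branch is omitted):

import torch

def nested_tensor_from_tensor_list(tensor_list):
    # tensor_list: a list (or unbatched tensor) of [C, Hi, Wi] images
    max_size = [max(s) for s in zip(*[img.shape for img in tensor_list])]
    b = len(tensor_list)
    c, h, w = max_size
    tensor = torch.zeros((b, c, h, w), dtype=tensor_list[0].dtype, device=tensor_list[0].device)
    mask = torch.ones((b, h, w), dtype=torch.bool, device=tensor_list[0].device)
    for img, pad_img, m in zip(tensor_list, tensor, mask):
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)
        m[:img.shape[1], :img.shape[2]] = False            # real pixels are not padding
    return NestedTensor(tensor, mask)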
Code example #20
 def forward(self, tensor_list: NestedTensor):
     xs = self.body(tensor_list.tensors)
     out: Dict[str, NestedTensor] = {}
     for name, x in xs.items():
         m = tensor_list.mask
         assert m is not None
         m = L.unsqueeze(m,
                         1)  # [batch_size, h, w] -> [batch_size, 1, h, w]
         m = m.astype("float32")
         mask = L.image_resize(m,
                               out_shape=x.shape[-2:],
                               resample="NEAREST")
         mask = mask.astype("bool")
         mask = L.squeeze(
             mask, [1])  # [batch_size, 1, h, w] -> [batch_size, h, w]
         out[name] = NestedTensor(x, mask)
     return out
Code example #21
    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        x = self.body.patch_embed(x)
        x = self.body.pos_drop(x)
        for module in self.body.blocks:
            x = module(x)
        x = self.body.norm(x)
        x = torch.reshape(x, (-1, 2048, 18, 12))
        x = self.body.pre_logits(x)

        m = tensor_list.mask
        mask = F.interpolate(m[None].float(),
                             size=x.shape[-2:]).to(torch.bool)[0]

        out: Dict[str, NestedTensor] = {}
        out['0'] = NestedTensor(x, mask)

        return out
Code example #22
    def forward(self, samples: NestedTensor):
        if not isinstance(samples, NestedTensor):
            samples = NestedTensor.from_tensor_list(samples)
        features, pos = self.detr.backbone(samples)

        bs = features[-1].tensors.shape[0]

        src, mask = features[-1].decompose()
        src_proj = self.detr.input_proj(src)
        hs, memory = self.detr.transformer(src_proj, mask,
                                           self.detr.query_embed.weight,
                                           pos[-1])

        outputs_class = self.detr.class_embed(hs)
        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
        out = {
            "pred_logits": outputs_class[-1],
            "pred_boxes": outputs_coord[-1]
        }
        if self.detr.aux_loss:
            out["aux_outputs"] = [{
                "pred_logits": a,
                "pred_boxes": b
            } for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

        # FIXME h_boxes takes the last one computed, keep this in mind
        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

        seg_masks = self.mask_head(
            src_proj, bbox_mask,
            [features[2].tensors, features[1].tensors, features[0].tensors])
        outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries,
                                           seg_masks.shape[-2],
                                           seg_masks.shape[-1])

        out["pred_masks"] = outputs_seg_masks
        return out
Code example #23
    def forward(self, images):
        cur_images = images.tensor
        cur_dim = cur_images.shape[1]
        if cur_dim == 3:
            pre_images = cur_images.clone()
        elif cur_dim == 6:
            pre_images = cur_images[:, 3:, :, :]
            cur_images = cur_images[:, :3, :, :]
        else:
            raise NotImplementedError

        features = self.backbone(cur_images, pre_images)
        masks = self.mask_out_padding(
            [
                features_per_level.shape
                for features_per_level in features.values()
            ],
            images.image_sizes,
            images.tensor.device,
        )
        assert len(features) == len(masks)
        for i, k in enumerate(features.keys()):
            features[k] = NestedTensor(features[k], masks[i])
        return features
Code example #24
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
        """
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        valid_ratio = None
        if self.accurate_ratio:
            valid_ratio = self._get_valid_ratio(samples.mask)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds, valid_ratio=valid_ratio)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)
        if not self.two_stage:
            ref_pts = torch.cat(
                [init_reference[None], inter_references[:, :, :, :2]])
            out = {
                'pred_logits': outputs_class[-1],
                'pred_boxes': outputs_coord[-1],
                'ref_pts': ref_pts,
                'logits_all': outputs_class,
                'boxes_all': outputs_coord
            }
        else:
            out = {
                'pred_logits': outputs_class[-1],
                'pred_boxes': outputs_coord[-1]
            }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out
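The self._get_valid_ratio call (guarded by self.accurate_ratio) is specific to this fork; it presumably mirrors the get_valid_ratio helper in the reference Deformable-DETR transformer, which measures how much of each padded image is actually valid. A sketch of that reference helper:

import torch

def get_valid_ratio(mask):
    # mask: [B, H, W] bool, True on padded pixels
    _, H, W = mask.shape
    valid_H = torch.sum(~mask[:, :, 0], 1)    # valid rows, read off the first column
    valid_W = torch.sum(~mask[:, 0, :], 1)    # valid columns, read off the first row
    valid_ratio_h = valid_H.float() / H
    valid_ratio_w = valid_W.float() / W
    return torch.stack([valid_ratio_w, valid_ratio_h], -1)   # [B, 2] in (w, h) order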
Code example #25
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
            The backbone has two components, 0 represents the forward layer, on the other hand, 1 represents positionalencodding.
        """
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)
        print('sample shape:', samples.tensors.shape)
        print('feature:', features[0].tensors.shape)
        print('features length:', len(features))
        print('pos length:', len(pos))

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        print(src.shape)
        print(self.backbone[1])

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        print('pos len:', len(pos))
        print('pos shape:', pos[0].shape)
        """ From the beginning until here, we have the input samples of 2 images as input (2 because of the batch size). the two images are passed to 
            backbone which has two component, one is neural network, the other is positional encoding.
        
        """
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds)
        """Don't underestimate this sentence. It returns the results of transformer!!!!!!!!!!
        
        """
        print('hs shape:', hs.shape)
        print('init_reference shape', init_reference.shape)
        print('')
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out
Code example #26
    def forward_once(self, samples: NestedTensor, train_samples: NestedTensor):
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        if not isinstance(train_samples, NestedTensor):
            train_samples = nested_tensor_from_tensor_list(train_samples)
        pre_feat, _ = self.backbone(train_samples)
        
        srcs = []
        masks = []
        
        for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
            src, mask = feat.decompose()
            src2, _ = feat2.decompose()
            srcs.append(self.combine(torch.cat([self.input_proj[l](src), self.input_proj[l](src2)], dim=1)))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.combine(torch.cat([self.input_proj[l](features[-1].tensors), self.input_proj[l](pre_feat[-1].tensors)], dim=1))
                else:
                    src = self.input_proj[l](srcs[-1])

                m = samples.mask
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)
            
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = self.transformer(srcs, masks, pos, query_embeds)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)
               
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        pre_embed = {'reference': outputs_coord[-1], 'tgt': hs[-1], 'feat': features}
        
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)        
        
        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
        return out, pre_embed
Code example #27
File: motr.py  Project: reinforcementdriving/MOTR
    def _forward_single_image(self, samples, track_instances: Instances):
        features, pos = self.backbone(samples)
        src, mask = features[-1].decompose()
        assert mask is not None

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs,
            masks,
            pos,
            track_instances.query_pos,
            ref_pts=track_instances.ref_pts)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        ref_pts_all = torch.cat(
            [init_reference[None], inter_references[:, :, :, :2]], dim=0)
        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1],
            'ref_pts': ref_pts_all[5]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        with torch.no_grad():
            if self.training:
                track_scores = outputs_class[-1,
                                             0, :].sigmoid().max(dim=-1).values
            else:
                track_scores = outputs_class[-1, 0, :, 0].sigmoid()

        track_instances.scores = track_scores
        track_instances.pred_logits = outputs_class[-1, 0]
        track_instances.pred_boxes = outputs_coord[-1, 0]
        track_instances.output_embedding = hs[-1, 0]
        if self.training:
            # the track id will be assigned by the matcher.
            out['track_instances'] = track_instances
            track_instances = self.criterion.match_for_single_frame(out)
        else:
            # each track will be assigned a unique global id by the track base.
            self.track_base.update(track_instances)
        if self.memory_bank is not None:
            track_instances = self.memory_bank(track_instances)
            # track_instances.track_scores = track_instances.track_scores[..., 0]
            # track_instances.scores = track_instances.track_scores.sigmoid()
            if self.training:
                self.criterion.calc_loss_for_track_scores(track_instances)
        tmp = {}
        tmp['init_track_instances'] = self._generate_empty_tracks()
        tmp['track_instances'] = track_instances
        out_track_instances = self.track_embed(tmp)
        out['track_instances'] = out_track_instances
        return out
Code example #28
File: detr.py  Project: whq-hqw/detr_change
    criterion.to(device)
    postprocessors = {'bbox': PostProcess()}
    if args.masks:
        postprocessors['segm'] = PostProcessSegm()
        if args.dataset_file == "coco_panoptic":
            is_thing_map = {i: i <= 90 for i in range(201)}
            postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map,
                                                             threshold=0.85)

    return model, criterion, postprocessors


if __name__ == '__main__':
    import argparse
    from util.misc import NestedTensor
    from main import get_args_parser

    parser = argparse.ArgumentParser('DETR training and evaluation script',
                                     parents=[get_args_parser()])
    args = parser.parse_args()

    tensor = torch.randn(4, 3, 384, 384)
    mask = (tensor > 0)[:, 0, :, :]
    nt = NestedTensor(tensor, mask)

    model, criterion, _ = build(args)
    y = model(nt)
    for k, v in y.items():
        if isinstance(v, torch.Tensor):
            print("%s: %s" % (k, str(v.shape)))
Code example #29
    def forward(self, samples: NestedTensor, pre_embed=None):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
        """
        assert not self.training, 'here is inference mode'
        assert samples.tensors.shape[0] == 1, 'track only supports batch 1'
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        if pre_embed is not None:
            pre_feat = pre_embed['feat']
        else:
            pre_feat = features

        srcs = []
        masks = []

        for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
            src, mask = feat.decompose()
            src2, _ = feat2.decompose()
            srcs.append(
                self.combine(
                    torch.cat(
                        [self.input_proj[l](src), self.input_proj[l](src2)],
                        dim=1)))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.combine(
                        torch.cat([
                            self.input_proj[l](features[-1].tensors),
                            self.input_proj[l](pre_feat[-1].tensors)
                        ],
                                  dim=1))
                else:
                    src = self.input_proj[l](srcs[-1])

                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        # detection mode
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, memory = self.transformer(
            srcs, masks, pos, query_embeds)
        cur_hs = hs
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        cur_class = outputs_class[-1]
        cur_box = outputs_coord[-1]
        cur_reference = cur_box
        cur_tgt = cur_hs[-1]

        if pre_embed is not None:
            # track mode
            pre_reference, pre_tgt = pre_embed['reference'], pre_embed['tgt']

            hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = self.transformer(
                srcs, masks, pos, query_embeds, pre_reference, pre_tgt, memory)
            outputs_classes = []
            outputs_coords = []
            for lvl in range(hs.shape[0]):
                if lvl == 0:
                    reference = init_reference
                else:
                    reference = inter_references[lvl - 1]
                reference = inverse_sigmoid(reference)
                outputs_class = self.class_embed[lvl](hs[lvl])
                tmp = self.bbox_embed[lvl](hs[lvl])
                if reference.shape[-1] == 4:
                    tmp += reference
                else:
                    assert reference.shape[-1] == 2
                    tmp[..., :2] += reference
                outputs_coord = tmp.sigmoid()
                outputs_classes.append(outputs_class)
                outputs_coords.append(outputs_coord)
            outputs_class = torch.stack(outputs_classes)
            outputs_coord = torch.stack(outputs_coords)

            pre_class, pre_box = outputs_class[-1], outputs_coord[-1]

        else:
            pre_class, pre_box = cur_class, cur_box

        out = {
            'pred_logits': cur_class,
            'pred_boxes': cur_box,
            'tracking_logits': pre_class,
            'tracking_boxes': pre_box
        }

        pre_embed = {
            'reference': cur_reference,
            'tgt': cur_tgt,
            'feat': features
        }

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out, pre_embed
Code example #30
    def forward(self, samples: NestedTensor):
        print('run forward')
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the two above keys for each decoder layer.
        """

        #print(samples)
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)

        import cv2

        print('feature size', features[2].tensors.size())

        # for i in range(len(features)) :
        #   print(type(features[i]))
        #   print(features[i].tensors[0].size())
        #   name = '/content/content/content/Deformable-DETR/explained /features/feature', str(i)
        #   save_image(features[i].tensors[0], name)
        #   # cv2.imwrite(name, features[i])

        channel1 = features[2].tensors[0, :, :, :]
        # for i, feat in enumerate(channel1):
        #   #print(feat.size())
        #   name = '/content/Explain-Deformable-DETR/explained /features/feature_3_'+ str(i) + '.png'
        #   if i==30:
        #     break
        #   print(name)
        #   save_image(feat, name)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            #print ('sdfsdfsdfsdfsdfsdf', self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        print(len(srcs))
        print(srcs[0].size())
        #layer1 = srcs[3][0, :, :, :]
        # for i, feat in enumerate(layer1):
        #   #print(feat.size())
        #   name = '/content/Explain-Deformable-DETR/explained /features/feature_3_'+ str(i) + '.png'
        #   if i==10:
        #     break
        #   print(name)
        #   save_image(feat, name)

        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        print(query_embeds)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds)

        print('self.class_embed', self.class_embed)
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            print('two stage enc_outputs_coord and enc_outputs_class',
                  enc_outputs_coord.shape, enc_outputs_coord_unact[0][0][:])
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out