Example #1
    def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios, query_pos, src_padding_mask):
        output = tgt

        intermediate = []
        intermediate_reference_points = []
        for lid, layer in enumerate(self.layers):
            if reference_points.shape[-1] == 4:
                reference_points_input = reference_points[:, :, None] * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None]
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
            output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)

            # hack implementation for iterative bounding box refinement
            if self.bbox_embed is not None:
                tmp = self.bbox_embed[lid](output)
                if reference_points.shape[-1] == 4:
                    new_reference_points = tmp + inverse_sigmoid(reference_points)
                    new_reference_points = new_reference_points.sigmoid()
                else:
                    assert reference_points.shape[-1] == 2
                    new_reference_points = tmp
                    new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
                    new_reference_points = new_reference_points.sigmoid()
                reference_points = new_reference_points.detach()

            if self.return_intermediate:
                intermediate.append(output)
                intermediate_reference_points.append(reference_points)

        if self.return_intermediate:
            return torch.stack(intermediate), torch.stack(intermediate_reference_points)

        return output, reference_points
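
The helper `inverse_sigmoid` used in this and the following examples is not included in the listing; a minimal sketch consistent with how it is called (the clamping value `eps` is an assumption) would be:

import torch

def inverse_sigmoid(x, eps=1e-5):
    # Map values from [0, 1] back to logit space, clamping to avoid log(0).
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)
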
Example #2
    def _update_track_embedding(self, track_instances: Instances) -> Instances:
        if len(track_instances) == 0:
            return track_instances
        dim = track_instances.query_pos.shape[1]
        out_embed = track_instances.output_embedding
        query_pos = track_instances.query_pos[:, :dim // 2]
        query_feat = track_instances.query_pos[:, dim // 2:]
        q = k = query_pos + out_embed

        tgt = out_embed
        tgt2 = self.self_attn(q[:, None], k[:, None], value=tgt[:, None])[0][:, 0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        if self.update_query_pos:
            query_pos2 = self.linear_pos2(
                self.dropout_pos1(self.activation(self.linear_pos1(tgt))))
            query_pos = query_pos + self.dropout_pos2(query_pos2)
            query_pos = self.norm_pos(query_pos)
            track_instances.query_pos[:, :dim // 2] = query_pos

        query_feat2 = self.linear_feat2(
            self.dropout_feat1(self.activation(self.linear_feat1(tgt))))
        query_feat = query_feat + self.dropout_feat2(query_feat2)
        query_feat = self.norm_feat(query_feat)
        track_instances.query_pos[:, dim // 2:] = query_feat

        track_instances.ref_pts = inverse_sigmoid(
            track_instances.pred_boxes[:, :2].detach().clone())
        return track_instances
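
In `_update_track_embedding`, `q[:, None]`, `k[:, None]` and `tgt[:, None]` add a singleton batch dimension so that the tracks act as the attention sequence. A standalone sketch of that shape handling, assuming `self_attn` is a plain `nn.MultiheadAttention` and using invented sizes:

import torch
import torch.nn as nn

num_tracks, dim = 5, 256
self_attn = nn.MultiheadAttention(embed_dim=dim, num_heads=8)  # sequence-first layout

q = k = torch.randn(num_tracks, dim)
tgt = torch.randn(num_tracks, dim)

# (num_tracks, dim) -> (num_tracks, 1, dim): tracks are the sequence, batch size is 1.
out = self_attn(q[:, None], k[:, None], value=tgt[:, None])[0][:, 0]
print(out.shape)  # torch.Size([5, 256])
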
Example #3
    def forward_supp_branch(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask, tsp, support_boxes):
        # self attention
        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        support_img_h, support_img_w = spatial_shapes[0, 0], spatial_shapes[0, 1]
        supp_roi = torchvision.ops.roi_align(
            src.transpose(1, 2).reshape(src.shape[0], -1, support_img_h, support_img_w),
            support_boxes,
            output_size=(7, 7),
            spatial_scale=1 / 32.0,
            aligned=True).mean(3).mean(2)
        category_code = supp_roi.sigmoid()

        if self.QSAttn:
            # siamese attention
            src, tsp = self.siamese_attn(src,
                                         inverse_sigmoid(category_code).unsqueeze(0).expand(src.shape[0], -1, -1),
                                         category_code.unsqueeze(0).expand(src.shape[0], -1, -1),
                                         tsp)

            # ffn
            src = self.forward_ffn(src + tsp)
        else:
            src = self.forward_ffn(src)

        return src, category_code
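
`forward_supp_branch` reshapes the flattened single-level encoder features back into a spatial map and pools each support box with `torchvision.ops.roi_align`. A self-contained sketch of that call with invented shapes, where `support_boxes` is a list holding one `(num_boxes, 4)` tensor of `(x1, y1, x2, y2)` coordinates per image in input-image pixels:

import torch
import torchvision

batch, channels, h, w = 2, 256, 32, 32
feature_map = torch.randn(batch, channels, h, w)
support_boxes = [torch.tensor([[0.0, 0.0, 320.0, 320.0]]) for _ in range(batch)]

pooled = torchvision.ops.roi_align(
    feature_map, support_boxes,
    output_size=(7, 7),
    spatial_scale=1 / 32.0,  # the feature map is 1/32 of the input resolution
    aligned=True)
category_code = pooled.mean(3).mean(2).sigmoid()  # (total_boxes, channels)
print(category_code.shape)  # torch.Size([2, 256])
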
Example #4
    def _forward_single_image(self, samples, track_instances: Instances):
        features, pos = self.backbone(samples)
        src, mask = features[-1].decompose()
        assert mask is not None

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(srcs, masks, pos, track_instances.query_pos, ref_pts=track_instances.ref_pts)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        ref_pts_all = torch.cat([init_reference[None], inter_references[:, :, :, :2]], dim=0)
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'ref_pts': ref_pts_all[5]}
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)
        out['hs'] = hs[-1]
        return out
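
The per-layer decoding loop above (repeated in the later examples) adds the predicted offset to the reference point in logit space and maps the sum back to [0, 1]. A compact standalone illustration of that step with made-up numbers, assuming a 2-d reference point:

import torch

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

reference = torch.tensor([[0.50, 0.40]])           # normalized reference point (cx, cy)
delta = torch.tensor([[0.20, -0.10, 0.30, 0.10]])  # raw bbox-head output in logit space

box = delta.clone()
box[..., :2] += inverse_sigmoid(reference)  # refine the center relative to the reference
box = box.sigmoid()                         # back to normalized coordinates
print(box)
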
Example #5
    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask, category_codes, tsp):
        # self attention
        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        if self.QSAttn:
            # siamese attention
            src, tsp = self.siamese_attn(src, inverse_sigmoid(category_codes), category_codes, tsp)
            # ffn
            src = self.forward_ffn(src + tsp)
        else:
            # ffn
            src = self.forward_ffn(src)

        return src
Example #6
    def forward(self, samples: NestedTensor, pre_embed=None):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized box coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        assert not self.training, 'here is inference mode'
        assert samples.tensors.shape[0] == 1, 'track only supports batch 1'
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        if pre_embed is not None:
            pre_feat = pre_embed['feat']
        else:
            pre_feat = features

        srcs = []
        masks = []

        for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
            src, mask = feat.decompose()
            src2, _ = feat2.decompose()
            srcs.append(
                self.combine(
                    torch.cat(
                        [self.input_proj[l](src), self.input_proj[l](src2)],
                        dim=1)))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.combine(
                        torch.cat([
                            self.input_proj[l](features[-1].tensors),
                            self.input_proj[l](pre_feat[-1].tensors)
                        ],
                                  dim=1))
                else:
                    src = self.input_proj[l](srcs[-1])

                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        # detection mode
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, memory = self.transformer(
            srcs, masks, pos, query_embeds)
        cur_hs = hs
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        cur_class = outputs_class[-1]
        cur_box = outputs_coord[-1]
        cur_reference = cur_box
        cur_tgt = cur_hs[-1]

        if pre_embed is not None:
            # track mode
            pre_reference, pre_tgt = pre_embed['reference'], pre_embed['tgt']

            hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = self.transformer(
                srcs, masks, pos, query_embeds, pre_reference, pre_tgt, memory)
            outputs_classes = []
            outputs_coords = []
            for lvl in range(hs.shape[0]):
                if lvl == 0:
                    reference = init_reference
                else:
                    reference = inter_references[lvl - 1]
                reference = inverse_sigmoid(reference)
                outputs_class = self.class_embed[lvl](hs[lvl])
                tmp = self.bbox_embed[lvl](hs[lvl])
                if reference.shape[-1] == 4:
                    tmp += reference
                else:
                    assert reference.shape[-1] == 2
                    tmp[..., :2] += reference
                outputs_coord = tmp.sigmoid()
                outputs_classes.append(outputs_class)
                outputs_coords.append(outputs_coord)
            outputs_class = torch.stack(outputs_classes)
            outputs_coord = torch.stack(outputs_coords)

            pre_class, pre_box = outputs_class[-1], outputs_coord[-1]

        else:
            pre_class, pre_box = cur_class, cur_box

        out = {
            'pred_logits': cur_class,
            'pred_boxes': cur_box,
            'tracking_logits': pre_class,
            'tracking_boxes': pre_box
        }

        pre_embed = {
            'reference': cur_reference,
            'tgt': cur_tgt,
            'feat': features
        }

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out, pre_embed
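
Since this forward returns both the per-frame outputs and a `pre_embed` dict (`reference`, `tgt`, `feat`), tracking a clip amounts to threading that dict from one frame to the next. A hypothetical usage sketch, where `model` is an instance of this detector and `frames` is a list of `NestedTensor` inputs (both assumed):

import torch

def track_clip(model, frames):
    # Hypothetical inference loop: `model` and `frames` are assumed to exist.
    pre_embed = None
    results = []
    with torch.no_grad():
        for frame in frames:
            out, pre_embed = model(frame, pre_embed=pre_embed)
            # 'pred_logits'/'pred_boxes' describe the current frame; 'tracking_logits'/
            # 'tracking_boxes' re-decode the previous frame's queries on the current features.
            results.append((out['pred_logits'], out['pred_boxes'], out['tracking_boxes']))
    return results
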
Example #7
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized box coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        valid_ratio = None
        if self.accurate_ratio:
            valid_ratio = self._get_valid_ratio(samples.mask)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds, valid_ratio=valid_ratio)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)
        if not self.two_stage:
            ref_pts = torch.cat(
                [init_reference[None], inter_references[:, :, :, :2]])
            out = {
                'pred_logits': outputs_class[-1],
                'pred_boxes': outputs_coord[-1],
                'ref_pts': ref_pts,
                'logits_all': outputs_class,
                'boxes_all': outputs_coord
            }
        else:
            out = {
                'pred_logits': outputs_class[-1],
                'pred_boxes': outputs_coord[-1]
            }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out
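
`self._get_valid_ratio` is not shown in the listing. In Deformable-DETR-style code the valid ratio is the fraction of each padded feature map that contains real image content; a sketch along those lines:

import torch

def get_valid_ratio(mask):
    # mask: (batch, H, W), True on padded pixels.
    _, H, W = mask.shape
    valid_H = torch.sum(~mask[:, :, 0], 1)  # number of non-padded rows
    valid_W = torch.sum(~mask[:, 0, :], 1)  # number of non-padded columns
    return torch.stack([valid_W.float() / W, valid_H.float() / H], -1)

mask = torch.zeros(1, 32, 32, dtype=torch.bool)
mask[:, 24:, :] = True  # bottom quarter is padding
mask[:, :, 16:] = True  # right half is padding
print(get_valid_ratio(mask))  # tensor([[0.5000, 0.7500]])
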
Example #8
    def forward_once(self, samples: NestedTensor, train_samples: NestedTensor):
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)
        features, pos = self.backbone(samples)

        if not isinstance(train_samples, NestedTensor):
            train_samples = nested_tensor_from_tensor_list(train_samples)
        pre_feat, _ = self.backbone(train_samples)
        
        srcs = []
        masks = []
        
        for l, (feat, feat2) in enumerate(zip(features, pre_feat)):
            src, mask = feat.decompose()
            src2, _ = feat2.decompose()
            srcs.append(self.combine(torch.cat([self.input_proj[l](src), self.input_proj[l](src2)], dim=1)))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.combine(torch.cat([self.input_proj[l](features[-1].tensors), self.input_proj[l](pre_feat[-1].tensors)], dim=1))
                else:
                    src = self.input_proj[l](srcs[-1])

                m = samples.mask
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)
            
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact, _ = self.transformer(srcs, masks, pos, query_embeds)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)
               
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        pre_embed = {'reference': outputs_coord[-1], 'tgt': hs[-1], 'feat': features}
        
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)        
        
        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}
        return out, pre_embed
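
`self._set_aux_loss`, called by most of these examples, is the usual DETR-style helper that packages one prediction dict per intermediate decoder layer (the last layer is already the main output). Roughly, and shown here as a free function for the sake of a runnable sketch:

import torch

def _set_aux_loss(outputs_class, outputs_coord):
    # One dict per intermediate decoder layer, excluding the final layer.
    return [{'pred_logits': a, 'pred_boxes': b}
            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

layers, bs, queries, classes = 6, 1, 300, 91
aux = _set_aux_loss(torch.randn(layers, bs, queries, classes),
                    torch.rand(layers, bs, queries, 4))
print(len(aux))  # 5

In the actual models it is typically defined as a method on the detector class.
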
Example #9
def get_box_from_object_queries(self, srcs, masks, pos, query_embeds):

    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(srcs, masks, pos, query_embeds)
    outputs_classes = []
    outputs_coords = []
    
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    return out
Example #10
    def _forward_single_image(self, samples, track_instances: Instances):
        features, pos = self.backbone(samples)
        src, mask = features[-1].decompose()
        assert mask is not None

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs,
            masks,
            pos,
            track_instances.query_pos,
            ref_pts=track_instances.ref_pts)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        ref_pts_all = torch.cat(
            [init_reference[None], inter_references[:, :, :, :2]], dim=0)
        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1],
            'ref_pts': ref_pts_all[5]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        with torch.no_grad():
            if self.training:
                track_scores = outputs_class[-1,
                                             0, :].sigmoid().max(dim=-1).values
            else:
                track_scores = outputs_class[-1, 0, :, 0].sigmoid()

        track_instances.scores = track_scores
        track_instances.pred_logits = outputs_class[-1, 0]
        track_instances.pred_boxes = outputs_coord[-1, 0]
        track_instances.output_embedding = hs[-1, 0]
        if self.training:
            # the track id will be assigned by the matcher.
            out['track_instances'] = track_instances
            track_instances = self.criterion.match_for_single_frame(out)
        else:
            # each track will be assigned a unique global id by the track base.
            self.track_base.update(track_instances)
        if self.memory_bank is not None:
            track_instances = self.memory_bank(track_instances)
            # track_instances.track_scores = track_instances.track_scores[..., 0]
            # track_instances.scores = track_instances.track_scores.sigmoid()
            if self.training:
                self.criterion.calc_loss_for_track_scores(track_instances)
        tmp = {}
        tmp['init_track_instances'] = self._generate_empty_tracks()
        tmp['track_instances'] = track_instances
        out_track_instances = self.track_embed(tmp)
        out['track_instances'] = out_track_instances
        return out
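
The docstrings in the surrounding examples point to `PostProcess` for recovering unnormalized boxes. A minimal standalone conversion of the normalized box outputs into pixel-space `(x1, y1, x2, y2)` corners, assuming the `(cx, cy, w, h)` layout used by DETR-style post-processing:

import torch

def boxes_cxcywh_to_xyxy_pixels(boxes, img_h, img_w):
    # boxes: (N, 4) normalized (center_x, center_y, width, height).
    cx, cy, w, h = boxes.unbind(-1)
    xyxy = torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                        cx + 0.5 * w, cy + 0.5 * h], dim=-1)
    return xyxy * torch.tensor([img_w, img_h, img_w, img_h], dtype=boxes.dtype)

boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])
print(boxes_cxcywh_to_xyxy_pixels(boxes, img_h=480, img_w=640))
# tensor([[256., 144., 384., 336.]])
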
Example #11
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized box coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
            The backbone has two components: index 0 is the convolutional feature extractor and index 1 is the positional encoding.
        """
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)
        print('sample shape:', samples.tensors.shape)
        print('feature:', features[0].tensors.shape)
        print('features length:', len(features))
        print('pos length:', len(pos))

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        print(src.shape)
        print(self.backbone[1])

        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        print('pos len:', len(pos))
        print('pos shape:', pos[0].shape)
        """ Up to this point, the batch of input images (two in this walkthrough, because of the batch
            size) has been passed through the backbone, which has two components: the convolutional
            network and the positional encoding.
        """
        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds)
        """ This call is the core of the model: it runs the deformable transformer over the flattened
            multi-scale features and returns the decoder outputs.
        """
        print('hs shape:', hs.shape)
        print('init_reference shape', init_reference.shape)
        print('')
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out
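
The expression `self.backbone[1](NestedTensor(src, mask))` in the extra-level branch relies on the backbone being a DETR-style `Joiner`: an `nn.Sequential` whose element 0 is the CNN feature extractor and element 1 the positional encoding. A simplified sketch of that wrapper, assuming the wrapped backbone returns a dict of `NestedTensor` feature maps:

import torch.nn as nn

class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        # Element 0: CNN backbone; element 1: positional encoding module.
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list):
        xs = self[0](tensor_list)  # dict of multi-scale NestedTensor features
        out, pos = [], []
        for x in xs.values():
            out.append(x)
            pos.append(self[1](x).to(x.tensors.dtype))  # one positional map per level
        return out, pos
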
Example #12
    def forward(self, samples: NestedTensor):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x (num_classes + 1)]
               - "pred_boxes": The normalized box coordinates for all queries, represented as
                               (center_x, center_y, height, width). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        print('run forward')

        #print(samples)
        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)

        import cv2

        print('feature size', features[2].tensors.size())

        # for i in range(len(features)) :
        #   print(type(features[i]))
        #   print(features[i].tensors[0].size())
        #   name = '/content/content/content/Deformable-DETR/explained /features/feature', str(i)
        #   save_image(features[i].tensors[0], name)
        #   # cv2.imwrite(name, features[i])

        channel1 = features[2].tensors[0, :, :, :]
        # for i, feat in enumerate(channel1):
        #   #print(feat.size())
        #   name = '/content/Explain-Deformable-DETR/explained /features/feature_3_'+ str(i) + '.png'
        #   if i==30:
        #     break
        #   print(name)
        #   save_image(feat, name)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            #print ('sdfsdfsdfsdfsdfsdf', self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        print(len(srcs))
        print(srcs[0].size())
        #layer1 = srcs[3][0, :, :, :]
        # for i, feat in enumerate(layer1):
        #   #print(feat.size())
        #   name = '/content/Explain-Deformable-DETR/explained /features/feature_3_'+ str(i) + '.png'
        #   if i==10:
        #     break
        #   print(name)
        #   save_image(feat, name)

        query_embeds = None
        if not self.two_stage:
            query_embeds = self.query_embed.weight
        print(query_embeds)

        hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer(
            srcs, masks, pos, query_embeds)

        print('self.class_embed', self.class_embed)
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        out = {
            'pred_logits': outputs_class[-1],
            'pred_boxes': outputs_coord[-1]
        }
        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(outputs_class,
                                                    outputs_coord)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            print('two stage enc_outputs_coord and enc_outputs_class',
                  enc_outputs_coord.shape, enc_outputs_coord_unact[0][0][:])
            out['enc_outputs'] = {
                'pred_logits': enc_outputs_class,
                'pred_boxes': enc_outputs_coord
            }
        return out
Example #13
    def forward(self,
                samples,
                targets=None,
                supp_samples=None,
                supp_class_ids=None,
                supp_targets=None,
                category_codes=None):

        if not isinstance(samples, NestedTensor):
            samples = nested_tensor_from_tensor_list(samples)

        batchsize = samples.tensors.shape[0]
        device = samples.tensors.device

        # During training, category_codes are generated from sampled (supp_samples, supp_class_ids, supp_targets)
        if self.training:
            assert supp_samples is not None
            assert supp_class_ids is not None
            assert supp_targets is not None
            # During training stage: we don't have to cover all categories, so there is only 1 episode
            num_support = supp_class_ids.shape[0]
            support_batchsize = self.args.episode_size
            assert num_support == (self.args.episode_size *
                                   self.args.episode_num)
            num_episode = self.args.episode_num
            category_codes = self.compute_category_codes(
                supp_samples, supp_targets)
        # During inference, category_codes should be provided and ready to use for all activated categories
        else:
            assert category_codes is not None
            assert supp_class_ids is not None
            # During inference stage: there are multiple episodes to cover all categories, including both base and novel
            num_support = supp_class_ids.shape[0]
            support_batchsize = self.args.episode_size
            num_episode = math.ceil(num_support / support_batchsize)

        features, pos = self.backbone(samples)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                mask = F.interpolate(m[None].float(),
                                     size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        query_embeds = self.query_embed.to(device)

        # To store predictions for each episode
        meta_outputs_classes = []
        meta_outputs_coords = []
        meta_support_class_ids = []

        for i in range(num_episode):

            if self.num_feature_levels == 1:
                if (support_batchsize * (i + 1)) <= num_support:
                    cc = [
                        c[(support_batchsize *
                           i):(support_batchsize *
                               (i + 1)), :].unsqueeze(0).expand(
                                   batchsize, -1, -1) for c in category_codes
                    ]
                    episode_class_ids = supp_class_ids[(support_batchsize *
                                                        i):(support_batchsize *
                                                            (i + 1))]
                else:
                    cc = [
                        c[-support_batchsize:, :].unsqueeze(0).expand(
                            batchsize, -1, -1) for c in category_codes
                    ]
                    episode_class_ids = supp_class_ids[-support_batchsize:]
            elif self.num_feature_levels == 4:
                raise NotImplementedError
            else:
                raise NotImplementedError

            _, init_reference, _, encoder_outputs = \
                self.transformer(srcs, masks, pos, query_embeds, cc,
                                 self.task_positional_encoding(torch.zeros(self.args.episode_size, self.hidden_dim, device=device)).unsqueeze(0).expand(batchsize, -1, -1))

            (memory, spatial_shapes, level_start_index, valid_ratios,
             query_embed, mask_flatten, tgt) = encoder_outputs

            # Category-agnostic transformer decoder
            hs, inter_references = self.meta_decoder(
                tgt,
                init_reference,
                memory,
                spatial_shapes,
                level_start_index,
                valid_ratios,
                query_embed,
                mask_flatten,
            )

            # Final FFN to predict confidence scores and boxes coordinates
            outputs_classes = []
            outputs_coords = []
            for lvl in range(hs.shape[0]):
                if lvl == 0:
                    reference = init_reference.reshape(batchsize,
                                                       self.num_queries, 2)
                else:
                    reference = inter_references[lvl - 1]
                reference = inverse_sigmoid(reference)
                outputs_class = self.class_embed[lvl](hs[lvl])
                tmp = self.bbox_embed[lvl](hs[lvl])
                if reference.shape[-1] == 4:
                    tmp += reference
                else:
                    assert reference.shape[-1] == 2
                    tmp[..., :2] += reference
                outputs_coord = tmp.sigmoid()
                outputs_classes.append(
                    outputs_class.view(batchsize, self.num_queries,
                                       self.args.episode_size))
                outputs_coords.append(
                    outputs_coord.view(batchsize, self.num_queries, 4))

            meta_outputs_classes.append(torch.stack(outputs_classes))
            meta_outputs_coords.append(torch.stack(outputs_coords))
            meta_support_class_ids.append(episode_class_ids)

        # Calculate targets for the constructed meta-tasks
        # meta_targets are computed based on original targets and the sampled support images.
        meta_targets = []
        for b in range(batchsize):
            for episode_class_ids in meta_support_class_ids:
                meta_target = dict()
                target_indexes = [
                    i for i, x in enumerate(targets[b]['labels'].tolist())
                    if x in episode_class_ids
                ]
                meta_target['boxes'] = targets[b]['boxes'][target_indexes]
                meta_target['labels'] = targets[b]['labels'][target_indexes]
                meta_target['area'] = targets[b]['area'][target_indexes]
                meta_target['iscrowd'] = targets[b]['iscrowd'][target_indexes]
                meta_target['image_id'] = targets[b]['image_id']
                meta_target['size'] = targets[b]['size']
                meta_target['orig_size'] = targets[b]['orig_size']
                meta_targets.append(meta_target)

        # Create tensors for final outputs
        # default logits are very negative (confidence scores are ~0.00 after sigmoid)
        final_meta_outputs_classes = torch.ones(hs.shape[0],
                                                batchsize,
                                                num_episode,
                                                self.num_queries,
                                                self.num_classes,
                                                device=device) * (-999999.99)
        final_meta_outputs_coords = torch.zeros(hs.shape[0],
                                                batchsize,
                                                num_episode,
                                                self.num_queries,
                                                4,
                                                device=device)
        # Fill in predicted logits into corresponding positions
        class_ids_already_filled_in = []
        for episode_index, (pred_classes, pred_coords, class_ids) in enumerate(
                zip(meta_outputs_classes, meta_outputs_coords,
                    meta_support_class_ids)):
            for class_index, class_id in enumerate(class_ids):
                # During inference, we need to ignore the classes that already have predictions
                # During training, the same category might appear over different episodes, so no need to filter
                if self.training or (class_id.item()
                                     not in class_ids_already_filled_in):
                    class_ids_already_filled_in.append(class_id.item())
                    final_meta_outputs_classes[:, :, episode_index, :,
                                               class_id] = pred_classes[:, :, :,
                                                                        class_index]
                    final_meta_outputs_coords[:, :,
                                              episode_index, :, :] = pred_coords[:, :, :, :]
        # Flatten to an effective batch of (batchsize x num_episode) and produce final predictions
        final_meta_outputs_classes = final_meta_outputs_classes.view(
            hs.shape[0], batchsize * num_episode, self.num_queries,
            self.num_classes)
        final_meta_outputs_coords = final_meta_outputs_coords.view(
            hs.shape[0], batchsize * num_episode, self.num_queries, 4)

        out = dict()

        out['pred_logits'] = final_meta_outputs_classes[-1]
        out['pred_boxes'] = final_meta_outputs_coords[-1]
        out['activated_class_ids'] = torch.stack(
            meta_support_class_ids).unsqueeze(0).expand(
                batchsize, -1, -1).reshape(batchsize * num_episode, -1)
        out['meta_targets'] = meta_targets  # Add meta_targets into outputs for optimization

        out['batchsize'] = batchsize
        out['num_episode'] = num_episode
        out['num_queries'] = self.num_queries
        out['num_classes'] = self.num_classes

        if self.args.category_codes_cls_loss:
            if self.num_feature_levels == 1:
                # out['category_codes_cls_logits'] = self.category_codes_cls(category_codes)
                # out['category_codes_cls_targets'] = supp_class_ids
                # TODO: category_codes_cls_loss @ every encoder layer! THIS IS ONLY TRIAL!
                #out['category_codes_cls_logits'] = self.category_codes_cls(torch.cat(category_codes, dim=0))
                #out['category_codes_cls_targets'] = supp_class_ids.repeat(self.args.dec_layers)

                out['category_codes_cls_logits'] = self.category_codes_cls(
                    category_codes[0])
                out['category_codes_cls_targets'] = supp_class_ids
            elif self.num_feature_levels == 4:
                raise NotImplementedError
            else:
                raise NotImplementedError

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(final_meta_outputs_classes,
                                                    final_meta_outputs_coords)
            for aux_output in out['aux_outputs']:
                aux_output['activated_class_ids'] = torch.stack(
                    meta_support_class_ids).unsqueeze(0).expand(
                        batchsize, -1, -1).reshape(batchsize * num_episode, -1)
        return out
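
The fill-in loop near the end maps each episode's local class index to its global category id before the default (very negative) logits are overwritten. A standalone toy example of the same indexing pattern, with invented sizes:

import torch

num_layers, batchsize, num_episode, num_queries, num_classes = 2, 1, 2, 3, 10
episode_size = 4

# Very negative defaults so unused class slots score ~0 after sigmoid.
final_logits = torch.full(
    (num_layers, batchsize, num_episode, num_queries, num_classes), -999999.99)

episode_logits = [torch.randn(num_layers, batchsize, num_queries, episode_size)
                  for _ in range(num_episode)]
episode_class_ids = [torch.tensor([0, 3, 5, 7]), torch.tensor([1, 2, 8, 9])]

for ep, (logits, class_ids) in enumerate(zip(episode_logits, episode_class_ids)):
    for local_idx, class_id in enumerate(class_ids):
        # Scatter the episode-local prediction into the global class dimension.
        final_logits[:, :, ep, :, class_id] = logits[:, :, :, local_idx]

# Fold the episode axis into the batch axis, as the model does before returning.
print(final_logits.view(num_layers, batchsize * num_episode, num_queries, num_classes).shape)
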