Example #1
    def perturb(self, gt_obj, gt_rels, verbose=False):

        gt_obj_lst = [gt_obj[s:e] for _, s, e in enumerate_by_image(gt_obj[:, 0])]
        gt_rels_lst = [gt_rels[s:e] for _, s, e in enumerate_by_image(gt_rels[:, 0])]

        nodes = self.sample_nodes_(gt_obj_lst, gt_rels_lst)

        gt_obj_new = []
        for im, objs in enumerate(gt_obj_lst):  # for each image
            for obj_ind, obj_rels in zip(*nodes[im]):  # for each sampled node that will be perturbed

                if verbose:
                    before = objs[obj_ind, 1]
                    print('\nbefore: %s' % self.obj_classes[before])
                    for (_, o1, o2, R) in obj_rels:
                        print(self.triplet2str('{}_{}_{}'.format(objs[o1, 1], R, objs[o2, 1])))

                objs[obj_ind, 1] = self.perturb_object_(objs, obj_rels, obj_ind, verbose=verbose)

                if verbose:
                    print('\nafter: %s' % self.obj_classes[objs[obj_ind, 1]])
                    for (_, o1, o2, R) in obj_rels:
                        print(self.triplet2str('{}_{}_{}'.format(objs[o1, 1], R, objs[o2, 1])))

            gt_obj_new.append(objs)

        gt_obj_new = torch.cat(gt_obj_new)

        return gt_obj_new
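Every example on this page iterates with an `enumerate_by_image` helper that yields `(image_index, start, end)` slices over a tensor of per-row image indices. The helper itself is not listed here; below is a minimal sketch of the behaviour it is assumed to have, for a 1-D tensor of indices grouped by image.

import torch

def enumerate_by_image_sketch(im_inds):
    # Assumed behaviour: im_inds is a 1-D tensor of image indices, sorted and
    # grouped by image (e.g. [0, 0, 0, 1, 1, 2]). Yield (image_index, start, end)
    # so that im_inds[start:end] covers exactly one image.
    im_inds = im_inds.long()
    start = 0
    current = int(im_inds[0])
    for i in range(1, im_inds.numel()):
        if int(im_inds[i]) != current:
            yield current, start, i
            current = int(im_inds[i])
            start = i
    yield current, start, im_inds.numel()

# toy check: three images with 3, 2 and 1 rows respectively
print(list(enumerate_by_image_sketch(torch.tensor([0, 0, 0, 1, 1, 2]))))
# -> [(0, 0, 3), (1, 3, 5), (2, 5, 6)]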
Example #2
    def forward(self, union_pools, rois, union_inds, im_sizes):

        boxes = rois[:, 1:].clone()

        # scale boxes to the range [0,1]
        scale = boxes.new(boxes.shape).fill_(0)
        for i, s, e in enumerate_by_image(rois[:, 0].long().data):
            h, w = im_sizes[i][:2]
            scale[s:e, 0] = w
            scale[s:e, 1] = h
            scale[s:e, 2] = w
            scale[s:e, 3] = h
        boxes = boxes / scale

        try:
            rects = draw_union_boxes_my(boxes, union_inds,
                                        self.pooling_size * 4 - 1) - 0.5
        except Exception as e:
            # there was a problem with bboxes being larger than images at test time, had to clip them
            print(rois, boxes, im_sizes, scale)
            raise

        if self.concat:
            return torch.cat((union_pools, self.conv(rects)), 1)
        return union_pools + self.conv(rects)
Example #3
def _sort_by_score(im_inds, scores):
    """
    We'll sort everything scorewise from Hi->low, BUT we need to keep images together
    and sort the LSTM input from longest image to shortest.
    :param im_inds: Which im we're on
    :param scores: Goodness ranging between [0, 1]. Higher numbers come FIRST
    :return: Permutation to put everything in the right order for the LSTM
             Inverse permutation
             Lengths for the TxB packed sequence.
    """
    num_im = im_inds[-1] + 1
    rois_per_image = scores.new(num_im)
    lengths = []
    for i, s, e in enumerate_by_image(im_inds):
        rois_per_image[i] = 2 * (s - e) * num_im + i
        lengths.append(e - s)
    lengths = sorted(lengths, reverse=True)
    inds, ls_transposed = transpose_packed_sequence_inds(lengths)  # move it to TxB form
    inds = torch.LongTensor(inds).cuda(im_inds.get_device())

    # ~~~~~~~~~~~~~~~~
    # HACKY CODE ALERT!!!
    # we're sorting primarily by image length (longest image first) and only
    # secondarily by confidence, which is in the range (0, 1)
    # ~~~~~~~~~~~~~~~~
    roi_order = scores - 2 * rois_per_image[im_inds]
    _, perm = torch.sort(roi_order, 0, descending=True)
    perm = perm[inds]
    _, inv_perm = torch.sort(perm)

    return perm, inv_perm, ls_transposed
Example #4
def _PadRelFeats(rel_im_inds,
                 rel_feats_all,
                 num_relation,
                 seq_per_img,
                 pred_classes,
                 obj_classes,
                 rels,
                 freq_matrix=None):
    """

    :param rel_im_inds: torch.LongTensor, [num_rels, ]
    :param rel_feats_all: Variable, [num_rels, 4096]
    :param num_relation: fixed number of relations kept per image (truncate or oversample to this count)
    :param seq_per_img: number of caption sequences per image; features are repeated this many times
    :return: rel_feats, [batch_size*seq_per_img, num_relation, 4096]
    """
    rel_feats = []
    categories_info_all = []
    for i, s, e in enumerate_by_image(rel_im_inds):
        rel_feats_i = rel_feats_all[s:e, :]
        pred_classes_i = pred_classes[s:e][:, None]
        rels_i = rels[s:e, :]
        subj_categories = obj_classes[rels_i[:, 1]][:, None]
        obj_categories = obj_classes[rels_i[:, 2]][:, None]
        categories_info = torch.cat(
            (subj_categories, obj_categories, pred_classes_i), 1)

        # compute frequency baseline: rerank based on triplet frequency
        if freq_matrix is not None:
            categories_info_np = categories_info.data.cpu().numpy()
            freqs = []
            for cat in categories_info_np:
                freqs.append(freq_matrix[cat[0], cat[1], cat[2]])
            sort_index = torch.from_numpy(np.argsort(
                np.array(freqs) * -1)).cuda(rel_feats_all.get_device())
            rel_feats_i = rel_feats_i[sort_index, :]
            rels_i = rels_i[sort_index, :]
            categories_info = categories_info[sort_index, :]

        this_num_rel = e - s
        if num_relation <= this_num_rel:
            rel_feats_i = rel_feats_i[:num_relation, :]
            categories_info = categories_info[:num_relation]
        else:  # oversample
            sample_inds = torch.from_numpy(
                np.random.choice(np.arange(this_num_rel, dtype=np.int32),
                                 num_relation,
                                 replace=True)).long().cuda(
                                     rel_feats_all.get_device())
            rel_feats_i = rel_feats_i[sample_inds]
            categories_info = categories_info[sample_inds]

        rel_feats += [rel_feats_i[None, :, :]] * seq_per_img
        categories_info_all += [categories_info[None, :, :]] * seq_per_img
    rel_feats = torch.cat(rel_feats, 0)
    categories_info_all = torch.cat(categories_info_all, 0)
    return rel_feats, categories_info_all
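The truncate-or-oversample step above guarantees that every image contributes exactly `num_relation` rows, which are then repeated `seq_per_img` times. A minimal standalone sketch of that step on hypothetical toy features (names are illustrative only):

import numpy as np
import torch

def pad_to_fixed_count(feats, num_relation):
    # feats: [this_num_rel, D] ranked features for one image -> [num_relation, D]
    this_num_rel = feats.size(0)
    if this_num_rel >= num_relation:
        return feats[:num_relation]              # keep the top-ranked rows
    # otherwise oversample with replacement, as in the else-branch above
    sample_inds = torch.from_numpy(
        np.random.choice(this_num_rel, num_relation, replace=True)).long()
    return feats[sample_inds]

feats = torch.randn(3, 4096)                     # an image with only 3 relations
print(pad_to_fixed_count(feats, 5).shape)        # torch.Size([5, 4096])
print(pad_to_fixed_count(feats, 2).shape)        # torch.Size([2, 4096])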
Example #5
    def nms_boxes(self, obj_dists, rois, box_deltas, im_sizes):
        """
        Performs NMS on the boxes
        :param obj_dists: [#rois, #classes], ex:[4000+, 151]
        :param rois: [#rois, 5], ex:[4000+, 5]
        :param box_deltas: [#rois, #classes, 4]
        :param im_sizes: sizes of images [6,3]
        :return
            nms_inds [#nms], ex: #nms=384
            nms_scores [#nms]
            nms_labels [#nms]
            nms_boxes_assign [#nms, 4]
            nms_boxes  [#nms, #classes, 4]. classid=0 is the box prior.
        """
        # Convert the "deltas" predicted by the network, together with the prior boxes, into (x1, y1, x2, y2) boxes.
        # box_deltas is (num_rois, num_classes, 4) but rois is only (num_rois, 4)
        # boxes = bbox_preds([#rois * 151, 4]) = [#rois, 151, 4]
        boxes = bbox_preds(
            rois[:, None, 1:].expand_as(box_deltas).contiguous().view(-1, 4),
            box_deltas.view(-1, 4)).view(*box_deltas.size())

        inds = rois[:, 0].long().contiguous()
        dets = []
        # Clip the boxes and get the best N dets per image.
        for i, s, e in enumerate_by_image(inds.data):
            h, w = im_sizes[i, :2]
            boxes[s:e, :, 0].data.clamp_(min=0, max=w - 1)
            boxes[s:e, :, 1].data.clamp_(min=0, max=h - 1)
            boxes[s:e, :, 2].data.clamp_(min=0, max=w - 1)
            boxes[s:e, :, 3].data.clamp_(min=0, max=h - 1)
            d_filtered = filter_det(
                F.softmax(obj_dists[s:e], 1),
                boxes[s:e],
                start_ind=s,
                nms_filter_duplicates=self.nms_filter_duplicates,
                max_per_img=self.max_per_img,
                thresh=self.thresh,
            )
            if d_filtered is not None:
                dets.append(d_filtered)

        # dets is a list: len is 6 (images); each image has (inds, scores, labels), each len is 64
        if len(dets) == 0:
            print("nothing was detected", flush=True)
            return None
        nms_inds, nms_scores, nms_labels = [
            torch.cat(x, 0) for x in zip(*dets)
        ]  # [384]
        twod_inds = nms_inds * boxes.size(1) + nms_labels.data
        nms_boxes_assign = boxes.view(-1, 4)[twod_inds]
        # nms_boxes: [384, 151, 4]; along the class dimension, index 0 is not the background class but the ROI prior box
        # rois[:, 1:][nms_inds][:, None].shape: [384, 1, 4]; boxes[nms_inds][:, 1:]: [384, 150, 4]
        nms_boxes = torch.cat(
            (rois[:, 1:][nms_inds][:, None], boxes[nms_inds][:, 1:]), 1)
        return nms_inds, nms_scores, nms_labels, nms_boxes_assign, nms_boxes, inds[
            nms_inds]
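The line `twod_inds = nms_inds * boxes.size(1) + nms_labels.data` picks, for each kept detection, the box of its predicted class out of the [#rois, #classes, 4] tensor by indexing a flattened view. A small standalone illustration of that indexing trick on toy data (not from the repository):

import torch

num_rois, num_classes = 4, 3
boxes = torch.arange(num_rois * num_classes * 4, dtype=torch.float).view(num_rois, num_classes, 4)

nms_inds = torch.tensor([2, 0])      # kept roi indices
nms_labels = torch.tensor([1, 2])    # predicted class per kept roi

# flatten [num_rois, num_classes, 4] -> [num_rois * num_classes, 4] and pick one
# class-specific box per kept detection, exactly like `twod_inds` above
twod_inds = nms_inds * num_classes + nms_labels
picked = boxes.view(-1, 4)[twod_inds]

# equivalent direct indexing, as a sanity check
assert torch.equal(picked, boxes[nms_inds, nms_labels])
print(picked.shape)                  # torch.Size([2, 4])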
Example #6
    def nms_boxes(self, obj_dists, rois, box_deltas, im_sizes):
        """
        Performs NMS on the boxes
        :param obj_dists: [#rois, #classes]
        :param rois: [#rois, 5]
        :param box_deltas: [#rois, #classes, 4]
        :param im_sizes: sizes of images
        :return
            nms_inds [#nms]
            nms_scores [#nms]
            nms_labels [#nms]
            nms_boxes_assign [#nms, 4]
            nms_boxes  [#nms, #classes, 4]. classid=0 is the box prior.
        """
        # Now produce the boxes
        # box_deltas is (num_rois, num_classes, 4) but rois is only (num_rois, 4)
        boxes = bbox_preds(
            rois[:, None, 1:].expand_as(box_deltas).contiguous().view(-1, 4),
            box_deltas.view(-1, 4)).view(*box_deltas.size())

        # Clip the boxes and get the best N dets per image.
        inds = rois[:, 0].long().contiguous()
        dets = []

        for i, s, e in enumerate_by_image(inds.data):
            h, w = im_sizes[i, :2]
            boxes[s:e, :, 0].data.clamp_(min=0, max=w - 1)
            boxes[s:e, :, 1].data.clamp_(min=0, max=h - 1)
            boxes[s:e, :, 2].data.clamp_(min=0, max=w - 1)
            boxes[s:e, :, 3].data.clamp_(min=0, max=h - 1)

            d_filtered = filter_det(
                F.softmax(obj_dists[s:e], 1),
                boxes[s:e],
                start_ind=s,
                nms_filter_duplicates=self.nms_filter_duplicates,
                max_per_img=self.max_per_img,
                thresh=self.thresh,
            )

            if d_filtered is not None:
                dets.append(d_filtered)
        if len(dets) == 0:
            print("nothing was detected", flush=True)
            return None
        nms_inds, nms_scores, nms_labels = [
            torch.cat(x, 0) for x in zip(*dets)
        ]
        twod_inds = nms_inds * boxes.size(1) + nms_labels.data
        nms_boxes_assign = boxes.view(-1, 4)[twod_inds]

        nms_boxes = torch.cat(
            (rois[:, 1:][nms_inds][:, None], boxes[nms_inds][:, 1:]), 1)
        return nms_inds, nms_scores, nms_labels, nms_boxes_assign, nms_boxes, inds[
            nms_inds]
Example #7
    def get_scaled_boxes(self, boxes, im_inds, im_sizes):
        if self.backbone == 'vgg16_old':
            boxes_scaled = boxes / IM_SCALE
        else:
            boxes_scaled = boxes.clone()
            for im_ind, s, e in enumerate_by_image(im_inds.long().data):
                boxes_scaled[s:e, [0, 2]] = boxes_scaled[
                    s:e, [0, 2]] / im_sizes[im_ind][1]  # width
                boxes_scaled[s:e, [1, 3]] = boxes_scaled[
                    s:e, [1, 3]] / im_sizes[im_ind][0]  # height

        assert boxes_scaled.max() <= 1 + 1e-3, (boxes_scaled.max(),
                                                boxes.max(), im_sizes)

        return boxes_scaled
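For reference, a toy run of the normalization performed in the non-'vgg16_old' branch above, assuming `im_sizes` rows are (height, width) and the boxes are in pixel coordinates (toy values, not from the repository):

import torch

boxes = torch.tensor([[10., 20., 100., 200.],    # rows 0   -> image 0
                      [ 5.,  5.,  50.,  50.]])   # row  1   -> image 1
im_sizes = [(400, 600), (100, 100)]              # (h, w) per image
slices = [(0, 1), (1, 2)]                        # (start, end) per image

scaled = boxes.clone()
for i, (s, e) in enumerate(slices):
    h, w = im_sizes[i]
    scaled[s:e, [0, 2]] = scaled[s:e, [0, 2]] / w   # x1, x2 divided by width
    scaled[s:e, [1, 3]] = scaled[s:e, [1, 3]] / h   # y1, y2 divided by height

print(scaled)
assert scaled.max() <= 1.0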
Example #8
def dummy_nodes(gt_objs, gt_boxes, gt_rels):
    # Add dummy nodes to scene graphs to improve message propagation
    gt_objs_new, gt_boxes_new, gt_rels_new = [], [], []

    gt_rels_lst = [
        gt_rels[s:e] for im, s, e in enumerate_by_image(gt_rels[:, 0])
    ]
    dummy_box = torch.Tensor([0, 0, 1, 1]).view(1, 4).to(gt_boxes)
    offset = 0
    for im, s, e in enumerate_by_image(gt_objs[:, 0]):
        gt_objs_im = gt_objs[s:e]
        n_obj = len(gt_objs_im)

        rels = torch.zeros(
            (n_obj * 2,
             4)).to(gt_rels)  # adding two way edges from/to the dummy node
        rels[:, 0] = im
        for i in range(n_obj):
            # make edges two way as in the visual genome data loader
            for j, in_out in zip([i, i + n_obj], [(1, 2), (2, 1)]):
                rels[j, in_out[0]] = n_obj
                rels[j, in_out[1]] = i

        rels = torch.cat((gt_rels_lst[im].clone(), rels), 0)
        rels[:, 1:3] += offset
        gt_rels_new.append(rels)
        gt_objs_new.append(
            torch.cat(
                (gt_objs_im, torch.Tensor([im, 0]).view(1, 2).to(gt_objs_im)),
                0))
        gt_boxes_new.append(torch.cat((gt_boxes[s:e], dummy_box), 0))
        offset += (n_obj + 1)  # +1 because 1 dummy node is added
        # assert len(torch.cat(gt_objs_new)) == offset, (torch.cat(gt_objs_new).shape, offset)

    return torch.cat(gt_objs_new), torch.cat(gt_boxes_new), torch.cat(
        gt_rels_new)
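The inner loop above connects the dummy node (local index `n_obj`) to every real object in both directions, with relation type 0. A compact sketch of just that edge construction for a hypothetical image with three objects:

import torch

n_obj, im = 3, 0
dummy = n_obj                                  # local index of the dummy node

rels = torch.zeros((n_obj * 2, 4), dtype=torch.long)
rels[:, 0] = im
for i in range(n_obj):
    rels[i, 1], rels[i, 2] = dummy, i                  # dummy -> object i
    rels[i + n_obj, 1], rels[i + n_obj, 2] = i, dummy  # object i -> dummy

print(rels)
# each real object gets one incoming and one outgoing edge to the dummy node;
# the last column (relation type) stays 0, i.e. the background predicate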
Example #9
def pack_vectors(im_inds, vec_inps):
    num_im = int(im_inds[-1] + 1)
    max_num_roi = 0
    im2roi = []
    ls_rois = []
    for i, s, e in enumerate_by_image(im_inds):
        im2roi.append((s, e))
        max_num_roi = max(max_num_roi, e - s)
        ls_rois.append(e - s)
    packed_tensor = Variable(
        torch.FloatTensor(num_im, max_num_roi,
                          vec_inps.shape[1]).fill_(0).cuda(
                              vec_inps.get_device()))
    for i, seg in enumerate(im2roi):
        packed_tensor[i, :ls_rois[i]] = vec_inps[seg[0]:seg[1], :]
    return packed_tensor, np.array(ls_rois)
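`pack_vectors` pads a variable number of ROIs per image into a dense [num_im, max_num_roi, D] tensor. On current PyTorch the same padding can be sketched with `torch.nn.utils.rnn.pad_sequence` (an alternative shown for illustration, not what this code uses):

import torch
from torch.nn.utils.rnn import pad_sequence

def pack_vectors_alt(im_inds, vec_inps):
    # assumes im_inds are grouped by image and start at 0, as elsewhere on this page
    lengths = torch.bincount(im_inds.long()).tolist()
    chunks = list(torch.split(vec_inps, lengths, dim=0))
    packed = pad_sequence(chunks, batch_first=True)   # [num_im, max_roi, D], zero-padded
    return packed, torch.tensor(lengths)

im_inds = torch.tensor([0, 0, 0, 1, 1])
vec_inps = torch.randn(5, 8)
packed, ls_rois = pack_vectors_alt(im_inds, vec_inps)
print(packed.shape, ls_rois)    # torch.Size([2, 3, 8]) tensor([3, 2])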
Example #10
    def forward(self, im_inds, obj_fmaps, obj_logits, rel_inds, vr, obj_labels=None, boxes_per_cls=None):
        """
        Reason relationship classes using knowledge of object and relationship co-occurrence.
        """

        # print(rel_inds.shape)
        # (num_rel, 3)
        if self.mode == 'predcls':
            obj_logits = Variable(onehot_logits(obj_labels.data, self.num_obj_cls))
        obj_probs = F.softmax(obj_logits, 1)

        obj_fmaps = self.obj_proj(obj_fmaps)
        vr = self.rel_proj(vr)
        
        rel_logits = []
        obj_logits_refined = []
        for (_, obj_s, obj_e), (_, rel_s, rel_e) in zip(enumerate_by_image(im_inds.data), enumerate_by_image(rel_inds[:,0])):        
            rl, ol = self.ggnn(rel_inds[rel_s:rel_e, 1:] - obj_s, obj_probs[obj_s:obj_e], obj_fmaps[obj_s:obj_e], vr[rel_s:rel_e])
            rel_logits.append(rl)
            obj_logits_refined.append(ol)

        rel_logits = torch.cat(rel_logits, 0)
        
        if self.ggnn.refine_obj_cls:
            obj_logits_refined = torch.cat(obj_logits_refined, 0)
            obj_logits = obj_logits_refined

        obj_probs = F.softmax(obj_logits, 1)
        if self.mode == 'sgdet' and not self.training:
            # NMS here for baseline            
            nms_mask = obj_probs.data.clone()
            nms_mask.zero_()
            for c_i in range(1, obj_probs.size(1)):
                scores_ci = obj_probs.data[:, c_i]
                boxes_ci = boxes_per_cls.data[:, c_i]

                keep = apply_nms(scores_ci, boxes_ci,
                                    pre_nms_topn=scores_ci.size(0), post_nms_topn=scores_ci.size(0),
                                    nms_thresh=0.3)
                nms_mask[:, c_i][keep] = 1

            obj_preds = Variable(nms_mask * obj_probs.data, volatile=True)[:,1:].max(1)[1] + 1
        else:
            obj_preds = obj_labels if obj_labels is not None else obj_probs[:,1:].max(1)[1] + 1
            
        return obj_logits, obj_preds, rel_logits
Example #11
    def forward(self, im_inds, obj_fmaps, obj_labels):
        """
        Reason object classes using knowledge of object co-occurrence.
        """

        if self.mode == 'predcls':
            # in task 'predcls', there is no need to run GGNN_obj
            obj_dists = Variable(to_onehot(obj_labels.data, self.num_obj_cls))
            return obj_dists
        else:
            input_ggnn = self.obj_proj(obj_fmaps)

            lengths = []
            for i, s, e in enumerate_by_image(im_inds.data):
                lengths.append(e - s)
            obj_cum_add = np.cumsum([0] + lengths)
            obj_dists = torch.cat([self.ggnn_obj(input_ggnn[obj_cum_add[i] : obj_cum_add[i+1]]) for i in range(len(lengths))], 0)
            return obj_dists
Example #12
    def forward(self, union_pools, rois, union_inds, im_sizes):

        if self.edge_model == 'motifs':
            pair_rois = torch.cat(
                (rois[:, 1:][union_inds[:, 0]], rois[:, 1:][union_inds[:, 1]]),
                1).data.cpu().numpy()
            rects = torch.from_numpy(
                self.draw_union_boxes(pair_rois, self.pooling_size * 4 - 1) -
                0.5).to(union_pools)
        elif self.edge_model == 'raw_boxes':
            boxes = rois[:, 1:].clone()
            # scale boxes to the range [0,1]
            scale = boxes.new(boxes.shape).fill_(0).float()
            for i, s, e in enumerate_by_image(rois[:, 0].long().data):
                h, w = im_sizes[i][:2]
                scale[s:e, 0] = w
                scale[s:e, 1] = h
                scale[s:e, 2] = w
                scale[s:e, 3] = h
            boxes = boxes / scale

            try:
                rects = self.draw_union_boxes(boxes, union_inds,
                                              self.pooling_size * 4 - 1) - 0.5
            except Exception as e:
                # there was a problem with bboxes being larger than images at test time, had to clip them
                print(rois, boxes, im_sizes, scale)
                raise

            # to debug:
            # print('rects my', rects.shape, rects.min(), rects.max())
            # np.save('rects.npy', rects.data.cpu().numpy())
            # pair_rois = torch.cat((rois[:, 1:][union_inds[:, 0]], rois[:, 1:][union_inds[:, 1]]), 1).data.cpu().numpy()
            # rects2 = torch.from_numpy(draw_union_boxes(pair_rois, self.pooling_size * 4 - 1) - 0.5).to(union_pools)
            # print('rects2', rects2.shape, rects2.min(), rects2.max())
            # np.save('rects2.npy', rects2.data.cpu().numpy())
            # print(union_inds)
            # raise ValueError('saved')
        else:
            raise NotImplementedError(self.edge_model)

        if self.concat:
            return torch.cat((union_pools, self.conv(rects)), 1)
        return union_pools + self.conv(rects)
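Both this module and Example #2 feed the conv layer a `rects` tensor: for each relation, a two-channel mask at roughly 4x the pooling resolution, with the subject box drawn in one channel and the object box in the other. `draw_union_boxes` itself is not shown on this page; the following is a rough, purely illustrative stand-in for boxes already normalized to [0, 1]:

import torch

def draw_pair_masks(boxes, union_inds, size=27):
    # boxes: [N, 4] in [0, 1]; union_inds: [M, 2] pairs of box indices.
    # Returns [M, 2, size, size] binary masks, one channel per box of the pair,
    # a rough stand-in for what draw_union_boxes is assumed to produce.
    masks = torch.zeros(union_inds.size(0), 2, size, size)
    for m, (i, j) in enumerate(union_inds.tolist()):
        for c, b in enumerate((boxes[i], boxes[j])):
            x1, y1, x2, y2 = (b * (size - 1)).round().long().tolist()
            masks[m, c, y1:y2 + 1, x1:x2 + 1] = 1
    return masks

boxes = torch.tensor([[0.1, 0.1, 0.5, 0.5],
                      [0.4, 0.4, 0.9, 0.9]])
union_inds = torch.tensor([[0, 1]])
print(draw_pair_masks(boxes, union_inds).shape)   # torch.Size([1, 2, 27, 27])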
Example #13
def filter_dets_for_gcn_caption(im_inds, region_feats, obj_scores, obj_classes, rel_inds, pred_scores,
                                rel_rank_scores=None,
                                seq_labels=None, mask_labels=None, coco_ids=None):
    num_box = obj_classes.size(0)
    num_rel = rel_inds.size(0)
    assert rel_inds.size(1) == 3
    assert pred_scores.size(0) == num_rel

    obj_scores0 = obj_scores.data[rel_inds[:, 1]]
    obj_scores1 = obj_scores.data[rel_inds[:, 2]]

    pred_scores_max, pred_classes_argmax = pred_scores.data[:, 1:].max(1)
    pred_classes_argmax = pred_classes_argmax + 1

    rel_scores_argmaxed = pred_scores_max * obj_scores0 * obj_scores1
    if rel_rank_scores is not None:
        rel_scores_argmaxed *= rel_rank_scores.data

    # split the relations according to image
    rel_im_inds = rel_inds[:, 0]

    rels = []
    pred_classes = []
    for i, s, e in enumerate_by_image(rel_im_inds):
        rels_i = rel_inds[s:e, :]
        pred_classes_argmax_i = pred_classes_argmax[s:e]
        rel_scores_argmaxed_i = rel_scores_argmaxed[s:e]
        rel_scores_vs_i, rel_scores_idx_i = torch.sort(rel_scores_argmaxed_i.view(-1), dim=0, descending=True)

        rels_i = rels_i[rel_scores_idx_i]
        pred_classes_argmax_i = pred_classes_argmax_i[rel_scores_idx_i]

        rels.append(rels_i)
        pred_classes.append(pred_classes_argmax_i)
    rels = torch.cat(rels, 0)
    pred_classes = torch.cat(pred_classes, 0)

    return im_inds, region_feats, pred_classes, rels, obj_classes.data, seq_labels, mask_labels, coco_ids
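Both `filter_dets_for_gcn_caption` and the next example rank relations within each image by the product of the best non-background predicate score and the two object scores. A minimal sketch of that ranking for a single hypothetical image (toy numbers, not from the repository):

import torch

obj_scores = torch.tensor([0.9, 0.8, 0.6])           # per-box confidence
rel_inds = torch.tensor([[0, 0, 1],                   # (im_ind, subj_box, obj_box)
                         [0, 1, 2],
                         [0, 2, 0]])
pred_scores = torch.tensor([[0.1, 0.7, 0.2],          # per-relation predicate scores
                            [0.2, 0.3, 0.5],
                            [0.6, 0.3, 0.1]])         # column 0 = background

pred_max, pred_cls = pred_scores[:, 1:].max(1)        # best non-background predicate
pred_cls = pred_cls + 1                               # shift past the background class

rel_score = pred_max * obj_scores[rel_inds[:, 1]] * obj_scores[rel_inds[:, 2]]
_, order = torch.sort(rel_score, descending=True)
print(rel_inds[order], pred_cls[order], rel_score[order])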
Example #14
def filter_dets_for_caption(boxes, obj_scores, obj_classes, rel_inds, pred_scores, rel_feats, image_fmap,
                            rel_rank_scores=None,
                            seq_labels=None, mask_labels=None, coco_ids=None):
    """
        Filters detections....
        :param boxes: [num_box, 4]
        :param obj_scores: [num_box] probabilities for the scores
        :param obj_classes: [num_box] class labels for the topk
        :param rel_inds: [num_rel, 3] TENSOR consisting of (rel_im_inds, box_ind0, box_ind1)
        :param pred_scores: [num_rel, num_predicates]
        :return:
        boxes: FloatTensor
        obj_classes: FloatTensor
        rels: LongTensor, [num_rel, 3]
        pred_classes: LongTensor, [num_rel,]
        rel_feats_all: FloatTensor, [num_rel, 4096]
        seq_labels: [num_img*5, 19], [im_inds, <start>, seq labels, <end>, 0, 0, ...]
        mask_labels: [num_img*5, 19], [im_inds, 1, 1, ..., {1 for <end>}, 0, 0, ...]

        """
    if boxes.dim() != 2:
        raise ValueError("Boxes needs to be [num_box, 4] but its {}".format(boxes.size()))

    num_box = boxes.size(0)
    assert obj_scores.size(0) == num_box

    assert obj_classes.size() == obj_scores.size()
    num_rel = rel_inds.size(0)
    assert rel_inds.size(1) == 3
    assert pred_scores.size(0) == num_rel

    obj_scores0 = obj_scores.data[rel_inds[:, 1]]
    obj_scores1 = obj_scores.data[rel_inds[:, 2]]

    pred_scores_max, pred_classes_argmax = pred_scores.data[:, 1:].max(1)
    pred_classes_argmax = pred_classes_argmax + 1

    rel_scores_argmaxed = pred_scores_max * obj_scores0 * obj_scores1
    if rel_rank_scores is not None:
        rel_scores_argmaxed *= rel_rank_scores.data

    # split the relations according to image
    rel_im_inds = rel_inds[:, 0]

    rels = []
    rel_feats_all = []
    pred_classes = []
    for i, s, e in enumerate_by_image(rel_im_inds):
        rels_i = rel_inds[s:e, :]
        pred_classes_argmax_i = pred_classes_argmax[s:e]
        rel_feats_i = rel_feats[s:e, :]
        rel_scores_argmaxed_i = rel_scores_argmaxed[s:e]
        rel_scores_vs_i, rel_scores_idx_i = torch.sort(rel_scores_argmaxed_i.view(-1), dim=0, descending=True)

        rels_i = rels_i[rel_scores_idx_i]
        pred_classes_argmax_i = pred_classes_argmax_i[rel_scores_idx_i]
        rel_feats_i = rel_feats_i[rel_scores_idx_i]

        rels.append(rels_i)
        rel_feats_all.append(rel_feats_i)
        pred_classes.append(pred_classes_argmax_i)
    rels = torch.cat(rels, 0)
    rel_feats_all = torch.cat(rel_feats_all, 0)
    pred_classes = torch.cat(pred_classes, 0)

    return boxes, obj_classes, rels, Variable(
        pred_classes), rel_feats_all, image_fmap, seq_labels, mask_labels, coco_ids
Example #15
def proposal_assignments_gtbox(rois,
                               gt_boxes,
                               gt_classes,
                               gt_rels,
                               image_offset,
                               RELS_PER_IMG,
                               sample_factor=-1):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    :param rois: [num_rois, 5] of (img_ind, x1, y1, x2, y2)
    :param gt_boxes: [num_boxes, 4] array of (x0, y0, x1, y1). Apparently not needed.
    :param gt_classes: [num_boxes, 2] array of [img_ind, class]
        Note, the img_inds here start at image_offset
    :param gt_rels: [num_rels, 4] array of [img_ind, box_0, box_1, rel type].
        Note, the img_inds here start at image_offset
    :param image_offset: offset subtracted from the image indices in gt_rels
    :param RELS_PER_IMG: maximum number of relation labels per image
    :param sample_factor: if > -1, sample (num_fg * sample_factor) background relations at training time
    :return:
        rois: [num_rois, 5]
        labels: [num_rois] array of labels
        rel_labels: [num_rels, 4] (img ind, box0 ind, box1 ind, rel type)
    """
    im_inds = rois[:, 0].long()

    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:, 0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(im_inds):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    # Try ALL things, not just intersections.
    is_cand = (im_inds[:, None] == im_inds[None])
    is_cand.view(-1)[diagonal_inds(is_cand)] = 0

    # NOW WE HAVE TO EXCLUDE THE FGs.
    is_cand.view(-1)[fg_rels[:, 1] * im_inds.size(0) + fg_rels[:, 2]] = 0
    is_bgcand = torch.nonzero(is_cand)

    # TODO: make this sample on a per image case
    # If too many then sample
    num_fg = min(fg_rels.size(0), int(RELS_PER_IMG * REL_FG_FRACTION * num_im))
    if num_fg < fg_rels.size(0):
        fg_rels = random_choose(fg_rels, num_fg)

    # If too many then sample
    is_train = num_im > 1  # assume num_im = 1 at test time (except for the det mode, which we don't use for now)
    sample_bg = is_train and sample_factor > -1

    num_bg = min(
        is_bgcand.size(0) if is_bgcand.dim() > 0 else 0,
        int(num_fg * sample_factor) if sample_bg else
        (int(RELS_PER_IMG * num_im) -
         num_fg))  # sample num_fg at training time

    if num_bg > 0:
        bg_rels = torch.cat((
            im_inds[is_bgcand[:, 0]][:, None],
            is_bgcand,
            (is_bgcand[:, 0, None] < -10).long(),
        ), 1)

        if num_bg < is_bgcand.size(0):
            bg_rels = random_choose(
                bg_rels, num_bg
            )  # at test time will correspond to the baseline approach

        rel_labels = torch.cat((fg_rels, bg_rels), 0)
    else:
        rel_labels = fg_rels

    # last sort by rel.
    _, perm = torch.sort(rel_labels[:, 0] * (gt_boxes.size(0)**2) +
                         rel_labels[:, 1] * gt_boxes.size(0) +
                         rel_labels[:, 2])

    rel_labels = rel_labels[perm].contiguous()

    labels = gt_classes[:, 1].contiguous()
    return rois, labels, rel_labels
Example #16
def rel_proposal_target(rois, rel_proposal_inds, gt_boxes, gt_classes, gt_rels, image_offset, mode):
    """
    Assign the target for each proposal pair.
    When the mode is predcls or sgcls, the target is obtained directly by comparison with gt_rels.
    When the mode is sgdet, the target is sampled by first computing IoU with the GT box pairs.
    :param rois:
    :param rel_proposal_inds: [im_ind, ind1, ind2]
    :param gt_boxes:
    :param image_offset:
    :param mode:
    :return:
    """
    im_inds = rois[:, 0].long()

    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:, 0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(gt_classes[:, 0]):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    rels_to_gt = []
    num_gt_rels_seen = 0

    if mode in ('predcls', 'sgcls'):
        rel_proposal_inds_np = rel_proposal_inds.cpu().numpy()
        fg_rels_np = fg_rels.cpu().numpy()  ## Ngtp, 4

        # locate the proposal
        locate_inds = np.where(intersect_2d(rel_proposal_inds_np, fg_rels_np[:, :-1]))
        proposal_to_gt = defaultdict(list)
        for ind in zip(*locate_inds):
            proposal_to_gt[ind[0]].append(ind[1])
        for k, v in proposal_to_gt.items():
            v0 = v[0] if len(v) == 1 else np.random.choice(v)
            proposal_to_gt[k] = v0



        fg_proposal_inds = np.array(list(proposal_to_gt.keys())).astype(np.int32)
        bg_proposal_inds = np.array(list(set(list(range(rel_proposal_inds_np.shape[0]))) - set(list(proposal_to_gt.keys())))).astype(np.int32)

        rels_to_gt = np.ones(fg_proposal_inds.shape[0] + bg_proposal_inds.shape[0], dtype=np.int64) * -1
        if len(fg_proposal_inds) > 0:
            rels_to_gt[fg_proposal_inds] = np.array([proposal_to_gt[ind] for ind in fg_proposal_inds])

        num_fg = min(fg_proposal_inds.size, int(RELS_BATCHSIZE * REL_FG_FRACTION * num_im))
        if num_fg < fg_proposal_inds.size:
            fg_proposal_inds = np.random.choice(fg_proposal_inds, num_fg, replace=False)
        num_bg = min(bg_proposal_inds.size if bg_proposal_inds.size else 0, int(RELS_BATCHSIZE * num_im) - num_fg)
        if num_bg < bg_proposal_inds.size:
            bg_proposal_inds = np.random.choice(bg_proposal_inds, num_bg, replace=False)

        if len(fg_proposal_inds) == 0:
            bg_labels = np.zeros(bg_proposal_inds.size)
            bg_rel_labels = np.hstack((rel_proposal_inds_np[bg_proposal_inds], bg_labels[:, None]))
            proposal_labels = bg_rel_labels
        else:
            fg_labels = np.array([fg_rels[proposal_to_gt[ind], -1] for ind in fg_proposal_inds])
            fg_rel_labels = np.hstack((rel_proposal_inds_np[fg_proposal_inds], fg_labels[:, None]))

            bg_labels = np.zeros(bg_proposal_inds.size)
            bg_rel_labels = np.hstack((rel_proposal_inds_np[bg_proposal_inds], bg_labels[:, None]))
            proposal_labels = np.vstack((fg_rel_labels, bg_rel_labels))

            rels_to_gt = np.hstack((rels_to_gt[fg_proposal_inds], rels_to_gt[bg_proposal_inds]))

        proposal_labels = torch.LongTensor(proposal_labels).cuda(gt_rels.get_device())
        rels_to_gt = torch.LongTensor(rels_to_gt).cuda(gt_rels.get_device())
    else:
        assert mode == 'sgdet'

        gt_box_pairs = torch.cat((gt_boxes[fg_rels[:, 1]], gt_boxes[fg_rels[:, 2]]), 1)
        rel_proposal_pairs = torch.cat((rois[:, 1:][rel_proposal_inds[:, 0]], rois[:, 1:][rel_proposal_inds[:, 1]]), 1)

        num_pairs = np.zeros(num_im + 1).astype(np.int32)
        for i, s, e in enumerate_by_image(rel_proposal_inds[:, 0]):
            num_pairs[i + 1] = e - s

        cumsum_num_pairs = np.cumsum(num_pairs).astype(np.int32)
        fg_rel_per_image = int(RELS_BATCHSIZE * REL_FG_FRACTION)

        proposal_labels = []
        gt_rel_labels = fg_rels[:, -1].contiguous().view(-1)
        for i in range(1, num_im + 1):
            rel_proposal_inds_i = rel_proposal_inds[cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]]
            rel_proposal_pairs_i = rel_proposal_pairs[cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]]
            gt_box_pairs_i = gt_box_pairs[torch.nonzero(fg_rels[:, 0] == (i - 1)).view(-1)]

            gt_box_pairs_label_i = gt_rel_labels[torch.nonzero(fg_rels[:, 0] == (i - 1)).view(-1)].view(-1).contiguous()

            overlaps = co_bbox_overlaps(rel_proposal_pairs_i, gt_box_pairs_i)  # Np, Ngtp
            max_overlaps, gt_assignment = torch.max(overlaps, 1)  # Np
            fg_inds = torch.nonzero(max_overlaps >= 0.5).view(-1)
            fg_num = fg_inds.numel()

            bg_inds = torch.nonzero((max_overlaps < 0.5) & (max_overlaps >= 0.0)).view(-1)
            bg_num = bg_inds.numel()

            rels_to_gt_i = torch.LongTensor(rel_proposal_pairs_i.shape[0]).fill_(-1).cuda(gt_rels.get_device())
            rels_to_gt_i[fg_inds] = gt_assignment[fg_inds] + num_gt_rels_seen

            if fg_num > 0 and bg_num > 0:
                fg_this_image = min(fg_rel_per_image, fg_num)
                rand_num = torch.from_numpy(np.random.permutation(fg_num)).long().cuda()
                fg_inds = fg_inds[rand_num[:fg_this_image]]

                # sampling bg
                bg_this_image = RELS_BATCHSIZE - fg_this_image
                rand_num = np.floor(np.random.rand(bg_this_image) * bg_num)
                rand_num = torch.from_numpy(rand_num).long().cuda()
                bg_inds = bg_inds[rand_num]

                rels_to_gt_i = torch.cat((rels_to_gt_i[fg_inds], rels_to_gt_i[bg_inds]), 0)

            elif fg_num > 0 and bg_num == 0:
                rand_num = np.floor(np.random.rand(RELS_BATCHSIZE) * fg_num)
                rand_num = torch.from_numpy(rand_num).long().cuda()
                fg_inds = fg_inds[rand_num]
                fg_this_image = RELS_BATCHSIZE
                bg_this_image = 0
                rels_to_gt_i = rels_to_gt_i[fg_inds]
            elif bg_num > 0 and fg_num == 0:
                # sampling bg
                # rand_num = torch.floor(torch.rand(rois_per_image) * bg_num_rois).long().cuda()
                rand_num = np.floor(np.random.rand(RELS_BATCHSIZE) * bg_num)
                rand_num = torch.from_numpy(rand_num).long().cuda()

                bg_inds = bg_inds[rand_num]
                bg_this_image = RELS_BATCHSIZE
                fg_this_image = 0
                rels_to_gt_i = rels_to_gt_i[bg_inds]
            else:
                import pdb
                pdb.set_trace()

            keep_inds = torch.cat([fg_inds, bg_inds], 0)
            rel_proposal_inds_i = rel_proposal_inds_i[keep_inds]
            labels_i = gt_box_pairs_label_i[gt_assignment[keep_inds]]
            if fg_this_image < labels_i.size(0):
                labels_i[fg_this_image:] = 0
            rels_to_gt.append(rels_to_gt_i)
            num_gt_rels_seen += gt_box_pairs_i.shape[0]
            #try:
            #    labels_i[fg_this_image:] = 0
            #except ValueError:
            #    print(labels_i)
            #    print(fg_this_image)
            #    import pdb
            #    pdb.set_trace()
            proposal_labels.append(torch.cat((rel_proposal_inds_i, labels_i[:, None]), 1))
        proposal_labels = torch.cat(proposal_labels, 0)
        rels_to_gt = torch.cat(rels_to_gt, 0)

    # sort
    _, perm = torch.sort(
        proposal_labels[:, 0] * (rois.size(0) ** 2) + proposal_labels[:, 1] * rois.size(0) + proposal_labels[:, 2])
    proposal_labels = proposal_labels[perm].contiguous()
    rels_to_gt = rels_to_gt[perm].contiguous()

    return proposal_labels, rels_to_gt
Example #17
def convert_roi_to_list(rois):
    rois_lst = []
    for im_ind, s, e in enumerate_by_image(rois[:, 0].long().data):
        rois_lst.append(rois[s:e, 1:])

    return rois_lst
Example #18
    def forward(self,
                x,
                im_sizes,
                image_offset,
                gt_boxes=None,
                gt_classes=None,
                gt_rels=None,
                proposals=None,
                train_anchor_inds=None,
                return_fmap=False):
        """
        Forward pass for Relation detection
        Args:
            x: Images@[batch_size, 3, IM_SIZE, IM_SIZE]
            im_sizes: A numpy array of (h, w, scale) for each image.
            image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0)

            parameters for training:
            gt_boxes: [num_gt, 4] GT boxes over the batch.
            gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class)
            gt_rels:
            proposals:
            train_anchor_inds: a [num_train, 2] array of indices for the anchors that will
                                  be used to compute the training loss. Each (img_ind, fpn_idx)
            return_fmap:

        Returns:
            If train:
                scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels
            If test:
                prob dists, boxes, img inds, maxscores, classes
        """
        result = self.detector(x,
                               im_sizes,
                               image_offset,
                               gt_boxes,
                               gt_classes,
                               gt_rels,
                               proposals,
                               train_anchor_inds,
                               return_fmap=True)

        assert not result.is_none(), 'Empty detection result'

        # image_offset refers to the Blob offset (self.batch_size_per_gpu * index)
        im_inds = result.im_inds - image_offset
        boxes = result.rm_box_priors
        obj_scores, box_classes = F.softmax(
            result.rm_obj_dists[:, 1:].contiguous(), dim=1).max(1)
        box_classes += 1

        num_img = im_inds[-1] + 1

        # embed(header='rel_model.py before rel_assignments')
        if self.training and result.rel_labels is None:
            assert self.mode == 'sgdet'

            # only in sgdet mode

            # shapes:
            # im_inds: (box_num,)
            # boxes: (box_num, 4)
            # rm_obj_labels: (box_num,)
            # gt_boxes: (box_num, 4)
            # gt_classes: (box_num, 2) maybe[im_ind, class_ind]
            # gt_rels: (rel_num, 4)
            # image_offset: integer
            result.rel_labels = rel_assignments(im_inds.data,
                                                boxes.data,
                                                result.rm_obj_labels.data,
                                                gt_boxes.data,
                                                gt_classes.data,
                                                gt_rels.data,
                                                image_offset,
                                                filter_non_overlap=True,
                                                num_sample_per_gt=1)
        rel_inds = self.get_rel_inds(result.rel_labels, im_inds, boxes)
        rois = torch.cat((im_inds[:, None].float(), boxes), 1)
        # union boxes feats (NumOfRels, obj_dim)
        union_box_feats = self.visual_rep(result.fmap.detach(), rois,
                                          rel_inds[:, 1:].contiguous())
        # single box feats (NumOfBoxes, feats)
        box_feats = self.obj_feature_map(result.fmap.detach(), rois)
        # box spatial feats (NumOfBox, 4)
        bboxes = Variable(center_size(boxes.data))
        sub_bboxes = bboxes[rel_inds[:, 1].contiguous()]
        obj_bboxes = bboxes[rel_inds[:, 2].contiguous()]

        obj_bboxes[:, :2] = obj_bboxes[:, :2].contiguous(
        ) - sub_bboxes[:, :2].contiguous()  # x-y
        obj_bboxes[:, 2:] = obj_bboxes[:, 2:].contiguous(
        ) / sub_bboxes[:, 2:].contiguous()  # w/h
        obj_bboxes[:, :2] /= sub_bboxes[:, 2:].contiguous()  # x-y/h
        obj_bboxes[:, 2:] = torch.log(obj_bboxes[:,
                                                 2:].contiguous())  # log(w/h)

        bbox_spatial_feats = self.spatial_fc(obj_bboxes)

        box_word = self.classes_word_embedding(box_classes)
        box_pair_word = torch.cat((box_word[rel_inds[:, 1].contiguous()],
                                   box_word[rel_inds[:, 2].contiguous()]), 1)
        box_word_feats = self.word_fc(box_pair_word)

        # (NumOfRels, DIM=)
        box_pair_feats = torch.cat(
            (union_box_feats, bbox_spatial_feats, box_word_feats), 1)

        box_pair_score = self.relpn_fc(box_pair_feats)
        #embed(header='filter_rel_labels')
        if self.training:
            pn_rel_label = list()
            pn_pair_score = list()
            #print(result.rel_labels.shape)
            #print(result.rel_labels[:, 0].contiguous().squeeze())
            for i, s, e in enumerate_by_image(
                    result.rel_labels[:, 0].data.contiguous()):
                im_i_rel_label = result.rel_labels[s:e].contiguous()
                im_i_box_pair_score = box_pair_score[s:e].contiguous()

                im_i_rel_fg_inds = torch.nonzero(
                    im_i_rel_label[:, -1].contiguous()).squeeze()
                im_i_rel_fg_inds = im_i_rel_fg_inds.data.cpu().numpy()
                im_i_fg_sample_num = min(RELEVANT_PER_IM,
                                         im_i_rel_fg_inds.shape[0])
                if im_i_rel_fg_inds.size > 0:
                    im_i_rel_fg_inds = np.random.choice(
                        im_i_rel_fg_inds,
                        size=im_i_fg_sample_num,
                        replace=False)

                im_i_rel_bg_inds = torch.nonzero(
                    im_i_rel_label[:, -1].contiguous() == 0).squeeze()
                im_i_rel_bg_inds = im_i_rel_bg_inds.data.cpu().numpy()
                im_i_bg_sample_num = min(EDGES_PER_IM - im_i_fg_sample_num,
                                         im_i_rel_bg_inds.shape[0])
                if im_i_rel_bg_inds.size > 0:
                    im_i_rel_bg_inds = np.random.choice(
                        im_i_rel_bg_inds,
                        size=im_i_bg_sample_num,
                        replace=False)

                #print('{}/{} fg/bg in image {}'.format(im_i_fg_sample_num, im_i_bg_sample_num, i))
                result.rel_sample_pos = torch.Tensor(
                    [im_i_fg_sample_num]).cuda(im_i_rel_label.get_device())
                result.rel_sample_neg = torch.Tensor(
                    [im_i_bg_sample_num]).cuda(im_i_rel_label.get_device())

                im_i_keep_inds = np.append(im_i_rel_fg_inds, im_i_rel_bg_inds)
                im_i_pair_score = im_i_box_pair_score[
                    im_i_keep_inds.tolist()].contiguous()

                im_i_rel_pn_labels = Variable(
                    torch.zeros(im_i_fg_sample_num + im_i_bg_sample_num).type(
                        torch.LongTensor).cuda(x.get_device()))
                im_i_rel_pn_labels[:im_i_fg_sample_num] = 1

                pn_rel_label.append(im_i_rel_pn_labels)
                pn_pair_score.append(im_i_pair_score)

            result.rel_pn_dists = torch.cat(pn_pair_score, 0)
            result.rel_pn_labels = torch.cat(pn_rel_label, 0)

        box_pair_relevant = F.softmax(box_pair_score, dim=1)
        box_pos_pair_ind = torch.nonzero(box_pair_relevant[:, 1].contiguous(
        ) > box_pair_relevant[:, 0].contiguous()).squeeze()

        if box_pos_pair_ind.data.shape == torch.Size([]):
            return None
        #print('{}/{} trim edges'.format(box_pos_pair_ind.size(0), rel_inds.size(0)))
        result.rel_trim_pos = torch.Tensor([box_pos_pair_ind.size(0)]).cuda(
            box_pos_pair_ind.get_device())
        result.rel_trim_total = torch.Tensor([rel_inds.size(0)
                                              ]).cuda(rel_inds.get_device())

        # filtering relations
        filter_rel_inds = rel_inds[box_pos_pair_ind.data]
        filter_box_pair_feats = box_pair_feats[box_pos_pair_ind.data]
        if self.training:
            filter_rel_labels = result.rel_labels[box_pos_pair_ind.data]
            result.rel_labels = filter_rel_labels

        # message passing between boxes and relations
        #embed(header='mp')
        for _ in range(self.mp_iter_num):
            box_feats = self.message_passing(box_feats, filter_box_pair_feats,
                                             filter_rel_inds)
        box_cls_scores = self.cls_fc(box_feats)
        result.rm_obj_dists = box_cls_scores
        obj_scores, box_classes = F.softmax(box_cls_scores[:, 1:].contiguous(),
                                            dim=1).max(1)
        box_classes += 1  # skip background

        # TODO: add memory module
        # filter_box_pair_feats is to be added to memory
        # fbiilter_box_pair_feats = self.memory_()

        # filter_box_pair_feats is to be added to memory

        # RelationCNN
        filter_box_pair_feats_fc1 = self.relcnn_fc1(filter_box_pair_feats)
        filter_box_pair_score = self.relcnn_fc2(filter_box_pair_feats_fc1)
        if not self.graph_cons:
            filter_box_pair_score = filter_box_pair_score.view(
                -1, 2, self.num_rels)
        result.rel_dists = filter_box_pair_score

        if self.training:
            return result

        pred_scores = F.softmax(result.rel_dists, dim=1)
        """
        filter_dets
        boxes: bbox regression else [num_box, 4]
        obj_scores: [num_box] probabilities for the scores
        obj_classes: [num_box] class labels integer
        rel_inds: [num_rel, 2] TENSOR consisting of (box_ind0, box_ind1)
        pred_scores: [num_rel, num_predicates] including the irrelevant class (#relclass + 1)
        """
        return filter_dets(boxes, obj_scores, box_classes,
                           filter_rel_inds[:, 1:].contiguous(), pred_scores)
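The `sub_bboxes` / `obj_bboxes` arithmetic in this forward pass encodes the object box relative to the subject box as ((cx_o - cx_s)/w_s, (cy_o - cy_s)/h_s, log(w_o/w_s), log(h_o/h_s)) before `self.spatial_fc`. A standalone sketch of that encoding, assuming `center_size` returns boxes in (cx, cy, w, h) form:

import torch

def relative_box_encoding(sub_cs, obj_cs):
    # sub_cs, obj_cs: [N, 4] boxes in (cx, cy, w, h) form.
    # Mirrors the in-place arithmetic above, written out explicitly.
    dxy = (obj_cs[:, :2] - sub_cs[:, :2]) / sub_cs[:, 2:]   # (dcx/ws, dcy/hs)
    dwh = torch.log(obj_cs[:, 2:] / sub_cs[:, 2:])          # (log wo/ws, log ho/hs)
    return torch.cat((dxy, dwh), 1)

sub = torch.tensor([[50., 50., 20., 40.]])
obj = torch.tensor([[60., 70., 10., 80.]])
print(relative_box_encoding(sub, obj))
# tensor([[ 0.5000,  0.5000, -0.6931,  0.6931]])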
Example #19
def rel_assignments_sgcls(rois, gt_boxes, gt_classes, gt_rels, image_offset):
    """
    Sample relations to balance the proportion of positive and negative samples.
    :param rois: [num_rois, 5] of (img_ind, x1, y1, x2, y2)
    :param gt_boxes: [num_boxes, 4] array of (x0, y0, x1, y1). Apparently not needed.
    :param gt_classes: [num_boxes, 2] array of [img_ind, class]
        Note, the img_inds here start at image_offset
    :param gt_rels: [num_rels, 4] array of [img_ind, box_0, box_1, rel type].
        Note, the img_inds here start at image_offset
    :param image_offset: offset subtracted from the image indices in gt_rels
    :return:
        rois: [num_rois, 5]
        labels: [num_rois] array of labels
        rel_labels: [num_rels, 4] (img ind, box0 ind, box1 ind, rel type)
    """
    im_inds = rois[:,0].long()

    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:,0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(im_inds):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    # Try ALL things, not just intersections.
    is_cand = (im_inds[:, None] == im_inds[None])
    is_cand.view(-1)[diagonal_inds(is_cand)] = 0

    # # Compute salience
    # gt_inds = fg_rels[:, 1:3].contiguous().view(-1)
    # labels_arange = labels.data.new(labels.size(0))
    # torch.arange(0, labels.size(0), out=labels_arange)
    # salience_labels = ((gt_inds[:, None] == labels_arange[None]).long().sum(0) > 0).long()
    # labels = torch.stack((labels, salience_labels), 1)

    # Add in some BG labels

    # NOW WE HAVE TO EXCLUDE THE FGs.
    # TODO: check if this causes an error if many duplicate GTs havent been filtered out

    is_cand.view(-1)[fg_rels[:,1]*im_inds.size(0) + fg_rels[:,2]] = 0
    is_bgcand = is_cand.nonzero()
    # TODO: make this sample on a per image case
    # If too many then sample
    num_fg = min(fg_rels.size(0), int(RELS_PER_IMG * REL_FG_FRACTION * num_im))
    if num_fg < fg_rels.size(0):
        fg_rels = random_choose(fg_rels, num_fg)

    # If too many then sample
    num_bg = min(is_bgcand.size(0) if is_bgcand.dim() > 0 else 0,
                 int(RELS_PER_IMG * num_im) - num_fg)
    if num_bg > 0:
        bg_rels = torch.cat((
            im_inds[is_bgcand[:, 0]][:, None],
            is_bgcand,
            (is_bgcand[:, 0, None] < -10).long(),
        ), 1)

        if num_bg < is_bgcand.size(0):
            bg_rels = random_choose(bg_rels, num_bg)
        rel_labels = torch.cat((fg_rels, bg_rels), 0)
    else:
        rel_labels = fg_rels


    # last sort by rel.
    _, perm = torch.sort(rel_labels[:, 0]*(gt_boxes.size(0)**2) +
                         rel_labels[:,1]*gt_boxes.size(0) + rel_labels[:,2])

    rel_labels = rel_labels[perm].contiguous()

    labels = gt_classes[:,1].contiguous()
    return rois, labels, rel_labels
Example #20
    def forward(self,
                x,
                im_sizes,
                image_offset,
                gt_boxes=None,
                gt_classes=None,
                gt_rels=None,
                proposals=None,
                train_anchor_inds=None,
                return_fmap=False):
        """
        Forward pass for Relation detection
        Args:
            x: Images@[batch_size, 3, IM_SIZE, IM_SIZE]
            im_sizes: A numpy array of (h, w, scale) for each image.
            image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0)

            parameters for training:
            gt_boxes: [num_gt, 4] GT boxes over the batch.
            gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class)
            gt_rels:
            proposals:
            train_anchor_inds: a [num_train, 2] array of indices for the anchors that will
                                  be used to compute the training loss. Each (img_ind, fpn_idx)
            return_fmap:

        Returns:
            If train:
                scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels
            If test:
                prob dists, boxes, img inds, maxscores, classes
        """
        s_t = time.time()
        verbose = False

        def check(sl, een, sst=s_t):
            if verbose:
                print('{}{}'.format(sl, een - sst))

        result = self.detector(x,
                               im_sizes,
                               image_offset,
                               gt_boxes,
                               gt_classes,
                               gt_rels,
                               proposals,
                               train_anchor_inds,
                               return_fmap=True)
        check('detector', tt())

        assert not result.is_none(), 'Empty detection result'

        # image_offset refers to the Blob offset (self.batch_size_per_gpu * index)
        im_inds = result.im_inds - image_offset
        boxes = result.rm_box_priors
        obj_scores, box_classes = F.softmax(
            result.rm_obj_dists[:, 1:].contiguous(), dim=1).max(1)
        box_classes += 1
        # TODO: predcls implementation obj_scores and box_classes

        num_img = im_inds[-1] + 1

        # embed(header='rel_model.py before rel_assignments')
        if self.training and result.rel_labels is None:
            assert self.mode == 'sgdet'

            # only in sgdet mode

            # shapes:
            # im_inds: (box_num,)
            # boxes: (box_num, 4)
            # rm_obj_labels: (box_num,)
            # gt_boxes: (box_num, 4)
            # gt_classes: (box_num, 2) maybe[im_ind, class_ind]
            # gt_rels: (rel_num, 4)
            # image_offset: integer
            result.rel_labels = rel_assignments(im_inds.data,
                                                boxes.data,
                                                result.rm_obj_labels.data,
                                                gt_boxes.data,
                                                gt_classes.data,
                                                gt_rels.data,
                                                image_offset,
                                                filter_non_overlap=True,
                                                num_sample_per_gt=1)
        rel_inds = self.get_rel_inds(result.rel_labels, im_inds, boxes)
        rois = torch.cat((im_inds[:, None].float(), boxes), 1)
        # union boxes feats (NumOfRels, obj_dim)
        union_box_feats = self.visual_rep(result.fmap.detach(), rois,
                                          rel_inds[:, 1:].contiguous())
        # single box feats (NumOfBoxes, feats)
        box_feats = self.obj_feature_map(result.fmap.detach(), rois)
        # box spatial feats (NumOfBox, 4)

        box_pair_feats = self.fuse_message(union_box_feats, boxes, box_classes,
                                           rel_inds)
        box_pair_score = self.relpn_fc(box_pair_feats)

        if self.training:
            # sampling pos and neg relations here for training
            rel_sample_pos, rel_sample_neg = 0, 0
            pn_rel_label, pn_pair_score = list(), list()
            for i, s, e in enumerate_by_image(
                    result.rel_labels[:, 0].data.contiguous()):
                im_i_rel_label = result.rel_labels[s:e].contiguous()
                im_i_box_pair_score = box_pair_score[s:e].contiguous()

                im_i_rel_fg_inds = torch.nonzero(
                    im_i_rel_label[:, -1].contiguous()).squeeze()
                im_i_rel_fg_inds = im_i_rel_fg_inds.data.cpu().numpy()
                im_i_fg_sample_num = min(RELEVANT_PER_IM,
                                         im_i_rel_fg_inds.shape[0])
                if im_i_rel_fg_inds.size > 0:
                    im_i_rel_fg_inds = np.random.choice(
                        im_i_rel_fg_inds,
                        size=im_i_fg_sample_num,
                        replace=False)

                im_i_rel_bg_inds = torch.nonzero(
                    im_i_rel_label[:, -1].contiguous() == 0).squeeze()
                im_i_rel_bg_inds = im_i_rel_bg_inds.data.cpu().numpy()
                im_i_bg_sample_num = min(EDGES_PER_IM - im_i_fg_sample_num,
                                         im_i_rel_bg_inds.shape[0])
                if im_i_rel_bg_inds.size > 0:
                    im_i_rel_bg_inds = np.random.choice(
                        im_i_rel_bg_inds,
                        size=im_i_bg_sample_num,
                        replace=False)

                #print('{}/{} fg/bg in image {}'.format(im_i_fg_sample_num, im_i_bg_sample_num, i))
                rel_sample_pos += im_i_fg_sample_num
                rel_sample_neg += im_i_bg_sample_num

                im_i_keep_inds = np.append(im_i_rel_fg_inds, im_i_rel_bg_inds)
                im_i_pair_score = im_i_box_pair_score[
                    im_i_keep_inds.tolist()].contiguous()

                im_i_rel_pn_labels = Variable(
                    torch.zeros(im_i_fg_sample_num + im_i_bg_sample_num).type(
                        torch.LongTensor).cuda(x.get_device()))
                im_i_rel_pn_labels[:im_i_fg_sample_num] = 1

                pn_rel_label.append(im_i_rel_pn_labels)
                pn_pair_score.append(im_i_pair_score)

            result.rel_pn_dists = torch.cat(pn_pair_score, 0)
            result.rel_pn_labels = torch.cat(pn_rel_label, 0)
            result.rel_sample_pos = torch.Tensor([rel_sample_pos]).cuda(
                im_i_rel_label.get_device())
            result.rel_sample_neg = torch.Tensor([rel_sample_neg]).cuda(
                im_i_rel_label.get_device())

        box_pair_relevant = F.softmax(box_pair_score, dim=1)
        box_pos_pair_ind = torch.nonzero(box_pair_relevant[:, 1].contiguous(
        ) > box_pair_relevant[:, 0].contiguous()).squeeze()

        if box_pos_pair_ind.data.shape == torch.Size([]):
            return None
        #print('{}/{} trim edges'.format(box_pos_pair_ind.size(0), rel_inds.size(0)))
        result.rel_trim_pos = torch.Tensor([box_pos_pair_ind.size(0)]).cuda(
            box_pos_pair_ind.get_device())
        result.rel_trim_total = torch.Tensor([rel_inds.size(0)
                                              ]).cuda(rel_inds.get_device())

        if self.trim_graph:
            # filtering relations
            filter_rel_inds = rel_inds[box_pos_pair_ind.data]
            filter_box_pair_feats = box_pair_feats[box_pos_pair_ind.data]
        else:
            filter_rel_inds = rel_inds
            filter_box_pair_feats = box_pair_feats
        if self.training:
            if self.trim_graph:
                filter_rel_labels = result.rel_labels[box_pos_pair_ind.data]
            else:
                filter_rel_labels = result.rel_labels
            num_gt_filtered = torch.nonzero(filter_rel_labels[:, -1])
            if num_gt_filtered.shape == torch.Size([]):
                num_gt_filtered = 0
            else:
                num_gt_filtered = num_gt_filtered.size(0)
            num_gt_orignial = torch.nonzero(result.rel_labels[:, -1]).size(0)
            result.rel_pn_recall = torch.Tensor(
                [num_gt_filtered / num_gt_orignial]).cuda(x.get_device())
            result.rel_labels = filter_rel_labels
        check('trim', tt())

        # message passing between boxes and relations
        if self.mode in ('sgcls', 'sgdet'):
            for _ in range(self.mp_iter_num):
                box_feats = self.message_passing(box_feats,
                                                 filter_box_pair_feats,
                                                 filter_rel_inds)
            box_cls_scores = self.cls_fc(box_feats)
            result.rm_obj_dists = box_cls_scores
            obj_scores, box_classes = F.softmax(
                box_cls_scores[:, 1:].contiguous(), dim=1).max(1)
            box_classes += 1  # skip background
        check('mp', tt())

        # RelationCNN
        filter_box_pair_feats_fc1 = self.relcnn_fc1(filter_box_pair_feats)
        filter_box_pair_score = self.relcnn_fc2(filter_box_pair_feats_fc1)

        result.rel_dists = filter_box_pair_score
        pred_scores_stage_one = F.softmax(result.rel_dists, dim=1).data

        # filter_box_pair_feats is to be added to memory
        if self.training:
            padded_filter_feats, pack_lengths, re_filter_rel_inds, padded_rel_labels = \
                self.pad_sequence(
                    filter_rel_inds,
                    filter_box_pair_feats_fc1,
                    rel_labels=result.rel_labels
                )
        else:
            padded_filter_feats, pack_lengths, re_filter_rel_inds, padded_rel_inds = \
                self.pad_sequence(
                    filter_rel_inds,
                    filter_box_pair_feats_fc1
                )

        # trim zero lengths so that images with no relations are dropped before packing
        trim_pack_lengths = np.trim_zeros(pack_lengths)
        trim_padded_filter_feats = padded_filter_feats[:trim_pack_lengths.
                                                       shape[0]]
        packed_filter_feats = pack_padded_sequence(trim_padded_filter_feats,
                                                   trim_pack_lengths,
                                                   batch_first=True)
        if self.training:
            trim_padded_rel_labels = padded_rel_labels[:trim_pack_lengths.
                                                       shape[0]]
            packed_rel_labels = pack_padded_sequence(trim_padded_rel_labels,
                                                     trim_pack_lengths,
                                                     batch_first=True)
            rel_mem_dists = self.mem_module(inputs=packed_filter_feats,
                                            rel_labels=packed_rel_labels)
            rel_mem_dists = self.re_order_packed_seq(rel_mem_dists,
                                                     filter_rel_inds,
                                                     re_filter_rel_inds)
            result.rel_mem_dists = rel_mem_dists
        else:
            trim_padded_rel_inds = padded_rel_inds[:trim_pack_lengths.shape[0]]
            packed_rel_inds = pack_padded_sequence(trim_padded_rel_inds,
                                                   trim_pack_lengths,
                                                   batch_first=True)
            rel_mem_dists = self.mem_module(inputs=packed_filter_feats,
                                            rel_inds=packed_rel_inds,
                                            obj_classes=box_classes)
            rel_mem_probs = self.re_order_packed_seq(rel_mem_dists,
                                                     filter_rel_inds,
                                                     re_filter_rel_inds)
            rel_mem_probs = rel_mem_probs.data

        check('mem', tt())
        if self.training:
            return result

        # fall back to the stage-one scores for any relation whose rel_mem_probs sum to zero
        for rel_i in range(rel_mem_probs.size(0)):
            rel_i_probs = rel_mem_probs[rel_i]
            if rel_i_probs.sum() == 0:
                rel_mem_probs[rel_i] = pred_scores_stage_one[rel_i]
        """
        filter_dets
        boxes: bbox regression else [num_box, 4]
        obj_scores: [num_box] probabilities for the scores
        obj_classes: [num_box] class labels integer
        rel_inds: [num_rel, 2] TENSOR consisting of (im_ind0, im_ind1)
        pred_scores: [num_rel, num_predicates] including irrelevant class(#relclass + 1)
        """
        check('mem processing', tt())
        return filter_dets(boxes, obj_scores, box_classes,
                           filter_rel_inds[:, 1:].contiguous(), rel_mem_probs)
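The pad/pack step in the forward pass above groups the per-image relation features before the memory module consumes them. The sketch below illustrates the assumed mechanics only (pad to a common length, trim trailing images with zero relations, then pack); all shapes, lengths, and names are made up for illustration and are not the module's actual attributes.

import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence

feat_dim = 8
pack_lengths = np.array([5, 3, 0])                    # relations per image, sorted descending (made up)
padded = torch.randn(len(pack_lengths), 5, feat_dim)  # (num_images, max_rels_per_image, feat_dim)

trim_lengths = np.trim_zeros(pack_lengths)            # drop trailing images with no relations
trim_padded = padded[:trim_lengths.shape[0]]
packed = pack_padded_sequence(trim_padded, trim_lengths, batch_first=True)
print(packed.data.shape)                              # torch.Size([8, 8]) == (5 + 3, feat_dim)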
    def message_passing(self, box_feats, rel_feats, edges):
        """Integrate box feats to each other
        update box feats by decending out-degree order, that is, the node with largest out-degree update first

        suppose node i and j are neighbours, and has connection i->j
        feature of i and j are fi and fj, feature of union box ij are fij

        fi = sigma(W1*fi + sum_neighbour(V1 * alpha * fij))
        fj = sigma(W2*fj + sum_neighbour(V2 * alpha * fij))

        alpha = attention(fi, fij)

        V1, V2, W1, W2 are parameters to be learned, sigma is acitvation function, alpha is attention
        Args:
            box_feats: Variable, box features with shape of (NumOfBoxes, FEAT_DIM)
            rel_feats: Variable, edge features with shape of (NumOfRels, REL_FEAT_DIM)
            edges: Variable, scene graph edges(pruned), with shape of (NumOfRels, 3)
                e.g. edges[0, :] = [1, 0, 5] means box 0 and box 5 in image 1 had an affair~
        Returns:
            box_feats: Variable, box features combining relation features
        """
        # embed(header='mp ')
        im_inds = edges[:, 0].contiguous()
        num_img = im_inds[-1] + 1
        # list of dicts: per image, count each box's out-degree (how often it appears as a subject)
        count_dic = [{} for _ in range(num_img)]
        for im_i, s, e in enumerate_by_image(im_inds):
            im_i_edges = edges[s:e, :].contiguous()
            for rel in im_i_edges:
                box0, box1 = rel[1:]
                count_dic[im_i][box0] = 1 + count_dic[im_i].get(box0, 0)

        # list of Variable
        box_nodes_feats = list()
        for box_feat in box_feats:
            box_nodes_feats.append(box_feat)  #.clone())

        for im_i, s, e in enumerate_by_image(im_inds):
            im_i_edges = edges[s:e, :].contiguous()
            im_i_rel_feats = rel_feats[s:e, :].contiguous()
            for box_id, v in \
                    sorted(
                        count_dic[im_i].items(),
                        key=lambda kv: kv[1],
                        reverse=True
                    ):
                # update passing message
                # subject message from rel feats
                choose_sub_edges_ind = torch.nonzero(
                    im_i_edges[:, 1].contiguous() == box_id).squeeze()
                choose_sub_edges = im_i_edges[choose_sub_edges_ind]
                choose_sub_rel_feats = im_i_rel_feats[choose_sub_edges_ind]
                box_id_feats = box_nodes_feats[box_id]

                # attention on subject relations
                num_sub_neigh = choose_sub_edges.size(0)
                sub_cat_att_feats = torch.cat((box_id_feats.expand(
                    [num_sub_neigh, -1]), choose_sub_rel_feats), 1)
                sub_atten = self.mp_atten_fc(sub_cat_att_feats)
                sub_alpha = F.softmax(sub_atten, dim=0)
                sub_feats = (sub_alpha *
                             self.sub_rel_mp_fc(choose_sub_rel_feats)).sum(0)

                # object message from rel feats (may be empty)
                choose_obj_edges_ind = torch.nonzero(
                    im_i_edges[:, 2].contiguous() == box_id).squeeze()
                if choose_obj_edges_ind.size() == torch.Size([]):
                    box_id_feats = self.box_mp_fc(box_id_feats) + sub_feats
                    box_id_feats = F.relu(box_id_feats, inplace=True)
                    box_nodes_feats[box_id] = box_id_feats
                    continue
                choose_obj_edges = im_i_edges[choose_obj_edges_ind]
                choose_obj_rel_feats = im_i_rel_feats[choose_obj_edges_ind]
                box_id_feats = box_nodes_feats[box_id]

                # attention on object relations
                num_obj_neigh = choose_obj_edges.size(0)
                obj_cat_att_feats = torch.cat((box_id_feats.expand(
                    [num_obj_neigh, -1]), choose_obj_rel_feats), 1)
                obj_atten = self.mp_atten_fc(obj_cat_att_feats)
                obj_alpha = F.softmax(obj_atten, dim=0)
                obj_feats = (obj_alpha *
                             self.obj_rel_mp_fc(choose_obj_rel_feats)).sum(0)

                # add back to box feature
                box_id_feats = self.box_mp_fc(
                    box_id_feats) + obj_feats + sub_feats
                box_id_feats = F.relu(box_id_feats, inplace=True)

                box_nodes_feats[box_id] = box_id_feats

        mp_box_feats = torch.stack(box_nodes_feats)
        return mp_box_feats
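The attention-weighted aggregation implemented above (and summarised by the formulas in the docstring) can be distilled into a small standalone module. The sketch below is illustrative only: the NodeUpdateSketch class, its layer names, and the default dimensions are made up here and mirror, rather than reproduce, the mp_atten_fc / sub_rel_mp_fc / box_mp_fc layers used above.

import torch
import torch.nn as nn
import torch.nn.functional as F

class NodeUpdateSketch(nn.Module):
    """One attention-weighted node update: f_i = relu(W*f_i + sum_j(alpha_j * V*f_ij))."""
    def __init__(self, node_dim=512, rel_dim=512):
        super().__init__()
        self.atten_fc = nn.Linear(node_dim + rel_dim, 1)  # alpha = attention(f_i, f_ij)
        self.rel_fc = nn.Linear(rel_dim, node_dim)        # V
        self.node_fc = nn.Linear(node_dim, node_dim)      # W

    def forward(self, node_feat, neigh_rel_feats):
        # node_feat: (node_dim,); neigh_rel_feats: (num_neighbours, rel_dim)
        n = neigh_rel_feats.size(0)
        cat = torch.cat((node_feat.expand(n, -1), neigh_rel_feats), dim=1)
        alpha = F.softmax(self.atten_fc(cat), dim=0)          # weights over neighbours
        msg = (alpha * self.rel_fc(neigh_rel_feats)).sum(0)   # aggregated relation message
        return F.relu(self.node_fc(node_feat) + msg)

# usage (shapes made up): NodeUpdateSketch()(torch.randn(512), torch.randn(3, 512))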
Example #22
0
def rel_anchor_target(rois, gt_boxes, gt_classes, scores, gt_rels,
                      image_offset):
    """
    use all roi pairs and sample some pairs to train relation proposal module
    Note: ONLY for mode SGDET!!!!
    rois are from RPN,
    We take the CO_Overlap strategy from Graph-RCNN to sample fg and bg rels
    :param rois: N, 5
    :param scores: N, N
    :param gt_rels:
    :return:
    """
    im_inds = rois[:, 0].long()
    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:, 0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(gt_classes[:, 0]):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    gt_box_pairs = torch.cat(
        (gt_boxes[fg_rels[:, 1]], gt_boxes[fg_rels[:, 2]]), 1)  # Ngtp, 8

    # get all potential pairs
    is_cand = (im_inds[:, None] == im_inds[None])
    is_cand.view(-1)[diagonal_inds(is_cand)] = 0

    all_pair_inds = torch.nonzero(is_cand)
    all_box_pairs = torch.cat(
        (rois[:, 1:][all_pair_inds[:, 0]], rois[:, 1:][all_pair_inds[:, 1]]),
        1)

    num_pairs = np.zeros(num_im + 1).astype(np.int32)
    id_to_iminds = {}
    for i, s, e in enumerate_by_image(im_inds):
        num_pairs[i + 1] = (e - s) * (e - s - 1)
        id_to_iminds[i] = im_inds[s]
    cumsum_num_pairs = np.cumsum(num_pairs).astype(np.int32)

    all_rel_inds = []
    for i in range(1, num_im + 1):
        all_pair_inds_i = all_pair_inds[
            cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]]
        all_box_pairs_i = all_box_pairs[
            cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]]
        gt_box_pairs_i = gt_box_pairs[torch.nonzero(
            fg_rels[:, 0] == (i - 1)).view(-1)]
        labels = gt_rels.new(all_box_pairs_i.size(0)).fill_(-1)

        overlaps = co_bbox_overlaps(all_box_pairs_i,
                                    gt_box_pairs_i)  ## Np, Ngtp
        max_overlaps, argmax_overlaps = torch.max(overlaps, 1)  ## Np
        gt_max_overlaps, _ = torch.max(overlaps, 0)  ## Ngtp

        labels[max_overlaps < 0.15] = 0
        gt_max_overlaps[gt_max_overlaps == 0] = 1e-5

        # fg rel: for each gt pair, the max overlap anchor is fg
        keep = torch.sum(
            overlaps.eq(gt_max_overlaps.view(1, -1).expand_as(overlaps)),
            1)  # Np
        if torch.sum(keep) > 0:
            labels[keep > 0] = 1

        # fg rel: above thresh
        labels[max_overlaps >= 0.25] = 1

        num_fg = int(RELPN_BATCHSIZE * RELPN_FG_FRACTION)
        sum_fg = torch.sum((labels == 1).int())
        sum_bg = torch.sum((labels == 0).int())

        if sum_fg > num_fg:
            fg_inds = torch.nonzero(labels == 1).view(-1)
            rand_num = torch.from_numpy(np.random.permutation(
                fg_inds.size(0))).type_as(gt_boxes).long()
            disable_inds = fg_inds[rand_num[:fg_inds.size(0) - num_fg]]
            labels[disable_inds] = -1
        num_bg = RELPN_BATCHSIZE - torch.sum((labels == 1).int())

        if sum_bg > num_bg:
            bg_inds = torch.nonzero(labels == 0).view(-1)
            rand_num = torch.from_numpy(np.random.permutation(
                bg_inds.size(0))).type_as(gt_boxes).long()
            disable_inds = bg_inds[rand_num[:bg_inds.size(0) - num_bg]]
            labels[disable_inds] = -1

        keep_inds = torch.nonzero(labels >= 0).view(-1)
        labels = labels[keep_inds]
        all_pair_inds_i = all_pair_inds_i[keep_inds]

        im_inds_i = torch.LongTensor([id_to_iminds[i - 1]] *
                                     keep_inds.size(0)).view(-1, 1).cuda(
                                         all_pair_inds.get_device())
        all_pair_inds_i = torch.cat(
            (im_inds_i, all_pair_inds_i, labels.view(-1, 1)), 1)
        all_rel_inds.append(all_pair_inds_i)

    all_rel_inds = torch.cat(all_rel_inds, 0)
    # sort by rel
    _, perm = torch.sort(all_rel_inds[:, 0] * (rois.size(0)**2) +
                         all_rel_inds[:, 1] * rois.size(0) +
                         all_rel_inds[:, 2])
    all_rel_inds = all_rel_inds[perm].contiguous()
    return all_rel_inds
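A toy walk-through of the anchor-style labelling rule above may help: pairs whose best co-overlap with any GT pair is below 0.15 become background, the best-overlapping candidate for each GT pair becomes foreground, pairs at or above 0.25 become foreground, and everything else stays at -1 (ignored). The overlap values below are made up; the thresholds are the ones used in the code.

import torch

overlaps = torch.tensor([[0.05, 0.10],     # (Np, Ngtp) pair co-overlaps, values made up
                         [0.30, 0.02],
                         [0.12, 0.20],
                         [0.01, 0.26]])
labels = torch.full((overlaps.size(0),), -1, dtype=torch.long)  # -1 = ignore

max_overlaps, _ = overlaps.max(1)      # best GT pair for each candidate pair
gt_max_overlaps, _ = overlaps.max(0)   # best candidate for each GT pair

labels[max_overlaps < 0.15] = 0                                  # background
labels[(overlaps == gt_max_overlaps.view(1, -1)).any(1)] = 1     # best anchor per GT pair -> fg
labels[max_overlaps >= 0.25] = 1                                 # high co-overlap -> fg
print(labels)                                                    # tensor([ 0,  1, -1,  1])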
Example #23
0
    def faster_rcnn(self, x, gt_boxes, gt_classes, gt_rels):
        targets, x_lst, original_image_sizes = [], [], []
        device = self.rel_fc.weight.get_device(
        ) if self.rel_fc.weight.is_cuda else 'cpu'
        for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data):
            targets.append({
                'boxes': copy.deepcopy(gt_boxes[s:e]),
                'labels': gt_classes[s:e, 1].long()
            })
            x_lst.append(x[i].to(device).squeeze())
            original_image_sizes.append(x[i].shape[-2:])

        images, targets = self.detector.transform(x_lst, targets)
        fmap_multiscale = self.detector.backbone(images.tensors)
        if isinstance(fmap_multiscale, torch.Tensor):
            fmap_multiscale = OrderedDict([('0', fmap_multiscale)])

        if self.mode != 'sgdet':
            rois, obj_labels, rel_labels = self.gt_labels(
                gt_boxes, gt_classes, gt_rels)
            rm_box_priors, rm_box_priors_org = [], []
            for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data):
                rm_box_priors.append(targets[i]['boxes'])
                rm_box_priors_org.append(gt_boxes[s:e])

            im_inds = rois[:, 0]
            result = Result(
                od_box_targets=None,
                rm_box_targets=None,
                od_obj_labels=obj_labels,
                rm_box_priors=torch.cat(rm_box_priors),
                rm_obj_labels=obj_labels,
                rpn_scores=None,
                rpn_box_deltas=None,
                rel_labels=rel_labels,
                im_inds=im_inds.long(),
            )
            result.rm_box_priors_org = torch.cat(rm_box_priors_org)

        else:
            proposals, _ = self.detector.rpn(images, fmap_multiscale, targets)
            detections, _ = self.detector.roi_heads(fmap_multiscale, proposals,
                                                    images.image_sizes,
                                                    targets)
            boxes = copy.deepcopy(detections)
            boxes_all_dict = self.detector.transform.postprocess(
                detections, images.image_sizes, original_image_sizes)
            rm_box_priors, rm_box_priors_org, im_inds, obj_labels = [], [], [], []
            for i in range(len(proposals)):
                if len(boxes[i]['boxes']) <= 1:
                    raise ValueError(
                        'at least two objects must be detected to build relationships, make sure the detector is properly pretrained',
                        boxes)
                rm_box_priors.append(boxes[i]['boxes'])
                rm_box_priors_org.append(boxes_all_dict[i]['boxes'])
                obj_labels.append(boxes_all_dict[i]['labels'])
                im_inds.append(torch.zeros(len(detections[i]['boxes'])) + i)

            im_inds = torch.cat(im_inds).to(device)
            result = Result(rm_obj_labels=torch.cat(obj_labels).view(-1),
                            rm_box_priors=torch.cat(rm_box_priors),
                            rel_labels=None,
                            im_inds=im_inds.long())
            result.rm_box_priors_org = torch.cat(rm_box_priors_org)

            if len(result.rm_box_priors) <= 1:
                raise ValueError(
                    'at least two objects must be detected to build relationships'
                )

        result.im_sizes_org = original_image_sizes
        result.im_sizes = images.image_sizes
        result.fmap = fmap_multiscale[list(
            fmap_multiscale.keys())[-1]]  # last scale for global feature maps
        result.rois = torch.cat(
            (im_inds.float()[:, None], result.rm_box_priors), 1)

        return result
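The per-image loops above rely on enumerate_by_image, which is assumed to yield an (image_id, start, end) triple for every contiguous block of a sorted image-index column. A minimal sketch of that assumed behaviour:

import torch

def enumerate_by_image_sketch(im_inds):
    # im_inds: 1-D tensor such as gt_classes[:, 0], assumed sorted by image
    prev, start = int(im_inds[0]), 0
    for idx in range(1, len(im_inds)):
        cur = int(im_inds[idx])
        if cur != prev:
            yield prev, start, idx
            prev, start = cur, idx
    yield prev, start, len(im_inds)

# list(enumerate_by_image_sketch(torch.tensor([0, 0, 1, 1, 1, 2])))
# -> [(0, 0, 2), (1, 2, 5), (2, 5, 6)]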
Example #25
0
    def forward(self,
                x,
                im_sizes,
                image_offset,
                gt_boxes=None,
                gt_classes=None,
                gt_rels=None,
                *args):
        """
        Forward pass for detection
        :param x: Images@[batch_size, 3, IM_SIZE, IM_SIZE]
        :param im_sizes: A numpy array of (h, w, scale) for each image.
        :param image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0)
        :param gt_boxes:

        Training parameters:
        :param gt_boxes: [num_gt, 4] GT boxes over the batch.
        :param gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class)
        :param train_anchor_inds: a [num_train, 2] array of indices for the anchors that will
                                  be used to compute the training loss. Each (img_ind, fpn_idx)
        :return: If train:
            scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels
            
            if test:
            prob dists, boxes, img inds, maxscores, classes
            
        """

        with torch.no_grad():  # do not update anything in the detector

            targets, x_lst, original_image_sizes = [], [], []
            device = self.rel_fc.weight.get_device(
            ) if self.rel_fc.weight.is_cuda else 'cpu'
            gt_boxes = gt_boxes.to(device)
            gt_classes = gt_classes.to(device)
            gt_rels = gt_rels.to(device)
            for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data):
                targets.append({
                    'boxes': copy.deepcopy(gt_boxes[s:e]),
                    'labels': gt_classes[s:e, 1].long().to(device)
                })
                x_lst.append(x[i].to(device).squeeze())
                original_image_sizes.append(x[i].shape[-2:])

            images, targets = self.detector.transform(x_lst, targets)
            fmap_multiscale = self.detector.backbone(images.tensors)
            if self.mode != 'sgdet':
                rois, obj_labels, bbox_targets, rpn_scores, rpn_box_deltas, rel_labels = \
                    self.gt_boxes(None, im_sizes, image_offset, self.RELS_PER_IMG, gt_boxes,
                                   gt_classes, gt_rels, None, proposals=None,
                                   sample_factor=-1)
                rm_box_priors, rm_box_priors_org = [], []
                for i, s, e in enumerate_by_image(gt_classes[:,
                                                             0].long().data):
                    rm_box_priors.append(targets[i]['boxes'])
                    rm_box_priors_org.append(gt_boxes[s:e])

                result = Result(od_box_targets=bbox_targets,
                                rm_box_targets=bbox_targets,
                                od_obj_labels=obj_labels,
                                rm_box_priors=torch.cat(rm_box_priors),
                                rm_obj_labels=obj_labels,
                                rpn_scores=rpn_scores,
                                rpn_box_deltas=rpn_box_deltas,
                                rel_labels=rel_labels,
                                im_inds=rois[:, 0].long().contiguous() +
                                image_offset)
                result.rm_box_priors_org = torch.cat(rm_box_priors_org)

            else:

                if isinstance(fmap_multiscale, torch.Tensor):
                    fmap_multiscale = OrderedDict([(0, fmap_multiscale)])
                proposals, _ = self.detector.rpn(images, fmap_multiscale,
                                                 targets)
                detections, _ = self.detector.roi_heads(
                    fmap_multiscale, proposals, images.image_sizes, targets)
                boxes = copy.deepcopy(detections)
                boxes_all_dict = self.detector.transform.postprocess(
                    detections, images.image_sizes, original_image_sizes)
                rm_box_priors, rm_box_priors_org, im_inds, obj_labels = [], [], [], []
                for i in range(len(proposals)):
                    rm_box_priors.append(boxes[i]['boxes'])
                    rm_box_priors_org.append(boxes_all_dict[i]['boxes'])
                    obj_labels.append(boxes_all_dict[i]['labels'])
                    im_inds.append(
                        torch.zeros(len(detections[i]['boxes']),
                                    device=device).float() + i)
                im_inds = torch.cat(im_inds).view(-1, 1)

                result = Result(rm_obj_labels=torch.cat(obj_labels).view(-1),
                                rm_box_priors=torch.cat(rm_box_priors),
                                rel_labels=None,
                                im_inds=im_inds.view(-1).long().contiguous() +
                                image_offset)
                result.rm_box_priors_org = torch.cat(rm_box_priors_org)

                if len(result.rm_box_priors) <= 1:
                    raise ValueError(
                        'at least two objects must be detected to build relationships'
                    )

        if result.is_none():
            raise ValueError('the detector returned an empty result')

        if self.detector_model == 'baseline':
            if self.slim > 0:
                result.fmap = self.fmap_reduce(result.fmap.detach())
            else:
                result.fmap = result.fmap.detach()

        im_inds = result.im_inds - image_offset
        boxes = result.rm_box_priors

        if not hasattr(result, 'rel_labels'):
            result.rel_labels = None

        if self.training and result.rel_labels is None:
            assert self.mode == 'sgdet'
            result.rel_labels = rel_assignments(im_inds.data,
                                                boxes.data,
                                                result.rm_obj_labels.data,
                                                gt_boxes.data,
                                                gt_classes.data,
                                                gt_rels.data,
                                                image_offset,
                                                filter_non_overlap=True,
                                                num_sample_per_gt=1)

        rel_inds = self.get_rel_inds(
            result.rel_labels if self.training else None, im_inds, boxes)
        rois = torch.cat((im_inds[:, None].float(), boxes), 1)

        union_rois = torch.cat((
            rois[:, 0][rel_inds[:, 1]][:, None],
            torch.min(rois[:, 1:3][rel_inds[:, 1]], rois[:, 1:3][rel_inds[:,
                                                                          2]]),
            torch.max(rois[:, 3:5][rel_inds[:, 1]], rois[:, 3:5][rel_inds[:,
                                                                          2]]),
        ), 1)

        node_feat = self.multiscale_roi_pool(fmap_multiscale, rm_box_priors,
                                             images.image_sizes)
        edge_feat = self.multiscale_roi_pool(fmap_multiscale,
                                             convert_roi_to_list(union_rois),
                                             images.image_sizes)

        result.rm_obj_dists, result.rel_dists = self.predict(
            node_feat, edge_feat, rel_inds, rois, images.image_sizes)

        if self.use_bias:

            scores_nz = F.softmax(result.rm_obj_dists, dim=1).data
            scores_nz[:, 0] = 0.0
            _, score_ord = scores_nz[:, 1:].sort(dim=1, descending=True)
            result.obj_preds = score_ord[:, 0] + 1

            if self.mode == 'predcls':
                result.obj_preds = gt_classes.data[:, 1]

            freq_pred = self.freq_bias.index_with_labels(
                torch.stack((
                    result.obj_preds[rel_inds[:, 1]],
                    result.obj_preds[rel_inds[:, 2]],
                ), 1))
            # tune the weight for freq_bias
            if self.test_bias:
                result.rel_dists = freq_pred
            else:
                result.rel_dists = result.rel_dists + freq_pred

        if self.training:
            return result

        if self.mode == 'predcls':
            result.obj_scores = result.rm_obj_dists.data.new(
                gt_classes.size(0)).fill_(1)
            result.obj_preds = gt_classes.data[:, 1]
        elif self.mode in ['sgcls', 'sgdet']:
            scores_nz = F.softmax(result.rm_obj_dists, dim=1).data
            scores_nz[:, 0] = 0.0  # does not actually change anything here
            result.obj_scores, score_ord = scores_nz[:,
                                                     1:].sort(dim=1,
                                                              descending=True)
            result.obj_preds = score_ord[:, 0] + 1
            result.obj_scores = result.obj_scores[:, 0]
        else:
            raise NotImplementedError(self.mode)

        result.obj_preds = Variable(result.obj_preds)
        result.obj_scores = Variable(result.obj_scores)

        # Boxes will get fixed by filter_dets function.
        if self.detector_model == 'mrcnn':
            bboxes = result.rm_box_priors_org
        else:
            bboxes = result.rm_box_priors

        rel_rep = F.softmax(result.rel_dists, dim=1)

        return filter_dets(bboxes, result.obj_scores, result.obj_preds,
                           rel_inds[:, 1:], rel_rep)
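The union-ROI construction in the forward pass above keeps, for each (subject, object) pair, the subject's image index together with the element-wise minimum of the two top-left corners and the maximum of the two bottom-right corners. A minimal standalone sketch (the function name is ours, not the model's):

import torch

def union_rois_sketch(rois, rel_inds):
    # rois: (N, 5) = (im_ind, x1, y1, x2, y2); rel_inds: (R, 3) = (im_ind, sub, obj)
    sub, obj = rel_inds[:, 1], rel_inds[:, 2]
    return torch.cat((
        rois[sub, 0:1],                             # image index of the pair
        torch.min(rois[sub, 1:3], rois[obj, 1:3]),  # min of the top-left corners
        torch.max(rois[sub, 3:5], rois[obj, 3:5]),  # max of the bottom-right corners
    ), dim=1)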
def proposal_assignments_gtbox(rois,
                               gt_boxes,
                               gt_classes,
                               gt_rels,
                               image_offset,
                               fg_thresh=0.5):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    :param rpn_rois: [img_ind, x1, y1, x2, y2]
    :param gt_boxes:   [num_boxes, 4] array of x0, y0, x1, y1]. Not needed it seems
    :param gt_classes: [num_boxes, 2.0] array of [img_ind, class]
        Note, the img_inds here start at image_offset
    :param gt_rels     [num_boxes, 4] array of [img_ind, box_0, box_1, rel type].
        Note, the img_inds here start at image_offset
    :param Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
    :return:
        rois: [num_rois, 5]
        labels: [num_rois] array of labels
        bbox_targets [num_rois, 4] array of targets for the labels.
        rel_labels: [num_rels, 4] (img ind, box0 ind, box1ind, rel type)
    """
    im_inds = rois[:, 0].long()

    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:, 0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(im_inds):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    #----------------------------------------------------------------------------#
    fg_rel_list = []
    for i in range(num_im):
        fg_rel_list.append(sum(fg_rels[:, 0] == i).item())
    longest_len = max(fg_rel_list)
    bg_rel_length = [longest_len - i for i in fg_rel_list]
    #----------------------------------------------------------------------------#

    # Try ALL things, not just intersections.
    is_cand = (im_inds[:, None] == im_inds[None])
    is_cand.view(-1)[diagonal_inds(is_cand)] = 0

    # # Compute salience
    # gt_inds = fg_rels[:, 1:3].contiguous().view(-1)
    # labels_arange = labels.data.new(labels.size(0))
    # torch.arange(0, labels.size(0), out=labels_arange)
    # salience_labels = ((gt_inds[:, None] == labels_arange[None]).long().sum(0) > 0).long()
    # labels = torch.stack((labels, salience_labels), 1)

    # Add in some BG labels

    # NOW WE HAVE TO EXCLUDE THE FGs.
    # TODO: check if this causes an error if many duplicate GTs haven't been filtered out

    is_cand.view(-1)[fg_rels[:, 1] * im_inds.size(0) + fg_rels[:, 2]] = 0
    is_bgcand = is_cand.nonzero()
    # TODO: make this sample on a per image case
    # If too many then sample
    num_fg = min(fg_rels.size(0), int(RELS_PER_IMG * REL_FG_FRACTION * num_im))
    if num_fg < fg_rels.size(0):
        fg_rels = random_choose(fg_rels, num_fg)

    # If too many then sample
    num_bg = min(
        is_bgcand.size(0) if is_bgcand.dim() > 0 else 0, int(num_fg / 2))

    bg_rels = torch.cat((
        im_inds[is_bgcand[:, 0]][:, None],
        is_bgcand,
        (is_bgcand[:, 0, None] < -10).long(),
    ), 1)
    rel_labels = fg_rels
    for i, j in enumerate(bg_rel_length):
        if bg_rels[bg_rels[:, 0] == i, :].shape[0] >= j:
            bg_rel_per_image = random_choose(bg_rels[bg_rels[:, 0] == i, :], j)
        else:
            bg_rel_per_image = torch.cat(
                (bg_rels[bg_rels[:, 0] == i, :],
                 random_choose(bg_rels[bg_rels[:, 0] == i, :],
                               j - bg_rels[bg_rels[:, 0] == i, :].shape[0])),
                0)
        rel_labels = torch.cat((rel_labels, bg_rel_per_image), 0)

    # last sort by rel.
    _, perm = torch.sort(rel_labels[:, 0] * (gt_boxes.size(0)**2) +
                         rel_labels[:, 1] * gt_boxes.size(0) +
                         rel_labels[:, 2])

    rel_labels = rel_labels[perm].contiguous()

    labels = gt_classes[:, 1].contiguous()

    return rois, labels, rel_labels
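The final sort above orders the relation triples lexicographically by (img, box0, box1) by packing them into a single integer key, img * N^2 + box0 * N + box1, where N is the number of boxes. A toy illustration with made-up values:

import torch

rel_labels = torch.tensor([[1, 2, 0, 7],
                           [0, 3, 1, 2],
                           [0, 1, 2, 5]])  # (img, box0, box1, rel_type), values made up
N = 4                                      # stands in for gt_boxes.size(0)
key = rel_labels[:, 0] * N**2 + rel_labels[:, 1] * N + rel_labels[:, 2]
_, perm = torch.sort(key)
print(rel_labels[perm])  # rows ordered by (img, box0, box1): [[0,1,2,5], [0,3,1,2], [1,2,0,7]]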