def perturb(self, gt_obj, gt_rels, verbose=False):
    gt_obj_lst = [gt_obj[s:e] for _, s, e in enumerate_by_image(gt_obj[:, 0])]
    gt_rels_lst = [gt_rels[s:e] for _, s, e in enumerate_by_image(gt_rels[:, 0])]
    nodes = self.sample_nodes_(gt_obj_lst, gt_rels_lst)
    gt_obj_new = []
    for im, objs in enumerate(gt_obj_lst):  # for each image
        for obj_ind, obj_rels in zip(*nodes[im]):  # for each sampled node that will be perturbed
            if verbose:
                before = objs[obj_ind, 1]
                print('\nbefore: %s' % self.obj_classes[before])
                for (_, o1, o2, R) in obj_rels:
                    print(self.triplet2str('{}_{}_{}'.format(objs[o1, 1], R, objs[o2, 1])))
            objs[obj_ind, 1] = self.perturb_object_(objs, obj_rels, obj_ind, verbose=verbose)
            if verbose:
                print('\nafter: %s' % self.obj_classes[objs[obj_ind, 1]])
                for (_, o1, o2, R) in obj_rels:
                    print(self.triplet2str('{}_{}_{}'.format(objs[o1, 1], R, objs[o2, 1])))
        gt_obj_new.append(objs)
    gt_obj_new = torch.cat(gt_obj_new)
    return gt_obj_new
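# Nearly every routine in this file iterates over per-image spans of a flat batch tensor via
# `enumerate_by_image`. The helper itself is not shown here; the sketch below is a minimal
# assumed implementation inferred from its call sites (it yields (image_index, start, end) for
# each contiguous run of equal image indices), not necessarily the repository's actual code.
def enumerate_by_image(im_inds):
    """Yield (image index, start offset, end offset) for each contiguous block of equal image indices."""
    inds = np.asarray(im_inds.cpu()) if torch.is_tensor(im_inds) else np.asarray(im_inds)
    start = 0
    for i in range(1, len(inds) + 1):
        if i == len(inds) or inds[i] != inds[start]:
            yield int(inds[start]), start, i
            start = i

# Example: im_inds = [0, 0, 0, 1, 1] yields (0, 0, 3) and (1, 3, 5).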
def forward(self, union_pools, rois, union_inds, im_sizes):
    boxes = rois[:, 1:].clone()
    # scale boxes to the range [0,1]
    scale = boxes.new(boxes.shape).fill_(0)
    for i, s, e in enumerate_by_image(rois[:, 0].long().data):
        h, w = im_sizes[i][:2]
        scale[s:e, 0] = w
        scale[s:e, 1] = h
        scale[s:e, 2] = w
        scale[s:e, 3] = h
    boxes = boxes / scale
    try:
        rects = draw_union_boxes_my(boxes, union_inds, self.pooling_size * 4 - 1) - 0.5
    except Exception as e:
        # there was a problem with bboxes being larger than images at test time, had to clip them
        print(rois, boxes, im_sizes, scale)
        raise
    if self.concat:
        return torch.cat((union_pools, self.conv(rects)), 1)
    return union_pools + self.conv(rects)
def _sort_by_score(im_inds, scores):
    """
    We'll sort everything scorewise from Hi->low, BUT we need to keep images together
    and sort the LSTM input from longest image to shortest.
    :param im_inds: Which im we're on
    :param scores: Goodness ranging between [0, 1]. Higher numbers come FIRST
    :return: Permutation to put everything in the right order for the LSTM
             Inverse permutation
             Lengths for the TxB packed sequence.
    """
    num_im = im_inds[-1] + 1
    rois_per_image = scores.new(num_im)
    lengths = []
    for i, s, e in enumerate_by_image(im_inds):
        rois_per_image[i] = 2 * (s - e) * num_im + i
        lengths.append(e - s)
    lengths = sorted(lengths, reverse=True)
    inds, ls_transposed = transpose_packed_sequence_inds(lengths)  # move it to TxB form
    inds = torch.LongTensor(inds).cuda(im_inds.get_device())

    # ~~~~~~~~~~~~~~~~
    # HACKY CODE ALERT!!!
    # we're sorting by confidence, which is in the range (0,1), but more importantly by longest
    # image first...
    # ~~~~~~~~~~~~~~~~
    roi_order = scores - 2 * rois_per_image[im_inds]
    _, perm = torch.sort(roi_order, 0, descending=True)
    perm = perm[inds]
    _, inv_perm = torch.sort(perm)

    return perm, inv_perm, ls_transposed
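# A small, illustrative check of the sorting trick above (toy numbers, not part of the original
# code): with two images, where image 0 holds 3 ROIs and image 1 holds 5 (num_im = 2), the
# per-image offset 2 * (s - e) * num_im + i is -12 for image 0 and -19 for image 1. The key
# roi_order = score - 2 * offset therefore boosts ROIs of longer images the most, so a descending
# sort keeps images together, puts the longest image first, and orders ROIs within an image by
# confidence, which is the layout transpose_packed_sequence_inds expects.
scores = torch.tensor([0.9, 0.1, 0.5, 0.8, 0.2, 0.7, 0.3, 0.6])
im_inds = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1])
offsets = torch.tensor([-12.0, -19.0])  # 2 * (s - e) * num_im + i for each image
roi_order = scores - 2 * offsets[im_inds]
print(torch.sort(roi_order, descending=True)[1])  # image 1's five ROIs first, then image 0's three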
def _PadRelFeats(rel_im_inds, rel_feats_all, num_relation, seq_per_img, pred_classes, obj_classes,
                 rels, freq_matrix=None):
    """
    :param rel_im_inds: torch.LongTensor, [num_rels, ]
    :param rel_feats_all: Variable, [num_rels, 4096]
    :param num_relation:
    :param seq_per_img:
    :return: rel_feats, [batch_size*seq_per_img, num_relation, 4096]
    """
    rel_feats = []
    categories_info_all = []
    for i, s, e in enumerate_by_image(rel_im_inds):
        rel_feats_i = rel_feats_all[s:e, :]
        pred_classes_i = pred_classes[s:e][:, None]
        rels_i = rels[s:e, :]
        subj_categories = obj_classes[rels_i[:, 1]][:, None]
        obj_categories = obj_classes[rels_i[:, 2]][:, None]
        categories_info = torch.cat((subj_categories, obj_categories, pred_classes_i), 1)

        # compute frequency baseline: reranking based on triplet frequency
        if freq_matrix is not None:
            categories_info_np = categories_info.data.cpu().numpy()
            freqs = []
            for cat in categories_info_np:
                freqs.append(freq_matrix[cat[0], cat[1], cat[2]])
            sort_index = torch.from_numpy(
                np.argsort(np.array(freqs) * -1)).cuda(rel_feats_all.get_device())
            rel_feats_i = rel_feats_i[sort_index, :]
            rels_i = rels_i[sort_index, :]
            categories_info = categories_info[sort_index, :]

        this_num_rel = e - s
        if num_relation <= this_num_rel:
            rel_feats_i = rel_feats_i[:num_relation, :]
            categories_info = categories_info[:num_relation]
        else:
            # oversample
            sample_inds = torch.from_numpy(
                np.random.choice(np.arange(this_num_rel, dtype=np.int32), num_relation,
                                 replace=True)).long().cuda(rel_feats_all.get_device())
            rel_feats_i = rel_feats_i[sample_inds]
            categories_info = categories_info[sample_inds]

        rel_feats += [rel_feats_i[None, :, :]] * seq_per_img
        categories_info_all += [categories_info[None, :, :]] * seq_per_img

    rel_feats = torch.cat(rel_feats, 0)
    categories_info_all = torch.cat(categories_info_all, 0)
    return rel_feats, categories_info_all
def nms_boxes(self, obj_dists, rois, box_deltas, im_sizes): """ Performs NMS on the boxes :param obj_dists: [#rois, #classes], ex:[4000+, 151] :param rois: [#rois, 5], ex:[4000+, 5] :param box_deltas: [#rois, #classes, 4] :param im_sizes: sizes of images [6,3] :return nms_inds [#nms], ex: #nms=384 nms_scores [#nms] nms_labels [#nms] nms_boxes_assign [#nms, 4] nms_boxes [#nms, #classes, 4]. classid=0 is the box prior. """ # Now Converts "deltas" (predicted by the network) along with prior boxes into (x1, y1, x2, y2) representation. # box deltas is (num_rois, num_classes, 4) but rois is only #(num_rois, 4) # boxes = bbox_preds([#rois * 151, 4]) = [#rois, 151, 4] boxes = bbox_preds( rois[:, None, 1:].expand_as(box_deltas).contiguous().view(-1, 4), box_deltas.view(-1, 4)).view(*box_deltas.size()) inds = rois[:, 0].long().contiguous() dets = [] # Clip the boxes and get the best N dets per image. for i, s, e in enumerate_by_image(inds.data): h, w = im_sizes[i, :2] boxes[s:e, :, 0].data.clamp_(min=0, max=w - 1) boxes[s:e, :, 1].data.clamp_(min=0, max=h - 1) boxes[s:e, :, 2].data.clamp_(min=0, max=w - 1) boxes[s:e, :, 3].data.clamp_(min=0, max=h - 1) d_filtered = filter_det( F.softmax(obj_dists[s:e], 1), boxes[s:e], start_ind=s, nms_filter_duplicates=self.nms_filter_duplicates, max_per_img=self.max_per_img, thresh=self.thresh, ) if d_filtered is not None: dets.append(d_filtered) # dets is a list: len is 6 (images); each image has (inds, scores, labels), each len is 64 if len(dets) == 0: print("nothing was detected", flush=True) return None nms_inds, nms_scores, nms_labels = [ torch.cat(x, 0) for x in zip(*dets) ] # [384] twod_inds = nms_inds * boxes.size(1) + nms_labels.data nms_boxes_assign = boxes.view(-1, 4)[twod_inds] # nms_boxes: [384,151,4], the first dim of 151 is not "0" background class, it's rois # rois[:, 1:][nms_inds][:, None].shape: [384, 1, 4]; boxes[nms_inds][:, 1:]: [384,150,4] nms_boxes = torch.cat( (rois[:, 1:][nms_inds][:, None], boxes[nms_inds][:, 1:]), 1) return nms_inds, nms_scores, nms_labels, nms_boxes_assign, nms_boxes, inds[ nms_inds]
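# For reference, `bbox_preds` above is assumed to apply the standard Faster R-CNN box decoding
# (center/size deltas applied to the prior box); a self-contained sketch of that parameterization,
# not necessarily the repository's exact implementation:
def decode_boxes(priors, deltas):
    """priors, deltas: (N, 4) tensors; priors are (x1, y1, x2, y2), deltas are (dx, dy, dw, dh)."""
    widths = priors[:, 2] - priors[:, 0] + 1.0
    heights = priors[:, 3] - priors[:, 1] + 1.0
    ctr_x = priors[:, 0] + 0.5 * widths
    ctr_y = priors[:, 1] + 0.5 * heights
    pred_ctr_x = deltas[:, 0] * widths + ctr_x
    pred_ctr_y = deltas[:, 1] * heights + ctr_y
    pred_w = torch.exp(deltas[:, 2]) * widths
    pred_h = torch.exp(deltas[:, 3]) * heights
    return torch.stack((pred_ctr_x - 0.5 * pred_w,
                        pred_ctr_y - 0.5 * pred_h,
                        pred_ctr_x + 0.5 * pred_w,
                        pred_ctr_y + 0.5 * pred_h), 1)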
def nms_boxes(self, obj_dists, rois, box_deltas, im_sizes): """ Performs NMS on the boxes :param obj_dists: [#rois, #classes] :param rois: [#rois, 5] :param box_deltas: [#rois, #classes, 4] :param im_sizes: sizes of images :return nms_inds [#nms] nms_scores [#nms] nms_labels [#nms] nms_boxes_assign [#nms, 4] nms_boxes [#nms, #classes, 4]. classid=0 is the box prior. """ # Now produce the boxes # box deltas is (num_rois, num_classes, 4) but rois is only #(num_rois, 4) boxes = bbox_preds( rois[:, None, 1:].expand_as(box_deltas).contiguous().view(-1, 4), box_deltas.view(-1, 4)).view(*box_deltas.size()) # Clip the boxes and get the best N dets per image. inds = rois[:, 0].long().contiguous() dets = [] for i, s, e in enumerate_by_image(inds.data): h, w = im_sizes[i, :2] boxes[s:e, :, 0].data.clamp_(min=0, max=w - 1) boxes[s:e, :, 1].data.clamp_(min=0, max=h - 1) boxes[s:e, :, 2].data.clamp_(min=0, max=w - 1) boxes[s:e, :, 3].data.clamp_(min=0, max=h - 1) d_filtered = filter_det( F.softmax(obj_dists[s:e], 1), boxes[s:e], start_ind=s, nms_filter_duplicates=self.nms_filter_duplicates, max_per_img=self.max_per_img, thresh=self.thresh, ) if d_filtered is not None: dets.append(d_filtered) if len(dets) == 0: print("nothing was detected", flush=True) return None nms_inds, nms_scores, nms_labels = [ torch.cat(x, 0) for x in zip(*dets) ] twod_inds = nms_inds * boxes.size(1) + nms_labels.data nms_boxes_assign = boxes.view(-1, 4)[twod_inds] nms_boxes = torch.cat( (rois[:, 1:][nms_inds][:, None], boxes[nms_inds][:, 1:]), 1) return nms_inds, nms_scores, nms_labels, nms_boxes_assign, nms_boxes, inds[ nms_inds]
def get_scaled_boxes(self, boxes, im_inds, im_sizes):
    if self.backbone == 'vgg16_old':
        boxes_scaled = boxes / IM_SCALE
    else:
        boxes_scaled = boxes.clone()
        for im_ind, s, e in enumerate_by_image(im_inds.long().data):
            boxes_scaled[s:e, [0, 2]] = boxes_scaled[s:e, [0, 2]] / im_sizes[im_ind][1]  # width
            boxes_scaled[s:e, [1, 3]] = boxes_scaled[s:e, [1, 3]] / im_sizes[im_ind][0]  # height
    assert boxes_scaled.max() <= 1 + 1e-3, (boxes_scaled.max(), boxes.max(), im_sizes)
    return boxes_scaled
def dummy_nodes(gt_objs, gt_boxes, gt_rels):
    # Add dummy nodes to scene graphs to improve message propagation
    gt_objs_new, gt_boxes_new, gt_rels_new = [], [], []
    gt_rels_lst = [gt_rels[s:e] for im, s, e in enumerate_by_image(gt_rels[:, 0])]
    dummy_box = torch.Tensor([0, 0, 1, 1]).view(1, 4).to(gt_boxes)
    offset = 0
    for im, s, e in enumerate_by_image(gt_objs[:, 0]):
        gt_objs_im = gt_objs[s:e]
        n_obj = len(gt_objs_im)
        rels = torch.zeros((n_obj * 2, 4)).to(gt_rels)  # adding two-way edges from/to the dummy node
        rels[:, 0] = im
        for i in range(n_obj):
            # make edges two-way as in the visual genome data loader
            for j, in_out in zip([i, i + n_obj], [(1, 2), (2, 1)]):
                rels[j, in_out[0]] = n_obj
                rels[j, in_out[1]] = i
        rels = torch.cat((gt_rels_lst[im].clone(), rels), 0)
        rels[:, 1:3] += offset
        gt_rels_new.append(rels)
        gt_objs_new.append(
            torch.cat((gt_objs_im, torch.Tensor([im, 0]).view(1, 2).to(gt_objs_im)), 0))
        gt_boxes_new.append(torch.cat((gt_boxes[s:e], dummy_box), 0))
        offset += (n_obj + 1)  # +1 because 1 dummy node is added
    # assert len(torch.cat(gt_objs_new)) == offset, (torch.cat(gt_objs_new).shape, offset)
    return torch.cat(gt_objs_new), torch.cat(gt_boxes_new), torch.cat(gt_rels_new)
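# A brief, hedged usage sketch for dummy_nodes (toy tensors only, not from the original code):
# one image with two labelled objects and a single relation. The call appends one dummy object
# per image (class 0, full-image box) plus 2 * n_obj two-way edges linking every object to it.
gt_objs = torch.tensor([[0, 5], [0, 7]])                          # [im_ind, class]
gt_boxes = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.]])
gt_rels = torch.tensor([[0, 0, 1, 3]])                            # [im_ind, subj, obj, predicate]
objs, boxes, rels = dummy_nodes(gt_objs, gt_boxes, gt_rels)
# objs: 3 rows (dummy appended), boxes: 3 rows, rels: 1 + 2 * 2 = 5 rows.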
def pack_vectors(im_inds, vec_inps):
    num_im = int(im_inds[-1] + 1)
    max_num_roi = 0
    im2roi = []
    ls_rois = []
    for i, s, e in enumerate_by_image(im_inds):
        im2roi.append((s, e))
        max_num_roi = max(max_num_roi, e - s)
        ls_rois.append(e - s)
    packed_tensor = Variable(
        torch.FloatTensor(num_im, max_num_roi, vec_inps.shape[1]).fill_(0).cuda(vec_inps.get_device()))
    for i, seg in enumerate(im2roi):
        packed_tensor[i, :ls_rois[i]] = vec_inps[seg[0]:seg[1], :]
    return packed_tensor, np.array(ls_rois)
def forward(self, im_inds, obj_fmaps, obj_logits, rel_inds, vr, obj_labels=None, boxes_per_cls=None):
    """
    Reason about relationship classes using knowledge of object and relationship co-occurrence.
    """
    # rel_inds has shape (num_rel, 3)
    if self.mode == 'predcls':
        obj_logits = Variable(onehot_logits(obj_labels.data, self.num_obj_cls))
    obj_probs = F.softmax(obj_logits, 1)

    obj_fmaps = self.obj_proj(obj_fmaps)
    vr = self.rel_proj(vr)

    rel_logits = []
    obj_logits_refined = []
    for (_, obj_s, obj_e), (_, rel_s, rel_e) in zip(enumerate_by_image(im_inds.data),
                                                    enumerate_by_image(rel_inds[:, 0])):
        rl, ol = self.ggnn(rel_inds[rel_s:rel_e, 1:] - obj_s,
                           obj_probs[obj_s:obj_e],
                           obj_fmaps[obj_s:obj_e],
                           vr[rel_s:rel_e])
        rel_logits.append(rl)
        obj_logits_refined.append(ol)
    rel_logits = torch.cat(rel_logits, 0)

    if self.ggnn.refine_obj_cls:
        obj_logits_refined = torch.cat(obj_logits_refined, 0)
        obj_logits = obj_logits_refined

    obj_probs = F.softmax(obj_logits, 1)
    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline
        nms_mask = obj_probs.data.clone()
        nms_mask.zero_()
        for c_i in range(1, obj_probs.size(1)):
            scores_ci = obj_probs.data[:, c_i]
            boxes_ci = boxes_per_cls.data[:, c_i]
            keep = apply_nms(scores_ci, boxes_ci,
                             pre_nms_topn=scores_ci.size(0),
                             post_nms_topn=scores_ci.size(0),
                             nms_thresh=0.3)
            nms_mask[:, c_i][keep] = 1
        obj_preds = Variable(nms_mask * obj_probs.data, volatile=True)[:, 1:].max(1)[1] + 1
    else:
        obj_preds = obj_labels if obj_labels is not None else obj_probs[:, 1:].max(1)[1] + 1

    return obj_logits, obj_preds, rel_logits
def forward(self, im_inds, obj_fmaps, obj_labels):
    """
    Reason about object classes using knowledge of object co-occurrence.
    """
    if self.mode == 'predcls':
        # in the 'predcls' task there is no need to run GGNN_obj
        obj_dists = Variable(to_onehot(obj_labels.data, self.num_obj_cls))
        return obj_dists
    else:
        input_ggnn = self.obj_proj(obj_fmaps)

        lengths = []
        for i, s, e in enumerate_by_image(im_inds.data):
            lengths.append(e - s)
        obj_cum_add = np.cumsum([0] + lengths)
        obj_dists = torch.cat(
            [self.ggnn_obj(input_ggnn[obj_cum_add[i]:obj_cum_add[i + 1]]) for i in range(len(lengths))], 0)
        return obj_dists
def forward(self, union_pools, rois, union_inds, im_sizes): if self.edge_model == 'motifs': pair_rois = torch.cat( (rois[:, 1:][union_inds[:, 0]], rois[:, 1:][union_inds[:, 1]]), 1).data.cpu().numpy() rects = torch.from_numpy( self.draw_union_boxes(pair_rois, self.pooling_size * 4 - 1) - 0.5).to(union_pools) elif self.edge_model == 'raw_boxes': boxes = rois[:, 1:].clone() # scale boxes to the range [0,1] scale = boxes.new(boxes.shape).fill_(0).float() for i, s, e in enumerate_by_image(rois[:, 0].long().data): h, w = im_sizes[i][:2] scale[s:e, 0] = w scale[s:e, 1] = h scale[s:e, 2] = w scale[s:e, 3] = h boxes = boxes / scale try: rects = self.draw_union_boxes(boxes, union_inds, self.pooling_size * 4 - 1) - 0.5 except Exception as e: # there was a problem with bboxes being larger than images at test time, had to clip them print(rois, boxes, im_sizes, scale) raise # to debug: # print('rects my', rects.shape, rects.min(), rects.max()) # np.save('rects.npy', rects.data.cpu().numpy()) # pair_rois = torch.cat((rois[:, 1:][union_inds[:, 0]], rois[:, 1:][union_inds[:, 1]]), 1).data.cpu().numpy() # rects2 = torch.from_numpy(draw_union_boxes(pair_rois, self.pooling_size * 4 - 1) - 0.5).to(union_pools) # print('rects2', rects2.shape, rects2.min(), rects2.max()) # np.save('rects2.npy', rects2.data.cpu().numpy()) # print(union_inds) # raise ValueError('saved') else: raise NotImplementedError(self.edge_model) if self.concat: return torch.cat((union_pools, self.conv(rects)), 1) return union_pools + self.conv(rects)
def filter_dets_for_gcn_caption(im_inds, region_feats, obj_scores, obj_classes, rel_inds, pred_scores, rel_rank_scores=None, seq_labels=None, mask_labels=None, coco_ids=None): num_box = obj_classes.size() num_rel = rel_inds.size(0) assert rel_inds.size(1) == 3 assert pred_scores.size(0) == num_rel obj_scores0 = obj_scores.data[rel_inds[:, 1]] obj_scores1 = obj_scores.data[rel_inds[:, 2]] pred_scores_max, pred_classes_argmax = pred_scores.data[:, 1:].max(1) pred_classes_argmax = pred_classes_argmax + 1 rel_scores_argmaxed = pred_scores_max * obj_scores0 * obj_scores1 if rel_rank_scores is not None: rel_scores_argmaxed *= rel_rank_scores.data # split the relations according to image rel_im_inds = rel_inds[:, 0] rels = [] pred_classes = [] for i, s, e in enumerate_by_image(rel_im_inds): rels_i = rel_inds[s:e, :] pred_classes_argmax_i = pred_classes_argmax[s:e] rel_scores_argmaxed_i = rel_scores_argmaxed[s:e] rel_scores_vs_i, rel_scores_idx_i = torch.sort(rel_scores_argmaxed_i.view(-1), dim=0, descending=True) rels_i = rels_i[rel_scores_idx_i] pred_classes_argmax_i = pred_classes_argmax_i[rel_scores_idx_i] rels.append(rels_i) pred_classes.append(pred_classes_argmax_i) rels = torch.cat(rels, 0) pred_classes = torch.cat(pred_classes, 0) return im_inds, region_feats, pred_classes, rels, obj_classes.data, seq_labels, mask_labels, coco_ids
def filter_dets_for_caption(boxes, obj_scores, obj_classes, rel_inds, pred_scores, rel_feats, image_fmap, rel_rank_scores=None, seq_labels=None, mask_labels=None, coco_ids=None): """ Filters detections.... :param boxes: [num_box, 4] :param obj_scores: [num_box] probabilities for the scores :param obj_classes: [num_box] class labels for the topk :param rel_inds: [num_rel, 3] TENSOR consisting of (rel_im_inds, box_ind0, box_ind1) :param pred_scores: [num_rel, num_predicates] :param use_nms: True if use NMS to filter dets. :return: boxes: FloatTensor obj_classes: FloatTensor rels: LongTensor, [num_rel, 3] pred_classes: LongTensor, [num_rel,] rel_feats_all: FloatTensor, [num_rel, 4096] seq_labels: [num_img*5, 19], [im_inds, <start>, seq labels, <end>, 0, 0, ...] mask_labels: [num_img*5, 19], [im_inds, 1, 1, ..., {1 for <end>}, 0, 0, ...] """ if boxes.dim() != 2: raise ValueError("Boxes needs to be [num_box, 4] but its {}".format(boxes.size())) num_box = boxes.size(0) assert obj_scores.size(0) == num_box assert obj_classes.size() == obj_scores.size() num_rel = rel_inds.size(0) assert rel_inds.size(1) == 3 assert pred_scores.size(0) == num_rel obj_scores0 = obj_scores.data[rel_inds[:, 1]] obj_scores1 = obj_scores.data[rel_inds[:, 2]] pred_scores_max, pred_classes_argmax = pred_scores.data[:, 1:].max(1) pred_classes_argmax = pred_classes_argmax + 1 rel_scores_argmaxed = pred_scores_max * obj_scores0 * obj_scores1 if rel_rank_scores is not None: rel_scores_argmaxed *= rel_rank_scores.data # split the relations according to image rel_im_inds = rel_inds[:, 0] rels = [] rel_feats_all = [] pred_classes = [] for i, s, e in enumerate_by_image(rel_im_inds): rels_i = rel_inds[s:e, :] pred_classes_argmax_i = pred_classes_argmax[s:e] rel_feats_i = rel_feats[s:e, :] rel_scores_argmaxed_i = rel_scores_argmaxed[s:e] rel_scores_vs_i, rel_scores_idx_i = torch.sort(rel_scores_argmaxed_i.view(-1), dim=0, descending=True) rels_i = rels_i[rel_scores_idx_i] pred_classes_argmax_i = pred_classes_argmax_i[rel_scores_idx_i] rel_feats_i = rel_feats_i[rel_scores_idx_i] rels.append(rels_i) rel_feats_all.append(rel_feats_i) pred_classes.append(pred_classes_argmax_i) rels = torch.cat(rels, 0) rel_feats_all = torch.cat(rel_feats_all, 0) pred_classes = torch.cat(pred_classes, 0) return boxes, obj_classes, rels, Variable( pred_classes), rel_feats_all, image_fmap, seq_labels, mask_labels, coco_ids
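# The ranking rule above (used both here and in filter_dets_for_gcn_caption) scores each candidate
# triplet as predicate_score * subject_score * object_score and sorts per image. A toy illustration
# (illustrative numbers only, not from the original code):
obj_scores = torch.tensor([0.9, 0.8, 0.4])
rel_inds = torch.tensor([[0, 0, 1], [0, 1, 2]])          # (im_ind, subj, obj)
pred_scores_max = torch.tensor([0.5, 0.9])
triplet_scores = pred_scores_max * obj_scores[rel_inds[:, 1]] * obj_scores[rel_inds[:, 2]]
# tensor([0.3600, 0.2880]) -> the first triplet outranks the second despite its weaker predicate.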
def proposal_assignments_gtbox(rois, gt_boxes, gt_classes, gt_rels, image_offset, RELS_PER_IMG, sample_factor=-1): """ Assign object detection proposals to ground-truth targets. Produces proposal classification labels and bounding-box regression targets. :param rpn_rois: [img_ind, x1, y1, x2, y2] :param gt_boxes: [num_boxes, 4] array of x0, y0, x1, y1]. Not needed it seems :param gt_classes: [num_boxes, 2] array of [img_ind, class] Note, the img_inds here start at image_offset :param gt_rels [num_boxes, 4] array of [img_ind, box_0, box_1, rel type]. Note, the img_inds here start at image_offset :param Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) :return: rois: [num_rois, 5] labels: [num_rois] array of labels bbox_targets [num_rois, 4] array of targets for the labels. rel_labels: [num_rels, 4] (img ind, box0 ind, box1ind, rel type) """ im_inds = rois[:, 0].long() num_im = im_inds[-1] + 1 # Offset the image indices in fg_rels to refer to absolute indices (not just within img i) fg_rels = gt_rels.clone() fg_rels[:, 0] -= image_offset offset = {} for i, s, e in enumerate_by_image(im_inds): offset[i] = s for i, s, e in enumerate_by_image(fg_rels[:, 0]): fg_rels[s:e, 1:3] += offset[i] # Try ALL things, not just intersections. is_cand = (im_inds[:, None] == im_inds[None]) is_cand.view(-1)[diagonal_inds(is_cand)] = 0 # NOW WE HAVE TO EXCLUDE THE FGs. is_cand.view(-1)[fg_rels[:, 1] * im_inds.size(0) + fg_rels[:, 2]] = 0 is_bgcand = torch.nonzero(is_cand) # TODO: make this sample on a per image case # If too many then sample num_fg = min(fg_rels.size(0), int(RELS_PER_IMG * REL_FG_FRACTION * num_im)) if num_fg < fg_rels.size(0): fg_rels = random_choose(fg_rels, num_fg) # If too many then sample is_train = num_im > 1 # assume num_im = 1 at test time (except for the det mode, which we don't use for now) sample_bg = is_train and sample_factor > -1 num_bg = min( is_bgcand.size(0) if is_bgcand.dim() > 0 else 0, int(num_fg * sample_factor) if sample_bg else (int(RELS_PER_IMG * num_im) - num_fg)) # sample num_fg at training time if num_bg > 0: bg_rels = torch.cat(( im_inds[is_bgcand[:, 0]][:, None], is_bgcand, (is_bgcand[:, 0, None] < -10).long(), ), 1) if num_bg < is_bgcand.size(0): bg_rels = random_choose( bg_rels, num_bg ) # at test time will correspond to the baseline approach rel_labels = torch.cat((fg_rels, bg_rels), 0) else: rel_labels = fg_rels # last sort by rel. _, perm = torch.sort(rel_labels[:, 0] * (gt_boxes.size(0)**2) + rel_labels[:, 1] * gt_boxes.size(0) + rel_labels[:, 2]) rel_labels = rel_labels[perm].contiguous() labels = gt_classes[:, 1].contiguous() return rois, labels, rel_labels
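# `random_choose(x, num)` is used by the samplers in this file to keep a random subset of rows.
# A minimal sketch of the assumed behaviour (uniform sampling of `num` rows without replacement);
# the repository's actual helper may differ, e.g. in seeding or in whether it preserves order:
def random_choose(tensor, num):
    idx = torch.from_numpy(np.random.choice(tensor.size(0), num, replace=False)).long()
    if tensor.is_cuda:
        idx = idx.cuda(tensor.get_device())
    return tensor[idx]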
def rel_proposal_target(rois, rel_proposal_inds, gt_boxes, gt_classes, gt_rels, image_offset, mode):
    """
    Assign the target for each proposal pair. When the mode is predcls or sgcls, the target is
    obtained directly by comparison with gt_rels. When the mode is sgdet, the target is sampled
    by first computing IoU with the gt pairs.
    :param rois:
    :param rel_proposal_inds: [im_ind, ind1, ind2]
    :param gt_boxes:
    :param image_offset:
    :param mode:
    :return:
    """
    im_inds = rois[:, 0].long()
    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:, 0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(gt_classes[:, 0]):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    rels_to_gt = []
    num_gt_rels_seen = 0
    if mode in ('predcls', 'sgcls'):
        rel_proposal_inds_np = rel_proposal_inds.cpu().numpy()
        fg_rels_np = fg_rels.cpu().numpy()  # Ngtp, 4
        # locate the proposal
        locate_inds = np.where(intersect_2d(rel_proposal_inds_np, fg_rels_np[:, :-1]))
        proposal_to_gt = defaultdict(list)
        for ind in zip(*locate_inds):
            proposal_to_gt[ind[0]].append(ind[1])
        for k, v in proposal_to_gt.items():
            v0 = v[0] if len(v) == 1 else np.random.choice(v)
            proposal_to_gt[k] = v0

        fg_proposal_inds = np.array(list(proposal_to_gt.keys())).astype(np.int32)
        bg_proposal_inds = np.array(
            list(set(list(range(rel_proposal_inds_np.shape[0]))) -
                 set(list(proposal_to_gt.keys())))).astype(np.int32)

        rels_to_gt = np.ones(fg_proposal_inds.shape[0] + bg_proposal_inds.shape[0], dtype=np.int64) * -1
        if len(fg_proposal_inds) > 0:
            rels_to_gt[fg_proposal_inds] = np.array([proposal_to_gt[ind] for ind in fg_proposal_inds])

        num_fg = min(fg_proposal_inds.size, int(RELS_BATCHSIZE * REL_FG_FRACTION * num_im))
        if num_fg < fg_proposal_inds.size:
            fg_proposal_inds = np.random.choice(fg_proposal_inds, num_fg, replace=False)
        num_bg = min(bg_proposal_inds.size if bg_proposal_inds.size else 0,
                     int(RELS_BATCHSIZE * num_im) - num_fg)
        if num_bg < bg_proposal_inds.size:
            bg_proposal_inds = np.random.choice(bg_proposal_inds, num_bg, replace=False)

        if len(fg_proposal_inds) == 0:
            bg_labels = np.zeros(bg_proposal_inds.size)
            bg_rel_labels = np.hstack((rel_proposal_inds_np[bg_proposal_inds], bg_labels[:, None]))
            proposal_labels = bg_rel_labels
        else:
            fg_labels = np.array([fg_rels[proposal_to_gt[ind], -1] for ind in fg_proposal_inds])
            fg_rel_labels = np.hstack((rel_proposal_inds_np[fg_proposal_inds], fg_labels[:, None]))

            bg_labels = np.zeros(bg_proposal_inds.size)
            bg_rel_labels = np.hstack((rel_proposal_inds_np[bg_proposal_inds], bg_labels[:, None]))
            proposal_labels = np.vstack((fg_rel_labels, bg_rel_labels))

        rels_to_gt = np.hstack((rels_to_gt[fg_proposal_inds], rels_to_gt[bg_proposal_inds]))
        proposal_labels = torch.LongTensor(proposal_labels).cuda(gt_rels.get_device())
        rels_to_gt = torch.LongTensor(rels_to_gt).cuda(gt_rels.get_device())
    else:
        assert mode == 'sgdet'
        gt_box_pairs = torch.cat((gt_boxes[fg_rels[:, 1]], gt_boxes[fg_rels[:, 2]]), 1)
        rel_proposal_pairs = torch.cat(
            (rois[:, 1:][rel_proposal_inds[:, 0]], rois[:, 1:][rel_proposal_inds[:, 1]]), 1)

        num_pairs = np.zeros(num_im + 1).astype(np.int32)
        for i, s, e in enumerate_by_image(rel_proposal_inds[:, 0]):
            num_pairs[i + 1] = e - s
        cumsum_num_pairs = np.cumsum(num_pairs).astype(np.int32)
        fg_rel_per_image = int(RELS_BATCHSIZE * REL_FG_FRACTION)

        proposal_labels = []
        gt_rel_labels = fg_rels[:, -1].contiguous().view(-1)
        for i in range(1, num_im + 1):
            rel_proposal_inds_i = rel_proposal_inds[cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]]
            rel_proposal_pairs_i = rel_proposal_pairs[cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]]
            gt_box_pairs_i = gt_box_pairs[torch.nonzero(fg_rels[:, 0] == (i - 1)).view(-1)]
            gt_box_pairs_label_i = gt_rel_labels[
                torch.nonzero(fg_rels[:, 0] == (i - 1)).view(-1)].view(-1).contiguous()

            overlaps = co_bbox_overlaps(rel_proposal_pairs_i, gt_box_pairs_i)  # Np, Ngtp
            max_overlaps, gt_assignment = torch.max(overlaps, 1)  # Np
            fg_inds = torch.nonzero(max_overlaps >= 0.5).view(-1)
            fg_num = fg_inds.numel()
            bg_inds = torch.nonzero((max_overlaps < 0.5) & (max_overlaps >= 0.0)).view(-1)
            bg_num = bg_inds.numel()

            rels_to_gt_i = torch.LongTensor(rel_proposal_pairs_i.shape[0]).fill_(-1).cuda(gt_rels.get_device())
            rels_to_gt_i[fg_inds] = gt_assignment[fg_inds] + num_gt_rels_seen

            if fg_num > 0 and bg_num > 0:
                fg_this_image = min(fg_rel_per_image, fg_num)
                rand_num = torch.from_numpy(np.random.permutation(fg_num)).long().cuda()
                fg_inds = fg_inds[rand_num[:fg_this_image]]

                # sampling bg
                bg_this_image = RELS_BATCHSIZE - fg_this_image
                rand_num = np.floor(np.random.rand(bg_this_image) * bg_num)
                rand_num = torch.from_numpy(rand_num).long().cuda()
                bg_inds = bg_inds[rand_num]
                rels_to_gt_i = torch.cat((rels_to_gt_i[fg_inds], rels_to_gt_i[bg_inds]), 0)
            elif fg_num > 0 and bg_num == 0:
                rand_num = np.floor(np.random.rand(RELS_BATCHSIZE) * fg_num)
                rand_num = torch.from_numpy(rand_num).long().cuda()
                fg_inds = fg_inds[rand_num]
                fg_this_image = RELS_BATCHSIZE
                bg_this_image = 0
                rels_to_gt_i = rels_to_gt_i[fg_inds]
            elif bg_num > 0 and fg_num == 0:
                # sampling bg
                # rand_num = torch.floor(torch.rand(rois_per_image) * bg_num_rois).long().cuda()
                rand_num = np.floor(np.random.rand(RELS_BATCHSIZE) * bg_num)
                rand_num = torch.from_numpy(rand_num).long().cuda()
                bg_inds = bg_inds[rand_num]
                bg_this_image = RELS_BATCHSIZE
                fg_this_image = 0
                rels_to_gt_i = rels_to_gt_i[bg_inds]
            else:
                import pdb
                pdb.set_trace()

            keep_inds = torch.cat([fg_inds, bg_inds], 0)
            rel_proposal_inds_i = rel_proposal_inds_i[keep_inds]
            labels_i = gt_box_pairs_label_i[gt_assignment[keep_inds]]
            if fg_this_image < labels_i.size(0):
                labels_i[fg_this_image:] = 0
            rels_to_gt.append(rels_to_gt_i)
            num_gt_rels_seen += gt_box_pairs_i.shape[0]
            # try:
            #     labels_i[fg_this_image:] = 0
            # except ValueError:
            #     print(labels_i)
            #     print(fg_this_image)
            #     import pdb
            #     pdb.set_trace()
            proposal_labels.append(torch.cat((rel_proposal_inds_i, labels_i[:, None]), 1))

        proposal_labels = torch.cat(proposal_labels, 0)
        rels_to_gt = torch.cat(rels_to_gt, 0)

    # sort
    _, perm = torch.sort(proposal_labels[:, 0] * (rois.size(0) ** 2) +
                         proposal_labels[:, 1] * rois.size(0) +
                         proposal_labels[:, 2])
    proposal_labels = proposal_labels[perm].contiguous()
    rels_to_gt = rels_to_gt[perm].contiguous()

    return proposal_labels, rels_to_gt
def convert_roi_to_list(rois):
    rois_lst = []
    for im_ind, s, e in enumerate_by_image(rois[:, 0].long().data):
        rois_lst.append(rois[s:e, 1:])
    return rois_lst
def forward(self, x, im_sizes, image_offset, gt_boxes=None, gt_classes=None, gt_rels=None, proposals=None, train_anchor_inds=None, return_fmap=False): """ Forward pass for Relation detection Args: x: Images@[batch_size, 3, IM_SIZE, IM_SIZE] im_sizes: A numpy array of (h, w, scale) for each image. image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0) parameters for training: gt_boxes: [num_gt, 4] GT boxes over the batch. gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class) gt_rels: proposals: train_anchor_inds: a [num_train, 2] array of indices for the anchors that will be used to compute the training loss. Each (img_ind, fpn_idx) return_fmap: Returns: If train: scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels If test: prob dists, boxes, img inds, maxscores, classes """ result = self.detector(x, im_sizes, image_offset, gt_boxes, gt_classes, gt_rels, proposals, train_anchor_inds, return_fmap=True) assert not result.is_none(), 'Empty detection result' # image_offset refer to Blob # self.batch_size_per_gpu * index im_inds = result.im_inds - image_offset boxes = result.rm_box_priors obj_scores, box_classes = F.softmax( result.rm_obj_dists[:, 1:].contiguous(), dim=1).max(1) box_classes += 1 num_img = im_inds[-1] + 1 # embed(header='rel_model.py before rel_assignments') if self.training and result.rel_labels is None: assert self.mode == 'sgdet' # only in sgdet mode # shapes: # im_inds: (box_num,) # boxes: (box_num, 4) # rm_obj_labels: (box_num,) # gt_boxes: (box_num, 4) # gt_classes: (box_num, 2) maybe[im_ind, class_ind] # gt_rels: (rel_num, 4) # image_offset: integer result.rel_labels = rel_assignments(im_inds.data, boxes.data, result.rm_obj_labels.data, gt_boxes.data, gt_classes.data, gt_rels.data, image_offset, filter_non_overlap=True, num_sample_per_gt=1) rel_inds = self.get_rel_inds(result.rel_labels, im_inds, boxes) rois = torch.cat((im_inds[:, None].float(), boxes), 1) # union boxes feats (NumOfRels, obj_dim) union_box_feats = self.visual_rep(result.fmap.detach(), rois, rel_inds[:, 1:].contiguous()) # single box feats (NumOfBoxes, feats) box_feats = self.obj_feature_map(result.fmap.detach(), rois) # box spatial feats (NumOfBox, 4) bboxes = Variable(center_size(boxes.data)) sub_bboxes = bboxes[rel_inds[:, 1].contiguous()] obj_bboxes = bboxes[rel_inds[:, 2].contiguous()] obj_bboxes[:, :2] = obj_bboxes[:, :2].contiguous( ) - sub_bboxes[:, :2].contiguous() # x-y obj_bboxes[:, 2:] = obj_bboxes[:, 2:].contiguous( ) / sub_bboxes[:, 2:].contiguous() # w/h obj_bboxes[:, :2] /= sub_bboxes[:, 2:].contiguous() # x-y/h obj_bboxes[:, 2:] = torch.log(obj_bboxes[:, 2:].contiguous()) # log(w/h) bbox_spatial_feats = self.spatial_fc(obj_bboxes) box_word = self.classes_word_embedding(box_classes) box_pair_word = torch.cat((box_word[rel_inds[:, 1].contiguous()], box_word[rel_inds[:, 2].contiguous()]), 1) box_word_feats = self.word_fc(box_pair_word) # (NumOfRels, DIM=) box_pair_feats = torch.cat( (union_box_feats, bbox_spatial_feats, box_word_feats), 1) box_pair_score = self.relpn_fc(box_pair_feats) #embed(header='filter_rel_labels') if self.training: pn_rel_label = list() pn_pair_score = list() #print(result.rel_labels.shape) #print(result.rel_labels[:, 0].contiguous().squeeze()) for i, s, e in enumerate_by_image( result.rel_labels[:, 0].data.contiguous()): im_i_rel_label = result.rel_labels[s:e].contiguous() im_i_box_pair_score = box_pair_score[s:e].contiguous() im_i_rel_fg_inds = torch.nonzero( im_i_rel_label[:, 
-1].contiguous()).squeeze() im_i_rel_fg_inds = im_i_rel_fg_inds.data.cpu().numpy() im_i_fg_sample_num = min(RELEVANT_PER_IM, im_i_rel_fg_inds.shape[0]) if im_i_rel_fg_inds.size > 0: im_i_rel_fg_inds = np.random.choice( im_i_rel_fg_inds, size=im_i_fg_sample_num, replace=False) im_i_rel_bg_inds = torch.nonzero( im_i_rel_label[:, -1].contiguous() == 0).squeeze() im_i_rel_bg_inds = im_i_rel_bg_inds.data.cpu().numpy() im_i_bg_sample_num = min(EDGES_PER_IM - im_i_fg_sample_num, im_i_rel_bg_inds.shape[0]) if im_i_rel_bg_inds.size > 0: im_i_rel_bg_inds = np.random.choice( im_i_rel_bg_inds, size=im_i_bg_sample_num, replace=False) #print('{}/{} fg/bg in image {}'.format(im_i_fg_sample_num, im_i_bg_sample_num, i)) result.rel_sample_pos = torch.Tensor( [im_i_fg_sample_num]).cuda(im_i_rel_label.get_device()) result.rel_sample_neg = torch.Tensor( [im_i_bg_sample_num]).cuda(im_i_rel_label.get_device()) im_i_keep_inds = np.append(im_i_rel_fg_inds, im_i_rel_bg_inds) im_i_pair_score = im_i_box_pair_score[ im_i_keep_inds.tolist()].contiguous() im_i_rel_pn_labels = Variable( torch.zeros(im_i_fg_sample_num + im_i_bg_sample_num).type( torch.LongTensor).cuda(x.get_device())) im_i_rel_pn_labels[:im_i_fg_sample_num] = 1 pn_rel_label.append(im_i_rel_pn_labels) pn_pair_score.append(im_i_pair_score) result.rel_pn_dists = torch.cat(pn_pair_score, 0) result.rel_pn_labels = torch.cat(pn_rel_label, 0) box_pair_relevant = F.softmax(box_pair_score, dim=1) box_pos_pair_ind = torch.nonzero(box_pair_relevant[:, 1].contiguous( ) > box_pair_relevant[:, 0].contiguous()).squeeze() if box_pos_pair_ind.data.shape == torch.Size([]): return None #print('{}/{} trim edges'.format(box_pos_pair_ind.size(0), rel_inds.size(0))) result.rel_trim_pos = torch.Tensor([box_pos_pair_ind.size(0)]).cuda( box_pos_pair_ind.get_device()) result.rel_trim_total = torch.Tensor([rel_inds.size(0) ]).cuda(rel_inds.get_device()) # filtering relations filter_rel_inds = rel_inds[box_pos_pair_ind.data] filter_box_pair_feats = box_pair_feats[box_pos_pair_ind.data] if self.training: filter_rel_labels = result.rel_labels[box_pos_pair_ind.data] result.rel_labels = filter_rel_labels # message passing between boxes and relations #embed(header='mp') for _ in range(self.mp_iter_num): box_feats = self.message_passing(box_feats, filter_box_pair_feats, filter_rel_inds) box_cls_scores = self.cls_fc(box_feats) result.rm_obj_dists = box_cls_scores obj_scores, box_classes = F.softmax(box_cls_scores[:, 1:].contiguous(), dim=1).max(1) box_classes += 1 # skip background # TODO: add memory module # filter_box_pair_feats is to be added to memory # fbiilter_box_pair_feats = self.memory_() # filter_box_pair_feats is to be added to memory # RelationCNN filter_box_pair_feats_fc1 = self.relcnn_fc1(filter_box_pair_feats) filter_box_pair_score = self.relcnn_fc2(filter_box_pair_feats_fc1) if not self.graph_cons: filter_box_pair_score = filter_box_pair_score.view( -1, 2, self.num_rels) result.rel_dists = filter_box_pair_score if self.training: return result pred_scores = F.softmax(result.rel_dists, dim=1) """ filter_dets boxes: bbox regression else [num_box, 4] obj_scores: [num_box] probabilities for the scores obj_classes: [num_box] class labels integer rel_inds: [num_rel, 2] TENSOR consisting of (im_ind0, im_ind1) pred_scores: [num_rel, num_predicates] including irrelevant class(#relclass + 1) """ return filter_dets(boxes, obj_scores, box_classes, filter_rel_inds[:, 1:].contiguous(), pred_scores)
def rel_assignments_sgcls(rois, gt_boxes, gt_classes, gt_rels, image_offset): """ sample_rels to balance proportion of positive and negative samples :param rois: [img_ind, x1, y1, x2, y2] :param gt_boxes: [num_boxes, 4] array of x0, y0, x1, y1]. Not needed it seems :param gt_classes: [num_boxes, 2] array of [img_ind, class] Note, the img_inds here start at image_offset :param gt_rels [num_boxes, 4] array of [img_ind, box_0, box_1, rel type]. Note, the img_inds here start at image_offset :param Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) :return: rois: [num_rois, 5] labels: [num_rois] array of labels rel_labels: [num_rels, 4] (img ind, box0 ind, box1ind, rel type) """ im_inds = rois[:,0].long() num_im = im_inds[-1] + 1 # Offset the image indices in fg_rels to refer to absolute indices (not just within img i) fg_rels = gt_rels.clone() fg_rels[:,0] -= image_offset offset = {} for i, s, e in enumerate_by_image(im_inds): offset[i] = s for i, s, e in enumerate_by_image(fg_rels[:, 0]): fg_rels[s:e, 1:3] += offset[i] # Try ALL things, not just intersections. is_cand = (im_inds[:, None] == im_inds[None]) is_cand.view(-1)[diagonal_inds(is_cand)] = 0 # # Compute salience # gt_inds = fg_rels[:, 1:3].contiguous().view(-1) # labels_arange = labels.data.new(labels.size(0)) # torch.arange(0, labels.size(0), out=labels_arange) # salience_labels = ((gt_inds[:, None] == labels_arange[None]).long().sum(0) > 0).long() # labels = torch.stack((labels, salience_labels), 1) # Add in some BG labels # NOW WE HAVE TO EXCLUDE THE FGs. # TODO: check if this causes an error if many duplicate GTs havent been filtered out is_cand.view(-1)[fg_rels[:,1]*im_inds.size(0) + fg_rels[:,2]] = 0 is_bgcand = is_cand.nonzero() # TODO: make this sample on a per image case # If too many then sample num_fg = min(fg_rels.size(0), int(RELS_PER_IMG * REL_FG_FRACTION * num_im)) if num_fg < fg_rels.size(0): fg_rels = random_choose(fg_rels, num_fg) # If too many then sample num_bg = min(is_bgcand.size(0) if is_bgcand.dim() > 0 else 0, int(RELS_PER_IMG * num_im) - num_fg) if num_bg > 0: bg_rels = torch.cat(( im_inds[is_bgcand[:, 0]][:, None], is_bgcand, (is_bgcand[:, 0, None] < -10).long(), ), 1) if num_bg < is_bgcand.size(0): bg_rels = random_choose(bg_rels, num_bg) rel_labels = torch.cat((fg_rels, bg_rels), 0) else: rel_labels = fg_rels # last sort by rel. _, perm = torch.sort(rel_labels[:, 0]*(gt_boxes.size(0)**2) + rel_labels[:,1]*gt_boxes.size(0) + rel_labels[:,2]) rel_labels = rel_labels[perm].contiguous() labels = gt_classes[:,1].contiguous() return rois, labels, rel_labels
def forward(self, x, im_sizes, image_offset, gt_boxes=None, gt_classes=None, gt_rels=None, proposals=None, train_anchor_inds=None, return_fmap=False): """ Forward pass for Relation detection Args: x: Images@[batch_size, 3, IM_SIZE, IM_SIZE] im_sizes: A numpy array of (h, w, scale) for each image. image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0) parameters for training: gt_boxes: [num_gt, 4] GT boxes over the batch. gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class) gt_rels: proposals: train_anchor_inds: a [num_train, 2] array of indices for the anchors that will be used to compute the training loss. Each (img_ind, fpn_idx) return_fmap: Returns: If train: scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels If test: prob dists, boxes, img inds, maxscores, classes """ s_t = time.time() verbose = False def check(sl, een, sst=s_t): if verbose: print('{}{}'.format(sl, een - sst)) result = self.detector(x, im_sizes, image_offset, gt_boxes, gt_classes, gt_rels, proposals, train_anchor_inds, return_fmap=True) check('detector', tt()) assert not result.is_none(), 'Empty detection result' # image_offset refer to Blob # self.batch_size_per_gpu * index im_inds = result.im_inds - image_offset boxes = result.rm_box_priors obj_scores, box_classes = F.softmax( result.rm_obj_dists[:, 1:].contiguous(), dim=1).max(1) box_classes += 1 # TODO: predcls implementation obj_scores and box_classes num_img = im_inds[-1] + 1 # embed(header='rel_model.py before rel_assignments') if self.training and result.rel_labels is None: assert self.mode == 'sgdet' # only in sgdet mode # shapes: # im_inds: (box_num,) # boxes: (box_num, 4) # rm_obj_labels: (box_num,) # gt_boxes: (box_num, 4) # gt_classes: (box_num, 2) maybe[im_ind, class_ind] # gt_rels: (rel_num, 4) # image_offset: integer result.rel_labels = rel_assignments(im_inds.data, boxes.data, result.rm_obj_labels.data, gt_boxes.data, gt_classes.data, gt_rels.data, image_offset, filter_non_overlap=True, num_sample_per_gt=1) rel_inds = self.get_rel_inds(result.rel_labels, im_inds, boxes) rois = torch.cat((im_inds[:, None].float(), boxes), 1) # union boxes feats (NumOfRels, obj_dim) union_box_feats = self.visual_rep(result.fmap.detach(), rois, rel_inds[:, 1:].contiguous()) # single box feats (NumOfBoxes, feats) box_feats = self.obj_feature_map(result.fmap.detach(), rois) # box spatial feats (NumOfBox, 4) box_pair_feats = self.fuse_message(union_box_feats, boxes, box_classes, rel_inds) box_pair_score = self.relpn_fc(box_pair_feats) if self.training: # sampling pos and neg relations here for training rel_sample_pos, rel_sample_neg = 0, 0 pn_rel_label, pn_pair_score = list(), list() for i, s, e in enumerate_by_image( result.rel_labels[:, 0].data.contiguous()): im_i_rel_label = result.rel_labels[s:e].contiguous() im_i_box_pair_score = box_pair_score[s:e].contiguous() im_i_rel_fg_inds = torch.nonzero( im_i_rel_label[:, -1].contiguous()).squeeze() im_i_rel_fg_inds = im_i_rel_fg_inds.data.cpu().numpy() im_i_fg_sample_num = min(RELEVANT_PER_IM, im_i_rel_fg_inds.shape[0]) if im_i_rel_fg_inds.size > 0: im_i_rel_fg_inds = np.random.choice( im_i_rel_fg_inds, size=im_i_fg_sample_num, replace=False) im_i_rel_bg_inds = torch.nonzero( im_i_rel_label[:, -1].contiguous() == 0).squeeze() im_i_rel_bg_inds = im_i_rel_bg_inds.data.cpu().numpy() im_i_bg_sample_num = min(EDGES_PER_IM - im_i_fg_sample_num, im_i_rel_bg_inds.shape[0]) if im_i_rel_bg_inds.size > 0: im_i_rel_bg_inds = np.random.choice( im_i_rel_bg_inds, 
size=im_i_bg_sample_num, replace=False) #print('{}/{} fg/bg in image {}'.format(im_i_fg_sample_num, im_i_bg_sample_num, i)) rel_sample_pos += im_i_fg_sample_num rel_sample_neg += im_i_bg_sample_num im_i_keep_inds = np.append(im_i_rel_fg_inds, im_i_rel_bg_inds) im_i_pair_score = im_i_box_pair_score[ im_i_keep_inds.tolist()].contiguous() im_i_rel_pn_labels = Variable( torch.zeros(im_i_fg_sample_num + im_i_bg_sample_num).type( torch.LongTensor).cuda(x.get_device())) im_i_rel_pn_labels[:im_i_fg_sample_num] = 1 pn_rel_label.append(im_i_rel_pn_labels) pn_pair_score.append(im_i_pair_score) result.rel_pn_dists = torch.cat(pn_pair_score, 0) result.rel_pn_labels = torch.cat(pn_rel_label, 0) result.rel_sample_pos = torch.Tensor([rel_sample_pos]).cuda( im_i_rel_label.get_device()) result.rel_sample_neg = torch.Tensor([rel_sample_neg]).cuda( im_i_rel_label.get_device()) box_pair_relevant = F.softmax(box_pair_score, dim=1) box_pos_pair_ind = torch.nonzero(box_pair_relevant[:, 1].contiguous( ) > box_pair_relevant[:, 0].contiguous()).squeeze() if box_pos_pair_ind.data.shape == torch.Size([]): return None #print('{}/{} trim edges'.format(box_pos_pair_ind.size(0), rel_inds.size(0))) result.rel_trim_pos = torch.Tensor([box_pos_pair_ind.size(0)]).cuda( box_pos_pair_ind.get_device()) result.rel_trim_total = torch.Tensor([rel_inds.size(0) ]).cuda(rel_inds.get_device()) if self.trim_graph: # filtering relations filter_rel_inds = rel_inds[box_pos_pair_ind.data] filter_box_pair_feats = box_pair_feats[box_pos_pair_ind.data] else: filter_rel_inds = rel_inds filter_box_pair_feats = box_pair_feats if self.training: if self.trim_graph: filter_rel_labels = result.rel_labels[box_pos_pair_ind.data] else: filter_rel_labels = result.rel_labels num_gt_filtered = torch.nonzero(filter_rel_labels[:, -1]) if num_gt_filtered.shape == torch.Size([]): num_gt_filtered = 0 else: num_gt_filtered = num_gt_filtered.size(0) num_gt_orignial = torch.nonzero(result.rel_labels[:, -1]).size(0) result.rel_pn_recall = torch.Tensor( [num_gt_filtered / num_gt_orignial]).cuda(x.get_device()) result.rel_labels = filter_rel_labels check('trim', tt()) # message passing between boxes and relations if self.mode in ('sgcls', 'sgdet'): for _ in range(self.mp_iter_num): box_feats = self.message_passing(box_feats, filter_box_pair_feats, filter_rel_inds) box_cls_scores = self.cls_fc(box_feats) result.rm_obj_dists = box_cls_scores obj_scores, box_classes = F.softmax( box_cls_scores[:, 1:].contiguous(), dim=1).max(1) box_classes += 1 # skip background check('mp', tt()) # RelationCNN filter_box_pair_feats_fc1 = self.relcnn_fc1(filter_box_pair_feats) filter_box_pair_score = self.relcnn_fc2(filter_box_pair_feats_fc1) result.rel_dists = filter_box_pair_score pred_scores_stage_one = F.softmax(result.rel_dists, dim=1).data # filter_box_pair_feats is to be added to memory if self.training: padded_filter_feats, pack_lengths, re_filter_rel_inds, padded_rel_labels = \ self.pad_sequence( filter_rel_inds, filter_box_pair_feats_fc1, rel_labels=result.rel_labels ) else: padded_filter_feats, pack_lengths, re_filter_rel_inds, padded_rel_inds = \ self.pad_sequence( filter_rel_inds, filter_box_pair_feats_fc1 ) # trimming zeros to avoid no rel in image trim_pack_lengths = np.trim_zeros(pack_lengths) trim_padded_filter_feats = padded_filter_feats[:trim_pack_lengths. shape[0]] packed_filter_feats = pack_padded_sequence(trim_padded_filter_feats, trim_pack_lengths, batch_first=True) if self.training: trim_padded_rel_labels = padded_rel_labels[:trim_pack_lengths. 
shape[0]] packed_rel_labels = pack_padded_sequence(trim_padded_rel_labels, trim_pack_lengths, batch_first=True) rel_mem_dists = self.mem_module(inputs=packed_filter_feats, rel_labels=packed_rel_labels) rel_mem_dists = self.re_order_packed_seq(rel_mem_dists, filter_rel_inds, re_filter_rel_inds) result.rel_mem_dists = rel_mem_dists else: trim_padded_rel_inds = padded_rel_inds[:trim_pack_lengths.shape[0]] packed_rel_inds = pack_padded_sequence(trim_padded_rel_inds, trim_pack_lengths, batch_first=True) rel_mem_dists = self.mem_module(inputs=packed_filter_feats, rel_inds=packed_rel_inds, obj_classes=box_classes) rel_mem_probs = self.re_order_packed_seq(rel_mem_dists, filter_rel_inds, re_filter_rel_inds) rel_mem_probs = rel_mem_probs.data check('mem', tt()) if self.training: return result # pad stage one output in rel_mem_probs if it sums zero for rel_i in range(rel_mem_probs.size(0)): rel_i_probs = rel_mem_probs[rel_i] if rel_i_probs.sum() == 0: rel_mem_probs[rel_i] = pred_scores_stage_one[rel_i] """ filter_dets boxes: bbox regression else [num_box, 4] obj_scores: [num_box] probabilities for the scores obj_classes: [num_box] class labels integer rel_inds: [num_rel, 2] TENSOR consisting of (im_ind0, im_ind1) pred_scores: [num_rel, num_predicates] including irrelevant class(#relclass + 1) """ check('mem processing', tt()) return filter_dets(boxes, obj_scores, box_classes, filter_rel_inds[:, 1:].contiguous(), rel_mem_probs)
def message_passing(self, box_feats, rel_feats, edges):
    """Integrate box feats with each other.

    Box feats are updated in descending out-degree order, i.e. the node with the largest
    out-degree is updated first.
    Suppose nodes i and j are neighbours with a connection i->j,
    the features of i and j are fi and fj, and the feature of their union box is fij:
        fi = sigma(W1*fi + sum_neighbour(V1 * alpha * fij))
        fj = sigma(W2*fj + sum_neighbour(V2 * alpha * fij))
        alpha = attention(fi, fij)
    V1, V2, W1, W2 are parameters to be learned, sigma is the activation function,
    alpha is the attention weight.

    Args:
        box_feats: Variable, box features with shape of (NumOfBoxes, FEAT_DIM)
        rel_feats: Variable, edge features with shape of (NumOfRels, REL_FEAT_DIM)
        edges: Variable, scene graph edges (pruned), with shape of (NumOfRels, 3)
            e.g. edges[0, :] = [1, 0, 5] means box 0 and box 5 in image 1 had an affair~

    Returns:
        box_feats: Variable, box features combining relation features
    """
    im_inds = edges[:, 0].contiguous()
    num_img = im_inds[-1] + 1

    # list of dicts: record the out-degree of each box per image
    count_dic = [{} for _ in range(num_img)]
    for im_i, s, e in enumerate_by_image(im_inds):
        im_i_edges = edges[s:e, :].contiguous()
        for rel in im_i_edges:
            box0, box1 = rel[1:]
            count_dic[im_i][box0] = 1 + count_dic[im_i].get(box0, 0)

    # list of Variables
    box_nodes_feats = list()
    for box_feat in box_feats:
        box_nodes_feats.append(box_feat)  # .clone()

    for im_i, s, e in enumerate_by_image(im_inds):
        im_i_edges = edges[s:e, :].contiguous()
        im_i_rel_feats = rel_feats[s:e, :].contiguous()
        for box_id, v in sorted(count_dic[im_i].items(), key=lambda kv: kv[1], reverse=True):
            # update passing message
            # subject message from rel feats
            choose_sub_edges_ind = torch.nonzero(im_i_edges[:, 1].contiguous() == box_id).squeeze()
            choose_sub_edges = im_i_edges[choose_sub_edges_ind]
            choose_sub_rel_feats = im_i_rel_feats[choose_sub_edges_ind]
            box_id_feats = box_nodes_feats[box_id]

            # attention over subject relations
            num_sub_neigh = choose_sub_edges.size(0)
            sub_cat_att_feats = torch.cat(
                (box_id_feats.expand([num_sub_neigh, -1]), choose_sub_rel_feats), 1)
            sub_atten = self.mp_atten_fc(sub_cat_att_feats)
            sub_alpha = F.softmax(sub_atten, dim=0)
            sub_feats = (sub_alpha * self.sub_rel_mp_fc(choose_sub_rel_feats)).sum(0)

            # object message from rel feats (may be null)
            choose_obj_edges_ind = torch.nonzero(im_i_edges[:, 2].contiguous() == box_id).squeeze()
            if choose_obj_edges_ind.size() == torch.Size([]):
                box_id_feats = self.box_mp_fc(box_id_feats) + sub_feats
                box_id_feats = F.relu(box_id_feats, inplace=True)
                box_nodes_feats[box_id] = box_id_feats
                continue
            choose_obj_edges = im_i_edges[choose_obj_edges_ind]
            choose_obj_rel_feats = im_i_rel_feats[choose_obj_edges_ind]
            box_id_feats = box_nodes_feats[box_id]

            # attention over object relations
            num_obj_neigh = choose_obj_edges.size(0)
            obj_cat_att_feats = torch.cat(
                (box_id_feats.expand([num_obj_neigh, -1]), choose_obj_rel_feats), 1)
            obj_atten = self.mp_atten_fc(obj_cat_att_feats)
            obj_alpha = F.softmax(obj_atten, dim=0)
            obj_feats = (obj_alpha * self.obj_rel_mp_fc(choose_obj_rel_feats)).sum(0)

            # add back to box feature
            box_id_feats = self.box_mp_fc(box_id_feats) + obj_feats + sub_feats
            box_id_feats = F.relu(box_id_feats, inplace=True)
            box_nodes_feats[box_id] = box_id_feats

    mp_box_feats = torch.stack(box_nodes_feats)
    return mp_box_feats
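# A minimal, standalone sketch of the attention-weighted aggregation described in the docstring
# above (hypothetical module and dimension names; this is not the model's actual message_passing,
# which additionally orders updates by out-degree and distinguishes subject/object roles):
import torch
import torch.nn as nn
import torch.nn.functional as F

class EdgeToNodeMessage(nn.Module):
    def __init__(self, node_dim=512, edge_dim=512):
        super(EdgeToNodeMessage, self).__init__()
        self.att = nn.Linear(node_dim + edge_dim, 1)   # alpha = attention(fi, fij)
        self.v = nn.Linear(edge_dim, node_dim)         # V * fij
        self.w = nn.Linear(node_dim, node_dim)         # W * fi

    def forward(self, fi, fij):
        # fi: (node_dim,) features of node i; fij: (num_neighbours, edge_dim) union-box features
        alpha = F.softmax(self.att(torch.cat((fi.expand(fij.size(0), -1), fij), 1)), dim=0)
        return F.relu(self.w(fi) + (alpha * self.v(fij)).sum(0))  # sigma(W*fi + sum(alpha * V*fij))

# msg = EdgeToNodeMessage()
# new_fi = msg(torch.randn(512), torch.randn(3, 512))  # aggregate messages from 3 union boxes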
def rel_anchor_target(rois, gt_boxes, gt_classes, scores, gt_rels, image_offset): """ use all roi pairs and sample some pairs to train relation proposal module Note: ONLY for mode SGDET!!!! rois are from RPN, We take the CO_Overlap strategy from Graph-RCNN to sample fg and bg rels :param rois: N, 5 :param scores: N, N :param gt_rels: :return: """ im_inds = rois[:, 0].long() num_im = im_inds[-1] + 1 # Offset the image indices in fg_rels to refer to absolute indices (not just within img i) fg_rels = gt_rels.clone() fg_rels[:, 0] -= image_offset offset = {} for i, s, e in enumerate_by_image(gt_classes[:, 0]): offset[i] = s for i, s, e in enumerate_by_image(fg_rels[:, 0]): fg_rels[s:e, 1:3] += offset[i] gt_box_pairs = torch.cat( (gt_boxes[fg_rels[:, 1]], gt_boxes[fg_rels[:, 2]]), 1) # Ngtp, 8 # get all potential pairs is_cand = (im_inds[:, None] == im_inds[None]) is_cand.view(-1)[diagonal_inds(is_cand)] = 0 all_pair_inds = torch.nonzero(is_cand) all_box_pairs = torch.cat( (rois[:, 1:][all_pair_inds[:, 0]], rois[:, 1:][all_pair_inds[:, 1]]), 1) num_pairs = np.zeros(num_im + 1).astype(np.int32) id_to_iminds = {} for i, s, e in enumerate_by_image(im_inds): num_pairs[i + 1] = (e - s) * (e - s - 1) id_to_iminds[i] = im_inds[s] cumsum_num_pairs = np.cumsum(num_pairs).astype(np.int32) all_rel_inds = [] for i in range(1, num_im + 1): all_pair_inds_i = all_pair_inds[ cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]] all_box_pairs_i = all_box_pairs[ cumsum_num_pairs[i - 1]:cumsum_num_pairs[i]] gt_box_pairs_i = gt_box_pairs[torch.nonzero( fg_rels[:, 0] == (i - 1)).view(-1)] labels = gt_rels.new(all_box_pairs_i.size(0)).fill_(-1) overlaps = co_bbox_overlaps(all_box_pairs_i, gt_box_pairs_i) ## Np, Ngtp max_overlaps, argmax_overlaps = torch.max(overlaps, 1) ## Np gt_max_overlaps, _ = torch.max(overlaps, 0) ## Ngtp labels[max_overlaps < 0.15] = 0 gt_max_overlaps[gt_max_overlaps == 0] = 1e-5 # fg rel: for each gt pair, the max overlap anchor is fg keep = torch.sum( overlaps.eq(gt_max_overlaps.view(1, -1).expand_as(overlaps)), 1) # Np if torch.sum(keep) > 0: labels[keep > 0] = 1 # fg rel: above thresh labels[max_overlaps >= 0.25] = 1 num_fg = int(RELPN_BATCHSIZE * RELPN_FG_FRACTION) sum_fg = torch.sum((labels == 1).int()) sum_bg = torch.sum((labels == 0).int()) if sum_fg > num_fg: fg_inds = torch.nonzero(labels == 1).view(-1) rand_num = torch.from_numpy(np.random.permutation( fg_inds.size(0))).type_as(gt_boxes).long() disable_inds = fg_inds[rand_num[:fg_inds.size(0) - num_fg]] labels[disable_inds] = -1 num_bg = RELPN_BATCHSIZE - torch.sum((labels == 1).int()) if sum_bg > num_bg: bg_inds = torch.nonzero(labels == 0).view(-1) rand_num = torch.from_numpy(np.random.permutation( bg_inds.size(0))).type_as(gt_boxes).long() disable_inds = bg_inds[rand_num[:bg_inds.size(0) - num_bg]] labels[disable_inds] = -1 keep_inds = torch.nonzero(labels >= 0).view(-1) labels = labels[keep_inds] all_pair_inds_i = all_pair_inds_i[keep_inds] im_inds_i = torch.LongTensor([id_to_iminds[i - 1]] * keep_inds.size(0)).view(-1, 1).cuda( all_pair_inds.get_device()) all_pair_inds_i = torch.cat( (im_inds_i, all_pair_inds_i, labels.view(-1, 1)), 1) all_rel_inds.append(all_pair_inds_i) all_rel_inds = torch.cat(all_rel_inds, 0) # sort by rel _, perm = torch.sort(all_rel_inds[:, 0] * (rois.size(0)**2) + all_rel_inds[:, 1] * rois.size(0) + all_rel_inds[:, 2]) all_rel_inds = all_rel_inds[perm].contiguous() return all_rel_inds
def faster_rcnn(self, x, gt_boxes, gt_classes, gt_rels): targets, x_lst, original_image_sizes = [], [], [] device = self.rel_fc.weight.get_device( ) if self.rel_fc.weight.is_cuda else 'cpu' for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data): targets.append({ 'boxes': copy.deepcopy(gt_boxes[s:e]), 'labels': gt_classes[s:e, 1].long() }) x_lst.append(x[i].to(device).squeeze()) original_image_sizes.append(x[i].shape[-2:]) images, targets = self.detector.transform(x_lst, targets) fmap_multiscale = self.detector.backbone(images.tensors) if isinstance(fmap_multiscale, torch.Tensor): fmap_multiscale = OrderedDict([('0', fmap_multiscale)]) if self.mode != 'sgdet': rois, obj_labels, rel_labels = self.gt_labels( gt_boxes, gt_classes, gt_rels) rm_box_priors, rm_box_priors_org = [], [] for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data): rm_box_priors.append(targets[i]['boxes']) rm_box_priors_org.append(gt_boxes[s:e]) im_inds = rois[:, 0] result = Result( od_box_targets=None, rm_box_targets=None, od_obj_labels=obj_labels, rm_box_priors=torch.cat(rm_box_priors), rm_obj_labels=obj_labels, rpn_scores=None, rpn_box_deltas=None, rel_labels=rel_labels, im_inds=im_inds.long(), ) result.rm_box_priors_org = torch.cat(rm_box_priors_org) else: proposals, _ = self.detector.rpn(images, fmap_multiscale, targets) detections, _ = self.detector.roi_heads(fmap_multiscale, proposals, images.image_sizes, targets) boxes = copy.deepcopy(detections) boxes_all_dict = self.detector.transform.postprocess( detections, images.image_sizes, original_image_sizes) rm_box_priors, rm_box_priors_org, im_inds, obj_labels = [], [], [], [] for i in range(len(proposals)): if len(boxes[i]['boxes']) <= 1: raise ValueError( 'at least two objects must be detected to build relationships, make sure the detector is properly pretrained', boxes) rm_box_priors.append(boxes[i]['boxes']) rm_box_priors_org.append(boxes_all_dict[i]['boxes']) obj_labels.append(boxes_all_dict[i]['labels']) im_inds.append(torch.zeros(len(detections[i]['boxes'])) + i) im_inds = torch.cat(im_inds).to(device) result = Result(rm_obj_labels=torch.cat(obj_labels).view(-1), rm_box_priors=torch.cat(rm_box_priors), rel_labels=None, im_inds=im_inds.long()) result.rm_box_priors_org = torch.cat(rm_box_priors_org) if len(result.rm_box_priors) <= 1: raise ValueError( 'at least two objects must be detected to build relationships' ) result.im_sizes_org = original_image_sizes result.im_sizes = images.image_sizes result.fmap = fmap_multiscale[list( fmap_multiscale.keys())[-1]] # last scale for global feature maps result.rois = torch.cat( (im_inds.float()[:, None], result.rm_box_priors), 1) return result
def forward(self, x, im_sizes, image_offset, gt_boxes=None, gt_classes=None, gt_rels=None, *args):
    """
    Forward pass for detection.
    :param x: Images of shape [batch_size, 3, IM_SIZE, IM_SIZE]
    :param im_sizes: A numpy array of (h, w, scale) for each image.
    :param image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0)
    :param gt_boxes: [num_gt, 4] GT boxes over the batch.
    :param gt_classes: [num_gt, 2] array where each row is (img_id, class)
    :param gt_rels: [num_rels, 4] array where each row is (img_id, box_0, box_1, rel type)
    :return: If train: scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels
             if test: prob dists, boxes, img inds, maxscores, classes
    """
    with torch.no_grad():  # do not update anything in the detector
        targets, x_lst, original_image_sizes = [], [], []
        device = self.rel_fc.weight.get_device() if self.rel_fc.weight.is_cuda else 'cpu'
        gt_boxes = gt_boxes.to(device)
        gt_classes = gt_classes.to(device)
        gt_rels = gt_rels.to(device)
        for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data):
            targets.append({
                'boxes': copy.deepcopy(gt_boxes[s:e]),
                'labels': gt_classes[s:e, 1].long().to(device)
            })
            x_lst.append(x[i].to(device).squeeze())
            original_image_sizes.append(x[i].shape[-2:])
        images, targets = self.detector.transform(x_lst, targets)
        fmap_multiscale = self.detector.backbone(images.tensors)

        if self.mode != 'sgdet':
            rois, obj_labels, bbox_targets, rpn_scores, rpn_box_deltas, rel_labels = \
                self.gt_boxes(None, im_sizes, image_offset, self.RELS_PER_IMG,
                              gt_boxes, gt_classes, gt_rels, None,
                              proposals=None, sample_factor=-1)
            rm_box_priors, rm_box_priors_org = [], []
            for i, s, e in enumerate_by_image(gt_classes[:, 0].long().data):
                rm_box_priors.append(targets[i]['boxes'])
                rm_box_priors_org.append(gt_boxes[s:e])
            result = Result(od_box_targets=bbox_targets,
                            rm_box_targets=bbox_targets,
                            od_obj_labels=obj_labels,
                            rm_box_priors=torch.cat(rm_box_priors),
                            rm_obj_labels=obj_labels,
                            rpn_scores=rpn_scores,
                            rpn_box_deltas=rpn_box_deltas,
                            rel_labels=rel_labels,
                            im_inds=rois[:, 0].long().contiguous() + image_offset)
            result.rm_box_priors_org = torch.cat(rm_box_priors_org)
        else:
            if isinstance(fmap_multiscale, torch.Tensor):
                fmap_multiscale = OrderedDict([('0', fmap_multiscale)])  # string key, as in faster_rcnn above
            proposals, _ = self.detector.rpn(images, fmap_multiscale, targets)
            detections, _ = self.detector.roi_heads(fmap_multiscale, proposals,
                                                    images.image_sizes, targets)
            boxes = copy.deepcopy(detections)
            boxes_all_dict = self.detector.transform.postprocess(
                detections, images.image_sizes, original_image_sizes)
            rm_box_priors, rm_box_priors_org, im_inds, obj_labels = [], [], [], []
            for i in range(len(proposals)):
                rm_box_priors.append(boxes[i]['boxes'])
                rm_box_priors_org.append(boxes_all_dict[i]['boxes'])
                obj_labels.append(boxes_all_dict[i]['labels'])
                im_inds.append(torch.zeros(len(detections[i]['boxes']), device=device).float() + i)
            im_inds = torch.cat(im_inds).view(-1, 1)
            result = Result(rm_obj_labels=torch.cat(obj_labels).view(-1),
                            rm_box_priors=torch.cat(rm_box_priors),
                            rel_labels=None,
                            im_inds=im_inds.view(-1).long().contiguous() + image_offset)
            result.rm_box_priors_org = torch.cat(rm_box_priors_org)
            if len(result.rm_box_priors) <= 1:
                raise ValueError('at least two objects must be detected to build relationships')

    if result.is_none():
        raise ValueError('empty detection result')

    if self.detector_model == 'baseline':
        if self.slim > 0:
            result.fmap = self.fmap_reduce(result.fmap.detach())
        else:
            result.fmap = result.fmap.detach()

    im_inds = result.im_inds - image_offset
    boxes = result.rm_box_priors

    if not hasattr(result, 'rel_labels'):
        result.rel_labels = None

    if self.training and result.rel_labels is None:
        assert self.mode == 'sgdet'
        result.rel_labels = rel_assignments(im_inds.data, boxes.data,
                                            result.rm_obj_labels.data,
                                            gt_boxes.data, gt_classes.data, gt_rels.data,
                                            image_offset, filter_non_overlap=True,
                                            num_sample_per_gt=1)

    rel_inds = self.get_rel_inds(result.rel_labels if self.training else None, im_inds, boxes)
    rois = torch.cat((im_inds[:, None].float(), boxes), 1)
    # Union box of each subject-object pair: min of the top-left corners, max of the bottom-right corners
    union_rois = torch.cat((
        rois[:, 0][rel_inds[:, 1]][:, None],
        torch.min(rois[:, 1:3][rel_inds[:, 1]], rois[:, 1:3][rel_inds[:, 2]]),
        torch.max(rois[:, 3:5][rel_inds[:, 1]], rois[:, 3:5][rel_inds[:, 2]]),
    ), 1)

    node_feat = self.multiscale_roi_pool(fmap_multiscale, rm_box_priors, images.image_sizes)
    edge_feat = self.multiscale_roi_pool(fmap_multiscale, convert_roi_to_list(union_rois),
                                         images.image_sizes)
    result.rm_obj_dists, result.rel_dists = self.predict(node_feat, edge_feat, rel_inds,
                                                         rois, images.image_sizes)

    if self.use_bias:
        scores_nz = F.softmax(result.rm_obj_dists, dim=1).data
        scores_nz[:, 0] = 0.0  # exclude the background class
        _, score_ord = scores_nz[:, 1:].sort(dim=1, descending=True)
        result.obj_preds = score_ord[:, 0] + 1
        if self.mode == 'predcls':
            result.obj_preds = gt_classes.data[:, 1]
        freq_pred = self.freq_bias.index_with_labels(
            torch.stack((
                result.obj_preds[rel_inds[:, 1]],
                result.obj_preds[rel_inds[:, 2]],
            ), 1))
        # tune the weight for freq_bias
        if self.test_bias:
            result.rel_dists = freq_pred
        else:
            result.rel_dists = result.rel_dists + freq_pred

    if self.training:
        return result

    if self.mode == 'predcls':
        result.obj_scores = result.rm_obj_dists.data.new(gt_classes.size(0)).fill_(1)
        result.obj_preds = gt_classes.data[:, 1]
    elif self.mode in ['sgcls', 'sgdet']:
        scores_nz = F.softmax(result.rm_obj_dists, dim=1).data
        scores_nz[:, 0] = 0.0  # does not actually change anything here
        result.obj_scores, score_ord = scores_nz[:, 1:].sort(dim=1, descending=True)
        result.obj_preds = score_ord[:, 0] + 1
        result.obj_scores = result.obj_scores[:, 0]
    else:
        raise NotImplementedError(self.mode)

    result.obj_preds = Variable(result.obj_preds)
    result.obj_scores = Variable(result.obj_scores)

    # Boxes will get fixed by the filter_dets function.
    if self.detector_model == 'mrcnn':
        bboxes = result.rm_box_priors_org
    else:
        bboxes = result.rm_box_priors

    rel_rep = F.softmax(result.rel_dists, dim=1)
    return filter_dets(bboxes, result.obj_scores, result.obj_preds, rel_inds[:, 1:], rel_rep)
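# convert_roi_to_list is used above to feed union boxes into the multiscale ROI
# pooler but is defined elsewhere. torchvision's MultiScaleRoIAlign expects a
# per-image list of [x1, y1, x2, y2] tensors, so a plausible sketch under that
# assumption (not necessarily the repository's implementation) is:
def convert_roi_to_list_sketch(rois):
    # rois: [N, 5] tensor of (img_ind, x1, y1, x2, y2), sorted by img_ind
    boxes_per_image = []
    for _, s, e in enumerate_by_image(rois[:, 0].long()):
        boxes_per_image.append(rois[s:e, 1:5])
    return boxes_per_image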
def proposal_assignments_gtbox(rois, gt_boxes, gt_classes, gt_rels, image_offset, fg_thresh=0.5):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    :param rois: [num_rois, 5] array of [img_ind, x1, y1, x2, y2]
    :param gt_boxes: [num_boxes, 4] array of [x0, y0, x1, y1]. Not needed, it seems.
    :param gt_classes: [num_boxes, 2] array of [img_ind, class].
        Note, the img_inds here start at image_offset
    :param gt_rels: [num_rels, 4] array of [img_ind, box_0, box_1, rel type].
        Note, the img_inds here start at image_offset
    :param fg_thresh: overlap threshold for a ROI to be considered foreground (if >= fg_thresh)
    :return:
        rois: [num_rois, 5]
        labels: [num_rois] array of labels
        rel_labels: [num_rels, 4] (img ind, box0 ind, box1 ind, rel type)
    """
    im_inds = rois[:, 0].long()
    num_im = im_inds[-1] + 1

    # Offset the image indices in fg_rels to refer to absolute indices (not just within img i)
    fg_rels = gt_rels.clone()
    fg_rels[:, 0] -= image_offset
    offset = {}
    for i, s, e in enumerate_by_image(im_inds):
        offset[i] = s
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]

    # ---------------------------------------------------------------------- #
    # Number of background relations to pad per image so that every image
    # ends up with as many relations as the image with the most foreground relations.
    fg_rel_list = []
    for i in range(num_im):
        fg_rel_list.append((fg_rels[:, 0] == i).sum().item())
    longest_len = max(fg_rel_list)
    bg_rel_length = [longest_len - i for i in fg_rel_list]
    # ---------------------------------------------------------------------- #

    # Try ALL things, not just intersections.
    is_cand = (im_inds[:, None] == im_inds[None])
    is_cand.view(-1)[diagonal_inds(is_cand)] = 0

    # # Compute salience
    # gt_inds = fg_rels[:, 1:3].contiguous().view(-1)
    # labels_arange = labels.data.new(labels.size(0))
    # torch.arange(0, labels.size(0), out=labels_arange)
    # salience_labels = ((gt_inds[:, None] == labels_arange[None]).long().sum(0) > 0).long()
    # labels = torch.stack((labels, salience_labels), 1)

    # Add in some BG labels.
    # NOW WE HAVE TO EXCLUDE THE FGs.
    # TODO: check if this causes an error if many duplicate GTs haven't been filtered out
    is_cand.view(-1)[fg_rels[:, 1] * im_inds.size(0) + fg_rels[:, 2]] = 0
    is_bgcand = is_cand.nonzero()
    # TODO: make this sample on a per-image case
    # If there are too many foreground relations, sample them
    num_fg = min(fg_rels.size(0), int(RELS_PER_IMG * REL_FG_FRACTION * num_im))
    if num_fg < fg_rels.size(0):
        fg_rels = random_choose(fg_rels, num_fg)

    # If too many then sample
    num_bg = min(is_bgcand.size(0) if is_bgcand.dim() > 0 else 0, int(num_fg / 2))
    bg_rels = torch.cat((
        im_inds[is_bgcand[:, 0]][:, None],
        is_bgcand,
        (is_bgcand[:, 0, None] < -10).long(),  # background relation label (always 0)
    ), 1)

    rel_labels = fg_rels
    for i, j in enumerate(bg_rel_length):
        if bg_rels[bg_rels[:, 0] == i, :].shape[0] >= j:
            bg_rel_per_image = random_choose(bg_rels[bg_rels[:, 0] == i, :], j)
        else:
            bg_rel_per_image = torch.cat(
                (bg_rels[bg_rels[:, 0] == i, :],
                 random_choose(bg_rels[bg_rels[:, 0] == i, :],
                               j - bg_rels[bg_rels[:, 0] == i, :].shape[0])), 0)
        rel_labels = torch.cat((rel_labels, bg_rel_per_image), 0)

    # Last, sort by (image, subject, object).
    _, perm = torch.sort(rel_labels[:, 0] * (gt_boxes.size(0) ** 2) +
                         rel_labels[:, 1] * gt_boxes.size(0) +
                         rel_labels[:, 2])
    rel_labels = rel_labels[perm].contiguous()
    labels = gt_classes[:, 1].contiguous()

    return rois, labels, rel_labels
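# The offset bookkeeping at the top of proposal_assignments_gtbox is easy to
# misread: relation endpoints in gt_rels index boxes within each image, while
# the concatenated ROI tensor uses absolute row indices. A small worked example
# with hypothetical numbers (not taken from the repository):
def _offset_example():
    import torch  # the module already imports torch; repeated here so the example is standalone
    # image 0 owns ROI rows 0-2, image 1 owns ROI rows 3-4
    offset = {0: 0, 1: 3}
    # (img_ind, subj_box, obj_box, rel_type) with per-image box indices
    gt_rels = torch.tensor([[0, 0, 2, 7],
                            [1, 1, 0, 3]])
    fg_rels = gt_rels.clone()
    for i, s, e in enumerate_by_image(fg_rels[:, 0]):
        fg_rels[s:e, 1:3] += offset[i]
    # fg_rels is now [[0, 0, 2, 7], [1, 4, 3, 3]]: endpoints refer to rows of
    # the concatenated ROI tensor rather than boxes within each image.
    return fg_rels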