def forward(self, obj_logits, vr, obj_labels=None, boxes_per_cls=None):
    if self.mode == 'predcls':
        obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_obj_cls))
    else:
        obj_dists2 = obj_logits

    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline
        probs = F.softmax(obj_dists2, 1)
        nms_mask = obj_dists2.data.clone()
        nms_mask.zero_()
        for c_i in range(1, obj_dists2.size(1)):
            scores_ci = probs.data[:, c_i]
            boxes_ci = boxes_per_cls.data[:, c_i]
            keep = apply_nms(scores_ci, boxes_ci,
                             pre_nms_topn=scores_ci.size(0),
                             post_nms_topn=scores_ci.size(0),
                             nms_thresh=0.3)
            nms_mask[:, c_i][keep] = 1
        obj_preds = Variable(nms_mask * probs.data, volatile=True)[:, 1:].max(1)[1] + 1
    else:
        obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1

    rel_dists = self.vr_fc(vr)
    return obj_dists2, obj_preds, rel_dists
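# NOTE: a minimal sketch of the `to_onehot` helper assumed by the predcls
# branches in this file; the project's actual implementation (its exact fill
# magnitude, for instance) may differ. It maps GT labels to extreme logits so
# that a softmax over the result is effectively one-hot.
def to_onehot_sketch(labels, num_classes, fill=1000.0):
    # labels: LongTensor [num_obj] -> FloatTensor [num_obj, num_classes] of logits
    onehot = labels.new(labels.size(0), num_classes).float().fill_(-fill)
    onehot.scatter_(1, labels.view(-1, 1), fill)
    return onehot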
def forward(self, obj_fmaps, obj_logits, vr, rel_inds, obj_labels=None, boxes_per_cls=None):
    if self.mode == 'predcls':
        obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_classes))
    else:
        obj_dists2 = obj_logits

    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline
        probs = F.softmax(obj_dists2, 1)
        nms_mask = obj_dists2.data.clone()
        nms_mask.zero_()
        for c_i in range(1, obj_dists2.size(1)):
            scores_ci = probs.data[:, c_i]
            boxes_ci = boxes_per_cls.data[:, c_i]
            keep = apply_nms(scores_ci, boxes_ci,
                             pre_nms_topn=scores_ci.size(0),
                             post_nms_topn=scores_ci.size(0),
                             nms_thresh=0.3)
            nms_mask[:, c_i][keep] = 1
        obj_preds = Variable(nms_mask * probs.data, volatile=True)[:, 1:].max(1)[1] + 1
    else:
        obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1

    f_obj_rel = torch.stack([torch.cat([obj_fmaps[rel_ind[1]], obj_fmaps[rel_ind[2]], vr[index]])
                             for index, rel_ind in enumerate(rel_inds)])
    rel_dists = self.vr_fc(f_obj_rel)
    return obj_dists2, obj_preds, rel_dists
def obj_ctx(self, obj_feats, obj_labels=None, box_priors=None, boxes_per_cls=None,
            forest=None, batch_size=0):
    """
    Object context and object classification.
    :param obj_feats: [num_obj, img_dim + object embedding0 dim]
    :param obj_labels: [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes. We'll use this for NMS
    :param boxes_per_cls: per-class boxes, used for NMS in sgdet mode
    :param forest: binary trees consumed by the tree LSTMs
    :param batch_size: number of images (one virtual node is appended per image)
    :return: obj_dists: [num_obj, #classes] new probability distribution.
             obj_preds: argmax of that distribution.
             encoder_rep: [num_obj, #feats] for later!
    """
    # use the bidirectional tree LSTM to update
    encoder_rep = self.obj_tree_lstm(forest, obj_feats, box_priors.shape[0])

    # Decode in order
    if self.mode != 'predcls':
        decode_feature = torch.cat((obj_feats, encoder_rep), 1) if self.pass_in_obj_feats_to_decoder else encoder_rep
        obj_dists, obj_preds = self.decoder_tree_lstm(
            forest, decode_feature, box_priors.shape[0],
            labels=obj_labels if obj_labels is not None else None,
            boxes_for_nms=boxes_per_cls if boxes_per_cls is not None else None,
            batch_size=batch_size)
    else:
        assert obj_labels is not None
        obj_preds = obj_labels
        obj_dists = Variable(to_onehot(obj_preds.data[:-batch_size], self.num_classes))

    return obj_dists, obj_preds, encoder_rep
def forward(self, obj_fmaps, obj_logits, rel_inds, vr, obj_labels=None, boxes_per_cls=None):
    """
    Reason about relationship classes using knowledge of object and relationship co-occurrence.
    """
    # rel_inds has shape (num_rel, 3): (img_ind, subject_ind, object_ind)
    if self.mode == 'predcls':
        obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_obj_cls))
    else:
        obj_dists2 = obj_logits

    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline
        probs = F.softmax(obj_dists2, 1)
        nms_mask = obj_dists2.data.clone()
        nms_mask.zero_()
        for c_i in range(1, obj_dists2.size(1)):
            scores_ci = probs.data[:, c_i]
            boxes_ci = boxes_per_cls.data[:, c_i]
            keep = apply_nms(scores_ci, boxes_ci,
                             pre_nms_topn=scores_ci.size(0),
                             post_nms_topn=scores_ci.size(0),
                             nms_thresh=0.3)
            nms_mask[:, c_i][keep] = 1
        obj_preds = Variable(nms_mask * probs.data, volatile=True)[:, 1:].max(1)[1] + 1
    else:
        obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1

    sub_obj_preds = torch.cat((obj_preds[rel_inds[:, 1]].view(-1, 1),
                               obj_preds[rel_inds[:, 2]].view(-1, 1)), 1)

    obj_fmaps = self.obj_proj(obj_fmaps)
    vr = self.rel_proj(vr)
    input_ggnn = torch.stack([
        torch.cat([obj_fmaps[rel_ind[1]].unsqueeze(0),
                   obj_fmaps[rel_ind[2]].unsqueeze(0),
                   vr[index].repeat(self.num_rel_cls, 1)], 0)
        for index, rel_ind in enumerate(rel_inds)
    ])

    rel_dists = self.ggnn_rel(rel_inds[:, 1:], sub_obj_preds, input_ggnn)
    return obj_dists2, obj_preds, rel_dists
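# Shape sketch of the GGNN input built above, with hypothetical sizes
# (d, num_rel_cls, and the *_p tensors are placeholders, not project values);
# rel_inds rows are (img_ind, subject_ind, object_ind).
import torch

d, num_rel_cls = 512, 51
obj_fmaps_p, vr_p = torch.randn(10, d), torch.randn(4, d)
rel_inds_p = torch.tensor([[0, 1, 2], [0, 3, 4], [1, 0, 5], [1, 2, 6]])
input_ggnn_p = torch.stack([
    torch.cat([obj_fmaps_p[r[1]].unsqueeze(0),      # subject node feature
               obj_fmaps_p[r[2]].unsqueeze(0),      # object node feature
               vr_p[i].repeat(num_rel_cls, 1)], 0)  # one vr copy per relation class
    for i, r in enumerate(rel_inds_p)])
assert input_ggnn_p.shape == (4, 2 + num_rel_cls, d)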
def obj_ctx(self, obj_feats, obj_dists, im_inds, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Object context and object classification.

    Args:
        obj_feats: Variable, object features with shape of
            (NumOfRoIs, feature_dim + word_embedding_dim + pos_embedding_dim)
        obj_dists: object class scores, with shape of (num_obj, #classes)
        im_inds: the indices of the images, with shape of (NumOfRoIs,)
        obj_labels: [num_obj] the GT labels of the image
        box_priors: box coordinates of objects, with shape of (NumOfRoIs, 4)
        boxes_per_cls: per-class boxes, used for NMS in sgdet mode

    Returns:
        obj_dists: [num_obj, #classes] new probability distribution.
        obj_preds: argmax of that distribution, with shape of (NumOfRoIs,),
            the exact class index values
        encoder_rep: torch.Tensor, encoder features with shape of (NumOfRoIs, biLSTM_dim(512))
    """
    # Sort by the confidence of the maximum detection.
    confidence = F.softmax(obj_dists, dim=1).data[:, 1:].max(1)[0]
    perm, inv_perm, ls_transposed = self.sort_rois(im_inds.data, confidence, box_priors)

    # Pass object features, sorted by score, into the encoder LSTM
    obj_inp_rep = obj_feats[perm].contiguous()
    input_packed = PackedSequence(obj_inp_rep, ls_transposed)
    encoder_rep = self.obj_ctx_rnn(input_packed)[0][0]

    # Decode in order
    if self.mode != 'predcls':
        decoder_inp = PackedSequence(
            torch.cat((obj_inp_rep, encoder_rep), 1) if self.pass_in_obj_feats_to_decoder else encoder_rep,
            ls_transposed)
        obj_dists, obj_preds = self.decoder_rnn(
            decoder_inp,
            labels=obj_labels[perm] if obj_labels is not None else None,
            boxes_for_nms=boxes_per_cls[perm] if boxes_per_cls is not None else None,
        )
        obj_preds = obj_preds[inv_perm]
        obj_dists = obj_dists[inv_perm]
    else:
        assert obj_labels is not None
        obj_preds = obj_labels
        obj_dists = Variable(to_onehot(obj_preds.data, self.num_classes))
    encoder_rep = encoder_rep[inv_perm]

    return obj_dists, obj_preds, encoder_rep
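# The perm / inv_perm pair returned by sort_rois above is a permutation and
# its inverse; a tiny self-contained check of the round-trip invariant the
# code relies on (a = a[perm][inv_perm]):
import torch

scores = torch.tensor([0.2, 0.9, 0.5])
perm = scores.sort(descending=True)[1]    # indices that sort by confidence
inv_perm = perm.sort()[1]                 # argsort of a permutation inverts it
x = torch.randn(3, 4)
assert torch.equal(x[perm][inv_perm], x)  # the original order is recovered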
def obj_ctx(self, obj_feats, obj_dists, im_inds, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Object context and object classification.
    :param obj_feats: obj_pre_rep, [num_obj, 4096+200+128]
    :param obj_dists: result.rm_obj_dists.detach(), [num_obj, 151]
    :param im_inds: [num_obj] the indices of the images
    :param obj_labels: od_obj_labels, [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes. We'll use this for NMS
    :return: obj_dists: [num_obj, #classes] new probability distribution.
             obj_preds: argmax of that distribution.
             encoder_rep: [num_obj, #feats] for later!
    """
    # Encode in order.
    # Sort by the confidence of the maximum detection:
    # [384,151]->[384,151]->[384,150]->[384,2], (scores, true_index - 1) -> scores;
    # the index is 150-d but the ground truth is 151-d
    confidence = F.softmax(obj_dists, dim=1).data[:, 1:].max(1)[0]
    # Sort rois (boxes) within the !same! image according to some order;
    # a = a[perm][inv_perm]
    # perm: [384]; inv_perm: [384]; ls_transposed: len=64 (6, ..., 6, 6)
    perm, inv_perm, ls_transposed = self.sort_rois(im_inds.data, confidence, box_priors)
    # Pass object features, sorted within the same image by score
    obj_inp_rep = obj_feats[perm].contiguous()  # make cache/memory contiguous
    # [6, 64, 151], (batch_size, num_timesteps, input_size)
    input_packed = PackedSequence(obj_inp_rep, ls_transposed)
    # encoder_rep: [#boxes, 512]
    encoder_rep = self.obj_ctx_rnn(input_packed)[0][0]

    # Decode in order
    if self.mode != 'predcls':
        decoder_inp = PackedSequence(
            torch.cat((obj_inp_rep, encoder_rep), 1) if self.pass_in_obj_feats_to_decoder else encoder_rep,
            ls_transposed)
        # when training sgdet: obj_preds = F.softmax(obj_dists, dim=1).data[:, 1:].max(1)[1] + 1
        obj_dists, obj_preds = self.decoder_rnn(
            decoder_inp,  # obj_dists[perm],
            labels=obj_labels[perm] if obj_labels is not None else None,
            boxes_for_nms=boxes_per_cls[perm] if boxes_per_cls is not None else None,  # not None for sgdet
        )
        obj_preds = obj_preds[inv_perm]
        obj_dists = obj_dists[inv_perm]
    else:
        assert obj_labels is not None
        obj_preds = obj_labels
        obj_dists = Variable(to_onehot(obj_preds.data, self.num_classes))
        # obj_preds = Variable(F.softmax(obj_dists, dim=1).data[:, 1:].max(1)[1] + 1)
    encoder_rep = encoder_rep[inv_perm]

    return obj_dists, obj_preds, encoder_rep
def forward(self, obj_fmaps, obj_logits, im_inds, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Forward pass through the object and edge context
    :param obj_fmaps: [num_obj, feature_dim] object feature maps
    :param obj_logits: [num_obj, #classes] detector scores
    :param im_inds: [num_obj] the indices of the images
    :param obj_labels: [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes
    :return:
    """
    obj_embed = F.softmax(obj_logits, dim=1) @ self.obj_embed.weight
    pos_embed = self.pos_embed(center_size(box_priors))
    # obj_pre_rep = self.conver_fusion_feature(torch.cat((obj_fmaps, obj_embed, pos_embed), 1))
    obj_pre_rep = self.conver_fusion_feature(torch.cat((obj_embed, pos_embed), 1))

    # UNSURE WHAT TO DO HERE
    if self.mode == 'predcls':
        obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_classes))
    else:
        obj_dists2 = self.decoder_lin(obj_pre_rep)

    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline
        probs = F.softmax(obj_dists2, 1)
        nms_mask = obj_dists2.data.clone()
        nms_mask.zero_()
        for c_i in range(1, obj_dists2.size(1)):
            scores_ci = probs.data[:, c_i]
            boxes_ci = boxes_per_cls.data[:, c_i]
            keep = apply_nms(scores_ci, boxes_ci,
                             pre_nms_topn=scores_ci.size(0),
                             post_nms_topn=scores_ci.size(0),
                             nms_thresh=0.3)
            nms_mask[:, c_i][keep] = 1
        obj_preds = Variable(nms_mask * probs.data, volatile=True)[:, 1:].max(1)[1] + 1
    else:
        obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1

    return obj_dists2, obj_preds, obj_pre_rep
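# The `obj_embed` line above is a probability-weighted mixture of class word
# vectors; a sketch with hypothetical sizes (151 classes, 200-d embeddings):
import torch
import torch.nn.functional as F

logits_p = torch.randn(8, 151)                # detector logits for 8 boxes
embed_weight_p = torch.randn(151, 200)        # stands in for self.obj_embed.weight
obj_embed_p = F.softmax(logits_p, dim=1) @ embed_weight_p
assert obj_embed_p.shape == (8, 200)          # one soft embedding per box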
def obj_ctx(self, obj_feats, obj_dists, im_inds, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Object context and object classification.
    :param obj_feats: [num_obj, img_dim + object embedding0 dim]
    :param obj_dists: [num_obj, #classes]
    :param im_inds: [num_obj] the indices of the images
    :param obj_labels: [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes. We'll use this for NMS
    :return: obj_dists: [num_obj, #classes] new probability distribution.
             obj_preds: argmax of that distribution.
             encoder_rep: [num_obj, #feats] for later!
    """
    # Sort by the confidence of the maximum detection.
    confidence = F.softmax(obj_dists, dim=1).data[:, 1:].max(1)[0]
    perm, inv_perm, ls_transposed = self.sort_rois(im_inds.data, confidence, box_priors)

    # Pass object features, sorted by score, into the encoder LSTM
    obj_inp_rep = obj_feats[perm].contiguous()
    input_packed = PackedSequence(obj_inp_rep, ls_transposed)
    encoder_rep = self.obj_ctx_rnn(input_packed)[0][0]

    # Decode in order
    if self.mode != 'predcls':
        decoder_inp = PackedSequence(
            torch.cat((obj_inp_rep, encoder_rep), 1) if self.pass_in_obj_feats_to_decoder else encoder_rep,
            ls_transposed)
        obj_dists, obj_preds = self.decoder_rnn(
            decoder_inp,  # obj_dists[perm],
            labels=obj_labels[perm] if obj_labels is not None else None,
            boxes_for_nms=boxes_per_cls[perm] if boxes_per_cls is not None else None,
        )
        obj_preds = obj_preds[inv_perm]
        obj_dists = obj_dists[inv_perm]
    else:
        assert obj_labels is not None
        obj_preds = obj_labels
        obj_dists = Variable(to_onehot(obj_preds.data, self.num_classes))
    encoder_rep = encoder_rep[inv_perm]

    return obj_dists, obj_preds, encoder_rep
def forward(self, obj_dists1, obj_feats, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Forward pass through the object and edge context
    :param obj_dists1: [num_obj, #classes] prior object distributions
    :param obj_feats: [num_obj, feat_dim] object features
    :param obj_labels: [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes
    :param boxes_per_cls: per-class boxes, used for NMS in sgdet mode
    :return:
    """
    # UNSURE WHAT TO DO HERE
    if self.mode == 'predcls':
        obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_classes))
    else:
        obj_dists2 = self.decoder_lin(obj_feats) + obj_dists1

    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline: greedily pick the globally best (box, class)
        # pair, then suppress overlapping boxes of the same class.
        is_overlap = nms_overlaps(boxes_per_cls.data).view(
            boxes_per_cls.size(0), boxes_per_cls.size(0),
            boxes_per_cls.size(1)).cpu().numpy() >= 0.5

        probs = F.softmax(obj_dists2, 1).data.cpu().numpy()
        probs[:, 0] = 0
        obj_preds = obj_dists2.data.new(obj_dists2.shape[0]).long().fill_(0)
        for i in range(obj_preds.size(0)):
            box_ind, cls_ind = np.unravel_index(probs.argmax(), probs.shape)
            obj_preds[int(box_ind)] = int(cls_ind)
            probs[is_overlap[box_ind, :, cls_ind], cls_ind] = 0.0
            probs[box_ind] = -1.0
        obj_preds = Variable(obj_preds.view(-1))
    else:
        obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1

    return obj_dists2, obj_preds
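# The greedy decoding above repeatedly picks the globally best (box, class)
# pair; np.unravel_index converts the flat argmax into those two indices:
import numpy as np

probs_p = np.array([[0.1, 0.7],
                    [0.9, 0.2]])
box_ind_p, cls_ind_p = np.unravel_index(probs_p.argmax(), probs_p.shape)
assert (box_ind_p, cls_ind_p) == (1, 0)  # row 1, column 0 holds the max, 0.9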
def forward(self, im_inds, obj_fmaps, obj_labels):
    """
    Reason about object classes using knowledge of object co-occurrence
    """
    if self.mode == 'predcls':
        # in the 'predcls' task, there is no need to run GGNN_obj
        obj_dists = Variable(to_onehot(obj_labels.data, self.num_obj_cls))
        return obj_dists
    else:
        input_ggnn = self.obj_proj(obj_fmaps)

        lengths = []
        for i, s, e in enumerate_by_image(im_inds.data):
            lengths.append(e - s)
        obj_cum_add = np.cumsum([0] + lengths)
        obj_dists = torch.cat(
            [self.ggnn_obj(input_ggnn[obj_cum_add[i]:obj_cum_add[i + 1]]) for i in range(len(lengths))], 0)
        return obj_dists
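# A minimal sketch of the `enumerate_by_image` generator assumed above (the
# project's version may differ): it yields (img_ind, start, end) for each
# contiguous run of identical image indices in a sorted im_inds sequence.
def enumerate_by_image_sketch(im_inds):
    start = 0
    for i in range(1, len(im_inds) + 1):
        if i == len(im_inds) or im_inds[i] != im_inds[start]:
            yield int(im_inds[start]), start, i
            start = i

# list(enumerate_by_image_sketch([0, 0, 1, 1, 1])) == [(0, 0, 2), (1, 2, 5)]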
def forward(self, obj_fmaps, obj_logits, im_inds, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Forward pass through the object and edge context
    :param obj_fmaps: [num_obj, feature_dim] object feature maps
    :param obj_logits: [num_obj, #classes] detector scores
    :param im_inds: [num_obj] the indices of the images
    :param obj_labels: [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes
    :return:
    """
    obj_embed = F.softmax(obj_logits, dim=1) @ self.obj_embed.weight
    pos_embed = self.pos_embed(Variable(center_size(box_priors)))
    obj_pre_rep = torch.cat((obj_fmaps, obj_embed, pos_embed), 1)

    if self.nl_obj > 0:
        obj_dists2, obj_preds, obj_ctx = self.obj_ctx(
            obj_pre_rep,
            obj_logits,
            im_inds,
            obj_labels,
            box_priors,
            boxes_per_cls,
        )
    else:
        # UNSURE WHAT TO DO HERE
        if self.mode == 'predcls':
            obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_classes))
        else:
            obj_dists2 = self.decoder_lin(obj_pre_rep)
        obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1
        obj_ctx = obj_pre_rep

    edge_ctx = None
    if self.nl_edge > 0:
        edge_ctx = self.edge_ctx(
            torch.cat((obj_fmaps, obj_ctx), 1) if self.pass_in_obj_feats_to_edge else obj_ctx,
            obj_dists=obj_dists2.detach(),  # Was previously obj_logits.
            im_inds=im_inds,
            obj_preds=obj_preds,
            box_priors=box_priors,
        )

    return obj_dists2, obj_preds, edge_ctx
def obj_ctx(self, obj_feats, obj_labels=None, boxes_for_nms=None):
    """
    Object context and object classification.
    :param obj_feats: [num_obj, obj_dim + embed_dim + ctx_dim]: O0
    :param obj_labels: [num_obj] the GT labels of the image
    :param boxes_for_nms: [num_obj, 4] boxes. We'll use this for NMS
    :return: obj_dists: [num_obj, num_classes] new probability distribution: O4
             obj_preds: [num_obj] argmax of that distribution: O4'
             obj_ctx: [num_obj, hidden_dim] for the later edge context: O3
    """
    O1, O2 = self.RE1(obj_feats)
    obj_ctx, obj_dists = self.RE2(O2)

    if self.mode != 'predcls':
        obj_preds = self.get_max_preds(obj_dists, obj_labels, boxes_for_nms)
    else:
        assert obj_labels is not None
        obj_preds = obj_labels
        obj_dists = Variable(to_onehot(obj_preds.data, self.num_classes))

    return obj_dists, obj_preds, obj_ctx
def forward(self, obj_fmaps, obj_logits, im_inds, obj_labels=None, box_priors=None,
            boxes_per_cls=None, batch_size=None, rois=None, od_box_deltas=None,
            im_sizes=None, image_offset=None, gt_classes=None, gt_boxes=None):
    """
    Forward pass through the object and edge context
    :param obj_fmaps: [num_obj, feature_dim] object feature maps
    :param obj_logits: [num_obj, #classes] detector scores
    :param im_inds: [num_obj] the indices of the images
    :param obj_labels: [num_obj] the GT labels of the image
    :param box_priors: [num_obj, 4] boxes
    :return:
    """
    obj_embed = F.softmax(obj_logits, dim=1) @ self.obj_embed.weight
    pos_embed = self.pos_embed(Variable(center_size(box_priors)))
    obj_pre_rep = torch.cat((obj_fmaps, obj_embed, pos_embed), 1)

    if self.mode == 'predcls':
        obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_classes))
    else:
        if self.mode == 'sgcls':
            obj_dists2 = self.decoder_lin1(obj_pre_rep)
            obj_dists2 = self.decoder_lin2(obj_dists2.view(-1, 1, 1024), 1)
            obj_dists2 = obj_dists2[1]
            obj_dists2 = self.decoder_lin3(obj_dists2.view(-1, 1024))
        else:
            # this is for sgdet
            obj_dists2 = self.decoder_lin1(obj_pre_rep)
            perm, inv_perm, ls_transposed = self.sort_rois(im_inds.data, None, box_priors)
            obj_dists2 = obj_dists2[perm].contiguous()
            obj_dists2 = PackedSequence(obj_dists2, torch.tensor(ls_transposed))
            obj_dists2, lengths1 = pad_packed_sequence(obj_dists2, batch_first=False)
            obj_dists2 = self.decoder_lin2(obj_dists2.view(-1, batch_size, 1024), batch_size)[1]
            obj_dists2, _ = pack_padded_sequence(obj_dists2, lengths1, batch_first=False)
            obj_dists2 = self.decoder_lin3(obj_dists2.view(-1, 1024))
            obj_dists2 = obj_dists2[inv_perm]

    if (not self.training and not self.mode == 'gtbox') or self.mode in ('sgdet', 'refinerels'):
        # try: don't apply NMS here, but after our own object classifier
        nms_inds, nms_scores, nms_preds, nms_boxes_assign, nms_boxes, nms_imgs = self.nms_boxes(
            obj_dists2.clone().detach(),
            rois,
            od_box_deltas.clone().detach(),
            im_sizes,
        )
        im_inds = nms_imgs + image_offset
        obj_dists2 = obj_dists2[nms_inds]
        obj_fmap = obj_fmaps[nms_inds]
        box_deltas = od_box_deltas[nms_inds]
        box_priors = nms_boxes[:, 0]
        rois = rois[nms_inds]

        if self.training and not self.mode == 'gtbox':
            # NOTE: If we're doing this during training, we need to assign labels here.
            pred_to_gtbox = bbox_overlaps(box_priors, gt_boxes).data
            pred_to_gtbox[im_inds.data[:, None] != gt_classes.data[None, :, 0]] = 0.0
            max_overlaps, argmax_overlaps = pred_to_gtbox.max(1)
            rm_obj_labels = gt_classes[:, 1][argmax_overlaps]
            rm_obj_labels[max_overlaps < 0.5] = 0
        else:
            rm_obj_labels = None

    if self.mode == 'sgdet' and not self.training:
        # NMS here for baseline (also tried during training)
        probs = F.softmax(obj_dists2, 1)
        nms_mask = obj_dists2.data.clone()
        nms_mask.zero_()
        for c_i in range(1, obj_dists2.size(1)):
            scores_ci = probs.data[:, c_i]
            boxes_ci = nms_boxes.data[:, c_i]
            keep = apply_nms(scores_ci, boxes_ci,
                             pre_nms_topn=scores_ci.size(0),
                             post_nms_topn=scores_ci.size(0),
                             nms_thresh=0.5)  # 0.3 is the default
            nms_mask[:, c_i][keep] = 1
        # this is for the sgdet test
        obj_preds = Variable(nms_mask * probs.data, volatile=True)[:, 1:].max(1)[1] + 1
        # obj_preds = obj_dists2[:, 1:].max(1)[1] + 1
    else:
        if self.mode == 'sgdet':
            # use GT labels when available
            obj_preds = rm_obj_labels if rm_obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1
            # or use the predicted label instead:
            # obj_preds = obj_dists2[:, 1:].max(1)[1] + 1
        else:
            obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1

    if self.mode == 'sgdet':
        return obj_dists2, obj_preds, im_inds, box_priors, rm_obj_labels, rois, nms_boxes
    else:
        return obj_dists2, obj_preds
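# The sgdet branch above round-trips through pad_packed_sequence /
# pack_padded_sequence to run its decoder on padded batches. A small check of
# that round trip with toy data (recent PyTorch API; lengths must be sorted
# in decreasing order):
import torch
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence

data = torch.randn(5, 3)                          # 5 packed rows of width 3
batch_sizes = torch.tensor([2, 2, 1])             # 2 seqs alive, then 2, then 1
packed = PackedSequence(data, batch_sizes)
padded, lengths = pad_packed_sequence(packed)     # padded: [3, 2, 3]; lengths: [3, 2]
repacked = pack_padded_sequence(padded, lengths)  # back to the packed layout
assert torch.equal(repacked.data, data)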
def forward(self, obj_fmaps, obj_logits, im_inds, obj_labels=None, box_priors=None,
            boxes_per_cls=None, gt_forest=None, image_rois=None, image_fmap=None,
            co_occour=None, rel_labels=None, origin_img=None):
    """
    Forward pass through the object and edge context
    :param obj_fmaps: [obj_num, feature_dim] object feature maps
    :param obj_logits: [obj_num, #classes] detector scores
    :param im_inds: [obj_num] long variable, image index per object
    :param obj_labels: [obj_num] GT labels, if available
    :param box_priors: [obj_num, (x1, y1, x2, y2)], float cuda
    :return:
    """
    if self.mode == 'predcls':
        obj_logits = Variable(to_onehot(obj_labels.data, self.num_classes))
    obj_embed = F.softmax(obj_logits, dim=1) @ self.obj_embed.weight
    batch_size = image_rois.shape[0]

    # Pseudo box and image index: encode one virtual node per image into the original inputs
    pseudo_box_priors = torch.cat((box_priors, image_rois[:, 1:].contiguous().data), 0)  # [obj_num + batch_size, 4]
    pseudo_im_inds = torch.cat((im_inds, image_rois[:, 0].contiguous().long().view(-1)), 0)  # [obj_num + batch_size]
    pseudo_obj_fmaps = torch.cat((obj_fmaps.clone().detach(), image_fmap.detach()), 0)  # [obj_num + batch_size, 4096]
    virtual_embed = self.virtual_node_embed.weight[0].view(1, -1).expand(batch_size, -1)
    pseudo_obj_embed = torch.cat((obj_embed, virtual_embed), 0)  # [obj_num + batch_size, embed_dim]
    if self.training or (self.mode == 'predcls'):
        pseudo_obj_labels = torch.cat(
            (obj_labels,
             Variable(torch.randn(1).fill_(0).cuda()).expand(batch_size).long().view(-1)), 0)
    else:
        pseudo_obj_labels = None

    if self.mode == 'sgdet':
        obj_distributions = F.softmax(obj_logits, dim=1)[:, 1:]
    else:
        obj_distributions = F.softmax(obj_logits[:, 1:], dim=1)
    pseudo_obj_distributions = torch.cat(
        (obj_distributions,
         Variable(torch.randn(batch_size, obj_distributions.shape[1]).fill_(0).cuda())), 0)

    # Generate the input for RL tree generation
    box_embed = tree_utils.get_box_info(Variable(pseudo_box_priors))  # 8 dims
    overlap_embed, _ = tree_utils.get_overlap_info(pseudo_im_inds, Variable(pseudo_box_priors))  # 4 dims
    prepro_feat = self.feat_preprocess_net(pseudo_obj_fmaps, pseudo_obj_embed, box_embed, overlap_embed)
    pair_scores, pair_rel_gate, pair_rel_gt = self.rl_score_net(
        prepro_feat, pseudo_obj_distributions, co_occour, rel_labels, batch_size, im_inds, pseudo_im_inds)

    arbitrary_forest, gen_tree_loss, entropy_loss = gen_tree.generate_forest(
        pseudo_im_inds, gt_forest, pair_scores, Variable(pseudo_box_priors),
        pseudo_obj_labels, self.use_rl_tree, self.training, self.mode)
    forest = arbitraryForest_to_biForest(arbitrary_forest)

    pseudo_pos_embed = self.pos_embed(Variable(center_size(pseudo_box_priors)))
    obj_pre_rep = torch.cat((pseudo_obj_fmaps, pseudo_obj_embed, pseudo_pos_embed), 1)
    if self.nl_obj > 0:
        obj_dists2, obj_preds, obj_ctx = self.obj_ctx(
            obj_pre_rep,
            pseudo_obj_labels,
            pseudo_box_priors,
            boxes_per_cls,
            forest,
            batch_size,
        )
    else:
        print('Error, No obj ctx')

    edge_ctx = None
    if self.nl_edge > 0:
        edge_ctx = self.edge_ctx(
            torch.cat((pseudo_obj_fmaps, obj_ctx), 1) if self.pass_in_obj_feats_to_edge else obj_ctx,
            obj_preds=obj_preds,
            box_priors=pseudo_box_priors,
            forest=forest,
        )

    # Draw trees
    if self.draw_tree and (self.draw_tree_count < self.draw_tree_max):
        for tree_idx in range(len(forest)):
            draw_tree_region(forest[tree_idx], origin_img, self.draw_tree_count)
            draw_tree_region_v2(forest[tree_idx], origin_img, self.draw_tree_count, obj_preds)
            self.draw_tree_count += 1

    # Remove the virtual nodes before returning
    return obj_dists2, obj_preds[:-batch_size], edge_ctx[:-batch_size], \
        gen_tree_loss, entropy_loss, pair_rel_gate, pair_rel_gt
def forward(self, x, im_sizes, image_offset, gt_boxes=None, gt_classes=None, gt_rels=None,
            proposals=None, train_anchor_inds=None, return_fmap=False, depth_imgs=None):
    """
    Forward pass for relation detection
    :param x: Images@[batch_size, 3, IM_SIZE, IM_SIZE]
    :param im_sizes: a numpy array of (h, w, scale) for each image.
    :param image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0)
    :param gt_boxes: [num_gt, 4] GT boxes over the batch.
    :param gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class)
    :param gt_rels: [] gt relations
    :param proposals: region proposals retrieved from file
    :param train_anchor_inds: a [num_train, 2] array of indices for the anchors that will
        be used to compute the training loss. Each is (img_ind, fpn_idx)
    :param return_fmap: if the object detector must return the extracted feature maps
    :param depth_imgs: depth images [batch_size, 1, IM_SIZE, IM_SIZE]
    """
    # -- Get prior `result` object (instead of calling the faster-rcnn detector)
    result = self.get_prior_results(image_offset, gt_boxes, gt_classes, gt_rels)

    # -- Get RoIs and relations
    rois, rel_inds = self.get_rois_and_rels(result, image_offset, gt_boxes, gt_classes, gt_rels)

    # -- Determine subject and object indices
    subj_inds = rel_inds[:, 1]
    obj_inds = rel_inds[:, 2]

    # -- Extract features from the depth backbone
    depth_features = self.depth_backbone(depth_imgs)
    # -- Prevent the gradients from flowing back to the depth backbone (pre-trained mode)
    if self.pretrained_depth:
        depth_features = depth_features.detach()

    # -- Extract RoI features for relation detection
    depth_rois_features = self.get_roi_features_depth(depth_features, rois)

    # -- Create a pairwise relation vector out of location features
    rel_depth = torch.cat(
        (depth_rois_features[subj_inds], depth_rois_features[obj_inds]), 1)
    rel_depth_fc = self.depth_rel_hlayer(rel_depth)

    # -- Predict relation distances
    result.rel_dists = self.depth_rel_out(rel_depth_fc)
    # --- *** END OF ARCHITECTURE *** --- #

    # -- Prepare the object predictions vector (assuming it's predcls)
    obj_labels = result.rm_obj_labels if self.training or self.mode == 'predcls' else None
    # One-hot vector of objects
    result.rm_obj_dists = Variable(to_onehot(obj_labels.data, self.num_classes))
    # Indexed vector
    result.obj_preds = obj_labels if obj_labels is not None else result.rm_obj_dists[:, 1:].max(1)[1] + 1

    if self.training:
        return result

    twod_inds = arange(result.obj_preds.data) * self.num_classes + result.obj_preds.data
    result.obj_scores = F.softmax(result.rm_obj_dists, dim=1).view(-1)[twod_inds]

    # Boxes will get fixed by the filter_dets function.
    bboxes = result.rm_box_priors
    rel_rep = F.softmax(result.rel_dists, dim=1)

    # Filtering: Subject_Score * Pred_score * Obj_score, sorted and ranked
    return filter_dets(bboxes, result.obj_scores, result.obj_preds, rel_inds[:, 1:], rel_rep)
def forward(self, obj_fmaps, obj_logits, im_inds, obj_labels=None, box_priors=None, boxes_per_cls=None):
    """
    Forward pass through the object and edge context
    :param obj_fmaps: 4096-dim RoI feature maps
    :param obj_logits: result.rm_obj_dists.detach()
    :param im_inds:
    :param obj_labels: od_obj_labels, gt
    :param box_priors: boxes from the faster-rcnn output
    :return: obj_dists2: [#boxes, 151], new scores for boxes
             obj_preds: [#boxes], predicted class values
             edge_ctx: [#boxes, 512], new features for boxes
    """
    # Object state:
    # obj_embed: [#boxes, 200]; both it and self.obj_embed.weight are Variables
    # obj_logits: result.rm_obj_dists.detach(), [#boxes, 151], detector scores before softmax
    obj_embed = F.softmax(obj_logits, dim=1) @ self.obj_embed.weight
    # center_size returns boxes as (center_x, center_y, width, height)
    # pos_embed: [#boxes, 128], Variable, from boxes after Sequential processing
    pos_embed = self.pos_embed(Variable(center_size(box_priors)))
    # obj_pre_rep: [#boxes, 4424], Variable
    obj_pre_rep = torch.cat((obj_fmaps, obj_embed, pos_embed), 1)

    if self.nl_obj > 0:
        # obj_dists2: [#boxes, 151], new score for each box
        # obj_preds: [#boxes], predicted class value
        # obj_ctx: [#boxes, 512], new feature vector for each box
        obj_dists2, obj_preds, obj_ctx = self.obj_ctx(
            obj_pre_rep,  # original: obj_pre_rep; alternative: obj_fmaps
            obj_logits,
            im_inds,
            obj_labels,
            box_priors,
            boxes_per_cls,
        )
    else:
        # UNSURE WHAT TO DO HERE
        if self.mode == 'predcls':
            obj_dists2 = Variable(to_onehot(obj_labels.data, self.num_classes))
        else:
            obj_dists2 = self.decoder_lin(obj_pre_rep)

        if self.mode == 'sgdet' and not self.training:
            # NMS here for baseline
            probs = F.softmax(obj_dists2, 1)
            nms_mask = obj_dists2.data.clone()
            nms_mask.zero_()
            for c_i in range(1, obj_dists2.size(1)):
                scores_ci = probs.data[:, c_i]
                boxes_ci = boxes_per_cls.data[:, c_i]
                keep = apply_nms(scores_ci, boxes_ci,
                                 pre_nms_topn=scores_ci.size(0),
                                 post_nms_topn=scores_ci.size(0),
                                 nms_thresh=0.3)
                nms_mask[:, c_i][keep] = 1
            obj_preds = Variable(nms_mask * probs.data, volatile=True)[:, 1:].max(1)[1] + 1
        else:
            obj_preds = obj_labels if obj_labels is not None else obj_dists2[:, 1:].max(1)[1] + 1
        obj_ctx = obj_pre_rep

    # Edge state:
    edge_ctx = None
    if self.nl_edge > 0:
        # edge_ctx: [#boxes, 512]
        edge_ctx = self.edge_ctx(
            torch.cat((obj_fmaps, obj_ctx), 1) if self.pass_in_obj_feats_to_edge else obj_ctx,
            obj_dists=obj_dists2.detach(),  # Was previously obj_logits.
            im_inds=im_inds,
            obj_preds=obj_preds,
            box_priors=box_priors,
        )

    return obj_dists2, obj_preds, edge_ctx
def forward(self, x, im_sizes, image_offset, gt_boxes=None, gt_masks=None, gt_classes=None,
            gt_rels=None, pred_boxes=None, pred_masks=None, pred_fmaps=None, pred_dists=None):
    """
    Forward pass for detection
    :param x: Images@[batch_size, 3, IM_SIZE, IM_SIZE]
    :param im_sizes: A numpy array of (h, w, scale) for each image.
    :param image_offset: Offset onto what image we're on for MGPU training (if single GPU this is 0)
    :param gt_boxes: [num_gt, 4] GT boxes over the batch.
    :param gt_classes: [num_gt, 2] gt boxes where each one is (img_id, class)
    :param pred_fmaps: N*256*14*14
    :param pred_boxes: N*4
    :param pred_masks: N*28*28
    :param pred_dists: N*85
    :return: If train: scores, boxdeltas, labels, boxes, boxtargets, rpnscores, rpnboxes, rellabels
             If test: prob dists, boxes, img inds, maxscores, classes
    """
    if self.training:
        im_inds = gt_classes[:, 0]
        rois = torch.cat((im_inds.float()[:, None], gt_boxes), 1)
        # This is actually relation assignment for sgcls:
        # assign the GT to relations; the RoIs themselves do not change.
        rois, labels, rel_labels = proposal_assignments_gtbox(
            rois.data, gt_boxes.data, gt_classes.data, gt_rels.data, image_offset)
        pred_boxes = rois[:, 1:]
        pred_masks = gt_masks
        pred_dists = Variable(to_onehot(labels.data, self.num_classes))
    else:
        im_inds = pred_boxes[:, 0].long()
        pred_boxes = pred_boxes[:, 1:]
        labels = gt_classes[:, 1]
        rel_labels = None
        pred_dists = Variable(to_onehot(pred_dists.data.long(), self.num_classes))
        rois = torch.cat((im_inds[:, None].float(), pred_boxes), 1)

    result = Result()
    # pred_fmaps = pred_fmaps * self.downsample(pred_masks[:, None, :, :])
    # result.obj_fmap = self.roi_fmap_obj(pred_fmaps.view(len(pred_fmaps), -1))
    result.obj_fmap = self.obj_feature_map(pred_fmaps, rois)
    result.rm_obj_dists = pred_dists
    result.rm_obj_labels = labels
    result.rel_labels = rel_labels

    rel_inds = self.get_rel_inds(result.rel_labels, im_inds, pred_boxes)

    # Prevent gradients from flowing back into score_fc from elsewhere
    result.rm_obj_dists, result.obj_preds, edge_ctx = self.context(
        result.obj_fmap,
        result.rm_obj_dists,
        im_inds,
        result.rm_obj_labels if self.training or self.mode == 'predcls' else None,
        pred_boxes.data,
        None)

    if edge_ctx is None:
        edge_rep = self.post_emb(result.obj_preds)
    else:
        edge_rep = self.post_lstm(edge_ctx)

    # Split into subject and object representations
    edge_rep = edge_rep.view(edge_rep.size(0), 2, self.pooling_dim)
    subj_rep = edge_rep[:, 0]
    obj_rep = edge_rep[:, 1]
    prod_rep = subj_rep[rel_inds[:, 1]] * obj_rep[rel_inds[:, 2]]

    vr = self.visual_rep(pred_fmaps, rois, rel_inds[:, 1:])
    prod_rep = prod_rep * vr
    # if self.use_vision:
    #     vr = self.visual_rep(pred_fmaps, rois, rel_inds[:, 1:])
    #     if self.limit_vision:
    #         # exact value TBD
    #         prod_rep = torch.cat((prod_rep[:, :2048] * vr[:, :2048], prod_rep[:, 2048:]), 1)
    #     else:
    #         prod_rep = prod_rep * vr

    if self.use_tanh:
        prod_rep = F.tanh(prod_rep)

    result.rel_dists = self.rel_compress(prod_rep)

    if self.use_bias:
        result.rel_dists = result.rel_dists + self.freq_bias.index_with_labels(
            torch.stack((
                result.obj_preds[rel_inds[:, 1]],
                result.obj_preds[rel_inds[:, 2]],
            ), 1))

    if self.training:
        return result

    twod_inds = arange(result.obj_preds.data) * self.num_classes + result.obj_preds.data
    result.obj_scores = F.softmax(result.rm_obj_dists, dim=1).view(-1)[twod_inds]

    # # Bbox regression
    # if self.mode == 'sgdet':
    #     bboxes = result.boxes_all.view(-1, 4)[twod_inds].view(result.boxes_all.size(0), 4)
    # else:
    #     # Boxes will get fixed by the filter_dets function.
    #     bboxes = result.rm_box_priors

    rel_rep = F.softmax(result.rel_dists, dim=1)

    return filter_dets_mask(pred_boxes, pred_masks, result.obj_scores,
                            result.obj_preds, rel_inds[:, 1:], rel_rep)