import numpy as np
import torch

# net_utils is a project-local helper module (assumed importable within the repo).


def nms_detections(props, scores, overlap=0.7, captions=None, topk=-1,
                   return_index=False):
    """Non-maximum suppression.

    Greedily select high-scoring detections and skip detections that are
    significantly covered by a previously selected detection. This version is
    translated from Matlab code by Tomasz Malisiewicz, who sped up Pedro
    Felzenszwalb's code.

    Args:
        props: ndarray
            Two-dimensional array of shape (num_props, 2), containing the
            start and end boundaries of the temporal proposals.
        scores: ndarray
            One-dimensional array of shape (num_props,), containing the
            corresponding scores for each detection above.

    Returns:
        nms_props, nms_scores, (nms_caps): ndarray, ndarray, (ndarray)
            Arrays with the same number of dimensions as the original input,
            but containing only the proposals selected by non-maximum
            suppression.
    """
    if isinstance(props, torch.Tensor):
        props = net_utils.to_data(props)
    if isinstance(props, list):
        props = np.asarray(props)
    if isinstance(scores, torch.Tensor):
        scores = net_utils.to_data(scores)

    t1 = props[:, 0]
    t2 = props[:, 1]
    ind = np.argsort(scores)
    if topk > 0 and len(ind) > topk:
        ind = ind[-topk:]
    area = (t2 - t1 + 1).astype(float)
    pick = []
    while len(ind) > 0:
        i = ind[-1]  # index with highest proposal score
        pick.append(i)
        ind = ind[:-1]
        tt1 = np.maximum(t1[i], t1[ind])
        tt2 = np.minimum(t2[i], t2[ind])
        wh = np.maximum(0., tt2 - tt1 + 1.0)
        o = wh / (area[i] + area[ind] - wh)  # temporal IoU with the picked proposal
        ind = ind[np.nonzero(o <= overlap)[0]]  # keep only weakly-overlapping proposals

    if captions is not None:
        nms_props, nms_scores, nms_caps = (
            props[pick, :], scores[pick], captions[pick])
        if return_index:
            return nms_props, nms_scores, nms_caps, pick
        else:
            return nms_props, nms_scores, nms_caps
    else:
        nms_props, nms_scores = props[pick, :], scores[pick]
        if return_index:
            return nms_props, nms_scores, pick
        else:
            return nms_props, nms_scores
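
# Minimal usage sketch (illustrative only; the proposal boundaries and scores
# below are made-up values, not part of the original file). It shows NMS keeping
# the highest-scoring proposal and dropping a near-duplicate whose temporal IoU
# with it exceeds the overlap threshold.
if __name__ == "__main__":
    example_props = np.array([[0, 10], [1, 11], [20, 30]])
    example_scores = np.array([0.9, 0.8, 0.7])
    kept_props, kept_scores = nms_detections(
        example_props, example_scores, overlap=0.7)
    # The second proposal overlaps the first with IoU ~0.83 > 0.7, so only the
    # first and third proposals survive.
    print(kept_props, kept_scores)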
def _infer(self, net_inps, mode="forward", gts=None):
    # fetch inputs
    word_labels = net_inps["query_labels"]          # [B,L] (nword == L)
    word_masks = net_inps["query_masks"]            # [B,L]
    c3d_feats = net_inps["video_feats"]             # [B,T,d_v]
    seg_masks = net_inps["video_masks"].squeeze(2)  # [B,T]
    B, nseg, _ = c3d_feats.size()                   # nseg == T

    # forward encoders
    # get word-level, sentence-level and segment-level features
    word_feats, sen_feats = self.query_enc(
        word_labels, word_masks, "both")              # [B,L,*]
    seg_feats = self.video_enc(c3d_feats, seg_masks)  # [B,nseg,*]

    # get semantic phrase features:
    # se_feats: semantic phrase features [B,nse,*];
    #           ([e^1,...,e^n]) in Eq. (7)
    # se_attw: attention weights for semantic phrase [B,nse,nword];
    #          ([a^1,...,a^n]) in Eq. (6)
    if self.nse > 1:
        se_feats, se_attw = self.sqan(sen_feats, word_feats, word_masks)
    else:
        se_attw = None

    # Local-global video-text interactions
    # sa_feats: semantics-aware segment features [B,nseg,d]; R in Eq. (12)
    # s_attw: aggregating weights [B,nse]
    if self.nse > 1:
        q_feats = se_feats
    else:
        q_feats = sen_feats
    sa_feats, s_attw = self.vti_fn(seg_feats, seg_masks, q_feats)

    # Temporal attentive localization by regression
    # loc: prediction of time span (t^s, t^e)
    # t_attw: temporal attention weights (o)
    loc, t_attw = self.ta_reg_fn(sa_feats, seg_masks)

    if mode == "forward":
        outs = OrderedDict()
        outs["grounding_loc"] = loc
        if self.use_tag_loss:
            outs["tag_attw"] = t_attw
        if self.use_dqa_loss:
            outs["dqa_attw"] = se_attw
    else:
        outs = dict()
        outs["vids"] = gts["vids"]
        outs["qids"] = gts["qids"]
        outs["query_labels"] = net_utils.to_data(net_inps["query_labels"])
        outs["grounding_gt"] = net_utils.to_data(gts["grounding_att_masks"])
        outs["grounding_pred"] = net_utils.loc2mask(loc, seg_masks)
        outs["nfeats"] = gts["nfeats"]
        if self.nse > 1:
            outs["se_attw"] = net_utils.to_data(se_attw)
        else:
            outs["se_attw"] = net_utils.to_data(
                t_attw.new_zeros(t_attw.size(0), 2, 4))
        outs["t_attw"] = net_utils.to_data(t_attw.unsqueeze(1))
        if s_attw is None:
            outs["s_attw"] = net_utils.to_data(
                t_attw.new_zeros(t_attw.size(0), 2, 4))
        else:
            outs["s_attw"] = net_utils.to_data(s_attw)
        if mode == "save_output":
            outs["duration"] = gts["duration"]
            outs["timestamps"] = gts["timestamps"]
            outs["grounding_pred_loc"] = net_utils.to_data(loc)

    return outs
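
# Illustrative sketch of the inputs _infer expects (not part of the original
# file). The batch size, query length, number of segments, and feature
# dimension below are made-up values, and "model" stands for the surrounding
# grounding network, which is not defined here.
import torch

_B, _L, _T, _d_v = 4, 20, 128, 1024
example_net_inps = {
    "query_labels": torch.randint(1, 1000, (_B, _L)),  # word indices       [B,L]
    "query_masks": torch.ones(_B, _L),                 # 1 = valid word     [B,L]
    "video_feats": torch.randn(_B, _T, _d_v),          # segment features   [B,T,d_v]
    "video_masks": torch.ones(_B, _T, 1),              # squeezed to [B,T] inside _infer
}
# outs = model._infer(example_net_inps, mode="forward")
# outs["grounding_loc"] would then hold the predicted (t^s, t^e) per example.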