Example #1
import numpy as np
import torch

import net_utils  # project-local helper; to_data converts tensors to numpy (import path may differ)


def nms_detections(props,
                   scores,
                   overlap=0.7,
                   captions=None,
                   topk=-1,
                   return_index=False):
    """ Non-maximum suppression: Greedily select high-scoring detections and
    skip detections that are significantly covered by a previously selected
    detection. This version is translated from Matlab code by Tomasz
    Malisiewicz, who sped up Pedro Felzenszwalb's code.
    Args:
        props: ndarray
            Two-dimensional array of shape (num_props, 2), containing the start and
            end boundaries of the temporal proposals.
        scores: ndarray
            One-dimensional array of shape (num_props,), containing the corresponding
            scores for each detection above.
    Returns:
        nms_props, nms_scores, (nms_caps) : ndarrays, ndarrays, strs
            Arrays with the same number of dimensions as the original input, but
            with only the proposals selected after non-maximum suppression.
    """
    # Accept torch tensors or Python lists in addition to ndarrays.
    if isinstance(props, torch.Tensor): props = net_utils.to_data(props)
    if isinstance(props, list): props = np.asarray(props)
    if isinstance(scores, torch.Tensor): scores = net_utils.to_data(scores)
    t1 = props[:, 0]
    t2 = props[:, 1]
    ind = np.argsort(scores)  # ascending, so the best proposal comes last
    if topk > 0 and len(ind) > topk:
        ind = ind[-topk:]  # keep only the topk highest-scoring proposals
    area = (t2 - t1 + 1).astype(float)  # temporal length of each proposal
    pick = []
    while len(ind) > 0:
        i = ind[-1]  # index of the highest-scoring remaining proposal
        pick.append(i)
        ind = ind[:-1]
        # Intersection of proposal i with every remaining proposal.
        tt1 = np.maximum(t1[i], t1[ind])
        tt2 = np.minimum(t2[i], t2[ind])
        wh = np.maximum(0., tt2 - tt1 + 1.0)
        # Temporal IoU between proposal i and the remaining proposals.
        o = wh / (area[i] + area[ind] - wh)
        # Keep only proposals whose overlap with i is below the threshold.
        ind = ind[np.nonzero(o <= overlap)[0]]
    if captions is not None:  # truth-testing an ndarray raises a ValueError
        nms_props, nms_scores, nms_caps = props[
            pick, :], scores[pick], captions[pick]
        if return_index:
            return nms_props, nms_scores, nms_caps, pick
        else:
            return nms_props, nms_scores, nms_caps
    else:
        nms_props, nms_scores = props[pick, :], scores[pick]
        if return_index:
            return nms_props, nms_scores, pick
        else:
            return nms_props, nms_scores
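
A minimal usage sketch (the proposals and scores below are made-up values for illustration):

import numpy as np

props = np.array([[0, 10], [2, 12], [20, 30]])  # (start, end) boundaries
scores = np.array([0.9, 0.8, 0.7])
nms_props, nms_scores = nms_detections(props, scores, overlap=0.5)
# [2, 12] overlaps [0, 10] with IoU 9/13 ~ 0.69 > 0.5, so it is suppressed;
# nms_props is [[0, 10], [20, 30]] and nms_scores is [0.9, 0.7].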
Example #2
    # Assumes the enclosing module imports: from collections import OrderedDict
    # and the same project-local net_utils helper used in Example #1.
    def _infer(self, net_inps, mode="forward", gts=None):
        # fetch inputs
        word_labels = net_inps["query_labels"]  # [B,L] (nword == L)
        word_masks = net_inps["query_masks"]  # [B,L]
        c3d_feats = net_inps["video_feats"]  # [B,T,d_v]
        seg_masks = net_inps["video_masks"].squeeze(2)  # [B,T]
        B, nseg, _ = c3d_feats.size()  # nseg == T

        # forward encoders
        # get word-level, sentence-level and segment-level features
        word_feats, sen_feats = self.query_enc(word_labels, word_masks,
                                               "both")  # [B,L,*]
        seg_feats = self.video_enc(c3d_feats, seg_masks)  # [B,nseg,*]

        # get semantic phrase features:
        # se_feats: semantic phrase features [B,nse,*];
        #           ([e^1,...,e^n]) in Eq. (7)
        # se_attw: attention weights for semantic phrase [B,nse,nword];
        #           ([a^1,...,a^n]) in Eq. (6)
        if self.nse > 1:
            se_feats, se_attw = self.sqan(sen_feats, word_feats, word_masks)
        else:
            se_attw = None

        # Local-global video-text interactions
        # sa_feats: semantics-aware segment features [B,nseg,d]; R in Eq. (12)
        # s_attw: aggregating weights [B,nse]
        if self.nse > 1:
            q_feats = se_feats
        else:
            q_feats = sen_feats
        sa_feats, s_attw = self.vti_fn(seg_feats, seg_masks, q_feats)

        # Temporal attentive localization by regression
        # loc: prediction of time span (t^s, t^e)
        # t_attw: temporal attention weights (o)
        loc, t_attw = self.ta_reg_fn(sa_feats, seg_masks)

        if mode == "forward":
            outs = OrderedDict()
            outs["grounding_loc"] = loc
            if self.use_tag_loss:
                outs["tag_attw"] = t_attw
            if self.use_dqa_loss:
                outs["dqa_attw"] = se_attw
        else:
            outs = dict()
            outs["vids"] = gts["vids"]
            outs["qids"] = gts["qids"]
            outs["query_labels"] = net_utils.to_data(net_inps["query_labels"])
            outs["grounding_gt"] = net_utils.to_data(
                gts["grounding_att_masks"])
            outs["grounding_pred"] = net_utils.loc2mask(loc, seg_masks)
            outs["nfeats"] = gts["nfeats"]
            if self.nse > 1:
                outs["se_attw"] = net_utils.to_data(se_attw)
            else:
                outs["se_attw"] = net_utils.to_data(
                    t_attw.new_zeros(t_attw.size(0), 2, 4))
            outs["t_attw"] = net_utils.to_data(t_attw.unsqueeze(1))
            if s_attw is None:
                outs["s_attw"] = net_utils.to_data(
                    t_attw.new_zeros(t_attw.size(0), 2, 4))
            else:
                outs["s_attw"] = net_utils.to_data(s_attw)

            if mode == "save_output":
                outs["duration"] = gts["duration"]
                outs["timestamps"] = gts["timestamps"]
                outs["grounding_pred_loc"] = net_utils.to_data(loc)

        return outs
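
A minimal sketch of the inputs _infer expects, following the shape comments in the code above; the sizes are made up, and model stands in for an instance of the enclosing network class:

import torch

B, L, T, d_v = 4, 12, 128, 500  # batch, query length, segments, feature dim
net_inps = {
    "query_labels": torch.randint(0, 1000, (B, L)),  # word indices
    "query_masks": torch.ones(B, L),                 # 1 for real tokens
    "video_feats": torch.randn(B, T, d_v),           # segment (C3D) features
    "video_masks": torch.ones(B, T, 1),              # squeezed to [B,T] inside
}
outs = model._infer(net_inps, mode="forward")
loc = outs["grounding_loc"]  # predicted time spans (t^s, t^e)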