Beispiel #1
0
def _gen_detection_video(video_list,
                         cls_data,
                         thu_label_id,
                         video_dict,
                         cfg,
                         num_prop=200,
                         topk=2):
    """Turn pickled sliding-window outputs into per-video detections.

    For each video, every pickle found in ``cfg.output_path/<video_name>/``
    is loaded and its start/end curves and IoU map are accumulated into
    whole-video arrays (averaged by how many windows covered each position).
    Candidate segments are scored as ``start * end * iou``, filtered with
    ``Soft_NMS``, and the best ``num_prop`` of them are paired with the
    ``topk`` highest-scoring video-level classes.

    The resulting list of ``{"label", "score", "segment"}`` dicts is written
    to the module-level ``result_dict`` under the video name; nothing is
    returned.

    Videos whose result folder cannot be listed, or contains no files, are
    skipped (no entry is written). Videos for which no candidate survives
    the boundary masks get an empty list.

    Args:
        video_list: Video names to process.
        cls_data: Per-video classification scores, iterated in lockstep with
            ``video_list``.
        thu_label_id: Indexable lookup applied to a class position in the
            score vector; the looked-up value plus one indexes
            ``thumos_class``.
        video_dict: Maps a video name to a dict providing ``"frame"`` and
            ``"duration"``.
        cfg: Config object; reads ``output_path``, ``DATASET.dscale``,
            ``FEATURE.snippet_stride``, ``DETECTION_POST.iou_threshold`` and
            ``DETECTION_POST.sigma``.
        num_prop: Maximum number of segments kept per video.
        topk: Number of video-level classes attached to every segment.
    """
    dscale = cfg.DATASET.dscale
    cols = ["xmin", "xmax", "score"]

    for video_name, video_cls in zip(video_list, cls_data):
        # Collect all window-level result files of this video.
        video_folder = os.path.join(cfg.output_path, video_name)
        try:
            files = sorted(
                os.path.join(video_folder, f)
                for f in os.listdir(video_folder))
        except OSError as err:
            # Only filesystem problems are tolerated here; anything else
            # should surface instead of being silently swallowed.
            print("Cannot read results for video {}: {}".format(
                video_name, err))
            continue
        if not files:
            # Without any window output there is nothing meaningful to emit.
            print("Missing result for video {}".format(video_name))
            continue

        # Prepare whole-video buffers.
        num_frames = video_dict[video_name]["frame"]
        seconds = video_dict[video_name]["duration"]

        snippet_stride = cfg.FEATURE.snippet_stride
        v_snippets = [
            snippet_stride * i
            for i in range(num_frames // snippet_stride + 1)
        ]
        snippet_num = len(v_snippets)

        v_iou_map = np.zeros((dscale, snippet_num))
        v_tem_s = np.zeros(snippet_num)
        v_tem_e = np.zeros(snippet_num)

        # Windows overlap, so keep per-position counts to average afterwards.
        v_iou_map_cnt = np.zeros((dscale, snippet_num))
        v_tem_cnt = np.zeros(snippet_num)

        for snippet_file in files:
            # NOTE: pickle.load executes arbitrary code from the file; these
            # files must come from a trusted source.
            with open(snippet_file, "rb") as infile:
                result_data = pickle.load(infile)

            (tmp_snippet, pred_local_s, pred_local_e, pred_global_s,
             pred_global_e, pred_iou_map) = result_data

            # Geometric mean of the local and global boundary predictions.
            pred_start = np.sqrt(pred_local_s * pred_global_s)
            pred_end = np.sqrt(pred_local_e * pred_global_e)

            # Restrict the window to snippets with a valid time stamp
            # (0 <= t <= num_frames). flatnonzero yields plain indices, which
            # avoids converting a 1-element array to int.
            start_idx = int(np.flatnonzero(tmp_snippet >= 0)[0])
            end_idx = int(np.flatnonzero(tmp_snippet <= num_frames)[-1])
            true_start = tmp_snippet[start_idx]
            true_end = tmp_snippet[end_idx]

            # Position of the valid part inside the whole-video grid.
            v_s_idx = v_snippets.index(true_start)
            v_e_idx = v_snippets.index(true_end)

            # Accumulate boundary curves.
            v_tem_s[v_s_idx:v_e_idx + 1] += pred_start[start_idx:end_idx + 1]
            v_tem_e[v_s_idx:v_e_idx + 1] += pred_end[start_idx:end_idx + 1]

            # Accumulate the IoU map (product of its two channels).
            # NOTE(review): the map is sliced from column 0 while the curves
            # above are sliced from start_idx; if start_idx can be > 0 these
            # may be misaligned -- confirm against the window generator.
            valid_len = end_idx - start_idx + 1
            iou_mask = get_valid_mask(dscale, valid_len)
            pred_iou_map = pred_iou_map[:, :, :valid_len]
            pred_iou_map = pred_iou_map[0, :, :] * pred_iou_map[1, :, :]
            v_iou_map[:, v_s_idx:v_e_idx + 1] += pred_iou_map * iou_mask

            # Update coverage counts.
            v_tem_cnt[v_s_idx:v_e_idx + 1] += 1
            v_iou_map_cnt[:, v_s_idx:v_e_idx + 1] += iou_mask

        # Average; the epsilon keeps uncovered positions at zero instead of
        # dividing by zero.
        v_tem_s /= v_tem_cnt + 1e-6
        v_tem_e /= v_tem_cnt + 1e-6
        v_iou_map /= v_iou_map_cnt + 1e-6

        # The first/last snippet are always allowed as start/end boundaries.
        start_mask = boundary_choose(v_tem_s)
        start_mask[0] = 1.0
        end_mask = boundary_choose(v_tem_e)
        end_mask[-1] = 1.0

        # Enumerate (duration idx, start jdx) pairs that stay inside the
        # video and whose both boundaries are selected.
        score_vector_list = []
        for idx in range(1, dscale):
            for jdx in range(snippet_num - idx):
                end_idx = jdx + idx
                if start_mask[jdx] == 1 and end_mask[end_idx] == 1:
                    conf_score = (v_tem_s[jdx] * v_tem_e[end_idx] *
                                  v_iou_map[idx, jdx])
                    score_vector_list.append(
                        [v_snippets[jdx], v_snippets[end_idx], conf_score])

        if not score_vector_list:
            # np.stack would raise on an empty list; record "no detections".
            print("No valid proposal for video {}".format(video_name))
            result_dict[video_name] = []
            continue

        df = pd.DataFrame(np.stack(score_vector_list), columns=cols)

        if len(df) > 1:
            df = Soft_NMS(
                df,
                iou_threshold=cfg.DETECTION_POST.iou_threshold,
                sigma=cfg.DETECTION_POST.sigma,
            )
        df = df.sort_values(by="score", ascending=False)

        # Rank video-level classes; the best ones sit at the end of the list.
        video_cls_rank = sorted((e, i) for i, e in enumerate(video_cls))
        unet_classes = [
            thu_label_id[video_cls_rank[-k - 1][1]] + 1 for k in range(topk)
        ]
        unet_scores = [video_cls_rank[-k - 1][0] for k in range(topk)]

        xmin_values = df.xmin.values
        xmax_values = df.xmax.values
        score_values = df.score.values

        proposal_list = []
        for j in range(min(num_prop, len(df))):
            # Frame index -> fraction of the video (clamped) -> seconds.
            tmp_xmin = max(0, xmin_values[j] / num_frames) * seconds
            tmp_xmax = min(1, xmax_values[j] / num_frames) * seconds
            segment = [float(round(tmp_xmin, 1)), float(round(tmp_xmax, 1))]
            for k in range(topk):
                proposal_list.append({
                    "label": thumos_class[int(unet_classes[k])],
                    "score": float(
                        round(score_values[j] * unet_scores[k], 6)),
                    "segment": list(segment),
                })
        result_dict[video_name] = proposal_list
Beispiel #2
0
def _gen_detection_video(video_list, video_dict, cfg, num_prop=100):
    """Turn pickled whole-video outputs into ranked temporal proposals.

    For each video, ``./exps/<cfg.EXP_NAME>/output/<video_name>.pkl`` is
    loaded, start/end curves are formed as the geometric mean of the local
    and global predictions, and every (duration, start) cell whose two
    boundaries are selected by ``boundary_choose`` becomes a candidate scored
    as ``start * end * iou``. Candidates go through ``soft_nms`` and the best
    ``num_prop`` are converted to seconds.

    The resulting list of ``{"score", "segment"}`` dicts is written to the
    module-level ``result_dict`` under the video name; nothing is returned.
    A video with no surviving candidate gets an empty list.

    Args:
        video_list: Video names to process.
        video_dict: Maps a video name to a dict providing ``"duration"``.
        cfg: Config object; reads ``EXP_NAME``, ``DATASET.tscale``,
            ``DATASET.dscale`` and ``PROPOSAL_POST.alpha`` / ``t1`` / ``t2``.
        num_prop: Maximum number of proposals kept per video (also passed to
            ``soft_nms`` as ``max_num``).

    Raises:
        OSError: If a video's result file cannot be opened.
    """
    tscale = cfg.DATASET.tscale
    dscale = cfg.DATASET.dscale
    output_path = "./exps/{}/output/".format(cfg.EXP_NAME)

    # Normalised left/right edges of the tscale temporal bins.
    anchor_xmin = np.array([i / tscale for i in range(tscale)])
    anchor_xmax = np.array([i / tscale for i in range(1, tscale + 1)])
    cols = ["xmin", "xmax", "score"]

    for video_name in video_list:
        file_path = os.path.join(output_path, "{}.pkl".format(video_name))

        # NOTE: pickle.load executes arbitrary code from the file; these
        # files must come from a trusted source.
        with open(file_path, "rb") as infile:
            result = pickle.load(infile)

        (_, pred_local_s, pred_local_e, pred_global_s, pred_global_e,
         pred_iou_map) = result

        # Geometric mean of local/global boundaries; product of the two
        # IoU-map channels.
        pred_s = np.sqrt(pred_local_s * pred_global_s)
        pred_e = np.sqrt(pred_local_e * pred_global_e)
        pred_iou_map = pred_iou_map[0, :, :] * pred_iou_map[1, :, :]

        # The first/last bin are always allowed as start/end boundaries.
        start_mask = boundary_choose(pred_s)
        start_mask[0] = 1.0
        end_mask = boundary_choose(pred_e)
        end_mask[-1] = 1.0

        # idx is the duration offset, jdx the start bin; jdx is bounded so
        # that the end bin stays inside the temporal grid.
        score_vector_list = []
        for idx in range(dscale):
            for jdx in range(tscale - idx):
                end_idx = jdx + idx
                if start_mask[jdx] == 1 and end_mask[end_idx] == 1:
                    conf_score = (pred_s[jdx] * pred_e[end_idx] *
                                  pred_iou_map[idx, jdx])
                    score_vector_list.append(
                        [anchor_xmin[jdx], anchor_xmax[end_idx], conf_score])

        if not score_vector_list:
            # np.stack would raise on an empty list; record "no proposals".
            print("No valid proposal for video {}".format(video_name))
            result_dict[video_name] = []
            continue

        df = pd.DataFrame(np.stack(score_vector_list), columns=cols)

        if len(df) > 1:
            df = soft_nms(
                df,
                alpha=cfg.PROPOSAL_POST.alpha,
                t1=cfg.PROPOSAL_POST.t1,
                t2=cfg.PROPOSAL_POST.t2,
                max_num=num_prop,
            )
        df = df.sort_values(by="score", ascending=False)

        video_duration = float(video_dict[video_name]["duration"])
        xmin_values = df.xmin.values
        xmax_values = df.xmax.values
        score_values = df.score.values

        # Clamp normalised bounds to [0, 1] and convert to seconds.
        proposal_list = [
            {
                "score": score_values[j],
                "segment": [
                    max(0, xmin_values[j]) * video_duration,
                    min(1, xmax_values[j]) * video_duration,
                ],
            }
            for j in range(min(num_prop, len(df)))
        ]
        result_dict[video_name] = proposal_list