Example #1
import argparse
import os

import nltk

# load_jsonl, save_json, flat_list_of_lists, build_vocab_idx and extract_glove
# are project utilities; load_jsonl/save_json/flat_list_of_lists are sketched
# after this example.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, required=True)
    parser.add_argument("--dset_name",
                        type=str,
                        default="tvc",
                        choices=["tvc"])
    parser.add_argument("--cache", type=str, default="./cache")
    parser.add_argument("--min_word_count", type=int, default=5)
    parser.add_argument("--raw_glove_path",
                        type=str,
                        help="downloaded glove vectors path")

    opt = parser.parse_args()
    if not os.path.exists(opt.cache):
        os.makedirs(opt.cache)

    # load, merge, clean, split data
    train_datalist = load_jsonl(opt.train_path)
    all_sentences = flat_list_of_lists(
        [[sub_e["desc"] for sub_e in e["descs"]] for e in train_datalist])
    all_sentences = [
        nltk.tokenize.word_tokenize(sen.lower()) for sen in all_sentences
    ]
    word2idx = build_vocab_idx(all_sentences, opt.min_word_count)
    print("[Info] Dumping the processed data to json file", opt.cache)
    save_json(
        word2idx,
        os.path.join(opt.cache, "{}_word2idx.json".format(opt.dset_name)))
    print("[Info] Finish.")

    if opt.raw_glove_path:
        vocab_glove_path = os.path.join(
            opt.cache, "{}_vocab_glove.pt".format(opt.dset_name))
        extract_glove(word2idx, opt.raw_glove_path, vocab_glove_path)
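Every example on this page leans on a few small project utilities. For reference, here is a minimal sketch of what load_jsonl, save_json and flat_list_of_lists presumably look like; the actual project implementations may differ in detail:

import json


def load_jsonl(filename):
    """Load a jsonl file: one JSON object per line."""
    with open(filename, "r") as f:
        return [json.loads(line.strip()) for line in f]


def save_json(data, filename, save_pretty=False):
    with open(filename, "w") as f:
        if save_pretty:
            f.write(json.dumps(data, indent=4, sort_keys=True))
        else:
            json.dump(data, f)


def flat_list_of_lists(l):
    """[[1, 2], [3]] --> [1, 2, 3]"""
    return [item for sublist in l for item in sublist]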
Example #2
    def _load_data(self, data_path):
        logging.info("Loading data from {}".format(data_path))
        raw_lines = load_jsonl(data_path)
        raw_lines = raw_lines[:int(len(raw_lines) * self.data_ratio)]
        data = []
        for line in raw_lines:
            if self.is_eval:
                data.append(
                    dict(vid_name=line["vid_name"],
                         duration=line["duration"],
                         ts=line["ts"],
                         clip_id=line["clip_id"],
                         clip_st_ed=self.convert_ts_to_clip_indices(
                             line["ts"])))
            else:
                for d in line["descs"]:
                    data.append(
                        dict(vid_name=line["vid_name"],
                             duration=line["duration"],
                             ts=line["ts"],
                             clip_id=line["clip_id"],
                             desc=d["desc"],
                             clip_st_ed=self.convert_ts_to_clip_indices(
                                 line["ts"])))

        logging.info("Loading complete! {} captions".format(len(data)))
        return data
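For intuition, here is a hand-made sample line (not real data) and how _load_data treats it: with is_eval=False, each description under "descs" becomes its own entry, all sharing the line's vid_name/duration/ts/clip_id fields.

# hand-made sample line (not real data)
line = {"vid_name": "castle_s01e01_seg01_clip_00", "duration": 90.0,
        "ts": [10.5, 25.0], "clip_id": 7,
        "descs": [{"desc": "A man opens the door."},
                  {"desc": "Someone walks in."}]}
# with is_eval=False this line yields two entries, one per description;
# with is_eval=True it yields a single entry without any "desc" field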
Example #3
def load_process_sub_meta(sub_meta_path, vid2nframe, frame_length):
    """Decide which subtitles should be assigned to which frames.

    Args:
        sub_meta_path: path to a jsonl file, each line a dict
            {"vid_name": str, "sub": list(dicts)},
            where each dict under "sub" is, e.g.,
            {'text': " Chase : That's all this is?",
             'start': 0.862, 'end': 1.862}.
            The dicts under "sub" are ordered
            the same as the original .srt files.
        vid2nframe: dict mapping vid_name to its number of frames (int).
        frame_length: float, assign each subtitle to a frame segment
            of this length (in seconds).
    Returns:
        video2sub: dict mapping vid_name to the per-video info produced by
            process_single_vid_sub.
    """
    video2sub = {e["vid_name"]: e for e in load_jsonl(sub_meta_path)}
    total_overlapped_sub, total_sub = 0, 0
    max_sub_length, extra_long_subs = 0, 0
    max_gap_time, max_sub_duration = 0, 0
    max_matched_frame_len, max_unmatched_group_len = 0, 0
    max_overlap_time = 0
    for vid_name, sub_info in tqdm(video2sub.items(),
                                   desc="processing subtitles"):
        if isinstance(vid2nframe[vid_name], int):
            num_of_frames = vid2nframe[vid_name]
            if num_of_frames == 0:
                num_of_frames = int(
                    int(sub_info["sub"][-1]["end"]) / frame_length)
        else:
            raise ValueError(
                f"{vid_name} in vid2nframe, but with unexpected format:\n" +
                f"{vid2nframe[vid_name]}")
        info, overlapped_sub = process_single_vid_sub(sub_info["sub"],
                                                      frame_length,
                                                      num_of_frames)
        # sub_info.update(info)
        video2sub[vid_name] = info
        total_overlapped_sub += overlapped_sub
        total_sub += len(sub_info["sub"])
        max_sub_length = max(max_sub_length, info["max_sub_length"])
        max_matched_frame_len = max(max_matched_frame_len,
                                    info["max_matched_frame_len"])
        max_sub_duration = max(max_sub_duration, info["max_sub_duration"])
        max_gap_time = max(max_gap_time, info["max_gap_time"])
        max_unmatched_group_len = max(max_unmatched_group_len,
                                      info["max_unmatched_group_len"])
        max_overlap_time = max(max_overlap_time, info["max_overlap_time"])
        extra_long_subs += info["extra_long_subs"]
    print(f"overlap/total: {total_overlapped_sub}/{total_sub}")
    print(f"max subtitle length: {max_sub_length}")
    print(f"max subtitle duration: {max_sub_duration}")
    print(f"max overlap between two subtitles:{max_overlap_time}")
    print(f"max gap time between two subtitles: {max_gap_time}")
    print(f"max number of matched frames: {max_matched_frame_len}")
    print(f"max len of unmatched frame group: {max_unmatched_group_len}")
    print(f"extra long subs: {extra_long_subs}")
    return video2sub
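A hedged usage sketch for the function above, with a hand-made sub_meta line; process_single_vid_sub (not shown on this page) must also be importable, and all names and values below are made up for illustration:

import json

# hand-made sample (not real data): one sub_meta line and a frame count
sample_line = {"vid_name": "friends_s01e01_seg02_clip_00",
               "sub": [{"text": " Chase : That's all this is?",
                        "start": 0.862, "end": 1.862},
                       {"text": " I guess so.", "start": 2.1, "end": 3.0}]}
with open("sub_meta.jsonl", "w") as f:
    f.write(json.dumps(sample_line) + "\n")

vid2nframe = {"friends_s01e01_seg02_clip_00": 120}  # hypothetical frame count
video2sub = load_process_sub_meta("sub_meta.jsonl", vid2nframe, frame_length=1.5)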
Example #4
def load_preprocess_tvr_subtitles(tokenizer, sub_data_file, max_length,
                                  filter_file_path=None, drop_edge=True, debug=False):
    """Load TVR subtitles and split each one into tokenized chunks.

    Args:
        tokenizer: tokenizer used by chunk_single_sub to split the subtitles.
        sub_data_file: path to the subtitle jsonl file.
        max_length: int, maximum length of each chunk.
        filter_file_path: if provided, used to filter to the relevant subtitles
            (e.g., only the videos in the train set at finetuning time).
        drop_edge: bool, must be set to False when doing feature extraction;
            optionally set to True to save some time.
        debug: bool, if True only load the first 100 subtitle entries.
    """
    sub_datalist = load_jsonl(sub_data_file)
    sub_datalist = sub_datalist[:100] if debug else sub_datalist
    if filter_file_path is not None:  # filter at finetuning, to use only subtitles in train set.
        assert len(filter_file_path) == 1, "please supply only one filter file path (--train_data_file)"
        filter_file_path = filter_file_path[0]
        keep_ids = set(e["vid_name"] for e in load_jsonl(filter_file_path))
        sub_datalist = [e for e in sub_datalist if e["vid_name"] in keep_ids]

    preprocessed_sub_datalist = flat_list_of_lists(
        [chunk_single_sub(tokenizer, sub_data, max_length=max_length, drop_edge=drop_edge)
         for sub_data in tqdm(sub_datalist, desc="Loading subtitles")])
    return preprocessed_sub_datalist
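A hedged usage sketch of the loader above; the tokenizer choice and file paths are assumptions for illustration, not taken from the repository:

from transformers import BertTokenizer  # assumption: any BERT-style tokenizer works here

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sub_chunks = load_preprocess_tvr_subtitles(
    tokenizer,
    sub_data_file="tvqa_preprocessed_subtitles.jsonl",  # hypothetical path
    max_length=256,                                     # hypothetical chunk length
    filter_file_path=["tvr_train_release.jsonl"],       # note: must be a 1-element list
    drop_edge=False)                                    # False is required for feature extraction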
Example #5
    def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler,
                 max_desc_len, max_ctx_len,
                 vid_feat_path_or_handler, clip_length, ctx_mode="video",
                 normalize_vfeat=True, normalize_tfeat=True, h5driver=None, data_ratio=1.0,
                 video_duration_idx_path=None, eval_split_name=None):
        self.dset_name = dset_name
        self.data_path = data_path
        self.data_ratio = data_ratio

        self.desc_bert_path_or_handler = desc_bert_path_or_handler
        self.max_desc_len = max_desc_len

        self.sub_bert_path_or_handler = sub_bert_path_or_handler
        self.max_ctx_len = max_ctx_len
        self.vid_feat_path_or_handler = vid_feat_path_or_handler
        self.clip_length = clip_length
        self.ctx_mode = ctx_mode

        # prepare desc data
        self.data = load_jsonl(data_path)
        if self.data_ratio != 1:
            n_examples = int(len(self.data) * data_ratio)
            self.data = self.data[:n_examples]
            logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))

        self.use_video = "video" in self.ctx_mode
        self.use_sub = "sub" in self.ctx_mode
        self.use_tef = "tef" in self.ctx_mode

        if self.use_video:
            if isinstance(vid_feat_path_or_handler, h5py.File):
                self.vid_feat_h5 = vid_feat_path_or_handler
            else:  # str path
                self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver)

        if isinstance(desc_bert_path_or_handler, h5py.File):
            self.desc_bert_h5 = desc_bert_path_or_handler
        else:
            self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver)

        if self.use_sub:
            if isinstance(sub_bert_path_or_handler, h5py.File):
                self.sub_bert_h5 = sub_bert_path_or_handler
            else:  # str path
                self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver)

        self.normalize_vfeat = normalize_vfeat
        self.normalize_tfeat = normalize_tfeat

        if video_duration_idx_path is not None:
            video_data = load_json(video_duration_idx_path)[eval_split_name]
            self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()]
            self.video2idx = {k: v[1] for k, v in video_data.items()}
Example #6
    def __init__(self, dset_name, eval_split_name, data_path=None,
                 desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None,
                 sub_bert_path_or_handler=None, vid_feat_path_or_handler=None,
                 video_duration_idx_path=None, clip_length=None,
                 ctx_mode="video", data_mode="context",
                 h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True):
        self.dset_name = dset_name
        self.eval_split_name = eval_split_name
        self.ctx_mode = ctx_mode
        self.load_gt_video = False
        self.data_ratio = data_ratio  # only affect query data
        self.normalize_vfeat = normalize_vfeat
        self.normalize_tfeat = normalize_tfeat

        self.data_mode = None
        self.set_data_mode(data_mode)

        self.max_desc_len = max_desc_len
        self.max_ctx_len = max_ctx_len
        self.data_path = data_path
        self.query_data = load_jsonl(data_path)
        if data_ratio != 1:
            n_examples = int(len(self.query_data) * data_ratio)
            self.query_data = self.query_data[:n_examples]
            logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))
        if isinstance(desc_bert_path_or_handler, h5py.File):
            self.desc_bert_h5 = desc_bert_path_or_handler
        else:
            self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver)

        video_data = load_json(video_duration_idx_path)[self.eval_split_name]
        self.video_data = {k: v[0] for k, v in video_data.items()}
        self.video2idx = {k: v[1] for k, v in video_data.items()}
        self.clip_length = clip_length

        self.use_video = "video" in self.ctx_mode
        self.use_sub = "sub" in self.ctx_mode
        self.use_tef = "tef" in self.ctx_mode

        if self.use_video:
            if isinstance(vid_feat_path_or_handler, h5py.File):
                self.vid_feat_h5 = vid_feat_path_or_handler
            else:  # str path
                self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver)

        if self.use_sub:
            if isinstance(sub_bert_path_or_handler, h5py.File):
                self.sub_bert_h5 = sub_bert_path_or_handler
            else:  # str path
                self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver)
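A note on the *_path_or_handler pattern used in Examples #5 and #6: passing an already-open h5py.File lets several dataset objects (e.g., the train and eval splits) share one handle instead of each reopening the file. A small sketch, with hypothetical paths:

import h5py

# open each feature file once; the isinstance(x, h5py.File) branch in the
# __init__ above makes the datasets reuse these live handles
desc_bert_h5 = h5py.File("feat/desc_bert.h5", "r")  # hypothetical path
sub_bert_h5 = h5py.File("feat/sub_bert.h5", "r")    # hypothetical path

shared_handlers = dict(desc_bert_path_or_handler=desc_bert_h5,
                       sub_bert_path_or_handler=sub_bert_h5)
# both a train and an eval dataset can then be constructed with **shared_handlers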
Example #7
    def __init__(self, db_dir, max_txt_len=-1):
        super().__init__(db_dir, max_txt_len)
        if os.path.exists(f'{self.db_dir}/query2video.json'):
            with open(f'{self.db_dir}/query2video.json') as f:
                self.query2video = json.load(f)
            self.video2query = {}
            for k, v in self.query2video.items():
                if v not in self.video2query:
                    self.video2query[v] = [k]
                else:
                    self.video2query[v].append(k)
        else:
            self.query2video = {}
            self.video2query = {}
        self.query_data_f = load_jsonl(f'{self.db_dir}/query_data.jsonl')
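The manual inversion of query2video above can also be written with collections.defaultdict; a behavior-equivalent sketch (wrap the result in dict(...) if plain-dict semantics matter downstream):

from collections import defaultdict

query2video = {"q1": "v1", "q2": "v1", "q3": "v2"}  # toy input
video2query = defaultdict(list)
for query_id, vid in query2video.items():
    video2query[vid].append(query_id)  # group query ids by their video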
Example #8
def load_process_sub_meta(sub_meta_path, clip_length):
    """Decide which subtitle sentences should be assigned to which clips.

    Args:
        sub_meta_path: path to a jsonl file, each line a dict {"vid_name": str, "sub": list(dicts)},
            where each dict under "sub" is, e.g., {'text': " Chase : That's all this is?", 'start': 0.862, 'end': 1.862}.
            The dicts under "sub" are ordered the same as the original .srt files.
        clip_length: float, assign each subtitle sentence to a clip segment of this length (in seconds).
    Returns:
        video2sub: dict mapping vid_name to its subtitle info with an added "clip2sen" entry.
    """
    video2sub = {e["vid_name"]: e for e in load_jsonl(sub_meta_path)}
    for vid_name, sub_info in tqdm(video2sub.items(),
                                   desc="processing subtitles"):
        sub_info["clip2sen"] = process_single_vid_sub(sub_info["sub"],
                                                      clip_length)
        video2sub[vid_name] = sub_info
    return video2sub
Example #9
def main_compute_upper_bound():
    import argparse
    import pprint

    import numpy as np

    parser = argparse.ArgumentParser()
    parser.add_argument("-dset_name", type=str, choices=["tvr"])
    parser.add_argument(
        "-eval_file_path",
        type=str,
        help="path to the file containing data to be evaluated")
    parser.add_argument("-save_path",
                        type=str,
                        help="path to save the results")
    parser.add_argument("-verbose", action="store_true")
    args = parser.parse_args()

    eval_datalist = load_jsonl(args.eval_file_path)
    video_proposals_list = get_proposals_for_videos(eval_datalist,
                                                    args.dset_name)
    recall_metrics = compute_proposal_recall_upper_bound(video_proposals_list,
                                                         iou_thds=(0.5, 0.7))

    # keep only the first proposal entry per video
    video_proposals_list_by_video = {}
    for p in video_proposals_list:
        video_proposals_list_by_video.setdefault(p["vid_name"], p)
    video_proposals_list_by_video = list(
        video_proposals_list_by_video.values())
    total_n_clips_in_proposals = \
        np.sum([np.sum(e["proposals"][:, 1] - e["proposals"][:, 0]) for e in video_proposals_list_by_video])

    num_proposals_per_video = [
        len(e["proposals"]) for e in video_proposals_list_by_video]
    results = dict(
        avg_num_proposals=float(np.mean(num_proposals_per_video)),
        total_num_proposals=int(np.sum(num_proposals_per_video)),
        recall_metrics=recall_metrics,
        dset_name=args.dset_name,
        filename=args.eval_file_path,
        proposal_config=ProposalConfigs[args.dset_name])
    results["avg_clip_per_proposal"] = \
        total_n_clips_in_proposals / results["total_num_proposals"]
    save_json(results, args.save_path, save_pretty=True)
    if args.verbose:
        pprint.pprint(results)
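compute_proposal_recall_upper_bound is not shown on this page. Under the usual reading it measures, for each IoU threshold, the fraction of ground-truth moments covered by at least one proposal. A minimal sketch under the assumption that each entry also carries its ground-truth span under a "gt" key (that key name is hypothetical):

import numpy as np


def temporal_iou(spans, gt_span):
    """IoU between an (N, 2) array of [st, ed] spans and one gt [st, ed]."""
    intersection = np.maximum(
        0, np.minimum(spans[:, 1], gt_span[1]) - np.maximum(spans[:, 0], gt_span[0]))
    union = (spans[:, 1] - spans[:, 0]) + (gt_span[1] - gt_span[0]) - intersection
    return intersection / np.maximum(union, 1e-8)


def compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7)):
    recall_metrics = {}
    for thd in iou_thds:
        hits = [(temporal_iou(np.asarray(e["proposals"]),
                              np.asarray(e["gt"])) >= thd).any()  # "gt" key assumed
                for e in video_proposals_list]
        recall_metrics[f"R@IoU={thd}"] = float(np.mean(hits))
    return recall_metrics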
Example #10
def load_preprocess_tvr_query(tvr_file_path):
    return [dict(id=e["desc_id"], text=e["desc"]) for e in load_jsonl(tvr_file_path)]
Example #11
    def __init__(self,
                 dset_name,
                 data_path,
                 desc_bert_path,
                 sub_bert_path,
                 max_desc_len,
                 vid_feat_path,
                 clip_length,
                 vid_feat_size,
                 sub_feat_size=0,
                 ctx_mode="video_tef",
                 pos_iou_thd=0.7,
                 neg_iou_thd=0.3,
                 h5driver=None,
                 data_ratio=1.0,
                 normalize_vfeat=True,
                 normalize_tfeat=True,
                 model_type="cal",
                 external_train_vr_res_path=None,
                 video_duration_idx_path=None):
        self.dset_name = dset_name
        self.model_type = model_type
        self.pool_local = model_type == "mcn"  # pool local feature
        self.data_path = data_path
        self.data_ratio = data_ratio

        self.desc_bert_path = desc_bert_path
        self.max_desc_len = max_desc_len
        self.sub_bert_path = sub_bert_path

        self.vid_feat_path = vid_feat_path
        self.clip_length = clip_length
        self.ctx_mode = ctx_mode

        self.pos_iou_thd = pos_iou_thd
        self.neg_iou_thd = neg_iou_thd

        # each enabled stream contributes 2 * feat_size dims;
        # "tef" (temporal endpoint feature) adds 2 more (normalized st, ed)
        self.vid_feat_output_size = 2 * vid_feat_size * (
            "video" in ctx_mode) + 2 * ("tef" in ctx_mode)
        self.sub_feat_output_size = 2 * sub_feat_size * (
            "sub" in ctx_mode) + 2 * ("tef" in ctx_mode)

        # prepare desc data
        self.data = load_jsonl(data_path)
        if self.data_ratio != 1:
            n_examples = int(len(self.data) * data_ratio)
            self.data = self.data[:n_examples]
            logger.info("Using {}% of the data: {} examples".format(
                data_ratio * 100, n_examples))

        self.proposal_fn = get_proposal_interface(dset_name)
        if self.ctx_mode != "tef":
            self.vid_feat_h5 = h5py.File(self.vid_feat_path,
                                         "r",
                                         driver=h5driver)
        self.desc_bert_h5 = h5py.File(self.desc_bert_path,
                                      "r",
                                      driver=h5driver)
        if "sub" in self.ctx_mode:
            self.sub_bert_h5 = h5py.File(self.sub_bert_path,
                                         "r",
                                         driver=h5driver)
        self.normalize_vfeat = normalize_vfeat
        self.normalize_tfeat = normalize_tfeat
        self.use_video = "video" in self.ctx_mode
        self.use_sub = "sub" in self.ctx_mode
        self.use_tef = "tef" in self.ctx_mode

        if external_train_vr_res_path is not None:
            video_data = load_json(video_duration_idx_path)["train"]
            # {video_idx: [vid_name, vid_duration]}
            video_idx2name_dur_pair = {
                v[1]: [k, v[0]]
                for k, v in video_data.items()
            }
            external_vr_res = load_json(external_train_vr_res_path)
            # {desc_id: [(vid_name, vid_duration), ...]}
            self.desc_id2video_names_dur_pairs = \
                {e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])] for sub_e in e["predictions"]]
                 for e in external_vr_res["VR"]}  # ordered
Example #12
    def __init__(self,
                 dset_name,
                 data_path,
                 desc_bert_path_or_handler,
                 sub_bert_path_or_handler,
                 vid_feat_path_or_handler,
                 max_desc_len,
                 max_ctx_len,
                 ctx_mode="video",
                 normalize_vfeat=True,
                 normalize_tfeat=True,
                 h5driver=None,
                 data_ratio=1.0):
        self.dset_name = dset_name
        self.data_path = data_path
        self.data_ratio = data_ratio
        self.max_desc_len = max_desc_len
        self.max_ctx_len = max_ctx_len

        self.desc_bert_path_or_handler = desc_bert_path_or_handler
        self.sub_bert_path_or_handler = sub_bert_path_or_handler
        self.vid_feat_path_or_handler = vid_feat_path_or_handler
        self.ctx_mode = ctx_mode

        # prepare desc data
        self.data = load_jsonl(data_path)
        if self.data_ratio != 1:
            n_examples = int(len(self.data) * data_ratio)
            self.data = self.data[:n_examples]
            logger.info("Using {}% of the data: {} examples".format(
                data_ratio * 100, n_examples))

        self.use_video = "video" in self.ctx_mode
        self.use_sub = "sub" in self.ctx_mode
        self.use_tef = "tef" in self.ctx_mode

        if self.use_video:
            if isinstance(vid_feat_path_or_handler, h5py.File):
                self.vid_feat_h5 = vid_feat_path_or_handler
            else:  # str path
                self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler,
                                             "r",
                                             driver=h5driver)

        if isinstance(desc_bert_path_or_handler, h5py.File):
            self.desc_bert_h5 = desc_bert_path_or_handler
        else:
            self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler,
                                          "r",
                                          driver=h5driver)

        if self.use_sub:
            if isinstance(sub_bert_path_or_handler, h5py.File):
                self.sub_bert_h5 = sub_bert_path_or_handler
            else:  # str path
                self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler,
                                             "r",
                                             driver=h5driver)

        self.normalize_vfeat = normalize_vfeat
        self.normalize_tfeat = normalize_tfeat
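Finally, a hedged construction sketch for the __init__ above; the class name RetrievalDataset and every path/value below are hypothetical placeholders, not taken from the repository:

dataset = RetrievalDataset(  # hypothetical class name for the __init__ above
    dset_name="tvr",
    data_path="data/tvr_train_release.jsonl",       # hypothetical path
    desc_bert_path_or_handler="feat/desc_bert.h5",  # str path or open h5py.File
    sub_bert_path_or_handler="feat/sub_bert.h5",
    vid_feat_path_or_handler="feat/vid_feat.h5",
    max_desc_len=30,   # hypothetical caps
    max_ctx_len=100,
    ctx_mode="video_sub_tef")
print(len(dataset.data))  # raw jsonl lines loaded by load_jsonl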