def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, required=True)
    parser.add_argument("--dset_name", type=str, default="tvc", choices=["tvc"])
    parser.add_argument("--cache", type=str, default="./cache")
    parser.add_argument("--min_word_count", type=int, default=5)
    parser.add_argument("--raw_glove_path", type=str,
                        help="downloaded glove vectors path")
    opt = parser.parse_args()

    if not os.path.exists(opt.cache):
        os.makedirs(opt.cache)

    # load, merge, clean, split data
    train_datalist = load_jsonl(opt.train_path)
    all_sentences = flat_list_of_lists(
        [[sub_e["desc"] for sub_e in e["descs"]] for e in train_datalist])
    all_sentences = [nltk.tokenize.word_tokenize(sen.lower())
                     for sen in all_sentences]

    word2idx = build_vocab_idx(all_sentences, opt.min_word_count)
    print("[Info] Dumping the processed data to json file", opt.cache)
    save_json(word2idx,
              os.path.join(opt.cache, "{}_word2idx.json".format(opt.dset_name)))
    print("[Info] Finish.")

    if opt.raw_glove_path:
        vocab_glove_path = os.path.join(
            opt.cache, "{}_vocab_glove.pt".format(opt.dset_name))
        extract_glove(word2idx, opt.raw_glove_path, vocab_glove_path)

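# Example invocation (a sketch; the script name and file paths below are
# hypothetical, not taken from the original repo):
#   python build_vocab.py --train_path data/tvc_train.jsonl \
#       --min_word_count 5 --raw_glove_path data/glove.840B.300d.txt
# This writes ./cache/tvc_word2idx.json and, if --raw_glove_path is given,
# ./cache/tvc_vocab_glove.pt holding GloVe vectors for the vocabulary words.
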
def _load_data(self, data_path):
    logging.info("Loading data from {}".format(data_path))
    raw_lines = load_jsonl(data_path)
    raw_lines = raw_lines[:int(len(raw_lines) * self.data_ratio)]
    data = []
    for line in raw_lines:
        if self.is_eval:
            # eval mode: one entry per clip, no description attached
            data.append(dict(
                vid_name=line["vid_name"],
                duration=line["duration"],
                ts=line["ts"],
                clip_id=line["clip_id"],
                clip_st_ed=self.convert_ts_to_clip_indices(line["ts"])))
        else:
            # train mode: one entry per description
            for d in line["descs"]:
                data.append(dict(
                    vid_name=line["vid_name"],
                    duration=line["duration"],
                    ts=line["ts"],
                    clip_id=line["clip_id"],
                    desc=d["desc"],
                    clip_st_ed=self.convert_ts_to_clip_indices(line["ts"])))
    logging.info("Loading complete! {} entries".format(len(data)))
    return data

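# Input schema sketch, inferred from the field accesses above (the key
# values shown are illustrative only):
# {"vid_name": "friends_s01e01_seg02_clip_00", "duration": 61.2,
#  "ts": [10.5, 25.3], "clip_id": 3,
#  "descs": [{"desc": "Someone opens the door."}, ...]}
# Eval mode yields one entry per line; train mode yields one per description.
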
def load_process_sub_meta(sub_meta_path, vid2nframe, frame_length):
    """ Decide which subtitles should be assigned to which frames.
    Args:
        sub_meta_path: str, path to a jsonl file, each line is a dict
            {"vid_name": str, "sub": list(dicts)}, each dict under "sub" is,
            e.g., {'text': " Chase : That's all this is?",
                   'start': 0.862, 'end': 1.862}.
            The dicts under "sub" are ordered the same as the original .srt files.
        vid2nframe: dict, maps vid_name to its number of extracted frames.
        frame_length: float, assign each subtitle to a frame segment.
    Returns:
        video2sub: dict, maps vid_name to the processed subtitle info
            returned by process_single_vid_sub.
    """
    video2sub = {e["vid_name"]: e for e in load_jsonl(sub_meta_path)}
    total_overlapped_sub, total_sub = 0, 0
    max_sub_length, extra_long_subs = 0, 0
    max_gap_time, max_sub_duration = 0, 0
    max_matched_frame_len, max_unmatched_group_len = 0, 0
    max_overlap_time = 0
    for vid_name, sub_info in tqdm(video2sub.items(),
                                   desc="processing subtitles"):
        if isinstance(vid2nframe[vid_name], int):
            num_of_frames = vid2nframe[vid_name]
            if num_of_frames == 0:
                # fall back to the end time of the last subtitle
                num_of_frames = int(
                    int(sub_info["sub"][-1]["end"]) / frame_length)
        else:
            raise ValueError(
                f"{vid_name} in vid2nframe, but with unexpected format:\n" +
                f"{vid2nframe[vid_name]}")
        info, overlapped_sub = process_single_vid_sub(
            sub_info["sub"], frame_length, num_of_frames)
        video2sub[vid_name] = info
        total_overlapped_sub += overlapped_sub
        total_sub += len(sub_info["sub"])
        max_sub_length = max(max_sub_length, info["max_sub_length"])
        max_matched_frame_len = max(max_matched_frame_len,
                                    info["max_matched_frame_len"])
        max_sub_duration = max(max_sub_duration, info["max_sub_duration"])
        max_gap_time = max(max_gap_time, info["max_gap_time"])
        max_unmatched_group_len = max(max_unmatched_group_len,
                                      info["max_unmatched_group_len"])
        max_overlap_time = max(max_overlap_time, info["max_overlap_time"])
        extra_long_subs += info["extra_long_subs"]
    print(f"overlap/total: {total_overlapped_sub}/{total_sub}")
    print(f"max subtitle length: {max_sub_length}")
    print(f"max subtitle duration: {max_sub_duration}")
    print(f"max overlap between two subtitles: {max_overlap_time}")
    print(f"max gap time between two subtitles: {max_gap_time}")
    print(f"max number of matched frames: {max_matched_frame_len}")
    print(f"max len of unmatched frame group: {max_unmatched_group_len}")
    print(f"extra long subs: {extra_long_subs}")
    return video2sub

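# Usage sketch (the paths and the 1.5s frame length are illustrative
# assumptions, not values prescribed by this function):
# vid2nframe = load_json("cache/vid2nframe.json")  # vid_name -> frame count
# video2sub = load_process_sub_meta(
#     "data/tvr_sub_meta.jsonl", vid2nframe, frame_length=1.5)
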
def load_preprocess_tvr_subtitles(tokenizer, sub_data_file, max_length,
                                  filter_file_path=None, drop_edge=True,
                                  debug=False):
    """
    Args:
        tokenizer: tokenizer used to split subtitles into token chunks.
        sub_data_file: str, path to a jsonl subtitle file.
        max_length: int, max number of tokens per chunk; each subtitle is
            split into multiple chunks of at most this length.
        filter_file_path: if provided, will be used to keep only the
            subtitles of videos listed in this file (e.g., the train set).
        drop_edge: bool, must be set to False when doing feature extraction;
            optionally set to True to save some time.
        debug: bool, if True only load the first 100 subtitle entries.
    """
    sub_datalist = load_jsonl(sub_data_file)
    sub_datalist = sub_datalist[:100] if debug else sub_datalist

    if filter_file_path is not None:
        # filter at finetuning, to use only subtitles in the train set
        assert len(filter_file_path) == 1, \
            "please supply only one filter file path (--train_data_file)"
        filter_file_path = filter_file_path[0]
        # set for O(1) membership tests
        keep_ids = set(e["vid_name"] for e in load_jsonl(filter_file_path))
        sub_datalist = [e for e in sub_datalist if e["vid_name"] in keep_ids]

    preprocessed_sub_datalist = flat_list_of_lists(
        [chunk_single_sub(tokenizer, sub_data, max_length=max_length,
                          drop_edge=drop_edge)
         for sub_data in tqdm(sub_datalist, desc="Loading subtitles")])
    return preprocessed_sub_datalist

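# Usage sketch, assuming a HuggingFace tokenizer (the model name, paths, and
# max_length value are illustrative assumptions):
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# subs = load_preprocess_tvr_subtitles(
#     tokenizer, "data/tvr_sub_meta.jsonl", max_length=256,
#     filter_file_path=["data/tvr_train.jsonl"], drop_edge=False)
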
def __init__(self, dset_name, data_path, desc_bert_path_or_handler,
             sub_bert_path_or_handler, max_desc_len, max_ctx_len,
             vid_feat_path_or_handler, clip_length, ctx_mode="video",
             normalize_vfeat=True, normalize_tfeat=True, h5driver=None,
             data_ratio=1.0, video_duration_idx_path=None,
             eval_split_name=None):
    self.dset_name = dset_name
    self.data_path = data_path
    self.data_ratio = data_ratio
    self.desc_bert_path_or_handler = desc_bert_path_or_handler
    self.max_desc_len = max_desc_len
    self.sub_bert_path_or_handler = sub_bert_path_or_handler
    self.max_ctx_len = max_ctx_len
    self.vid_feat_path_or_handler = vid_feat_path_or_handler
    self.clip_length = clip_length
    self.ctx_mode = ctx_mode

    # prepare desc data
    self.data = load_jsonl(data_path)
    if self.data_ratio != 1:
        n_examples = int(len(self.data) * data_ratio)
        self.data = self.data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if self.use_video:
        if isinstance(vid_feat_path_or_handler, h5py.File):
            self.vid_feat_h5 = vid_feat_path_or_handler
        else:  # str path
            self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r",
                                         driver=h5driver)
    if isinstance(desc_bert_path_or_handler, h5py.File):
        self.desc_bert_h5 = desc_bert_path_or_handler
    else:
        self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r",
                                      driver=h5driver)
    if self.use_sub:
        if isinstance(sub_bert_path_or_handler, h5py.File):
            self.sub_bert_h5 = sub_bert_path_or_handler
        else:  # str path
            self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r",
                                         driver=h5driver)

    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat

    if video_duration_idx_path is not None:
        video_data = load_json(video_duration_idx_path)[eval_split_name]
        self.video_data = [{"vid_name": k, "duration": v[0]}
                           for k, v in video_data.items()]
        self.video2idx = {k: v[1] for k, v in video_data.items()}

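# Instantiation sketch (the enclosing class name "TVRDataset" and all paths
# are hypothetical; this only illustrates the expected argument types):
# dset = TVRDataset(
#     dset_name="tvr", data_path="data/tvr_train.jsonl",
#     desc_bert_path_or_handler="feat/desc_bert.h5",
#     sub_bert_path_or_handler="feat/sub_bert.h5",
#     max_desc_len=30, max_ctx_len=100,
#     vid_feat_path_or_handler="feat/vid_feat.h5",
#     clip_length=1.5, ctx_mode="video_sub_tef")
# Accepting either an open h5py.File or a str path lets multiple dataset
# instances share one file handle instead of each opening its own.
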
def __init__(self, dset_name, eval_split_name, data_path=None,
             desc_bert_path_or_handler=None, max_desc_len=None,
             max_ctx_len=None, sub_bert_path_or_handler=None,
             vid_feat_path_or_handler=None, video_duration_idx_path=None,
             clip_length=None, ctx_mode="video", data_mode="context",
             h5driver=None, data_ratio=1.0, normalize_vfeat=True,
             normalize_tfeat=True):
    self.dset_name = dset_name
    self.eval_split_name = eval_split_name
    self.ctx_mode = ctx_mode
    self.load_gt_video = False
    self.data_ratio = data_ratio  # only affects query data
    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat

    self.data_mode = None
    self.set_data_mode(data_mode)

    self.max_desc_len = max_desc_len
    self.max_ctx_len = max_ctx_len
    self.data_path = data_path
    self.query_data = load_jsonl(data_path)
    if data_ratio != 1:
        n_examples = int(len(self.query_data) * data_ratio)
        self.query_data = self.query_data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    if isinstance(desc_bert_path_or_handler, h5py.File):
        self.desc_bert_h5 = desc_bert_path_or_handler
    else:
        self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r",
                                      driver=h5driver)

    video_data = load_json(video_duration_idx_path)[self.eval_split_name]
    self.video_data = {k: v[0] for k, v in video_data.items()}
    self.video2idx = {k: v[1] for k, v in video_data.items()}

    self.clip_length = clip_length
    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if self.use_video:
        if isinstance(vid_feat_path_or_handler, h5py.File):
            self.vid_feat_h5 = vid_feat_path_or_handler
        else:  # str path
            self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r",
                                         driver=h5driver)
    if self.use_sub:
        if isinstance(sub_bert_path_or_handler, h5py.File):
            self.sub_bert_h5 = sub_bert_path_or_handler
        else:  # str path
            self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r",
                                         driver=h5driver)

def __init__(self, db_dir, max_txt_len=-1):
    super().__init__(db_dir, max_txt_len)
    if os.path.exists(f'{self.db_dir}/query2video.json'):
        self.query2video = json.load(
            open(f'{self.db_dir}/query2video.json'))
        # invert the query->video mapping into video->[queries]
        self.video2query = {}
        for k, v in self.query2video.items():
            if v not in self.video2query:
                self.video2query[v] = [k]
            else:
                self.video2query[v].append(k)
    else:
        self.query2video = {}
        self.video2query = {}
    self.query_data_f = load_jsonl(f'{self.db_dir}/query_data.jsonl')

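# Equivalent inversion using dict.setdefault (a style note only; behavior
# is unchanged):
# for k, v in self.query2video.items():
#     self.video2query.setdefault(v, []).append(k)
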
def load_process_sub_meta(sub_meta_path, clip_length):
    """ Decide which subtitle sentences should be assigned to which clips.
    Args:
        sub_meta_path: str, path to a jsonl file, each line is a dict
            {"vid_name": str, "sub": list(dicts)}, each dict under "sub" is,
            e.g., {'text': " Chase : That's all this is?",
                   'start': 0.862, 'end': 1.862}.
            The dicts under "sub" are ordered the same as the original .srt files.
        clip_length: float, assign each subtitle sentence to a clip segment.
    Returns:
        video2sub: dict, maps vid_name to its subtitle info, with an added
            "clip2sen" entry mapping each clip to its subtitle sentences.
    """
    video2sub = {e["vid_name"]: e for e in load_jsonl(sub_meta_path)}
    for vid_name, sub_info in tqdm(video2sub.items(),
                                   desc="processing subtitles"):
        sub_info["clip2sen"] = process_single_vid_sub(sub_info["sub"],
                                                      clip_length)
        video2sub[vid_name] = sub_info
    return video2sub

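# Usage sketch (the path, vid_name, and 1.5s clip length are illustrative):
# video2sub = load_process_sub_meta("data/tvr_sub_meta.jsonl", clip_length=1.5)
# clip2sen = video2sub["friends_s01e01_seg02_clip_00"]["clip2sen"]
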
def main_compute_upper_bound():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-dset_name", type=str, choices=["tvr"])
    parser.add_argument("-eval_file_path", type=str,
                        help="path to the file containing data to be evaluated")
    parser.add_argument("-save_path", type=str, help="path to save the results")
    parser.add_argument("-verbose", action="store_true")
    args = parser.parse_args()

    eval_datalist = load_jsonl(args.eval_file_path)
    video_proposals_list = get_proposals_for_videos(eval_datalist,
                                                    args.dset_name)
    recall_metrics = compute_proposal_recall_upper_bound(
        video_proposals_list, iou_thds=(0.5, 0.7))

    # deduplicate: keep a single proposal entry per video
    video_proposals_list_by_video = {}
    for p in video_proposals_list:
        if p["vid_name"] not in video_proposals_list_by_video:
            video_proposals_list_by_video[p["vid_name"]] = p
    video_proposals_list_by_video = list(
        video_proposals_list_by_video.values())

    total_n_clips_in_proposals = np.sum(
        [np.sum(e["proposals"][:, 1] - e["proposals"][:, 0])
         for e in video_proposals_list_by_video])
    results = dict(
        avg_num_proposals=float(np.mean(
            [len(e["proposals"]) for e in video_proposals_list_by_video])),
        total_num_proposals=int(np.sum(
            [len(e["proposals"]) for e in video_proposals_list_by_video])),
        recall_metrics=recall_metrics,
        dset_name=args.dset_name,
        filename=args.eval_file_path,
        proposal_config=ProposalConfigs[args.dset_name])
    results["avg_clip_per_proposal"] = \
        total_n_clips_in_proposals / results["total_num_proposals"]
    save_json(results, args.save_path, save_pretty=True)
    if args.verbose:
        pprint.pprint(results)

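# Example invocation (the script name and paths are hypothetical):
#   python proposal_upper_bound.py -dset_name tvr \
#       -eval_file_path data/tvr_val.jsonl \
#       -save_path results/tvr_val_upper_bound.json -verbose
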
def load_preprocess_tvr_query(tvr_file_path):
    return [dict(id=e["desc_id"], text=e["desc"])
            for e in load_jsonl(tvr_file_path)]

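# Usage sketch (path and returned values are illustrative); returns a list
# like [{"id": 90200, "text": "A person walks into the room."}, ...]:
# queries = load_preprocess_tvr_query("data/tvr_val.jsonl")
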
def __init__(self, dset_name, data_path, desc_bert_path, sub_bert_path,
             max_desc_len, vid_feat_path, clip_length, vid_feat_size,
             sub_feat_size=0, ctx_mode="video_tef", pos_iou_thd=0.7,
             neg_iou_thd=0.3, h5driver=None, data_ratio=1.0,
             normalize_vfeat=True, normalize_tfeat=True, model_type="cal",
             external_train_vr_res_path=None, video_duration_idx_path=None):
    self.dset_name = dset_name
    self.model_type = model_type
    self.pool_local = model_type == "mcn"  # pool local feature
    self.data_path = data_path
    self.data_ratio = data_ratio
    self.desc_bert_path = desc_bert_path
    self.max_desc_len = max_desc_len
    self.sub_bert_path = sub_bert_path
    self.vid_feat_path = vid_feat_path
    self.clip_length = clip_length
    self.ctx_mode = ctx_mode
    self.pos_iou_thd = pos_iou_thd
    self.neg_iou_thd = neg_iou_thd
    self.vid_feat_output_size = \
        2 * vid_feat_size * ("video" in ctx_mode) + 2 * ("tef" in ctx_mode)
    self.sub_feat_output_size = \
        2 * sub_feat_size * ("sub" in ctx_mode) + 2 * ("tef" in ctx_mode)

    # prepare desc data
    self.data = load_jsonl(data_path)
    if self.data_ratio != 1:
        n_examples = int(len(self.data) * data_ratio)
        self.data = self.data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    self.proposal_fn = get_proposal_interface(dset_name)

    if self.ctx_mode != "tef":
        self.vid_feat_h5 = h5py.File(self.vid_feat_path, "r", driver=h5driver)
    self.desc_bert_h5 = h5py.File(self.desc_bert_path, "r", driver=h5driver)
    if "sub" in self.ctx_mode:
        self.sub_bert_h5 = h5py.File(self.sub_bert_path, "r", driver=h5driver)

    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat
    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if external_train_vr_res_path is not None:
        video_data = load_json(video_duration_idx_path)["train"]
        # {video_idx: [vid_name, vid_duration]}
        video_idx2name_dur_pair = {v[1]: [k, v[0]]
                                   for k, v in video_data.items()}
        external_vr_res = load_json(external_train_vr_res_path)
        # {desc_id: [(vid_name, vid_duration), ...]}
        self.desc_id2video_names_dur_pairs = {
            e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])]
                           for sub_e in e["predictions"]]
            for e in external_vr_res["VR"]}  # ordered

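# Feature size note: the leading 2x likely reflects concatenating two pooled
# context variants per proposal (an inference from pool_local above, not
# stated in this code), while "tef" appends a 2-d temporal endpoint feature.
# E.g., ctx_mode="video_tef" with vid_feat_size=1024 gives
# vid_feat_output_size = 2 * 1024 + 2 = 2050.
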
def __init__(self, dset_name, data_path, desc_bert_path_or_handler,
             sub_bert_path_or_handler, vid_feat_path_or_handler,
             max_desc_len, max_ctx_len, ctx_mode="video",
             normalize_vfeat=True, normalize_tfeat=True, h5driver=None,
             data_ratio=1.0):
    self.dset_name = dset_name
    self.data_path = data_path
    self.data_ratio = data_ratio
    self.max_desc_len = max_desc_len
    self.max_ctx_len = max_ctx_len
    self.desc_bert_path_or_handler = desc_bert_path_or_handler
    self.sub_bert_path_or_handler = sub_bert_path_or_handler
    self.vid_feat_path_or_handler = vid_feat_path_or_handler
    self.ctx_mode = ctx_mode

    # prepare desc data
    self.data = load_jsonl(data_path)
    if self.data_ratio != 1:
        n_examples = int(len(self.data) * data_ratio)
        self.data = self.data[:n_examples]
        logger.info("Using {}% of the data: {} examples".format(
            data_ratio * 100, n_examples))

    self.use_video = "video" in self.ctx_mode
    self.use_sub = "sub" in self.ctx_mode
    self.use_tef = "tef" in self.ctx_mode

    if self.use_video:
        if isinstance(vid_feat_path_or_handler, h5py.File):
            self.vid_feat_h5 = vid_feat_path_or_handler
        else:  # str path
            self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r",
                                         driver=h5driver)
    if isinstance(desc_bert_path_or_handler, h5py.File):
        self.desc_bert_h5 = desc_bert_path_or_handler
    else:
        self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r",
                                      driver=h5driver)
    if self.use_sub:
        if isinstance(sub_bert_path_or_handler, h5py.File):
            self.sub_bert_h5 = sub_bert_path_or_handler
        else:  # str path
            self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r",
                                         driver=h5driver)

    self.normalize_vfeat = normalize_vfeat
    self.normalize_tfeat = normalize_tfeat