def load_features(self):
    root_feat = Path(self.root_feat)
    feat_names = {key: self.visual_feat_paths(key)
                  for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concatenation of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()

            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)

    self.features = features
    if self.challenge_mode:
        self.load_challenge_text_features()
    else:
        self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
        text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
        self.text_features = memcache(text_feat_path)
def validate_embeddings_against_reference(
    computed_embeddings: Dict[str, List[np.ndarray]],
    embedding_name: str,
    dataset: str,
):
    root_feat, paths = dataset_paths(dataset)
    reference_dict = {}
    for path in paths["text_feat_paths"][embedding_name].values():
        reference_dict.update(memcache(root_feat / path))

    # We handle MSVD as a special case, because video keys != feature keys
    if dataset == "MSVD":
        key_map = memcache(root_feat / paths["dict_youtube_mapping_path"])
        inverse_map = {val: key for key, val in key_map.items()}
        reference_dict = {inverse_map[key]: val
                          for key, val in reference_dict.items()}

    print("Validating embeddings against reference....")
    for key, val in tqdm.tqdm(computed_embeddings.items()):
        ref_val = reference_dict[key]
        msg = (f"[{embedding_name}] {key} Different number of "
               f"embeddings {len(ref_val)} vs {len(val)}")
        assert len(ref_val) == len(val), msg
        msg = f"[{embedding_name}] Embedding mismatch for {key}"
        for vec, ref_vec in zip(val, ref_val):
            assert np.abs(vec - ref_vec).max() < 1E-5, msg
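# Hedged usage sketch (illustrative only; the embedding name and dataset below are
# example values, not a guaranteed configuration): after building a dictionary of
# {video_key: [np.ndarray, ...]} embeddings, the check above compares each vector
# against the cached reference features for the same dataset and raises if any entry
# differs in count or by more than 1e-5 in value.
#
#   validate_embeddings_against_reference(
#       computed_embeddings=computed_embeddings,
#       embedding_name="w2v",       # example embedding name
#       dataset="MSRVTT",           # example dataset name
#   )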
def load_features(self):
    root_feat = Path(self.root_feat)
    feat_names = {key: self.visual_feat_paths(key)
                  for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concatenation of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            is_concat = self.feat_aggregation[expert]["aggregate"] == "concat"
            self.log_assert(is_concat, msg=msg)
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()

            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)

    self.features = features
    if self.split_name == "jsfusion":
        self.restrict_test_captions = memcache(
            root_feat / self.paths["js_test_cap_idx_path"])
    self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
    self.text_features = memcache(root_feat / self.paths["text_feat_path"])

    if self.restrict_train_captions:
        # hash the video names to avoid O(n) lookups in long lists
        train_list = set(self.partition_lists["train"])
        for key, val in self.text_features.items():
            if key not in train_list:
                continue
            if self.split_name != "full-test":
                # Note that we do not perform this sanity check for the full-test
                # split, because the text features in the cached dataset will already
                # have been cropped to the specified `restrict_train_captions`
                msg = "expected train text features to be lists with length 19 or 20"
                has_expected_feats = isinstance(val, list) and len(val) in {19, 20}
                self.log_assert(has_expected_feats, msg=msg)

            # restrict to the first N captions (deterministic)
            self.text_features[key] = val[:self.restrict_train_captions]
    self.summary_stats()
def load_features(self):
    root_feat = self.root_feat
    feat_names = {key: self.visual_feat_paths(key)
                  for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concatenation of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()

            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)

    self.features = features
    if self.challenge_mode:
        self.load_challenge_text_features()
    else:
        self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
        # keys = list(raw_captions.keys())
        # raw_captions_fused = {}
        # for key in keys:
        #     raw_captions_fused[key] = list(
        #         itertools.chain.from_iterable(raw_captions[key]))
        # self.raw_captions = raw_captions_fused
        text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
        self.text_features = memcache(text_feat_path)

    # overload video paths, which are structured differently for YouCook2
    self.video_path_retrieval = [f"videos/{x}.mp4"
                                 for x in self.partition_lists["val"]]
def pseudo_annos_to_subset_dict(
    pseudo_anno_path: Path,
    pseudo_annos: str,
    canonical_vocab: set,
    episode2subset: Dict[str, str],
) -> Dict[str, Dict]:
    # keep track of some basic stats as a sanity check
    thresholds = [0.5, 0.7, 0.9]
    counts = {thr: 0 for thr in thresholds}
    data = memcache(pseudo_anno_path)[pseudo_annos]
    subset_data = {key: dict() for key in ("train", "val", "test")}
    subset2episodes = {key: set() for key in subset_data}
    for episode, subset in episode2subset.items():
        subset2episodes[subset].add(episode)

    for subset in subset_data:
        for word, worddict in tqdm.tqdm(data.items()):
            assert word in canonical_vocab, f"Expected {word} to be in 1064 vocab"
            keep = np.array([x in subset2episodes[subset] for x in worddict["names"]])
            if keep.sum():
                if word not in subset_data[subset]:
                    subset_data[subset][word] = defaultdict(list)
                for key, val in worddict.items():
                    kept = np.array(val)[keep].tolist()
                    subset_data[subset][word][key].extend(kept)
                for thr in counts:
                    counts[thr] += (np.array(worddict["probs"])[keep] > thr).sum()
    data = subset_data
    for thr, val in counts.items():
        print(f"Found {val} annotations at confidences > {thr}")
    return data
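# Illustrative note (structure inferred from the loop above, not a documented schema):
# each per-word entry of the pseudo-annotation pickle is expected to look roughly like
#   data[word] = {"names": [episode_key, ...], "probs": [float, ...], "times": [float, ...]}
# and the function regroups those parallel lists into train/val/test sub-dictionaries
# according to episode2subset. A hypothetical call might be
#   subset_dict = pseudo_annos_to_subset_dict(
#       pseudo_anno_path=Path("pseudo_annos.pkl"),   # hypothetical path
#       pseudo_annos="mouthings",                    # hypothetical top-level key
#       canonical_vocab=canonical_vocab,
#       episode2subset=episode2subset,
#   )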
def get_episode2subset_map(subset2episode: Path) -> Dict[str, str]:
    """Build a mapping that converts episode keys into their respective subsets."""
    subset2episode = memcache(subset2episode)
    episode2subset = {}
    for subset, episodes in subset2episode.items():
        for episode in episodes:
            episode_key = episode.replace("/", "--")
            assert episode_key not in episode2subset, f"Duplicate key: {episode}!"
            episode2subset[episode_key] = subset
    return episode2subset
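# Minimal usage sketch (the path and keys below are hypothetical): the source pickle is
# assumed to map each subset name to a list of episode identifiers; the helper inverts
# that mapping and normalises "/" to "--" in the episode keys.
#   episode2subset = get_episode2subset_map(Path("subset2episode.pkl"))
#   # e.g. episode2subset["series1--episode01"] == "train"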
def load_features(self):
    root_feat = self.root_feat
    if self.distil_params is not None:
        self.distil_features = {}
        d_base_path = self.distil_params['base_path']
        teachers = list(map(lambda x: root_feat / Path(d_base_path + x),
                            self.distil_params['teachers']))
        for i, f_name in enumerate(teachers):
            self.distil_features[i] = memcache(f_name)

    feat_names = {key: self.visual_feat_paths(key)
                  for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concatenation of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()

            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)

    self.features = features
    if self.challenge_mode:
        self.load_challenge_text_features()
    else:
        text_feat_paths = self.paths["text_feat_paths"][self.text_feat]
        if isinstance(text_feat_paths, dict):
            text_features = memcache(root_feat / text_feat_paths["train"])
            text_features.update(
                memcache(root_feat / text_feat_paths[self.split_name]))
        elif isinstance(text_feat_paths, (Path, str)):
            text_features = memcache(root_feat / text_feat_paths)
        else:
            raise TypeError(f"Unexpected type {type(text_feat_paths)}")
        self.text_features = text_features
        self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
def load_british_mouthings(mouthing_pkl_path: Path) -> dict:
    """Load mouthing predictions from disk and transform the keywords from US to UK
    English.
    """
    # Note: we leave the practice/practise dilemma for another time and stick with this
    # list for backwards compatibility
    us_mouthings = memcache(mouthing_pkl_path)
    british_mouthings = {}
    for subset, subdict in us_mouthings.items():
        british_mouthings[subset] = {US2UK_MAPPING.get(key, key): val
                                     for key, val in subdict.items()}
    return british_mouthings
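# Illustrative sketch of the conversion (assumes US2UK_MAPPING contains pairs such as
# "color" -> "colour"; the actual contents live in the mapping defined elsewhere in this
# module): each per-subset dictionary keyed by US spellings is re-keyed with the UK
# equivalents, while keys without a mapping and all values are left untouched, e.g.
#   {"train": {"color": [...], "dog": [...]}}  ->  {"train": {"colour": [...], "dog": [...]}}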
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vis", action="store_true")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--fig_dir", type=Path, default="misc/BSLCP/figs")
    parser.add_argument("--config", type=Path, default="misc/BSLCP/data_paths.json")
    parser.add_argument(
        "--vocab_name",
        default="bsl1k_vocab",
        choices=["bsl1k_vocab", "BSLCP_all_glosses", "signdict_signbank"],
    )
    args = parser.parse_args()

    config = memcache(args.config)
    dest_path = Path(config[args.vocab_name]["anno_path"])
    vocab_path = config[args.vocab_name]["vocab_path"]
    if vocab_path:
        with open(vocab_path, "rb") as f:
            canonical_vocab = set(pickle.load(f)["words"])
    else:
        # We use an empty vocabulary to denote that no filtering should be performed
        canonical_vocab = set()
    fig_dir = args.fig_dir / args.vocab_name
    parse_annos(
        anno_dir=Path(config["raw_anno_dir"]),
        target_tiers=tuple(config["target_tiers"]),
        train_val_test_ratio=config["train_val_test_ratio"],
        raw_video_dir=Path(config["raw_video_dir"]),
        vocab_name=args.vocab_name,
        fig_dir=fig_dir,
        dest_path=dest_path,
        canonical_vocab=canonical_vocab,
        refresh=args.refresh,
        vis=args.vis,
    )
def main(
    video_dir: Path,
    trim_format: str,
    pad_clip: float,
    limit: int,
    processes: int,
    json_anno_path: Path,
    anno_name: str,
    force_resize: int,
    refresh: bool,
    vis: bool,
):
    print(f"Processing {anno_name} annotations")
    data = memcache(json_anno_path)
    output_filenames = defaultdict(list)
    kwarg_list = []
    outs = set()
    count = 0
    for s in tqdm.tqdm(data.keys()):
        for word in tqdm.tqdm(data[s].keys()):
            N = len(data[s][word]["start"])
            for i in range(N):
                start_time = data[s][word]["start"][i] - pad_clip
                end_time = data[s][word]["end"][i] + pad_clip
                output_filename = construct_video_filename(
                    output_dir=video_dir,
                    set_name=s,
                    word=word,
                    name=Path(data[s][word]["video"][i]).stem,
                    start_time=time2tuple(start_time),
                    end_time=time2tuple(end_time),
                    trim_format=trim_format,
                )
                output_filenames[output_filename].append((start_time, end_time))
                source_file = Path(data[s][word]["video"][i])
                assert source_file.exists(), f"Expected source file at {source_file}"
                kwargs = {
                    "refresh": refresh,
                    "start_time": start_time,
                    "end_time": end_time,
                    "output_filename": output_filename,
                    "source_file": source_file,
                    "force_resize": force_resize,
                }
                outs.add(output_filename)
                kwarg_list.append(kwargs)
                count += 1

    if vis:
        durations = np.array([x["end_time"] - x["start_time"] for x in kwarg_list])
        step = 0.1
        bins = np.arange(0, np.ceil(durations.max()), step=step)
        values, _ = np.histogram(durations, bins=bins)
        plt.figure(figsize=(20, 10))
        x_ticks = bins[:-1] + (step / 2)
        plt.bar(x_ticks, values, width=step)
        font = {"family": "serif", "weight": "normal", "size": 26}
        matplotlib.rc("font", **font)
        plt.suptitle("BSLCP sign durations")
        plt.savefig("zz-bslcp-durations.png")

    if limit:
        kwarg_list = kwarg_list[:limit]

    func = extract_clip
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
    print(f"Expected to produce: {len(kwarg_list)} outputs")
if __name__ == "__main__":
    matplotlib.use("Agg")
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=0)
    p.add_argument("--refresh", action="store_true")
    p.add_argument("--vis", action="store_true")
    p.add_argument("--config", type=Path, default="misc/BSLCP/data_paths.json")
    p.add_argument("--processes", type=int, default=1)
    p.add_argument(
        "--anno_name",
        default="BSLCP_all_glosses",
        choices=["bsl1k_vocab", "BSLCP_all_glosses", "signdict_signbank"])
    args = p.parse_args()

    p_kwargs = vars(args)
    config = memcache(p_kwargs.pop("config"))
    p_kwargs.update({
        "force_resize": config["force_resize"],
        "json_anno_path": Path(config[args.anno_name]["anno_path"]),
        "trim_format": config["trim_format"],
        "video_dir": Path(config["data_dir"]) / config[args.anno_name]["video_dir"],
        "pad_clip": config["pad_clip"],
    })
    main(**p_kwargs)
def extract_embeddings(
    text_embedding_config_path: Path,
    rel_dest_dir: Path,
    data_dir: Path,
    refresh: bool,
    validate_embeddings: bool,
    limit: int,
    processes: int,
    embedding_name: str,
    datasets: List[str],
):
    for dataset in datasets:
        dest_dir = data_dir / dataset / rel_dest_dir
        dest_name = embedding_name
        if limit:
            dest_name = f"{embedding_name}-limit{limit}"
        dest_path = dest_dir / f"{dest_name}.pkl"
        # if dest_path.exists() and not refresh:
        #     print(f"Found existing text embeddings at {dest_path}, skipping....")
        #     return
        dest_dir.mkdir(exist_ok=True, parents=True)

        # handle the activity-net exception
        if dataset == "activity-net":
            fname = "raw-captions-train-val_1.pkl"
        elif dataset == "QuerYDSegments":
            fname = "split_raw_captions_filtered.pkl"
        elif dataset == "QuerYD":
            fname = "raw_captions_combined_filtered.pkl"
        else:
            fname = "raw-captions.pkl"
        captions_path = data_dir / dataset / "structured-symlinks" / fname
        video_descriptions = memcache(captions_path)

        with open(text_embedding_config_path, "r") as f:
            text_embedding_config = json.load(f)
        force_cpu = text_embedding_config[embedding_name].pop("force_cpu", False)
        dev_name = "cuda:0" if torch.cuda.device_count() > 0 and not force_cpu else "cpu"
        device = torch.device(dev_name)
        model = prepare_embedding_model(embedding_name, text_embedding_config)
        model.set_device(device)

        if limit:
            keep = set(list(video_descriptions.keys())[:limit])
            video_descriptions = {key: val for key, val in video_descriptions.items()
                                  if key in keep}

        computed_embeddings = {}
        kwarg_list = []
        for key, descriptions in tqdm.tqdm(video_descriptions.items()):
            kwarg_list.append({"key": key, "descriptions": descriptions})

        all_failed_tokens = []
        func = extract_embeddings_for_video
        if processes > 1:
            # Note: An experimental approach with Ray. Unfortunately, it seems that
            # the overhead is too great to justify this approach (it's slower than
            # using a single process). TODO(Samuel): revisit.
            func = ray.remote(extract_embeddings_for_video)
            ray.init(num_cpus=processes)
            # Store model in shared memory object store to avoid multiple copies
            model_id = ray.put(model)

            def to_iterator(obj_ids):
                while obj_ids:
                    done, obj_ids = ray.wait(obj_ids)
                    yield ray.get(done[0])

            result_ids = [func.remote(model=model_id, **kwargs) for kwargs in kwarg_list]
            zipped = zip(to_iterator(result_ids), kwarg_list)
            for (embeddings, failed), kwargs in tqdm.tqdm(zipped, total=len(result_ids)):
                computed_embeddings[kwargs["key"]] = embeddings
                all_failed_tokens.extend(failed)
        else:
            for kwargs in tqdm.tqdm(kwarg_list):
                embeddings_for_video, failed_tokens = func(**kwargs, model=model)
                computed_embeddings[kwargs["key"]] = embeddings_for_video
                all_failed_tokens.extend(failed_tokens)

        stats = [len(x) for sublist in computed_embeddings.values() for x in sublist]
        print(f"Average num embedding tokens: {np.mean(stats):.1f} tokens")
        fail_rate = len(all_failed_tokens) / np.sum(stats)
        stat_str = f"{len(all_failed_tokens)}/{np.sum(stats)} [{100 * fail_rate:.1f}%]"
        print(f"Failed tokens: {stat_str} tokens")

        if validate_embeddings:
            validate_embeddings_against_reference(
                computed_embeddings=computed_embeddings,
                embedding_name=embedding_name,
                dataset=dataset,
            )
        with BlockTimer(f"Writing embeddings to {dest_path}"):
            with open(dest_path, "wb") as f:
                pickle.dump(computed_embeddings, f)
def parse_subtitles(
    subtitle_pkl_path: Path,
    subtitle_reference_mouthings: Path,
    canonical_vocab: frozenset,
    prob_thres: Number,
    episode2subset: frozendict,
    pkl_file: Path = None,
    episode_filter: str = None,
    save_pkl: bool = True,
    temporal_tol: int = 4,
) -> Dict:
    """Extract raw subtitles into a format that mimics the mouthing predictions.

    Use frozen datastructures to allow LRU caching.
    """
    subs = memcache(subtitle_pkl_path)
    ref_mouthings = load_british_mouthings(subtitle_reference_mouthings)

    # Filter to episodes with available subtitles
    subset2episodes = defaultdict(list)
    for episode, subset in episode2subset.items():
        episode = episode.replace("/", "--")
        if episode_filter and episode_filter not in episode:
            continue
        if episode in subs:
            subset2episodes[subset].append(episode)
    print(f"Filtered to {sum(len(x) for x in subset2episodes.values())} episodes")

    data = {}
    count = 0
    for subset, episodes in subset2episodes.items():
        data[subset] = {}
        for episode in tqdm.tqdm(episodes):
            episode_subs = subs[episode]
            for sub in tqdm.tqdm(episode_subs):
                if isinstance(sub["span"], list):
                    text = "".join([x["text"] for x in sub["span"]])
                else:
                    text = sub["span"]["text"]
                subtitle_words = [clean_subtitle_word(x) for x in text.split(" ")]
                for keyword in canonical_vocab:
                    keyword_ref_mouthings = ref_mouthings[subset][keyword]
                    keep = keyword_ref_mouthings["names"] == episode
                    conf_keep = np.array(keyword_ref_mouthings["probs"]) > prob_thres
                    mask = conf_keep * keep
                    if prob_thres and not (keep.sum() and mask.sum()):
                        continue
                    candidate_times = np.array(keyword_ref_mouthings["times"])[mask]
                    if keyword not in data[subset]:
                        data[subset][keyword] = {"names": [], "probs": [], "times": []}
                    if keyword in subtitle_words:
                        sub_time = sub["start"] + (sub["end"] - sub["start"]) / 2
                        candidate_times = np.array(keyword_ref_mouthings["times"])[mask]
                        if prob_thres:
                            # we only keep times that are close to a confident mouthing
                            if np.abs(candidate_times - sub_time).min() > temporal_tol:
                                continue
                        data[subset][keyword]["names"].append(episode)
                        data[subset][keyword]["probs"].append(1)
                        data[subset][keyword]["times"].append(sub_time)
                        count += 1
    print(f"Proposing {count} subtitle crops")
    if save_pkl:
        pkl.dump(data, open(pkl_file, "wb"))
    return data
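# Structure note (inferred from the assembly loop above): the returned dictionary mirrors
# the mouthing-prediction format, i.e.
#   data[subset][keyword] = {"names": [...], "probs": [...], "times": [...]}
# where every subtitle-derived proposal is assigned a probability of 1 and a time stamp
# at the midpoint of its subtitle interval.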
def __init__( self, root_path="data/wlasl", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=64, evaluate_video=False, hflip=0.5, stride=0.5, ram_data=True, gpu_collation=False, use_bbox=True, monolithic_pkl_path="data/pickled-videos/wlasl-compressed-quality-90-resized-256x256.pkl", input_type="rgb", pose_keys=["body", "face", "lhnd", "rhnd"], mask_rgb=None, mask_type=None, mask_prob=1.0, ): self.root_path = root_path self.setname = setname # train, val or test self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.gpu_collation = gpu_collation self.stride = stride self.use_bbox = use_bbox self.input_type = input_type self.pose_keys = pose_keys self.mask_rgb = mask_rgb self.mask_type = mask_type self.video_folder = "videos_360h_25fps" if Path(monolithic_pkl_path).exists() and ram_data: print(f"Loading monolithic pickle from {monolithic_pkl_path}") self.video_data_dict = memcache(monolithic_pkl_path) else: self.video_data_dict = None infofile = os.path.join(root_path, "info", "info.pkl") print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) if self.input_type == "pose": pose_pkl = os.path.join(root_path, "info", "pose.pkl") print(f"Loading {pose_pkl}") self.pose_data = pkl.load(open(pose_pkl, "rb")) if self.mask_rgb: assert mask_type if self.mask_rgb == "face": face_pkl = os.path.join(root_path, "info", "face_bbox.pkl") print(f"Loading {face_pkl}") self.face_data = pkl.load(open(face_pkl, "rb")) # Use this to take subset if self.input_type == "pose" or self.mask_rgb: mouth_pkl = os.path.join(root_path, "info", "mouth_bbox.pkl") print(f"Loading {mouth_pkl}") self.mouth_data = pkl.load(open(mouth_pkl, "rb")) self.videos = [s.strip() for s in data["videos"]["name"]] self.videos = np.asarray(self.videos) self.classes = data["videos"]["word_id"] with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f: self.class_names = f.read().splitlines() meta_key = self.video_folder if gpu_collation and not self.video_data_dict: # GPU collation requires all inputs to share the same spatial input size self.video_folder = "videos-resized-256fps-256x256" self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation) bboxes_orig = [s for s in np.asarray(data["videos"]["box"])] self.bboxes = [] for i, bb in enumerate(bboxes_orig): ht = data["videos"]["videos_original"]["H"][i] wt = data["videos"]["videos_original"]["W"][i] xmin, ymin, xmax, ymax = bb bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt] self.bboxes.append(bb_norm) self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0]) if self.setname == "val": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0]) elif self.setname == "test": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0]) if self.input_type == "pose" or self.mask_rgb: # Valid mouth ix should be equivalent to valid face ix, valid pose ix etc valid_mouth_ix = np.where( np.array([i is not None for i in self.mouth_data]) )[0] if self.setname == "val" or self.setname == "test": print(f"{len(self.train)} train, {len(self.valid)} val samples.") print("Taking subsets according to having pose or not") self.train = list(set(self.train).intersection(set(valid_mouth_ix))) if self.setname == "val" or self.setname == "test": self.valid = list(set(self.valid).intersection(set(valid_mouth_ix))) print(f"{len(self.train)} train, {len(self.valid)} val 
samples.") if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)
def main(
    data_dir: Path,
    anno_pkl_path: Path,
    video_dir: Path,
    canonical_1064_words: Path,
    refresh: bool,
    prob_thres: float,
    worker_id: int,
    num_partitions: int,
    limit: int,
    processes: int,
    mouthing_window_secs: int,
    progress_markers: int,
    aggregate: bool,
    pseudo_annos: str,
    episode2subset: Dict[str, str],
    trim_format: str = "%06d",
):
    path_kwargs = {
        "limit": limit,
        "data_dir": data_dir,
        "pseudo_annos": pseudo_annos,
        "prob_thres": prob_thres,
        "mouthing_window_secs": mouthing_window_secs,
    }
    with open(canonical_1064_words, "rb") as f:
        canonical_vocab = set(pkl.load(f)["words"])

    if aggregate:
        dest_path = gen_paths(worker_id=0, num_partitions=1, **path_kwargs)["info"]
        if dest_path.exists() and not refresh:
            print(f"Found existing info file at {dest_path}, skipping...")
            return
        info = create_info_structure()
        for ii in range(num_partitions):
            src_path = gen_paths(worker_id=ii, num_partitions=num_partitions,
                                 **path_kwargs)["info"]
            worker_info = memcache(src_path)
            msg = "Expected worker info to match the target 1064 vocab"
            assert set(worker_info["words"]) == canonical_vocab, msg
            if ii == 0:
                # we can update the words with the first worker
                info["words"] = worker_info["words"]
                info["words_to_id"] = worker_info["words_to_id"]
            for key in info["videos"]:
                if key == "videos":
                    for subkey in info["videos"]["videos"]:
                        info["videos"]["videos"][subkey].extend(
                            worker_info["videos"]["videos"][subkey])
                else:
                    info["videos"][key].extend(worker_info["videos"][key])
        print(f"Writing aggregated info to {dest_path}")
        with open(dest_path, "wb") as f:
            pkl.dump(info, f)
        return

    paths = gen_paths(worker_id=worker_id, num_partitions=num_partitions, **path_kwargs)
    if paths["info"].exists() and not refresh:
        print(f"Found existing info file at {paths['info']}, skipping...")
        return

    data = create_info_structure()
    words = set()
    sets = ["train", "val", "test"]
    set_dict = {"train": 0, "val": 1, "test": 2}

    all_data = load_data(
        pseudo_annos=pseudo_annos,
        anno_pkl_path=anno_pkl_path,
        canonical_vocab=canonical_vocab,
        episode2subset=episode2subset,
    )
    all_data = filter_words_by_confidence(all_data, prob_thres)
    print(f"Using a vocabulary of {len(canonical_vocab)} words for BBC")
    words = list(sorted(canonical_vocab))

    # Write to TXT file
    with open(paths["words"], "w") as dict_file:
        words_to_id = {}
        for i, w in enumerate(words):
            words_to_id[w] = i
            dict_file.write(f"{i:05d} {w}\n")

    data["words"] = words
    data["words_to_id"] = words_to_id

    t0 = time.time()
    if num_partitions == 1:
        worker_words = set(words)
    else:
        worker_words = np.array_split(words, num_partitions)[worker_id]

    count = 0
    kwarg_list = []
    for s in sets:  # all_data.keys():
        subset_total = len(all_data[s])
        for word_cnt, word in enumerate(all_data[s].keys()):
            assert word in words_to_id, f"Unknown word: {word}"
            if limit and count >= limit:
                continue
            if word not in worker_words:
                continue
            N = len(all_data[s][word]["names"])
            delta = time.time() - t0
            print(f"{delta:0.2f} sec {s} {word_cnt}/{subset_total} {word} [{N} samples]")
            for i in range(N):
                if all_data[s][word]["probs"][i] > prob_thres:
                    start_time, end_time = take_interval_from_peak(
                        all_data[s][word]["times"][i])
                    output_filename = construct_video_filename(
                        output_dir=video_dir,
                        set_name=s,
                        word=word,
                        name=all_data[s][word]["names"][i],
                        start_time=start_time,
                        end_time=end_time,
                        trim_format=trim_format,
                    )
                    if os.path.exists(output_filename):
                        # Video resolution information
                        name = os.path.join(s, word, os.path.basename(output_filename))
                        kwargs = {
                            "count": count,
                            "word": word,
                            "name": name,
                            "word_id": words_to_id[word],
                            "split": set_dict[s],
                            "processes": processes,
                            "mouthing_time": all_data[s][word]["times"][i],
                            "mouthing_prob": all_data[s][word]["probs"][i],
                            "output_filename": output_filename,
                            "progress_markers": progress_markers,
                        }
                        kwarg_list.append(kwargs)
                        count += 1

    # Enable the worker to print progress.
    for kwargs in kwarg_list:
        kwargs["total"] = len(kwarg_list)

    func = update_meta
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            meta = starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        meta = []
        for kwargs in tqdm.tqdm(kwarg_list):
            meta.append(func(**kwargs))

    # Filter videos that failed to return meta data
    pre_filter = len(meta)
    meta = [x for x in meta if x]
    print(f"{len(meta)}/{pre_filter} were successfully parsed for meta information")

    # check that ordering was preserved by multiprocessing
    counts = [x["count"] for x in meta]
    assert list(sorted(counts)) == counts, "Expected meta items to be in order"

    for x in tqdm.tqdm(meta):
        data["videos"]["videos"]["T"].append(x["video_res_t"])
        data["videos"]["videos"]["W"].append(x["video_res_w"])  # 480
        data["videos"]["videos"]["H"].append(x["video_res_h"])  # 480
        data["videos"]["videos"]["duration_sec"].append(x["video_duration_sec"])
        data["videos"]["videos"]["fps"].append(x["video_fps"])  # 25
        data["videos"]["word"].append(x["word"])
        data["videos"]["word_id"].append(x["word_id"])
        data["videos"]["split"].append(x["split"])
        data["videos"]["name"].append(x["name"])
        data["videos"]["mouthing_time"].append(x["mouthing_time"])
        data["videos"]["mouthing_prob"].append(x["mouthing_prob"])

    print(f"Saving info file to {paths['info']}...")
    pkl.dump(data, open(paths["info"], "wb"))
def load_features(self):
    root_feat = Path(self.root_feat)
    feat_names = {key: self.visual_feat_paths(key)
                  for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concat of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()

            if expert == "speech":
                features_defaults = defaultdict(lambda: np.zeros((1, 300)))
                features_defaults.update(features_)
                features_ = features_defaults

            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)

    self.features = features
    text_feat_paths = self.paths["text_feat_paths"]
    text_features = memcache(root_feat / text_feat_paths["train"])
    split_names = {"dev": "val", "official": "test"}
    text_features.update(
        memcache(root_feat / text_feat_paths[split_names[self.split_name]]))
    key_map = memcache(root_feat / self.paths["dict_youtube_mapping_path"])
    inverse_map = {}
    for key, value in key_map.items():
        inverse_map[value] = key
    self.text_features = {inverse_map[key]: val
                          for key, val in text_features.items()}
    self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])

    if "detection" in self.ordered_experts:
        # Example processing
        processed = {}
        for key, subdict in self.features["detection"].items():
            box, conf = subdict["detection_boxes"], subdict["detection_scores"]
            raw = subdict["raw_feats_avg"]
            processed[key] = np.concatenate((box, conf.reshape(-1, 1), raw), axis=1)
        self.features["detection"] = processed

    if "openpose" in self.ordered_experts:
        # Example processing
        processed = {}
        for key, subdict in self.features["openpose"].items():
            raw = np.concatenate(subdict["matrix"], axis=1)
            processed[key] = raw.transpose(1, 0, 2).reshape(-1, 3 * 18)
        self.features["openpose"] = processed
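# Shape sketch for the "detection" post-processing above (the sizes are hypothetical):
# for each video, detection_boxes (N, 4), detection_scores (N,) and raw_feats_avg (N, D)
# are concatenated into a single (N, 4 + 1 + D) array, e.g.
#   box.shape == (10, 4); conf.shape == (10,); raw.shape == (10, 2048)
#   processed[key].shape == (10, 2053)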
def main(
    data_dir: Path,
    json_anno_path: Path,
    video_dir: Path,
    word_data_pkl: Path,
    trim_format: str,
    anno_name: str,
    refresh: bool,
):
    print(f"Creating info file for {anno_name} annotations")
    info_dict_dir = data_dir / "info" / anno_name
    info_dict_dir.mkdir(exist_ok=True, parents=True)
    info_file = info_dict_dir / "info.pkl"
    if info_file.exists() and not refresh:
        print("Found existing info file")
        if word_data_pkl.exists() and not refresh:
            print("Found existing word_data_pkl file")
        else:
            info = memcache(info_file)
            word_data_pkl_data = {key: info[key] for key in ("words", "words_to_id")}
            with open(word_data_pkl, "wb") as f:
                pkl.dump(word_data_pkl_data, f)
            print(f"Wrote word_data_pkl to {word_data_pkl}")
        return

    dict_file = open(info_dict_dir / "words.txt", "w")
    data = {}
    words = set()

    data["videos"] = {}
    data["videos"]["name"] = []  # Our naming convention (unique ID for a video)
    data["videos"]["word"] = []
    data["videos"]["word_id"] = []
    data["videos"]["split"] = []  # 0: train, 1: val, 2: test
    # Resolution info
    data["videos"]["videos"] = {}
    data["videos"]["videos"]["T"] = []
    data["videos"]["videos"]["W"] = []
    data["videos"]["videos"]["H"] = []
    data["videos"]["videos"]["duration_sec"] = []
    data["videos"]["videos"]["fps"] = []
    # Extra annot
    data["videos"]["start"] = []
    data["videos"]["end"] = []

    sets = ["train", "val", "test"]
    set_dict = {"train": 0, "val": 1, "test": 2}
    all_data = memcache(json_anno_path)

    words = set()
    for subset, subdict in all_data.items():
        words.update(subdict.keys())
    # Only use train words from reference
    print(f"{len(words)} words")
    mapping = {
        "airplane": "aeroplane",
        "center": "centre",
        "favor": "favour",
        "gray": "grey",
        "practice": "practise",
        "recognize": "recognise",
        "yogurt": "yoghurt",
    }
    # fix spellings to English
    updated_words = [mapping.get(word, word) for word in words]
    words = list(sorted(set(updated_words)))

    # Write to TXT file
    words_to_id = {}
    for i, w in enumerate(words):
        words_to_id[w] = i
        dict_file.write(f"{i:05d} {w}\n")
    dict_file.close()

    data["words"] = words
    data["words_to_id"] = words_to_id

    cnt = 0
    t0 = time.time()
    for s in sets:  # all_data.keys():
        for word_cnt, word in enumerate(all_data[s].keys()):
            if word in words_to_id:
                print(f"{time.time() - t0:0.2f} sec {s} {word_cnt} {word}")
                N = len(all_data[s][word]["start"])
                for i in range(N):
                    start_time = all_data[s][word]["start"][i]
                    end_time = all_data[s][word]["end"][i]
                    output_filename = construct_video_filename(
                        word=word,
                        set_name=s,
                        output_dir=video_dir,
                        name=Path(all_data[s][word]["video"][i]).stem,
                        start_time=time2tuple(start_time),
                        end_time=time2tuple(end_time),
                        trim_format=trim_format,
                    )
                    if os.path.exists(output_filename):
                        # Video resolution information
                        (
                            video_res_t,
                            video_res_w,
                            video_res_h,
                            video_fps,
                            video_duration_sec,
                        ) = _get_video_info(str(output_filename))
                        # Indication that the video is readable
                        if video_res_t:
                            # if not (video_fps == row['fps']):
                            #     print(s, i, video_fps, row['fps'])
                            data["videos"]["videos"]["T"].append(video_res_t)
                            data["videos"]["videos"]["W"].append(video_res_w)  # 480
                            data["videos"]["videos"]["H"].append(video_res_h)  # 480
                            data["videos"]["videos"]["duration_sec"].append(
                                video_duration_sec)
                            data["videos"]["videos"]["fps"].append(video_fps)  # 25
                            data["videos"]["word"].append(word)
                            data["videos"]["word_id"].append(words_to_id[word])
                            data["videos"]["split"].append(set_dict[s])
                            name = os.path.join(
                                s, word, os.path.basename(output_filename))
                            data["videos"]["name"].append(name)
                            data["videos"]["start"].append(
                                all_data[s][word]["start"][i])
                            data["videos"]["end"].append(
                                all_data[s][word]["end"][i])
                            cnt += 1
    print(f"Writing results to {info_file}")
    pkl.dump(data, open(info_file, "wb"))