def download_wavs(txt_file: Path, refresh: bool, api_wav_endpoint: str,
                  data_dir: Path, processes: int, legacy_only: bool):
    wav_dir = data_dir / "wav-files"
    wav_dir.mkdir(parents=True, exist_ok=True)
    triplets = []
    with open(txt_file, 'r') as f:
        video_links = f.read().splitlines()
    annotation_folder = data_dir / 'describe-api-results'
    for url in tqdm.tqdm(video_links):
        video_id = url.split('https://www.youtube.com/watch?v=')[1]
        src_path = annotation_folder / f"assets-{video_id}.json"
        with open(src_path, "r") as f:
            info = json.load(f)
        for desc in info["result"]["audio_descriptions"]:
            for clip in desc["audio_clips"]:
                wav_file = clip["file_name"]
                if Path(clip["file_path"]).name == "legacy":
                    wav_parent = Path(clip["file_path"]).name
                else:
                    wav_parent = Path(clip["file_path"]).parent.name
                if wav_file:
                    triplets.append((video_id, wav_file, wav_parent))
    print(f"Parsed {len(triplets)} in total")
    if legacy_only:
        prev = len(triplets)
        triplets = [x for x in triplets if x[2] == "legacy"]
        print(f"Filtered to legacy only wavs ({prev} -> {len(triplets)})")
    kwarg_list = []
    for url, wav_file, wav_parent in triplets:
        kwarg_list.append({
            "url": url,
            "wav_dir": wav_dir,
            "wav_file": wav_file,
            "wav_parent": wav_parent,
            "refresh": refresh,
            "api_wav_endpoint": api_wav_endpoint,
        })
    pool_func = fetch_wav_worker
    if processes > 1:
        # The definition of the pool func must precede the creation of the pool
        # to ensure it is picklable. We force the definition to occur by
        # reassigning the function.
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=pool_func, kwargs_iter=kwarg_list)
    else:
        for idx, kwarg in enumerate(kwarg_list):
            print(f"{idx}/{len(kwarg_list)} processing kwargs")
            pool_func(**kwarg)
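# NOTE: `starmap_with_kwargs` is used throughout this file but imported from a
# shared utils module that is not shown here. The sketch below is only an
# illustration of what such a helper typically looks like (the real
# implementation and signature may differ): `Pool.starmap` forwards positional
# arguments only, so each kwargs dict is routed through a small module-level
# shim that unpacks it for the worker function.
def _apply_kwargs_sketch(func, kwargs):
    # Unpack a single kwargs dict for the worker function.
    return func(**kwargs)


def _starmap_with_kwargs_sketch(pool, func, kwargs_iter):
    from itertools import repeat
    # Pair the worker function with every kwargs dict and dispatch via starmap.
    args_for_starmap = zip(repeat(func), kwargs_iter)
    return pool.starmap(_apply_kwargs_sketch, args_for_starmap)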
def download_videos(video_dir: Path, txt_file: Path, tries: int, refresh: bool,
                    processes: int, logging: logging.basicConfig):
    """Goes through the list of videos in the QuerYD dataset and attempts to
    download them. Videos are saved with the name "video-{video_id}".
    Inputs:
        video_dir: location where videos are saved
        txt_file: file containing the list of video URLs
        tries: how many times to attempt downloading a video
        refresh: flag to restart the downloading process
        processes: number of parallel processes used for downloading
        logging: logging module for saving information about progress of script
    """
    with open(txt_file, 'r') as f:
        video_links = f.read().splitlines()
    os.makedirs(video_dir, exist_ok=True)
    existent_videos = os.listdir(video_dir)
    existent_ids = [
        video.split('video-')[1].split('.')[0] for video in existent_videos
    ]
    total_number_videos = len(video_links)
    logging.info("Downloading videos")
    kwarg_list = []
    for idx, url in enumerate(video_links):
        video_id = url.split('https://www.youtube.com/watch?v=')[1]
        if video_id not in existent_ids or refresh is True:
            kwarg_list.append({
                "tries": tries,
                "url": url,
                "video_dir": video_dir,
                "video_id": video_id,
            })
        else:
            logging.info(f"Already downloaded video {video_id}")
    pool_func = download_one_video
    if processes > 1:
        # The definition of the pool func must precede the creation of the pool
        # to ensure it is picklable. We force the definition to occur by
        # reassigning the function.
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=pool_func, kwargs_iter=kwarg_list)
    else:
        for idx, kwarg in enumerate(kwarg_list):
            print(f"{idx}/{len(kwarg_list)} processing kwargs")
            pool_func(**kwarg)
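# NOTE: `download_one_video` is the worker dispatched above; its implementation
# lives elsewhere in the repo. Purely as an illustrative sketch (not the actual
# worker), a youtube_dl-based version matching the "video-{video_id}" naming
# convention might look like the following; the retry logic and download
# options of the real worker may differ.
def _download_one_video_sketch(tries, url, video_dir, video_id):
    import youtube_dl  # assumed dependency for this sketch
    ydl_opts = {
        # Save as "video-{video_id}.<ext>" inside video_dir
        "outtmpl": str(Path(video_dir) / f"video-{video_id}.%(ext)s"),
        "quiet": True,
    }
    for attempt in range(tries):
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            return True
        except Exception as exc:  # keep retrying on any failure
            print(f"Attempt {attempt + 1}/{tries} failed for {video_id}: {exc}")
    return False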
def main(
    data_dir: Path,
    anno_pkl_path: Path,
    video_dir: Path,
    canonical_1064_words: Path,
    refresh: bool,
    prob_thres: float,
    worker_id: int,
    num_partitions: int,
    limit: int,
    processes: int,
    mouthing_window_secs: int,
    progress_markers: int,
    aggregate: bool,
    pseudo_annos: str,
    episode2subset: Dict[str, str],
    trim_format: str = "%06d",
):
    path_kwargs = {
        "limit": limit,
        "data_dir": data_dir,
        "pseudo_annos": pseudo_annos,
        "prob_thres": prob_thres,
        "mouthing_window_secs": mouthing_window_secs,
    }
    with open(canonical_1064_words, "rb") as f:
        canonical_vocab = set(pkl.load(f)["words"])
    if aggregate:
        dest_path = gen_paths(worker_id=0, num_partitions=1, **path_kwargs)["info"]
        if dest_path.exists() and not refresh:
            print(f"Found existing info file at {dest_path}, skipping...")
            return
        info = create_info_structure()
        for ii in range(num_partitions):
            src_path = gen_paths(worker_id=ii, num_partitions=num_partitions,
                                 **path_kwargs)["info"]
            worker_info = memcache(src_path)
            msg = "Expected worker info to match the target 1064 vocab"
            assert set(worker_info["words"]) == canonical_vocab, msg
            if ii == 0:
                # we can update the words with the first worker
                info["words"] = worker_info["words"]
                info["words_to_id"] = worker_info["words_to_id"]
            for key in info["videos"]:
                if key == "videos":
                    for subkey in info["videos"]["videos"]:
                        info["videos"]["videos"][subkey].extend(
                            worker_info["videos"]["videos"][subkey])
                else:
                    info["videos"][key].extend(worker_info["videos"][key])
        print(f"Writing aggregated info to {dest_path}")
        with open(dest_path, "wb") as f:
            pkl.dump(info, f)
        return

    paths = gen_paths(worker_id=worker_id, num_partitions=num_partitions, **path_kwargs)
    if paths["info"].exists() and not refresh:
        print(f"Found existing info file at {paths['info']}, skipping...")
        return
    data = create_info_structure()
    words = set()
    sets = ["train", "val", "test"]
    set_dict = {"train": 0, "val": 1, "test": 2}
    all_data = load_data(
        pseudo_annos=pseudo_annos,
        anno_pkl_path=anno_pkl_path,
        canonical_vocab=canonical_vocab,
        episode2subset=episode2subset,
    )
    all_data = filter_words_by_confidence(all_data, prob_thres)
    print(f"Using a vocabulary of {len(canonical_vocab)} words for BBC")
    words = list(sorted(canonical_vocab))

    # Write to TXT file
    with open(paths["words"], "w") as dict_file:
        words_to_id = {}
        for i, w in enumerate(words):
            words_to_id[w] = i
            dict_file.write(f"{i:05d} {w}\n")
    data["words"] = words
    data["words_to_id"] = words_to_id

    t0 = time.time()
    if num_partitions == 1:
        worker_words = set(words)
    else:
        worker_words = np.array_split(words, num_partitions)[worker_id]
    count = 0
    kwarg_list = []
    for s in sets:  # all_data.keys():
        subset_total = len(all_data[s])
        for word_cnt, word in enumerate(all_data[s].keys()):
            assert word in words_to_id, f"Unknown word: {word}"
            if limit and count >= limit:
                continue
            if word not in worker_words:
                continue
            N = len(all_data[s][word]["names"])
            delta = time.time() - t0
            print(
                f"{delta:0.2f} sec {s} {word_cnt}/{subset_total} {word} [{N} samples]"
            )
            for i in range(N):
                if all_data[s][word]["probs"][i] > prob_thres:
                    start_time, end_time = take_interval_from_peak(
                        all_data[s][word]["times"][i])
                    output_filename = construct_video_filename(
                        output_dir=video_dir,
                        set_name=s,
                        word=word,
                        name=all_data[s][word]["names"][i],
                        start_time=start_time,
                        end_time=end_time,
                        trim_format=trim_format,
                    )
                    if os.path.exists(output_filename):
                        # Video resolution information
                        name = os.path.join(s, word, os.path.basename(output_filename))
                        kwargs = {
                            "count": count,
                            "word": word,
                            "name": name,
                            "word_id": words_to_id[word],
                            "split": set_dict[s],
                            "processes": processes,
                            "mouthing_time": all_data[s][word]["times"][i],
                            "mouthing_prob": all_data[s][word]["probs"][i],
                            "output_filename": output_filename,
                            "progress_markers": progress_markers,
                        }
                        kwarg_list.append(kwargs)
                        count += 1

    # Enable the worker to print progress.
    for kwargs in kwarg_list:
        kwargs["total"] = len(kwarg_list)

    func = update_meta
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            meta = starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        meta = []
        for kwargs in tqdm.tqdm(kwarg_list):
            meta.append(func(**kwargs))

    # Filter videos that failed to return meta data
    pre_filter = len(meta)
    meta = [x for x in meta if x]
    print(
        f"{len(meta)}/{pre_filter} were successfully parsed for meta information"
    )
    # check that ordering was preserved by multiprocessing
    counts = [x["count"] for x in meta]
    assert list(sorted(counts)) == counts, "Expected meta items to be in order"

    for x in tqdm.tqdm(meta):
        data["videos"]["videos"]["T"].append(x["video_res_t"])
        data["videos"]["videos"]["W"].append(x["video_res_w"])  # 480
        data["videos"]["videos"]["H"].append(x["video_res_h"])  # 480
        data["videos"]["videos"]["duration_sec"].append(x["video_duration_sec"])
        data["videos"]["videos"]["fps"].append(x["video_fps"])  # 25
        data["videos"]["word"].append(x["word"])
        data["videos"]["word_id"].append(x["word_id"])
        data["videos"]["split"].append(x["split"])
        data["videos"]["name"].append(x["name"])
        data["videos"]["mouthing_time"].append(x["mouthing_time"])
        data["videos"]["mouthing_prob"].append(x["mouthing_prob"])

    print(f"Saving info file to {paths['info']}...")
    pkl.dump(data, open(paths["info"], "wb"))
def store_as_pkl(
    video_dir,
    dest_path,
    vis,
    limit,
    resize_res,
    store_compressed,
    processes,
    num_partitions,
    worker_id,
):
    video_paths = list(video_dir.glob("**/*.mp4"))
    print(f"Found {len(video_paths)} videos in {video_dir}")
    if num_partitions > 1:
        video_paths = np.array_split(video_paths, num_partitions)[worker_id]
    if limit:
        video_paths = video_paths[:limit]
    data = {}
    kwarg_list = []
    for ii, video_path in enumerate(video_paths):
        kwargs = {
            "video_idx": ii,
            "vis": vis,
            "resize_res": resize_res,
            "video_path": video_path,
            "total_videos": len(video_paths),
            "store_compressed": store_compressed,
            "processes": processes,
        }
        kwarg_list.append(kwargs)
    func = parse_video_content
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            res = starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
        for store, kwargs in zip(res, kwarg_list):
            data[str(kwargs["video_path"])] = store
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            data[str(kwargs["video_path"])] = func(**kwargs)

    # if store_compressed:
    num_bytes = [
        sum(x.getbuffer().nbytes for x in vid["data"]) for vid in data.values()
    ]
    print(
        (
            f"[Video size] >>> avg: {humanize.naturalsize(np.mean(num_bytes))}, "
            f"max: {humanize.naturalsize(np.max(num_bytes))}, "
            f"min: {humanize.naturalsize(np.min(num_bytes))}"
        )
    )
    tic = time.time()
    print(f"Writing data to {dest_path}")
    with open(dest_path, "wb") as f:
        pickle.dump(data, f)
    duration = time.strftime("%Hh%Mm%Ss", time.gmtime(time.time() - tic))
    pickle_size = humanize.naturalsize(dest_path.stat().st_size, binary=True)
    print(f"Finished writing pickle [{pickle_size}] to disk in {duration}")
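# NOTE: purely a usage illustration for the pickle written above. Judging by the
# size computation (`x.getbuffer().nbytes for x in vid["data"]`), each video
# entry appears to hold a list of in-memory buffers; when `store_compressed` is
# used these are presumably encoded frames. The exact structure may differ, so
# treat this as a sketch under those assumptions.
def _load_first_frame_sketch(pkl_path):
    import io
    from PIL import Image  # assumed available for decoding in this sketch
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)
    video_path, store = next(iter(data.items()))
    first_buffer = store["data"][0]
    # Decode the first buffer as an image (assumes compressed per-frame storage)
    frame = Image.open(io.BytesIO(first_buffer.getbuffer()))
    print(f"{video_path}: first frame size {frame.size}")
    return frame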
def main(
    video_dir: Path,
    trim_format: str,
    pad_clip: float,
    limit: int,
    processes: int,
    json_anno_path: Path,
    anno_name: str,
    force_resize: int,
    refresh: bool,
    vis: bool,
):
    print(f"Processing {anno_name} annotations")
    data = memcache(json_anno_path)
    output_filenames = defaultdict(list)
    kwarg_list = []
    outs = set()
    count = 0
    for s in tqdm.tqdm(data.keys()):
        for word in tqdm.tqdm(data[s].keys()):
            N = len(data[s][word]["start"])
            for i in range(N):
                start_time = data[s][word]["start"][i] - pad_clip
                end_time = data[s][word]["end"][i] + pad_clip
                output_filename = construct_video_filename(
                    output_dir=video_dir,
                    set_name=s,
                    word=word,
                    name=Path(data[s][word]["video"][i]).stem,
                    start_time=time2tuple(start_time),
                    end_time=time2tuple(end_time),
                    trim_format=trim_format,
                )
                output_filenames[output_filename].append((start_time, end_time))
                source_file = Path(data[s][word]["video"][i])
                assert source_file.exists(), f"Expected source file at {source_file}"
                kwargs = {
                    "refresh": refresh,
                    "start_time": start_time,
                    "end_time": end_time,
                    "output_filename": output_filename,
                    "source_file": source_file,
                    "force_resize": force_resize,
                }
                outs.add(output_filename)
                kwarg_list.append(kwargs)
                count += 1

    if vis:
        durations = np.array([x["end_time"] - x["start_time"] for x in kwarg_list])
        step = 0.1
        bins = np.arange(0, np.ceil(durations.max()), step=step)
        values, _ = np.histogram(durations, bins=bins)
        plt.figure(figsize=(20, 10))
        x_ticks = bins[:-1] + (step / 2)
        plt.bar(x_ticks, values, width=step)
        font = {"family": "serif", "weight": "normal", "size": 26}
        matplotlib.rc("font", **font)
        plt.suptitle("BSLCP sign durations")
        plt.savefig("zz-bslcp-durations.png")

    if limit:
        kwarg_list = kwarg_list[:limit]
    func = extract_clip
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
    print(f"Expected to produce: {len(kwarg_list)} outputs")
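# NOTE: `extract_clip` is the worker dispatched above (and again by the script
# that follows); it is defined elsewhere in the repo. As a rough sketch only
# (not the actual implementation), an ffmpeg-based clip extractor taking the
# same keyword arguments might look like this; the real worker's flags and
# resize handling may differ.
def _extract_clip_sketch(refresh, start_time, end_time, output_filename,
                         source_file, force_resize):
    import subprocess
    output_filename = Path(output_filename)
    if output_filename.exists() and not refresh:
        return
    output_filename.parent.mkdir(exist_ok=True, parents=True)
    cmd = ["ffmpeg", "-y", "-i", str(source_file),
           "-ss", str(start_time), "-to", str(end_time)]
    if force_resize:
        # Resize to a square resolution of force_resize pixels
        cmd += ["-vf", f"scale={force_resize}:{force_resize}"]
    cmd.append(str(output_filename))
    subprocess.run(cmd, check=True, capture_output=True)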
def main(
    output_dir: Path,
    subtitle_pkl_path: Path,
    canonical_1064_words: Path,
    pseudo_anno_path: Path,
    subtitle_reference_mouthings: Path,
    use_date: str,
    trim_format: str,
    video_src_name: str,
    refresh: bool,
    refresh_kwargs_pkl: bool,
    use_subs: bool,
    use_sentences: bool,
    pseudo_annos: str,
    kwargs_only: bool,
    limit: int,
    worker_id: int,
    processes: int,
    force_resize: int,
    num_partitions: int,
    num_frames_before: int,
    num_frames_after: int,
    window_secs: int,
    prob_thres: float,
    episode2subset: Dict[str, str],
):
    paths = gen_paths(
        limit=limit,
        use_date=use_date,
        video_src_name=video_src_name,
        output_dir=output_dir,
        use_subs=use_subs,
        use_sentences=use_sentences,
        pseudo_annos=pseudo_annos,
        pseudo_anno_path=pseudo_anno_path,
        prob_thres=prob_thres,
        force_resize=force_resize,
        num_frames_before=num_frames_before,
        num_frames_after=num_frames_after,
        window_secs=window_secs,
    )
    with open(canonical_1064_words, "rb") as f:
        canonical_vocab = set(pkl.load(f)["words"])
    if pseudo_annos:
        data = pseudo_annos_to_subset_dict(
            pseudo_anno_path=pseudo_anno_path,
            pseudo_annos=pseudo_annos,
            episode2subset=episode2subset,
            canonical_vocab=canonical_vocab,
        )
    elif paths["anno_pkl"].exists():
        data = pkl.load(open(paths["anno_pkl"], "rb"))
    else:
        print(f"Generating pkl file for {window_secs} sec window...")
        if use_subs:
            data = parse_subtitles(
                pkl_file=paths["anno_pkl"],
                prob_thres=prob_thres,
                episode2subset=episode2subset,
                canonical_vocab=canonical_vocab,
                subtitle_reference_mouthings=subtitle_reference_mouthings,
                subtitle_pkl_path=subtitle_pkl_path,
            )
        else:
            data = gather_all_jsons(paths["anno_pkl"], window_secs=window_secs)

    if paths["kwargs_pkl"].exists() and not refresh_kwargs_pkl:
        print(f"Loading kwargs from {paths['kwargs_pkl']} from cache")
        with open(paths["kwargs_pkl"], "rb") as f:
            kwarg_list = pkl.load(f)
    else:
        if use_sentences:
            # Parallelization doesn't really make sense for sentences,
            # but we keep it to preserve structure.
            count = 0
            kwarg_constructors = []
            for subset in tqdm.tqdm(data.keys()):
                if limit and count >= limit:
                    continue
                kwargs = {
                    "refresh": refresh,
                    "subset": subset,
                    "paths": paths,
                    "subtitle_data": data[subset],
                    "trim_format": trim_format,
                    "video_src_name": video_src_name,
                    "force_resize": force_resize,
                    "sentence_pad_sec": window_secs,
                }
                kwarg_constructors.append(kwargs)
                count += 1
            func = build_kwarg_list_for_sentence
        else:
            # Due to the scale of the preprocessing, the algorithm for creating the
            # arguments that will be passed to each worker is also parallelised
            # (i.e. we are using multiprocessing to determine the keyword arguments)
            count = 0
            kwarg_constructors = []
            for subset in tqdm.tqdm(data.keys()):
                for word in tqdm.tqdm(data[subset].keys()):
                    if limit and count >= limit:
                        continue
                    kwargs = {
                        "refresh": refresh,
                        "word": word,
                        "count": count,
                        "subset": subset,
                        "paths": paths,
                        "prob_thres": prob_thres,
                        "word_data": data[subset][word],
                        "trim_format": trim_format,
                        "video_src_name": video_src_name,
                        "num_frames_before": num_frames_before,
                        "num_frames_after": num_frames_after,
                        "force_resize": force_resize,
                        "processes": processes,
                    }
                    kwarg_constructors.append(kwargs)
                    count += 1
            func = build_kwarg_list_for_word

        # Include total counts to allow the function to show progress
        for kwargs in kwarg_constructors:
            kwargs["total"] = count

        with BlockTimer("Building kwarg lists"):
            if processes > 1:
                with mp.Pool(processes=processes) as pool:
                    kwarg_list = starmap_with_kwargs(
                        pool=pool,
                        func=func,
                        kwargs_iter=kwarg_constructors,
                    )
            else:
                kwarg_list = []
                for kwargs in tqdm.tqdm(kwarg_constructors):
                    kwarg_list.append(func(**kwargs))

        # flatten outputs
        kwarg_list = [x for sublist in kwarg_list for x in sublist]
        print(
            f"Caching kwarg_list ({len(kwarg_list)} elements) to {paths['kwargs_pkl']}"
        )
        paths["kwargs_pkl"].parent.mkdir(exist_ok=True, parents=True)
        with open(paths["kwargs_pkl"], "wb") as f:
            pkl.dump(kwarg_list, f)

    if kwargs_only:
        return

    kwarg_list = np.array_split(kwarg_list, num_partitions)[worker_id]
    msg = (
        f"Worker {worker_id}/{num_partitions} processing {len(kwarg_list)} items"
        f" with {processes} processes"
    )
    print(msg)
    if limit:
        kwarg_list = kwarg_list[:limit]
    func = extract_clip
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
def resize_videos(
    src_video_dir: Path,
    dest_video_dir: Path,
    relevant_ids_path: (Path, NoneType),
    vis: bool,
    limit: int,
    suffix: str,
    refresh: bool,
    processes: int,
    resize_res: int,
    worker_id: int,
    progress_markers: int,
    num_partitions: int,
    exclude_pattern: (str, NoneType),
):
    video_paths = list(src_video_dir.glob(f"**/*{suffix}"))
    print(f"Found {len(video_paths)} videos in {src_video_dir}")
    if relevant_ids_path:
        with open(relevant_ids_path, "r") as f:
            relevant_ids = set(f.read().splitlines())
        video_paths = [x for x in video_paths if video_path2id(x) in relevant_ids]
        print(f"Filtered to {len(video_paths)} videos using relevant-id list")
    if exclude_pattern:
        pre_exclude = len(video_paths)
        video_paths = [x for x in video_paths if exclude_pattern not in x.name]
        print(f"Filtered from {pre_exclude} videos to {len(video_paths)} "
              f"by excluding the pattern: {exclude_pattern}")
    video_paths = np.array_split(video_paths, num_partitions)[worker_id]
    if limit:
        video_paths = video_paths[:limit]

    # Some source videos were re-encoded to fix meta data issues. When these are used
    # we rename their targets to match the other videos
    remap = {"signhd-dense-fast-audio": "signhd"}
    kwarg_list = []
    for ii, video_path in enumerate(video_paths):
        dest_path = dest_video_dir / video_path.relative_to(src_video_dir)
        # We enforce that all videos are re-encoded as mp4, regardless of source format
        dest_path = dest_path.with_suffix(".mp4")
        if any(key in str(dest_path) for key in remap):
            for src, target in remap.items():
                dest_path = Path(str(dest_path).replace(src, target))
        if dest_path.exists() and not refresh:
            print(f"Found existing video at {dest_path}, skipping")
            continue
        kwargs = {
            "vis": vis,
            "video_idx": ii,
            "processes": processes,
            "dest_path": dest_path,
            "resize_res": resize_res,
            "video_path": video_path,
            "total_videos": len(video_paths),
            "progress_markers": progress_markers,
        }
        kwarg_list.append(kwargs)
    func = resize_video_content
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
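# NOTE: `resize_video_content` is the worker dispatched above and is defined
# elsewhere in the repo. The sketch below only illustrates the kind of ffmpeg
# call such a worker might issue (re-encode to mp4 at `resize_res`); the actual
# worker also receives vis/progress arguments whose handling is omitted here,
# and its exact scaling policy may differ.
def _resize_video_content_sketch(video_path, dest_path, resize_res, **_unused):
    import subprocess
    Path(dest_path).parent.mkdir(exist_ok=True, parents=True)
    cmd = [
        "ffmpeg", "-y", "-i", str(video_path),
        # Scale the width to resize_res and keep the aspect ratio
        # (-2 keeps the height divisible by two for the encoder).
        "-vf", f"scale={resize_res}:-2",
        str(dest_path),
    ]
    subprocess.run(cmd, check=True, capture_output=True)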