def download_wavs(txt_file: Path, refresh: bool, api_wav_endpoint: str,
                  data_dir: Path, processes: int, legacy_only: bool):
    wav_dir = data_dir / "wav-files"
    wav_dir.mkdir(parents=True, exist_ok=True)
    triplets = []
    with open(txt_file, 'r') as f:
        video_links = f.read().splitlines()
    annotation_folder = data_dir / 'describe-api-results'
    for url in tqdm.tqdm(video_links):
        video_id = url.split('https://www.youtube.com/watch?v=')[1]
        src_path = annotation_folder / f"assets-{video_id}.json"
        with open(src_path, "r") as f:
            info = json.load(f)
        for desc in info["result"]["audio_descriptions"]:
            for clip in desc["audio_clips"]:
                wav_file = clip["file_name"]
                if Path(clip["file_path"]).name == "legacy":
                    wav_parent = Path(clip["file_path"]).name
                else:
                    wav_parent = Path(clip["file_path"]).parent.name
                if wav_file:
                    triplets.append((video_id, wav_file, wav_parent))
    print(f"Parsed {len(triplets)} in total")

    if legacy_only:
        prev = len(triplets)
        triplets = [x for x in triplets if x[2] == "legacy"]
        print(f"Filtered to legacy only wavs ({prev} -> {len(triplets)}")

    kwarg_list = []
    for url, wav_file, wav_parent in triplets:
        kwarg_list.append({
            "url": url,
            "wav_dir": wav_dir,
            "wav_file": wav_file,
            "wav_parent": wav_parent,
            "refresh": refresh,
            "api_wav_endpoint": api_wav_endpoint,
        })

    pool_func = fetch_wav_worker
    if processes > 1:
        # The definition of the pool func must precede the creation of the pool
        # to ensure it is picklable. We force the definition to occur by
        # reassigning the function.
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool,
                                func=pool_func,
                                kwargs_iter=kwarg_list)
    else:
        for idx, kwarg in enumerate(kwarg_list):
            print(f"{idx}/{len(kwarg_list)} processing kwargs ")
            pool_func(**kwarg)
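
# Note: `starmap_with_kwargs` is used throughout these examples but is not defined
# in this listing. The sketch below is one plausible implementation (an assumption,
# not necessarily the helper used in this codebase): it pairs the target function
# with each kwargs dict so that a plain multiprocessing Pool.starmap call can
# dispatch keyword arguments across worker processes.
from itertools import repeat


def apply_kwargs(fn, kwargs):
    # Unpack a single kwargs dict into a call of the target function.
    return fn(**kwargs)


def starmap_with_kwargs(pool, func, kwargs_iter):
    # Pair the function with each kwargs dict and fan the calls out over the pool.
    args_for_starmap = zip(repeat(func), kwargs_iter)
    return pool.starmap(apply_kwargs, args_for_starmap)
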
def download_videos(video_dir: Path, txt_file: Path, tries: int, refresh: bool,
                    processes: int, logging: logging.basicConfig):
    """File goes through list of existent videos in QuerYD dataset and
    attempts to download them. Videos are saved with the name "video-{video_id}".
    Inputs:
        video_dir: location where videos are saved
        tries: how many times to attempt downloading a video
        refresh: flag to restart the downloading process
        logging: logging module for saving information about progress of script
    """
    with open(txt_file, 'r') as f:
        video_links = f.read().splitlines()
    os.makedirs(video_dir, exist_ok=True)
    existent_videos = os.listdir(video_dir)
    existent_ids = [
        video.split('video-')[1].split('.')[0] for video in existent_videos
    ]
    total_number_videos = len(video_links)
    logging.info("Downloading videos")
    kwarg_list = []
    for idx, url in enumerate(video_links):
        video_id = url.split('https://www.youtube.com/watch?v=')[1]
        if video_id not in existent_ids or refresh is True:
            kwarg_list.append({
                "tries": tries,
                "url": url,
                "video_dir": video_dir,
                "video_id": video_id,
            })
        else:
            logging.info(f"Already downloaded video {video_id}")

    pool_func = download_one_video
    if processes > 1:
        # The definition of the pool func must precede the creation of the pool
        # to ensure it is picklable. We force the definition to occur by
        # reassigning the function.
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool,
                                func=pool_func,
                                kwargs_iter=kwarg_list)
    else:
        for idx, kwarg in enumerate(kwarg_list):
            print(f"{idx}/{len(kwarg_list)} processing kwargs ")
            pool_func(**kwarg)
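
# `download_one_video` is not shown in this listing. The sketch below is one
# plausible implementation (an assumption, not the repository's actual worker):
# it retries a youtube_dl download up to `tries` times and saves the result as
# "video-{video_id}" in `video_dir`, matching the naming used by download_videos.
import youtube_dl


def download_one_video(tries, url, video_dir, video_id):
    opts = {
        # Save with the "video-{video_id}" prefix that download_videos expects.
        "outtmpl": str(video_dir / f"video-{video_id}.%(ext)s"),
        "quiet": True,
    }
    for attempt in range(tries):
        try:
            with youtube_dl.YoutubeDL(opts) as ydl:
                ydl.download([url])
            return True
        except Exception as exc:  # keep retrying on any download failure
            print(f"Attempt {attempt + 1}/{tries} failed for {video_id}: {exc}")
    return False
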
Example #3
def main(
    data_dir: Path,
    anno_pkl_path: Path,
    video_dir: Path,
    canonical_1064_words: Path,
    refresh: bool,
    prob_thres: float,
    worker_id: int,
    num_partitions: int,
    limit: int,
    processes: int,
    mouthing_window_secs: int,
    progress_markers: int,
    aggregate: bool,
    pseudo_annos: str,
    episode2subset: Dict[str, str],
    trim_format: str = "%06d",
):
    path_kwargs = {
        "limit": limit,
        "data_dir": data_dir,
        "pseudo_annos": pseudo_annos,
        "prob_thres": prob_thres,
        "mouthing_window_secs": mouthing_window_secs,
    }
    with open(canonical_1064_words, "rb") as f:
        canonical_vocab = set(pkl.load(f)["words"])

    if aggregate:
        dest_path = gen_paths(worker_id=0, num_partitions=1,
                              **path_kwargs)["info"]
        if dest_path.exists() and not refresh:
            print(f"Found existing info file at {dest_path}, skipping...")
            return
        info = create_info_structure()
        for ii in range(num_partitions):
            src_path = gen_paths(worker_id=ii,
                                 num_partitions=num_partitions,
                                 **path_kwargs)["info"]
            worker_info = memcache(src_path)
            msg = "Expected worker info to match the target 1064 vocab"
            assert set(worker_info["words"]) == canonical_vocab, msg
            if ii == 0:
                # we can update the words with the first worker
                info["words"] = worker_info["words"]
                info["words_to_id"] = worker_info["words_to_id"]
            for key in info["videos"]:
                if key == "videos":
                    for subkey in info["videos"]["videos"]:
                        info["videos"]["videos"][subkey].extend(
                            worker_info["videos"]["videos"][subkey])
                else:
                    info["videos"][key].extend(worker_info["videos"][key])
        print(f"Writing aggregated info to {dest_path}")
        with open(dest_path, "wb") as f:
            pkl.dump(info, f)
        return

    paths = gen_paths(worker_id=worker_id,
                      num_partitions=num_partitions,
                      **path_kwargs)
    if paths["info"].exists() and not refresh:
        print(f"Found existing info file at {paths['info']}, skipping...")
        return

    data = create_info_structure()
    words = set()
    sets = ["train", "val", "test"]
    set_dict = {"train": 0, "val": 1, "test": 2}
    all_data = load_data(
        pseudo_annos=pseudo_annos,
        anno_pkl_path=anno_pkl_path,
        canonical_vocab=canonical_vocab,
        episode2subset=episode2subset,
    )
    all_data = filter_words_by_confidence(all_data, prob_thres)
    print(f"Using a vocabulary of {len(canonical_vocab)} words for BBC")
    words = list(sorted(canonical_vocab))

    # Write to TXT file
    with open(paths["words"], "w") as dict_file:
        words_to_id = {}
        for i, w in enumerate(words):
            words_to_id[w] = i
            dict_file.write(f"{i:05d} {w}\n")

    data["words"] = words
    data["words_to_id"] = words_to_id

    t0 = time.time()
    if num_partitions == 1:
        worker_words = set(words)
    else:
        worker_words = np.array_split(words, num_partitions)[worker_id]

    count = 0
    kwarg_list = []
    for s in sets:  # all_data.keys():
        subset_total = len(all_data[s])
        for word_cnt, word in enumerate(all_data[s].keys()):
            assert word in words_to_id, f"Unknown word: {word}"
            if limit and count >= limit:
                continue
            if word not in worker_words:
                continue
            N = len(all_data[s][word]["names"])
            delta = time.time() - t0
            print(
                f"{delta:0.2f} sec {s} {word_cnt}/{subset_total} {word} [{N} samples]"
            )
            for i in range(N):
                if all_data[s][word]["probs"][i] > prob_thres:
                    start_time, end_time = take_interval_from_peak(
                        all_data[s][word]["times"][i])
                    output_filename = construct_video_filename(
                        output_dir=video_dir,
                        set_name=s,
                        word=word,
                        name=all_data[s][word]["names"][i],
                        start_time=start_time,
                        end_time=end_time,
                        trim_format=trim_format,
                    )
                    if os.path.exists(output_filename):
                        # Video resolution information
                        name = os.path.join(s, word,
                                            os.path.basename(output_filename))
                        kwargs = {
                            "count": count,
                            "word": word,
                            "name": name,
                            "word_id": words_to_id[word],
                            "split": set_dict[s],
                            "processes": processes,
                            "mouthing_time": all_data[s][word]["times"][i],
                            "mouthing_prob": all_data[s][word]["probs"][i],
                            "output_filename": output_filename,
                            "progress_markers": progress_markers,
                        }
                        kwarg_list.append(kwargs)
                        count += 1

    # Enable the worker to print progress.
    for kwargs in kwarg_list:
        kwargs["total"] = len(kwarg_list)

    func = update_meta
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            meta = starmap_with_kwargs(pool=pool,
                                       func=func,
                                       kwargs_iter=kwarg_list)
    else:
        meta = []
        for kwargs in tqdm.tqdm(kwarg_list):
            meta.append(func(**kwargs))

    # Filter videos that failed to return metadata
    pre_filter = len(meta)
    meta = [x for x in meta if x]
    print(
        f"{len(meta)}/{pre_filter} were successfully parsed for meta information"
    )

    # check that ordering was preserved by multiprocessing
    counts = [x["count"] for x in meta]
    assert list(sorted(counts)) == counts, "Expected meta items to be in order"

    for x in tqdm.tqdm(meta):
        data["videos"]["videos"]["T"].append(x["video_res_t"])
        data["videos"]["videos"]["W"].append(x["video_res_w"])  # 480
        data["videos"]["videos"]["H"].append(x["video_res_h"])  # 480
        data["videos"]["videos"]["duration_sec"].append(
            x["video_duration_sec"])
        data["videos"]["videos"]["fps"].append(x["video_fps"])  # 25
        data["videos"]["word"].append(x["word"])
        data["videos"]["word_id"].append(x["word_id"])
        data["videos"]["split"].append(x["split"])
        data["videos"]["name"].append(x["name"])
        data["videos"]["mouthing_time"].append(x["mouthing_time"])
        data["videos"]["mouthing_prob"].append(x["mouthing_prob"])
    print(f"Saving info file to {paths['info']}...")
    with open(paths["info"], "wb") as f:
        pkl.dump(data, f)
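
# `create_info_structure` is not defined in this listing, but the fields accessed
# above pin down its layout. The sketch below is consistent with that usage (the
# concrete container types are an assumption; plain lists and dicts are used here).
def create_info_structure():
    return {
        "words": [],
        "words_to_id": {},
        "videos": {
            # per-clip video properties (time steps, resolution, duration, fps)
            "videos": {"T": [], "W": [], "H": [], "duration_sec": [], "fps": []},
            "word": [],
            "word_id": [],
            "split": [],
            "name": [],
            "mouthing_time": [],
            "mouthing_prob": [],
        },
    }
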
Example #4
def store_as_pkl(
    video_dir,
    dest_path,
    vis,
    limit,
    resize_res,
    store_compressed,
    processes,
    num_partitions,
    worker_id,
):
    video_paths = list(video_dir.glob("**/*.mp4"))
    print(f"Found {len(video_paths)} videos in {video_dir}")

    if num_partitions > 1:
        video_paths = np.array_split(video_paths, num_partitions)[worker_id]

    if limit:
        video_paths = video_paths[:limit]

    data = {}
    kwarg_list = []
    for ii, video_path in enumerate(video_paths):
        kwargs = {
            "video_idx": ii,
            "vis": vis,
            "resize_res": resize_res,
            "video_path": video_path,
            "total_videos": len(video_paths),
            "store_compressed": store_compressed,
            "processes": processes,
        }
        kwarg_list.append(kwargs)

    func = parse_video_content
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            res = starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
        for store, kwargs in zip(res, kwarg_list):
            data[str(kwargs["video_path"])] = store
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            data[str(kwargs["video_path"])] = func(**kwargs)

    # Report the size of the in-memory video buffers (assumes compressed storage)
    num_bytes = [
        sum(x.getbuffer().nbytes for x in vid["data"]) for vid in data.values()
    ]
    print(
        (
            f"[Video size] >>> avg: {humanize.naturalsize(np.mean(num_bytes))}, "
            f"max: {humanize.naturalsize(np.max(num_bytes))}, "
            f"min: {humanize.naturalsize(np.min(num_bytes))}"
        )
    )
    tic = time.time()
    print(f"Writing data to {dest_path}")
    with open(dest_path, "wb") as f:
        pickle.dump(data, f)
    duration = time.strftime("%Hh%Mm%Ss", time.gmtime(time.time() - tic))
    pickle_size = humanize.naturalsize(dest_path.stat().st_size, binary=True)
    print(f"Finished writing pickle [{pickle_size}] to disk in {duration}")
Example #5
def main(
    video_dir: Path,
    trim_format: str,
    pad_clip: float,
    limit: int,
    processes: int,
    json_anno_path: Path,
    anno_name: str,
    force_resize: int,
    refresh: bool,
    vis: bool,
):
    print(f"Processing {anno_name} annotations")
    data = memcache(json_anno_path)

    output_filenames = defaultdict(list)
    kwarg_list = []
    outs = set()
    count = 0
    for s in tqdm.tqdm(data.keys()):
        for word in tqdm.tqdm(data[s].keys()):
            N = len(data[s][word]["start"])
            for i in range(N):
                start_time = data[s][word]["start"][i] - pad_clip
                end_time = data[s][word]["end"][i] + pad_clip
                output_filename = construct_video_filename(
                    output_dir=video_dir,
                    set_name=s,
                    word=word,
                    name=Path(data[s][word]["video"][i]).stem,
                    start_time=time2tuple(start_time),
                    end_time=time2tuple(end_time),
                    trim_format=trim_format,
                )
                output_filenames[output_filename].append(
                    (start_time, end_time))
                source_file = Path(data[s][word]["video"][i])
                assert source_file.exists(), (
                    f"Expected source file at {source_file}")
                kwargs = {
                    "refresh": refresh,
                    "start_time": start_time,
                    "end_time": end_time,
                    "output_filename": output_filename,
                    "source_file": source_file,
                    "force_resize": force_resize,
                }
                outs.add(output_filename)
                kwarg_list.append(kwargs)
                count += 1

    if vis:
        durations = np.array(
            [x["end_time"] - x["start_time"] for x in kwarg_list])
        step = 0.1
        bins = np.arange(0, np.ceil(durations.max()), step=step)
        values, _ = np.histogram(durations, bins=bins)
        plt.figure(figsize=(20, 10))
        x_ticks = bins[:-1] + (step / 2)
        plt.bar(x_ticks, values, width=step)
        font = {"family": "serif", "weight": "normal", "size": 26}
        matplotlib.rc("font", **font)
        plt.suptitle(f"BSLCP sign durations")
        plt.savefig("zz-bslcp-durations.png")

    if limit:
        kwarg_list = kwarg_list[:limit]
    func = extract_clip
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
    print(f"Expected to produce: {len(kwarg_list)} outputs")
Example #6
def main(
    output_dir: Path,
    subtitle_pkl_path: Path,
    canonical_1064_words: Path,
    pseudo_anno_path: Path,
    subtitle_reference_mouthings: Path,
    use_date: str,
    trim_format: str,
    video_src_name: str,
    refresh: bool,
    refresh_kwargs_pkl: bool,
    use_subs: bool,
    use_sentences: bool,
    pseudo_annos: str,
    kwargs_only: bool,
    limit: int,
    worker_id: int,
    processes: int,
    force_resize: int,
    num_partitions: int,
    num_frames_before: int,
    num_frames_after: int,
    window_secs: int,
    prob_thres: float,
    episode2subset: Dict[str, str],
):

    paths = gen_paths(
        limit=limit,
        use_date=use_date,
        video_src_name=video_src_name,
        output_dir=output_dir,
        use_subs=use_subs,
        use_sentences=use_sentences,
        pseudo_annos=pseudo_annos,
        pseudo_anno_path=pseudo_anno_path,
        prob_thres=prob_thres,
        force_resize=force_resize,
        num_frames_before=num_frames_before,
        num_frames_after=num_frames_after,
        window_secs=window_secs,
    )
    with open(canonical_1064_words, "rb") as f:
        canonical_vocab = set(pkl.load(f)["words"])
    if pseudo_annos:
        data = pseudo_annos_to_subset_dict(
            pseudo_anno_path=pseudo_anno_path,
            pseudo_annos=pseudo_annos,
            episode2subset=episode2subset,
            canonical_vocab=canonical_vocab,
        )
    elif paths["anno_pkl"].exists():
        data = pkl.load(open(paths["anno_pkl"], "rb"))
    else:
        print(f"Generating pkl file for {window_secs} sec window...")
        if use_subs:
            data = parse_subtitles(
                pkl_file=paths["anno_pkl"],
                prob_thres=prob_thres,
                episode2subset=episode2subset,
                canonical_vocab=canonical_vocab,
                subtitle_reference_mouthings=subtitle_reference_mouthings,
                subtitle_pkl_path=subtitle_pkl_path,
            )
        else:
            data = gather_all_jsons(paths["anno_pkl"], window_secs=window_secs)

    if paths["kwargs_pkl"].exists() and not refresh_kwargs_pkl:
        print(f"Loading kwargs from {paths['kwargs_pkl']} from cache")
        with open(paths["kwargs_pkl"], "rb") as f:
            kwarg_list = pkl.load(f)
    else:
        if use_sentences:
            # Parallelization doesn't really make sense for sentences,
            # but we keep it to preserve the structure.
            count = 0
            kwarg_constructors = []
            for subset in tqdm.tqdm(data.keys()):
                if limit and count >= limit:
                    continue
                kwargs = {
                    "refresh": refresh,
                    "subset": subset,
                    "paths": paths,
                    "subtitle_data": data[subset],
                    "trim_format": trim_format,
                    "video_src_name": video_src_name,
                    "force_resize": force_resize,
                    "sentence_pad_sec": window_secs,
                }
                kwarg_constructors.append(kwargs)
                count += 1
            func = build_kwarg_list_for_sentence
        else:
            # Due to the scale of the preprocessing, the algorithm for creating the
            # arguments that will be passed to each worker is also parallelised
            # (i.e. we are using multiprocessing to determine the keyword arguments)
            count = 0
            kwarg_constructors = []
            for subset in tqdm.tqdm(data.keys()):
                for word in tqdm.tqdm(data[subset].keys()):
                    if limit and count >= limit:
                        continue
                    kwargs = {
                        "refresh": refresh,
                        "word": word,
                        "count": count,
                        "subset": subset,
                        "paths": paths,
                        "prob_thres": prob_thres,
                        "word_data": data[subset][word],
                        "trim_format": trim_format,
                        "video_src_name": video_src_name,
                        "num_frames_before": num_frames_before,
                        "num_frames_after": num_frames_after,
                        "force_resize": force_resize,
                        "processes": processes,
                    }
                    kwarg_constructors.append(kwargs)
                    count += 1
            func = build_kwarg_list_for_word

        # Include total counts to allow the function to show progress
        for kwargs in kwarg_constructors:
            kwargs["total"] = count
        with BlockTimer("Building kwarg lists"):
            if processes > 1:
                with mp.Pool(processes=processes) as pool:
                    kwarg_list = starmap_with_kwargs(
                        pool=pool,
                        func=func,
                        kwargs_iter=kwarg_constructors,
                    )
            else:
                kwarg_list = []
                for kwargs in tqdm.tqdm(kwarg_constructors):
                    kwarg_list.append(func(**kwargs))

        # flatten outputs
        kwarg_list = [x for sublist in kwarg_list for x in sublist]
        print(
            f"Caching kwarg_list ({len(kwarg_list)} elements) to {paths['kwargs_pkl']}"
        )
        paths["kwargs_pkl"].parent.mkdir(exist_ok=True, parents=True)
        with open(paths["kwargs_pkl"], "wb") as f:
            pkl.dump(kwarg_list, f)

    if kwargs_only:
        return

    kwarg_list = np.array_split(kwarg_list, num_partitions)[worker_id]
    msg = (
        f"Worker {worker_id}/{num_partitions} processing {len(kwarg_list)} items"
        f" with {processes} processes")
    print(msg)
    if limit:
        kwarg_list = kwarg_list[:limit]
    func = extract_clip
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
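
# `BlockTimer` (used above to time the kwarg-list construction) is not defined in
# this listing. A minimal context-manager sketch (assumed behaviour: print the
# elapsed wall-clock time for the labelled block):
import time


class BlockTimer:
    def __init__(self, label):
        self.label = label

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Report how long the labelled block took, regardless of exceptions.
        print(f"{self.label} took {time.time() - self.start:.2f}s")
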
Example #7
def resize_videos(
        src_video_dir: Path,
        dest_video_dir: Path,
        relevant_ids_path: (Path, NoneType),
        vis: bool,
        limit: int,
        suffix: str,
        refresh: bool,
        processes: int,
        resize_res: int,
        worker_id: int,
        progress_markers: int,
        num_partitions: int,
        exclude_pattern: (str, NoneType),
):
    video_paths = list(src_video_dir.glob(f"**/*{suffix}"))
    print(f"Found {len(video_paths)} videos in {src_video_dir}")

    if relevant_ids_path:
        with open(relevant_ids_path, "r") as f:
            relevant_ids = set(f.read().splitlines())
        video_paths = [
            x for x in video_paths if video_path2id(x) in relevant_ids
        ]
        print(f"Filtered to {len(video_paths)} videos using relevant-id list")

    if exclude_pattern:
        pre_exclude = len(video_paths)
        video_paths = [x for x in video_paths if exclude_pattern not in x.name]
        print(f"Filtered from {pre_exclude} videos to {len(video_paths)} "
              f" by excluding the pattern: {exclude_pattern}")

    video_paths = np.array_split(video_paths, num_partitions)[worker_id]

    if limit:
        video_paths = video_paths[:limit]

    # Some source videos were re-encoded to fix metadata issues. When these are
    # used, we rename their targets to match the other videos.
    remap = {"signhd-dense-fast-audio": "signhd"}

    kwarg_list = []
    for ii, video_path in enumerate(video_paths):
        dest_path = dest_video_dir / video_path.relative_to(src_video_dir)
        # We enforce that all videos are re-encoded as mp4, regardless of source format
        dest_path = dest_path.with_suffix(".mp4")
        if any(key in str(dest_path) for key in remap):
            for src, target in remap.items():
                dest_path = Path(str(dest_path).replace(src, target))
        if dest_path.exists() and not refresh:
            print(f"Found existing video at {dest_path}, skipping")
            continue

        kwargs = {
            "vis": vis,
            "video_idx": ii,
            "processes": processes,
            "dest_path": dest_path,
            "resize_res": resize_res,
            "video_path": video_path,
            "total_videos": len(video_paths),
            "progress_markers": progress_markers,
        }
        kwarg_list.append(kwargs)

    func = resize_video_content
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
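
# `video_path2id` and `resize_video_content` are not shown above. The two sketches
# below are assumed implementations consistent with how they are used:
# video_path2id recovers the raw identifier from a "video-{id}" filename, and
# resize_video_content re-encodes a source video at resize_res via the ffmpeg CLI.
import subprocess


def video_path2id(video_path):
    # e.g. ".../video-abc123.mp4" -> "abc123" (assumed naming convention)
    return video_path.stem.split("video-")[-1]


def resize_video_content(vis, video_idx, processes, dest_path, resize_res,
                         video_path, total_videos, progress_markers):
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    cmd = ["ffmpeg", "-y", "-i", str(video_path),
           "-vf", f"scale={resize_res}:{resize_res}", str(dest_path)]
    subprocess.run(cmd, check=True, capture_output=True)
    if progress_markers and video_idx % max(total_videos // progress_markers, 1) == 0:
        print(f"Resized {video_idx}/{total_videos}: {video_path} -> {dest_path}")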