    def load_features(self):
        root_feat = Path(self.root_feat)
        feat_names = {key: self.visual_feat_paths(key) for key in
                      self.paths["feature_names"]}
        feat_names.update(self.paths["custom_paths"])
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concatenation of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        if self.challenge_mode:
            self.load_challenge_text_features()
        else:
            self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
            text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
            self.text_features = memcache(text_feat_path)
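
For context, here is a minimal, self-contained sketch of the aggregation performed in the `else` branch above. It assumes, as the code suggests, that `feat_aggregation[expert]` carries an `aggregate` mode and an `aggregate-axis`, and that each feature file loads to a dict of per-video numpy arrays; the expert name and helper below are illustrative, not the project's `concat_features`.

import numpy as np

# Hypothetical config entry mirroring self.feat_aggregation[expert]
feat_aggregation = {"ocr": {"aggregate": "concat", "aggregate-axis": 1}}

# Two pooled variants of the same expert feature, keyed by video id
max_pooled = {"video-001": np.ones((1, 128))}
avg_pooled = {"video-001": np.zeros((1, 128))}

def toy_concat_features(feature_dicts, axis):
    """Concatenate per-video arrays from several feature dicts along `axis`."""
    return {key: np.concatenate([d[key] for d in feature_dicts], axis=axis)
            for key in feature_dicts[0]}

axis = feat_aggregation["ocr"]["aggregate-axis"]
combined = toy_concat_features([max_pooled, avg_pooled], axis=axis)
print(combined["video-001"].shape)  # (1, 256)
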
Example #2
def validate_embeddings_against_reference(
    computed_embeddings: Dict[str, List[np.ndarray]],
    embedding_name: str,
    dataset: str,
):
    root_feat, paths = dataset_paths(dataset)
    reference_dict = {}
    for path in paths["text_feat_paths"][embedding_name].values():
        reference_dict.update(memcache(root_feat / path))

    # We handle MSVD as a special case, because video keys != feature keys
    if dataset == "MSVD":
        key_map = memcache(root_feat / paths["dict_youtube_mapping_path"])
        inverse_map = {val: key for key, val in key_map.items()}
        reference_dict = {
            inverse_map[key]: val
            for key, val in reference_dict.items()
        }

    print(f"Validating embeddings against reference....")
    for key, val in tqdm.tqdm(computed_embeddings.items()):
        ref_val = reference_dict[key]
        msg = (f"[{embedding_name}] {key} Different number of "
               f"embeddings {len(ref_val)} vs {len(val)}")
        assert len(ref_val) == len(val), msg
        msg = f"[{embedding_name}] Embedding mismatch for {key}"
        for vec, ref_vec in zip(val, ref_val):
            assert np.abs(vec - ref_vec).max() < 1E-5, msg
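
The check above reduces to an element-wise comparison with a small numerical tolerance. A toy, self-contained illustration with fabricated data, assuming both stores map a key to a list of embedding arrays:

import numpy as np

computed = {"video-001": [np.full(3, 0.5), np.full(3, 0.25)]}
reference = {"video-001": [np.full(3, 0.5), np.full(3, 0.25) + 1E-7]}

for key, val in computed.items():
    ref_val = reference[key]
    assert len(ref_val) == len(val), f"{key}: different number of embeddings"
    for vec, ref_vec in zip(val, ref_val):
        # tolerate tiny numerical differences, exactly as the reference check does
        assert np.abs(vec - ref_vec).max() < 1E-5, f"Embedding mismatch for {key}"
print("toy embeddings match the reference")
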
Example #3
    def load_features(self):
        root_feat = Path(self.root_feat)
        feat_names = {key: self.visual_feat_paths(key) for key in
                      self.paths["feature_names"]}
        feat_names.update(self.paths["custom_paths"])
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concatenation of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                is_concat = self.feat_aggregation[expert]["aggregate"] == "concat"
                self.log_assert(is_concat, msg=msg)
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        if self.split_name == "jsfusion":
            self.restrict_test_captions = memcache(
                root_feat / self.paths["js_test_cap_idx_path"])

        self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
        self.text_features = memcache(root_feat / self.paths["text_feat_path"])

        if self.restrict_train_captions:
            # hash the video names to avoid O(n) lookups in long lists
            train_list = set(self.partition_lists["train"])
            for key, val in self.text_features.items():
                if key not in train_list:
                    continue

                if self.split_name != "full-test":
                    # Note that we do not perform this sanity check for the full-test
                    # split, because the text features in the cached dataset will already
                    # have been cropped to the specified `restrict_train_captions`
                    msg = "expected train text features to be lists with length 19 or 20"
                    has_expected_feats = isinstance(val, list) and len(val) in {19, 20}
                    self.log_assert(has_expected_feats, msg=msg)

                # restrict to the first N captions (deterministic)
                self.text_features[key] = val[:self.restrict_train_captions]
        self.summary_stats()
Example #4
    def load_features(self):
        root_feat = self.root_feat
        feat_names = {
            key: self.visual_feat_paths(key)
            for key in self.paths["feature_names"]
        }
        feat_names.update(self.paths["custom_paths"])
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple(
                [Path(root_feat) / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concatenation of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                assert self.feat_aggregation[expert][
                    "aggregate"] == "concat", msg
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        if self.challenge_mode:
            self.load_challenge_text_features()
        else:
            self.raw_captions = memcache(root_feat /
                                         self.paths["raw_captions_path"])
            # keys = list(raw_captions.keys())
            # raw_captions_fused = {}
            # for key in keys:
            #     raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key]))
            # self.raw_captions = raw_captions_fused
            text_feat_path = root_feat / self.paths["text_feat_paths"][
                self.text_feat]
            self.text_features = memcache(text_feat_path)

        # overload video paths, which are structured differently for YouCook2
        self.video_path_retrieval = [
            f"videos/{x}.mp4" for x in self.partition_lists["val"]
        ]
Example #5
def pseudo_annos_to_subset_dict(
    pseudo_anno_path: Path,
    pseudo_annos: str,
    canonical_vocab: set,
    episode2subset: Dict[str, str],
) -> Dict[str, Dict]:
    # keep track of some basic stats as a sanity check
    thresholds = [0.5, 0.7, 0.9]
    counts = {thr: 0 for thr in thresholds}
    data = memcache(pseudo_anno_path)[pseudo_annos]
    subset_data = {key: dict() for key in ("train", "val", "test")}
    subset2episodes = {key: set() for key in subset_data}
    for episode, subset in episode2subset.items():
        subset2episodes[subset].add(episode)
    for subset in subset_data:
        for word, worddict in tqdm.tqdm(data.items()):
            assert word in canonical_vocab, f"Expected {word} to be in 1064 vocab"
            keep = np.array(
                [x in subset2episodes[subset] for x in worddict["names"]])
            if keep.sum():
                if word not in subset_data[subset]:
                    subset_data[subset][word] = defaultdict(list)
                for key, val in worddict.items():
                    kept = np.array(val)[keep].tolist()
                    subset_data[subset][word][key].extend(kept)
                for thr in counts:
                    counts[thr] += (np.array(worddict["probs"])[keep] >
                                    thr).sum()
    data = subset_data
    for thr, val in counts.items():
        print(f"Found {val} annotations at confidences > {thr}")
    return data
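
A small, self-contained sketch of the subset filtering done above, with a fabricated per-word record; it assumes each word maps to parallel "names"/"probs"/"times" lists and that `episode2subset` assigns each episode to a split:

import numpy as np

worddict = {"names": ["ep-01", "ep-02", "ep-01"],
            "probs": [0.95, 0.60, 0.80],
            "times": [12.4, 80.2, 41.0]}
episode2subset = {"ep-01": "train", "ep-02": "val"}

# Keep only the entries whose episode falls in the train subset
keep = np.array([episode2subset[name] == "train" for name in worddict["names"]])
train_entries = {key: np.array(val)[keep].tolist() for key, val in worddict.items()}
print(train_entries)
# {'names': ['ep-01', 'ep-01'], 'probs': [0.95, 0.8], 'times': [12.4, 41.0]}
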
Example #6
def get_episode2subset_map(subset2episode: Path) -> Dict[str, str]:
    """Build a mapping that converts episode keys into their respective subsets
    """
    subset2episode = memcache(subset2episode)
    episode2subset = {}
    for subset, episodes in subset2episode.items():
        for episode in episodes:
            episode_key = episode.replace("/", "--")
            assert episode_key not in episode2subset, f"Duplicate key: {episode}!"
            episode2subset[episode_key] = subset
    return episode2subset
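
The key transformation in `get_episode2subset_map` can be seen in isolation below; the dict stands in for the pickle that `memcache(subset2episode)` would load, so its contents are fabricated:

subset2episode = {"train": ["show-a/episode-1"], "val": ["show-b/episode-2"]}

episode2subset = {}
for subset, episodes in subset2episode.items():
    for episode in episodes:
        # slashes are flattened so the key can serve as a single path component
        episode2subset[episode.replace("/", "--")] = subset
print(episode2subset)  # {'show-a--episode-1': 'train', 'show-b--episode-2': 'val'}
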
Example #7
    def load_features(self):
        root_feat = self.root_feat
        if self.distil_params is not None:
            self.distil_features = {}
            d_base_path = self.distil_params['base_path']

            teachers = list(
                map(lambda x: root_feat / Path(d_base_path + x),
                    self.distil_params['teachers']))

            for i, f_name in enumerate(teachers):
                self.distil_features[i] = memcache(f_name)

        feat_names = {
            key: self.visual_feat_paths(key)
            for key in self.paths["feature_names"]
        }
        feat_names.update(self.paths["custom_paths"])
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple(
                [Path(root_feat) / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concatenation of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                assert self.feat_aggregation[expert][
                    "aggregate"] == "concat", msg
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        if self.challenge_mode:
            self.load_challenge_text_features()
        else:
            text_feat_paths = self.paths["text_feat_paths"][self.text_feat]
            if isinstance(text_feat_paths, dict):
                text_features = memcache(root_feat / text_feat_paths["train"])
                text_features.update(
                    memcache(root_feat / text_feat_paths[self.split_name]))
            elif isinstance(text_feat_paths, (Path, str)):
                text_features = memcache(root_feat / text_feat_paths)
            else:
                raise TypeError(f"Unexpected type {type(text_feat_paths)}")
            self.text_features = text_features
            self.raw_captions = memcache(root_feat /
                                         self.paths["raw_captions_path"])
Example #8
def load_british_mouthings(mouthing_pkl_path: Path) -> dict:
    """Load mouthing predictions from disk and transform the keywords from US to UK
    English.
    """
    # Note: we leave the practice/practise dilemma for another time and stick with this
    # list for backwards compatibility
    us_mouthings = memcache(mouthing_pkl_path)
    british_mouthings = {}
    for subset, subdict in us_mouthings.items():
        british_mouthings[subset] = {
            US2UK_MAPPING.get(key, key): val
            for key, val in subdict.items()
        }
    return british_mouthings
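
A toy version of the US-to-UK remapping above. `US2UK_MAPPING` here is a fabricated stand-in; the real table is defined elsewhere in the source module (Example #17 below uses a similar inline spelling map):

US2UK_MAPPING = {"color": "colour", "center": "centre", "airplane": "aeroplane"}

us_subdict = {"color": [0.9], "dog": [0.7]}
british_subdict = {US2UK_MAPPING.get(key, key): val for key, val in us_subdict.items()}
print(british_subdict)  # {'colour': [0.9], 'dog': [0.7]}
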
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vis", action="store_true")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--fig_dir", type=Path, default="misc/BSLCP/figs")
    parser.add_argument("--config",
                        type=Path,
                        default="misc/BSLCP/data_paths.json")
    parser.add_argument(
        "--vocab_name",
        default="bsl1k_vocab",
        choices=["bsl1k_vocab", "BSLCP_all_glosses", "signdict_signbank"],
    )
    args = parser.parse_args()

    config = memcache(args.config)

    dest_path = Path(config[args.vocab_name]["anno_path"])
    vocab_path = config[args.vocab_name]["vocab_path"]
    if vocab_path:
        with open(vocab_path, "rb") as f:
            canonical_vocab = set(pickle.load(f)["words"])
    else:
        # We use an empty vocabulary to denote that no filtering should be performed
        canonical_vocab = set()
    fig_dir = args.fig_dir / args.vocab_name

    parse_annos(
        anno_dir=Path(config["raw_anno_dir"]),
        target_tiers=tuple(config["target_tiers"]),
        train_val_test_ratio=config["train_val_test_ratio"],
        raw_video_dir=Path(config["raw_video_dir"]),
        vocab_name=args.vocab_name,
        fig_dir=fig_dir,
        dest_path=dest_path,
        canonical_vocab=canonical_vocab,
        refresh=args.refresh,
        vis=args.vis,
    )
Example #10
def main(
    video_dir: Path,
    trim_format: str,
    pad_clip: float,
    limit: int,
    processes: int,
    json_anno_path: Path,
    anno_name: str,
    force_resize: int,
    refresh: bool,
    vis: bool,
):
    print(f"Processing {anno_name} annotations")
    data = memcache(json_anno_path)

    output_filenames = defaultdict(list)
    kwarg_list = []
    outs = set()
    count = 0
    for s in tqdm.tqdm(data.keys()):
        for word in tqdm.tqdm(data[s].keys()):
            N = len(data[s][word]["start"])
            for i in range(N):
                start_time = data[s][word]["start"][i] - pad_clip
                end_time = data[s][word]["end"][i] + pad_clip
                output_filename = construct_video_filename(
                    output_dir=video_dir,
                    set_name=s,
                    word=word,
                    name=Path(data[s][word]["video"][i]).stem,
                    start_time=time2tuple(start_time),
                    end_time=time2tuple(end_time),
                    trim_format=trim_format,
                )
                output_filenames[output_filename].append(
                    (start_time, end_time))
                source_file = Path(data[s][word]["video"][i])
                assert source_file.exists(
                ), f"Expected source file at {source_file}"
                kwargs = {
                    "refresh": refresh,
                    "start_time": start_time,
                    "end_time": end_time,
                    "output_filename": output_filename,
                    "source_file": source_file,
                    "force_resize": force_resize,
                }
                outs.add(output_filename)
                kwarg_list.append(kwargs)
                count += 1

    if vis:
        durations = np.array(
            [x["end_time"] - x["start_time"] for x in kwarg_list])
        step = 0.1
        bins = np.arange(0, np.ceil(durations.max()), step=step)
        values, _ = np.histogram(durations, bins=bins)
        plt.figure(figsize=(20, 10))
        x_ticks = bins[:-1] + (step / 2)
        plt.bar(x_ticks, values, width=step)
        font = {"family": "serif", "weight": "normal", "size": 26}
        matplotlib.rc("font", **font)
        plt.suptitle(f"BSLCP sign durations")
        plt.savefig("zz-bslcp-durations.png")

    if limit:
        kwarg_list = kwarg_list[:limit]
    func = extract_clip
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
    else:
        for kwargs in tqdm.tqdm(kwarg_list):
            func(**kwargs)
    print(f"Expected to produce: {len(kwarg_list)} outputs")
Example #11

if __name__ == "__main__":
    matplotlib.use("Agg")
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=0)
    p.add_argument("--refresh", action="store_true")
    p.add_argument("--vis", action="store_true")
    p.add_argument("--config", type=Path, default="misc/BSLCP/data_paths.json")
    p.add_argument("--processes", type=int, default=1)
    p.add_argument(
        "--anno_name",
        default="BSLCP_all_glosses",
        choices=["bsl1k_vocab", "BSLCP_all_glosses", "signdict_signbank"])
    args = p.parse_args()
    p_kwargs = vars(args)
    config = memcache(p_kwargs.pop("config"))
    p_kwargs.update({
        "force_resize":
        config["force_resize"],
        "json_anno_path":
        Path(config[args.anno_name]["anno_path"]),
        "trim_format":
        config["trim_format"],
        "video_dir":
        Path(config["data_dir"]) / config[args.anno_name]["video_dir"],
        "pad_clip":
        config["pad_clip"],
    })
    main(**p_kwargs)
Example #12
def extract_embeddings(
    text_embedding_config_path: Path,
    rel_dest_dir: Path,
    data_dir: Path,
    refresh: bool,
    validate_embeddings: bool,
    limit: int,
    processes: int,
    embedding_name: str,
    datasets: List[str],
):
    for dataset in datasets:
        dest_dir = data_dir / dataset / rel_dest_dir
        dest_name = embedding_name
        if limit:
            dest_name = f"{embedding_name}-limit{limit}"
        dest_path = dest_dir / f"{dest_name}.pkl"

        # if dest_path.exists() and not refresh:
        #     print(f"Found existing text embeddings at {dest_path}, skipping....")
        #     return

        dest_dir.mkdir(exist_ok=True, parents=True)
        # handle the activity-net exception
        if dataset == "activity-net":
            fname = "raw-captions-train-val_1.pkl"
        elif dataset == "QuerYDSegments":
            fname = "split_raw_captions_filtered.pkl"
        elif dataset == "QuerYD":
            fname = "raw_captions_combined_filtered.pkl"
        else:
            fname = "raw-captions.pkl"
        captions_path = data_dir / dataset / "structured-symlinks" / fname
        # import ipdb; ipdb.set_trace()
        video_descriptions = memcache(captions_path)
        with open(text_embedding_config_path, "r") as f:
            text_embedding_config = json.load(f)

        force_cpu = text_embedding_config[embedding_name].pop(
            "force_cpu", False)
        dev_name = "cuda:0" if torch.cuda.device_count(
        ) > 0 and not force_cpu else "cpu"
        device = torch.device(dev_name)

        model = prepare_embedding_model(embedding_name, text_embedding_config)
        model.set_device(device)
        if limit:
            keep = set(list(video_descriptions.keys())[:limit])
            video_descriptions = {
                key: val
                for key, val in video_descriptions.items() if key in keep
            }

        computed_embeddings = {}
        kwarg_list = []
        for key, descriptions in tqdm.tqdm(video_descriptions.items()):
            kwarg_list.append({"key": key, "descriptions": descriptions})

        all_failed_tokens = []
        func = extract_embeddings_for_video
        if processes > 1:
            # Note: An experimental approach with Ray.  Unfortunately, it seems that
            # the overhead is too great to justify this approach (it's slower than
            # using a single process). TODO(Samuel): revisit.
            func = ray.remote(extract_embeddings_for_video)
            ray.init(num_cpus=processes)

            # Store model in shared memory object store to avoid multiple copies
            model_id = ray.put(model)

            def to_iterator(obj_ids):
                while obj_ids:
                    done, obj_ids = ray.wait(obj_ids)
                    yield ray.get(done[0])

            result_ids = [
                func.remote(model=model_id, **kwargs) for kwargs in kwarg_list
            ]
            zipped = zip(to_iterator(result_ids), kwarg_list)
            for (embeddings,
                 failed), kwargs in tqdm.tqdm(zipped, total=len(result_ids)):
                computed_embeddings[kwargs["key"]] = embeddings
                all_failed_tokens.extend(failed)
        else:
            for kwargs in tqdm.tqdm(kwarg_list):
                embeddings_for_video, failed_tokens = func(**kwargs,
                                                           model=model)
                computed_embeddings[kwargs["key"]] = embeddings_for_video
                all_failed_tokens.extend(failed_tokens)

        stats = [
            len(x) for sublist in computed_embeddings.values() for x in sublist
        ]
        print(f"Average num embedding tokens: {np.mean(stats):.1f} tokens")
        fail_rate = len(all_failed_tokens) / np.sum(stats)
        stat_str = f"{len(all_failed_tokens)}/{np.sum(stats)} [{100 * fail_rate:.1f}%]"
        print(f"Failed tokens: {stat_str} tokens")

        if validate_embeddings:
            validate_embeddings_against_reference(
                computed_embeddings=computed_embeddings,
                embedding_name=embedding_name,
                dataset=dataset,
            )
        with BlockTimer(f"Writing embeddings to {dest_path}"):
            with open(dest_path, "wb") as f:
                pickle.dump(computed_embeddings, f)
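
The summary statistics printed above can be checked on fabricated data; the sketch below assumes `computed_embeddings` maps each video to a list of per-caption token-embedding matrices, so `len(x)` counts tokens per caption:

import numpy as np

computed_embeddings = {"vid-a": [np.zeros((12, 300)), np.zeros((8, 300))],
                       "vid-b": [np.zeros((20, 300))]}
all_failed_tokens = ["jabberwock"]

stats = [len(x) for sublist in computed_embeddings.values() for x in sublist]
fail_rate = len(all_failed_tokens) / np.sum(stats)
print(f"Average num embedding tokens: {np.mean(stats):.1f} tokens")  # 13.3
print(f"Failed tokens: {len(all_failed_tokens)}/{np.sum(stats)} [{100 * fail_rate:.1f}%]")  # 1/40 [2.5%]
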
Example #13
def parse_subtitles(
    subtitle_pkl_path: Path,
    subtitle_reference_mouthings: Path,
    canonical_vocab: frozenset,
    prob_thres: Number,
    episode2subset: frozendict,
    pkl_file: Path = None,
    episode_filter: str = None,
    save_pkl: bool = True,
    temporal_tol: int = 4,
) -> Dict:
    """Extract raw subtitles into a format that mimics the mouthing predictions. Use
    frozen data structures to allow LRU caching.
    """
    subs = memcache(subtitle_pkl_path)
    ref_mouthings = load_british_mouthings(subtitle_reference_mouthings)

    # Filter to episodes with available subtitles
    subset2episodes = defaultdict(list)
    for episode, subset in episode2subset.items():
        episode = episode.replace("/", "--")
        if episode_filter and episode_filter not in episode:
            continue
        if episode in subs:
            subset2episodes[subset].append(episode)
    print(
        f"Filtered to {sum(len(x) for x in subset2episodes.values())} episodes"
    )

    data = {}
    count = 0
    for subset, episodes in subset2episodes.items():
        data[subset] = {}
        for episode in tqdm.tqdm(episodes):
            episode_subs = subs[episode]
            for sub in tqdm.tqdm(episode_subs):
                if isinstance(sub["span"], list):
                    text = "".join([x["text"] for x in sub["span"]])
                else:
                    text = sub["span"]["text"]
                subtitle_words = [
                    clean_subtitle_word(x) for x in text.split(" ")
                ]
                for keyword in canonical_vocab:
                    keyword_ref_mouthings = ref_mouthings[subset][keyword]
                    keep = keyword_ref_mouthings["names"] == episode
                    conf_keep = np.array(
                        keyword_ref_mouthings["probs"]) > prob_thres
                    mask = conf_keep * keep
                    if prob_thres and not (keep.sum() and mask.sum()):
                        continue
                    candidate_times = np.array(
                        keyword_ref_mouthings["times"])[mask]

                    if keyword not in data[subset]:
                        data[subset][keyword] = {
                            "names": [],
                            "probs": [],
                            "times": []
                        }
                    if keyword in subtitle_words:
                        sub_time = sub["start"] + (sub["end"] -
                                                   sub["start"]) / 2
                        candidate_times = np.array(
                            keyword_ref_mouthings["times"])[mask]
                        if prob_thres:
                            # we only keep times that are close to a confident mouthing
                            if np.abs(candidate_times -
                                      sub_time).min() > temporal_tol:
                                continue

                        data[subset][keyword]["names"].append(episode)
                        data[subset][keyword]["probs"].append(1)
                        data[subset][keyword]["times"].append(sub_time)
                        count += 1
    print(f"Proposing {count} subtitle crops")
    if save_pkl:
        pkl.dump(data, open(pkl_file, "wb"))
    return data
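
The temporal-tolerance test used above keeps a subtitle keyword only when the subtitle midpoint falls within `temporal_tol` seconds of a confident reference mouthing. A self-contained toy version with fabricated times:

import numpy as np

temporal_tol = 4  # seconds, matching the default above
candidate_times = np.array([10.0, 52.0])  # confident mouthing times for the keyword
sub_time = 12.5                           # midpoint of the subtitle interval

keep_subtitle = np.abs(candidate_times - sub_time).min() <= temporal_tol
print(keep_subtitle)  # True: 12.5 s is within 4 s of the mouthing at 10.0 s
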
Example #14
    def __init__(
        self,
        root_path="data/wlasl",
        inp_res=224,
        resize_res=256,
        setname="train",
        scale_factor=0.1,
        num_in_frames=64,
        evaluate_video=False,
        hflip=0.5,
        stride=0.5,
        ram_data=True,
        gpu_collation=False,
        use_bbox=True,
        monolithic_pkl_path="data/pickled-videos/wlasl-compressed-quality-90-resized-256x256.pkl",
        input_type="rgb",
        pose_keys=["body", "face", "lhnd", "rhnd"],
        mask_rgb=None,
        mask_type=None,
        mask_prob=1.0,
    ):
        self.root_path = root_path
        self.setname = setname  # train, val or test
        self.inp_res = inp_res
        self.resize_res = resize_res
        self.scale_factor = scale_factor
        self.num_in_frames = num_in_frames
        self.evaluate_video = evaluate_video
        self.hflip = hflip
        self.gpu_collation = gpu_collation
        self.stride = stride
        self.use_bbox = use_bbox
        self.input_type = input_type
        self.pose_keys = pose_keys
        self.mask_rgb = mask_rgb
        self.mask_type = mask_type

        self.video_folder = "videos_360h_25fps"
        if Path(monolithic_pkl_path).exists() and ram_data:
            print(f"Loading monolithic pickle from {monolithic_pkl_path}")
            self.video_data_dict = memcache(monolithic_pkl_path)
        else:
            self.video_data_dict = None

        infofile = os.path.join(root_path, "info", "info.pkl")
        print(f"Loading {infofile}")
        data = pkl.load(open(infofile, "rb"))

        if self.input_type == "pose":
            pose_pkl = os.path.join(root_path, "info", "pose.pkl")
            print(f"Loading {pose_pkl}")
            self.pose_data = pkl.load(open(pose_pkl, "rb"))
        if self.mask_rgb:
            assert mask_type
        if self.mask_rgb == "face":
            face_pkl = os.path.join(root_path, "info", "face_bbox.pkl")
            print(f"Loading {face_pkl}")
            self.face_data = pkl.load(open(face_pkl, "rb"))

        # Use this to take subset
        if self.input_type == "pose" or self.mask_rgb:
            mouth_pkl = os.path.join(root_path, "info", "mouth_bbox.pkl")
            print(f"Loading {mouth_pkl}")
            self.mouth_data = pkl.load(open(mouth_pkl, "rb"))

        self.videos = [s.strip() for s in data["videos"]["name"]]
        self.videos = np.asarray(self.videos)

        self.classes = data["videos"]["word_id"]
        with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f:
            self.class_names = f.read().splitlines()

        meta_key = self.video_folder
        if gpu_collation and not self.video_data_dict:
            # GPU collation requires all inputs to share the same spatial input size
            self.video_folder = "videos-resized-256fps-256x256"
        self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation)

        bboxes_orig = [s for s in np.asarray(data["videos"]["box"])]
        self.bboxes = []
        for i, bb in enumerate(bboxes_orig):
            ht = data["videos"]["videos_original"]["H"][i]
            wt = data["videos"]["videos_original"]["W"][i]
            xmin, ymin, xmax, ymax = bb
            bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt]
            self.bboxes.append(bb_norm)

        self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])
        if self.setname == "val":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0])
        elif self.setname == "test":
            self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])

        if self.input_type == "pose" or self.mask_rgb:
            # Valid mouth ix should be equivalent to valid face ix, valid pose ix etc
            valid_mouth_ix = np.where(
                np.array([i is not None for i in self.mouth_data])
            )[0]
            if self.setname == "val" or self.setname == "test":
                print(f"{len(self.train)} train, {len(self.valid)} val samples.")
            print("Taking subsets according to having pose or not")
            self.train = list(set(self.train).intersection(set(valid_mouth_ix)))
            if self.setname == "val" or self.setname == "test":
                self.valid = list(set(self.valid).intersection(set(valid_mouth_ix)))
                print(f"{len(self.train)} train, {len(self.valid)} val samples.")

        if evaluate_video:
            self.valid, self.t_beg = self._slide_windows(self.valid)

        VideoDataset.__init__(self)
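
The split bookkeeping in the constructor above boils down to index selection on a 0/1/2 split column. A toy sketch with a fabricated split array:

import numpy as np

split = np.asarray([0, 0, 1, 2, 0, 1])  # 0: train, 1: val, 2: test

train_ix = list(np.where(split == 0)[0])
val_ix = list(np.where(split == 1)[0])
test_ix = list(np.where(split == 2)[0])
print(len(train_ix), len(val_ix), len(test_ix))  # 3 2 1
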
Example #15
def main(
    data_dir: Path,
    anno_pkl_path: Path,
    video_dir: Path,
    canonical_1064_words: Path,
    refresh: bool,
    prob_thres: float,
    worker_id: int,
    num_partitions: int,
    limit: int,
    processes: int,
    mouthing_window_secs: int,
    progress_markers: int,
    aggregate: bool,
    pseudo_annos: str,
    episode2subset: Dict[str, str],
    trim_format: str = "%06d",
):
    path_kwargs = {
        "limit": limit,
        "data_dir": data_dir,
        "pseudo_annos": pseudo_annos,
        "prob_thres": prob_thres,
        "mouthing_window_secs": mouthing_window_secs,
    }
    with open(canonical_1064_words, "rb") as f:
        canonical_vocab = set(pkl.load(f)["words"])

    if aggregate:
        dest_path = gen_paths(worker_id=0, num_partitions=1,
                              **path_kwargs)["info"]
        if dest_path.exists() and not refresh:
            print(f"Found existing info file at {dest_path}, skipping...")
            return
        info = create_info_structure()
        for ii in range(num_partitions):
            src_path = gen_paths(worker_id=ii,
                                 num_partitions=num_partitions,
                                 **path_kwargs)["info"]
            worker_info = memcache(src_path)
            msg = "Expected worker info to match the target 1064 vocab"
            assert set(worker_info["words"]) == canonical_vocab, msg
            if ii == 0:
                # we can update the words with the first worker
                info["words"] = worker_info["words"]
                info["words_to_id"] = worker_info["words_to_id"]
            for key in info["videos"]:
                if key == "videos":
                    for subkey in info["videos"]["videos"]:
                        info["videos"]["videos"][subkey].extend(
                            worker_info["videos"]["videos"][subkey])
                else:
                    info["videos"][key].extend(worker_info["videos"][key])
        print(f"Writing aggregated info to {dest_path}")
        with open(dest_path, "wb") as f:
            pkl.dump(info, f)
        return

    paths = gen_paths(worker_id=worker_id,
                      num_partitions=num_partitions,
                      **path_kwargs)
    if paths["info"].exists() and not refresh:
        print(f"Found existing info file at {paths['info']}, skipping...")
        return

    data = create_info_structure()
    words = set()
    sets = ["train", "val", "test"]
    set_dict = {"train": 0, "val": 1, "test": 2}
    all_data = load_data(
        pseudo_annos=pseudo_annos,
        anno_pkl_path=anno_pkl_path,
        canonical_vocab=canonical_vocab,
        episode2subset=episode2subset,
    )
    all_data = filter_words_by_confidence(all_data, prob_thres)
    print(f"Using a vocabulary of {len(canonical_vocab)} words for BBC")
    words = list(sorted(canonical_vocab))

    # Write to TXT file
    with open(paths["words"], "w") as dict_file:
        words_to_id = {}
        for i, w in enumerate(words):
            words_to_id[w] = i
            dict_file.write(f"{i:05d} {w}\n")

    data["words"] = words
    data["words_to_id"] = words_to_id

    t0 = time.time()
    if num_partitions == 1:
        worker_words = set(words)
    else:
        worker_words = np.array_split(words, num_partitions)[worker_id]

    count = 0
    kwarg_list = []
    for s in sets:  # all_data.keys():
        subset_total = len(all_data[s])
        for word_cnt, word in enumerate(all_data[s].keys()):
            assert word in words_to_id, f"Unknown word: {word}"
            if limit and count >= limit:
                continue
            if word not in worker_words:
                continue
            N = len(all_data[s][word]["names"])
            delta = time.time() - t0
            print(
                f"{delta:0.2f} sec {s} {word_cnt}/{subset_total} {word} [{N} samples]"
            )
            for i in range(N):
                if all_data[s][word]["probs"][i] > prob_thres:
                    start_time, end_time = take_interval_from_peak(
                        all_data[s][word]["times"][i])
                    output_filename = construct_video_filename(
                        output_dir=video_dir,
                        set_name=s,
                        word=word,
                        name=all_data[s][word]["names"][i],
                        start_time=start_time,
                        end_time=end_time,
                        trim_format=trim_format,
                    )
                    if os.path.exists(output_filename):
                        # Video resolution information
                        name = os.path.join(s, word,
                                            os.path.basename(output_filename))
                        kwargs = {
                            "count": count,
                            "word": word,
                            "name": name,
                            "word_id": words_to_id[word],
                            "split": set_dict[s],
                            "processes": processes,
                            "mouthing_time": all_data[s][word]["times"][i],
                            "mouthing_prob": all_data[s][word]["probs"][i],
                            "output_filename": output_filename,
                            "progress_markers": progress_markers,
                        }
                        kwarg_list.append(kwargs)
                        count += 1

    # Enable the worker to print progress.
    for kwargs in kwarg_list:
        kwargs["total"] = len(kwarg_list)

    func = update_meta
    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            meta = starmap_with_kwargs(pool=pool,
                                       func=func,
                                       kwargs_iter=kwarg_list)
    else:
        meta = []
        for kwargs in tqdm.tqdm(kwarg_list):
            meta.append(func(**kwargs))

    # Filter videos that failed to return meta data
    pre_filter = len(meta)
    meta = [x for x in meta if x]
    print(
        f"{len(meta)}/{pre_filter} were successfully parsed for meta information"
    )

    # check that ordering was preserved by multiprocessing
    counts = [x["count"] for x in meta]
    assert list(sorted(counts)) == counts, "Expected meta items to be in order"

    for x in tqdm.tqdm(meta):
        data["videos"]["videos"]["T"].append(x["video_res_t"])
        data["videos"]["videos"]["W"].append(x["video_res_w"])  # 480
        data["videos"]["videos"]["H"].append(x["video_res_h"])  # 480
        data["videos"]["videos"]["duration_sec"].append(
            x["video_duration_sec"])
        data["videos"]["videos"]["fps"].append(x["video_fps"])  # 25
        data["videos"]["word"].append(x["word"])
        data["videos"]["word_id"].append(x["word_id"])
        data["videos"]["split"].append(x["split"])
        data["videos"]["name"].append(x["name"])
        data["videos"]["mouthing_time"].append(x["mouthing_time"])
        data["videos"]["mouthing_prob"].append(x["mouthing_prob"])
    print(f"Saving info file to {paths['info']}...")
    pkl.dump(data, open(paths["info"], "wb"))
Example #16
    def load_features(self):
        root_feat = Path(self.root_feat)
        feat_names = {
            key: self.visual_feat_paths(key)
            for key in self.paths["feature_names"]
        }
        feat_names.update(self.paths["custom_paths"])
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple(
                [root_feat / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concat of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                assert self.feat_aggregation[expert][
                    "aggregate"] == "concat", msg
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                if expert == "speech":
                    features_defaults = defaultdict(lambda: np.zeros((1, 300)))
                    features_defaults.update(features_)
                    features_ = features_defaults

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        text_feat_paths = self.paths["text_feat_paths"]
        text_features = memcache(root_feat / text_feat_paths["train"])
        split_names = {"dev": "val", "official": "test"}
        text_features.update(
            memcache(root_feat /
                     text_feat_paths[split_names[self.split_name]]))
        key_map = memcache(root_feat / self.paths["dict_youtube_mapping_path"])
        inverse_map = {}
        for key, value in key_map.items():
            inverse_map[value] = key
        self.text_features = {
            inverse_map[key]: val
            for key, val in text_features.items()
        }
        self.raw_captions = memcache(root_feat /
                                     self.paths["raw_captions_path"])

        if "detection" in self.ordered_experts:
            # Example processing
            processed = {}
            for key, subdict in self.features["detection"].items():
                box, conf = subdict["detection_boxes"], subdict[
                    "detection_scores"]
                raw = subdict["raw_feats_avg"]
                processed[key] = np.concatenate(
                    (box, conf.reshape(-1, 1), raw), axis=1)
            self.features["detection"] = processed

        if "openpose" in self.ordered_experts:
            # Example processing
            processed = {}
            for key, subdict in self.features["openpose"].items():
                raw = np.concatenate(subdict["matrix"], axis=1)
                processed[key] = raw.transpose(1, 0, 2).reshape(-1, 3 * 18)
            self.features["openpose"] = processed
Example #17
def main(
    data_dir: Path,
    json_anno_path: Path,
    video_dir: Path,
    word_data_pkl: Path,
    trim_format: str,
    anno_name: str,
    refresh: bool,
):
    print(f"Creating info file for {anno_name} annotations")
    info_dict_dir = data_dir / "info" / anno_name
    info_dict_dir.mkdir(exist_ok=True, parents=True)
    info_file = info_dict_dir / "info.pkl"
    if info_file.exists() and not refresh:
        print("Found existing info file")
        if word_data_pkl.exists() and not refresh:
            print("Found existing word_data_pkl file")
        else:
            info = memcache(info_file)
            word_data_pkl_data = {
                key: info[key]
                for key in ("words", "words_to_id")
            }
            with open(word_data_pkl, "wb") as f:
                pkl.dump(word_data_pkl_data, f)
            print(f"Wrote word_data_pkl to {word_data_pkl}")
        return
    dict_file = open(info_dict_dir / "words.txt", "w")

    data = {}
    words = set()
    data["videos"] = {}
    data["videos"]["name"] = [
    ]  # Our naming convention (unique ID for a video)
    data["videos"]["word"] = []
    data["videos"]["word_id"] = []
    data["videos"]["split"] = []  # 0: train, 1: val, 2: test

    # Resolution info
    data["videos"]["videos"] = {}
    data["videos"]["videos"]["T"] = []
    data["videos"]["videos"]["W"] = []
    data["videos"]["videos"]["H"] = []
    data["videos"]["videos"]["duration_sec"] = []
    data["videos"]["videos"]["fps"] = []

    # Extra annot
    data["videos"]["start"] = []
    data["videos"]["end"] = []

    sets = ["train", "val", "test"]
    set_dict = {"train": 0, "val": 1, "test": 2}

    all_data = memcache(json_anno_path)

    words = set()
    for subset, subdict in all_data.items():
        words.update(subdict.keys())

    # Only use train words from reference
    print(f"{len(words)} words")
    mapping = {
        "airplane": "aeroplane",
        "center": "centre",
        "favor": "favour",
        "gray": "grey",
        "practice": "practise",
        "recognize": "recognise",
        "yogurt": "yoghurt",
    }
    # fix spellings to English
    updated_words = [mapping.get(word, word) for word in words]
    words = list(sorted(set(updated_words)))

    # Write to TXT file
    words_to_id = {}
    for i, w in enumerate(words):
        words_to_id[w] = i
        dict_file.write(f"{i:05d} {w}\n")
    dict_file.close()

    data["words"] = words
    data["words_to_id"] = words_to_id

    cnt = 0
    t0 = time.time()

    for s in sets:  # all_data.keys():
        for word_cnt, word in enumerate(all_data[s].keys()):
            if word in words_to_id:
                print(f"{time.time() - t0:0.2f} sec {s} {word_cnt} {word}")
                N = len(all_data[s][word]["start"])
                for i in range(N):
                    start_time = all_data[s][word]["start"][i]
                    end_time = all_data[s][word]["end"][i]
                    output_filename = construct_video_filename(
                        word=word,
                        set_name=s,
                        output_dir=video_dir,
                        name=Path(all_data[s][word]["video"][i]).stem,
                        start_time=time2tuple(start_time),
                        end_time=time2tuple(end_time),
                        trim_format=trim_format,
                    )
                    if os.path.exists(output_filename):
                        # Video resolution information
                        (
                            video_res_t,
                            video_res_w,
                            video_res_h,
                            video_fps,
                            video_duration_sec,
                        ) = _get_video_info(str(output_filename))
                        # Indication that the video is readable
                        if video_res_t:
                            # if not (video_fps == row['fps']):
                            #     print(s, i, video_fps, row['fps'])
                            data["videos"]["videos"]["T"].append(video_res_t)
                            data["videos"]["videos"]["W"].append(
                                video_res_w)  # 480
                            data["videos"]["videos"]["H"].append(
                                video_res_h)  # 480
                            data["videos"]["videos"]["duration_sec"].append(
                                video_duration_sec)
                            data["videos"]["videos"]["fps"].append(
                                video_fps)  # 25

                            data["videos"]["word"].append(word)
                            data["videos"]["word_id"].append(words_to_id[word])

                            data["videos"]["split"].append(set_dict[s])
                            name = os.path.join(
                                s, word, os.path.basename(output_filename))
                            data["videos"]["name"].append(name)

                            data["videos"]["start"].append(
                                all_data[s][word]["start"][i])
                            data["videos"]["end"].append(
                                all_data[s][word]["end"][i])
                            cnt += 1
    print(f"Writing results to {info_file}")
    pkl.dump(data, open(info_file, "wb"))