Code Example #1
    def load_features(self):
        root_feat = self.root_feat
        feat_agg = self.feat_aggregation

        feat_names = {
            "flow": f"i3d-i3d-{feat_agg['flow']}.pickle",
            "face": f"VGGFace2-ResNet50-face-{feat_agg['face']}.pickle",
            "rgb": f"{self.rgb_model_name}-imagenet-{feat_agg['rgb']}.pickle",
            "scene": f"densenet161-scene-{feat_agg['scene']}.pickle",
            "ocr": "ocr-feats.pkl",
            "audio": "vggish-audio-raw.pickle",
            "speech": "stt_w2v.pickle",
        }
        assert feat_agg["scene"] == "max", "expected max pooling over scenes"
        feat_paths = {
            key: Path(root_feat) / value
            for key, value in feat_names.items()
        }

        if self.text_feat == "openai":
            text_feat_path = pjoin(root_feat, "openai-feats.pkl")
        else:
            raise ValueError(f"Text features {self.text_feat} not supported ")

        features = {
            expert: memcache(path)
            for expert, path in feat_paths.items()
        }
        text_features = memcache(text_feat_path)
        self.features = features
        self.text_features = text_features
        self.raw_captions = memcache(
            Path(self.data_dir) / "processing/raw-captions.pkl")
Code Example #2
    def load_features(self):
        root_feat = Path(self.root_feat)
        feat_names = {
            key: self.visual_feat_paths(key)
            for key in self.paths["feature_names"]
        }
        feat_names.update(self.paths["custom_paths"])
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple(
                [root_feat / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concatenation of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                assert self.feat_aggregation[expert][
                    "aggregate"] == "concat", msg
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        self.raw_captions = memcache(root_feat /
                                     self.paths["raw_captions_path"])
        self.text_features = memcache(root_feat / self.paths["text_feat_path"])
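When an expert has several aggregate files (e.g. avg- and max-pooled variants), the code above joins them with concat_features. The sketch below shows the general idea of key-wise concatenation, assuming every file stores a {video_id: np.ndarray} dict; concat_feature_dicts is a hypothetical helper, not the repository's cached concat_features.

import numpy as np


def concat_feature_dicts(feature_dicts, axis=1):
    # Concatenate per-video arrays that share the same keys, e.g. joining the
    # avg-pooled and max-pooled variants of one expert along the feature axis.
    common_keys = set(feature_dicts[0])
    for feats in feature_dicts[1:]:
        common_keys &= set(feats)
    return {key: np.concatenate([feats[key] for feats in feature_dicts], axis=axis)
            for key in common_keys}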
Code Example #3
    def load_features(self):
        root_feat = self.root_feat
        feat_names = {
            "face": "VGGFace2-ResNet50-face-avg.pickle",
            "flow": "i3d-i3d-avg.pickle",
            "rgb": f"{self.rgb_model_name}-imagenet-avg-nocrop.pickle",
            "scene": "densenet161-scene-max.pickle",
            "ocr": "AN_OCR_ALL_unique_video_w2v.pkl",
            "audio": "vggish-audio-raw.pickle",
            "speech": "stt_w2v.pickle",
        }
        feat_paths = {
            key: Path(root_feat) / value
            for key, value in feat_names.items()
        }

        if self.text_feat == "openai":
            text_feat_train_path = pjoin(root_feat, "openai-train.pkl")
            text_feat_val1_path = pjoin(root_feat, "openai-val1.pkl")
            text_feat_val2_path = pjoin(root_feat, "openai-val2.pkl")
        else:
            raise ValueError(f"Text features {self.text_feat} not supported ")

        features = {
            expert: memcache(path)
            for expert, path in feat_paths.items()
        }
        text_features = memcache(text_feat_train_path)
        if self.split_name == "val1":
            text_features.update(memcache(text_feat_val1_path))
        elif self.split_name == "val2":
            text_features.update(memcache(text_feat_val2_path))
        else:
            raise ValueError(
                f"unrecognised activity-net split: {self.split_name}")

        self.features = features
        self.text_features = text_features
        self.raw_captions = memcache(self.raw_captions_path)
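The split handling in Example #3 amounts to a small lookup from split name to the extra text-feature file that gets merged into the training captions. A hypothetical refactor of that if/elif chain, reusing the same file names, might look like this:

from pathlib import Path


def resolve_text_feat_paths(root_feat, split_name):
    # Return the training text-feature path plus the eval file for the chosen split.
    eval_files = {"val1": "openai-val1.pkl", "val2": "openai-val2.pkl"}
    if split_name not in eval_files:
        raise ValueError(f"unrecognised activity-net split: {split_name}")
    root = Path(root_feat)
    return root / "openai-train.pkl", root / eval_files[split_name]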
Code Example #4
    def configure_train_test_splits(self, split_name):
        self.restrict_test_captions = None
        if split_name == "miech":
            # For now, we follow Antoine's approach of using the first text caption
            # for the retrieval task when evaluating on his custom split.
            train_list_path = "train_list_miech.txt"
            test_list_path = "test_list_miech.txt"
        elif split_name in "jsfusion":
            train_list_path = "train_list_jsfusion.txt"
            test_list_path = "val_list_jsfusion.txt"
            # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses all
            # videos, but randomly samples a single caption per video from the test
            # set for evaluation. To reproduce this evaluation, we use the indices
            # of the test captions, and restrict to this subset during eval.
            test_cap_idx_path = pjoin(self.root_feat,
                                      "jsfusion_val_caption_idx.pkl")
            self.restrict_test_captions = memcache(test_cap_idx_path)
        elif split_name in {"full-val", "full-test"}:
            train_list_path = "train_list_full.txt"
            if split_name == "full-val":
                test_list_path = "val_list_full.txt"
            else:
                test_list_path = "test_list_full.txt"
        else:
            msg = "unrecognised MSRVTT split: {}"
            raise ValueError(msg.format(split_name))

        train_list_path = pjoin(self.root_feat, train_list_path)
        test_list_path = pjoin(self.root_feat, test_list_path)

        print("loading training/val splits....")
        tic = time.time()
        with open(train_list_path) as f:
            self.train_list = f.readlines()
        self.train_list = [x.strip() for x in self.train_list]
        with open(test_list_path) as f:
            self.test_list = f.readlines()
        self.test_list = [x.strip() for x in self.test_list]
        print("done in {:.3f}s".format(time.time() - tic))
        self.split_name = split_name
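Each split file is just a newline-separated list of video ids, so the repeated readlines/strip pattern above can be captured in one small helper. A minimal sketch (the helper name is hypothetical):

def read_video_list(list_path):
    # One video id per line; strip trailing newlines and surrounding whitespace.
    with open(list_path) as f:
        return [line.strip() for line in f]

For instance, self.train_list could then be built as read_video_list(train_list_path).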
Code Example #5
    def _load_data(self):
        self.expert_data = {}
        for expert in self.experts_used:
            if expert != 'context':
                data_pth = osj(self.data_dir, 'features', self.experts[expert])
                self.expert_data[expert] = memcache(data_pth)
                memory_summary()

        clips_with_data = []
        for expert in self.expert_data:
            if expert != 'description' and expert != 'label':
                clips_with_data += self.expert_data[expert].keys()

        # debugging (input random tensors)
        random = False
        if random:
            for expert in self.expert_data:
                for videoid in self.expert_data[expert]:
                    self.expert_data[expert][videoid] = np.random.randn(
                        *self.expert_data[expert][videoid].shape)

        # debugging (input zero tensors)
        zeros = False
        if zeros:
            for expert in self.expert_data:
                for videoid in self.expert_data[expert]:
                    self.expert_data[expert][videoid] = np.zeros(
                        self.expert_data[expert][videoid].shape)

        clips_with_data = set(clips_with_data)

        #sanity check
        #pdb.set_trace()
        #if not self.data['clips'].index.isin(clips_with_data).all():
        #    print(self.data['clips'][~self.data['clips'].index.isin(clips_with_data)].index)
        #    raise NotImplementedError
        self.data['clips'] = self.data['clips'][self.data['clips'].index.isin(
            clips_with_data)]
        print(f'{self.split} size: {len(self.data["clips"])} clips')
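The final filtering step keeps only clips for which at least one expert provides features, by intersecting the clip index with the union of expert keys. A self-contained sketch with toy data standing in for self.data['clips']:

import pandas as pd

# Toy stand-in for self.data['clips']; the index plays the role of clip ids.
clips = pd.DataFrame({"duration": [3.2, 5.0, 7.5]},
                     index=["clip_a", "clip_b", "clip_c"])
clips_with_data = {"clip_a", "clip_c"}  # clips covered by at least one expert
clips = clips[clips.index.isin(clips_with_data)]
print(f"kept {len(clips)} clips")  # -> kept 2 clips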
Code Example #6
    def __init__(self,
                 data_dir,
                 raw_input_dims,
                 cut_name,
                 split_name,
                 max_text_words=30,
                 max_expert_tokens=8,
                 clip_duration=float("Inf"),
                 caption_length=float("Inf"),
                 captions_per_video=1,
                 restrict_train_captions=0,
                 training=False,
                 split_size=1.0,
                 load_in_ram=False,
                 remove_stop_words=False,
                 n_pairs=1,
                 tokenizer=None,
                 shuffle_feats_t=False,
                 loaded_data=None,
                 query_shuffling="indiv",
                 cross_seed=0,
                 temporal_encoding_window=1):

        self.sanity_checks = False
        self.train = training
        self.data_dir = data_dir
        self.restrict_train_captions = restrict_train_captions
        self.max_text_words = max_text_words
        self.max_expert_tokens = max_expert_tokens
        self.root_feat = pathlib.Path(data_dir) / "symlinked-feats"
        self.experts = set(raw_input_dims.keys())
        self.rgb_shots = 1
        self.cut_name = cut_name
        self.split_name = split_name
        self.split_size = split_size
        self.load_in_ram = load_in_ram
        self.remove_stop_words = remove_stop_words
        self.n_pairs = n_pairs
        self.clip_duration = clip_duration
        self.caption_length = caption_length
        self.tokenizer = tokenizer
        self.shuffle_feats_t = shuffle_feats_t
        self.query_shuffling = query_shuffling
        self.cross_seed = cross_seed
        self.temporal_encoding_window = temporal_encoding_window

        self.data_aug = False
        self.max_ratio_rem = 0

        if self.cut_name == "c":
            # The challenge features are stored in pkl files
            self.reading_from = "pkl"
        else:
            # The ECCV20 paper features are stored in multiple h5 files
            self.reading_from = "mult_h5"

        self.cache_dir = os.path.join(os.path.dirname(data_dir),
                                      "vid_feat_files", self.reading_from)
        logger.debug("Cache_dir: %s", self.cache_dir)

        # This attribute can be overridden by subclasses, so it must be set
        # before the `configure_train_test_splits()` method call
        self.restrict_test_captions = None

        # Use a single caption per video when forming training minibatches
        # (different captions from the same video may still be used across
        # different minibatches)
        if self.train:
            self.captions_per_video = 1
        else:
            self.captions_per_video = captions_per_video

        self.ordered_experts = list(raw_input_dims.keys())

        self.configure_train_test_splits(cut_name=cut_name,
                                         split_name=split_name)
        self.expert_timings = expert_timings.expert_timings

        # If split_size is type(int) it represents the number of samples that we
        # keep.
        # If split_size is type(float) it represents the ratio of the original
        # split size that we keep.
        original_size = len(self.vid_list)
        if split_size >= 2 and isinstance(split_size, int):
            nb_samples = split_size
        elif 0 <= split_size <= 1 and isinstance(split_size, float):
            nb_samples = int(split_size * original_size)

        self.vid_list = self.vid_list[:nb_samples]
        self.num_train = len(self.vid_list)

        # Display info about the dataset split size
        main_msg = f"Number of videos in {self.dataset_name}: {original_size}"
        if self.num_train == original_size:
            msg = ""
        else:
            msg = f" but we keep only {self.num_train} (split_size = {split_size})"
        logger.debug(main_msg + msg)

        # Log how many captions per video are kept
        logger.debug("We consider %s captions per video",
                     self.captions_per_video)
        self.raw_input_dims = raw_input_dims

        visualisations = True
        if visualisations:
            logger.debug("Storing paths to enable visualisations ...")

            symlink_to_root = pathlib.Path.cwd() / "project_root"
            # If symlink to root can be accessed, follow that path
            # Otherwise, follow the current working directory
            # (that should be the project root)
            if symlink_to_root.exists():
                video_paths = [
                    os.readlink(str(symlink_to_root)) /
                    pathlib.Path(data_dir) / f"videos/{x}.mp4"
                    for x in self.vid_list
                ]
            else:
                video_paths = [
                    pathlib.Path.cwd() / pathlib.Path(data_dir) /
                    f"videos/{x}.mp4" for x in self.vid_list
                ]

            self.video_paths = video_paths

        self.missing_val = 0

        if not os.path.exists(self.cache_dir) and self.reading_from != "pkl":
            logger.warning("%s does not exist", self.cache_dir)

        self.variable_sz_experts = self.experts
        self.flaky_experts = self.experts

        self.loaded_in_ram = False
        self.loaded_data = loaded_data
        data_source = self.dataset_name.split("_")[0]
        if data_source not in self.loaded_data:
            self.loaded_data[data_source] = {}
        if self.load_in_ram:
            logger.info("Loading dataset {self.dataset_name} in ram ...")
            if self.reading_from == "mult_h5":
                self.data_vid = {}
                for i, vid in enumerate(self.vid_list):
                    if i % 100 == 0:
                        logger.debug(i)
                    self.data_vid[vid] = self.get_sample_data(vid)
            elif self.reading_from == "pkl":
                self.data_exp = self.loaded_data[data_source]
                for expert in self.experts:
                    if expert not in self.data_exp:
                        self.data_exp[expert] = {}
                    if expert in self.expert_paths.keys():
                        for agg, path in self.expert_paths[expert].items():
                            data_path = pathlib.Path(
                                self.data_dir) / pathlib.Path(path)
                            if agg not in self.data_exp[expert]:
                                self.data_exp[expert][agg] = memcache(
                                    data_path)
                    else:
                        logger.warning(
                            "The expert %s is not available for dataset %s",
                            expert, self.dataset_name)

                if self.split_name == "test2":
                    path = self.expert_paths["raw_captions_test2"]
                else:
                    path = self.expert_paths["raw_captions"]
                data_path = pathlib.Path(self.data_dir) / pathlib.Path(path)
                additional_captions = memcache(data_path)
                if "raw_captions" not in self.data_exp:
                    self.data_exp["raw_captions"] = {}
                self.data_exp["raw_captions"].update(additional_captions)
            self.loaded_in_ram = True
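The split_size argument above doubles as an absolute sample count (an int >= 2) or a ratio of the original split (a float in [0, 1]). A hypothetical helper mirroring that logic:

def resolve_split_size(split_size, original_size):
    # int >= 2 means "keep this many samples"; float in [0, 1] means "keep this fraction".
    if isinstance(split_size, int) and split_size >= 2:
        return split_size
    if isinstance(split_size, float) and 0 <= split_size <= 1:
        return int(split_size * original_size)
    raise ValueError(f"unsupported split_size: {split_size}")


assert resolve_split_size(1.0, 200) == 200
assert resolve_split_size(0.25, 200) == 50
assert resolve_split_size(50, 200) == 50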
Code Example #7
    def load_features(self):
        root_feat = self.root_feat
        feat_names = {
            "face": "VGGFace2-ResNet50-face-raw.pickle",
            "flow": "i3d-i3d-raw.pickle",
            "rgb": f"{self.rgb_model_name}-imagenet-raw-nocrop.pickle",
            "scene": "densenet161-scene-max.pickle",
            "ocr": "MSVD_all_text_w2v.pkl",
        }
        feat_paths = {
            key: Path(root_feat) / value
            for key, value in feat_names.items()
        }

        if self.text_feat == "w2v":
            text_feat_train_path = pjoin(root_feat, "w2v-caption-train.pkl")
            text_feat_val_path = pjoin(root_feat, "w2v-caption-val.pkl")
            text_feat_test_path = pjoin(root_feat, "w2v-caption-test.pkl")
        elif self.text_feat == "openai":
            text_feat_train_path = pjoin(root_feat, "openai-caption-train.pkl")
            text_feat_val_path = pjoin(root_feat, "openai-caption-val.pkl")
            text_feat_test_path = pjoin(root_feat, "openai-caption-test.pkl")
        else:
            raise ValueError(f"Text features {self.text_feat} not supported ")

        features = {
            expert: memcache(path)
            for expert, path in feat_paths.items()
        }
        text_features = memcache(text_feat_train_path)
        if self.split_name == "dev":
            text_features.update(memcache(text_feat_val_path))
        elif self.split_name == "official":
            text_features.update(memcache(text_feat_test_path))
        else:
            raise ValueError(f"unrecognised MSVD split: {self.split_name}")

        # To ensure that the text features are stored with the same keys as other
        # features, we need to convert text feature keys (YouTube hashes) into
        # video names
        key_map = memcache(pjoin(root_feat, "dict_youtube_mapping.pkl"))
        inverse_map = {}
        for key, value in key_map.items():
            inverse_map[value] = key
        text_features = {
            inverse_map[key]: val
            for key, val in text_features.items()
        }

        # we handle ocr separately from the other experts, for backwards compatibility
        # reasons
        canon_feats = {}
        for expert, feats in features.items():
            if expert != "ocr":
                canon_feats[expert] = self.canonical_features(feats)
            else:
                raw_dim = self.raw_input_dims[expert]
                canon_feats[expert] = self.canonical_features(feats,
                                                              raw_dim=raw_dim)
        self.features = canon_feats
        self.text_features = text_features
        self.raw_captions = memcache(pjoin(root_feat, "raw-captions.pkl"))
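The key-remapping step at the end of Example #7 inverts a {video name: YouTube hash} dictionary so that text features end up keyed by video name, like every other expert. A self-contained sketch with toy values standing in for dict_youtube_mapping.pkl:

# key_map: video name -> YouTube hash (toy, hypothetical values)
key_map = {"vid1": "aAaAaAaAaAa", "vid2": "bBbBbBbBbBb"}
inverse_map = {yt_hash: name for name, yt_hash in key_map.items()}

text_features = {"aAaAaAaAaAa": [0.1, 0.2], "bBbBbBbBbBb": [0.3, 0.4]}
text_features = {inverse_map[key]: val for key, val in text_features.items()}
assert set(text_features) == {"vid1", "vid2"}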
Code Example #8
File: msrvtt_dataset.py  Project: wayne980/MMT-old
    def configure_train_test_splits(self, cut_name, split_name):
        self.restrict_test_captions = None

        if cut_name in ["miech", "jsfusion"]:
            if cut_name in ["miech"]:
                # For now, we follow Antoine's approach of using the first text caption
                # for the retrieval task when evaluating on his custom split.
                train_list_path = "train_list_miech.txt"
                test_list_path = "test_list_miech.txt"
            elif cut_name in ["jsfusion"]:
                train_list_path = "train_list_jsfusion.txt"
                test_list_path = "val_list_jsfusion.txt"
                # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses all
                # videos, but randomly samples a single caption per video from the test
                # set for evaluation. To reproduce this evaluation, we use the indices
                # of the test captions, and restrict to this subset during eval.
                test_cap_idx_path = os.path.join(
                    self.data_dir, "jsfusion_val_caption_idx.pkl")
                self.restrict_test_captions = memcache(test_cap_idx_path)

            test_list_path = os.path.join(self.data_dir, test_list_path)
            with open(test_list_path) as f:
                test_vid_list = f.readlines()
            nb_test_samples = len(test_vid_list)

            if split_name in ["train", "trn", "val", "trainval"]:
                train_list_path = os.path.join(self.data_dir, train_list_path)
                with open(train_list_path) as f:
                    train_vid_list = f.readlines()
                nb_train_samples = len(train_vid_list)

                cross_vid_list = train_vid_list
                cross_vid_list = [x.strip() for x in cross_vid_list]

                # The cross seed is used to split training videos into different
                # cross validation splits.
                rng = np.random.RandomState(self.cross_seed)
                rng.shuffle(cross_vid_list)

                if split_name in ["train", "trn", "trainval"]:
                    if split_name in ["trainval"]:
                        self.vid_list = cross_vid_list
                    elif split_name in ["train", "trn"]:
                        self.vid_list = cross_vid_list[nb_test_samples:]
                    if split_name in ["trn"]:
                        self.vid_list = self.vid_list[:nb_test_samples]

                elif split_name in ["val"]:
                    self.vid_list = cross_vid_list[:nb_test_samples]

            elif split_name == "test":
                self.vid_list = test_vid_list
                self.vid_list = [x.strip() for x in self.vid_list]

        elif cut_name in ["full"]:
            if split_name in ["train", "trn"]:
                list_path = "train_list.txt"
            elif split_name in ["val"]:
                list_path = "val_list.txt"
            elif split_name in ["test"]:
                list_path = "test_list.txt"
            else:
                raise ValueError(f"unrecognised split: {split_name}")
            list_path = os.path.join(self.data_dir, list_path)
            with open(list_path) as f:
                self.vid_list = f.readlines()
            self.vid_list = [x.strip() for x in self.vid_list]

            # We want the trn split to be the same size as the val set
            if split_name in ["trn"]:
                rng = np.random.RandomState(0)
                rng.shuffle(self.vid_list)
                self.vid_list = self.vid_list[:497]

        elif cut_name in ["c"]:
            self.expert_paths = get_expert_paths(self.data_dir)
            if split_name in ["train", "trn", "val", "trainval"]:
                train_list_path = "train_list.txt"
                train_list_path = os.path.join(self.data_dir, train_list_path)
                with open(train_list_path) as f:
                    train_vid_list = f.readlines()
                nb_train_samples = len(train_vid_list)

                val_list_path = "val_list.txt"
                val_list_path = os.path.join(self.data_dir, val_list_path)
                with open(val_list_path) as f:
                    val_vid_list = f.readlines()
                nb_val_samples = len(val_vid_list)

                cross_vid_list = train_vid_list + val_vid_list
                cross_vid_list = [x.strip() for x in cross_vid_list]

                if self.cross_seed != 0:
                    # The cross seed is used to split training videos into different
                    # cross validation splits.
                    rng = np.random.RandomState(self.cross_seed)
                    rng.shuffle(cross_vid_list)

                if split_name in ["train", "trn", "trainval"]:
                    if split_name in ["trainval"]:
                        self.vid_list = cross_vid_list
                    elif split_name in ["train", "trn"]:
                        self.vid_list = cross_vid_list[:nb_train_samples]
                    if split_name in ["trn"]:
                        # In order to monitor performance on the training set, we sample
                        # from it as many samples as there are validation samples.
                        rng = np.random.RandomState(0)
                        rng.shuffle(self.vid_list)
                        self.vid_list = self.vid_list[:nb_val_samples]

                elif split_name in ["val"]:
                    self.vid_list = cross_vid_list[nb_train_samples:]

            else:
                if split_name == "test1":
                    list_path = "public_server_val.txt"
                elif split_name == "test2":
                    list_path = "public_server_test.txt"
                list_path = os.path.join(self.data_dir, list_path)
                with open(list_path) as f:
                    self.vid_list = f.readlines()
                self.vid_list = [x.strip() for x in self.vid_list]

        else:
            msg = "unrecognised cut: {}"
            raise ValueError(msg.format(cut_name))

        self.split_name = split_name
        self.dataset_name = f"MSRVTT_{cut_name}_{split_name}"
Code Example #9
    def __init__(self,
                 data_dir,
                 feat_aggregation,
                 raw_input_dims,
                 num_test_captions,
                 split_name,
                 text_dim,
                 text_feat,
                 rgb_model_name,
                 fuse_captions,
                 max_text_words,
                 max_expert_tokens,
                 verbose=False):

        self.ordered_experts = list(raw_input_dims.keys())
        self.max_expert_tokens = max_expert_tokens
        self.max_text_words = max_text_words
        self.raw_input_dims = raw_input_dims
        self.captions_per_video = 1
        self.MISSING_VAL = np.nan

        root_feat = Path(data_dir) / "symlinked-feats"

        print("Reading test data ...")
        train_feat_names = {
            "face": "X_face.npy",
            "flow": "X_flow.npy",
            "rgb": "X_resnet.npy",
            "scene":
            f"densenet161-scene-{feat_aggregation['scene']}-train.npy",
            "ocr": "w2v-ocr-raw-train.npy",
            "audio": "X_audio_train.npy",
        }
        val_feat_names = {
            "face": "face-retrieval.npy.tensor.npy",
            "flow": "flow-retrieval.npy.tensor.npy",
            "rgb": "resnet152-retrieval.npy.tensor.npy",
            "scene": f"densenet161-scene-{feat_aggregation['scene']}-val.npy",
            "ocr": "w2v-ocr-raw-val.npy",
            "audio": "X_audio_retrieval.npy.tensor.npy",
        }
        feat_paths = {"train": train_feat_names, "val": val_feat_names}

        if text_feat == "w2v":
            text_train = "w2v_LSMDC.npy"
            text_val = "w2v_LSMDC_retrieval.npy"
        elif text_feat == "openai":
            text_train = "openai-train.npy"
            text_val = "openai-test.npy"
        else:
            raise ValueError(f"Text features {text_feat} not recognised ")
        text_paths = {"train": text_train, "val": text_val}

        features = {}
        for key, feat_names in feat_paths.items():
            features[key] = {
                expert: memcache(Path(root_feat) / path)
                for expert, path in feat_names.items()
            }
        text_features = {
            key: memcache(Path(root_feat) / val)
            for key, val in text_paths.items()
        }

        # There are five videos without captions in the training set, so we drop them
        expected = 5
        train_masks = np.array([len(x) > 0 for x in text_features["train"]])
        missing_captions = len(train_masks) - sum(train_masks)
        msg = f"Expected {expected} videos without captions, found {missing_captions}"
        assert missing_captions == expected, msg
        features["train"] = {
            key: val[train_masks]
            for key, val in features["train"].items()
        }
        with open(Path(root_feat) / "test_video_paths.txt", "r") as f:
            self.video_path_retrieval = [
                Path(x) for x in f.read().splitlines()
            ]

        # combine variable length inputs into a large single tensor by zero padding. We
        # store the original sizes to allow reduced padding in minibatches
        self.expert_feat_sizes = {}
        for expert in {"audio", "ocr"}:
            feats = features["train"][expert]
            tensor, cropped_sizes = self.zero_pad_to_tensor(
                feats, self.max_expert_tokens)
            features["train"][expert] = tensor
            self.expert_feat_sizes[expert] = cropped_sizes

        text_features["train"] = text_features["train"][train_masks]
        self.text_feature_sizes = {}
        for key, val in text_features.items():
            tensor, cropped_sizes = self.zero_pad_to_tensor(
                val, self.max_text_words)
            self.text_feature_sizes[key], text_features[
                key] = cropped_sizes, tensor

        # store the indices of missing face and ocr features, marking the other experts
        # as available
        self.flaky = {"face", "ocr"}
        ind_paths = {
            x: Path(root_feat) / f"no_{x}_ind_retrieval.npy"
            for x in self.flaky
        }
        test_ind = {
            expert: 1 - memcache(path)
            for expert, path in ind_paths.items()
        }
        test_ind.update({
            expert: np.ones_like(test_ind["ocr"])
            for expert in self.ordered_experts if expert not in self.flaky
        })
        self.test_ind = {
            key: th.from_numpy(val)
            for key, val in test_ind.items()
        }

        for key in {"train", "val"}:
            missing = np.sum(features[key]["face"], axis=1) == 0
            features[key]["face"][missing, :] = np.nan
            missing = np.sum(np.sum(features[key]["ocr"], axis=1), axis=1) == 0
            features[key]["ocr"][missing, :] = np.nan

        self.features = features
        self.text_retrieval = th.from_numpy(text_features["val"]).float()
        self.raw_captions_retrieval = None
        self.text_features = text_features
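Example #9 relies on a zero_pad_to_tensor method to turn variable-length audio/OCR/text features into fixed-size tensors while remembering the kept (cropped) lengths. The method itself is not shown above, so the following is only a plausible sketch of that behaviour:

import numpy as np


def zero_pad_to_tensor(feature_list, max_tokens):
    # Pad or crop each (num_tokens, dim) array to max_tokens rows and stack the
    # results into one (batch, max_tokens, dim) tensor; also return kept lengths.
    dim = np.atleast_2d(feature_list[0]).shape[-1]
    padded = np.zeros((len(feature_list), max_tokens, dim), dtype=np.float32)
    cropped_sizes = []
    for i, feats in enumerate(feature_list):
        feats = np.atleast_2d(feats)
        if feats.size == 0:
            cropped_sizes.append(0)
            continue
        num_kept = min(len(feats), max_tokens)
        padded[i, :num_kept] = feats[:num_kept]
        cropped_sizes.append(num_kept)
    return padded, np.array(cropped_sizes)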
Code Example #10
    def load_features(self):
        root_feat = Path(self.root_feat)
        feat_names = {key: self.visual_feat_paths(key) for key in
                      self.paths["feature_names"]}
        feat_names.update(self.paths["custom_paths"])
        # modern, custom = MSVD.supported_features(split_name=self.split_name)
        # feat_names = {key: self.visual_feat_paths(key) for key in modern}
        # feat_names.update(custom)
        # restrict to required experts
        features = {}
        for expert, rel_names in feat_names.items():
            if expert not in self.ordered_experts:
                continue
            feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
            if len(feat_paths) == 1:
                features[expert] = memcache(feat_paths[0])
            else:
                # support multiple forms of feature (e.g. max and avg pooling). For
                # now, we only support direct concatenation
                msg = f"{expert}: Only direct concat of muliple feats is possible"
                print(f"Concatenating aggregates for {expert}....")
                assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
                axis = self.feat_aggregation[expert]["aggregate-axis"]
                x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
                print(f"concat cache info: {x}")
                features_ = concat_features(feat_paths, axis=axis)
                memory_summary()

                if expert == "speech":
                    features_defaults = defaultdict(lambda: np.zeros((1, 300)))
                    features_defaults.update(features_)
                    features_ = features_defaults

                # Make separate feature copies for each split to allow in-place filtering
                features[expert] = copy.deepcopy(features_)

        self.features = features
        text_feat_paths = self.paths["text_feat_paths"]
        text_features = memcache(root_feat / text_feat_paths["train"])
        split_names = {"dev": "val", "official": "test"}
        text_features.update(memcache(
            root_feat / text_feat_paths[split_names[self.split_name]]))
        key_map = memcache(pjoin(root_feat, self.paths["dict_youtube_mapping_path"]))
        inverse_map = {}
        for key, value in key_map.items():
            inverse_map[value] = key
        self.text_features = {inverse_map[key]: val for key, val in text_features.items()}
        self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])

        if "detection" in self.ordered_experts:
            # Example processing
            processed = {}
            for key, subdict in self.features["detection"].items():
                box, conf = subdict["detection_boxes"], subdict["detection_scores"]
                raw = subdict["raw_feats_avg"]
                processed[key] = np.concatenate((box, conf.reshape(-1, 1), raw), axis=1)
            self.features["detection"] = processed

        if "openpose" in self.ordered_experts:
            # Example processing
            processed = {}
            for key, subdict in self.features["openpose"].items():
                raw = np.concatenate(subdict["matrix"], axis=1)
                processed[key] = raw.transpose(1, 0, 2).reshape(-1, 3 * 18)
            self.features["openpose"] = processed
Code Example #11
    def load_features(self):
        root_feat = self.root_feat
        feat_paths = {}

        if self.split_name == "miech":
            if self.rgb_model_name == "resnet":
                rgb_feat_name = "resnet_features.pickle"
            elif self.rgb_model_name == "senet154":
                rgb_feat_name = "senet154-imagenet-raw-nocrop.pickle"
            else:
                raise ValueError(
                    f"unrecognised rgb_model_name: {self.rgb_model_name}")
            feat_paths["audio"] = pjoin(root_feat, "audio_features.pickle")
            feat_paths["face"] = pjoin(root_feat, "face_features.pickle")
            feat_paths["flow"] = pjoin(root_feat, "flow_features.pickle")
        elif self.split_name in {"full-test", "full-val", "jsfusion"}:
            feat_paths["audio"] = pjoin(root_feat, "Audio_MSRVTT_new.pickle")
            feat_paths["face"] = pjoin(root_feat, "Face_MSRVTT_new.pickle")
            feat_paths["flow"] = pjoin(root_feat, "I3D_MSRVTT_new.pickle")
            rgb_feat_name = f"{self.rgb_model_name}-imagenet-raw-nocrop.pickle"

        feat_paths["rgb"] = pjoin(root_feat, rgb_feat_name)
        feat_paths["scene"] = pjoin(root_feat, "scene-raw.npy")

        # Note: Antoine's text features cover the full 10,000 videos, so can be
        # used for either split, similarly for the speech embeddings
        text_feat = self.text_feat
        if text_feat == "w2v":
            text_feat_path = pjoin(root_feat, "w2v_MSRVTT.pickle")
        elif text_feat == "openai":
            text_feat_path = pjoin(root_feat, "w2v_MSRVTT_openAIGPT.pickle")
        elif text_feat == "bertxl":
            text_feat_path = pjoin(root_feat, "w2v_MSRVTT_transformer.pickle")
        else:
            raise ValueError(
                "Text features {} not recognised ".format(text_feat))
        feat_paths["speech"] = pjoin(root_feat, "stt_w2v.pickle")
        feat_paths["ocr"] = pjoin(root_feat, "MSR_VTT_all_text_w2v.pkl")
        # drop features which have not been requested
        feat_paths = {
            key: val
            for key, val in feat_paths.items() if key in self.ordered_experts
        }
        features = {
            expert: memcache(path)
            for expert, path in feat_paths.items()
        }

        # we handle ocr separately from the other experts, for backwards compatibility
        canon_feats = {}
        for expert, feats in features.items():
            if expert != "ocr":
                canon_feats[expert] = self.canonical_features(feats)
            else:
                raw_dim = self.raw_input_dims[expert]
                canon_feats[expert] = self.canonical_features(feats,
                                                              raw_dim=raw_dim)
        self.features = canon_feats
        self.raw_captions = memcache(
            Path(self.data_dir) / "processing/raw-captions.pkl")
        self.text_features = memcache(text_feat_path)
        if self.restrict_train_captions:
            # hash the video names to avoid O(n) lookups in long lists
            train_list = set(self.train_list)
            for key, val in self.text_features.items():
                if key not in train_list:
                    continue
                msg = "expected text features to be lists with length 19 or 20"
                assert isinstance(val, list) and len(val) in {19, 20}, msg
                # restrict to the first N captions (deterministic)
                self.text_features[key] = val[:self.restrict_train_captions]
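When restrict_train_captions is set, Example #11 deterministically keeps only the first N captions of each training video and leaves other videos untouched. A toy, self-contained version of that loop (video ids and captions are made up):

restrict_train_captions = 2
train_list = {"video0"}
text_features = {
    "video0": ["caption a", "caption b", "caption c"],  # training video
    "video9": ["caption x"],                            # non-training video, left as-is
}
for key, val in text_features.items():
    if key not in train_list:
        continue
    text_features[key] = val[:restrict_train_captions]
assert text_features["video0"] == ["caption a", "caption b"]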