def load_features(self):
    root_feat = self.root_feat
    feat_agg = self.feat_aggregation
    feat_names = {
        "flow": f"i3d-i3d-{feat_agg['flow']}.pickle",
        "face": f"VGGFace2-ResNet50-face-{feat_agg['face']}.pickle",
        "rgb": f"{self.rgb_model_name}-imagenet-{feat_agg['rgb']}.pickle",
        "scene": f"densenet161-scene-{feat_agg['scene']}.pickle",
        "ocr": "ocr-feats.pkl",
        "audio": "vggish-audio-raw.pickle",
        "speech": "stt_w2v.pickle",
    }
    assert feat_agg["scene"] == "max", "expected max pooling over scenes"
    feat_paths = {key: Path(root_feat) / value for key, value in feat_names.items()}
    if self.text_feat == "openai":
        text_feat_path = pjoin(root_feat, "openai-feats.pkl")
    else:
        raise ValueError(f"Text features {self.text_feat} not supported")
    features = {expert: memcache(path) for expert, path in feat_paths.items()}
    text_features = memcache(text_feat_path)
    self.features = features
    self.text_features = text_features
    self.raw_captions = memcache(Path(self.data_dir) / "processing/raw-captions.pkl")
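# `memcache` is used throughout these loaders but is not defined in this file.
# Below is a minimal sketch of what such a helper might look like, assuming it
# simply deserialises a .pkl/.pickle/.npy file and caches the result in a
# process-level dictionary so repeated loads are free.  The `_CACHE` name and
# the exact supported suffixes are illustrative assumptions, not the actual
# implementation.
import pickle
from pathlib import Path

import numpy as np

_CACHE = {}


def memcache(path):
    """Load a feature file, returning a cached copy when it is already in memory."""
    path = Path(path)
    key = str(path)
    if key not in _CACHE:
        if path.suffix in {".pkl", ".pickle"}:
            with open(path, "rb") as f:
                _CACHE[key] = pickle.load(f)
        elif path.suffix == ".npy":
            _CACHE[key] = np.load(path, allow_pickle=True)
        else:
            raise ValueError(f"unsupported feature file: {path}")
    return _CACHE[key]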
def load_features(self):
    root_feat = Path(self.root_feat)
    feat_names = {key: self.visual_feat_paths(key) for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concatenation of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()
            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)
    self.features = features
    self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
    self.text_features = memcache(root_feat / self.paths["text_feat_path"])
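# `concat_features` above is called with a tuple of feature paths and an axis,
# and exposes `cache_info()`, which suggests it is wrapped in an lru_cache-style
# decorator.  The sketch below is an assumption about its behaviour, not the
# repo's code: it loads each pickled {video_id: ndarray} dict and concatenates
# the per-video arrays along the requested axis, reusing the (hypothetical)
# `memcache` helper sketched earlier.
import functools

import numpy as np


@functools.lru_cache(maxsize=64)
def concat_features(feat_paths, axis):
    """Concatenate several {video_id: ndarray} feature dicts along `axis`."""
    loaded = [memcache(path) for path in feat_paths]
    keys = set(loaded[0])
    for feats in loaded[1:]:
        keys &= set(feats)  # only keep videos present in every feature set
    return {key: np.concatenate([feats[key] for feats in loaded], axis=axis)
            for key in keys}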
def load_features(self): root_feat = self.root_feat feat_names = { "face": "VGGFace2-ResNet50-face-avg.pickle", "flow": "i3d-i3d-avg.pickle", "rgb": f"{self.rgb_model_name}-imagenet-avg-nocrop.pickle", "scene": "densenet161-scene-max.pickle", "ocr": "AN_OCR_ALL_unique_video_w2v.pkl", "audio": "vggish-audio-raw.pickle", "speech": "stt_w2v.pickle", } feat_paths = { key: Path(root_feat) / value for key, value in feat_names.items() } if self.text_feat == "openai": text_feat_train_path = pjoin(root_feat, "openai-train.pkl") text_feat_val1_path = pjoin(root_feat, "openai-val1.pkl") text_feat_val2_path = pjoin(root_feat, "openai-val2.pkl") else: raise ValueError(f"Text features {self.text_feat} not supported ") features = { expert: memcache(path) for expert, path in feat_paths.items() } text_features = memcache(text_feat_train_path) if self.split_name == "val1": text_features.update(memcache(text_feat_val1_path)) elif self.split_name == "val2": text_features.update(memcache(text_feat_val2_path)) else: raise ValueError( f"unrecognised activity-net split: {self.split_name}") self.features = features self.text_features = text_features self.raw_captions = memcache(self.raw_captions_path)
def configure_train_test_splits(self, split_name):
    self.restrict_test_captions = None
    if split_name == "miech":
        # For now, we follow Antoine's approach of using the first text caption
        # for the retrieval task when evaluating on his custom split.
        train_list_path = "train_list_miech.txt"
        test_list_path = "test_list_miech.txt"
    elif split_name == "jsfusion":
        train_list_path = "train_list_jsfusion.txt"
        test_list_path = "val_list_jsfusion.txt"
        # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses all
        # videos, but randomly samples a single caption per video from the test
        # set for evaluation. To reproduce this evaluation, we use the indices
        # of the test captions, and restrict to this subset during eval.
        test_cap_idx_path = pjoin(self.root_feat, "jsfusion_val_caption_idx.pkl")
        self.restrict_test_captions = memcache(test_cap_idx_path)
    elif split_name in {"full-val", "full-test"}:
        train_list_path = "train_list_full.txt"
        if split_name == "full-val":
            test_list_path = "val_list_full.txt"
        else:
            test_list_path = "test_list_full.txt"
    else:
        msg = "unrecognised MSRVTT split: {}"
        raise ValueError(msg.format(split_name))

    train_list_path = pjoin(self.root_feat, train_list_path)
    test_list_path = pjoin(self.root_feat, test_list_path)
    print("loading training/val splits....")
    tic = time.time()
    with open(train_list_path) as f:
        self.train_list = f.readlines()
    self.train_list = [x.strip() for x in self.train_list]
    with open(test_list_path) as f:
        self.test_list = f.readlines()
    self.test_list = [x.strip() for x in self.test_list]
    print("done in {:.3f}s".format(time.time() - tic))
    self.split_name = split_name
def _load_data(self):
    self.expert_data = {}
    for expert in self.experts_used:
        if expert != 'context':
            data_pth = osj(self.data_dir, 'features', self.experts[expert])
            self.expert_data[expert] = memcache(data_pth)
            memory_summary()

    clips_with_data = []
    for expert in self.expert_data:
        if expert != 'description' and expert != 'label':
            clips_with_data += self.expert_data[expert].keys()

    # debugging (input random tensors)
    random = False
    if random:
        for expert in self.expert_data:
            for videoid in self.expert_data[expert]:
                self.expert_data[expert][videoid] = np.random.randn(
                    *self.expert_data[expert][videoid].shape)

    # debugging (input zero tensors)
    zeros = False
    if zeros:
        for expert in self.expert_data:
            for videoid in self.expert_data[expert]:
                self.expert_data[expert][videoid] = np.zeros(
                    self.expert_data[expert][videoid].shape)

    clips_with_data = set(clips_with_data)
    # sanity check
    # pdb.set_trace()
    # if not self.data['clips'].index.isin(clips_with_data).all():
    #     print(self.data['clips'][~self.data['clips'].index.isin(clips_with_data)].index)
    #     raise NotImplementedError
    self.data['clips'] = self.data['clips'][self.data['clips'].index.isin(clips_with_data)]
    print(f'{self.split} size: {len(self.data["clips"])} clips')
def __init__(self,
             data_dir,
             raw_input_dims,
             cut_name,
             split_name,
             max_text_words=30,
             max_expert_tokens=8,
             clip_duration=float("Inf"),
             caption_length=float("Inf"),
             captions_per_video=1,
             restrict_train_captions=0,
             training=False,
             split_size=1.0,
             load_in_ram=False,
             remove_stop_words=False,
             n_pairs=1,
             tokenizer=None,
             shuffle_feats_t=False,
             loaded_data=None,
             query_shuffling="indiv",
             cross_seed=0,
             temporal_encoding_window=1):
    self.sanity_checks = False
    self.train = training
    self.data_dir = data_dir
    self.restrict_train_captions = restrict_train_captions
    self.max_text_words = max_text_words
    self.max_expert_tokens = max_expert_tokens
    self.root_feat = pathlib.Path(data_dir) / "symlinked-feats"
    self.experts = set(raw_input_dims.keys())
    self.rgb_shots = 1
    self.cut_name = cut_name
    self.split_name = split_name
    self.split_size = split_size
    self.load_in_ram = load_in_ram
    self.remove_stop_words = remove_stop_words
    self.n_pairs = n_pairs
    self.clip_duration = clip_duration
    self.caption_length = caption_length
    self.tokenizer = tokenizer
    self.shuffle_feats_t = shuffle_feats_t
    self.query_shuffling = query_shuffling
    self.cross_seed = cross_seed
    self.temporal_encoding_window = temporal_encoding_window
    self.data_aug = False
    self.max_ratio_rem = 0

    if self.cut_name == "c":
        # The challenge features are stored in pkl files
        self.reading_from = "pkl"
    else:
        # The ECCV20 paper features are stored in multiple h5 files
        self.reading_from = "mult_h5"
    self.cache_dir = os.path.join(os.path.dirname(data_dir), "vid_feat_files",
                                  self.reading_from)
    logger.debug("Cache_dir: %s", self.cache_dir)

    # This attribute can be overloaded by different datasets, so it must be set
    # before the `configure_train_test_splits()` method call
    self.restrict_test_captions = None

    # Use a single caption per video when forming training minibatches
    # (different captions from the same video may still be used across
    # different minibatches)
    if self.train:
        self.captions_per_video = 1
    else:
        self.captions_per_video = captions_per_video

    self.ordered_experts = list(raw_input_dims.keys())
    self.configure_train_test_splits(cut_name=cut_name, split_name=split_name)
    self.expert_timings = expert_timings.expert_timings

    # If split_size is an int, it represents the number of samples that we keep.
    # If split_size is a float, it represents the ratio of the original split
    # size that we keep.
    original_size = len(self.vid_list)
    if split_size >= 2 and isinstance(split_size, int):
        nb_samples = split_size
    elif 0 <= split_size <= 1 and isinstance(split_size, float):
        nb_samples = int(split_size * original_size)
    self.vid_list = self.vid_list[:nb_samples]
    self.num_train = len(self.vid_list)

    # Display info about the dataset split size
    main_msg = f"Number of videos in {self.dataset_name}: {original_size}"
    if self.num_train == original_size:
        msg = ""
    else:
        msg = f" but we keep only {self.num_train} (split_size = {split_size})"
    logger.debug(main_msg + msg)

    # Log how many captions per video are kept
    logger.debug("We consider %s captions per video", self.captions_per_video)

    self.raw_input_dims = raw_input_dims

    visualisations = True
    if visualisations:
        logger.debug("Storing paths to enable visualisations ...")
        symlink_to_root = pathlib.Path.cwd() / "project_root"
        # If the symlink to the project root can be accessed, follow that path.
        # Otherwise, follow the current working directory (which should be the
        # project root).
        if symlink_to_root.exists():
            video_paths = [
                os.readlink(str(symlink_to_root)) / pathlib.Path(data_dir)
                / f"videos/{x}.mp4" for x in self.vid_list
            ]
        else:
            video_paths = [
                pathlib.Path.cwd() / pathlib.Path(data_dir) / f"videos/{x}.mp4"
                for x in self.vid_list
            ]
        self.video_paths = video_paths

    self.missing_val = 0

    if not os.path.exists(self.cache_dir) and self.reading_from != "pkl":
        logger.warning("%s does not exist", self.cache_dir)

    self.variable_sz_experts = self.experts
    self.flaky_experts = self.experts
    self.loaded_in_ram = False
    self.loaded_data = loaded_data
    data_source = self.dataset_name.split("_")[0]
    if data_source not in self.loaded_data:
        self.loaded_data[data_source] = {}

    if self.load_in_ram:
        logger.info("Loading dataset %s in RAM ...", self.dataset_name)
        if self.reading_from == "mult_h5":
            self.data_vid = {}
            for i, vid in enumerate(self.vid_list):
                if i % 100 == 0:
                    logger.debug(i)
                self.data_vid[vid] = self.get_sample_data(vid)
        elif self.reading_from == "pkl":
            self.data_exp = self.loaded_data[data_source]
            for expert in self.experts:
                if expert not in self.data_exp:
                    self.data_exp[expert] = {}
                if expert in self.expert_paths.keys():
                    for agg, path in self.expert_paths[expert].items():
                        data_path = pathlib.Path(self.data_dir) / pathlib.Path(path)
                        if agg not in self.data_exp[expert]:
                            self.data_exp[expert][agg] = memcache(data_path)
                else:
                    logger.warning(
                        "The expert %s is not available for dataset %s",
                        expert, self.dataset_name)
            if self.split_name == "test2":
                path = self.expert_paths["raw_captions_test2"]
            else:
                path = self.expert_paths["raw_captions"]
            data_path = pathlib.Path(self.data_dir) / pathlib.Path(path)
            additional_captions = memcache(data_path)
            if "raw_captions" not in self.data_exp:
                self.data_exp["raw_captions"] = {}
            self.data_exp["raw_captions"].update(additional_captions)
        self.loaded_in_ram = True
def load_features(self): root_feat = self.root_feat feat_names = { "face": "VGGFace2-ResNet50-face-raw.pickle", "flow": "i3d-i3d-raw.pickle", "rgb": f"{self.rgb_model_name}-imagenet-raw-nocrop.pickle", "scene": "densenet161-scene-max.pickle", "ocr": "MSVD_all_text_w2v.pkl", } feat_paths = { key: Path(root_feat) / value for key, value in feat_names.items() } if self.text_feat == "w2v": text_feat_train_path = pjoin(root_feat, "w2v-caption-train.pkl") text_feat_val_path = pjoin(root_feat, "w2v-caption-val.pkl") text_feat_test_path = pjoin(root_feat, "w2v-caption-test.pkl") elif self.text_feat == "openai": text_feat_train_path = pjoin(root_feat, "openai-caption-train.pkl") text_feat_val_path = pjoin(root_feat, "openai-caption-val.pkl") text_feat_test_path = pjoin(root_feat, "openai-caption-test.pkl") else: raise ValueError(f"Text features {self.text_feat} not supported ") features = { expert: memcache(path) for expert, path in feat_paths.items() } text_features = memcache(text_feat_train_path) if self.split_name == "dev": text_features.update(memcache(text_feat_val_path)) elif self.split_name == "official": text_features.update(memcache(text_feat_test_path)) else: raise ValueError(f"unrecognised MSVD split: {self.split_name}") # To ensure that the text features are stored with the same keys as other # features, we need to convert text feature keys (YouTube hashes) into # video names key_map = memcache(pjoin(root_feat, "dict_youtube_mapping.pkl")) inverse_map = {} for key, value in key_map.items(): inverse_map[value] = key text_features = { inverse_map[key]: val for key, val in text_features.items() } # we handle ocr separately from the other experts, for backwards compatibility # reasons canon_feats = {} for expert, feats in features.items(): if expert != "ocr": canon_feats[expert] = self.canonical_features(feats) else: raw_dim = self.raw_input_dims[expert] canon_feats[expert] = self.canonical_features(feats, raw_dim=raw_dim) self.features = canon_feats self.text_features = text_features self.raw_captions = memcache(pjoin(root_feat, "raw-captions.pkl"))
def configure_train_test_splits(self, cut_name, split_name):
    self.restrict_test_captions = None
    if cut_name in ["miech", "jsfusion"]:
        if cut_name in ["miech"]:
            # For now, we follow Antoine's approach of using the first text
            # caption for the retrieval task when evaluating on his custom split.
            train_list_path = "train_list_miech.txt"
            test_list_path = "test_list_miech.txt"
        elif cut_name in ["jsfusion"]:
            train_list_path = "train_list_jsfusion.txt"
            test_list_path = "val_list_jsfusion.txt"
            # NOTE: The JSFusion split (referred to as 1k-A in the paper) uses
            # all videos, but randomly samples a single caption per video from
            # the test set for evaluation. To reproduce this evaluation, we use
            # the indices of the test captions, and restrict to this subset
            # during eval.
            test_cap_idx_path = os.path.join(self.data_dir,
                                             "jsfusion_val_caption_idx.pkl")
            self.restrict_test_captions = memcache(test_cap_idx_path)

        test_list_path = os.path.join(self.data_dir, test_list_path)
        with open(test_list_path) as f:
            test_vid_list = f.readlines()
        nb_test_samples = len(test_vid_list)

        if split_name in ["train", "trn", "val", "trainval"]:
            train_list_path = os.path.join(self.data_dir, train_list_path)
            with open(train_list_path) as f:
                train_vid_list = f.readlines()
            nb_train_samples = len(train_vid_list)

            cross_vid_list = train_vid_list
            cross_vid_list = [x.strip() for x in cross_vid_list]

            # The cross seed is used to split training videos into different
            # cross validation splits.
            rng = np.random.RandomState(self.cross_seed)
            rng.shuffle(cross_vid_list)

            if split_name in ["train", "trn", "trainval"]:
                if split_name in ["trainval"]:
                    self.vid_list = cross_vid_list
                elif split_name in ["train", "trn"]:
                    self.vid_list = cross_vid_list[nb_test_samples:]
                if split_name in ["trn"]:
                    self.vid_list = self.vid_list[:nb_test_samples]
            elif split_name in ["val"]:
                self.vid_list = cross_vid_list[:nb_test_samples]

        elif split_name == "test":
            self.vid_list = test_vid_list
            self.vid_list = [x.strip() for x in self.vid_list]

    elif cut_name in ["full"]:
        if split_name in ["train", "trn"]:
            list_path = "train_list.txt"
        elif split_name in ["val"]:
            list_path = "val_list.txt"
        elif split_name in ["test"]:
            list_path = "test_list.txt"
        else:
            raise ValueError(f"unrecognised split: {split_name}")
        list_path = os.path.join(self.data_dir, list_path)
        with open(list_path) as f:
            self.vid_list = f.readlines()
        self.vid_list = [x.strip() for x in self.vid_list]
        # We want the trn split to be the same size as the val set
        if split_name in ["trn"]:
            rng = np.random.RandomState(0)
            rng.shuffle(self.vid_list)
            self.vid_list = self.vid_list[:497]

    elif cut_name in ["c"]:
        self.expert_paths = get_expert_paths(self.data_dir)
        if split_name in ["train", "trn", "val", "trainval"]:
            train_list_path = "train_list.txt"
            train_list_path = os.path.join(self.data_dir, train_list_path)
            with open(train_list_path) as f:
                train_vid_list = f.readlines()
            nb_train_samples = len(train_vid_list)

            val_list_path = "val_list.txt"
            val_list_path = os.path.join(self.data_dir, val_list_path)
            with open(val_list_path) as f:
                val_vid_list = f.readlines()
            nb_val_samples = len(val_vid_list)

            cross_vid_list = train_vid_list + val_vid_list
            cross_vid_list = [x.strip() for x in cross_vid_list]

            if self.cross_seed != 0:
                # The cross seed is used to split training videos into different
                # cross validation splits.
                rng = np.random.RandomState(self.cross_seed)
                rng.shuffle(cross_vid_list)

            if split_name in ["train", "trn", "trainval"]:
                if split_name in ["trainval"]:
                    self.vid_list = cross_vid_list
                elif split_name in ["train", "trn"]:
                    self.vid_list = cross_vid_list[:nb_train_samples]
                if split_name in ["trn"]:
                    # In order to monitor performance on the training set, we
                    # sample from it as many samples as there are validation
                    # samples.
                    rng = np.random.RandomState(0)
                    rng.shuffle(self.vid_list)
                    self.vid_list = self.vid_list[:nb_val_samples]
            elif split_name in ["val"]:
                self.vid_list = cross_vid_list[nb_train_samples:]
        else:
            if split_name == "test1":
                list_path = "public_server_val.txt"
            elif split_name == "test2":
                list_path = "public_server_test.txt"
            list_path = os.path.join(self.data_dir, list_path)
            with open(list_path) as f:
                self.vid_list = f.readlines()
            self.vid_list = [x.strip() for x in self.vid_list]
    else:
        msg = "unrecognised cut: {}"
        raise ValueError(msg.format(cut_name))

    self.split_name = split_name
    self.dataset_name = f"MSRVTT_{cut_name}_{split_name}"
def __init__(self,
             data_dir,
             feat_aggregation,
             raw_input_dims,
             num_test_captions,
             split_name,
             text_dim,
             text_feat,
             rgb_model_name,
             fuse_captions,
             max_text_words,
             max_expert_tokens,
             verbose=False):
    self.ordered_experts = list(raw_input_dims.keys())
    self.max_expert_tokens = max_expert_tokens
    self.max_text_words = max_text_words
    self.raw_input_dims = raw_input_dims
    self.captions_per_video = 1
    self.MISSING_VAL = np.nan

    root_feat = Path(data_dir) / "symlinked-feats"
    print("Reading test data ...")
    train_feat_names = {
        "face": "X_face.npy",
        "flow": "X_flow.npy",
        "rgb": "X_resnet.npy",
        "scene": f"densenet161-scene-{feat_aggregation['scene']}-train.npy",
        "ocr": "w2v-ocr-raw-train.npy",
        "audio": "X_audio_train.npy",
    }
    val_feat_names = {
        "face": "face-retrieval.npy.tensor.npy",
        "flow": "flow-retrieval.npy.tensor.npy",
        "rgb": "resnet152-retrieval.npy.tensor.npy",
        "scene": f"densenet161-scene-{feat_aggregation['scene']}-val.npy",
        "ocr": "w2v-ocr-raw-val.npy",
        "audio": "X_audio_retrieval.npy.tensor.npy",
    }
    feat_paths = {"train": train_feat_names, "val": val_feat_names}

    if text_feat == "w2v":
        text_train = "w2v_LSMDC.npy"
        text_val = "w2v_LSMDC_retrieval.npy"
    elif text_feat == "openai":
        text_train = "openai-train.npy"
        text_val = "openai-test.npy"
    else:
        raise ValueError(f"Text features {text_feat} not recognised")
    text_paths = {"train": text_train, "val": text_val}

    features = {}
    for key, feat_names in feat_paths.items():
        features[key] = {
            expert: memcache(Path(root_feat) / path)
            for expert, path in feat_names.items()
        }
    text_features = {
        key: memcache(Path(root_feat) / val)
        for key, val in text_paths.items()
    }

    # There are five videos without captions in the training set, so we drop them
    expected = 5
    train_masks = np.array([len(x) > 0 for x in text_features["train"]])
    missing_captions = len(train_masks) - sum(train_masks)
    msg = f"Expected {expected} videos without captions, found {missing_captions}"
    assert missing_captions == expected, msg
    features["train"] = {key: val[train_masks] for key, val in features["train"].items()}

    with open(Path(root_feat) / "test_video_paths.txt", "r") as f:
        self.video_path_retrieval = [Path(x) for x in f.read().splitlines()]

    # Combine variable length inputs into a large single tensor by zero padding.
    # We store the original sizes to allow reduced padding in minibatches.
    self.expert_feat_sizes = {}
    for expert in {"audio", "ocr"}:
        feats = features["train"][expert]
        tensor, cropped_sizes = self.zero_pad_to_tensor(feats, self.max_expert_tokens)
        features["train"][expert] = tensor
        self.expert_feat_sizes[expert] = cropped_sizes

    text_features["train"] = text_features["train"][train_masks]
    self.text_feature_sizes = {}
    for key, val in text_features.items():
        tensor, cropped_sizes = self.zero_pad_to_tensor(val, self.max_text_words)
        self.text_feature_sizes[key], text_features[key] = cropped_sizes, tensor

    # store the indices of missing face and ocr features, marking the other experts
    # as available
    self.flaky = {"face", "ocr"}
    ind_paths = {x: Path(root_feat) / f"no_{x}_ind_retrieval.npy" for x in self.flaky}
    test_ind = {expert: 1 - memcache(path) for expert, path in ind_paths.items()}
    test_ind.update({
        expert: np.ones_like(test_ind["ocr"])
        for expert in self.ordered_experts if expert not in self.flaky
    })
    self.test_ind = {key: th.from_numpy(val) for key, val in test_ind.items()}

    for key in {"train", "val"}:
        missing = np.sum(features[key]["face"], axis=1) == 0
        features[key]["face"][missing, :] = np.nan
        missing = np.sum(np.sum(features[key]["ocr"], axis=1), axis=1) == 0
        features[key]["ocr"][missing, :] = np.nan

    self.features = features
    self.text_retrieval = th.from_numpy(text_features["val"]).float()
    self.raw_captions_retrieval = None
    self.text_features = text_features
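# `zero_pad_to_tensor` is used above to combine variable-length features into a
# single array.  The following is a sketch under the assumption that it crops
# each sequence to at most `max_tokens` steps, zero-pads the result to a common
# length, and returns the stacked array together with the per-item cropped
# sizes; the real method may differ in dtype or return types.
import numpy as np


def zero_pad_to_tensor(feats, max_tokens):
    """Crop/zero-pad a list of (num_tokens, dim) arrays into one 3D array."""
    cropped = [np.asarray(x)[:max_tokens] for x in feats]
    cropped_sizes = np.array([len(x) for x in cropped])
    dim = cropped[0].shape[-1]
    padded = np.zeros((len(cropped), max_tokens, dim), dtype=np.float32)
    for idx, feat in enumerate(cropped):
        padded[idx, :len(feat)] = feat
    return padded, cropped_sizes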
def load_features(self):
    root_feat = Path(self.root_feat)
    feat_names = {key: self.visual_feat_paths(key) for key in self.paths["feature_names"]}
    feat_names.update(self.paths["custom_paths"])
    # modern, custom = MSVD.supported_features(split_name=self.split_name)
    # feat_names = {key: self.visual_feat_paths(key) for key in modern}
    # feat_names.update(custom)
    # restrict to required experts
    features = {}
    for expert, rel_names in feat_names.items():
        if expert not in self.ordered_experts:
            continue
        feat_paths = tuple([root_feat / rel_name for rel_name in rel_names])
        if len(feat_paths) == 1:
            features[expert] = memcache(feat_paths[0])
        else:
            # support multiple forms of feature (e.g. max and avg pooling). For
            # now, we only support direct concatenation
            msg = f"{expert}: Only direct concatenation of multiple feats is possible"
            print(f"Concatenating aggregates for {expert}....")
            assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
            axis = self.feat_aggregation[expert]["aggregate-axis"]
            x = concat_features.cache_info()  # pylint: disable=no-value-for-parameter
            print(f"concat cache info: {x}")
            features_ = concat_features(feat_paths, axis=axis)
            memory_summary()
            if expert == "speech":
                features_defaults = defaultdict(lambda: np.zeros((1, 300)))
                features_defaults.update(features_)
                features_ = features_defaults
            # Make separate feature copies for each split to allow in-place filtering
            features[expert] = copy.deepcopy(features_)
    self.features = features

    text_feat_paths = self.paths["text_feat_paths"]
    text_features = memcache(root_feat / text_feat_paths["train"])
    split_names = {"dev": "val", "official": "test"}
    text_features.update(memcache(root_feat / text_feat_paths[split_names[self.split_name]]))

    key_map = memcache(pjoin(root_feat, self.paths["dict_youtube_mapping_path"]))
    inverse_map = {}
    for key, value in key_map.items():
        inverse_map[value] = key
    self.text_features = {inverse_map[key]: val for key, val in text_features.items()}
    self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])

    if "detection" in self.ordered_experts:
        # Example processing
        processed = {}
        for key, subdict in self.features["detection"].items():
            box, conf = subdict["detection_boxes"], subdict["detection_scores"]
            raw = subdict["raw_feats_avg"]
            processed[key] = np.concatenate((box, conf.reshape(-1, 1), raw), axis=1)
        self.features["detection"] = processed

    if "openpose" in self.ordered_experts:
        # Example processing
        processed = {}
        for key, subdict in self.features["openpose"].items():
            raw = np.concatenate(subdict["matrix"], axis=1)
            processed[key] = raw.transpose(1, 0, 2).reshape(-1, 3 * 18)
        self.features["openpose"] = processed
def load_features(self):
    root_feat = self.root_feat
    feat_paths = {}
    if self.split_name == "miech":
        if self.rgb_model_name == "resnet":
            rgb_feat_name = "resnet_features.pickle"
        elif self.rgb_model_name == "senet154":
            rgb_feat_name = "senet154-imagenet-raw-nocrop.pickle"
        else:
            raise ValueError(f"unrecognised rgb_model_name: {self.rgb_model_name}")
        feat_paths["audio"] = pjoin(root_feat, "audio_features.pickle")
        feat_paths["face"] = pjoin(root_feat, "face_features.pickle")
        feat_paths["flow"] = pjoin(root_feat, "flow_features.pickle")
    elif self.split_name in {"full-test", "full-val", "jsfusion"}:
        feat_paths["audio"] = pjoin(root_feat, "Audio_MSRVTT_new.pickle")
        feat_paths["face"] = pjoin(root_feat, "Face_MSRVTT_new.pickle")
        feat_paths["flow"] = pjoin(root_feat, "I3D_MSRVTT_new.pickle")
        rgb_feat_name = f"{self.rgb_model_name}-imagenet-raw-nocrop.pickle"
    feat_paths["rgb"] = pjoin(root_feat, rgb_feat_name)
    feat_paths["scene"] = pjoin(root_feat, "scene-raw.npy")

    # Note: Antoine's text features cover the full 10,000 videos, so they can be
    # used for either split; the same applies to the speech embeddings
    text_feat = self.text_feat
    if text_feat == "w2v":
        text_feat_path = pjoin(root_feat, "w2v_MSRVTT.pickle")
    elif text_feat == "openai":
        text_feat_path = pjoin(root_feat, "w2v_MSRVTT_openAIGPT.pickle")
    elif text_feat == "bertxl":
        text_feat_path = pjoin(root_feat, "w2v_MSRVTT_transformer.pickle")
    else:
        raise ValueError("Text features {} not recognised".format(text_feat))
    feat_paths["speech"] = pjoin(root_feat, "stt_w2v.pickle")
    feat_paths["ocr"] = pjoin(root_feat, "MSR_VTT_all_text_w2v.pkl")

    # drop features which have not been requested
    feat_paths = {key: val for key, val in feat_paths.items()
                  if key in self.ordered_experts}
    features = {expert: memcache(path) for expert, path in feat_paths.items()}

    # we handle ocr separately from the other experts, for backwards compatibility
    canon_feats = {}
    for expert, feats in features.items():
        if expert != "ocr":
            canon_feats[expert] = self.canonical_features(feats)
        else:
            raw_dim = self.raw_input_dims[expert]
            canon_feats[expert] = self.canonical_features(feats, raw_dim=raw_dim)
    self.features = canon_feats

    self.raw_captions = memcache(Path(self.data_dir) / "processing/raw-captions.pkl")
    self.text_features = memcache(text_feat_path)

    if self.restrict_train_captions:
        # hash the video names to avoid O(n) lookups in long lists
        train_list = set(self.train_list)
        for key, val in self.text_features.items():
            if key not in train_list:
                continue
            msg = "expected text features to be lists with length 19 or 20"
            assert isinstance(val, list) and len(val) in {19, 20}, msg
            # restrict to the first N captions (deterministic)
            self.text_features[key] = val[:self.restrict_train_captions]