def collate_data(self, data):
    """Collate a list of per-sample feature dicts into a single minibatch.

    Args:
        data: list of dicts, one per sample.  For every expert name in
            ``self.ordered_experts`` each dict is expected to hold the
            feature array under ``expert``, an availability indicator under
            ``f"{expert}_ind"`` and, for the variable-length experts, the
            valid length under ``f"{expert}_sz"``.  The caption features are
            stored under ``"text"`` with their length under ``"text_sz"``.
            (Schema inferred from the keys read below — confirm against the
            dataset's ``__getitem__``.)

    Returns:
        dict with keys ``"text"`` (float tensor of batched captions,
        post-processed by ``self.unsqueeze_text``), ``"experts"``
        (OrderedDict of float feature tensors) and ``"ind"`` (per-expert
        availability tensors).
    """
    ind = {
        expert: [x[f"{expert}_ind"] for x in data]
        for expert in self.ordered_experts
    }
    ind = {key: ensure_tensor(np.array(val)) for key, val in ind.items()}
    experts = []
    for expert in self.ordered_experts:
        if expert in {"audio", "ocr"}:
            # zero pad the variable length inputs to the shortest length that
            # still fits every sample in the batch.  NOTE(review): the slice
            # only truncates — np.stack requires every ``x[expert][:pad_to]``
            # to already have length >= pad_to, i.e. the features appear to be
            # stored pre-padded; confirm upstream.
            pad_to = max([x[f"{expert}_sz"] for x in data])
            val = th.from_numpy(
                np.stack([x[expert][:pad_to] for x in data], axis=0))
        else:
            # fixed-size experts are simply stacked into (batch, dim)
            val = th.from_numpy(np.vstack([x[expert] for x in data]))
        experts.append((expert, val.float()))
    experts = OrderedDict(experts)
    # Similarly, we zero pad the text to the shortest possible length
    # (fix: the original used f"text_sz", an f-string with no placeholder)
    pad_to = max([x["text_sz"] for x in data])
    text = th.from_numpy(np.array([x["text"][:pad_to] for x in data])).float()
    text = self.unsqueeze_text(text)
    return {"text": text, "experts": experts, "ind": ind}
def get_retrieval_data(self):
    """Return the tensors used for retrieval evaluation plus their metadata.

    Returns:
        tuple ``(retrieval_data, meta)`` where ``retrieval_data`` holds the
        caption tensor ("text"), per-expert float feature tensors
        ("experts") and the availability indicators ("ind"), and ``meta``
        carries the query masks, raw captions and video paths used for
        evaluation/visualisation.
    """
    expert_feats = OrderedDict()
    for modality in self.ordered_experts:
        expert_feats[modality] = th.from_numpy(self.retrieval[modality]).float()
    retrieval_data = {
        "text": ensure_tensor(self.text_retrieval).float(),
        "experts": expert_feats,
        "ind": self.test_ind,
    }
    meta = {
        "query_masks": self.query_masks,
        "raw_captions": self.raw_captions_retrieval,
        "paths": self.video_path_retrieval,
    }
    return retrieval_data, meta
def get_retrieval_data(self):
    """Return the tensors used for retrieval evaluation plus their metadata.

    Returns:
        tuple ``(retrieval_data, meta)`` where ``retrieval_data`` holds the
        caption tensor ("text"), per-expert float feature tensors
        ("experts") and the availability indicators ("ind"), and ``meta``
        carries the query masks, raw captions and video paths used for
        evaluation/visualisation.
    """
    # fix: removed a dead `if False:` sanity-check block that imported pickle
    # and ipdb, read a hard-coded /tmp pickle path and contained a breakpoint
    # (ipdb.set_trace()) — unreachable debug code left over from development.
    experts = OrderedDict(
        (expert, th.from_numpy(self.retrieval[expert]).float())
        for expert in self.ordered_experts
    )
    retrieval_data = {
        "text": ensure_tensor(self.text_retrieval).float(),
        "experts": experts,
        "ind": self.test_ind,
    }
    meta = {
        "query_masks": self.query_masks,
        "raw_captions": self.raw_captions_retrieval,
        "paths": self.video_path_retrieval,
    }
    return retrieval_data, meta
def collate_data(self, data):
    """Collate per-sample feature dicts into a minibatch for the current task.

    Fixed-size experts are stacked directly; variable-length experts are
    zero-initialised to ``self.max_tokens[expert]`` tokens and filled up to
    each sample's length, with entirely-missing modalities set to
    ``self.MISSING_VAL`` (presumably NaN — confirm against class constant).
    Depending on ``self.task``, text tensors ("retrieval") or label tensors
    and video names ("classification") are added to the output.

    Args:
        data: list of per-sample dicts providing, for each expert, the
            feature array under ``expert`` and an availability flag under
            ``f"{expert}_ind"``; plus task-specific keys ("text", "labels",
            "vid") — schema inferred from the keys read below.

    Returns:
        dict with "experts" and "ind", plus "text"/"text_token_mask" for
        retrieval or "labels"/"vid_name" for classification (partition
        dependent).
    """
    batch_size = len(data)
    tensors = {}
    # Fixed-size experts: some are stored with a temporal axis (those listed
    # in self.trn_config), the rest as a single feature vector per video.
    for expert in self.tensor_storage["fixed"]:
        if expert in self.trn_config.keys():
            tensors[expert] = np.zeros(
                (batch_size, self.trn_config[expert], self.raw_input_dims[expert]))
        else:
            tensors[expert] = np.zeros(
                (batch_size, self.raw_input_dims[expert]))
    # Track which indices of each modality are available in the present batch
    ind = {expert: np.zeros(batch_size) for expert in self.experts}
    # Variable-length experts get a fixed token budget of max_tokens[expert].
    tensors.update({
        expert: np.zeros((batch_size, self.max_tokens[expert],
                          self.raw_input_dims[expert]))
        for expert in self.tensor_storage["variable"]
    })
    # Task-specific output buffers.
    if "retrieval" in self.task:
        text_tensor = np.zeros((batch_size, self.captions_per_video,
                                self.max_tokens["text"], self.text_dim))
        text_token_mask = np.zeros((batch_size, self.captions_per_video))
    elif "classification" in self.task and self.class_type == "single_label":
        label_tensor = np.zeros(batch_size)
        vid_name = []
    elif "classification" in self.task and self.class_type == "multi_label":
        label_tensor = np.zeros((batch_size, self.num_classes))
        vid_name = []
    for ii, _ in enumerate(data):
        datum = data[ii]
        for expert in self.experts:
            ind[expert][ii] = datum[f"{expert}_ind"]
        for expert in self.tensor_storage["fixed"]:
            tensors[expert][ii] = datum[expert]
        for expert in self.tensor_storage["variable"]:
            if ind[expert][ii]:
                # Copy up to the token budget; shorter inputs stay zero-padded.
                keep = min(len(datum[expert]), self.max_tokens[expert])
                if keep:
                    tensors[expert][ii, :keep, :] = datum[expert][:keep]
            else:
                # Modality absent for this sample: mark the whole slot.
                tensors[expert][ii, :, :] = self.MISSING_VAL
        if "retrieval" in self.task:
            text = datum["text"]
            for jj in range(self.captions_per_video):
                keep = min(len(text[jj]), self.max_tokens["text"])
                text_tensor[ii, jj, :keep, :] = text[jj][:keep]
                # mask stores the number of valid tokens, not a boolean
                text_token_mask[ii, jj] = keep
        elif self.task == "classification":
            # NOTE(review): this branch tests equality while the buffers above
            # are created when "classification" is merely a substring of
            # self.task — confirm the two conditions always agree.
            if self.cls_partition != 'test':
                label_tensor[ii] = datum["labels"]
            vid_name.append(datum["vid"])
    ind = {key: ensure_tensor(val) for key, val in ind.items()}
    experts = OrderedDict((expert, th.from_numpy(tensors[expert]).float())
                          for expert in self.ordered_experts)
    for expert in self.experts:
        if self.feat_aggregation[expert].get("binarise", False):
            # Binarised experts: any row whose leading element is non-NaN is
            # replaced wholesale with ones (presence indicator only).
            replace = np.logical_not(th.isnan(experts[expert][:, 0, 0]))
            experts[expert][replace] = th.ones_like(
                experts[expert][replace])
    minibatch = {"experts": experts, "ind": ind}
    if "retrieval" in self.task:
        minibatch["text"] = th.from_numpy(text_tensor).float()
        minibatch["text_token_mask"] = th.from_numpy(text_token_mask)
    elif self.task == "classification":
        if self.cls_partition != 'test':
            minibatch["labels"] = th.from_numpy(label_tensor).float()
        if self.cls_partition != "train":
            # we only pass the video names for visualisation and making predictions
            # on the val/test set
            minibatch["vid_name"] = vid_name
    return minibatch
def collate_data(self, data):
    """Collate per-sample feature dicts into a retrieval minibatch.

    RGB features are batched with a shot axis ``(batch, rgb_shots, dim)``,
    the remaining fixed-size experts as ``(batch, dim)``, and each
    variable-length expert into a zero-padded buffer of
    ``self.max_expert_tokens`` tokens.  Captions are zero-padded to
    ``self.max_text_words`` per caption.

    Args:
        data: list of per-sample dicts providing, for each expert, the
            feature array under ``expert`` and an availability flag under
            ``f"{expert}_ind"``, plus the caption features under ``"text"``
            (schema inferred from the keys read below).

    Returns:
        dict with keys "text" (float caption tensor), "experts"
        (OrderedDict of float feature tensors) and "ind" (availability
        indicator tensors).
    """
    # fix: removed a dead `if False:` sanity-check block that imported pickle
    # and ipdb, read a hard-coded /tmp pickle path and contained a breakpoint
    # (ipdb.set_trace()) — unreachable debug code left over from development.
    batch_size = len(data)
    # Track which indices of each modality are available in the present batch
    ind = {expert: np.zeros(batch_size) for expert in self.experts}
    # as above, we handle rgb separately from other fixed_sz experts
    tensors = {expert: np.zeros((batch_size, self.raw_input_dims[expert]))
               for expert in self.fixed_sz_experts if expert != "rgb"}
    tensors["rgb"] = np.zeros((batch_size, self.rgb_shots,
                               self.raw_input_dims["rgb"]))
    tensors.update({expert: np.zeros(
        (batch_size, self.max_expert_tokens, self.raw_input_dims[expert])
    ) for expert in self.variable_sz_experts})
    text_tensor = np.zeros((batch_size, self.captions_per_video,
                            self.max_text_words, self.text_dim))
    for ii, _ in enumerate(data):
        datum = data[ii]
        for expert in self.experts:
            ind[expert][ii] = datum[f"{expert}_ind"]
        # It is preferable to explicitly pass NaNs into the network as missing
        # values, over simply zeros, to avoid silent failures
        for expert in self.fixed_sz_experts:
            tensors[expert][ii] = datum[expert]
        for expert in self.variable_sz_experts:
            # Copy up to the token budget; shorter inputs stay zero-padded.
            keep = min(len(datum[expert]), self.max_expert_tokens)
            if keep:
                tensors[expert][ii, :keep, :] = datum[expert][:keep]
        text = datum["text"]
        for jj in range(self.captions_per_video):
            keep = min(len(text[jj]), self.max_text_words)
            text_tensor[ii, jj, :keep, :] = text[jj][:keep]
    ind = {key: ensure_tensor(val) for key, val in ind.items()}
    text = th.from_numpy(text_tensor).float()
    experts = OrderedDict(
        (expert, th.from_numpy(tensors[expert]).float())
        for expert in self.ordered_experts)
    minibatch = {"text": text, "experts": experts, "ind": ind}
    return minibatch