def __init__(
    self,
    config: Dict[str, Union[int, str]],
    dialogs_jsonpath: str,
    dense_annotations_jsonpath: Optional[str] = None,
    overfit: bool = False,
    in_memory: bool = False,
):
    super().__init__()
    self.config = config
    self.dialogs_reader = DialogsReader(dialogs_jsonpath)

    if "val" in self.split and dense_annotations_jsonpath is not None:
        self.annotations_reader = DenseAnnotationsReader(dense_annotations_jsonpath)
    else:
        self.annotations_reader = None

    self.vocabulary = Vocabulary(
        config["word_counts_json"], min_count=config["vocab_min_count"]
    )

    # Initialize image features reader according to split.
    image_features_hdfpath = config["image_features_train_h5"]
    if "val" in self.dialogs_reader.split:
        image_features_hdfpath = config["image_features_val_h5"]
    elif "test" in self.dialogs_reader.split:
        image_features_hdfpath = config["image_features_test_h5"]

    self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

    # Keep a list of image_ids as primary keys to access data.
    self.image_ids = list(self.dialogs_reader.dialogs.keys())
    if overfit:
        self.image_ids = self.image_ids[:5]
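# --- Usage sketch (not part of the original code) ---------------------------
# A minimal construction example for the variant above, assuming it is the
# __init__ of the usual VisDialDataset class and that the "dataset" section of
# the project's YAML config provides the keys read here (word_counts_json,
# vocab_min_count, image_features_*_h5, ...). File names are hypothetical.
# Dense annotations are only attached when the loaded split contains "val".
import yaml

with open("configs/visdial.yml") as f:  # hypothetical config file
    dataset_config = yaml.safe_load(f)["dataset"]

val_dataset = VisDialDataset(
    dataset_config,
    dialogs_jsonpath="data/visdial_1.0_val.json",
    dense_annotations_jsonpath="data/visdial_1.0_val_dense_annotations.json",
)
print(len(val_dataset), val_dataset.split)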
def __init__(
    self,
    config: Dict[str, Any],
    dialogs_jsonpath: str,
    dense_annotations_jsonpath: Optional[str] = None,
    return_adjusted_gt_relevance: bool = False,
    overfit: bool = False,
    in_memory: bool = False,
    return_options: bool = True,
    add_boundary_toks: bool = False,
):
    super().__init__()
    self.config = config
    self.return_options = return_options
    self.return_adjusted_gt_relevance = return_adjusted_gt_relevance
    self.add_boundary_toks = add_boundary_toks
    self.dialogs_reader = DialogsReader(dialogs_jsonpath)

    if (("val" in self.split or "dense" in self.split)
            and dense_annotations_jsonpath is not None):
        self.annotations_reader = DenseAnnotationsReader(
            dense_annotations_jsonpath)
    else:
        self.annotations_reader = None

    if config['word_embedding_type'] == 'glove':
        self.vocabulary = GloveVocabulary(
            word_counts_path=config['word_counts_json'],
            min_count=config['vocab_min_count'],
            glove_weight_path=config['glove_weight_txt'],
            vec_size=config['glove_emb_dim'],
            glove_vec_num=config['glove_vec_num'])
    else:
        self.vocabulary = Vocabulary(
            word_counts_path=config["word_counts_json"],
            min_count=config["vocab_min_count"])

    # Initialize image features reader according to split.
    image_features_hdfpath = config["image_features_train_h5"]
    if ("val" in self.dialogs_reader.split
            and "fake" not in self.dialogs_reader.split):
        image_features_hdfpath = config["image_features_val_h5"]
    elif "test" in self.dialogs_reader.split:
        image_features_hdfpath = config["image_features_test_h5"]

    self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

    # Keep a list of image_ids as primary keys to access data.
    self.image_ids = list(self.dialogs_reader.dialogs.keys())
    if overfit:
        self.image_ids = self.image_ids[:5]
def __init__(
    self,
    config: Dict[str, Any],
    dialogs_jsonpath: str,
    dense_annotations_jsonpath: Optional[str] = None,
    overfit: bool = False,
    in_memory: bool = False,
    num_workers: int = 1,
    return_options: bool = True,
    add_boundary_toks: bool = False,
):
    super().__init__()
    self.config = config
    self.return_options = return_options
    self.add_boundary_toks = add_boundary_toks
    self.dialogs_reader = DialogsReader(
        dialogs_jsonpath,
        num_examples=(5 if overfit else None),
        num_workers=num_workers)

    if "val" in self.split and dense_annotations_jsonpath is not None:
        self.annotations_reader = DenseAnnotationsReader(
            dense_annotations_jsonpath)
    else:
        self.annotations_reader = None

    self.vocabulary = Vocabulary(config["word_counts_json"],
                                 min_count=config["vocab_min_count"])

    # Initialize image features reader according to split.
    image_features_hdfpath = config["image_features_train_h5"]
    if "val" in self.dialogs_reader.split:
        image_features_hdfpath = config["image_features_val_h5"]
    elif "test" in self.dialogs_reader.split:
        image_features_hdfpath = config["image_features_test_h5"]

    self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

    # Keep a list of image_ids as primary keys to access data.
    self.image_ids = list(self.dialogs_reader.dialogs.keys())
    if overfit:
        self.image_ids = self.image_ids[:5]
def test_encoder():
    img_feat = torch.randn(4, 36, 2048)

    seq_size = 20
    ques = torch.randperm(seq_size).view(1, seq_size)  # (batch, seq_len)
    ques = ques.unsqueeze(1).repeat(4, 10, 1)  # (4, 10, 20)
    ques_len = torch.LongTensor([6, 5, 4, 3]).unsqueeze(1).repeat(1, 10)  # (4, 10)

    config = {
        "use_hist": False,
        "use_bert": False,
        "img_feature_size": 2048,
        "word_embedding_size": 300,
        "bert_embedding_size": 768,
        "lstm_hidden_size": 512,
        "lstm_num_layers": 2,
        "dropout": 0.5,
        "word_counts_json": '../data/visdial_1.0_word_counts_train.json',
        "concat_history": False,
        "vocab_min_count": 5
    }
    vocabulary = Vocabulary(config["word_counts_json"],
                            min_count=config["vocab_min_count"])

    net = MCANImgOnlyEncoder(config, vocabulary)
    opts = {'img_feat': img_feat, 'ques': ques, 'ques_len': ques_len}
    fused_embedding = net(opts)
    print(fused_embedding.size())
# ================================================================================================
#   INPUT ARGUMENTS AND CONFIG
# ================================================================================================

args = parser.parse_args(args=[])

# keys: {"dataset", "model", "solver"}
with open(args.config_yml) as f:
    config = yaml.safe_load(f)

# ================================================================================================
#   SETUP DATASET
# ================================================================================================

vocabulary = Vocabulary(
    config["dataset"]["word_counts_json"],
    min_count=config["dataset"]["vocab_min_count"]
)


def loadGloveModel(gloveFile):
    """Load pretrained GloVe vectors from a whitespace-separated text file."""
    print("Loading pretrained word vectors...")
    model = {}
    with open(gloveFile, 'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            model[word] = embedding
    print("Done. {} words loaded!".format(len(model)))
    return model


glove = loadGloveModel(args.pretrained_txt)
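# --- Follow-up sketch (not part of the original code) -----------------------
# A common next step, and essentially what the later glove_token script does:
# turn the word->vector dict into an embedding matrix aligned with Vocabulary
# indices so it can initialise nn.Embedding. Assumes 300-d vectors and uses
# only the to_indices()/len() interface seen elsewhere in this code; rows for
# words without a GloVe vector keep their random initialisation.
import numpy as np
import torch

emb_dim = 300  # assumed GloVe dimensionality
weights = np.random.normal(0, 0.1, size=(len(vocabulary), emb_dim)).astype(np.float32)

for word, vector in glove.items():
    # to_indices maps unknown words to the UNK index; as in the later script,
    # the last write to that row wins.
    index = vocabulary.to_indices([word])[0]
    if vector.shape[0] == emb_dim:
        weights[index] = vector

word_embed = torch.nn.Embedding.from_pretrained(torch.from_numpy(weights), freeze=False)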
class VisDialDataset(Dataset):
    def __init__(self,
                 visdial_jsonpath: str,
                 config: Dict[str, Union[int, str]],
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.json_reader = VisDialJsonReader(visdial_jsonpath)
        self.vocabulary = Vocabulary(
            config["word_counts_json"], min_count=config["vocab_min_count"]
        )

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.json_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.json_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]
        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.json_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.json_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader.
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # Normalize image features at zero-th dimension (since there's no batch dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.json_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # Convert word tokens of caption, question, answer and answer options to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(dialog[i]["question"])
            dialog[i]["answer"] = self.vocabulary.to_indices(dialog[i]["answer"])
            for j in range(len(dialog[i]["answer_options"])):
                dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                    dialog[i]["answer_options"][j]
                )

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog]
        )
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog]
        )

        answer_options = []
        answer_option_lengths = []
        for dialog_round in dialog:
            options, option_lengths = self._pad_sequences(dialog_round["answer_options"])
            answer_options.append(options)
            answer_option_lengths.append(option_lengths)
        answer_options = torch.stack(answer_options, 0)

        if "test" not in self.split:
            answer_indices = [dialog_round["gt_index"] for dialog_round in dialog]

        # Collect everything as tensors for ``collate_fn`` of dataloader to work seamlessly;
        # questions, history, etc. are converted to LongTensors, for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["opt"] = answer_options.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["opt_len"] = torch.tensor(answer_option_lengths).long()
        item["num_rounds"] = torch.tensor(visdial_instance["num_rounds"]).long()
        if "test" not in self.split:
            item["ans_ind"] = torch.tensor(answer_indices).long()
        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer options,
        tokenized in ``__getitem__``), pad them to maximum specified sequence length.
        Return as a tensor of size ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately for
        readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences before padding.
        """
        for i in range(len(sequences)):
            sequences[i] = sequences[i][: self.config["max_sequence_length"] - 1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True, padding_value=self.vocabulary.PAD_INDEX
        )
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(self,
                     caption: List[int],
                     questions: List[List[int]],
                     answers: List[List[int]]):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[: self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][: self.config["max_sequence_length"] - 1]

        for i in range(len(answers)):
            answers[i] = answers[i][: self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config["concat_history"]:
            # Concatenated history has similar structure as history, except it contains
            # concatenated QA pairs from previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = self.config["max_sequence_length"] * 2 * len(history)
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True, padding_value=self.vocabulary.PAD_INDEX
        )
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths
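# --- Usage sketch (not part of the original code) ---------------------------
# Feeding the dataset above to a DataLoader. Paths and config values are
# hypothetical, but the keys are exactly the ones __getitem__ reads, and the
# commented shapes follow from max_sequence_length=20, 10 dialog rounds and
# 100 answer options per round.
from torch.utils.data import DataLoader

dataset = VisDialDataset(
    "data/visdial_1.0_val.json",
    config={
        "word_counts_json": "data/visdial_1.0_word_counts_train.json",
        "vocab_min_count": 5,
        "image_features_train_h5": "data/features_train.h5",
        "image_features_val_h5": "data/features_val.h5",
        "image_features_test_h5": "data/features_test.h5",
        "img_norm": 1,
        "max_sequence_length": 20,
        "concat_history": False,
    },
)
loader = DataLoader(dataset, batch_size=8, shuffle=False)

batch = next(iter(loader))
print(batch["ques"].shape)  # (8, 10, 20): batch x rounds x max_sequence_length
print(batch["opt"].shape)   # (8, 10, 100, 20): 100 answer options per round
print(batch["hist"].shape)  # (8, 10, 40) when concat_history is False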
class VisDialDataset(Dataset):
    """
    A full representation of VisDial v1.0 (train/val/test) dataset. According
    to the appropriate split, it returns dictionary of question, image,
    history, ground truth answer, answer options, dense annotations etc.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        dialogs_jsonpath: str,
        dense_annotations_jsonpath: Optional[str] = None,
        overfit: bool = False,
        in_memory: bool = False,
        return_options: bool = True,
        add_boundary_toks: bool = False,
        sample_flag: bool = False,
    ):
        super().__init__()
        self.config = config
        self.return_options = return_options
        self.add_boundary_toks = add_boundary_toks
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        if not sample_flag:
            self.image_ids = list(self.dialogs_reader.dialogs.keys())

        # Load question-type (qt) information.
        with open('data/qt_count.json', 'r') as qt_file:
            self.qt_file = json.load(qt_file)
        self.qt_list = list(self.qt_file.keys())
        for i in range(len(self.qt_list)):
            self.qt_list[i] = word_tokenize(self.qt_list[i])

        if sample_flag:
            # Use only the image_ids from the dense-annotation training sample.
            with open('data/visdial_1.0_train_dense_sample.json', 'r') as samplefile:
                sample = json.load(samplefile)
            ndcg_id_list = []
            for idx in range(len(sample)):
                ndcg_id_list.append(sample[idx]['image_id'])
            self.image_ids = ndcg_id_list

        if overfit:
            self.image_ids = self.image_ids[:5]

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader.
        image_features = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        # Normalize image features at zero-th dimension (since there's no batch
        # dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]
        opt_idx = visdial_instance['opt_list']

        # Question-type defaults: 55 question types in total, index 54 means
        # "no matching type"; qt_len is the matched prefix length (at most
        # about 6 tokens), default 2.
        qt = torch.full([10, 1], 54)
        qt_len = torch.full([10, 1], 2)

        # Convert word tokens of caption, question, answer and answer options
        # to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            # Match the (still tokenized) question against known question-type
            # prefixes before converting it to indices.
            question_tmp = dialog[i]["question"]
            for k in range(len(self.qt_list)):
                if self.qt_list[k] == question_tmp[0:len(self.qt_list[k])]:
                    qt[i] = k
                    qt_len[i] = len(self.qt_list[k])

            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"])
            if self.add_boundary_toks:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    [self.vocabulary.SOS_TOKEN] + dialog[i]["answer"] +
                    [self.vocabulary.EOS_TOKEN])
            else:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    dialog[i]["answer"])

            if self.return_options:
                for j in range(len(dialog[i]["answer_options"])):
                    if self.add_boundary_toks:
                        dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                            [self.vocabulary.SOS_TOKEN] +
                            dialog[i]["answer_options"][j] +
                            [self.vocabulary.EOS_TOKEN])
                    else:
                        dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                            dialog[i]["answer_options"][j])

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog])
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog],
        )
        answers_in, answer_lengths = self._pad_sequences(
            [dialog_round["answer"][:-1] for dialog_round in dialog])
        answers_out, _ = self._pad_sequences(
            [dialog_round["answer"][1:] for dialog_round in dialog])
        answers, _ = self._pad_sequences(
            [dialog_round["answer"] for dialog_round in dialog])

        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly; questions, history, etc. are converted to
        # LongTensors, for nn.Embedding input.
        item = {}
        item['opt_idx'] = torch.tensor(opt_idx).long()
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["qt"] = qt.long()
        item["qt_len"] = qt_len.long()
        item["ans"] = answers.long()
        item["hist"] = history.long()
        item["ans_in"] = answers_in.long()
        item["ans_out"] = answers_out.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["ans_len"] = torch.tensor(answer_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]).long()

        if self.return_options:
            if self.add_boundary_toks:
                answer_options_in, answer_options_out, answer_options = [], [], []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        [option[:-1] for option in dialog_round["answer_options"]])
                    answer_options_in.append(options)

                    options, _ = self._pad_sequences(
                        [option[1:] for option in dialog_round["answer_options"]])
                    answer_options_out.append(options)

                    options, _ = self._pad_sequences(
                        [option[:] for option in dialog_round["answer_options"]])
                    answer_options.append(options)

                    answer_option_lengths.append(option_lengths)
                answer_options_in = torch.stack(answer_options_in, 0)
                answer_options_out = torch.stack(answer_options_out, 0)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_in"] = answer_options_in.long()
                item["opt_out"] = answer_options_out.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()
            else:
                answer_options = []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        dialog_round["answer_options"])
                    answer_options.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()

            if "test" not in self.split:
                answer_indices = [
                    dialog_round["gt_index"] for dialog_round in dialog
                ]
                item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
        if "val" in self.split:
            dense_annotations = self.annotations_reader[image_id]
            item["relevance"] = torch.tensor(
                dense_annotations["gt_relevance"]).float()
            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]).long()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer
        options, tokenized in ``__getitem__``), pad them to maximum specified
        sequence length. Return as a tensor of size ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately
        for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences
            before padding.
        """
        for i in range(len(sequences)):
            sequences[i] = sequences[i][:self.config["max_sequence_length"] - 1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(
        self,
        caption: List[int],
        questions: List[List[int]],
        answers: List[List[int]],
    ):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[:self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][:self.config["max_sequence_length"] - 1]

        for i in range(len(answers)):
            answers[i] = answers[i][:self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of
        # previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # Concatenated history has similar structure as history, except it
            # contains concatenated QA pairs from previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = (self.config["max_sequence_length"] * 2 *
                                  len(history))
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX,
        )
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths
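# --- Illustration (not part of the original code) ---------------------------
# The question-type lookup in __getitem__ above is a plain prefix match
# between the tokenized question and each tokenized key of data/qt_count.json.
# The qt_list below is made up; word_tokenize is assumed to be NLTK's, as the
# dataset code appears to use.
from nltk.tokenize import word_tokenize

qt_list = [word_tokenize(qt) for qt in ["what color is", "how many", "is there"]]


def match_question_type(question_tokens, qt_list, default_index=54):
    """Mirror of the loop above: every type is checked, so a later matching
    prefix overrides an earlier one; the defaults match the dataset's."""
    qt, qt_len = default_index, 2
    for k, qt_tokens in enumerate(qt_list):
        if question_tokens[:len(qt_tokens)] == qt_tokens:
            qt, qt_len = k, len(qt_tokens)
    return qt, qt_len


print(match_question_type(word_tokenize("how many dogs are there ?"), qt_list))  # (1, 2)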
class VisDialDataset(Dataset): """ A full representation of VisDial v1.0 (train/val/test) dataset. According to the appropriate split, it returns dictionary of question, image, history, ground truth answer, answer options, dense annotations etc. """ def __init__( self, config: Dict[str, Any], dialogs_jsonpath: str, dense_annotations_jsonpath: Optional[str] = None, augment_dense_annotations_jsonpath: Optional[str] = None, use_pretrained_emb: bool = False, qa_emb_file_path: Optional[str] = None, # SA: todo remove this hist_emb_file_path: Optional[str] = None, # SA: todo remove this use_caption: bool = True, num_hist_turns: int = 10, finetune: bool = False, overfit: bool = False, in_memory: bool = False, num_workers: int = 1, return_options: bool = True, add_boundary_toks: bool = False): super().__init__() self.config = config # SA: embedding reader self.use_pretrained_emb = use_pretrained_emb self.return_options = return_options self.add_boundary_toks = add_boundary_toks self.dialogs_reader = DialogsReader( dialogs_jsonpath, num_examples=(5 if overfit else None), num_workers=num_workers, use_pretrained_emb=self.use_pretrained_emb) self.finetune = finetune self.use_caption = use_caption # SA: embedding reader if self.use_pretrained_emb: assert qa_emb_file_path, "Did you forget to set emb file path?" # @todo: for now coming through argparse self.qa_emb_file_path = qa_emb_file_path self.hist_emb_file_path = hist_emb_file_path # hist_emb_file_path = config["hist_emb_file_path"] # TransformerEmbeddingsHdfReader(embedding_path, in_memory) # self.embedding_reader = TransformerEmbeddingsHdfReader(hist_emb_file_path, # in_memory) self.question_reader = QuesEmbeddingsHdfReader( qa_emb_file_path, in_memory) self.ans_reader = AnswerEmbeddingsHdfReader( qa_emb_file_path, in_memory) self.caption_reader = CaptionEmbeddingsHdfReader( qa_emb_file_path, in_memory) # SA: we dont pass in_memory here because history is too big # SA: todo this key would change self.hist_reader = HistEmbeddingsHdfReader(hist_emb_file_path, hdfs_key="hist") # SA: if finetuning for train/val otherwise just validation set if self.finetune or ("val" in self.split and dense_annotations_jsonpath is not None): self.annotations_reader = DenseAnnotationsReader( dense_annotations_jsonpath) else: self.annotations_reader = None if augment_dense_annotations_jsonpath is not None: self.augmented_annotations_reader = AugmentedDenseAnnotationsReader( augment_dense_annotations_jsonpath) self.use_augment_dense = True else: self.use_augment_dense = False self.vocabulary = Vocabulary(config["word_counts_json"], min_count=config["vocab_min_count"]) # Initialize image features reader according to split. image_features_hdfpath = config["image_features_train_h5"] if "val" in self.dialogs_reader.split: image_features_hdfpath = config["image_features_val_h5"] elif "test" in self.dialogs_reader.split: image_features_hdfpath = config["image_features_test_h5"] self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath, in_memory) # Keep a list of image_ids as primary keys to access data. 
# For finetune we use only those image id where we have dense annotations if self.finetune: self.image_ids = list(self.annotations_reader.keys) else: self.image_ids = list(self.dialogs_reader.dialogs.keys()) if overfit: self.image_ids = self.image_ids[:5] @property def split(self): return self.dialogs_reader.split def __len__(self): return len(self.image_ids) def __getitem__(self, index): # start = time.time() # Get image_id, which serves as a primary key for current instance. image_id = self.image_ids[index] # Get image features for this image_id using hdf reader. image_features = self.hdf_reader[image_id] image_features = torch.tensor(image_features) # Normalize image features at zero-th dimension (since there's no batch # dimension). if self.config["img_norm"]: image_features = normalize(image_features, dim=0, p=2) # Retrieve instance for this image_id using json reader. visdial_instance = self.dialogs_reader[image_id] caption = visdial_instance["caption"] dialog = visdial_instance["dialog"] # SA: reading embeddings here if self.use_pretrained_emb: # We need indexes to actually call the readers here now. dialog_with_index = visdial_instance["dialog_with_index"] original_index = visdial_instance["original_index"] assert len(dialog) == len( dialog_with_index ), "These should be equal => just saving the index instead of string" # ideally should be in if-else clause ques_embeddings = [] ans_embeddings = [] opts_embeddings = [] # Convert word tokens of caption, question, answer and answer options # to integers. caption = self.vocabulary.to_indices(caption) for i in range(len(dialog)): # SA: using embeddings here in the same loop if self.use_pretrained_emb: # SA: todo We dont need caption embeddings when we already have history??? # caption_embedding = self.caption_reader[original_index] ques_embeddings.append( self.question_reader[dialog_with_index[i]["question"]]) ans_embeddings.append( self.ans_reader[dialog_with_index[i]["answer"]]) # SA: original code dialog[i]["question"] = self.vocabulary.to_indices( dialog[i]["question"]) if self.add_boundary_toks: dialog[i]["answer"] = self.vocabulary.to_indices( [self.vocabulary.SOS_TOKEN] + dialog[i]["answer"] + [self.vocabulary.EOS_TOKEN]) else: dialog[i]["answer"] = self.vocabulary.to_indices( dialog[i]["answer"]) # for disc decoder if self.return_options: # Ideally should be in if-else clause opts_round_embeddings = [] for j in range(len(dialog[i]["answer_options"])): # SA: trying option encodings here now if self.use_pretrained_emb: opts_round_embeddings.append(self.ans_reader[ dialog_with_index[i]["answer_options"][j]]) if self.add_boundary_toks: dialog[i]["answer_options"][ j] = self.vocabulary.to_indices( [self.vocabulary.SOS_TOKEN] + dialog[i]["answer_options"][j] + [self.vocabulary.EOS_TOKEN]) else: dialog[i]["answer_options"][ j] = self.vocabulary.to_indices( dialog[i]["answer_options"][j]) # Ideally should be in if-else clause opts_embeddings.append(opts_round_embeddings) questions, question_lengths = self._pad_sequences( [dialog_round["question"] for dialog_round in dialog]) history, history_lengths = self._get_history( caption, [dialog_round["question"] for dialog_round in dialog], [dialog_round["answer"] for dialog_round in dialog], ) answers_in, answer_lengths = self._pad_sequences( [dialog_round["answer"][:-1] for dialog_round in dialog]) answers_out, _ = self._pad_sequences( [dialog_round["answer"][1:] for dialog_round in dialog]) # Collect everything as tensors for ``collate_fn`` of dataloader to # work seamlessly questions, 
history, etc. are converted to # LongTensors, for nn.Embedding input. item = {} item["img_ids"] = torch.tensor(image_id).long() item["img_feat"] = image_features item["ques"] = questions.long() item["hist"] = history.long() item["ans_in"] = answers_in.long( ) # SA: probably useful for training gen item["ans_out"] = answers_out.long( ) # SA: probably useful for training gen item["ques_len"] = torch.tensor(question_lengths).long() item["hist_len"] = torch.tensor(history_lengths).long() item["ans_len"] = torch.tensor(answer_lengths).long() item["num_rounds"] = torch.tensor( visdial_instance["num_rounds"]).long() ## SA: pretrained embedding here if self.use_pretrained_emb: # See https://github.com/pytorch/pytorch/issues/13918 item["ques_embeddings"] = torch.tensor( np.array(ques_embeddings)).float() # now (10, 20, 768) ==> will be (bs, 10, 20, 768) (bert embeddings) item["opts_embeddings"] = torch.tensor( np.array(opts_embeddings)).float() # ans_embeddings = torch.tensor(np.array(ans_embeddings)).float() # caption_embedding = torch.tensor(np.array(caption_embedding)).float() # SA: todo proxy hist embeddings # hist_embeddings = self._get_history_embedding(caption_embedding, item["ques_embeddings"], # ans_embeddings) item["hist_embeddings"] = self.hist_reader[image_id] # (10, 100, 20, 768) ==> will be (bs, 10, 100, 20, 768) (bert embeddings) if self.return_options: if self.add_boundary_toks: answer_options_in, answer_options_out = [], [] answer_option_lengths = [] for dialog_round in dialog: options, option_lengths = self._pad_sequences([ option[:-1] for option in dialog_round["answer_options"] ]) answer_options_in.append(options) options, _ = self._pad_sequences([ option[1:] for option in dialog_round["answer_options"] ]) answer_options_out.append(options) answer_option_lengths.append(option_lengths) answer_options_in = torch.stack(answer_options_in, 0) answer_options_out = torch.stack(answer_options_out, 0) item["opt_in"] = answer_options_in.long() item["opt_out"] = answer_options_out.long() item["opt_len"] = torch.tensor(answer_option_lengths).long() else: answer_options = [] answer_option_lengths = [] for dialog_round in dialog: options, option_lengths = self._pad_sequences( dialog_round["answer_options"]) answer_options.append(options) answer_option_lengths.append(option_lengths) answer_options = torch.stack(answer_options, 0) # used by disc model ## options_length SA: used by model to select non-zero options item["opt"] = answer_options.long() item["opt_len"] = torch.tensor(answer_option_lengths).long() if "test" not in self.split: answer_indices = [ dialog_round["gt_index"] for dialog_round in dialog ] item["ans_ind"] = torch.tensor( answer_indices).long() # Used by evaluate for ndcg # Gather dense annotations. 
if self.finetune or ("val" in self.split): dense_annotations = self.annotations_reader[image_id] # SA: have to do this because of changed dic key in train if "val" in self.split: item["gt_relevance"] = torch.tensor( dense_annotations["gt_relevance"]).float() elif "train" in self.split: item["gt_relevance"] = torch.tensor( dense_annotations["relevance"]).float() item["round_id"] = torch.tensor( dense_annotations["round_id"]).long() # end = time.time() # time_taken = end - start # print('Time for loading item: ',time_taken) if self.use_augment_dense: augmented_dense_annotations = self.augmented_annotations_reader[ image_id] item["augmented_gt_relevance"] = torch.tensor( augmented_dense_annotations["augmented_gt_relevance"]).float() return item def _pad_sequences(self, sequences: List[List[int]]): """Given tokenized sequences (either questions, answers or answer options, tokenized in ``__getitem__``), padding them to maximum specified sequence length. Return as a tensor of size ``(*, max_sequence_length)``. This method is only called in ``__getitem__``, chunked out separately for readability. Parameters ---------- sequences : List[List[int]] List of tokenized sequences, each sequence is typically a List[int]. Returns ------- torch.Tensor, torch.Tensor Tensor of sequences padded to max length, and length of sequences before padding. """ for i in range(len(sequences)): sequences[i] = sequences[i][:self.config["max_sequence_length"] - 1] sequence_lengths = [len(sequence) for sequence in sequences] # Pad all sequences to max_sequence_length. maxpadded_sequences = torch.full( (len(sequences), self.config["max_sequence_length"]), fill_value=self.vocabulary.PAD_INDEX, ) padded_sequences = pad_sequence( [torch.tensor(sequence) for sequence in sequences], batch_first=True, padding_value=self.vocabulary.PAD_INDEX, ) maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences return maxpadded_sequences, sequence_lengths def _get_history( self, caption: List[int], questions: List[List[int]], answers: List[List[int]], ): # Allow double length of caption, equivalent to a concatenated QA pair. caption = caption[:self.config["max_sequence_length"] * 2 - 1] for i in range(len(questions)): questions[i] = questions[i][:self.config["max_sequence_length"] - 1] for i in range(len(answers)): answers[i] = answers[i][:self.config["max_sequence_length"] - 1] # History for first round is caption, else concatenated QA pair of # previous round. history = [] ## SA: appending EOS after caption caption = caption + [self.vocabulary.EOS_INDEX] if self.use_caption: history.append(caption) else: history.append([self.vocabulary.EOS_INDEX]) # print("Not using caption in history.") for question, answer in zip(questions, answers): history.append(question + answer + [self.vocabulary.EOS_INDEX]) # Drop last entry from history (there's no eleventh question). history = history[:-1] max_history_length = self.config["max_sequence_length"] * 2 if self.config.get("concat_history", False): # Concatenated_history has similar structure as history, except it # contains concatenated QA pairs from previous rounds. 
concatenated_history = [] concatenated_history.append(caption) for i in range(1, len(history)): concatenated_history.append([]) for j in range(i + 1): concatenated_history[i].extend(history[j]) max_history_length = (self.config["max_sequence_length"] * 2 * len(history)) history = concatenated_history history_lengths = [len(round_history) for round_history in history] maxpadded_history = torch.full( (len(history), max_history_length), fill_value=self.vocabulary.PAD_INDEX, ) padded_history = pad_sequence( [torch.tensor(round_history) for round_history in history], batch_first=True, padding_value=self.vocabulary.PAD_INDEX, ) maxpadded_history[:, :padded_history.size(1)] = padded_history return maxpadded_history, history_lengths def _get_history_embedding(self, caption, questions, answers): """ only for one dialogue here num_rounds = 10 :param caption: (40, 768) ==> cross check :param questions: (10, 20, 768) :param answers: (10, 20, 768) :return: """ concatenated_qa_history = torch.cat([questions, answers], 1) # print(concatenated_qa_history.size()) # Drop last concatenated_qa_history = concatenated_qa_history[:-1] caption = caption.unsqueeze(0) # Concatenate along batch now concatenated_qa_history = torch.cat([caption, concatenated_qa_history], 0) # shape (10, 40, 768) if self.config.get("concat_history", False): max_history_length = (self.config["max_sequence_length"] * 2 * len(concatenated_qa_history)) # 400 history_list = [] num_rounds, _, rep_size = concatenated_qa_history.size( ) # (10, 40, 768) # hist_tensor = concatenated_qa_history.view(-1, rep_size) # (10*40, 768) # hist_tensor = hist_tensor.unsqueeze(0).repeat(num_rounds,1,1) # (10, 400, 768) # zero_array = for i in range(1, num_rounds + 1): pad_array = torch.zeros( max_history_length - self.config["max_sequence_length"] * 2 * (i), rep_size) hist_array = concatenated_qa_history[:i].view(-1, rep_size) hist_round = torch.cat([hist_array, pad_array], 0) history_list.append(hist_round) history = torch.stack(history_list, 0) else: history = concatenated_qa_history return history def _get_combined_ques_caption_or_hist(self, caption: List[int], questions: List[List[int]], answers: List[List[int]]): # Allow double length of caption, equivalent to a concatenated QA pair. caption = caption[:self.config["max_sequence_length"] * 2 - 1] for i in range(len(questions)): questions[i] = questions[i][:self.config["max_sequence_length"] - 1] for i in range(len(answers)): answers[i] = answers[i][:self.config["max_sequence_length"] - 1]
        args.val_dense_json,
        overfit=args.overfit,
        in_memory=args.in_memory)
else:
    val_dataset = VisDialDataset(config["dataset"],
                                 args.test_json,
                                 caption_jsonpath=args.captions_test_json,
                                 overfit=args.overfit,
                                 in_memory=args.in_memory)

val_dataloader = DataLoader(val_dataset,
                            batch_size=config["solver"]["batch_size"],
                            num_workers=args.cpu_workers)

with open(config["dataset"]["glovepath"], "r") as glove_file:
    glove = json.load(glove_file)

glovevocabulary = Vocabulary(config["dataset"]["word_counts_json"],
                             min_count=config["dataset"]["vocab_min_count"])

# Pair each GloVe word with its vocabulary index.
KAT = []
for key in glove.keys():
    keylist = [key]
    token = glovevocabulary.to_indices(keylist)
    key_and_token = keylist + token
    KAT.append(key_and_token)

glove_token = {}
for item in KAT:
    glove_token[item[1]] = glove[item[0]]

# Build an index-aligned list of vectors covering the whole vocabulary.
glove_list = []
for i in range(len(glovevocabulary)):
    if i in glove_token.keys():
        glove_list.append(glove_token[i])
    else:
def test_encoder():
    img_feat = torch.randn(4, 36, 2048)

    seq_size = 20
    ques = torch.randperm(seq_size).view(1, seq_size)  # (batch, seq_len)
    ques = ques.unsqueeze(1).repeat(4, 10, 1)  # (4, 10, 20)
    ques_len = torch.LongTensor([6, 5, 4, 3]).unsqueeze(1).repeat(1, 10)

    config = {
        "use_hist": False,
        "use_bert": False,
        "img_feature_size": 2048,
        "word_embedding_size": 300,
        "bert_embedding_size": 768,
        "lstm_hidden_size": 512,
        "lstm_num_layers": 2,
        "dropout": 0.5,
        "word_counts_json": '/scratch/shubham/visdial2019/data/visdial_1.0_word_counts_train.json',
        "concat_history": False,
        "vocab_min_count": 5
    }
    vocabulary = Vocabulary(config["word_counts_json"],
                            min_count=config["vocab_min_count"])

    # net = MCANConcatHistBeforeImgEncoder(config, vocabulary)
    # opts = {'img_feat': img_feat, 'ques': ques, 'ques_len': ques_len}
    # fused_embedding = net(opts)
    # print(fused_embedding.size())

    # With history, not concatenated.
    print("With history concat false")
    config["use_hist"] = True
    net = MCANImgMCANVQAHistAttnEncoder(config, vocabulary)

    seq_size = 400
    hist = torch.randperm(seq_size).view(1, seq_size)  # (batch, seq_len)
    hist = hist.unsqueeze(1).repeat(4, 10, 1)
    hist_len = torch.LongTensor([10, 15, 15, 19]).unsqueeze(1).repeat(1, 10)

    opts = {
        'img_feat': img_feat,
        'ques': ques,
        'ques_len': ques_len,
        'hist': hist,
        'hist_len': hist_len
    }
    fused_embedding = net(opts)
    print(fused_embedding.size())
class VisDialDataset(Dataset):
    """
    A full representation of VisDial v1.0 (train/val/test) dataset. According
    to the appropriate split, it returns dictionary of question, image,
    history, ground truth answer, answer options, dense annotations etc.
    """

    def __init__(self,
                 config: Dict[str, Any],
                 dialogs_jsonpath: str,
                 caption_jsonpath: str,
                 dense_annotations_jsonpath: Optional[str] = None,
                 overfit: bool = False,
                 in_memory: bool = False):
        super().__init__()
        self.config = config
        self.dialogs_reader = DialogsReader(dialogs_jsonpath)

        if "val" in self.split and dense_annotations_jsonpath is not None:
            self.annotations_reader = DenseAnnotationsReader(
                dense_annotations_jsonpath)
        else:
            self.annotations_reader = None

        self.vocabulary = Vocabulary(config["word_counts_json"],
                                     min_count=config["vocab_min_count"])

        # Initialize image features reader according to split.
        image_features_hdfpath = config["image_features_train_h5"]
        if "val" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_val_h5"]
        elif "test" in self.dialogs_reader.split:
            image_features_hdfpath = config["image_features_test_h5"]

        self.hdf_reader = ImageFeaturesHdfReader(image_features_hdfpath,
                                                 in_memory)

        # Keep a list of image_ids as primary keys to access data.
        self.image_ids = list(self.dialogs_reader.dialogs.keys())
        if overfit:
            self.image_ids = self.image_ids[:5]

        self.captions_reader = CaptionReader(caption_jsonpath)

    @property
    def split(self):
        return self.dialogs_reader.split

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Get image_id, which serves as a primary key for current instance.
        image_id = self.image_ids[index]

        # Get image features for this image_id using hdf reader. This reader
        # also returns relation features for the image regions.
        image_features, image_relation = self.hdf_reader[image_id]
        image_features = torch.tensor(image_features)
        image_relation = torch.tensor(image_relation)

        # Normalize image features at zero-th dimension (since there's no
        # batch dimension).
        if self.config["img_norm"]:
            image_features = normalize(image_features, dim=0, p=2)

        # Retrieve instance for this image_id using json reader.
        visdial_instance = self.dialogs_reader[image_id]
        caption = visdial_instance["caption"]
        dialog = visdial_instance["dialog"]

        # Convert word tokens of caption, question, answer and answer options
        # to integers.
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"])
            dialog[i]["answer"] = self.vocabulary.to_indices(
                dialog[i]["answer"])
            for j in range(len(dialog[i]["answer_options"])):
                dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                    dialog[i]["answer_options"][j])

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog])
        history, history_lengths = self._get_history(
            caption, [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog])

        answer_options = []
        answer_option_lengths = []
        for dialog_round in dialog:
            options, option_lengths = self._pad_sequences(
                dialog_round["answer_options"])
            answer_options.append(options)
            answer_option_lengths.append(option_lengths)
        answer_options = torch.stack(answer_options, 0)

        if "test" not in self.split:
            answer_indices = [
                dialog_round["gt_index"] for dialog_round in dialog
            ]

        # Tokenize and pad the additional (multiple) captions for this image.
        captions_dic = self.captions_reader[image_id]
        captions_mul = captions_dic["captions"]
        captions_new = []
        for i in range(len(captions_mul)):
            captions_each = self.vocabulary.to_indices(captions_mul[i])
            captions_new.append(captions_each)
        captions_new, captions_len = self._pad_captions(captions_new)

        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly; questions, history, etc. are converted to
        # LongTensors, for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["relations"] = image_relation
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["opt"] = answer_options.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["opt_len"] = torch.tensor(answer_option_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]).long()

        if "test" not in self.split:
            item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
        if "val" in self.split:
            dense_annotations = self.annotations_reader[image_id]
            item["gt_relevance"] = torch.tensor(
                dense_annotations["gt_relevance"]).float()
            item["round_id"] = torch.tensor(
                dense_annotations["round_id"]).long()

        # Caption-related entries.
        item["captions_len"] = torch.tensor(captions_len).long()
        item["captions"] = captions_new.long()

        return item

    def _pad_sequences(self, sequences: List[List[int]]):
        """Given tokenized sequences (either questions, answers or answer
        options, tokenized in ``__getitem__``), pad them to maximum specified
        sequence length. Return as a tensor of size ``(*, max_sequence_length)``.

        This method is only called in ``__getitem__``, chunked out separately
        for readability.

        Parameters
        ----------
        sequences : List[List[int]]
            List of tokenized sequences, each sequence is typically a List[int].

        Returns
        -------
        torch.Tensor, torch.Tensor
            Tensor of sequences padded to max length, and length of sequences
            before padding.
        """
        for i in range(len(sequences)):
            sequences[i] = sequences[i][:self.config["max_sequence_length"] - 1]
        sequence_lengths = [len(sequence) for sequence in sequences]

        # Pad all sequences to max_sequence_length.
        maxpadded_sequences = torch.full(
            (len(sequences), self.config["max_sequence_length"]),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_sequences = pad_sequence(
            [torch.tensor(sequence) for sequence in sequences],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX)
        maxpadded_sequences[:, :padded_sequences.size(1)] = padded_sequences
        return maxpadded_sequences, sequence_lengths

    def _get_history(self, caption: List[int], questions: List[List[int]],
                     answers: List[List[int]]):
        # Allow double length of caption, equivalent to a concatenated QA pair.
        caption = caption[:self.config["max_sequence_length"] * 2 - 1]

        for i in range(len(questions)):
            questions[i] = questions[i][:self.config["max_sequence_length"] - 1]

        for i in range(len(answers)):
            answers[i] = answers[i][:self.config["max_sequence_length"] - 1]

        # History for first round is caption, else concatenated QA pair of
        # previous round.
        history = []
        history.append(caption)
        for question, answer in zip(questions, answers):
            history.append(question + answer + [self.vocabulary.EOS_INDEX])
        # Drop last entry from history (there's no eleventh question).
        history = history[:-1]
        max_history_length = self.config["max_sequence_length"] * 2

        if self.config.get("concat_history", False):
            # Concatenated history has similar structure as history, except it
            # contains concatenated QA pairs from previous rounds.
            concatenated_history = []
            concatenated_history.append(caption)
            for i in range(1, len(history)):
                concatenated_history.append([])
                for j in range(i + 1):
                    concatenated_history[i].extend(history[j])

            max_history_length = self.config["max_sequence_length"] * 2 * len(
                history)
            history = concatenated_history

        history_lengths = [len(round_history) for round_history in history]
        maxpadded_history = torch.full(
            (len(history), max_history_length),
            fill_value=self.vocabulary.PAD_INDEX,
        )
        padded_history = pad_sequence(
            [torch.tensor(round_history) for round_history in history],
            batch_first=True,
            padding_value=self.vocabulary.PAD_INDEX)
        maxpadded_history[:, :padded_history.size(1)] = padded_history
        return maxpadded_history, history_lengths

    def _pad_captions(self, sequences: List[List[int]]):
        """Truncate/pad the list of captions to exactly ``caption_round_num``
        captions of exactly ``caption_maxlen_each`` tokens each."""
        # Keep at most ``caption_round_num`` captions.
        num_captions = len(sequences)
        if num_captions > self.config["caption_round_num"]:
            for i in range(num_captions - self.config["caption_round_num"]):
                sequences.pop(-1)

        # Truncate or zero-pad each caption to ``caption_maxlen_each`` tokens,
        # recording the true (pre-padding) lengths.
        caption_len = []
        for i in range(len(sequences)):
            length = len(sequences[i])
            if length < self.config["caption_maxlen_each"]:
                caption_len.append(len(sequences[i]))
                for j in range(self.config["caption_maxlen_each"] - length):
                    sequences[i].append(0)
            elif length > self.config["caption_maxlen_each"]:
                for j in range(length - self.config["caption_maxlen_each"]):
                    sequences[i].pop(-1)
                caption_len.append(len(sequences[i]))
            else:
                caption_len.append(len(sequences[i]))

        # If there are too few captions, repeat existing ones cyclically.
        num_captions = len(sequences)
        if num_captions < self.config["caption_round_num"]:
            j = 0
            for i in range(self.config["caption_round_num"] - num_captions):
                if j >= num_captions - 1:
                    j = 0
                else:
                    j += 1
                sequences.append(sequences[j])
                caption_len.append(caption_len[j])

        sequences = torch.tensor(sequences).view(
            self.config["caption_round_num"],
            self.config["caption_maxlen_each"])
        return sequences, caption_len
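# --- Illustration (not part of the original code) ---------------------------
# What _pad_captions of the caption-reading variant above does, on made-up
# config values and token ids: every image ends up with exactly
# caption_round_num captions of exactly caption_maxlen_each tokens, plus the
# true (post-truncation, pre-padding) lengths. A minimal stand-in object is
# enough because the method only touches self.config.
from types import SimpleNamespace

stub = SimpleNamespace(config={"caption_round_num": 3, "caption_maxlen_each": 5})
padded, lengths = VisDialDataset._pad_captions(
    stub, [[4, 8, 15], [16, 23, 42, 7, 9, 11], [5]])

print(padded)
# tensor([[ 4,  8, 15,  0,  0],
#         [16, 23, 42,  7,  9],
#         [ 5,  0,  0,  0,  0]])
print(lengths)  # [3, 5, 1]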
class KFoldVisDialDataset(Dataset): """ A full representation of VisDial v1.0 (train/val/test) dataset. According to the appropriate split, it returns dictionary of question, image, history, ground truth answer, answer options, dense annotations etc. """ def __init__( self, config: Dict[str, Any], dialogs_jsonpath: str, dense_annotations_jsonpath: str, fold_split: Optional[str] = 'train', fold: int = -1, return_adjusted_gt_relevance: bool = False, overfit: bool = False, in_memory: bool = False, return_options: bool = True, add_boundary_toks: bool = False, ): super().__init__() self.config = config self.return_options = return_options self.return_adjusted_gt_relevance = return_adjusted_gt_relevance self.add_boundary_toks = add_boundary_toks self.dialogs_reader = DialogsReader(dialogs_jsonpath) self.annotations_reader = DenseAnnotationsReader( dense_annotations_jsonpath ) self.fold_split = fold_split self.fold = fold self.n_folds = config['n_folds'] if config['word_embedding_type'] == 'glove': self.vocabulary = GloveVocabulary( word_counts_path = config['word_counts_json'], min_count=config['vocab_min_count'], glove_weight_path=config['glove_weight_txt'], vec_size=config['glove_emb_dim'], glove_vec_num=config['glove_vec_num'] ) else: self.vocabulary = Vocabulary( word_counts_path=config["word_counts_json"], min_count=config["vocab_min_count"] ) # Initialize image features reader according to split. image_features_hdfpath = config["image_features_train_h5"] if ("val" in self.dialogs_reader.split and "fake" not in self.dialogs_reader.split): image_features_hdfpath = config["image_features_val_h5"] elif "test" in self.dialogs_reader.split: image_features_hdfpath = config["image_features_test_h5"] self.hdf_reader = ImageFeaturesHdfReader( image_features_hdfpath, in_memory ) # Keep a list of image_ids as primary keys to access data. all_image_ids = np.array(list(self.dialogs_reader.dialogs.keys())) if fold < 0 or fold_split == 'test': self.image_ids = all_image_ids.tolist() else: kf = KFold(n_splits=self.n_folds, shuffle=False, random_state=606) train_index, val_index = list(kf.split(all_image_ids))[fold] if fold_split == 'train': self.image_ids = all_image_ids[train_index].tolist() elif fold_split == 'val': self.image_ids = all_image_ids[val_index].tolist() else: raise NotImplementedError() if overfit: self.image_ids = self.image_ids[:5] @property def split(self): return self.dialogs_reader.split def __len__(self): return len(self.image_ids) def __getitem__(self, index): # Get image_id, which serves as a primary key for current instance. image_id = self.image_ids[index] # Get image features for this image_id using hdf reader. image_features = self.hdf_reader[image_id] image_features = torch.tensor(image_features) # Normalize image features at zero-th dimension (since there's no batch # dimension). if self.config["img_norm"]: image_features = normalize(image_features, dim=0, p=2) # Retrieve instance for this image_id using json reader. visdial_instance = self.dialogs_reader[image_id] caption = visdial_instance["caption"] dialog = visdial_instance["dialog"] # Convert word tokens of caption, question, answer and answer options # to integers. 
        caption = self.vocabulary.to_indices(caption)
        for i in range(len(dialog)):
            dialog[i]["question"] = self.vocabulary.to_indices(
                dialog[i]["question"]
            )
            if self.add_boundary_toks:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    [self.vocabulary.SOS_TOKEN]
                    + dialog[i]["answer"]
                    + [self.vocabulary.EOS_TOKEN]
                )
            else:
                dialog[i]["answer"] = self.vocabulary.to_indices(
                    dialog[i]["answer"]
                )

            if self.return_options:
                for j in range(len(dialog[i]["answer_options"])):
                    if self.add_boundary_toks:
                        dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                            [self.vocabulary.SOS_TOKEN]
                            + dialog[i]["answer_options"][j]
                            + [self.vocabulary.EOS_TOKEN]
                        )
                    else:
                        dialog[i]["answer_options"][j] = self.vocabulary.to_indices(
                            dialog[i]["answer_options"][j]
                        )

        questions, question_lengths = self._pad_sequences(
            [dialog_round["question"] for dialog_round in dialog]
        )
        history, history_lengths = self._get_history(
            caption,
            [dialog_round["question"] for dialog_round in dialog],
            [dialog_round["answer"] for dialog_round in dialog],
        )
        answers_in, answer_lengths = self._pad_sequences(
            [dialog_round["answer"][:-1] for dialog_round in dialog]
        )
        answers_out, _ = self._pad_sequences(
            [dialog_round["answer"][1:] for dialog_round in dialog]
        )

        # Collect everything as tensors for ``collate_fn`` of dataloader to
        # work seamlessly; questions, history, etc. are converted to
        # LongTensors for nn.Embedding input.
        item = {}
        item["img_ids"] = torch.tensor(image_id).long()
        item["img_feat"] = image_features
        item["ques"] = questions.long()
        item["hist"] = history.long()
        item["ans_in"] = answers_in.long()
        item["ans_out"] = answers_out.long()
        item["ques_len"] = torch.tensor(question_lengths).long()
        item["hist_len"] = torch.tensor(history_lengths).long()
        item["ans_len"] = torch.tensor(answer_lengths).long()
        item["num_rounds"] = torch.tensor(
            visdial_instance["num_rounds"]
        ).long()

        if self.return_options:
            if self.add_boundary_toks:
                answer_options_in, answer_options_out = [], []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        [
                            option[:-1]
                            for option in dialog_round["answer_options"]
                        ]
                    )
                    answer_options_in.append(options)

                    options, _ = self._pad_sequences(
                        [
                            option[1:]
                            for option in dialog_round["answer_options"]
                        ]
                    )
                    answer_options_out.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options_in = torch.stack(answer_options_in, 0)
                answer_options_out = torch.stack(answer_options_out, 0)

                item["opt_in"] = answer_options_in.long()
                item["opt_out"] = answer_options_out.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()
            else:
                answer_options = []
                answer_option_lengths = []
                for dialog_round in dialog:
                    options, option_lengths = self._pad_sequences(
                        dialog_round["answer_options"]
                    )
                    answer_options.append(options)
                    answer_option_lengths.append(option_lengths)
                answer_options = torch.stack(answer_options, 0)

                item["opt"] = answer_options.long()
                item["opt_len"] = torch.tensor(answer_option_lengths).long()

            if "test" not in self.split:
                answer_indices = [
                    dialog_round["gt_index"] for dialog_round in dialog
                ]
                item["ans_ind"] = torch.tensor(answer_indices).long()

        # Gather dense annotations.
if "val" in self.split or "dense" in self.split: dense_annotations = self.annotations_reader[image_id] item["gt_relevance"] = torch.tensor( dense_annotations["gt_relevance"] ).float() item["round_id"] = torch.tensor( dense_annotations["round_id"] ).long() if self.return_adjusted_gt_relevance: item['adjusted_gt_relevance'] = torch.tensor( dense_annotations['adjusted_gt_relevance'] ).float() return item def _pad_sequences(self, sequences: List[List[int]]): """Given tokenized sequences (either questions, answers or answer options, tokenized in ``__getitem__``), padding them to maximum specified sequence length. Return as a tensor of size ``(*, max_sequence_length)``. This method is only called in ``__getitem__``, chunked out separately for readability. Parameters ---------- sequences : List[List[int]] List of tokenized sequences, each sequence is typically a List[int]. Returns ------- torch.Tensor, torch.Tensor Tensor of sequences padded to max length, and length of sequences before padding. """ for i in range(len(sequences)): sequences[i] = sequences[i][ : self.config["max_sequence_length"] - 1 ] sequence_lengths = [len(sequence) for sequence in sequences] # Pad all sequences to max_sequence_length. maxpadded_sequences = torch.full( (len(sequences), self.config["max_sequence_length"]), fill_value=self.vocabulary.PAD_INDEX, ) padded_sequences = pad_sequence( [torch.tensor(sequence) for sequence in sequences], batch_first=True, padding_value=self.vocabulary.PAD_INDEX, ) maxpadded_sequences[:, : padded_sequences.size(1)] = padded_sequences return maxpadded_sequences, sequence_lengths def _get_history( self, caption: List[int], questions: List[List[int]], answers: List[List[int]], ): # Allow double length of caption, equivalent to a concatenated QA pair. caption = caption[: self.config["max_sequence_length"] * 2 - 1] for i in range(len(questions)): questions[i] = questions[i][ : self.config["max_sequence_length"] - 1 ] for i in range(len(answers)): answers[i] = answers[i][: self.config["max_sequence_length"] - 1] # History for first round is caption, else concatenated QA pair of # previous round. history = [] history.append(caption) for question, answer in zip(questions, answers): history.append(question + answer + [self.vocabulary.EOS_INDEX]) # Drop last entry from history (there's no eleventh question). history = history[:-1] max_history_length = self.config["max_sequence_length"] * 2 if self.config.get("concat_history", False): # Concatenated_history has similar structure as history, except it # contains concatenated QA pairs from previous rounds. concatenated_history = [] concatenated_history.append(caption) for i in range(1, len(history)): concatenated_history.append([]) for j in range(i + 1): concatenated_history[i].extend(history[j]) max_history_length = ( self.config["max_sequence_length"] * 2 * len(history) ) history = concatenated_history history_lengths = [len(round_history) for round_history in history] maxpadded_history = torch.full( (len(history), max_history_length), fill_value=self.vocabulary.PAD_INDEX, ) padded_history = pad_sequence( [torch.tensor(round_history) for round_history in history], batch_first=True, padding_value=self.vocabulary.PAD_INDEX, ) maxpadded_history[:, : padded_history.size(1)] = padded_history return maxpadded_history, history_lengths