def get_datasets(args, device):
    lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    images_feature_reader = ImageFeaturesH5Reader(args.features_path, False)
    question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
    if args.min_data:
        vocab_path = os.path.join(args.data_path, 'cache/vocab_min.json')
        train_split = "mintrain"
        val_split = "mintrain" if device.type == "cpu" else "minval"
    else:
        vocab_path = os.path.join(args.data_path, 'cache/vocab.json')
        train_split = "mintrain" if device.type == "cpu" else "train"
        val_split = "mintrain" if device.type == "cpu" else "val"
    train_dataset = VQADataset(split=train_split, dataroot=args.data_path,
                               question_tokenizer=question_tokenizer,
                               image_features_reader=images_feature_reader,
                               reward_tokenizer=reward_tokenizer, clean_datasets=True,
                               max_seq_length=23, num_images=args.max_samples,
                               vocab_path=vocab_path, filter_entries=True, rl=False)
    val_dataset = VQADataset(split=val_split, dataroot=args.data_path,
                             question_tokenizer=question_tokenizer,
                             image_features_reader=images_feature_reader,
                             reward_tokenizer=reward_tokenizer, clean_datasets=True,
                             max_seq_length=23, num_images=args.max_samples,
                             vocab_path=vocab_path, filter_entries=True, rl=False)
    test_dataset = val_dataset
    return train_dataset, val_dataset, test_dataset
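# Hedged usage sketch: VQADataset is assumed to behave as a standard
# torch.utils.data.Dataset; the argument values below are placeholders.
from argparse import Namespace
from torch.utils.data import DataLoader

args = Namespace(data_path="data/vqa-v2",
                 features_path="data/vqa-v2/coco_trainval.lmdb",
                 min_data=1, max_samples=None)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset, val_dataset, test_dataset = get_datasets(args, device)
# Batching VQA entries may need a custom collate_fn; the default is used here for brevity.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)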
class VQAEnv(GenericEnv):
    """VQA Env"""
    metadata = {'render.modes': ['human']}

    def __init__(self, data_path, features_h5path, max_len=10, reward_type="levenshtein", debug=None,
                 reward_path=None, mode="train", diff_reward=False, condition_answer=True,
                 reward_vocab=None, mask_answers=False, max_seq_length=23, num_answers=1,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), min_data=0,
                 reduced_answers=False, answer_sampl="uniform", params=None, filter_numbers=False):
        super(VQAEnv, self).__init__(data_path, max_len, reward_type=reward_type, reward_path=reward_path,
                                     debug=debug, mode=mode, diff_reward=diff_reward,
                                     condition_answer=condition_answer, reward_vocab=reward_vocab,
                                     mask_answers=mask_answers, device=device,
                                     reduced_answers=reduced_answers, params=params,
                                     filter_numbers=filter_numbers)
        # Loading VQA Dataset.
        num_images = int(self.debug[1]) if self.debug is not None else self.debug
        if self.mode == "test_images":
            num_images = None
        lm_tokenizer = GPT2Tokenizer.from_pretrained("cache/gpt-2")
        question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
        reward_tokenizer = BertTokenizer.from_pretrained("cache/bert")
        images_feature_reader = ImageFeaturesH5Reader(features_h5path, False)
        modes = self.get_modes(device=device, min_data=min_data)
        self.min_data = min_data
        vocab_path = os.path.join(data_path, "cache", "vocab.json") if not min_data \
            else os.path.join(data_path, "cache", "vocab_min.json")
        num_answers_ = None if answer_sampl == "img_sampling" else num_answers
        self.dataset = VQADataset(split=modes[self.mode], dataroot=data_path,
                                  image_features_reader=images_feature_reader,
                                  question_tokenizer=question_tokenizer,
                                  reward_tokenizer=reward_tokenizer, clean_datasets=True,
                                  max_seq_length=max_seq_length, num_answers=num_answers_,
                                  num_images=num_images, filter_entries=True,
                                  vocab_path=vocab_path, filter_numbers=filter_numbers)
        duplicate_entries = answer_sampl == "img_sampling"
        self.dataset.split_entries(duplicate_entries)
        self.set_special_tokens()
        self.set_reward_function(reward_type=reward_type, reward_path=reward_path,
                                 reward_vocab=reward_vocab, diff_reward=diff_reward)
        self.answer_sampling = answer_sampl
        self.inv_freq_answers = self.dataset.get_answers_frequency()

    def update_mode(self, mode, answer_sampl):
        self.mode = mode
        self.answer_sampling = answer_sampl

    def get_modes(self, device, min_data):
        if min_data or device.type == "cpu":
            modes = {"train": "mintrain", "test_images": "minval", "test_text": "mintrain"}
        else:
            modes = {"train": "train", "test_images": "val", "test_text": "train"}
        return modes

    def get_env_idx(self, i_episode, entries):
        if i_episode is not None and i_episode < len(entries):
            env_idx = i_episode
        else:
            env_idx = np.random.randint(0, len(entries))
        return env_idx

    def decode_inv_frequency(self):
        inv_freq_answers_decoded = {self.dataset.answer_tokenizer.decode([k]): round(v, 4)
                                    for k, v in self.inv_freq_answers.items()}
        return inv_freq_answers_decoded

    def sample_answer_from_inv_freq_distrib(self):
        tensor_distrib = torch.tensor(list(self.inv_freq_answers.values()))
        ind_sampled = torch.multinomial(tensor_distrib, num_samples=1)
        prob_sampled = tensor_distrib[ind_sampled].item()
        possible_answers = [k for k, v in self.inv_freq_answers.items()
                            if round(v, 4) == round(prob_sampled, 4)]
        return random.choice(possible_answers)

    def sample_entry_from_answer(self, answer):
        entries_answer = {i: entry for i, entry in enumerate(self.dataset.filtered_entries)
                          if entry["answer"]["labels"] == answer}
        env_idx = random.choice(list(entries_answer.keys()))
        entry = entries_answer[env_idx]
        return entry, env_idx

    def sample_entry(self, entries, i_episode=None):
        if self.answer_sampling == "random":
            env_idx = self.get_env_idx(i_episode, entries)
            entry = entries[env_idx]
        elif self.answer_sampling == "uniform":
            answer = random.choice(self.dataset.reduced_answers.cpu().numpy())
            entry, env_idx = self.sample_entry_from_answer(answer)
        elif self.answer_sampling == "inv_frequency":
            answer = self.sample_answer_from_inv_freq_distrib()
            entry, env_idx = self.sample_entry_from_answer(answer)
        elif self.answer_sampling == "img_sampling":
            img_idx = random.choice(self.dataset.images_idx)
            answer = random.choice(self.dataset.answer_img_map[img_idx])
            entries = [(idx, ent) for idx, ent in enumerate(self.dataset.filtered_entries)
                       if ent["image_id"] == img_idx and answer in ent["answer"]["labels"]]
            env_idx, entry = random.choice(entries)
        return entry, env_idx

    def reset(self, seed=None, i_episode=None):
        if seed is not None:
            np.random.seed(seed)
        entries = self.dataset.test_entries if self.mode == "test_text" else self.dataset.filtered_entries
        self.entry, self.env_idx = self.sample_entry(entries, i_episode)
        (features, image_mask, spatials) = self.dataset.get_img_data(self.entry)
        labels, _ = self.dataset.get_answer_data(self.entry)
        self.ref_question_idx = self.entry["question_id"]
        self.ref_question = self.entry["q_token"][:self.max_len]
        self.ref_questions = self.ref_question.view(1, -1)
        self.ref_question_decoded = self.dataset.question_tokenizer.decode(
            self.entry["q_token"][:self.max_len].numpy())
        self.ref_questions_decoded = [self.ref_question_decoded]
        self.ref_answer = labels
        self.img_idx = self.entry["image_id"]
        self.img_feats = features
        self.img = (features, image_mask, spatials)
        # initializing the state.
        state_question = [self.special_tokens.SOS_idx]
        self.state = self.State(torch.LongTensor(state_question).view(1, len(state_question)),
                                self.img_feats.unsqueeze(0), self.ref_answer)
        self.step_idx = 0
        self.dialog = None
        return self.state
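# Hedged usage sketch of VQAEnv: the data paths are placeholders and only
# reset() plus the (text, img, answer) State fields appear in this excerpt
# (step() lives in GenericEnv and is not reproduced here).
env = VQAEnv(data_path="data/vqa-v2",
             features_h5path="data/vqa-v2/coco_trainval.lmdb",
             max_len=10, answer_sampl="uniform")
state = env.reset(seed=0)
print(state.text.shape)          # (1, 1): the lone SOS token after reset
print(env.ref_question_decoded)  # decoded reference question for the episode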
    help='Number of answers to keep')
parser.add_argument("-ans_preprocess", type=lambda x: bool(strtobool(x)), default="False",
                    help='preprocess answers (higher accuracy but slow start)')
parser.add_argument("-merge_val", type=lambda x: bool(strtobool(x)), default="False",
                    help='Fuse train/val dataset')
args = parser.parse_args()

print("Loading dataset...")
train_dataset = VQADataset(args.data_dir, args.year, "train", preprocess_answers=args.ans_preprocess)
answer_counters = train_dataset.answer_counter.most_common()
games = train_dataset.games
if args.merge_val:
    valid_dataset = VQADataset(args.data_dir, args.year, "val", preprocess_answers=args.ans_preprocess)
    answer_counters += valid_dataset.answer_counter.most_common()
    games += valid_dataset.games

word2i = {'<unk>': 0, '<start>': 1, '<stop>': 2, '<padding>': 3}
answer2i = {'<unk>': 0}
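# Hedged continuation: answer2i would typically keep the most frequent
# answers from the counters built above. "args.num_answers" is a
# hypothetical name for the flag whose help string opens this excerpt
# ('Number of answers to keep').
for answer, _count in answer_counters[:args.num_answers]:
    if answer not in answer2i:
        answer2i[answer] = len(answer2i)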
    default='../../data/vqa-v2/coco_trainval.lmdb')
args = parser.parse_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
images_feature_reader = ImageFeaturesH5Reader(args.features_path, False)
question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
train_dataset = VQADataset(split="mintrain", dataroot=args.data_path,
                           question_tokenizer=question_tokenizer,
                           image_features_reader=images_feature_reader,
                           reward_tokenizer=reward_tokenizer, clean_datasets=True,
                           max_seq_length=23, num_images=None,
                           vocab_path=os.path.join(args.data_path, 'cache/vocab.json'),
                           filter_entries=True, rl=False)

sf_ids = [0, 1, 2, 3, 4, 5, 7]  # sf_id 0 corresponds to no smoothing.
print("------------------------------------------------------------------------------------------------")
print("simple test")
for sf_id in sf_ids:
    bleu_fn = Bleu(sf_id=sf_id)
    print("<-------------------------------------------------->")
    print("BLEU scores for smoothing function: {}".format(sf_id))
    simple_test(bleu_fn)
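# Hedged aside: the Bleu class is not defined in this excerpt. If it wraps
# NLTK's SmoothingFunction (an assumption), the sweep above exercises
# methods method0..method7, e.g.:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = ["is", "there", "a", "red", "cube", "?"]
hypothesis = ["is", "there", "a", "blue", "cube", "?"]
smoother = SmoothingFunction()
for sf_id in [0, 1, 2, 3, 4, 5, 7]:
    smoothing_fn = getattr(smoother, "method{}".format(sf_id))
    score = sentence_bleu([reference], hypothesis, smoothing_function=smoothing_fn)
    print(sf_id, round(score, 4))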
State = namedtuple('State', ('text', 'img', "answer"))

features_h5path = "../../../data/vqa-v2/coco_trainval.lmdb"
data_path = "../../../data/vqa-v2"
lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)
images_feature_reader = ImageFeaturesH5Reader(features_h5path, False)
vocab_path = os.path.join(data_path, "cache", "vocab_min.json")
dataset = VQADataset(split="mintrain", dataroot=data_path,
                     image_features_reader=images_feature_reader,
                     question_tokenizer=question_tokenizer, reward_tokenizer=reward_tokenizer,
                     clean_datasets=True, max_seq_length=23, min_len_questions=0,
                     num_answers=1, num_images=None, filter_entries=True, vocab_path=vocab_path)
language_score = LanguageScore(dataset=dataset)


def get_state_actions(test_sentence):
    # Encode the sentence, prepend the SOS token (id 1) and drop the final
    # token; the actions are the state sequence shifted left by one.
    state_encoded = dataset.question_tokenizer.encode(test_sentence)
    state_encoded = [1] + state_encoded
    state_encoded = state_encoded[:-1]
    sequence_actions = state_encoded[1:]
    return state_encoded, sequence_actions
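# Hedged demo of the helper above; the sentence is arbitrary and the token
# ids depend on the vocab file loaded into the dataset.
states, actions = get_state_actions("is there a red cube ?")
print(states)   # [1, t0, t1, ...]: SOS-prefixed ids with the last token dropped
print(actions)  # the same sequence shifted left by one (next-token targets)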
parser.add_argument("-test", type=int, default=1) args = parser.parse_args() lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased') features_h5path = args.features_path images_feature_reader = ImageFeaturesH5Reader(features_h5path, False) question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer) if args.vocab_path == "none": split = "trainval" if not args.min_split else "mintrainval" vqa_dataset = VQADataset(split=split, dataroot=args.data_path, vocab_path=args.vocab_path, question_tokenizer=question_tokenizer, image_features_reader=images_feature_reader, reward_tokenizer=reward_tokenizer, clean_datasets=True, max_seq_length=23, num_images=None) else: split = args.split if not args.min_split else "min" + args.split print("Building {} dataset from {}...".format(split, args.vocab_path)) vqa_dataset = VQADataset(split=split, dataroot=args.data_path, question_tokenizer=question_tokenizer, image_features_reader=images_feature_reader, reward_tokenizer=reward_tokenizer, clean_datasets=True, max_seq_length=23,
def get_datasets(args, device):
    if args.dataset == "clevr":
        if args.dataset_ext == 0:
            train_questions_path = os.path.join(args.data_path, "train_questions.h5")
            val_questions_path = os.path.join(args.data_path, "val_questions.h5")
            test_questions_path = os.path.join(args.data_path, "test_questions.h5")
            train_feats_path = os.path.join(args.data_path, 'train_features.h5')
            val_feats_path = os.path.join(args.data_path, 'val_features.h5')
            vocab_path = os.path.join(args.data_path, "vocab.json")
            if args.task == "lm":
                train_dataset = QuestionsDataset(h5_questions_path=train_questions_path,
                                                 vocab_path=vocab_path,
                                                 range_samples=args.range_samples)
                val_dataset = QuestionsDataset(h5_questions_path=val_questions_path,
                                               vocab_path=vocab_path)
                test_dataset = QuestionsDataset(h5_questions_path=test_questions_path,
                                                vocab_path=vocab_path)
            elif args.task == "policy":
                train_dataset = CLEVR_Dataset(h5_questions_path=train_questions_path,
                                              h5_feats_path=train_feats_path,
                                              vocab_path=vocab_path, max_samples=args.max_samples)
                val_dataset = CLEVR_Dataset(h5_questions_path=val_questions_path,
                                            h5_feats_path=val_feats_path,
                                            vocab_path=vocab_path, max_samples=args.max_samples)
                test_dataset = val_dataset
        else:
            vocab_path = os.path.join(args.data_path, "vocab.json")
            data_path = os.path.join(args.data_path, "clevr_ext")
            full_dataset = QuestionsDataset(h5_questions_path=data_path, vocab_path=vocab_path,
                                            range_samples=args.range_samples, dataset_ext=1)
            train_size = int(0.9 * len(full_dataset))
            test_size = len(full_dataset) - train_size
            train_dataset, test_dataset = torch.utils.data.random_split(full_dataset,
                                                                        [train_size, test_size])
            train_dataset = copy_attributes(train_dataset, train_dataset.dataset)
            test_dataset = copy_attributes(test_dataset, test_dataset.dataset)
            val_dataset = copy.deepcopy(test_dataset)
    elif args.dataset == "vqa":
        lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        reward_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        images_feature_reader = ImageFeaturesH5Reader(args.features_path, False)
        question_tokenizer = VQATokenizer(lm_tokenizer=lm_tokenizer)
        if args.min_data:
            vocab_path = os.path.join(args.data_path, 'cache/vocab_min.json')
            train_split = "mintrain"
            val_split = "mintrain" if device.type == "cpu" else "minval"
        else:
            vocab_path = os.path.join(args.data_path, 'cache/vocab.json')
            train_split = "mintrain" if device.type == "cpu" else "train"
            val_split = "mintrain" if device.type == "cpu" else "val"
        train_dataset = VQADataset(split=train_split, dataroot=args.data_path,
                                   question_tokenizer=question_tokenizer,
                                   image_features_reader=images_feature_reader,
                                   reward_tokenizer=reward_tokenizer, clean_datasets=True,
                                   max_seq_length=23, num_images=None, vocab_path=vocab_path,
                                   filter_entries=True, rl=False)
        val_dataset = VQADataset(split=val_split, dataroot=args.data_path,
                                 question_tokenizer=question_tokenizer,
                                 image_features_reader=images_feature_reader,
                                 reward_tokenizer=reward_tokenizer, clean_datasets=True,
                                 max_seq_length=23, num_images=None, vocab_path=vocab_path,
                                 filter_entries=True, rl=False,
                                 filter_numbers=args.filter_numbers)
        test_dataset = val_dataset
    return train_dataset, val_dataset, test_dataset
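# Hedged sketch of the copy_attributes helper referenced above (its
# definition is not part of this excerpt): torch's random_split returns
# Subset objects that hide the wrapped dataset's attributes (vocab,
# tokenizer, ...), so a helper like this would forward them.
def copy_attributes(subset, dataset):
    for name, value in vars(dataset).items():
        if not hasattr(subset, name):
            setattr(subset, name, value)
    return subset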
type=str, default="glove.42B.300d.zip", help="Name of the stanford glove file") parser.add_argument("-glove_out", type=str, default="glove_dict.pkl", help="Name of the output glove file") parser.add_argument("-year", type=int, default=2014, help="VQA dataset year (2014/2017)") args = parser.parse_args() print("Loading dataset...") trainset = VQADataset(args.data_dir, year=args.year, which_set="train") validset = VQADataset(args.data_dir, year=args.year, which_set="val") testdevset = VQADataset(args.data_dir, year=args.year, which_set="test-dev") testset = VQADataset(args.data_dir, year=args.year, which_set="test") tokenizer = TweetTokenizer(preserve_case=False) print("Loading glove...") with io.open(args.glove_in, 'r', encoding="utf-8") as f: vectors = {} for line in f: vals = line.rstrip().split(' ') vectors[vals[0]] = [float(x) for x in vals[1:]]