def pre_process_dataset(image_dir, qjson, ajson, img_prefix):
    """Build question/answer vocabularies from a VQA split."""
    print('Preprocessing dataset.\n')
    vqa = VQA(ajson, qjson)

    # Recover image ids from the COCO file names, e.g. COCO_train2014_000000123456.jpg -> 123456
    img_names = [f for f in os.listdir(image_dir) if '.jpg' in f]
    img_ids = []
    for fname in img_names:
        img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
        img_ids.append(int(img_id))
    ques_ids = vqa.getQuesIds(img_ids)

    # Question vocabulary: a defaultdict that assigns the next free index to each unseen word.
    q2i = defaultdict(lambda: len(q2i))
    pad = q2i["<pad>"]
    start = q2i["<sos>"]
    end = q2i["<eos>"]
    UNK = q2i["<unk>"]

    a2i_count = {}
    for ques_id in ques_ids:
        qa = vqa.loadQA(ques_id)[0]
        qqa = vqa.loadQQA(ques_id)[0]

        ques = qqa['question'][:-1]  # drop the trailing '?'
        [q2i[x] for x in ques.lower().strip().split(" ")]  # register question words

        # Count answers, keeping only confident ones.
        answers = qa['answers']
        for ans in answers:
            if not ans['answer_confidence'] == 'yes':
                continue
            ans = ans['answer'].lower()
            if ans not in a2i_count:
                a2i_count[ans] = 1
            else:
                a2i_count[ans] = a2i_count[ans] + 1

    # Keep the 1000 most frequent answers as the answer vocabulary.
    a_sort = sorted(a2i_count.items(), key=operator.itemgetter(1), reverse=True)

    i2a = {}
    count = 0
    a2i = defaultdict(lambda: len(a2i))
    for word, _ in a_sort:
        a2i[word]
        i2a[a2i[word]] = word
        count = count + 1
        if count == 1000:
            break

    return q2i, a2i, i2a, a2i_count
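# Usage sketch for the vocabularies returned above. `encode_question` is a hypothetical
# helper (not part of the original code); it freezes the defaultdict so unseen words map
# to <unk> instead of silently growing the vocabulary.
def encode_question(question, q2i):
    frozen = dict(q2i)              # plain dict: lookups no longer add new entries
    unk = frozen["<unk>"]
    tokens = question.lower().strip().rstrip('?').split(" ")
    return [frozen.get(tok, unk) for tok in tokens]

# q2i, a2i, i2a, a2i_count = pre_process_dataset(image_dir, qjson, ajson, "COCO_train2014_")
# print(encode_question("What color is the cat?", q2i))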
from external.vqa.vqa import VQA

image_dir = "/projectnb/statnlp/shawnlin/dataset/mscoco_vqa_2014/train2014"
img_prefix = "COCO_train2014_"
qjson = "/projectnb/statnlp/shawnlin/dataset/mscoco_vqa_2014/Questions_Train_mscoco/OpenEnded_mscoco_train2014_questions.json"
ajson = "/projectnb/statnlp/shawnlin/dataset/mscoco_vqa_2014/Annotations_Train_mscoco/mscoco_train2014_annotations.json"

vqa = VQA(ajson, qjson)

img_names = [f for f in os.listdir(image_dir) if '.jpg' in f]
img_ids = []
for fname in img_names:
    img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
    img_ids.append(int(img_id))
ques_ids = vqa.getQuesIds(img_ids)

q2i = defaultdict(lambda: len(q2i))
pad = q2i["<pad>"]
start = q2i["<sos>"]
end = q2i["<eos>"]
UNK = q2i["<unk>"]

a2i_count = {}
for ques_id in ques_ids:
    qa = vqa.loadQA(ques_id)[0]
    qqa = vqa.loadQQA(ques_id)[0]

    ques = qqa['question'][:-1]
    [q2i[x] for x in ques.lower().strip().split(" ")]
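# Optional persistence sketch so later runs (or the val split) can reuse the vocabulary.
# Assumption: a plain dict is acceptable downstream; the lambda-backed defaultdict above
# cannot be pickled directly, so it is frozen into a dict first.
import pickle

with open('ques_dictionary.pkl', 'wb') as f:
    pickle.dump(dict(q2i), f)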
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the
    External folder, but you may want to reference the full repo
    (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, is_training=True, transform=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                mapping images, questions, and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset
                use (eg "COCO_train2014_{}.jpg")
        """
        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.ques_idx_list = self.vqa.getQuesIds()
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern

        # Question vocabulary: built once from the training split, then cached to disk.
        if os.path.exists('ques_dictionary.pkl'):
            with open('ques_dictionary.pkl', 'rb') as f:
                self.dictionary = pickle.load(f)
        else:
            if is_training:
                self.dictionary = _build_question_dictionary(self.vqa)
                with open('ques_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.dictionary, f)
            else:
                raise RuntimeError("No dictionary built from training dataset!")

        # Answer vocabulary, cached the same way.
        if os.path.exists('ans_dictionary.pkl'):
            with open('ans_dictionary.pkl', 'rb') as f:
                self.answers = pickle.load(f)
        else:
            if is_training:
                self.answers = _build_answer_dictionary(self.vqa)
                with open('ans_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.answers, f)
            else:
                raise RuntimeError("No answer list built from training dataset!")

        self.image_transform = transform

    def __len__(self):
        return len(self.ques_idx_list)

    def __getitem__(self, idx):
        ques_idx = self.ques_idx_list[idx]
        ann = self.vqa.loadQA(ques_idx)[0]

        image_id = ann['image_id']
        image_name = self.image_filename_pattern.format(str(image_id).zfill(12))
        image_path = os.path.join(self.image_dir, image_name)

        if os.path.splitext(image_path)[1] == '.npy':
            image = np.load(image_path).T
        else:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transform(image)

        question = self.vqa.qqa[ques_idx]['question']
        answers = ann['answers']
        best_answer = _get_majority_ans(answers)

        return {
            'image': image,
            'image_path': image_name,
            'question': question,
            'answer': best_answer,
            'question_encoding': _encode_question(question, self.dictionary),
            'answer_encoding': _encode_answer(best_answer, self.answers),
        }
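# Usage sketch for the class above. Assumptions: the _build_* / _encode_* /
# _get_majority_ans helpers referenced in the class are defined elsewhere in this repo,
# and the paths below are placeholders.
from torch.utils.data import DataLoader
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

train_dataset = VqaDataset(
    image_dir="data/train2014",
    question_json_file_path="data/OpenEnded_mscoco_train2014_questions.json",
    annotation_json_file_path="data/mscoco_train2014_annotations.json",
    image_filename_pattern="COCO_train2014_{}.jpg",
    is_training=True,
    transform=transform,
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)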
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the
    External folder, but you may want to reference the full repo
    (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, existing_format=None, ques_thres=12, ans_thres=6,
                 seq_len=50, prepro=False, prepro_path=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                mapping images, questions, and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset
                use (eg "COCO_train2014_{}.jpg")
        """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern
        self.prepro = prepro
        print("Allow preprocessing: ", self.prepro)

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.quesWords = self.getSplitQues()
        self.ansWords = self.getSplitAns()
        self.ques_thres = ques_thres
        self.ans_thres = ans_thres
        self.id_images = {}

        if existing_format is None:
            # Build bag-of-words vocabularies from this split.
            self.quesWordToIdx, self.quesVecSize = self.BuildBoW(self.quesWords, self.ques_thres)
            self.ansWordToIdx, self.ansVecSize = self.BuildBoW(self.ansWords, self.ans_thres)
            self.seq_len = seq_len
        else:
            # Reuse the vocabularies built by another dataset (e.g. the training split).
            self.quesWordToIdx = existing_format.quesWordToIdx
            self.ansWordToIdx = existing_format.ansWordToIdx
            self.quesVecSize = existing_format.quesVecSize
            self.ansVecSize = existing_format.ansVecSize
            self.seq_len = existing_format.seq_len

        if self.prepro:
            self.prepro_path = prepro_path
            # Alternative: load all features from a single h5 file up front.
            # features_extracted = h5py.File(prepro_path, 'r')
            # self.features_h5 = features_extracted["features"][:]
            # self.ids_h5 = features_extracted["ids"][:]

        print("The length of question vector: ", self.quesVecSize)

    def getImgSize(self):
        return len(self.vqa.getImgIds())

    def imgIdToPath(self, idx):
        # Fill the "{}" slot of the filename pattern with the zero-padded image id.
        str_bracket = "{}"
        start_idx = self.image_filename_pattern.find(str_bracket)
        path = (self.image_filename_pattern[0:start_idx] + str(idx).zfill(12) +
                self.image_filename_pattern[start_idx + 2:])
        path = self.image_dir + "/" + path
        return path

    def getSplitQues(self):
        ques_words = []
        for i in range(0, len(self.queIds)):
            question = self.vqa.qqa[self.queIds[i]]['question']
            question = question[0:-1]  # drop the trailing '?'
            question = question.lower()
            ques_words += question.split()
        return ques_words

    def getSplitAns(self):
        ans_words = []
        for i in range(0, len(self.queIds)):
            anss = self.vqa.qa[self.queIds[i]]['answers']
            for ans in anss:
                ans_str = ans["answer"]
                ans_words.append(ans_str)
        return ans_words

    def BoWPool(self, data):
        # Unthresholded vocabulary pool (kept for reference; not used by __getitem__).
        vocab_set = {}
        vocab_set['NA'] = 0
        for vocab in data:
            if vocab not in vocab_set:
                idx = len(vocab_set)
                vocab_set[vocab] = idx
        return vocab_set

    def BuildBoW(self, data, thres):
        # Count occurrences, then keep only tokens seen more than `thres` times;
        # everything else collapses onto the 'NA' index 0.
        vocab_set = {}
        for vocab in data:
            if vocab not in vocab_set:
                vocab_set[vocab] = 1
            else:
                vocab_set[vocab] += 1

        vocab_map = {}
        vocab_map['NA'] = 0
        idx = 1
        for vocab in data:
            if vocab not in vocab_map:
                if vocab_set[vocab] > thres:
                    vocab_map[vocab] = idx
                    idx += 1
                else:
                    vocab_map[vocab] = vocab_map['NA']
        return vocab_map, idx

    def BoWVoting(self, sentences, table):
        # Majority vote over the annotator answers; unknown answers map to index 0.
        count_set = {}
        for word in sentences:
            if word['answer'] not in count_set:
                count_set[word['answer']] = 1
            else:
                count_set[word['answer']] += 1
        sorted_dict = sorted(count_set.items(), key=lambda kv: kv[1])
        res_word = sorted_dict[-1][0]
        best_ind = 0
        if res_word in table:
            best_ind = table[res_word]
        return np.array(best_ind)

    def BoWVector(self, sentence, table):
        bow_vec = np.zeros(self.quesVecSize)
        for i in range(self.seq_len):
            if i < len(sentence):
                if sentence[i] in table:
                    bow_vec[table[sentence[i]]] = 1
                else:
                    bow_vec[table['NA']] = 1
        return bow_vec

    def BoWVectorGeneric(self, sentence, table):
        # One-hot encoding per time step: shape (seq_len, quesVecSize).
        bow_vec = np.zeros([self.seq_len, self.quesVecSize])
        for i in range(self.seq_len):
            if i < len(sentence):
                if sentence[i] in table:
                    bow_vec[i, table[sentence[i]]] = 1
                else:
                    bow_vec[i, table['NA']] = 1
            else:
                break
        return bow_vec

    def saveFeatures(self, feat, id):
        self.id_images[id] = feat

    def __len__(self):
        return len(self.queIds)

    def __getitem__(self, idx):
        if idx >= len(self.vqa.qa):
            print("Error: access overflow")
            return None
        idx_qa = self.queIds[idx]
        qa = self.vqa.loadQA(idx_qa)[0]

        data = {}
        tmp_question = self.vqa.qqa[idx_qa]['question']
        tmp_question = tmp_question.lower()[:-1]
        data['questions'] = torch.from_numpy(
            self.BoWVectorGeneric(tmp_question.split(), self.quesWordToIdx))

        tmp_answers = qa['answers']
        data['gt_answer'] = torch.from_numpy(self.BoWVoting(tmp_answers, self.ansWordToIdx))
        data['images_id'] = qa['image_id']

        if self.prepro:
            # Alternative: all features stored in one h5 file.
            # h5_idxs = self.ids_h5
            # query_idx = np.where(h5_idxs == qa['image_id'])
            # tmp_features = self.features_h5[query_idx[0][0]].astype(np.float32)
            # data['images'] = torch.from_numpy(tmp_features)
            # Below: one feature file per image, with the image id filling the "{}" slot.
            img_idx = qa['image_id']
            str_bracket = "{}"
            start_idx = self.prepro_path.find(str_bracket)
            path = (self.prepro_path[0:start_idx] + str(img_idx) +
                    self.prepro_path[start_idx + 2:])
            with h5py.File(path, 'r') as features_extracted:
                feature = features_extracted["features"][:]
            data['images'] = torch.from_numpy(feature.astype(np.float32))
        else:
            tmp_img = Image.open(self.imgIdToPath(qa['image_id']))
            tmp_img = tmp_img.convert('RGB')
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
            trans = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                normalize,
            ])
            data['images'] = trans(tmp_img)
        return data
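# Usage sketch for the existing_format hook above: build the vocabularies once on the
# training split and hand them to the validation split. Paths are placeholders;
# prepro_path would point at per-image HDF5 feature files written by a separate
# extraction step (see the ImgDataset at the end of this section).
train_ds = VqaDataset(
    image_dir="data/train2014",
    question_json_file_path="data/OpenEnded_mscoco_train2014_questions.json",
    annotation_json_file_path="data/mscoco_train2014_annotations.json",
    image_filename_pattern="COCO_train2014_{}.jpg",
)
val_ds = VqaDataset(
    image_dir="data/val2014",
    question_json_file_path="data/OpenEnded_mscoco_val2014_questions.json",
    annotation_json_file_path="data/mscoco_val2014_annotations.json",
    image_filename_pattern="COCO_val2014_{}.jpg",
    existing_format=train_ds,  # reuse the training vocabularies and seq_len
)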
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the
    External folder, but you may want to reference the full repo
    (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, collate=False, q2i=None, a2i=None, i2a=None,
                 a2i_count=None, img_names=None, img_ids=None, ques_ids=None,
                 method='simple', dataset_type='train', enc_dir=''):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                mapping images, questions, and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset
                use (eg "COCO_train2014_{}.jpg")
        """
        print(method)
        self.image_dir = image_dir
        self.qjson = question_json_file_path
        self.ajson = annotation_json_file_path
        img_prefix = image_filename_pattern.split('{}')[0]
        self.collate = collate
        self.q2i = q2i
        self.a2i = a2i
        self.i2a = i2a
        self.a2i_count = a2i_count
        self.img_ids = img_ids
        self.ques_ids = ques_ids
        self.img_names = img_names
        self.method = method
        self.vqa = VQA(self.ajson, self.qjson)

        if self.method == 'simple':
            self.transform = transforms.Compose(
                [transforms.Resize((224, 224)), transforms.ToTensor()])
        else:
            self.transform = transforms.Compose(
                [transforms.Resize((448, 448)), transforms.ToTensor()])

        if not collate:
            # Build the image index and vocabularies directly from this split.
            self.img_names = [f for f in os.listdir(self.image_dir) if '.jpg' in f]
            self.img_ids = []
            for fname in self.img_names:
                img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
                self.img_ids.append(int(img_id))
            self.ques_ids = self.vqa.getQuesIds(self.img_ids)
            self.q2i, self.a2i, self.i2a, self.a2i_count = pre_process_dataset(
                image_dir, self.qjson, self.ajson, img_prefix)

        self.q2i_len = len(self.q2i)
        self.a2i_len = len(self.a2i.keys())
        self.q2i_keys = self.q2i.keys()
        self.enc_dir = enc_dir

        if collate and dataset_type == 'train':
            with open('/home/ubuntu/hw3_release/data/train_enc_idx.npy', 'rb') as f:
                self.enc_idx = pickle.load(f)
        elif collate and dataset_type == 'val':
            with open('/home/ubuntu/hw3_release/data/val_enc_idx.npy', 'rb') as f:
                self.enc_idx = pickle.load(f)

    def __len__(self):
        return len(self.ques_ids)

    def __getitem__(self, idx):
        ques_id = self.ques_ids[idx]
        img_id = self.vqa.getImgIds([ques_id])[0]
        qa = self.vqa.loadQA(ques_id)[0]
        qqa = self.vqa.loadQQA(ques_id)[0]
        img_name = self.img_names[self.img_ids.index(img_id)]

        if self.method == 'simple':
            img = default_loader(self.image_dir + '/' + img_name)
            # imgT = self.transform(img).permute(1, 2, 0)
            imgT = self.transform(img).float()
        else:
            # Pre-encoded features could be loaded here instead:
            # file_idx = self.enc_idx[img_id] // 50
            # arr_idx = self.enc_idx[img_id] % 50
            # path = self.enc_dir + '/' + str(file_idx) + '.npz'
            # img = np.load(path)['out'][arr_idx, :, :]  # 512 x 196
            # imgT = torch.from_numpy(img).float()
            img = default_loader(self.image_dir + '/' + img_name)
            imgT = self.transform(img).float()

        # Encode the question as <sos> + known word indices + <eos>.
        ques = qqa['question'][:-1]
        quesI = [self.q2i["<sos>"]] + [
            self.q2i[x.lower()] for x in ques.split(" ") if x.lower() in self.q2i_keys
        ] + [self.q2i["<eos>"]]
        if not self.collate:
            quesI = quesI + [self.q2i["<pad>"]] * (8 - len(quesI))
        if self.method == 'simple':
            # Bag-of-words vector over the question vocabulary.
            quesT = torch.zeros(self.q2i_len).float()
            for word_idx in quesI:
                quesT[word_idx] = 1
        else:
            quesT = torch.from_numpy(np.array(quesI)).long()

        # Pick the most frequent in-vocabulary answer as the target.
        answers = qa['answers']
        max_count = 0
        answer = ""
        for ans in answers:
            # if not ans['answer_confidence'] == 'yes':
            #     continue
            ans = ans['answer'].lower()
            if ans in self.a2i.keys() and self.a2i_count[ans] > max_count:
                max_count = self.a2i_count[ans]
                answer = ans

        if answer == "":  # only for validation
            gT = torch.from_numpy(np.array([self.a2i_len])).long()
        else:
            gT = torch.from_numpy(np.array([self.a2i[answer]])).long()

        if not self.collate:
            return {'img': imgT, 'ques': quesT, 'gt': gT}
        return imgT, quesT, gT
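# A possible collate_fn for the collate=True path when method != 'simple', where each
# question is a variable-length LongTensor of word indices. A sketch only, assuming the
# <pad> index is 0 as produced by pre_process_dataset above.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def vqa_collate(batch, pad_idx=0):
    imgs, quess, gts = zip(*batch)
    imgs = torch.stack(imgs, dim=0)
    ques_lens = torch.tensor([len(q) for q in quess])
    quess = pad_sequence(quess, batch_first=True, padding_value=pad_idx)
    gts = torch.stack(gts, dim=0).squeeze(1)
    return imgs, quess, ques_lens, gts

# loader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=vqa_collate)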
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the
    External folder, but you may want to reference the full repo
    (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, transform=None, question_word_to_id_map=None,
                 answer_to_id_map=None, question_word_list_length=5746,
                 answer_list_length=5216, pre_encoder=None, cache_location=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                mapping images, questions, and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset
                use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path,
                        question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        self._transform = transform
        self._max_question_length = 26
        self.ques_ids = self._vqa.getQuesIds()

        # Publicly accessible dataset parameters; the extra slot is the "unknown" index.
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length

        self._pre_encoder = pre_encoder
        self._cache_location = cache_location
        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # Create the question map if necessary
        if question_word_to_id_map is None:
            questions_list = []
            questions = self._vqa.questions['questions']
            for question in questions:
                questions_list.append(question['question'])
            word_list = self._create_word_list(questions_list)
            self.question_word_to_id_map = self._create_id_map(
                word_list, self.question_word_list_length)
        else:
            self.question_word_to_id_map = question_word_to_id_map

        # Create the answer map if necessary
        if answer_to_id_map is None:
            answer_list = []
            answers = self._vqa.dataset['annotations']
            for answer in answers:
                all_answers = answer['answers']
                for each_answer in all_answers:
                    answer_list.append(each_answer['answer'])
            self.answer_to_id_map = self._create_id_map(
                answer_list, self.answer_list_length)
        else:
            self.answer_to_id_map = answer_to_id_map

    def _create_word_list(self, sentences):
        """
        Turn a list of sentences into a list of processed words (no punctuation, lowercase, etc)
        Args:
            sentences: a list of str, sentences to be split into words
        Return:
            A list of str, words from the split, order preserved.
        """
        word_list = []
        # Source: https://www.geeksforgeeks.org/removing-punctuations-given-string/
        for sentence in sentences:
            sentence = sentence.lower()
            punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
            for x in sentence:
                if x in punctuations:
                    sentence = sentence.replace(x, "")
            word_list.extend(sentence.split(" "))
        return word_list

    def _create_id_map(self, word_list, max_list_length):
        """
        Find the most common str in a list, then create a map from str to id (its rank in the frequency)
        Args:
            word_list: a list of str, where the most frequent elements are picked out
            max_list_length: the number of strs picked
        Return:
            A map (dict) from str to id (rank)
        """
        freq_words = {}
        for word in word_list:
            if word in freq_words:
                freq_words[word] += 1
            else:
                freq_words[word] = 1
        # Sort dictionary by frequency of words
        freq_words = dict(
            sorted(freq_words.items(), key=operator.itemgetter(1), reverse=True))
        # Keep only the max_list_length most frequent words and map each to its rank
        freq_words = list(freq_words.items())[:max_list_length]
        freq_words = [(val[0], idx) for idx, val in enumerate(freq_words)]
        freq_words = dict(freq_words)
        return freq_words

    def __len__(self):
        return len(self.ques_ids)

    def __getitem__(self, idx):
        """
        Load an item of the dataset
        Args:
            idx: index of the data item
        Return:
            A tuple of (image, question one-hot, answers one-hot, question string,
            majority answer string, answer vocabulary list).
        """
        ques_id = self.ques_ids[idx]
        ann = self._vqa.loadQA(ques_id)
        img_id = ann[0]['image_id']
        img_id = str(img_id)
        img_id = img_id.zfill(12)

        if self._cache_location is not None and self._pre_encoder is not None:
            ############ 3.2 TODO
            # implement your caching and loading logic here
            ############
            raise NotImplementedError()
        else:
            # load the image from disk, apply self._transform (if not None)
            fpath = os.path.join(self._image_dir,
                                 self._image_filename_pattern.format(img_id))
            img = Image.open(fpath)
            img = img.convert('RGB')
            if self._transform:
                img = self._transform(img)
            else:
                img = transforms.functional.to_tensor(img)

        # Encode the question as a (max_question_length, vocab) one-hot grid, then
        # collapse it into a single bag-of-words vector.
        question = self._vqa.questions['questions'][idx]['question']
        question_to_return = question
        question_split = self._create_word_list([question])
        total_question_words = np.array(list(self.question_word_to_id_map.keys()))
        question_one_hot = np.zeros(
            [self._max_question_length, self.question_word_list_length])
        for q_idx, word in enumerate(question_split):
            if q_idx == self._max_question_length:
                break
            if word in total_question_words:
                hot_idx = np.where(word == total_question_words)[0][0]
                question_one_hot[q_idx, hot_idx] = 1
            else:
                question_one_hot[q_idx, -1] = 1
        question_one_hot = torch.from_numpy(question_one_hot)
        question_one_hot = torch.clamp(torch.sum(question_one_hot, dim=0), max=1)

        # Encode each of the 10 annotator answers as a one-hot row.
        answers = self._vqa.dataset['annotations'][idx]['answers']
        answers_one_hot = np.zeros([10, self.answer_list_length])
        total_answer_words = np.array(list(self.answer_to_id_map.keys()))
        all_answer = []
        for a_idx, answer_dict in enumerate(answers):
            answer = answer_dict['answer']
            all_answer.append(answer)
            if answer in total_answer_words:
                hot_idx = np.where(answer == total_answer_words)[0][0]
                answers_one_hot[a_idx, hot_idx] = 1
            else:
                answers_one_hot[a_idx, -1] = 1
        occurence_count = Counter(all_answer)
        answer_to_return = occurence_count.most_common(1)[0][0]
        answers_one_hot = torch.from_numpy(answers_one_hot)

        return img, question_one_hot, answers_one_hot, question_to_return, \
            answer_to_return, list(self.answer_to_id_map.keys())
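# One way the "3.2 TODO" caching branch above could be filled in. A sketch only: it
# assumes pre_encoder maps a (1, C, H, W) image tensor to a feature tensor, and caches
# one .pt file per image id under cache_location; the helper name is hypothetical.
import os
import torch
from PIL import Image
from torchvision import transforms

def load_cached_features(dataset, img_id):
    """Encode an image with dataset._pre_encoder, caching the result on disk."""
    cache_file = os.path.join(dataset._cache_location, '{}.pt'.format(img_id))
    if os.path.exists(cache_file):
        return torch.load(cache_file)
    fpath = os.path.join(dataset._image_dir,
                         dataset._image_filename_pattern.format(img_id))
    img = Image.open(fpath).convert('RGB')
    img = dataset._transform(img) if dataset._transform else transforms.functional.to_tensor(img)
    with torch.no_grad():
        feat = dataset._pre_encoder(img.unsqueeze(0)).squeeze(0)
    torch.save(feat, cache_file)
    return feat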
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the
    External folder, but you may want to reference the full repo
    (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, transform=None, question_word_to_id_map=None,
                 answer_to_id_map=None, question_word_list_length=5746,
                 answer_list_length=5216, pre_encoder=None, cache_location=None,
                 max_list_length=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                mapping images, questions, and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset
                use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path,
                        question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        if transform is not None:
            self._transform = transform
        else:
            self._transform = transforms.Compose([
                transforms.ToTensor(),
            ])
        self._max_question_length = 26

        # Publicly accessible dataset parameters; the extra slot is the "unknown" index.
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length

        self.question_ids = self._vqa.getQuesIds()
        self.fixed_str = '000000000000'
        self._pre_encoder = pre_encoder
        self._cache_location = cache_location
        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # Create the question map if necessary
        if question_word_to_id_map is None:
            ############ 1.6 TODO
            question_sentences = []
            for question_id in self.question_ids:
                question_sentences.append(self._vqa.qqa[question_id]['question'])
            word_list = self._create_word_list(question_sentences)
            self.question_word_to_id_map = self._create_id_map(
                word_list, question_word_list_length)
            print("Created new question_to_id_map")
            ############
        else:
            self.question_word_to_id_map = question_word_to_id_map
            print("Reused question_to_id_map")

        # Create the answer map if necessary
        if answer_to_id_map is None:
            ############ 1.7 TODO
            answer_sentence_list = []
            for question_id in self.question_ids:
                answer_list = self._vqa.qa[question_id]['answers']
                for item in answer_list:
                    answer_sentence_list.append(item['answer'])
            self.answer_to_id_map = self._create_id_map(
                answer_sentence_list, answer_list_length)
            print("Created new answer_to_id_map")
            ############
        else:
            self.answer_to_id_map = answer_to_id_map
            print("Reused answer_to_id_map")

    def _create_word_list(self, sentences):
        """
        Turn a list of sentences into a list of processed words (no punctuation, lowercase, etc)
        Args:
            sentences: a list of str, sentences to be split into words
        Return:
            A list of str, words from the split, order preserved.
        """
        ############ 1.4 TODO
        # Reference: https://machinelearningmastery.com/clean-text-machine-learning-python/
        import string
        table = str.maketrans('', '', string.punctuation)
        word_list = []
        if isinstance(sentences, list):
            for sentence in sentences:
                words = sentence.split(" ")
                word_list += [word.translate(table).lower() for word in words]
        else:
            words = sentences.split(" ")
            word_list += [word.translate(table).lower() for word in words]
        ############
        return word_list

    def _create_id_map(self, word_list, max_list_length):
        """
        Find the most common str in a list, then create a map from str to id (its rank in the frequency)
        Args:
            word_list: a list of str, where the most frequent elements are picked out
            max_list_length: the number of strs picked
        Return:
            A map (dict) from str to id (rank)
        """
        ############ 1.5 TODO
        from collections import Counter
        word_rank_list = Counter(word_list).most_common(max_list_length)
        id_map = {}
        for idx, (word, _) in enumerate(word_rank_list):
            id_map[word] = idx
        ############
        return id_map

    def __len__(self):
        ############ 1.8 TODO
        return len(self._vqa.questions['questions'])
        ############

    def __getitem__(self, idx):
        """
        Load an item of the dataset
        Args:
            idx: index of the data item
        Return:
            A dict containing multiple torch tensors for image, question and answers.
        """
        ############ 1.9 TODO
        # figure out the idx-th item of dataset from the VQA API
        image_id_from_idx = self._vqa.questions['questions'][idx]['image_id']
        question_id_from_idx = self._vqa.questions['questions'][idx]['question_id']
        question_sentence = self._vqa.questions['questions'][idx]['question']
        answer_sentences = [
            ans['answer'] for ans in self._vqa.qa[question_id_from_idx]['answers']
        ]
        ############

        # if self._cache_location is not None and self._pre_encoder is not None:
        #     ############ 3.2 TODO
        #     # implement your caching and loading logic here
        #     ############
        #     raise NotImplementedError()
        # else:

        ############ 1.9 TODO
        # load the image from disk, apply self._transform (if not None)
        image_id_from_idx_string = self.fixed_str + str(image_id_from_idx)
        truncated_image_id_from_idx = image_id_from_idx_string[-12:]
        img_file_path = self._image_dir + '/' + self._image_filename_pattern.format(
            truncated_image_id_from_idx)
        image = Image.open(img_file_path)
        image = image.convert("RGB")
        image = self._transform(image)
        ############

        ############ 1.9 TODO
        # load and encode the question and answers, convert to torch tensors
        question_encoding = torch.zeros(self._max_question_length,
                                        self.question_word_list_length)
        answer_encoding = torch.zeros(10, self.answer_list_length)

        question_word_list = self._create_word_list(question_sentence)
        for i, word in enumerate(question_word_list):
            if i >= self._max_question_length:
                break
            if word not in self.question_word_to_id_map.keys():
                map_idx = self.unknown_question_word_index
            else:
                map_idx = self.question_word_to_id_map[word]
            question_encoding[i][map_idx] = 1

        for i, answer in enumerate(answer_sentences):
            if answer not in self.answer_to_id_map.keys():
                map_idx = self.unknown_answer_index
            else:
                map_idx = self.answer_to_id_map[answer]
            answer_encoding[i][map_idx] = 1

        data = {
            'image': image,
            'question': question_encoding,
            'answer': answer_encoding
        }
        return data
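# The 'answer' tensor above stacks the 10 annotator answers as one-hot rows. One common
# way to reduce them to a single classification target is a majority vote over those
# rows; a minimal sketch (the helper name is not part of the original code):
def majority_answer_index(answer_encoding):
    # answer_encoding: (10, answer_list_length) one-hot rows
    votes = answer_encoding.sum(dim=0)      # per-class vote counts
    return int(votes.argmax().item())       # index of the most frequent answer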
class ImgDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the
    External folder, but you may want to reference the full repo
    (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path,
                 image_filename_pattern, existing_format=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations
                mapping images, questions, and answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset
                use (eg "COCO_train2014_{}.jpg")
        """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.img_list = self.getUniqueImg()

    def imgIdToPath(self, idx):
        # Fill the "{}" slot of the filename pattern with the zero-padded image id.
        str_bracket = "{}"
        start_idx = self.image_filename_pattern.find(str_bracket)
        path = (self.image_filename_pattern[0:start_idx] + str(idx).zfill(12) +
                self.image_filename_pattern[start_idx + 2:])
        path = self.image_dir + "/" + path
        return path

    def getUniqueImg(self):
        # Collect the distinct image ids referenced by the questions.
        count_img = []
        for i in tqdm(range(len(self.queIds))):
            qa_id = self.queIds[i]
            qa = self.vqa.loadQA(qa_id)[0]
            image_id = qa['image_id']
            if image_id not in count_img:
                count_img.append(image_id)
        print("Unique images size: ", len(count_img))
        return count_img

    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, idx):
        if idx >= len(self.vqa.qa):
            print("Error: access overflow")
            return None
        img_idx = self.img_list[idx]

        data = {}
        data['images_id'] = img_idx
        tmp_img = Image.open(self.imgIdToPath(img_idx))
        tmp_img = tmp_img.convert('RGB')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        trans = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            normalize,
        ])
        data['images'] = trans(tmp_img)
        return data
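# Sketch of a feature pre-extraction pass over ImgDataset, writing one HDF5 file per
# image so that a prepro_path pattern like "out/{}.h5" in the VqaDataset above can load
# them. Assumptions: torchvision's resnet18 as the frozen encoder and the "out/" output
# directory are placeholders, not part of the original code.
import os
import h5py
import torch
import torchvision
from torch.utils.data import DataLoader

encoder = torchvision.models.resnet18(pretrained=True)
encoder = torch.nn.Sequential(*list(encoder.children())[:-1])  # drop the classifier head
encoder.eval()

img_ds = ImgDataset(image_dir, qjson, ajson, "COCO_train2014_{}.jpg")
loader = DataLoader(img_ds, batch_size=32, num_workers=4)

os.makedirs("out", exist_ok=True)
with torch.no_grad():
    for batch in loader:
        feats = encoder(batch['images']).squeeze(-1).squeeze(-1)  # (B, 512)
        for img_id, feat in zip(batch['images_id'], feats):
            with h5py.File("out/{}.h5".format(int(img_id)), 'w') as f:
                f.create_dataset("features", data=feat.numpy())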