Example 1
import os
import operator
from collections import defaultdict

from vqaTools.vqa import VQA  # VQA helper API (https://github.com/GT-Vision-Lab/VQA); the import path depends on where the helper lives in your project


def pre_process_dataset(image_dir, qjson, ajson, img_prefix):
    print('Preprocessing dataset. \n')
    vqa = VQA(ajson, qjson)

    img_names = [f for f in os.listdir(image_dir) if '.jpg' in f]
    img_ids = []
    for fname in img_names:
        img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
        img_ids.append(int(img_id))

    ques_ids = vqa.getQuesIds(img_ids)

    # defaultdict trick: the first lookup of a word assigns it the next free index
    q2i = defaultdict(lambda: len(q2i))
    pad = q2i["<pad>"]
    start = q2i["<sos>"]
    end = q2i["<eos>"]
    UNK = q2i["<unk>"]

    a2i_count = {}
    for ques_id in ques_ids:
        qa = vqa.loadQA(ques_id)[0]
        qqa = vqa.loadQQA(ques_id)[0]

        ques = qqa['question'][:-1]          # drop the trailing '?'
        # builds q2i as a side effect: each new word gets the next free index
        [q2i[x] for x in ques.lower().strip().split(" ")]

        answers = qa['answers']
        for ans in answers:
            if not ans['answer_confidence'] == 'yes':
                continue
            ans = ans['answer'].lower()
            if ans not in a2i_count:
                a2i_count[ans] = 1
            else:
                a2i_count[ans] = a2i_count[ans] + 1

    a_sort = sorted(a2i_count.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    i2a = {}
    count = 0
    a2i = defaultdict(lambda: len(a2i))
    # keep only the 1000 most frequent answers as answer classes
    for word, _ in a_sort:
        a2i[word]                  # side effect: assigns the next free index to this answer
        i2a[a2i[word]] = word
        count = count + 1
        if count == 1000:
            break

    return q2i, a2i, i2a, a2i_count
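A minimal usage sketch for the preprocessing step above; the image directory, JSON paths and filename prefix are placeholders for wherever the COCO images and VQA files live:

q2i, a2i, i2a, a2i_count = pre_process_dataset(
    image_dir='data/train2014',                                # folder of COCO .jpg images (placeholder)
    qjson='data/OpenEnded_mscoco_train2014_questions.json',    # question JSON (placeholder path)
    ajson='data/mscoco_train2014_annotations.json',            # annotation JSON (placeholder path)
    img_prefix='COCO_train2014_')
print(len(q2i), 'question words |', len(a2i), 'answer classes kept')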
Example 2
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """
    def __init__(self,
                 image_dir,
                 question_json_file_path,
                 annotation_json_file_path,
                 image_filename_pattern,
                 collate=False,
                 q2i=None,
                 a2i=None,
                 i2a=None,
                 img_names=None,
                 img_ids=None,
                 ques_ids=None,
                 method='simple',
                 dataset_type='train',
                 enc_dir=''):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        print(method)
        self.image_dir = image_dir
        self.qjson = question_json_file_path
        self.ajson = annotation_json_file_path
        img_prefix = image_filename_pattern.split('{}')[0]
        self.collate = collate
        self.q2i = q2i
        self.a2i = a2i
        self.i2a = i2a
        #self.a2i_count = a2i_count
        self.img_ids = img_ids
        self.ques_ids = ques_ids
        self.img_names = img_names
        self.method = method
        self.vqa = VQA(self.ajson, self.qjson)

        if self.method == 'simple':
            self.transform = transforms.Compose(
                [transforms.Resize((224, 224)),
                 transforms.ToTensor()])
        else:
            self.transform = transforms.Compose(
                [transforms.Resize((448, 448)),
                 transforms.ToTensor()])

        #if not collate:
        #    self.img_names = [f for f in os.listdir(self.image_dir) if '.jpg' in f]
        #    self.img_ids = []
        #    for fname in self.img_names:
        #        img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
        #        self.img_ids.append(int(img_id))

        #    self.ques_ids = self.vqa.getQuesIds(self.img_ids)

        #    self.q2i, self.a2i, self.i2a, self.a2i_count = pre_process_dataset(image_dir, self.qjson,
        #                                                                       self.ajson, img_prefix)

        self.q2i_len = len(self.q2i)
        self.a2i_len = len(self.a2i.keys())
        self.q2i_keys = self.q2i.keys()
        self.enc_dir = enc_dir

        #if collate and dataset_type == 'train':
        #    with open('/home/ubuntu/hw3_release/data/train_enc_idx.npy', 'rb') as f:
        #        self.enc_idx = pickle.load(f)
        #elif collate and dataset_type == 'val':
        #    with open('/home/ubuntu/hw3_release/data/val_enc_idx.npy', 'rb') as f:
        #        self.enc_idx = pickle.load(f)

    def __len__(self):
        return len(self.ques_ids)

    def __getitem__(self, idx):
        ques_id = self.ques_ids[idx]
        img_id = self.vqa.getImgIds([ques_id])[0]

        qa = self.vqa.loadQA(ques_id)[0]
        qqa = self.vqa.loadQQA(ques_id)[0]
        img_name = self.img_names[self.img_ids.index(img_id)]

        if self.method == 'simple':
            img = default_loader(self.image_dir + '/' + img_name)
            #imgT = self.transform(img).permute(1, 2, 0)
            imgT = self.transform(img).float()
        else:
            #file_idx = self.enc_idx[img_id] // 50
            #arr_idx = self.enc_idx[img_id] % 50
            #path = self.enc_dir + '/' + str(file_idx) + '.npz'
            #img = np.load(path)['out'][arr_idx, :, :]               # 512 x 196
            #imgT = torch.from_numpy(img).float()

            img = default_loader(self.image_dir + '/' + img_name)
            imgT = self.transform(img).float()

        ques = qqa['question'][:-1]
        quesI = [self.q2i["<sos>"]] + [
            self.q2i[x.lower()]
            for x in ques.split(" ") if x.lower() in self.q2i_keys
        ] + [self.q2i["<eos>"]]
        if not self.collate:
            quesI = quesI + [self.q2i["<pad>"]] * (8 - len(quesI))   # pad short questions up to 8 tokens
        if self.method == 'simple':
            # bag-of-words question encoding: multi-hot vector over the question vocabulary
            quesT = torch.zeros(self.q2i_len).float()
            for word_idx in quesI:
                quesT[word_idx] = 1
        else:
            quesT = torch.from_numpy(np.array(quesI)).long()

        answers = qa['answers']
        max_count = 0
        answer = ""
        for ans in answers:
            #if not ans['answer_confidence'] == 'yes':
            #    continue
            ans = ans['answer'].lower()
            if ans in self.a2i.keys():  # and self.a2i_count[ans] > max_count:
                #max_count = self.a2i_count[ans]
                answer = ans

        if answer == "":  # only for validation
            gT = torch.from_numpy(np.array([self.a2i_len])).long()
        else:
            gT = torch.from_numpy(np.array([self.a2i[answer]])).long()

        #print("ds: quesT", quesT.shape, quesT)
        #print("ds: gT", gT.shape)
        #print("ds: imgT", imgT.shape)

        if not self.collate:
            return {'img': imgT, 'ques': quesT, 'gt': gT}

        return imgT, quesT, gT
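A hedged sketch of how this dataset could be wired up, reusing the vocabularies from pre_process_dataset in Example 1; the paths, filename pattern and loader settings are placeholders:

import os
from torch.utils.data import DataLoader

image_dir, img_prefix = 'data/train2014', 'COCO_train2014_'          # placeholder paths
qjson = 'data/OpenEnded_mscoco_train2014_questions.json'
ajson = 'data/mscoco_train2014_annotations.json'

q2i, a2i, i2a, _ = pre_process_dataset(image_dir, qjson, ajson, img_prefix)
img_names = [f for f in os.listdir(image_dir) if '.jpg' in f]
img_ids = [int(f.split('.')[0].rpartition(img_prefix)[-1]) for f in img_names]
ques_ids = VQA(ajson, qjson).getQuesIds(img_ids)

train_ds = VqaDataset(image_dir, qjson, ajson, 'COCO_train2014_{}.jpg',
                      q2i=q2i, a2i=a2i, i2a=i2a,
                      img_names=img_names, img_ids=img_ids, ques_ids=ques_ids,
                      method='simple')
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)     # yields dicts with 'img', 'ques', 'gt'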
Example 3
# Assumes img_names, img_prefix and a VQA instance (vqa) set up as in Example 1
img_ids = []
for fname in img_names:
    img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
    img_ids.append(int(img_id))

ques_ids = vqa.getQuesIds(img_ids)

q2i = defaultdict(lambda: len(q2i))
pad = q2i["<pad>"]
start = q2i["<sos>"]
end = q2i["<eos>"]
UNK = q2i["<unk>"]

a2i_count = {}
for ques_id in ques_ids:
    qa = vqa.loadQA(ques_id)[0]
    qqa = vqa.loadQQA(ques_id)[0]

    ques = qqa['question'][:-1]
    [q2i[x] for x in ques.lower().strip().split(" ")]

    answers = qa['answers']
    for ans in answers:
        if not ans['answer_confidence'] == 'yes':
            continue
        ans = ans['answer'].lower()
        if ans not in a2i_count:
            a2i_count[ans] = 1
        else:
            a2i_count[ans] = a2i_count[ans] + 1
Example 4
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern,
                 is_training=True, transform=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.ques_idx_list = self.vqa.getQuesIds()
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern

        if os.path.exists('ques_dictionary.pkl'):
            with open('ques_dictionary.pkl', 'rb') as f:
                self.dictionary = pickle.load(f) 
        else:
            if is_training:
                self.dictionary = _build_question_dictionary(self.vqa)
                with open('ques_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.dictionary, f)
            else:
                raise "No dictionary built from training dataset!"

        if os.path.exists('ans_dictionary.pkl'): 
            with open('ans_dictionary.pkl', 'rb') as f:
                self.answers = pickle.load(f) 
        else:
            if is_training:
                self.answers = _build_answer_dictionary(self.vqa)
                with open('ans_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.answers, f)
            else:
                raise "No answer list built from training dataset!"

        # print(self.dictionary)
        # print(self.answers)
        self.image_transform = transform

    def __len__(self):
        return len(self.ques_idx_list)

    def __getitem__(self, idx):
        ques_idx = self.ques_idx_list[idx]
        ann = self.vqa.loadQA(ques_idx)[0]

        image_id = ann['image_id']
        image_name = self.image_filename_pattern.format(str(image_id).zfill(12))
        image_path = os.path.join(self.image_dir, image_name)
        if os.path.splitext(image_path)[1] == '.npy':
            image = np.load(image_path).T
        else:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transform(image)

        question = self.vqa.qqa[ques_idx]['question']
        answers = ann['answers']
        best_answer = _get_majority_ans(answers)
        return {
                'image': image,
                'image_path': image_name,
                'question': question,
                'answer': best_answer,
                'question_encoding': _encode_question(question, self.dictionary),
                'answer_encoding': _encode_answer(best_answer, self.answers),
                }
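The helpers referenced above (_build_question_dictionary, _build_answer_dictionary, _encode_question, _encode_answer, _get_majority_ans) are defined elsewhere in the module and are not part of this snippet. Purely as an illustration, a majority-vote helper compatible with how _get_majority_ans is called could look like this (hypothetical sketch):

from collections import Counter

def _get_majority_ans(answers):
    # Hypothetical sketch: return the most frequent of the (typically ten) annotator answers.
    counts = Counter(ans['answer'].lower() for ans in answers)
    return counts.most_common(1)[0][0]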
Example 5
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern, existing_format=None, ques_thres=12, ans_thres=6, seq_len=50, prepro=False, prepro_path=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern
        self.prepro = prepro
        print("Allow preprocessing: ", self.prepro)
        # self.existing_format = existing_format

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.quesWords = self.getSplitQues()
        self.ansWords = self.getSplitAns()

        self.ques_thres = ques_thres
        self.ans_thres = ans_thres

        self.id_images = {}

        if existing_format is None:
            self.quesWordToIdx, self.quesVecSize = self.BuildBoW(self.quesWords,self.ques_thres)
            self.ansWordToIdx, self.ansVecSize = self.BuildBoW(self.ansWords,self.ans_thres)
            # self.quesVecSize = len(self.quesWordToIdx)
            self.seq_len = seq_len

        else:
            self.quesWordToIdx = existing_format.quesWordToIdx
            self.ansWordToIdx = existing_format.ansWordToIdx
            self.quesVecSize = existing_format.quesVecSize
            self.ansVecSize = existing_format.ansVecSize
            self.seq_len = existing_format.seq_len

        if self.prepro:
            self.prepro_path = prepro_path
            # features_extracted = h5py.File(prepro_path, 'r')
            # self.features_h5 = features_extracted["features"][:]
            # self.ids_h5 = features_extracted["ids"][:]

        print("The length of question vector: ", self.quesVecSize)

    def getImgSize(self):
        return len(self.vqa.getImgIds())

    def imgIdToPath(self, idx):
        str_bracket = "{}"
        start_idx = self.image_filename_pattern.find(str_bracket)
        path = self.image_filename_pattern[0:start_idx] + str(idx).zfill(12) + self.image_filename_pattern[start_idx+2:]
        path = self.image_dir + "/" + path
        return path

    def getSplitQues(self):
        ques_words = []
        for i in range(0,len(self.queIds)):
            question = self.vqa.qqa[self.queIds[i]]['question']
            question = question[0:-1]
            question = question.lower()
            ques_words += question.split()
        return ques_words

    def getSplitAns(self):
        ans_words = []
        for i in range(0,len(self.queIds)):
            anss = self.vqa.qa[self.queIds[i]]['answers']
            for ans in anss:
                ans_str = ans["answer"]
                ans_words.append(ans_str)
        return ans_words

    def BoWPool(self, data):
        # Map every vocabulary item to an index, reserving index 0 for 'NA' (unknown).
        vocab_set = {}
        vocab_set['NA'] = 0
        for vocab in data:
            if vocab not in vocab_set:
                vocab_set[vocab] = len(vocab_set)

        return vocab_set

    def BuildBoW(self, data, thres):
        # Count word frequencies; words seen more than `thres` times get their own index,
        # everything below the threshold maps to 'NA' (index 0).
        vocab_set = {}
        for vocab in data:
            if vocab not in vocab_set:
                vocab_set[vocab] = 1
            else:
                vocab_set[vocab] += 1

        vocab_map = {}
        vocab_map['NA'] = 0
        idx = 1
        for vocab in data:
            if vocab not in vocab_map:
                if vocab_set[vocab] > thres:
                    vocab_map[vocab] = idx
                    idx += 1
                else:
                    vocab_map[vocab] = vocab_map['NA']

        # vocab_map['END'] = -1

        return vocab_map, idx

    def BoWVoting(self, sentences, table):
        # Majority vote over the annotator answers; answers not in the vocabulary map to index 0 ('NA').
        count_set = {}
        for word in sentences:
            if word['answer'] not in count_set:
                count_set[word['answer']] = 1
            else:
                count_set[word['answer']] += 1
        sorted_dict = sorted(count_set.items(), key=lambda kv: kv[1])
        res_word = sorted_dict[-1][0]
        best_ind = 0
        if res_word in table:
            best_ind = table[res_word]
        return np.array(best_ind)

    def BoWVector(self, sentence, table):
        # Single multi-hot bag-of-words vector over the question vocabulary.
        bow_vec = np.zeros(self.quesVecSize)

        for i in range(self.seq_len):
            if i < len(sentence):
                if sentence[i] in table:
                    bow_vec[table[sentence[i]]] = 1
                else:
                    bow_vec[table['NA']] = 1
        return bow_vec

    def BoWVectorGeneric(self, sentence, table):
        # One one-hot row per token, up to seq_len tokens.
        bow_vec = np.zeros([self.seq_len, self.quesVecSize])

        for i in range(self.seq_len):
            if i < len(sentence):
                if sentence[i] in table:
                    bow_vec[i, table[sentence[i]]] = 1
                else:
                    bow_vec[i, table['NA']] = 1
            else:
                break

        return bow_vec

    def saveFeatures(self, feat, id):
        self.id_images[id] = feat

    def __len__(self):
        return len(self.queIds)

    def __getitem__(self, idx):
        
        if idx >= len(self.vqa.qa):
            print("Error: access overflow")
            return None

        idx_qa = self.queIds[idx]
        qa = self.vqa.loadQA(idx_qa)[0]
        data = {}
        tmp_question = self.vqa.qqa[idx_qa]['question']
        tmp_question = tmp_question.lower()[:-1]
        data['questions'] = torch.from_numpy(self.BoWVectorGeneric(tmp_question.split(),self.quesWordToIdx))
        tmp_answers = qa['answers']

        data['gt_answer'] = torch.from_numpy(self.BoWVoting(tmp_answers,self.ansWordToIdx))
        data['images_id'] = qa['image_id']
        if self.prepro:
            # h5_idxs = self.ids_h5
            # query_idx = np.where(h5_idxs==qa['image_id'])
            # data['images_id'] = qa['image_id']
            # tmp_features = self.features_h5[query_idx[0][0]]
            # tmp_features = tmp_features.astype(np.float32)
            # data['images'] = torch.from_numpy(tmp_features)

            # Above for all features in one h5 file
            # Below for several different feature files

            img_idx = qa['image_id']
            str_bracket = "{}"
            start_idx = self.prepro_path.find(str_bracket)
            path = self.prepro_path[0:start_idx] + str(img_idx) + self.prepro_path[start_idx+2:]
            features_extracted = h5py.File(path, 'r')
            feature = features_extracted["features"][:]
            data['images'] = feature

        else:
            tmp_img = Image.open(self.imgIdToPath(qa['image_id']))
            tmp_img = tmp_img.convert('RGB')
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

            trans = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                normalize,
            ])
            data['images'] = trans(tmp_img)

        return data
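A hedged usage sketch for this class: build the training set first, then pass it as existing_format so the validation set reuses the same question/answer indexing (paths are placeholders):

train_ds = VqaDataset(image_dir='data/train2014',
                      question_json_file_path='data/train_questions.json',
                      annotation_json_file_path='data/train_annotations.json',
                      image_filename_pattern='COCO_train2014_{}.jpg')
val_ds = VqaDataset(image_dir='data/val2014',
                    question_json_file_path='data/val_questions.json',
                    annotation_json_file_path='data/val_annotations.json',
                    image_filename_pattern='COCO_val2014_{}.jpg',
                    existing_format=train_ds)
sample = train_ds[0]    # dict with 'questions', 'gt_answer', 'images', 'images_id'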
Example 6
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """
    def __init__(self,
                 image_dir,
                 question_json_file_path,
                 annotation_json_file_path,
                 image_filename_pattern,
                 transform=None,
                 question_word_to_id_map=None,
                 answer_to_id_map=None,
                 question_word_list_length=5746,
                 answer_list_length=5216,
                 pre_encoder=None,
                 cache_location=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path,
                        question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        self._transform = transform
        self._max_question_length = 26
        self.ques_ids = self._vqa.getQuesIds()

        # Publicly accessible dataset parameters
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length
        self._pre_encoder = pre_encoder
        self._cache_location = cache_location
        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # Create the question map if necessary
        if question_word_to_id_map is None:
            questions_list = []
            questions = self._vqa.questions['questions']

            for question in questions:
                questions_list.append(question['question'])

            word_list = self._create_word_list(questions_list)
            self.question_word_to_id_map = self._create_id_map(
                word_list, self.question_word_list_length)
        else:
            self.question_word_to_id_map = question_word_to_id_map

        # Create the answer map if necessary
        if answer_to_id_map is None:
            answer_list = []
            answers = self._vqa.dataset['annotations']
            for answer in answers:
                all_answers = answer['answers']
                for each_answer in all_answers:
                    answer_list.append(each_answer['answer'])

            self.answer_to_id_map = self._create_id_map(
                answer_list, self.answer_list_length)

        else:
            self.answer_to_id_map = answer_to_id_map

    def _create_word_list(self, sentences):
        """
        Turn a list of sentences into a list of processed words (no punctuation, lowercase, etc.)
        Args:
            sentences: a list of str, sentences to be split into words
        Return:
            A list of str, words from the split, order preserved.
        """

        word_list = []
        # Source: https://www.geeksforgeeks.org/removing-punctuations-given-string/
        for sentence in sentences:
            sentence = sentence.lower()
            punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
            for x in sentence:
                if x in punctuations:
                    sentence = sentence.replace(x, "")
            word_list.extend(sentence.split(" "))

        return word_list

    def _create_id_map(self, word_list, max_list_length):
        """
        Find the most common str in a list, then create a map from str to id (its rank in the frequency)
        Args:
            word_list: a list of str, where the most frequent elements are picked out
            max_list_length: the number of strs picked
        Return:
            A map (dict) from str to id (rank)
        """

        freq_words = {}

        for word in word_list:
            if word in freq_words:
                freq_words[word] += 1
            else:
                freq_words[word] = 1

        # Sort dictionary by frequency of words
        freq_words = dict(
            sorted(freq_words.items(),
                   key=operator.itemgetter(1),
                   reverse=True))

        # Update dictionary for the max list length
        # take(n, iterable) returns the first n items (e.g. more_itertools.take or the classic itertools recipe)
        freq_words = take(max_list_length, freq_words.items())

        freq_words = [(val[0], idx) for idx, val in enumerate(freq_words)]
        freq_words = dict(freq_words)

        return freq_words

    def __len__(self):

        return len(self.ques_ids)

    def __getitem__(self, idx):
        """
        Load an item of the dataset
        Args:
            idx: index of the data item
        Return:
            A dict containing multiple torch tensors for image, question and answers.
        """

        ques_id = self.ques_ids[idx]
        question = self._vqa.loadQA(ques_id)
        img_id = question[0]['image_id']
        img_id = str(img_id)
        img_id = img_id.zfill(12)

        if self._cache_location is not None and self._pre_encoder is not None:
            ############ 3.2 TODO
            # implement your caching and loading logic here

            ############
            raise NotImplementedError()
        else:
            # load the image from disk, apply self._transform (if not None)
            fpath = os.path.join(self._image_dir,
                                 self._image_filename_pattern.format(img_id))
            img = Image.open(fpath)
            img = img.convert('RGB')
            if self._transform:
                img = self._transform(img)
            else:
                #TODO: Check if this is right
                img = transforms.functional.to_tensor(img)

        question = self._vqa.questions['questions'][idx]['question']
        question_to_return = question
        question_split = self._create_word_list([question])

        total_question_words = np.array(
            list(self.question_word_to_id_map.keys()))

        question_one_hot = np.zeros(
            [self._max_question_length, self.question_word_list_length])

        for idx, word in enumerate(question_split):
            if idx == self._max_question_length:
                break
            contains_word = word in total_question_words
            if contains_word:
                hot_idx = np.where(word == total_question_words)[0][0]
                question_one_hot[idx, hot_idx] = 1
            else:
                question_one_hot[idx, -1] = 1
        question_one_hot = torch.from_numpy(question_one_hot)

        question_one_hot = torch.clamp(torch.sum(question_one_hot, dim=0),
                                       max=1)

        answers = self._vqa.dataset['annotations'][idx]['answers']
        answers_one_hot = np.zeros([10, self.answer_list_length])
        total_answer_words = np.array(list(self.answer_to_id_map.keys()))

        all_answer = []

        for idx, answer_dict in enumerate(answers):
            answer = answer_dict['answer']
            all_answer.append(answer)
            contains_word = answer in total_answer_words
            if contains_word:
                hot_idx = np.where(answer == total_answer_words)[0][0]
                answers_one_hot[idx, hot_idx] = 1
            else:
                answers_one_hot[idx, -1] = 1

        occurence_count = Counter(all_answer)
        answer_to_return = occurence_count.most_common(1)[0][0]

        answers_one_hot = torch.from_numpy(answers_one_hot)
        #img = img.cuda()
        #question_one_hot = question_one_hot.cuda()
        #answers_one_hot_list = answers_one_hot_list.cuda()

        #datapoint = {'image':img, 'question_tensor':question_one_hot, 'answers_tensor':answers_one_hot}

        #return datapoint
        return img, question_one_hot, answers_one_hot, question_to_return, answer_to_return, list(
            self.answer_to_id_map.keys())
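A hedged usage sketch: build the word/answer maps on the training split once and hand them to the validation split so both share the same ids (paths, transform and batch size are placeholders):

import torchvision.transforms as transforms
from torch.utils.data import DataLoader

transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
train_ds = VqaDataset('data/train2014',
                      'data/train_questions.json',
                      'data/train_annotations.json',
                      'COCO_train2014_{}.jpg',
                      transform=transform)
val_ds = VqaDataset('data/val2014',
                    'data/val_questions.json',
                    'data/val_annotations.json',
                    'COCO_val2014_{}.jpg',
                    transform=transform,
                    question_word_to_id_map=train_ds.question_word_to_id_map,
                    answer_to_id_map=train_ds.answer_to_id_map)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)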
Example 7
class ImgDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern, existing_format=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
            """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern

        # self.existing_format = existing_format

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.img_list = self.getUniqueImg()

    def imgIdToPath(self, idx):
        str_bracket = "{}"
        start_idx = self.image_filename_pattern.find(str_bracket)
        path = self.image_filename_pattern[0:start_idx] + str(idx).zfill(12) + self.image_filename_pattern[start_idx+2:]
        path = self.image_dir + "/" + path
        return path

    def getUniqueImg(self):
        # Collect the distinct image ids referenced by the questions.
        count_img = []
        for i in tqdm(range(len(self.queIds))):
            qa_id = self.queIds[i]
            qa = self.vqa.loadQA(qa_id)[0]
            image_id = qa['image_id']
            if image_id not in count_img:
                count_img.append(image_id)
        print("Unique images size: ", len(count_img))
        return count_img


    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, idx):

        if idx >= len(self.img_list):
            print("Error: access overflow")
            return None

        img_idx = self.img_list[idx]
        data = {}

        data['images_id'] = img_idx
        tmp_img = Image.open(self.imgIdToPath(img_idx))
        tmp_img = tmp_img.convert('RGB')
        normalize = transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])

        trans = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            normalize,
            ])
        data['images'] = trans(tmp_img)

        return data
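A hedged sketch of how ImgDataset might be used to precompute image features once; the ResNet-18 encoder, paths and saving strategy are assumptions, not part of the original snippet:

import torch
from torch.utils.data import DataLoader
import torchvision.models as models

img_ds = ImgDataset('data/train2014',
                    'data/train_questions.json',
                    'data/train_annotations.json',
                    'COCO_train2014_{}.jpg')
loader = DataLoader(img_ds, batch_size=32, num_workers=4)

# ResNet-18 without its classification head -> one 512-d feature per image
encoder = torch.nn.Sequential(*list(models.resnet18(pretrained=True).children())[:-1]).eval()
with torch.no_grad():
    for batch in loader:
        feats = encoder(batch['images']).squeeze(-1).squeeze(-1)    # [B, 512]
        # ... persist feats keyed by batch['images_id'], e.g. one file per image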