Example #1
    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern):
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern
        # Build the VQA API helper from the annotation/question files passed to the constructor.
        self.vqa_loader = VQA(annotation_file=annotation_json_file_path,
                              question_file=question_json_file_path)
        self.img_id = self.vqa_loader.getImgIds()
        # getImgIds() repeats an image id once per question; keep unique ids only.
        self.img_id = list(dict.fromkeys(self.img_id))
Example #2
def pre_process_dataset(image_dir, qjson, ajson, img_prefix):
    print('Preprocessing dataset.\n')
    vqa = VQA(ajson, qjson)

    img_names = [f for f in os.listdir(image_dir) if '.jpg' in f]
    img_ids = []
    for fname in img_names:
        img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
        img_ids.append(int(img_id))

    ques_ids = vqa.getQuesIds(img_ids)

    q2i = defaultdict(lambda: len(q2i))
    pad = q2i["<pad>"]
    start = q2i["<sos>"]
    end = q2i["<eos>"]
    UNK = q2i["<unk>"]

    a2i_count = {}
    for ques_id in ques_ids:
        qa = vqa.loadQA(ques_id)[0]
        qqa = vqa.loadQQA(ques_id)[0]

        ques = qqa['question'][:-1]
        # Touch every question token so the defaultdict assigns it an index (the lookup is the side effect).
        for x in ques.lower().strip().split(" "):
            q2i[x]

        answers = qa['answers']
        for ans in answers:
            if not ans['answer_confidence'] == 'yes':
                continue
            ans = ans['answer'].lower()
            if ans not in a2i_count:
                a2i_count[ans] = 1
            else:
                a2i_count[ans] = a2i_count[ans] + 1

    a_sort = sorted(a2i_count.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    i2a = {}
    count = 0
    a2i = defaultdict(lambda: len(a2i))
    for word, _ in a_sort:
        a2i[word]
        i2a[a2i[word]] = word
        count = count + 1
        if count == 1000:
            break

    return q2i, a2i, i2a, a2i_count
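A minimal usage sketch for pre_process_dataset; the data paths and the COCO filename prefix below are placeholders, not values taken from the example above.

# Hedged usage sketch; all paths are placeholders.
image_dir = "data/train2014"
qjson = "data/OpenEnded_mscoco_train2014_questions.json"
ajson = "data/mscoco_train2014_annotations.json"

q2i, a2i, i2a, a2i_count = pre_process_dataset(image_dir, qjson, ajson,
                                               img_prefix="COCO_train2014_")
print(len(q2i), "question tokens,", len(a2i), "answer classes kept")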
Example #3
class imgDataset(Dataset):
    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern):
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern
        self.vqa_loader = VQA(annotation_file=annotation_json_file_path,
                              question_file=question_json_file_path)
        self.img_id = self.vqa_loader.getImgIds()
        self.img_id = list(dict.fromkeys(self.img_id))

    def __len__(self):
        return len(self.img_id)

    def __getitem__(self, idx):
        idx = self.img_id[idx]
        i = '{0:012d}'.format(idx)
        item = {}
        path = os.path.join(
            self.image_dir, self.image_filename_pattern.format(i))
        feature = cv2.imread(path)
        feature = cv2.resize(feature, (448, 448))
        feature = np.array(feature.astype(np.float32) / 255)
        feature = np.transpose(feature, (2, 0, 1))
        feature = torch.tensor(
            feature, dtype=torch.float32, requires_grad=False)
        item.update({'idx': idx})
        item.update({'feature': feature})
        return item
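A hedged sketch of iterating over imgDataset with a PyTorch DataLoader; the paths are placeholders, and the constructor is assumed to use the JSON paths it receives.

from torch.utils.data import DataLoader

dataset = imgDataset(image_dir="data/train2014",
                     question_json_file_path="data/questions.json",
                     annotation_json_file_path="data/annotations.json",
                     image_filename_pattern="COCO_train2014_{}.jpg")
loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=4)
for batch in loader:
    feats = batch['feature']   # float tensor of shape (batch, 3, 448, 448), values in [0, 1]
    ids = batch['idx']         # corresponding COCO image ids
    break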
Example #4
    def __init__(self, image_dir, question_json_file_path,
                 annotation_json_file_path, image_filename_pattern,
                 bag_word_question, bag_word_answer):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern
        self.vqa_loader = VQA(annotation_file=annotation_json_file_path,
                              question_file=question_json_file_path)

        self.entries = self.vqa_loader.qqa
        self.qa = self.vqa_loader.qa

        self.bag_word_question = bag_word_question
        self.bag_word_answer = bag_word_answer

        self.bag_size_question = len(self.bag_word_question)
        self.bag_size_answer = len(self.bag_word_answer)

        self.milky_vector_question = self.get_milky_vector_question()
        self.milky_vector_answer = self.get_milky_vector_answer()

        self.gt_dict = self.get_gt()
        self.question_index = self.get_index()
Example #5
    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern, existing_format=None, ques_thres=12, ans_thres=6, seq_len=50, prepro=False, prepro_path=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern
        self.prepro = prepro
        print("Allow preprocessing: ", self.prepro)
        # self.existing_format = existing_format

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.quesWords = self.getSplitQues()
        self.ansWords = self.getSplitAns()

        self.ques_thres = ques_thres
        self.ans_thres = ans_thres

        self.id_images = {}

        if existing_format is None:
            self.quesWordToIdx, self.quesVecSize = self.BuildBoW(self.quesWords,self.ques_thres)
            self.ansWordToIdx, self.ansVecSize = self.BuildBoW(self.ansWords,self.ans_thres)
            # self.quesVecSize = len(self.quesWordToIdx)
            self.seq_len = seq_len

        else:
            self.quesWordToIdx = existing_format.quesWordToIdx
            self.ansWordToIdx = existing_format.ansWordToIdx
            self.quesVecSize = existing_format.quesVecSize
            self.ansVecSize = existing_format.ansVecSize
            self.seq_len = existing_format.seq_len

        if self.prepro:
            self.prepro_path = prepro_path
            # features_extracted = h5py.File(prepro_path, 'r')
            # self.features_h5 = features_extracted["features"][:]
            # self.ids_h5 = features_extracted["ids"][:]

        print("The length of question vector: ", self.quesVecSize)
Example #6
    def __init__(self, train_image_dir, train_question_path,
                 train_annotation_path, test_image_dir, test_question_path,
                 test_annotation_path, batch_size, num_epochs,
                 num_data_loader_workers):

        self.vqa_loader = VQA(annotation_file=train_annotation_path,
                              question_file=train_question_path)

        self.entries = self.vqa_loader.qqa
        self.qa = self.vqa_loader.qa

        bag_word_question = self.get_bag_of_word_question()
        bag_word_answer = self.get_bag_of_word_answer()

        # pdb.set_trace()

        train_dataset = VqaDataset(
            image_dir=train_image_dir,
            question_json_file_path=train_question_path,
            annotation_json_file_path=train_annotation_path,
            image_filename_pattern="COCO_train2014_{}.jpg",
            bag_word_question=bag_word_question,
            bag_word_answer=bag_word_answer)

        val_dataset = VqaDataset(
            image_dir=test_image_dir,
            question_json_file_path=test_question_path,
            annotation_json_file_path=test_annotation_path,
            image_filename_pattern="COCO_val2014_{}.jpg",
            bag_word_question=bag_word_question,
            bag_word_answer=bag_word_answer)

        num_question = train_dataset.bag_size_question
        num_answer = train_dataset.bag_size_answer
        model = SimpleBaselineNet(num_question, num_answer)

        # could be added outside
        # lr = 0.01
        # momentum = 0.9

        # pdb.set_trace()

        # self.optimizer = torch.optim.SGD(
        #     model.parameters(), lr=lr, momentum=momentum)
        self.optimizer = torch.optim.SGD(
            [{
                'params': model.fc.parameters()
            }, {
                'params': model.feature.parameters()
            }, {
                'params': model.embedding.parameters(),
                'lr': 0.8
            }],
            lr=0.01,
            momentum=0.9)
        self.criterion = nn.CrossEntropyLoss().cuda()

        super().__init__(train_dataset, val_dataset, model, batch_size,
                         num_epochs, num_data_loader_workers)
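get_bag_of_word_question and get_bag_of_word_answer are not shown in this example; below is a hedged sketch of how they might be built from self.entries (the VQA qqa dict) and self.qa, with the exact tokenization being an assumption rather than the original implementation.

# Hedged sketch of the two helpers called above; not the original implementation.
def get_bag_of_word_question(self):
    bag = {}
    for q_id, entry in self.entries.items():      # entry['question'] is the raw question string
        for word in entry['question'].lower().rstrip('?').split():
            if word not in bag:
                bag[word] = len(bag)
    return bag

def get_bag_of_word_answer(self):
    bag = {}
    for q_id, ann in self.qa.items():             # ann['answers'] holds the ten annotator answers
        for ans in ann['answers']:
            answer = ans['answer'].lower()
            if answer not in bag:
                bag[answer] = len(bag)
    return bag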
Example #7
    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern, existing_format=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
            """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern

        # self.existing_format = existing_format

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.img_list = self.getUniqueImg()
Example #8
    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern,
                 is_training=True, transform=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.ques_idx_list = self.vqa.getQuesIds()
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern

        if os.path.exists('ques_dictionary.pkl'):
            with open('ques_dictionary.pkl', 'rb') as f:
                self.dictionary = pickle.load(f) 
        else:
            if is_training:
                self.dictionary = _build_question_dictionary(self.vqa)
                with open('ques_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.dictionary, f)
            else:
                raise "No dictionary built from training dataset!"

        if os.path.exists('ans_dictionary.pkl'): 
            with open('ans_dictionary.pkl', 'rb') as f:
                self.answers = pickle.load(f) 
        else:
            if is_training:
                self.answers = _build_answer_dictionary(self.vqa)
                with open('ans_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.answers, f)
            else:
                raise "No answer list built from training dataset!"

        # print(self.dictionary)
        # print(self.answers)
        self.image_transform = transform
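_build_question_dictionary, _build_answer_dictionary, _encode_question and _encode_answer are referenced but not part of this listing; here is a hedged sketch of the question-side pair, assuming a plain word-index dictionary and a bag-of-words encoding with one extra slot for unknown words.

import torch

def _build_question_dictionary(vqa):
    dictionary = {}
    for q_id in vqa.getQuesIds():
        for word in vqa.qqa[q_id]['question'].lower().rstrip('?').split():
            if word not in dictionary:
                dictionary[word] = len(dictionary)
    return dictionary

def _encode_question(question, dictionary):
    encoding = torch.zeros(len(dictionary) + 1)            # last index reserved for unknown words
    for word in question.lower().rstrip('?').split():
        encoding[dictionary.get(word, len(dictionary))] = 1
    return encoding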
Example #9
  def __init__(self, train_image_dir, train_question_path, train_annotation_path,
               test_image_dir, test_question_path, test_annotation_path, batch_size, num_epochs,
               num_data_loader_workers):

    self.vqa_loader = VQA(annotation_file=train_annotation_path,
                          question_file=train_question_path)

    self.entries = self.vqa_loader.qqa
    self.qa = self.vqa_loader.qa

    bag_word_question = self.get_bag_of_word_question()
    bag_word_answer = self.get_bag_of_word_answer()

    train_dataset = VqaDataset(image_dir=train_image_dir,
                               question_json_file_path=train_question_path,
                               annotation_json_file_path=train_annotation_path,
                               image_filename_pattern="COCO_train2014_{}.jpg",
                               bag_word_question=bag_word_question,
                               bag_word_answer=bag_word_answer,
                               img_dir="./data_val/train.hdf5")
    val_dataset = VqaDataset(image_dir=test_image_dir,
                             question_json_file_path=test_question_path,
                             annotation_json_file_path=test_annotation_path,
                             image_filename_pattern="COCO_val2014_{}.jpg",
                             bag_word_question=bag_word_question,
                             bag_word_answer=bag_word_answer,
                             img_dir="./data_val/val.hdf5")

    num_question = train_dataset.bag_size_question
    num_answer = train_dataset.bag_size_answer

    # pdb.set_trace()

    max_len_train = train_dataset.max_len
    max_len_val = val_dataset.max_len

    print('max_len for train:{}, max_len for val:{}'.format(
        max_len_train, max_len_val))

    self._model = CoattentionNet(num_question, num_answer, 26)

    # self.optimizer = torch.optim.SGD(
    #     self._model.parameters(), lr=0.001, momentum=0.9)
    # pdb.set_trace()
    self.optimizer = torch.optim.Adam(
        self._model.parameters(), lr=4e-4, eps=1e-8)
    self.criterion = nn.CrossEntropyLoss().cuda()

    super().__init__(train_dataset, val_dataset, self._model, batch_size, num_epochs,
                     num_data_loader_workers=num_data_loader_workers)
Example #10
    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern,
                 transform=None, question_word_to_id_map=None, answer_to_id_map=None, question_word_list_length=5746, answer_list_length=5216,
                 pre_encoder=None, cache_location=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path, question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        self._transform = transform
        self._max_question_length = 26

        # Publicly accessible dataset parameters
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length
        self._pre_encoder = pre_encoder
        self._cache_location = cache_location
        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # Create the question map if necessary
        if question_word_to_id_map is None:
            ############ 1.6 TODO


            ############
            raise NotImplementedError()
        else:
            self.question_word_to_id_map = question_word_to_id_map

        # Create the answer map if necessary
        if answer_to_id_map is None:
            ############ 1.7 TODO


            ############
            raise NotImplementedError()
        else:
            self.answer_to_id_map = answer_to_id_map
Example #11
    for p in punct:
        if (str(p + ' ') in inText or str(' ' + p)
                in inText) or (re.search(commaStrip, inText) != None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    outText = periodStrip.sub("", outText, re.UNICODE)
    return outText
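The loop above depends on module-level punct, commaStrip and periodStrip that were cut off from this snippet; a hedged approximation of what they typically look like follows (the exact punctuation list and regexes in the original may differ).

import re

punct = [';', '/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\',
         '_', '-', '>', '<', '@', '`', ',', '?', '!']
commaStrip = re.compile(r'(\d)(,)(\d)')       # commas inside numbers, e.g. "1,000"
periodStrip = re.compile(r'(?<!\d)\.(?!\d)')  # periods that are not decimal points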


if __name__ == "__main__":

    question_file = sys.argv[1]
    annotation_file = sys.argv[2]
    out_filename = sys.argv[3]
    vqa_db = VQA(annotation_file, question_file)

    ques_list = []
    ans_list = []
    for q_id, annotation in vqa_db.qa.items():
        question = vqa_db.qqa[q_id]['question']
        question = question.lower()[:-1]
        question = question.replace('?', '')  #Just in case
        words = question.split(' ')
        ques_list += words

        answer_objs = annotation['answers']

        possible_answers = [a['answer'] for a in answer_objs]

        for answer in possible_answers:
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """
    def __init__(self,
                 image_dir,
                 question_json_file_path,
                 annotation_json_file_path,
                 image_filename_pattern,
                 collate=False,
                 q2i=None,
                 a2i=None,
                 i2a=None,
                 img_names=None,
                 img_ids=None,
                 ques_ids=None,
                 method='simple',
                 dataset_type='train',
                 enc_dir=''):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        print(method)
        self.image_dir = image_dir
        self.qjson = question_json_file_path
        self.ajson = annotation_json_file_path
        img_prefix = image_filename_pattern.split('{}')[0]
        self.collate = collate
        self.q2i = q2i
        self.a2i = a2i
        self.i2a = i2a
        #self.a2i_count = a2i_count
        self.img_ids = img_ids
        self.ques_ids = ques_ids
        self.img_names = img_names
        self.method = method
        self.vqa = VQA(self.ajson, self.qjson)

        if self.method == 'simple':
            self.transform = transforms.Compose(
                [transforms.Resize((224, 224)),
                 transforms.ToTensor()])
        else:
            self.transform = transforms.Compose(
                [transforms.Resize((448, 448)),
                 transforms.ToTensor()])

        #if not collate:
        #    self.img_names = [f for f in os.listdir(self.image_dir) if '.jpg' in f]
        #    self.img_ids = []
        #    for fname in self.img_names:
        #        img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
        #        self.img_ids.append(int(img_id))

        #    self.ques_ids = self.vqa.getQuesIds(self.img_ids)

        #    self.q2i, self.a2i, self.i2a, self.a2i_count = pre_process_dataset(image_dir, self.qjson,
        #                                                                       self.ajson, img_prefix)

        self.q2i_len = len(self.q2i)
        self.a2i_len = len(self.a2i.keys())
        self.q2i_keys = self.q2i.keys()
        self.enc_dir = enc_dir

        #if collate and dataset_type == 'train':
        #    with open('/home/ubuntu/hw3_release/data/train_enc_idx.npy', 'rb') as f:
        #        self.enc_idx = pickle.load(f)
        #elif collate and dataset_type == 'val':
        #    with open('/home/ubuntu/hw3_release/data/val_enc_idx.npy', 'rb') as f:
        #        self.enc_idx = pickle.load(f)

    def __len__(self):
        return len(self.ques_ids)

    def __getitem__(self, idx):
        ques_id = self.ques_ids[idx]
        img_id = self.vqa.getImgIds([ques_id])[0]

        qa = self.vqa.loadQA(ques_id)[0]
        qqa = self.vqa.loadQQA(ques_id)[0]
        img_name = self.img_names[self.img_ids.index(img_id)]

        if self.method == 'simple':
            img = default_loader(self.image_dir + '/' + img_name)
            #imgT = self.transform(img).permute(1, 2, 0)
            imgT = self.transform(img).float()
        else:
            #file_idx = self.enc_idx[img_id] // 50
            #arr_idx = self.enc_idx[img_id] % 50
            #path = self.enc_dir + '/' + str(file_idx) + '.npz'
            #img = np.load(path)['out'][arr_idx, :, :]               # 512 x 196
            #imgT = torch.from_numpy(img).float()

            img = default_loader(self.image_dir + '/' + img_name)
            imgT = self.transform(img).float()

        ques = qqa['question'][:-1]
        quesI = [self.q2i["<sos>"]] + [
            self.q2i[x.lower()]
            for x in ques.split(" ") if x.lower() in self.q2i_keys
        ] + [self.q2i["<eos>"]]
        if not self.collate:
            quesI = quesI + [self.q2i["<pad>"]] * (8 - len(quesI))
        if self.method == 'simple':
            quesT = torch.zeros(self.q2i_len).float()
            for idx in quesI:
                quesT[idx] = 1
        else:
            quesT = torch.from_numpy(np.array(quesI)).long()

        answers = qa['answers']
        max_count = 0
        answer = ""
        for ans in answers:
            #if not ans['answer_confidence'] == 'yes':
            #    continue
            ans = ans['answer'].lower()
            if ans in self.a2i.keys():  # and self.a2i_count[ans] > max_count:
                #max_count = self.a2i_count[ans]
                answer = ans

        if answer == "":  # only for validation
            gT = torch.from_numpy(np.array([self.a2i_len])).long()
        else:
            gT = torch.from_numpy(np.array([self.a2i[answer]])).long()

        #print("ds: quesT", quesT.shape, quesT)
        #print("ds: gT", gT.shape)
        #print("ds: imgT", imgT.shape)

        if not self.collate:
            return {'img': imgT, 'ques': quesT, 'gt': gT}

        return imgT, quesT, gT
Example #14
import os
import operator
import numpy as np

from six.moves import cPickle as pickle
from collections import defaultdict
from external.vqa.vqa import VQA

image_dir = "/projectnb/statnlp/shawnlin/dataset/mscoco_vqa_2014/train2014"
img_prefix = "COCO_train2014_"
qjson = "/projectnb/statnlp/shawnlin/dataset/mscoco_vqa_2014/Questions_Train_mscoco/OpenEnded_mscoco_train2014_questions.json"
ajson = "/projectnb/statnlp/shawnlin/dataset/mscoco_vqa_2014/Annotations_Train_mscoco/mscoco_train2014_annotations.json"

vqa = VQA(ajson, qjson)

img_names = [f for f in os.listdir(image_dir) if '.jpg' in f]
img_ids = []
for fname in img_names:
    img_id = fname.split('.')[0].rpartition(img_prefix)[-1]
    img_ids.append(int(img_id))

ques_ids = vqa.getQuesIds(img_ids)

q2i = defaultdict(lambda: len(q2i))
pad = q2i["<pad>"]
start = q2i["<sos>"]
end = q2i["<eos>"]
UNK = q2i["<unk>"]

a2i_count = {}
for ques_id in ques_ids:
Example #15
    def __init__(self,
                image_dir,
                question_json_file_path,
                annotation_json_file_path,
                image_filename_pattern,
                img_features_dir,
                vocab_json_filename,
                cache_ds_json=False,
                ds_json_filename='temp.json'):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
            img_features_dir (string): Path to the directory with image features
            ds_json_filename (string): Path to the existing dataset json or where to save
            cache_ds_json (string): Save or not

        """
        if os.path.isfile(ds_json_filename):
            f = open(ds_json_filename, "r")
            self.dataset = json.load(f)
            f.close()
        else:
            vqa_db = VQA(annotation_file=annotation_json_file_path, question_file=question_json_file_path)

            self.max_words_in_ques = -1
            
            self.dataset = []
            self.weight = {'yes':2, 'maybe':1, 'no':0}

            print("Populating data structures...")
            i = 0
            for q_id, annotation in vqa_db.qa.items():
                # if i == 500:
                #     break
                # i += 1
                entry = {}
                question = vqa_db.qqa[q_id]['question']
                question = question.lower()[:-1]
                question = question.replace('?', '') #Just in case
                words = question.split(' ')
                if len(words) > self.max_words_in_ques:
                    self.max_words_in_ques = len(words)
                entry['ques'] = words
                answer_objs = annotation['answers']

                possible_answers = [a['answer'] for a in answer_objs]

                entry['possible_answers'] = []
                for answer in possible_answers:
                    mod_ans = self._handle_punctuation(answer)
                    entry['possible_answers'].append(mod_ans)

                
                img_full_idx = "%012d" % annotation['image_id']
                img_name = image_filename_pattern.replace('{}', img_full_idx)
                img_loc = os.path.join(image_dir, img_name)
                entry['img_loc'] = img_loc

                img_feature_loc = os.path.join(img_features_dir, img_name.replace('.jpg', '.npy'))
                entry['img_feat_loc'] = img_feature_loc
                self.dataset.append(entry)

            if cache_ds_json:
                f = open(ds_json_filename, "w")
                json.dump(self.dataset, f)
                f.close()
        
        f = open(vocab_json_filename, "r")
        vocab = json.load(f)
        f.close()

        self.q_word_vocab = vocab['q']
        self.q_vocab_size = len(self.q_word_vocab.keys())
        
        self.a_vocab = vocab['a']
        self.a_vocab_size = len(self.a_vocab.keys())
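The vocabulary file read above is expected to hold two maps under the keys 'q' and 'a'; a hedged sketch of its layout with toy values (the filename and the index scheme are assumptions):

import json

vocab = {
    'q': {'<unk>': 0, 'what': 1, 'is': 2, 'the': 3},   # question word -> index (toy values)
    'a': {'<unk>': 0, 'yes': 1, 'no': 2, '2': 3},      # answer string -> index (toy values)
}
with open('vocab.json', 'w') as f:                     # placeholder filename
    json.dump(vocab, f)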
    def __init__(self,
                 image_dir,
                 question_json_file_path,
                 annotation_json_file_path,
                 image_filename_pattern,
                 transform=None,
                 question_word_to_id_map=None,
                 answer_to_id_map=None,
                 question_word_list_length=5746,
                 answer_list_length=5216,
                 pre_encoder=None,
                 cache_location=None,
                 max_list_length=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path,
                        question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        if transform is not None:
            self._transform = transform
        else:
            self._transform = transforms.Compose([
                transforms.ToTensor(),
            ])
        self._max_question_length = 26

        # Publicly accessible dataset parameters
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length

        self.question_ids = self._vqa.getQuesIds()
        self.fixed_str = '000000000000'

        self._pre_encoder = pre_encoder
        self._cache_location = cache_location

        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # import ipdb; ipdb.set_trace()

        # Create the question map if necessary
        # self.question_word_to_id_map = {}
        if question_word_to_id_map is None:
            ############ 1.6 TODO
            question_sentences = []
            # question_ids = self._vqa.getQuesIds()

            for question_id in self.question_ids:
                question_sentences.append(
                    self._vqa.qqa[question_id]['question'])

            word_list = self._create_word_list(question_sentences)
            self.question_word_to_id_map = self._create_id_map(
                word_list, question_word_list_length)
            print("Created new question_to_id_map")
            ############
            # raise NotImplementedError()
        else:
            self.question_word_to_id_map = question_word_to_id_map
            print("Reused question_to_id_map")

        # Create the answer map if necessary
        if answer_to_id_map is None:
            ############ 1.7 TODO
            answer_sentence_list = []
            for question_id in self.question_ids:
                answer_list = self._vqa.qa[question_id]['answers']
                for item in answer_list:
                    answer_sentence_list.append(item['answer'])

            self.answer_to_id_map = self._create_id_map(
                answer_sentence_list, answer_list_length)
            print("Created new answer_to_id_map")
            ############
            # raise NotImplementedError()
        else:
            self.answer_to_id_map = answer_to_id_map
            print("Reused answer_to_id_map")
Example #17
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern,
                 is_training=True, transform=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.ques_idx_list = self.vqa.getQuesIds()
        self.image_dir = image_dir
        self.image_filename_pattern = image_filename_pattern

        if os.path.exists('ques_dictionary.pkl'):
            with open('ques_dictionary.pkl', 'rb') as f:
                self.dictionary = pickle.load(f) 
        else:
            if is_training:
                self.dictionary = _build_question_dictionary(self.vqa)
                with open('ques_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.dictionary, f)
            else:
                raise "No dictionary built from training dataset!"

        if os.path.exists('ans_dictionary.pkl'): 
            with open('ans_dictionary.pkl', 'rb') as f:
                self.answers = pickle.load(f) 
        else:
            if is_training:
                self.answers = _build_answer_dictionary(self.vqa)
                with open('ans_dictionary.pkl', 'wb') as f:
                    pickle.dump(self.answers, f)
            else:
                raise "No answer list built from training dataset!"

        # print(self.dictionary)
        # print(self.answers)
        self.image_transform = transform

    def __len__(self):
        return len(self.ques_idx_list)

    def __getitem__(self, idx):
        ques_idx = self.ques_idx_list[idx]
        ann = self.vqa.loadQA(ques_idx)[0]

        image_id = ann['image_id']
        image_name = self.image_filename_pattern.format(str(image_id).zfill(12))
        image_path = os.path.join(self.image_dir, image_name)
        if os.path.splitext(image_path)[1] == '.npy':
            image = np.load(image_path).T
        else:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transform(image)

        question = self.vqa.qqa[ques_idx]['question']
        answers = ann['answers']
        best_answer = _get_majority_ans(answers)
        return {
                'image': image,
                'image_path': image_name,
                'question': question,
                'answer': best_answer,
                'question_encoding': _encode_question(question, self.dictionary),
                'answer_encoding': _encode_answer(best_answer, self.answers),
                }
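_get_majority_ans is not included in this listing; a hedged sketch that simply picks the most frequent of the ten annotator answers (the original may break ties or filter by confidence differently):

from collections import Counter

def _get_majority_ans(answers):
    counts = Counter(a['answer'].lower() for a in answers)
    return counts.most_common(1)[0][0]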
Example #18
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern, existing_format=None, ques_thres=12, ans_thres=6, seq_len=50, prepro=False, prepro_path=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern
        self.prepro = prepro
        print("Allow preprocessing: ", self.prepro)
        # self.existing_format = existing_format

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.quesWords = self.getSplitQues()
        self.ansWords = self.getSplitAns()

        self.ques_thres = ques_thres
        self.ans_thres = ans_thres

        self.id_images = {}

        if existing_format is None:
            self.quesWordToIdx, self.quesVecSize = self.BuildBoW(self.quesWords,self.ques_thres)
            self.ansWordToIdx, self.ansVecSize = self.BuildBoW(self.ansWords,self.ans_thres)
            # self.quesVecSize = len(self.quesWordToIdx)
            self.seq_len = seq_len

        else:
            self.quesWordToIdx = existing_format.quesWordToIdx
            self.ansWordToIdx = existing_format.ansWordToIdx
            self.quesVecSize = existing_format.quesVecSize
            self.ansVecSize = existing_format.ansVecSize
            self.seq_len = existing_format.seq_len

        if self.prepro:
            self.prepro_path = prepro_path
            # features_extracted = h5py.File(prepro_path, 'r')
            # self.features_h5 = features_extracted["features"][:]
            # self.ids_h5 = features_extracted["ids"][:]

        print("The length of question vector: ", self.quesVecSize)
    
    def getImgSize(self):
        return len(self.vqa.getImgIds())

    def imgIdToPath(self, idx):
        str_bracket = "{}"
        start_idx = self.image_filename_pattern.find(str_bracket)
        path = self.image_filename_pattern[0:start_idx] + str(idx).zfill(12) + self.image_filename_pattern[start_idx+2:]
        path = self.image_dir + "/" + path
        return path

    def getSplitQues(self):
        ques_words = []
        for i in range(0,len(self.queIds)):
            question = self.vqa.qqa[self.queIds[i]]['question']
            question = question[0:-1]
            question = question.lower()
            ques_words += question.split()
        return ques_words

    def getSplitAns(self):
        ans_words = []
        for i in range(0,len(self.queIds)):
            anss = self.vqa.qa[self.queIds[i]]['answers']
            for ans in anss:
                ans_str = ans["answer"]
                ans_words.append(ans_str)
        return ans_words

    def BoWPool(self, data):
        vocab_set = {}
        vocab_set['NA'] = 0
        for i in range(0,len(data)):
            for vocab in data:
                if vocab not in vocab_set:
                    idx = len(vocab_set)
                    vocab_set[vocab] = idx

        return vocab_set

    def BuildBoW(self, data, thres):
        vocab_set = {}
        for vocab in data:
            if vocab not in vocab_set:
                vocab_set[vocab] = 1
            else:
                vocab_set[vocab] += 1

        vocab_map = {}
        vocab_map['NA'] = 0
        idx = 1
        for vocab in data:
            if vocab not in vocab_map:
                if vocab_set[vocab] > thres:
                    vocab_map[vocab] = idx
                    idx += 1
                else:
                    vocab_map[vocab] = vocab_map['NA']

        # vocab_map['END'] = -1

        return vocab_map, idx

    def BoWVoting(self, sentences, table):
        count_set = {}
        for word in sentences:
            if word['answer'] not in count_set:
                count_set[word['answer']] = 1
            else:
                count_set[word['answer']] += 1
        sorted_dict = sorted(count_set.items(), key=lambda kv: kv[1])
        res_word = sorted_dict[-1][0]
        best_ind = 0
        if res_word in table:
            best_ind = table[res_word]
        return np.array(best_ind)

    def BoWVector(self, sentence, table):
        bow_vec = np.zeros(self.quesVecSize)

        for i in range(self.seq_len):
            if i < len(sentence):
                if sentence[i] in table:
                    bow_vec[table[sentence[i]]] = 1
                else:
                    bow_vec[table['NA']] = 1
        return bow_vec

    def BoWVectorGeneric(self, sentence, table):
        bow_vec = np.zeros([self.seq_len, self.quesVecSize])

        for i in range(self.seq_len):
            if i < len(sentence):
                if sentence[i] in table:
                    bow_vec[i, table[sentence[i]]] = 1
                else:
                    bow_vec[i, table['NA']] = 1
            else:
                break

        return bow_vec

    def saveFeatures(self, feat, id):
        self.id_images[id] = feat

    def __len__(self):
        return len(self.queIds)

    def __getitem__(self, idx):
        
        if idx >= len(self.vqa.qa):
            print("Error: access overflow")
            return None

        idx_qa = self.queIds[idx]
        qa = self.vqa.loadQA(idx_qa)[0]
        data = {}
        tmp_question = self.vqa.qqa[idx_qa]['question']
        tmp_question = tmp_question.lower()[:-1]
        data['questions'] = torch.from_numpy(self.BoWVectorGeneric(tmp_question.split(),self.quesWordToIdx))
        tmp_answers = qa['answers']

        data['gt_answer'] = torch.from_numpy(self.BoWVoting(tmp_answers,self.ansWordToIdx))
        data['images_id'] = qa['image_id']
        if self.prepro:
            # h5_idxs = self.ids_h5
            # query_idx = np.where(h5_idxs==qa['image_id'])
            # data['images_id'] = qa['image_id']
            # tmp_features = self.features_h5[query_idx[0][0]]
            # tmp_features = tmp_features.astype(np.float32)
            # data['images'] = torch.from_numpy(tmp_features)

            # Above for all features in one h5 file
            # Below for several different feature files

            img_idx = qa['image_id']
            str_bracket = "{}"
            start_idx = self.prepro_path.find(str_bracket)
            path = self.prepro_path[0:start_idx] + str(img_idx) + self.prepro_path[start_idx+2:]
            features_extracted = h5py.File(path, 'r')
            feature = features_extracted["features"][:]
            data['images'] = feature

        else:
            tmp_img = Image.open(self.imgIdToPath(qa['image_id']))
            tmp_img = tmp_img.convert('RGB')
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

            trans = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                normalize,
            ])
            data['images'] = trans(tmp_img)

        return data
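With prepro=True, __getitem__ above expects one HDF5 file per image, found by substituting the image id into prepro_path and containing a "features" dataset; a hedged sketch of writing such a file (the path pattern and the feature source are placeholders):

import h5py
import numpy as np

def save_image_feature(feat, image_id, prepro_path="features/{}.h5"):
    # feat: a numpy array from whatever CNN backbone was used for pre-extraction
    path = prepro_path.format(image_id)
    with h5py.File(path, 'w') as f:
        f.create_dataset("features", data=np.asarray(feat, dtype=np.float32))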
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """
    def __init__(self,
                 image_dir,
                 question_json_file_path,
                 annotation_json_file_path,
                 image_filename_pattern,
                 transform=None,
                 question_word_to_id_map=None,
                 answer_to_id_map=None,
                 question_word_list_length=5746,
                 answer_list_length=5216,
                 pre_encoder=None,
                 cache_location=None,
                 max_list_length=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path,
                        question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        if transform is not None:
            self._transform = transform
        else:
            self._transform = transforms.Compose([
                transforms.ToTensor(),
            ])
        self._max_question_length = 26

        # Publicly accessible dataset parameters
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length

        self.question_ids = self._vqa.getQuesIds()
        self.fixed_str = '000000000000'

        self._pre_encoder = pre_encoder
        self._cache_location = cache_location

        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # import ipdb; ipdb.set_trace()

        # Create the question map if necessary
        # self.question_word_to_id_map = {}
        if question_word_to_id_map is None:
            ############ 1.6 TODO
            question_sentences = []
            # question_ids = self._vqa.getQuesIds()

            for question_id in self.question_ids:
                question_sentences.append(
                    self._vqa.qqa[question_id]['question'])

            word_list = self._create_word_list(question_sentences)
            self.question_word_to_id_map = self._create_id_map(
                word_list, question_word_list_length)
            print("Created new question_to_id_map")
            ############
            # raise NotImplementedError()
        else:
            self.question_word_to_id_map = question_word_to_id_map
            print("Reused question_to_id_map")

        # Create the answer map if necessary
        if answer_to_id_map is None:
            ############ 1.7 TODO
            answer_sentence_list = []
            for question_id in self.question_ids:
                answer_list = self._vqa.qa[question_id]['answers']
                for item in answer_list:
                    answer_sentence_list.append(item['answer'])

            self.answer_to_id_map = self._create_id_map(
                answer_sentence_list, answer_list_length)
            print("Created new answer_to_id_map")
            ############
            # raise NotImplementedError()
        else:
            self.answer_to_id_map = answer_to_id_map
            print("Reused answer_to_id_map")

        # import pdb; pdb.set_trace()

    def _create_word_list(self, sentences):
        """
        Turn a list of sentences into a list of processed words (no punctuation, lowercase, etc)
        Args:
            sentences: a list of str, sentences to be splitted into words
        Return:
            A list of str, words from the split, order remained.
        """

        ############ 1.4 TODO
        """
        https://machinelearningmastery.com/clean-text-machine-learning-python/
        """
        import string
        table = str.maketrans('', '', string.punctuation)
        # import ipdb; ipdb.set_trace()
        word_list = []
        if type(sentences) == list:
            for sentence in sentences:
                words = sentence.split(" ")
                word_list += [word.translate(table).lower() for word in words]
        else:
            words = sentences.split(" ")
            word_list += [word.translate(table).lower() for word in words]
        ############
        # raise NotImplementedError()
        return word_list

    def _create_id_map(self, word_list, max_list_length):
        """
        Find the most common str in a list, then create a map from str to id (its rank in the frequency)
        Args:
            word_list: a list of str, where the most frequent elements are picked out
            max_list_length: the number of strs picked
        Return:
            A map (dict) from str to id (rank)
        """

        ############ 1.5 TODO
        from collections import Counter

        # import pdb; pdb.set_trace()
        word_rank_list = Counter(word_list).most_common(max_list_length)

        id_map = {}
        for idx, (word, _) in enumerate(word_rank_list):
            id_map[word] = idx

        ############
        # raise NotImplementedError()
        return id_map

    def __len__(self):
        ############ 1.8 TODO
        # return len(self._vqa.imgToQA)
        return len(self._vqa.questions['questions'])
        # return len(self._vqa.getQuesIds())
        ############
        # raise NotImplementedError()

    def __getitem__(self, idx):
        """
        Load an item of the dataset
        Args:
            idx: index of the data item
        Return:
            A dict containing multiple torch tensors for image, question and answers.
        """

        ############ 1.9 TODO
        # figure out the idx-th item of dataset from the VQA API
        image_id_from_idx = self._vqa.questions['questions'][idx]['image_id']
        question_id_from_idx = self._vqa.questions['questions'][idx][
            'question_id']
        question_sentence = self._vqa.questions['questions'][idx]['question']
        answer_sentences = [
            ans['answer']
            for ans in self._vqa.qa[question_id_from_idx]['answers']
        ]

        # import pdb; pdb.set_trace()

        ############

        # if self._cache_location is not None and self._pre_encoder is not None:
        #     ############ 3.2 TODO
        #     # implement your caching and loading logic here

        #     ############
        #     raise NotImplementedError()
        # else:
        ############ 1.9 TODO
        # load the image from disk, apply self._transform (if not None)
        # import ipdb; ipdb.set_trace()
        image_id_from_idx_string = self.fixed_str + str(image_id_from_idx)
        truncated_image_id_from_idx = image_id_from_idx_string[-12:]
        img_file_path = self._image_dir + '/' + self._image_filename_pattern.format(
            truncated_image_id_from_idx)
        image = Image.open(img_file_path)
        image = image.convert("RGB")
        image = self._transform(image)
        ############
        # raise NotImplementedError()

        ############ 1.9 TODO
        # load and encode the question and answers, convert to torch tensors
        question_encoding = torch.zeros(self._max_question_length,
                                        self.question_word_list_length)
        answer_encoding = torch.zeros(10, self.answer_list_length)

        question_word_list = self._create_word_list(question_sentence)
        # print(idx,question_word_list)
        # print(len(self.question_word_to_id_map.keys()))
        # quit()
        for i, word in enumerate(question_word_list):
            # import ipdb; ipdb.set_trace()
            # print(i,word)
            # quit()
            if i >= self._max_question_length:
                break
            if word not in self.question_word_to_id_map.keys():
                map_idx = self.unknown_question_word_index
            else:
                map_idx = self.question_word_to_id_map[word]
            question_encoding[i][map_idx] = 1

        # answer_sentence_list = self._create_word_list(answer_sentences)
        for i, answer in enumerate(answer_sentences):
            if answer not in self.answer_to_id_map.keys():
                map_idx = self.unknown_answer_index
            else:
                map_idx = self.answer_to_id_map[answer]
            answer_encoding[i][map_idx] = 1

        data = {
            'image': image,
            'question': question_encoding,
            'answer': answer_encoding
        }
        return data
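To make the ranking behaviour of _create_id_map concrete, here is a toy illustration of the same Counter-based logic (not tied to the dataset files):

from collections import Counter

sentences = ["What color is the cat?", "Is the cat black?"]
words = [w.strip('?').lower() for s in sentences for w in s.split(" ")]
id_map = {w: i for i, (w, _) in enumerate(Counter(words).most_common(3))}
print(id_map)   # {'is': 0, 'the': 1, 'cat': 2} -- more frequent words get lower ids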
Example #20
class ImgDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """

    def __init__(self, image_dir, question_json_file_path, annotation_json_file_path, image_filename_pattern, existing_format=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
            """
        self.image_dir = image_dir
        self.question_json_file_path = question_json_file_path
        self.annotation_json_file_path = annotation_json_file_path
        self.image_filename_pattern = image_filename_pattern

        # self.existing_format = existing_format

        self.vqa = VQA(annotation_json_file_path, question_json_file_path)
        self.queIds = self.vqa.getQuesIds()
        self.img_list = self.getUniqueImg()

    def imgIdToPath(self, idx):
        str_bracket = "{}"
        start_idx = self.image_filename_pattern.find(str_bracket)
        path = self.image_filename_pattern[0:start_idx] + str(idx).zfill(12) + self.image_filename_pattern[start_idx+2:]
        path = self.image_dir + "/" + path
        return path

    def getUniqueImg(self):
        count_img = []
        for i in tqdm(range(len(self.queIds))):
            qa_id = self.queIds[i]
            qa = self.vqa.loadQA(qa_id)[0]
            image_id = qa['image_id']
            if image_id not in count_img:
                    count_img.append(image_id)
        print("Unique images size: ", len(count_img))
        return count_img


    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, idx):

        if idx >= len(self.vqa.qa):
            print("Error: access overflow")
            return None

        img_idx = self.img_list[idx]
        data = {}

        data['images_id'] = img_idx
        tmp_img = Image.open(self.imgIdToPath(img_idx))
        tmp_img = tmp_img.convert('RGB')
        normalize = transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])

        trans = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            normalize,
            ])
        data['images'] = trans(tmp_img)

        return data
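ImgDataset only yields normalized images plus their ids, which makes it convenient for extracting CNN features once up front; a hedged sketch with a torchvision backbone follows (the model choice, batch size and the img_dataset instance are assumptions).

import torch
import torchvision.models as models
from torch.utils.data import DataLoader

backbone = models.resnet18(pretrained=True)
backbone.fc = torch.nn.Identity()          # keep the 512-d pooled features
backbone.eval()

loader = DataLoader(img_dataset, batch_size=16, num_workers=4)   # img_dataset: an ImgDataset instance
features = {}
with torch.no_grad():
    for batch in loader:
        feats = backbone(batch['images'])                        # (B, 512)
        for img_id, feat in zip(batch['images_id'], feats):
            features[int(img_id)] = feat.numpy()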
class VqaDataset(Dataset):
    """
    Load the VQA dataset using the VQA python API. We provide the necessary subset in the External folder, but you may
    want to reference the full repo (https://github.com/GT-Vision-Lab/VQA) for usage examples.
    """
    def __init__(self,
                 image_dir,
                 question_json_file_path,
                 annotation_json_file_path,
                 image_filename_pattern,
                 transform=None,
                 question_word_to_id_map=None,
                 answer_to_id_map=None,
                 question_word_list_length=5746,
                 answer_list_length=5216,
                 pre_encoder=None,
                 cache_location=None):
        """
        Args:
            image_dir (string): Path to the directory with COCO images
            question_json_file_path (string): Path to the json file containing the question data
            annotation_json_file_path (string): Path to the json file containing the annotations mapping images, questions, and
                answers together
            image_filename_pattern (string): The pattern the filenames of images in this dataset use (eg "COCO_train2014_{}.jpg")
        """
        self._vqa = VQA(annotation_file=annotation_json_file_path,
                        question_file=question_json_file_path)
        self._image_dir = image_dir
        self._image_filename_pattern = image_filename_pattern
        self._transform = transform
        self._max_question_length = 26
        self.ques_ids = self._vqa.getQuesIds()

        # Publicly accessible dataset parameters
        self.question_word_list_length = question_word_list_length + 1
        self.unknown_question_word_index = question_word_list_length
        self.answer_list_length = answer_list_length + 1
        self.unknown_answer_index = answer_list_length
        self._pre_encoder = pre_encoder
        self._cache_location = cache_location
        if self._cache_location is not None:
            try:
                os.makedirs(self._cache_location)
            except OSError:
                pass

        # Create the question map if necessary
        if question_word_to_id_map is None:
            questions_list = []
            questions = self._vqa.questions['questions']

            for question in questions:
                questions_list.append(question['question'])

            word_list = self._create_word_list(questions_list)
            self.question_word_to_id_map = self._create_id_map(
                word_list, self.question_word_list_length)
        else:
            self.question_word_to_id_map = question_word_to_id_map

        # Create the answer map if necessary
        if answer_to_id_map is None:
            answer_list = []
            answers = self._vqa.dataset['annotations']
            for answer in answers:
                all_answers = answer['answers']
                for each_answer in all_answers:
                    answer_list.append(each_answer['answer'])

            self.answer_to_id_map = self._create_id_map(
                answer_list, self.answer_list_length)

        else:
            self.answer_to_id_map = answer_to_id_map

    def _create_word_list(self, sentences):
        """
        Turn a list of sentences into a list of processed words (lowercase, punctuation removed).
        Args:
            sentences: a list of str, sentences to be split into words
        Return:
            A list of str, the words from all sentences with their original order preserved.
        """

        word_list = []
        # Source: https://www.geeksforgeeks.org/removing-punctuations-given-string/
        punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
        for sentence in sentences:
            sentence = sentence.lower()
            for char in sentence:
                if char in punctuations:
                    sentence = sentence.replace(char, "")
            word_list.extend(sentence.split(" "))

        return word_list
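    # For instance (hypothetical input): _create_word_list(["What color is the cat?"])
    # returns ['what', 'color', 'is', 'the', 'cat'].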

    def _create_id_map(self, word_list, max_list_length):
        """
        Count word frequencies, then map each of the most common strings to an id (its rank in the frequency ordering).
        Args:
            word_list: a list of str, from which the most frequent elements are picked out
            max_list_length: the maximum number of strings kept
        Return:
            A map (dict) from str to id (rank)
        """

        freq_words = {}
        for word in word_list:
            freq_words[word] = freq_words.get(word, 0) + 1

        # Sort words by descending frequency, keep the top max_list_length,
        # and map each surviving word to its rank.
        sorted_words = sorted(freq_words.items(),
                              key=operator.itemgetter(1),
                              reverse=True)[:max_list_length]

        return {word: rank for rank, (word, _) in enumerate(sorted_words)}
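    # For instance (hypothetical input):
    #   _create_id_map(['yes', 'no', 'yes', '2', 'yes', 'no'], 2) -> {'yes': 0, 'no': 1}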

    def __len__(self):

        return len(self.ques_ids)

    def __getitem__(self, idx):
        """
        Load an item of the dataset
        Args:
            idx: index of the data item
        Return:
            A dict containing multiple torch tensors for image, question and answers.
        """

        ques_id = self.ques_ids[idx]
        # loadQA returns the annotation dict, which carries the image id for this question.
        annotation = self._vqa.loadQA(ques_id)[0]
        img_id = str(annotation['image_id']).zfill(12)

        if self._cache_location is not None and self._pre_encoder is not None:
            ############ 3.2 TODO
            # implement your caching and loading logic here

            ############
            raise NotImplementedError()
        else:
            # load the image from disk, apply self._transform (if not None)
            fpath = os.path.join(self._image_dir,
                                 self._image_filename_pattern.format(img_id))
            img = Image.open(fpath)
            img = img.convert('RGB')
            if self._transform:
                img = self._transform(img)
            else:
                # No transform supplied: fall back to a plain ToTensor conversion.
                img = transforms.functional.to_tensor(img)

        question = self._vqa.questions['questions'][idx]['question']
        question_to_return = question
        question_split = self._create_word_list([question])

        # Multi-hot question encoding: one slot per vocabulary word, with the
        # last slot reserved for unknown words.
        question_one_hot = np.zeros(
            [self._max_question_length, self.question_word_list_length])

        for w_idx, word in enumerate(question_split):
            if w_idx == self._max_question_length:
                break
            hot_idx = self.question_word_to_id_map.get(
                word, self.question_word_list_length - 1)
            question_one_hot[w_idx, hot_idx] = 1
        question_one_hot = torch.from_numpy(question_one_hot)

        # Collapse the per-word rows into a single bag-of-words vector.
        question_one_hot = torch.clamp(torch.sum(question_one_hot, dim=0),
                                       max=1)

        # Encode each of the (up to 10) human answers; last slot = unknown answer.
        answers = self._vqa.dataset['annotations'][idx]['answers']
        answers_one_hot = np.zeros([10, self.answer_list_length])

        all_answer = []
        for a_idx, answer_dict in enumerate(answers):
            answer = answer_dict['answer']
            all_answer.append(answer)
            hot_idx = self.answer_to_id_map.get(answer,
                                                self.answer_list_length - 1)
            answers_one_hot[a_idx, hot_idx] = 1

        # Return the most common human answer as the ground-truth string.
        occurrence_count = Counter(all_answer)
        answer_to_return = occurrence_count.most_common(1)[0][0]

        answers_one_hot = torch.from_numpy(answers_one_hot)

        return img, question_one_hot, answers_one_hot, question_to_return, answer_to_return, list(
            self.answer_to_id_map.keys())
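
    # One possible shape for the cached/pre-encoded branch left as "3.2 TODO" in
    # __getitem__ above. This is a sketch only: the helper name and the ".pt"
    # file-naming scheme are hypothetical, and it assumes self._pre_encoder maps
    # a batched image tensor to a feature tensor.
    def _load_or_encode_cached(self, img_id, fpath):
        feat_path = os.path.join(self._cache_location, '{}.pt'.format(img_id))
        if os.path.exists(feat_path):
            # Cache hit: load the pre-computed feature from disk.
            return torch.load(feat_path)
        # Cache miss: load and transform the image, encode it, then cache the result.
        img = Image.open(fpath).convert('RGB')
        if self._transform:
            img = self._transform(img)
        else:
            img = transforms.functional.to_tensor(img)
        feat = self._pre_encoder(img.unsqueeze(0)).squeeze(0)
        torch.save(feat, feat_path)
        return feat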
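

# A minimal usage sketch for VqaDataset, assuming a standard torchvision
# transform pipeline and torch DataLoader; every path and hyperparameter below
# is a hypothetical placeholder.
def _example_vqadataset_usage():
    from torch.utils.data import DataLoader

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    train_dataset = VqaDataset(
        image_dir="data/train2014",                                # hypothetical path
        question_json_file_path="data/train_questions.json",      # hypothetical path
        annotation_json_file_path="data/train_annotations.json",  # hypothetical path
        image_filename_pattern="COCO_train2014_{}.jpg",
        transform=transform,
    )
    # Reuse the training vocabularies so a validation split shares the same encodings.
    val_dataset = VqaDataset(
        image_dir="data/val2014",                              # hypothetical path
        question_json_file_path="data/val_questions.json",     # hypothetical path
        annotation_json_file_path="data/val_annotations.json", # hypothetical path
        image_filename_pattern="COCO_val2014_{}.jpg",
        transform=transform,
        question_word_to_id_map=train_dataset.question_word_to_id_map,
        answer_to_id_map=train_dataset.answer_to_id_map,
    )
    loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
    img, question_bow, answers_one_hot, q_str, a_str, answer_vocab = next(iter(loader))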