from collections import Counter

# `text` (question/answer preprocessing helpers) and `coco_name_format` are
# project-local and assumed to be imported alongside this module.


def process_vqa_dataset(questions, annotations, split, args, maps=None):
    """
    Process the questions and annotations into a consolidated dataset.
    Vocabulary and answer mappings are built on the training split and reused
    (via `maps`) for the validation split.

    :param questions: List of question dicts from the VQA questions JSON.
    :param annotations: List of annotation dicts, aligned with `questions`.
    :param split: The dataset split ("train" or "val").
    :param args: Parsed arguments providing `top_answer_limit` and `max_length`.
    :param maps: Dict containing various mappings such as word_to_wid, wid_to_word,
        ans_to_aid and aid_to_ans (required when split != "train").
    :return: The processed dataset ready to be used
    """
    dataset = []
    for idx, q in enumerate(questions):
        d = {}
        d["question_id"] = q["question_id"]
        d["question"] = q["question"]
        d["image_id"] = q["image_id"]
        d["image_name"] = coco_name_format(q["image_id"], "train")
        d["answer"] = annotations[idx]["multiple_choice_answer"]

        answers = []
        for ans in annotations[idx]['answers']:
            answers.append(ans['answer'])
        d['answers_occurence'] = Counter(answers).most_common()

        dataset.append(d)

    # Get the top answers (args.top_answer_limit of them) so we can filter the
    # dataset to only questions with these answers
    top_answers = text.get_top_answers(dataset, args.top_answer_limit)
    dataset = text.filter_dataset(dataset, top_answers)

    # Process the questions
    dataset = text.preprocess_questions(dataset)
    vocab = text.get_vocabulary(dataset)
    dataset = text.remove_tail_words(dataset, vocab)

    if split == "train":
        word_to_wid = {w: i for i, w in enumerate(vocab)}
        wid_to_word = [w for w in vocab]
        ans_to_aid = {a: i for i, a in enumerate(top_answers)}
        aid_to_ans = [a for a in top_answers]
    else:  # split == "val"
        word_to_wid = maps["word_to_wid"]
        wid_to_word = maps["wid_to_word"]
        ans_to_aid = maps["ans_to_aid"]
        aid_to_ans = maps["aid_to_ans"]

    dataset = text.encode_questions(dataset, word_to_wid, args.max_length)
    dataset = text.encode_answers(dataset, ans_to_aid)

    return dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans
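# A minimal call sketch for the version above (hypothetical variable names: the
# question/annotation lists and `args` are assumed to be loaded and parsed elsewhere).
# The maps built on the train split are passed back in for the val split so both
# splits share the same word and answer ids.
train_set, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans = process_vqa_dataset(
    train_questions, train_annotations, "train", args)

maps = {"word_to_wid": word_to_wid, "wid_to_word": wid_to_word,
        "ans_to_aid": ans_to_aid, "aid_to_ans": aid_to_ans}
val_set, *_ = process_vqa_dataset(val_questions, val_annotations, "val", args, maps=maps)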
import json
import os
import pickle
from collections import Counter

# As above, `text` and `coco_name_format` are project-local helpers assumed to be
# imported alongside this module.


def process_vqa_dataset(questions_file, annotations_file, split, maps=None,
                        top_answer_limit=1000, max_length=26, year=2014):
    """
    Process the questions and annotations into a consolidated dataset.
    Vocabulary and answer mappings are built on the training split and reused
    (via `maps`) for the validation split.

    :param questions_file: Path to the VQA questions JSON file.
    :param annotations_file: Path to the VQA annotations JSON file.
    :param split: The dataset split.
    :param maps: Dict containing various mappings such as word_to_wid, wid_to_word,
        ans_to_aid and aid_to_ans.
    :param top_answer_limit: Number of top answers used to filter the dataset.
    :param max_length: The maximum question length. Taken from the VQA sample code.
    :param year: COCO Dataset release year.
    :return: The processed dataset ready to be used
    """
    cache_file = "vqa_{0}_dataset_cache.pickle".format(split)

    # Check if a preprocessed cache exists. If yes, load it up, else preprocess the data
    if os.path.exists(cache_file):
        print("Found {0} set cache! Loading...".format(split))
        dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans = pickle.load(
            open(cache_file, 'rb'))

    else:
        # Load the annotations and questions files
        print("Loading {0} annotations".format(split))
        with open(annotations_file) as ann:
            j = json.load(ann)
            annotations = j["annotations"]

        print("Loading {0} questions".format(split))
        with open(questions_file) as q:
            j = json.load(q)
            questions = j["questions"]

        # Build up the dataset, one entry per question
        dataset = []
        for idx, q in enumerate(questions):
            d = dict()
            d["question_id"] = q["question_id"]
            d["question"] = q["question"]
            d["image_id"] = q["image_id"]
            d["image_name"] = coco_name_format(q["image_id"], split, year)
            d["answer"] = annotations[idx]["multiple_choice_answer"]

            answers = []
            for ans in annotations[idx]['answers']:
                answers.append(ans['answer'])
            d['answers_occurence'] = Counter(answers).most_common()

            d["question_type"] = annotations[idx]["question_type"]
            d["answer_type"] = annotations[idx]["answer_type"]

            dataset.append(d)

        # Get the top N answers so we can filter the dataset to only questions with these answers
        top_answers = text.get_top_answers(dataset, top_answer_limit)
        dataset = text.filter_dataset(dataset, top_answers)

        # Process the questions
        dataset = text.preprocess_questions(dataset)

        if split == "train":
            vocab = text.get_vocabulary(dataset)
            word_to_wid = {w: i + 1 for i, w in enumerate(vocab)}  # 0 is used for padding
            wid_to_word = {i + 1: w for i, w in enumerate(vocab)}
            ans_to_aid = {a: i for i, a in enumerate(top_answers)}
            aid_to_ans = {i: a for i, a in enumerate(top_answers)}
        else:  # split == "val"
            vocab = maps["vocab"]
            word_to_wid = maps["word_to_wid"]
            wid_to_word = maps["wid_to_word"]
            ans_to_aid = maps["ans_to_aid"]
            aid_to_ans = maps["aid_to_ans"]

        dataset = text.remove_tail_words(dataset, vocab)
        dataset = text.encode_questions(dataset, word_to_wid, max_length)
        dataset = text.encode_answers(dataset, ans_to_aid)

        print("Caching the processed data")
        pickle.dump(
            [dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans],
            open(cache_file, 'wb+'))

    return dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans
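# A minimal call sketch for the cached version above. The JSON file names below are
# examples only (the actual names depend on the VQA release you download); the maps
# returned for the train split, including "vocab", are fed back in for the val split.
dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans = process_vqa_dataset(
    "OpenEnded_mscoco_train2014_questions.json",
    "mscoco_train2014_annotations.json",
    "train")

maps = {"vocab": vocab, "word_to_wid": word_to_wid, "wid_to_word": wid_to_word,
        "ans_to_aid": ans_to_aid, "aid_to_ans": aid_to_ans}
val_dataset, *_ = process_vqa_dataset(
    "OpenEnded_mscoco_val2014_questions.json",
    "mscoco_val2014_annotations.json",
    "val", maps=maps)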