import json
import os

import h5py
import numpy as np
from tqdm import tqdm

# The helpers used below (tokenize, encode, build_vocab, program_to_str,
# trans_answer) are assumed to come from the project's shared preprocessing
# utilities; they are not defined in this file.


def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_refexps_json, 'r') as f:
        refexps = json.load(f)['refexps']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in refexps[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in refexps))
        else:
            answer_token_to_idx = None
        refexp_token_to_idx = build_vocab(
            (q['refexp'] for q in refexps),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in refexps:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, delim=';')
        vocab = {
            'refexp_token_to_idx': refexp_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['refexp_token_to_idx']:
                if word not in vocab['refexp_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['refexp_token_to_idx'])
                    vocab['refexp_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    import clevr_ref_util
    # Bind the instance to a new name so it does not shadow the module.
    ref_util = clevr_ref_util.clevr_ref_util(
        args.input_scenes_json, args.input_refexps_json)
    ref_util.load_scene_refexp()

    # Encode all refexps and programs
    print('Encoding data')
    refexps_encoded = []
    programs_encoded = []
    refexp_families = []
    orig_idxs = []
    image_idxs = []
    if args.num_examples != -1:
        refexps = refexps[:args.num_examples]
    for orig_idx, q in enumerate(refexps):
        if orig_idx % 500 == 0:
            print('process refexp program', orig_idx)
        refexp = q['refexp']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'refexp_family_index' in q:
            refexp_families.append(q['refexp_family_index'])
        refexp_tokens = tokenize(refexp,
                                 punct_to_keep=[';', ','],
                                 punct_to_remove=['?', '.'])
        refexp_encoded = encode(refexp_tokens,
                                vocab['refexp_token_to_idx'],
                                allow_unk=args.encode_unk == 1)
        refexps_encoded.append(refexp_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str, delim=';')
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

    # Pad encoded refexps and programs to a common length
    max_refexp_length = max(len(x) for x in refexps_encoded)
    for qe in refexps_encoded:
        while len(qe) < max_refexp_length:
            qe.append(vocab['refexp_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    refexps_encoded = np.asarray(refexps_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(refexps_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('refexps', data=refexps_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        f.create_dataset('programs', data=programs_encoded)
        f.create_dataset('refexp_families', data=np.asarray(refexp_families))

        # Add the ground-truth masks, appending to a resizable dataset in
        # chunks of 100 so all masks never sit in memory at once.
        tmp_ans = []
        should_create = True
        for orig_idx, q in enumerate(refexps):
            if orig_idx % 500 == 0:
                print('process mask gt', orig_idx)
            cur_mask = ref_util.get_mask_from_refexp(q, args.height, args.width)
            cur_mask = cur_mask.astype(float)  # astype returns a copy; keep it
            tmp_ans.append(cur_mask)
            if len(tmp_ans) >= 100:
                tmp_ans = np.asarray(tmp_ans)
                if should_create:
                    # Derive maxshape from the data so the per-mask axes always
                    # match; only the sample axis is left unbounded to grow.
                    f.create_dataset('answers', data=tmp_ans,
                                     maxshape=(None,) + tmp_ans.shape[1:])
                    should_create = False
                else:
                    f['answers'].resize(
                        f['answers'].shape[0] + tmp_ans.shape[0], axis=0)
                    f['answers'][-tmp_ans.shape[0]:] = tmp_ans
                tmp_ans = []
        # Flush any remaining masks. Creating the dataset here (instead of
        # asserting) also handles inputs with fewer than 100 refexps.
        if len(tmp_ans) != 0:
            tmp_ans = np.asarray(tmp_ans)
            if should_create:
                f.create_dataset('answers', data=tmp_ans,
                                 maxshape=(None,) + tmp_ans.shape[1:])
                should_create = False
            else:
                f['answers'].resize(
                    f['answers'].shape[0] + tmp_ans.shape[0], axis=0)
                f['answers'][-tmp_ans.shape[0]:] = tmp_ans
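
# A minimal read-back sketch for the file written above. The dataset names
# match what main() creates, but this helper is an illustrative addition,
# not part of the original pipeline.
def inspect_refexp_h5(path):
    with h5py.File(path, 'r') as f:
        refexps = f['refexps'][:]    # (num_refexps, max_refexp_length) int32
        programs = f['programs'][:]  # (num_refexps, max_program_length) int32
        masks = f['answers'][:]      # (num_refexps, H, W) ground-truth masks
        print(refexps.shape, programs.shape, masks.shape)
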
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    # With --multi_dir the questions/scenes are sharded across numbered
    # subdirectories: 0-24 for train, 25-26 for val, 27-29 for test.
    if 'train' in args.output_h5_file and args.multi_dir:
        subdirs = [x for x in range(25)]
    elif 'val' in args.output_h5_file and args.multi_dir:
        subdirs = [25, 26]
    elif args.multi_dir:
        subdirs = [27, 28, 29]
    else:
        subdirs = []

    questions = []
    scenes = []
    for subdir in subdirs:
        question_path = os.path.join(args.input_questions_json, str(subdir),
                                     'questions.json')
        scene_path = os.path.join(args.input_scenes_json, str(subdir),
                                  'scenes.json')
        ss = json.load(open(scene_path, 'r'))['scenes']
        for s in ss:
            s['cc']['subdir'] = subdir
        scenes.extend(ss)
        qs = json.load(open(question_path, 'r'))['questions']
        for q in qs:
            q['subdir'] = subdir
        questions.extend(qs)
    # Without --multi_dir, fall back to single monolithic json files.
    if not questions:
        questions = json.load(open(args.input_questions_json, 'r'))['questions']
    if not scenes:
        scenes = json.load(open(args.input_scenes_json, 'r'))['scenes']

    if args.binary_qs_only:
        filtered_questions = []
        for q in tqdm(questions):
            if q['answer'] in [True, False] and q['question'] != '?':
                filtered_questions.append(q)
        questions = filtered_questions

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in questions), answers_only=True)
        else:
            answer_token_to_idx = None  # the vocab dict below needs the key
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        # Collect the OCR text bodies from every object in every view.
        all_scene_text = []
        for scene in scenes:
            for view_name, view_struct in scene.items():
                for obj in view_struct['objects']:
                    all_scene_text.append(obj['text']['body'])
        ocr_to_idx = build_vocab(all_scene_text)
        vocab = {
            'ocr_to_idx': ocr_to_idx,
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    # Also dump the OCR vocab as a plain text file next to the json.
    vocab_out_path = args.output_vocab_json.split('.')[0] + '.txt'
    if vocab_out_path != '.txt':  # '!=', not 'is not': identity tests on str literals are unreliable
        with open(vocab_out_path, 'w') as out_file:
            for word in vocab['ocr_to_idx'].keys():
                out_file.write(word + '\n')

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    baseline = questions[0]['image_index']
    for orig_idx, q in enumerate(questions):
        question = q['question']
        # The same question is asked about each of the args.num_views views of
        # its scene, so the per-question bookkeeping is repeated per view to
        # keep all output arrays parallel.
        if q.get('subdir') is not None:  # 'is not None' so subdir 0 is not skipped
            offset = q['image_index'] - baseline
            # (an alternative offset computation based on counting images per
            # subdir was abandoned here)
        else:
            offset = q['image_index']
        for view in range(args.num_views):
            orig_idxs.append(orig_idx)
            image_idxs.append(offset + view)
            if 'question_family_index' in q:
                question_families.append(q['question_family_index'])
            question_tokens = tokenize(question,
                                       punct_to_keep=[';', ','],
                                       punct_to_remove=['?', '.'])
            question_encoded = encode(question_tokens,
                                      vocab['question_token_to_idx'],
                                      allow_unk=args.encode_unk == 1)
            questions_encoded.append(question_encoded)
            if 'program' in q:
                program = q['program']
                program_str = program_to_str(program, args.mode)
                program_tokens = tokenize(program_str)
                program_encoded = encode(program_tokens,
                                         vocab['program_token_to_idx'])
                programs_encoded.append(program_encoded)
            if 'answer' in q:
                try:
                    answers.append(
                        vocab['answer_token_to_idx'][str(q['answer'])])
                except KeyError as e:  # unseen answer; log and skip
                    print(e)

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
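
# Hypothetical driver for the multi-view variant above. The flag names match
# the args attributes that main() reads, but every type and default below is
# an assumption for illustration, not taken from the original repository.
import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_questions_json', required=True)
    parser.add_argument('--input_scenes_json', required=True)
    parser.add_argument('--input_vocab_json', default='')
    parser.add_argument('--output_vocab_json', default='')
    parser.add_argument('--output_h5_file', required=True)
    parser.add_argument('--expand_vocab', type=int, default=0)
    parser.add_argument('--unk_threshold', type=int, default=1)
    parser.add_argument('--encode_unk', type=int, default=0)
    parser.add_argument('--mode', default='prefix')
    parser.add_argument('--multi_dir', action='store_true')
    parser.add_argument('--binary_qs_only', action='store_true')
    parser.add_argument('--num_views', type=int, default=20)
    return parser
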
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        print(len(questions))
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (trans_answer(q['answer']) for q in questions))
        else:
            answer_token_to_idx = None  # the vocab dict below needs the key
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(
                vocab['answer_token_to_idx'][trans_answer(q['answer'])])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
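
# All four variants above rely on the same helper trio, which this file never
# defines. The sketch below only approximates their behavior from the call
# sites (standard CLEVR-style preprocessing); it is an assumption, not the
# original implementation, and uses *_sketch names so it cannot shadow the
# real imports.
SPECIAL_TOKENS = {'<NULL>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3}

def tokenize_sketch(s, delim=' ', punct_to_keep=None, punct_to_remove=None):
    # Pad kept punctuation with the delimiter, strip removed punctuation,
    # then split; an empty delim keeps the whole string as one token (as the
    # answer vocab in the GQA variant appears to rely on).
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    tokens = s.split(delim) if delim else [s]
    return ['<START>'] + tokens + ['<END>']

def build_vocab_sketch(sequences, min_token_count=1, delim=' ',
                       punct_to_keep=None, punct_to_remove=None):
    # Count tokens across all sequences, then assign indices after the
    # special tokens to every token meeting the frequency threshold.
    counts = {}
    for seq in sequences:
        for token in tokenize_sketch(seq, delim=delim,
                                     punct_to_keep=punct_to_keep,
                                     punct_to_remove=punct_to_remove)[1:-1]:
            counts[token] = counts.get(token, 0) + 1
    token_to_idx = dict(SPECIAL_TOKENS)
    for token, count in sorted(counts.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx

def encode_sketch(tokens, token_to_idx, allow_unk=False):
    # Map tokens to indices, optionally falling back to <UNK>.
    idxs = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('Token "%s" not in vocab' % token)
            token = '<UNK>'
        idxs.append(token_to_idx[token])
    return idxs
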
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        # question keys: answer, question, program, index,
        # image_index (remapped below so indices start from zero)
        data = json.load(f)

    filter_questions = []
    if args.input_filter_questions_json:
        with open(args.input_filter_questions_json, 'r') as fq:
            filter_questions = fq.read().splitlines()

    imgs_idxs = set()
    questions = []
    for idx, question in data.items():
        # The filter file, when given, lists question ids to exclude.
        if filter_questions and idx in filter_questions:
            continue
        img_idx = question['imageId']
        imgs_idxs.add(img_idx)
        questions.append({
            'question': question['question'],
            'answer': question['answer'],
            # 'program': data['program'][index],
            'index': int(idx),
            'image_index': img_idx,
            # 'question_family_index': data['question_family_index'][index]
        })

    # Remap the GQA image-id strings to dense zero-based integer indices.
    imgs_idxs = sorted(imgs_idxs)
    mapper = {x: i for i, x in enumerate(imgs_idxs)}
    for q in questions:
        q['image_index'] = mapper[q['image_index']]

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            # Empty delim keeps each whole answer string as a single token.
            answer_token_to_idx = build_vocab(
                (q['answer'] for q in questions), delim='')
        else:
            answer_token_to_idx = None  # the vocab dict below needs the key
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            # Apparently, train and val in miniGQA have different answers,
            # so the answer vocab is expanded as well.
            num_new_answers = 0
            for word in new_vocab['answer_token_to_idx']:
                if word not in vocab['answer_token_to_idx']:
                    print('Found new answer %s' % word)
                    idx = len(vocab['answer_token_to_idx'])
                    vocab['answer_token_to_idx'][word] = idx
                    num_new_answers += 1
            print('Found %d new words' % num_new_words)
            print('Found %d new answers' % num_new_answers)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
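
# Tiny self-contained illustration of the image-index remapping performed in
# the GQA variant above: arbitrary imageId strings are sorted and mapped to
# dense zero-based integers. The ids here are made up for the example.
def _demo_image_index_remap():
    imgs_idxs = sorted({'n12345', 'n00017', 'n09999'})
    mapper = {x: i for i, x in enumerate(imgs_idxs)}
    assert mapper == {'n00017': 0, 'n09999': 1, 'n12345': 2}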