def run_single_example(args, model):
    """Run one image / referring expression through the model and print the result.

    Reads ``args.use_gpu``, ``args.image``, ``args.image_height``,
    ``args.image_width``, ``args.refexp``, ``args.temperature`` and
    ``args.sample_argmax``; ``args`` is also handed to ``build_cnn`` and
    ``load_vocab``.

    ``model`` is either a ``(program_generator, execution_engine)`` tuple or a
    single module that is called as ``model(refexp_var, feats_var)``.

    Nothing is returned; the predicted answer (and the predicted program, when
    a program generator is used) is printed to stdout.
    """
    on_gpu = args.use_gpu == 1
    dtype = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor

    # Build the CNN to use for feature extraction
    print('Loading CNN for feature extraction')
    cnn = build_cnn(args, dtype)

    # Load the image, resize it, and move channels first with a batch axis.
    target_size = (args.image_height, args.image_width)
    pixels = imresize(imread(args.image, mode='RGB'), target_size,
                      interp='bicubic')
    batch = pixels.transpose(2, 0, 1)[None]

    # Per-channel normalisation constants.
    # NOTE(review): the last std entry is 0.224 here; keep it as-is unless the
    # feature-extraction code used for training is changed in lock-step.
    channel_mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
    channel_std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1)
    batch = batch.astype(np.float32) / 255.0
    batch = (batch - channel_mean) / channel_std

    # Use CNN to extract features for the image
    img_var = Variable(torch.FloatTensor(batch).type(dtype), volatile=True)
    feats_var = cnn(img_var)

    # Tokenize and encode the referring expression as a 1 x T index tensor.
    vocab = load_vocab(args)
    tokens = tokenize(args.refexp,
                      punct_to_keep=[';', ','],
                      punct_to_remove=['?', '.'])
    token_ids = encode(tokens, vocab['refexp_token_to_idx'], allow_unk=True)
    refexp_tensor = torch.LongTensor(token_ids).view(1, -1)
    refexp_tensor = refexp_tensor.type(dtype).long()
    refexp_var = Variable(refexp_tensor, volatile=True)

    # Run the model
    print('Running the model\n')
    predicted_program = None
    if type(model) is not tuple:
        # Single end-to-end model.
        model.type(dtype)
        scores = model(refexp_var, feats_var)
    else:
        # Two-stage model: sample a program, then execute it on the features.
        generator, engine = model
        generator.type(dtype)
        engine.type(dtype)
        predicted_program = generator.reinforce_sample(
            refexp_var,
            temperature=args.temperature,
            argmax=(args.sample_argmax == 1))
        scores = engine(feats_var, predicted_program)

    # Print results
    _, best_idx = scores.data.cpu()[0].max(dim=0)
    predicted_answer = vocab['answer_idx_to_token'][best_idx[0]]
    print('Question: "%s"' % args.refexp)
    print('Predicted answer: ', predicted_answer)

    if predicted_program is None:
        return

    print()
    print('Predicted program:')
    # Walk the program until every pending input slot has been filled.
    open_inputs = 1
    for fn_idx in predicted_program.data.cpu()[0]:
        fn_str = vocab['program_idx_to_token'][fn_idx]
        open_inputs += iep.programs.get_num_inputs(fn_str) - 1
        print(fn_str)
        if open_inputs == 0:
            break
def main(args):
    """Build/load a vocabulary and encode a questions JSON file into an HDF5 file.

    Attributes read from ``args``: ``input_vocab_json``, ``output_vocab_json``,
    ``input_questions_json``, ``expand_vocab``, ``unk_threshold``, ``mode``,
    ``encode_unk`` and ``output_h5_file``.

    Steps:
      1. Load ``json['questions']`` from ``args.input_questions_json``.
      2. Build a vocab from the questions (when no input vocab is given or
         ``expand_vocab == 1``) and/or load one from ``input_vocab_json``;
         with ``expand_vocab == 1`` unseen question tokens are appended to
         the loaded vocab.
      3. Optionally dump the vocab to ``output_vocab_json``.
      4. Encode and pad questions/programs, and write the datasets
         ``questions``, ``image_idxs``, ``orig_idxs`` and, when non-empty,
         ``programs``, ``question_families`` and ``answers``.

    Returns ``None``. Returns early (after printing a message) when neither
    vocab path is given.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        print(len(questions))
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (trans_answer(q['answer']) for q in questions))
        else:
            # Previously left unbound, which raised NameError below for
            # answer-less (e.g. test split) question files.
            answer_token_to_idx = None
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged in.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            known_words = vocab['question_token_to_idx']
            for word in new_vocab['question_token_to_idx']:
                if word not in known_words:
                    print('Found new word %s' % word)
                    known_words[word] = len(known_words)
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(q['question'],
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        questions_encoded.append(
            encode(question_tokens,
                   vocab['question_token_to_idx'],
                   allow_unk=args.encode_unk == 1))

        if 'program' in q:
            program_str = program_to_str(q['program'], args.mode)
            program_tokens = tokenize(program_str)
            programs_encoded.append(
                encode(program_tokens, vocab['program_token_to_idx']))

        if 'answer' in q:
            answers.append(
                vocab['answer_token_to_idx'][trans_answer(q['answer'])])

    # Pad encoded questions and programs to a common length with <NULL>.
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        shortfall = max_question_length - len(qe)
        if shortfall > 0:
            qe.extend([vocab['question_token_to_idx']['<NULL>']] * shortfall)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            shortfall = max_program_length - len(pe)
            if shortfall > 0:
                pe.extend([vocab['program_token_to_idx']['<NULL>']] * shortfall)

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
def main(args):
    """Encode referring expressions, programs and ground-truth masks into HDF5.

    Attributes read from ``args``: ``input_vocab_json``, ``output_vocab_json``,
    ``input_refexps_json``, ``input_scenes_json``, ``expand_vocab``,
    ``unk_threshold``, ``mode``, ``encode_unk``, ``num_examples``, ``height``,
    ``width`` and ``output_h5_file``.

    The vocabulary is built from *all* refexps in the input file (and/or
    loaded from ``input_vocab_json``); only afterwards is the list truncated
    to ``num_examples`` (``-1`` keeps everything).

    Datasets written: ``refexps``, ``image_idxs``, ``orig_idxs``, ``programs``,
    ``refexp_families`` and ``answers``. ``answers`` holds one mask per refexp
    as returned by ``clevr_ref_util.get_mask_from_refexp`` and is appended in
    chunks of 100 so that all masks never have to sit in memory at once.

    Returns ``None``. Returns early (after printing a message) when neither
    vocab path is given.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_refexps_json, 'r') as f:
        refexps = json.load(f)['refexps']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in refexps[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in refexps))
        else:
            answer_token_to_idx = None
        refexp_token_to_idx = build_vocab(
            (q['refexp'] for q in refexps),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in refexps:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, delim=';')
        vocab = {
            'refexp_token_to_idx': refexp_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged in.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            known_words = vocab['refexp_token_to_idx']
            for word in new_vocab['refexp_token_to_idx']:
                if word not in known_words:
                    print('Found new word %s' % word)
                    known_words[word] = len(known_words)
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Local import kept from the original; the helper instance gets its own
    # name instead of rebinding (and hiding) the module name.
    import clevr_ref_util
    ref_util = clevr_ref_util.clevr_ref_util(args.input_scenes_json,
                                             args.input_refexps_json)
    ref_util.load_scene_refexp()

    # Encode all refexps and programs
    print('Encoding data')
    refexps_encoded = []
    programs_encoded = []
    refexp_families = []
    orig_idxs = []
    image_idxs = []

    if args.num_examples != -1:
        refexps = refexps[:args.num_examples]

    for orig_idx, q in enumerate(refexps):
        if orig_idx % 500 == 0:
            print('process refexp program', orig_idx)
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'refexp_family_index' in q:
            refexp_families.append(q['refexp_family_index'])
        refexp_tokens = tokenize(q['refexp'],
                                 punct_to_keep=[';', ','],
                                 punct_to_remove=['?', '.'])
        refexps_encoded.append(
            encode(refexp_tokens,
                   vocab['refexp_token_to_idx'],
                   allow_unk=args.encode_unk == 1))

        if 'program' in q:
            program_str = program_to_str(q['program'], args.mode)
            program_tokens = tokenize(program_str, delim=';')
            programs_encoded.append(
                encode(program_tokens, vocab['program_token_to_idx']))

    # Pad encoded refexps and programs to a common length with <NULL>.
    max_refexp_length = max(len(x) for x in refexps_encoded)
    for qe in refexps_encoded:
        shortfall = max_refexp_length - len(qe)
        if shortfall > 0:
            qe.extend([vocab['refexp_token_to_idx']['<NULL>']] * shortfall)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            shortfall = max_program_length - len(pe)
            if shortfall > 0:
                pe.extend([vocab['program_token_to_idx']['<NULL>']] * shortfall)

    # Create h5 file
    print('Writing output')
    refexps_encoded = np.asarray(refexps_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(refexps_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('refexps', data=refexps_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        f.create_dataset('programs', data=programs_encoded)
        f.create_dataset('refexp_families', data=np.asarray(refexp_families))

        # Ground-truth masks, written in chunks of `chunk_size`.
        chunk_size = 100
        last_idx = len(refexps) - 1
        pending = []
        answers_ds = None
        for orig_idx, q in enumerate(refexps):
            if orig_idx % 500 == 0:
                print('process mask gt', orig_idx)
            # The original followed this with `cur_mask.astype(float)` whose
            # result was discarded; the mask is stored with its native dtype,
            # exactly as before.
            cur_mask = ref_util.get_mask_from_refexp(
                q, args.height, args.width)
            pending.append(cur_mask)

            if len(pending) < chunk_size and orig_idx != last_idx:
                continue

            batch = np.asarray(pending)
            if answers_ds is None:
                # Take the per-mask shape from the data itself rather than
                # from (args.width, args.height), so non-square masks cannot
                # conflict with maxshape. Datasets smaller than one chunk
                # are created here too (the original asserted in that case).
                answers_ds = f.create_dataset(
                    'answers', data=batch,
                    maxshape=(None,) + batch.shape[1:])
            else:
                answers_ds.resize(answers_ds.shape[0] + batch.shape[0],
                                  axis=0)
                answers_ds[-batch.shape[0]:] = batch
            pending = []
def run_single_example(args, model, cnn_in=None):
    """Answer one question about one image and dump a gradient "focus" map.

    Besides printing the predicted answer/program, the function
    back-propagates the sum of the answer scores and accumulates the gradient
    that reaches an intermediate feature map into a 14x14 grid. That grid is
    normalised to [0, 1], upsampled with a bivariate spline to 240x320 and
    optionally written as text (``args.focus_data``) and as a grey-scale image
    (``args.focus_img``).

    Attributes read from ``args``: ``use_gpu``, ``image``, ``image_height``,
    ``image_width``, ``question``, ``temperature``, ``sample_argmax``,
    ``focus_data`` and ``focus_img``; ``args`` is also handed to ``build_cnn``
    and ``load_vocab``.

    Args:
        args: parsed options (see above).
        model: either a ``(program_generator, execution_engine)`` tuple, where
            the execution engine returns ``(scores, feature_map)``, or a
            single module called as ``model(question_var, feats_var)``.
        cnn_in: optional pre-built feature-extraction CNN; when ``None`` one
            is built with ``build_cnn``.

    Side effects: always writes the resized input image to ``resized.png`` in
    the current directory. Returns ``None``.
    """
    dtype = torch.FloatTensor
    if args.use_gpu == 1:
        dtype = torch.cuda.FloatTensor

    # Build the CNN to use for feature extraction
    if cnn_in is None:
        print('Loading CNN for feature extraction')
        cnn = build_cnn(args, dtype)
    else:
        cnn = cnn_in

    # Load and preprocess the image
    img_size = (args.image_height, args.image_width)
    img = imread(args.image, mode='RGB')
    img = imresize(img, img_size, interp='bicubic')
    imsave("resized.png", img)
    img = img.transpose(2, 0, 1)[None]
    mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
    # NOTE(review): the last std entry is 0.224; left untouched so features
    # stay consistent with whatever preprocessing the model was trained on.
    std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1)
    img = (img.astype(np.float32) / 255.0 - mean) / std

    # Use CNN to extract features for the image. Gradients are required so
    # that the backward hook below receives something.
    img_var = Variable(torch.FloatTensor(img).type(dtype),
                       volatile=False, requires_grad=True)
    feats_var = cnn(img_var)

    # Tokenize the question
    vocab = load_vocab(args)
    question_tokens = tokenize(args.question,
                               punct_to_keep=[';', ','],
                               punct_to_remove=['?', '.'])
    question_encoded = encode(question_tokens,
                              vocab['question_token_to_idx'],
                              allow_unk=True)
    question_encoded = torch.LongTensor(question_encoded).view(1, -1)
    question_encoded = question_encoded.type(dtype).long()
    question_var = Variable(question_encoded, volatile=False)

    # Run the model
    print('Running the model\n')
    scores = None
    predicted_program = None

    # Size of the gradient grid and of the upsampled focus map.
    gmap_h, gmap_w = 14, 14
    img_h, img_w = 240, 320
    gm_ffm = np.zeros([gmap_h, gmap_w])

    def hook(gmap, layers, grad):
        """Add the channel-summed gradient to ``gmap`` and normalise in place.

        Only the first ``layers`` channels of batch element 0 are used.
        Returns ``None`` so the gradient itself is left unmodified.
        """
        grad_np = grad.data.cpu().numpy()
        gmap += grad_np[0, :layers, :gmap_h, :gmap_w].astype(
            np.float64).sum(axis=0)
        peak = np.abs(gmap).max()
        if peak > 0:
            np.abs(gmap / peak, out=gmap)
        # An all-zero gradient is left as zeros instead of dividing by zero.

    if isinstance(model, tuple):
        program_generator, execution_engine = model
        program_generator.type(dtype)
        execution_engine.type(dtype)
        predicted_program = program_generator.reinforce_sample(
            question_var,
            temperature=args.temperature,
            argmax=(args.sample_argmax == 1))
        scores, ffm = execution_engine(feats_var, predicted_program)
        ffm.register_hook(lambda grad: hook(gm_ffm, 128, grad))
    else:
        model.type(dtype)
        scores = model(question_var, feats_var)
        feats_var.register_hook(lambda grad: hook(gm_ffm, 1024, grad))
    print("SCORES=", scores[0][0])

    # Back-propagate the summed scores; this fires the hook registered above.
    total_score = scores.sum()
    total_score.backward()

    # Fit a spline through the grid (cell centres in normalised [0, 1]
    # coordinates) and evaluate it once on the full pixel-centre grid.
    grid_rows = (np.arange(gmap_h) + 0.5) / gmap_h
    grid_cols = (np.arange(gmap_w) + 0.5) / gmap_w
    spline = ip.RectBivariateSpline(grid_rows, grid_cols, gm_ffm)
    pixel_rows = (np.arange(img_h) + 0.5) / img_h
    pixel_cols = (np.arange(img_w) + 0.5) / img_w
    z_new = spline(pixel_rows, pixel_cols)

    if args.focus_data is not None:
        # One text row per image row; every value is followed by a space.
        with open(args.focus_data, 'w') as out_file:
            for row in z_new:
                out_file.write(''.join(str(d) + ' ' for d in row))
                out_file.write('\n')

    # Grey-scale image: the same value replicated over the three channels.
    fimg = np.repeat((z_new * 255)[:, :, np.newaxis], 3, axis=2)
    if args.focus_img is not None:
        imsave(args.focus_img, fimg)

    # Print results
    _, predicted_answer_idx = scores.data.cpu()[0].max(dim=0)
    predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx[0]]

    print('Question: "%s"' % args.question)
    print('Predicted answer: ', predicted_answer)

    if predicted_program is not None:
        print()
        print('Predicted program:')
        program = predicted_program.data.cpu()[0]
        # Walk the program until every pending input slot has been filled.
        num_inputs = 1
        for fn_idx in program:
            fn_str = vocab['program_idx_to_token'][fn_idx]
            num_inputs += iep.programs.get_num_inputs(fn_str) - 1
            print(fn_str)
            if num_inputs == 0:
                break
def main(args):
    """Encode multi-view questions (plus an OCR vocab) into an HDF5 file.

    Attributes read from ``args``: ``input_vocab_json``, ``output_vocab_json``,
    ``output_h5_file``, ``multi_dir``, ``input_questions_json``,
    ``input_scenes_json``, ``binary_qs_only``, ``expand_vocab``,
    ``unk_threshold``, ``mode``, ``num_views`` and ``encode_unk``.

    With ``multi_dir`` set, questions/scenes are read from numbered
    sub-directories chosen by whether ``output_h5_file`` contains "train"
    (0-24), "val" (25-26) or neither (27-29); otherwise the two input paths
    are read as single JSON files. Every question is emitted once per view
    (``num_views`` times) with consecutive image indices.

    When ``output_vocab_json`` is given, the vocab is dumped there and the
    OCR tokens are additionally written, one per line, to the same path with
    the extension replaced by ``.txt``.

    Returns ``None``. Returns early (after printing a message) when neither
    vocab path is given.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    if "train" in args.output_h5_file and args.multi_dir:
        subdirs = list(range(25))
    elif "val" in args.output_h5_file and args.multi_dir:
        subdirs = [25, 26]
    elif args.multi_dir:
        subdirs = [27, 28, 29]
    else:
        subdirs = []

    questions = []
    scenes = []
    for subdir in subdirs:
        question_path = os.path.join(args.input_questions_json, str(subdir),
                                     "questions.json")
        scene_path = os.path.join(args.input_scenes_json, str(subdir),
                                  "scenes.json")
        with open(scene_path, "r") as f:
            ss = json.load(f)['scenes']
        for s in ss:
            s['cc']['subdir'] = subdir
        scenes.extend(ss)
        with open(question_path, "r") as f:
            qs = json.load(f)['questions']
        for q in qs:
            q['subdir'] = subdir
        questions.extend(qs)

    # Single-file fallback (no multi_dir, or the sub-directories were empty).
    if not questions:
        with open(args.input_questions_json, "r") as f:
            questions = json.load(f)['questions']
    if not scenes:
        with open(args.input_scenes_json, "r") as f:
            scenes = json.load(f)['scenes']

    if args.binary_qs_only:
        questions = [
            q for q in tqdm(questions)
            if q['answer'] in [True, False] and q['question'] != "?"
        ]

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab(
                (str(q['answer']) for q in questions), answers_only=True)
        else:
            # Previously left unbound, which raised NameError below.
            answer_token_to_idx = None
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.']
        )
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)

        # Text bodies of every object in every view of every scene.
        all_scene_text = [
            obj['text']['body']
            for scene in scenes
            for view_struct in scene.values()
            for obj in view_struct['objects']
        ]
        ocr_to_idx = build_vocab(all_scene_text)

        vocab = {
            'ocr_to_idx': ocr_to_idx,
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new words can be merged in.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            known_words = vocab['question_token_to_idx']
            for word in new_vocab['question_token_to_idx']:
                if word not in known_words:
                    print('Found new word %s' % word)
                    known_words[word] = len(known_words)
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        # splitext only strips the final extension, so dotted directories
        # such as "./data/vocab.json" map to "./data/vocab.txt". The former
        # `is not ".txt"` identity test was always true and wrote a stray
        # ".txt" file when no output path was given.
        vocab_out_path = os.path.splitext(args.output_vocab_json)[0] + ".txt"
        with open(vocab_out_path, "w") as out_file:
            for word in vocab['ocr_to_idx']:
                out_file.write(word + "\n")

        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    baseline = questions[0]['image_index']
    for orig_idx, q in enumerate(questions):
        # We need to ask the same question about each view of the scene.
        # NOTE(review): `q.get("subdir")` is falsy for subdir 0, so questions
        # from sub-directory 0 take the `else` branch (no baseline
        # subtraction). Kept as-is; confirm whether that is intended.
        if q.get("subdir"):
            offset = q['image_index'] - baseline
        else:
            offset = q['image_index']

        # Tokenise/encode once per question rather than once per view.
        question_tokens = tokenize(q['question'],
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        program_encoded = None
        if 'program' in q:
            program_str = program_to_str(q['program'], args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])

        for view in range(args.num_views):
            orig_idxs.append(orig_idx)
            image_idxs.append(offset + view)
            if 'question_family_index' in q:
                question_families.append(q['question_family_index'])
            # Separate list objects per view: they are padded in place below.
            questions_encoded.append(list(question_encoded))
            if program_encoded is not None:
                programs_encoded.append(list(program_encoded))

            if 'answer' in q:
                try:
                    answers.append(
                        vocab['answer_token_to_idx'][str(q['answer'])])
                except (KeyError, TypeError) as e:
                    # Best-effort, as before: report and continue. Note that
                    # a skipped answer leaves `answers` shorter than
                    # `questions`.
                    print(e)

    # Pad encoded questions and programs to a common length with <NULL>.
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        shortfall = max_question_length - len(qe)
        if shortfall > 0:
            qe.extend([vocab['question_token_to_idx']['<NULL>']] * shortfall)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            shortfall = max_program_length - len(pe)
            if shortfall > 0:
                pe.extend([vocab['program_token_to_idx']['<NULL>']] * shortfall)

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
def main(args):
    """Encode a GQA-style questions JSON (dict keyed by question id) into HDF5.

    Attributes read from ``args``: ``input_vocab_json``, ``output_vocab_json``,
    ``input_questions_json``, ``input_filter_questions_json``,
    ``expand_vocab``, ``unk_threshold``, ``mode``, ``encode_unk`` and
    ``output_h5_file``.

    The input maps question id -> record with ``question``, ``answer`` and
    ``imageId``. Question ids listed (one per line) in the optional filter
    file are dropped. Image ids are remapped to dense indices starting at
    zero, in sorted order of the ids that remain.

    With ``expand_vocab == 1`` both unseen question tokens and unseen answers
    are appended to the loaded vocab.

    Returns ``None``. Returns early (after printing a message) when neither
    vocab path is given.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        data = json.load(f)

    # Ids to exclude; a set makes the per-question membership test O(1).
    # An empty/unset option (the original compared against []) means
    # "no filtering".
    excluded_ids = set()
    if args.input_filter_questions_json:
        with open(args.input_filter_questions_json, 'r') as fq:
            excluded_ids = set(fq.read().splitlines())

    # Resulting question keys: question, answer, index, image_index.
    imgs_idxs = set()
    questions = []
    for idx, question in data.items():
        if idx in excluded_ids:
            continue
        img_idx = question['imageId']
        imgs_idxs.add(img_idx)
        questions.append({
            'question': question['question'],
            'answer': question['answer'],
            'index': int(idx),
            'image_index': img_idx,
        })

    # Remap image ids to dense indices starting from zero.
    mapper = {x: i for i, x in enumerate(sorted(imgs_idxs))}
    for q in questions:
        q['image_index'] = mapper[q['image_index']]

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            # Empty delim keeps each whole answer as a single token.
            answer_token_to_idx = build_vocab(
                (q['answer'] for q in questions), delim='')
        else:
            # Previously left unbound, which raised NameError below.
            answer_token_to_idx = None
        question_token_to_idx = build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab so its new entries can be merged.
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            known_words = vocab['question_token_to_idx']
            for word in new_vocab['question_token_to_idx']:
                if word not in known_words:
                    print('Found new word %s' % word)
                    known_words[word] = len(known_words)
                    num_new_words += 1

            # Different splits may contain different answers, so the answer
            # vocab is extended as well.
            num_new_answers = 0
            known_answers = vocab['answer_token_to_idx']
            for word in new_vocab['answer_token_to_idx'] or ():
                if word not in known_answers:
                    print('Found new answer %s' % word)
                    known_answers[word] = len(known_answers)
                    num_new_answers += 1

            print('Found %d new words' % num_new_words)
            print('Found %d new answers' % num_new_answers)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(q['question'],
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        questions_encoded.append(
            encode(question_tokens,
                   vocab['question_token_to_idx'],
                   allow_unk=args.encode_unk == 1))

        if 'program' in q:
            program_str = program_to_str(q['program'], args.mode)
            program_tokens = tokenize(program_str)
            programs_encoded.append(
                encode(program_tokens, vocab['program_token_to_idx']))

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs to a common length with <NULL>.
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        shortfall = max_question_length - len(qe)
        if shortfall > 0:
            qe.extend([vocab['question_token_to_idx']['<NULL>']] * shortfall)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            shortfall = max_program_length - len(pe)
            if shortfall > 0:
                pe.extend([vocab['program_token_to_idx']['<NULL>']] * shortfall)

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
def run_single_example(args, model, image_filepath=None, counting=True):
    """Answer one question about one image and return prediction confidences.

    Attributes read from ``args``: ``use_gpu``, ``image``, ``image_height``,
    ``image_width``, ``question``, ``temperature`` and ``sample_argmax``;
    ``args`` is also handed to ``build_cnn`` and ``load_vocab``.

    Args:
        args: parsed options (see above).
        model: either a ``(program_generator, execution_engine)`` tuple or a
            single module called as ``model(question_var, feats_var)``.
        image_filepath: image to use instead of ``args.image`` when given.
        counting: selects the return value (see below).

    Returns:
        When ``counting`` is true, a list with the softmax probability of
        every answer index in the module-level ``CONSTRAINED_INDECES``
        (initialised through ``set_constrained_indeces(vocab)`` on first use).
        Otherwise the result of ``probability.max(dim=0)`` for the top answer.
    """
    global CONSTRAINED_INDECES

    dtype = torch.FloatTensor
    if args.use_gpu == 1:
        dtype = torch.cuda.FloatTensor

    # Build the CNN to use for feature extraction
    print('Loading CNN for feature extraction')
    cnn = build_cnn(args, dtype)

    # Load and preprocess the image
    if image_filepath is None:
        image_path = args.image
    else:
        print ("Found image")
        image_path = image_filepath
    img = Image.open(image_path).convert('RGB')
    # PIL's resize takes (width, height); passing (height, width) transposed
    # the target size whenever the two differ.
    img = img.resize((args.image_width, args.image_height),
                     resample=Image.BICUBIC)
    img = np.array(img)
    img = img.transpose(2, 0, 1)[None]
    mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
    # NOTE(review): the last std entry is 0.224; left untouched so features
    # stay consistent with whatever preprocessing the model was trained on.
    std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1)
    img = (img.astype(np.float32) / 255.0 - mean) / std

    # Use CNN to extract features for the image (no gradients needed).
    with torch.no_grad():
        img_var = Variable(torch.FloatTensor(img).type(dtype))
        feats_var = cnn(img_var)

    # Tokenize the question
    vocab = load_vocab(args)
    question_tokens = tokenize(args.question,
                               punct_to_keep=[';', ','],
                               punct_to_remove=['?', '.'])
    question_encoded = encode(question_tokens,
                              vocab['question_token_to_idx'],
                              allow_unk=True)
    question_encoded = torch.LongTensor(question_encoded).view(1, -1)
    question_encoded = question_encoded.type(dtype).long()
    question_var = Variable(question_encoded)

    if CONSTRAINED_INDECES is None:
        set_constrained_indeces(vocab)

    # Run the model
    print('Running the model\n')
    scores = None
    predicted_program = None
    if isinstance(model, tuple):
        program_generator, execution_engine = model
        program_generator.type(dtype)
        execution_engine.type(dtype)
        predicted_program = program_generator.reinforce_sample(
            question_var,
            temperature=args.temperature,
            argmax=(args.sample_argmax == 1))
        scores = execution_engine(feats_var, predicted_program)
    else:
        model.type(dtype)
        scores = model(question_var, feats_var)

    # scores is indexed as [batch][answer] below, so normalise over axis 1
    # explicitly (the implicit-dim form of softmax is deprecated).
    probability = F.softmax(scores, dim=1).data.cpu()[0]

    # Print results
    _, predicted_answer_idx = scores.data.cpu()[0].max(dim=0)
    print (predicted_answer_idx)
    predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx.item()]

    print('Question: "%s"' % args.question)
    print('Predicted answer: ', predicted_answer)
    print('Predicted answer confidence: ', probability.max(dim=0))

    if predicted_program is not None:
        print()
        print('Predicted program:')
        program = predicted_program.data.cpu()[0]
        # Walk the program until every pending input slot has been filled.
        num_inputs = 1
        for fn_idx in program:
            fn_str = vocab['program_idx_to_token'][fn_idx.item()]
            num_inputs += iep.programs.get_num_inputs(fn_str) - 1
            print(fn_str)
            if num_inputs == 0:
                break

    # return the probabilities for all choices when counting
    if counting:
        return [probability[i] for i in CONSTRAINED_INDECES]
    return probability.max(dim=0)
def compute_saliency_map(args, model, image_filepath=None, counting=True,
                         smoothgrad=True, output_dir="../output/"):
    """Compute, plot and save a saliency map for one image/question pair.

    Note the saliency map is computed with respect to the classification.
    If the query is "how many spheres?", and the scene contains 1 sphere, the
    saliency map should highlight the object which most contributes to this
    classification of 1 sphere. The saliency map would be different for a
    different query (e.g., "how many cubes?").

    Attributes read from ``args``: ``use_gpu``, ``image`` and ``question``;
    ``args`` is also handed to ``load_vocab`` and ``build_cnn``.

    Args:
        args: parsed options (see above).
        model: forwarded to ``get_smoothed_mask`` and ``run_model``.
        image_filepath: image to use instead of ``args.image`` when given.
        counting: selects the return value (see below).
        smoothgrad: accepted for interface compatibility; not read by this
            function (the mask always comes from ``get_smoothed_mask``).
        output_dir: directory (created if missing) that receives ``out.png``.

    Returns:
        When ``counting`` is true, a list with the softmax probability of
        every answer index in the module-level ``CONSTRAINED_INDECES``;
        otherwise the result of ``probability.max(dim=0)``.
    """
    global CONSTRAINED_INDECES

    # Make sure the output directory exists (including missing parents).
    os.makedirs(output_dir, exist_ok=True)

    dtype = torch.FloatTensor
    if args.use_gpu == 1:
        dtype = torch.cuda.FloatTensor

    # Tokenize the question
    vocab = load_vocab(args)
    question_tokens = tokenize(args.question,
                               punct_to_keep=[';', ','],
                               punct_to_remove=['?', '.'])
    question_encoded = encode(question_tokens,
                              vocab['question_token_to_idx'],
                              allow_unk=True)
    question_encoded = torch.LongTensor(question_encoded).view(1, -1)
    question_encoded = question_encoded.type(dtype).long()
    question_var = Variable(question_encoded)

    if CONSTRAINED_INDECES is None:
        set_constrained_indeces(vocab)

    # Build the CNN to use for feature extraction
    print('Loading CNN for feature extraction')
    cnn = build_cnn(args, dtype)
    cnn.eval()

    # Load and preprocess the image. The explicit path, when given, now
    # actually takes precedence (it was previously ignored in favour of
    # args.image).
    if image_filepath is None:
        image_path = args.image
    else:
        print ("Found image")
        image_path = image_filepath
    img = Image.open(image_path).convert('RGB')

    img_var = preprocess(img)
    img_var = img_var.type(dtype)

    # NOTE(review): 'cuda' is passed unconditionally, even when
    # args.use_gpu != 1; confirm what get_smoothed_mask does with it.
    saliency = get_smoothed_mask(
        model, cnn, question_var, img_var, dtype, 'cuda'
    )[0].cpu().detach().numpy()

    img = deprocess_img(img_var)

    # Three panels: image, saliency map, saliency overlaid on the image.
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
    ax1.imshow(np.asarray(img))
    ax1.axis('off')
    ax2.imshow(saliency, cmap=plt.cm.gist_heat)
    ax2.axis('off')
    ax3.imshow(np.asarray(img), alpha=0.5)
    ax3.imshow(saliency, cmap=plt.cm.gist_heat, alpha=0.7)
    ax3.axis('off')
    # os.path.join works whether or not output_dir ends with a separator.
    plt.savefig(os.path.join(output_dir, 'out.png'),
                bbox_inches='tight', pad_inches=0)
    plt.show()

    scores, predicted_program = run_model(model, cnn, question_var, img_var,
                                          dtype)
    # scores is indexed as [batch][answer] below, so normalise over axis 1
    # explicitly (the implicit-dim form of softmax is deprecated).
    probability = F.softmax(scores, dim=1).data.cpu()[0]

    # Print results
    _, predicted_answer_idx = scores.data.cpu()[0].max(dim=0)
    print (predicted_answer_idx)
    predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx.item()]
    print ("Predicted answer list: " + str(vocab['answer_idx_to_token']))

    print('Question: "%s"' % args.question)
    print('Predicted answer: ', predicted_answer)
    # NOTE(review): indices 4 and 5 are presumably the vocab slots of the
    # answers "0" and "1"; verify against the loaded vocab.
    print('Confidence - 0: ', probability[4])
    print('Confidence - 1: ', probability[5])
    print('Predicted answer confidence: ', probability.max(dim=0))

    if predicted_program is not None:
        print()
        print('Predicted program:')
        program = predicted_program.data.cpu()[0]
        # Walk the program until every pending input slot has been filled.
        num_inputs = 1
        for fn_idx in program:
            fn_str = vocab['program_idx_to_token'][fn_idx.item()]
            num_inputs += iep.programs.get_num_inputs(fn_str) - 1
            print(fn_str)
            if num_inputs == 0:
                break

    # return the probabilities for all choices when counting
    if counting:
        return [probability[i] for i in CONSTRAINED_INDECES]
    return probability.max(dim=0)