def preprocess(model):
    if args.sw_name.startswith("clevr"):
        program_prefix = vr.programs.list_to_prefix(model["program"])
    else:
        program_prefix = clevr_util.parse_program(mode=0, model=model)
    program_str = vr.programs.list_to_str(program_prefix)
    program_tokens = tokenize(program_str)
    program_encoded = encode(program_tokens, program_token_to_idx)
    program_encoded += [
        program_token_to_idx["<NULL>"]
        for _ in range(27 - len(program_encoded))
    ]
    return np.asarray(program_encoded, dtype=np.int64)
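# Usage sketch (hypothetical input, for illustration only): `preprocess` relies on
# the module-level `args`, `program_token_to_idx`, and the vr/clevr_util helpers
# used above, and pads every encoded program to a fixed length of 27 tokens.
#
#   model = {"program": [{"function": "scene", "inputs": [], "value_inputs": []}]}
#   program_array = preprocess(model)  # np.int64 array of length 27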
def run_single_example(args, model, dtype, question_raw, feats_var=None):
    interactive = feats_var is not None
    if not interactive:
        feats_var = extract_image_features(args, dtype)

    # Tokenize the question
    vocab = load_vocab(args)
    question_tokens = tokenize(question_raw,
                               punct_to_keep=[';', ','],
                               punct_to_remove=['?', '.'])
    if args.enforce_clevr_vocab == 1:
        for word in question_tokens:
            if word not in vocab['question_token_to_idx']:
                print(colored(
                    'No one taught me what "%s" means :( Try me again!' % (word),
                    'magenta'))
                return
    question_encoded = encode(question_tokens,
                              vocab['question_token_to_idx'],
                              allow_unk=True)
    question_encoded = torch.LongTensor(question_encoded).view(1, -1)
    question_encoded = question_encoded.type(dtype).long()
    question_var = Variable(question_encoded, volatile=False)

    # Run the model
    scores = None
    predicted_program = None
    if type(model) is tuple:
        pg, ee = model
        pg.type(dtype)
        pg.eval()
        ee.type(dtype)
        ee.eval()
        if args.model_type == 'FiLM':
            predicted_program = pg(question_var)
        else:
            predicted_program = pg.reinforce_sample(
                question_var,
                temperature=args.temperature,
                argmax=(args.sample_argmax == 1))
        programs[question_raw] = predicted_program
        if args.debug_every <= -1:
            pdb.set_trace()
        scores = ee(feats_var, predicted_program, save_activations=True)
    else:
        model.type(dtype)
        scores = model(question_var, feats_var)

    # Print results
    predicted_probs = scores.data.cpu()
    _, predicted_answer_idx = predicted_probs[0].max(dim=0)
    predicted_probs = F.softmax(Variable(predicted_probs[0])).data
    predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx[0]]

    answers_to_probs = {}
    for i in range(len(vocab['answer_idx_to_token'])):
        answers_to_probs[vocab['answer_idx_to_token'][i]] = predicted_probs[i]
    answers_to_probs_sorted = sorted(answers_to_probs.items(),
                                     key=lambda x: x[1])
    answers_to_probs_sorted.reverse()
    for i in range(len(answers_to_probs_sorted)):
        if (answers_to_probs_sorted[i][1] >= 1e-3
                and args.debug_every < float('inf')):
            print("%s: %.1f%%" % (answers_to_probs_sorted[i][0].capitalize(),
                                  100 * answers_to_probs_sorted[i][1]))

    if not interactive:
        print(colored('Question: "%s"' % question_raw, 'cyan'))
    print(colored(str(predicted_answer).capitalize(), 'magenta'))
    if interactive:
        return

    # Visualize gradients w.r.t. output
    cf_conv = ee.classifier[0](ee.cf_input)
    cf_bn = ee.classifier[1](cf_conv)
    pre_pool = ee.classifier[2](cf_bn)
    pooled = ee.classifier[3](pre_pool)
    pre_pool_max_per_c = pre_pool.max(2)[0].max(3)[0].expand_as(pre_pool)
    pre_pool_masked = (pre_pool_max_per_c == pre_pool).float() * pre_pool
    pool_feat_locs = (pre_pool_masked > 0).float().sum(1)
    if args.debug_every <= 1:
        pdb.set_trace()

    if args.output_viz_dir != 'NA':
        viz_dir = args.output_viz_dir + question_raw + ' ' + predicted_answer
        if not os.path.isdir(viz_dir):
            os.mkdir(viz_dir)
        args.viz_dir = viz_dir
        print('Saving visualizations to ' + args.viz_dir)

    # Backprop w.r.t. sum of output scores - What affected prediction most?
    ee.feats.register_hook(save_grad('stem'))
    for i in range(ee.num_modules):
        ee.module_outputs[i].register_hook(save_grad('m' + str(i)))
    scores_sum = scores.sum()
    scores_sum.backward()

    # Visualizations!
    visualize(feats_var, args, 'resnet101')
    visualize(ee.feats, args, 'conv-stem')
    visualize(grads['stem'], args, 'grad-conv-stem')
    for i in range(ee.num_modules):
        visualize(ee.module_outputs[i], args, 'resblock' + str(i))
        visualize(grads['m' + str(i)], args, 'grad-resblock' + str(i))
    visualize(pre_pool, args, 'pre-pool')
    visualize(pool_feat_locs, args, 'pool-feature-locations')

    if (predicted_program is not None) and (args.model_type != 'FiLM'):
        print()
        print('Predicted program:')
        program = predicted_program.data.cpu()[0]
        num_inputs = 1
        for fn_idx in program:
            fn_str = vocab['program_idx_to_token'][fn_idx]
            num_inputs += vr.programs.get_num_inputs(fn_str) - 1
            print(fn_str)
            if num_inputs == 0:
                break
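# Invocation sketch (hypothetical values, for illustration only): `model` may be
# a single end-to-end module or a (program generator, execution engine) tuple,
# as loaded elsewhere in this script.
#
#   dtype = torch.cuda.FloatTensor  # or torch.FloatTensor on CPU
#   run_single_example(args, (pg, ee), dtype, 'How many red cubes are there?')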
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    questions_encoded_bert = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        questions_encoded_bert.append(bert_tokenizer.encode(question.lower()))

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    max_question_length_bert = max(len(x) for x in questions_encoded_bert)
    pad_token_bert = 0
    for qe in questions_encoded_bert:
        while len(qe) < max_question_length_bert:
            qe.append(pad_token_bert)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_encoded_bert = np.asarray(questions_encoded_bert, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(questions_encoded_bert.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('questions_bert', data=questions_encoded_bert)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
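# Invocation sketch for this preprocessing entry point (placeholder paths and
# flag values; the attributes mirror what `main` reads from `args` above):
#
#   import argparse
#   args = argparse.Namespace(
#       input_questions_json='CLEVR_train_questions.json',
#       input_vocab_json='', output_vocab_json='vocab.json', expand_vocab=0,
#       unk_threshold=1, encode_unk=0, mode='prefix',
#       output_h5_file='train_questions.h5')
#   main(args)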
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data from', args.input_questions_json)
    if args.q_family_shift and len(args.q_family_shift):
        if len(args.q_family_shift) != len(args.input_questions_json):
            raise ValueError("shift must be provided for each question file")
        q_family_shifts = args.q_family_shift
    else:
        q_family_shifts = [0] * len(args.input_questions_json)
    questions = []
    for q_file, shift in zip(args.input_questions_json, q_family_shifts):
        print(q_file)
        with open(q_file, 'r') as f:
            more_questions = json.load(f)['questions']
        for q in more_questions:
            q['question_family_index'] += shift
        questions.extend(more_questions)

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

        def arity(name):
            if name == 'scene':
                return 0
            if 'equal' in name or name in [
                    'union', 'intersect', 'less_than', 'greater_than'
            ]:
                return 2
            return 1

        vocab['program_token_arity'] = {
            name: arity(name) for name in program_token_to_idx
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
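# Invocation sketch for the multi-file variant (placeholder paths and shifts):
# `input_questions_json` is a list of files and `q_family_shift` provides one
# offset per file so question_family_index values from different files do not
# collide.
#
#   import argparse
#   args = argparse.Namespace(
#       input_questions_json=['trainA_questions.json', 'trainB_questions.json'],
#       q_family_shift=[0, 100],
#       input_vocab_json='', output_vocab_json='vocab.json', expand_vocab=0,
#       unk_threshold=1, encode_unk=0, mode='prefix',
#       output_h5_file='train_questions.h5')
#   main(args)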
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', default='prefix',
                        choices=['chain', 'prefix', 'postfix'])
    parser.add_argument('--shapes_data', type=str,
                        help="Path to the SHAPES dataset")
    parser.add_argument('--size', type=str,
                        help="Which version of the training set to use")
    args = parser.parse_args()

    parts = ['train', 'val', 'test']
    part_prefixes = ['train.' + args.size, 'val', 'test']
    part_prefixes = [
        os.path.join(args.shapes_data, prefix) for prefix in part_prefixes
    ]

    for part, prefix in zip(parts, part_prefixes):
        image_path = prefix + '.input.npy'
        images = numpy.load(image_path)

        questions_path = prefix + '.query_str.txt'
        questions_encoded = []
        with open(questions_path) as src:
            questions = [str_ for str_ in src]
            if part == 'train':
                question_vocab = build_vocab(questions, delim=None)
            for qe in questions:
                tkn = tokenize(qe, delim=None)
                questions_encoded.append(
                    encode(tkn, question_vocab, allow_unk=True))
        max_question_length = max(len(x) for x in questions_encoded)
        for qe in questions_encoded:
            while len(qe) < max_question_length:
                qe.append(question_vocab['<NULL>'])

        answers_path = prefix + '.output'
        with open(answers_path) as src:
            answers = [1 if w.strip() == 'true' else 0 for w in src]

        programs_path = prefix + '.query'
        all_program_strs = []
        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                if program_str is not None:
                    all_program_strs.append(program_str)
        if part == 'train':
            program_vocab = build_vocab(all_program_strs)

        programs_encoded = []
        programs_arities = []
        programs_depths = []
        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                program_tokens = tokenize(program_str, delim=None)
                program_encoded = encode(program_tokens, program_vocab,
                                         allow_unk=True)
                programs_encoded.append(program_encoded)
                programs_arities.append(program_to_arity(program, args.mode))
                programs_depths.append(program_to_depth(program, args.mode))

        if len(programs_encoded) > 0:
            max_program_length = max(len(x) for x in programs_encoded)
            for pe in programs_encoded:
                while len(pe) < max_program_length:
                    pe.append(program_vocab['<NULL>'])

            max_program_arity_length = max(len(x) for x in programs_arities)
            for ar in programs_arities:
                while len(ar) < max_program_arity_length:
                    ar.append(-1)

            max_program_depth_length = max(len(x) for x in programs_depths)
            for de in programs_depths:
                while len(de) < max_program_depth_length:
                    de.append(-1)

            assert (max_program_length == max_program_arity_length) and (
                max_program_length == max_program_depth_length)

        # Create h5 file
        print('Writing output')
        questions_encoded = numpy.asarray(questions_encoded, dtype=numpy.int32)
        programs_encoded = numpy.asarray(programs_encoded, dtype=numpy.int32)
        programs_arities = numpy.asarray(programs_arities, dtype=numpy.int32)
        programs_depths = numpy.asarray(programs_depths, dtype=numpy.int32)
        print(questions_encoded.shape)
        print(programs_encoded.shape)
        print(programs_arities.shape)
        print(programs_depths.shape)

        with h5py.File(part + '_features.h5', 'w') as f:
            features = images.transpose(0, 3, 1, 2) / 255.0
            features_dataset = f.create_dataset('features', (features.shape),
                                                dtype=numpy.float32)
            features_dataset[:] = features

        with h5py.File(part + '_questions.h5', 'w') as f:
            f.create_dataset('questions', data=questions_encoded)
            image_idxs_dataset = f.create_dataset('image_idxs',
                                                  (len(questions_encoded), ),
                                                  dtype=numpy.int32)
            image_idxs_dataset[:] = range(len(questions_encoded))
            if len(programs_encoded) > 0:
                f.create_dataset('programs', data=programs_encoded)
                f.create_dataset('programs_arities', data=programs_arities)
                f.create_dataset('programs_depths', data=programs_depths)
            if len(answers) > 0:
                f.create_dataset('answers', data=numpy.asarray(answers))

    with open('vocab.json', 'w') as f:
        json.dump(
            {
                'question_token_to_idx': question_vocab,
                'program_token_to_idx': program_vocab,
                'answer_token_to_idx': {
                    'false': 0,
                    'true': 1
                }
            }, f)
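# Command-line sketch (script name and dataset path are placeholders;
# --shapes_data, --size, and --mode are the flags defined above). Outputs are
# written to <part>_features.h5, <part>_questions.h5, and vocab.json in the
# working directory.
#
#   python preprocess_shapes.py --shapes_data /path/to/shapes --size large --mode prefix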
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = []
        for line in f:
            questions.append(json.loads(line))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'label' in questions[0]:
            answer_token_to_idx = build_vocab((q['label'] for q in questions))
        question_token_to_idx = build_vocab((q['sentence'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['sentence']
        orig_idxs.append(orig_idx)
        if "LEFT" in q["image_attention"]:
            # LEFT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img0")
                            if c in digits)))
        else:
            # RIGHT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img1")
                            if c in digits)))
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'label' in q:
            answers.append(vocab['answer_token_to_idx'][q['label']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        print(image_idxs)
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
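# Input sketch for this variant (hypothetical record, for illustration only):
# the questions file is JSON Lines, one example per line, with NLVR2-style
# fields. The digits of `identifier` plus an "-img0"/"-img1" suffix are used to
# form image_idxs.
#
#   {"sentence": "There are two dogs.", "label": "True",
#    "identifier": "split-1234-0-0", "image_attention": "LEFT IMG"}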