def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    questions_encoded_bert = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        questions_encoded_bert.append(bert_tokenizer.encode(question.lower()))

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    max_question_length_bert = max(len(x) for x in questions_encoded_bert)
    pad_token_bert = 0
    for qe in questions_encoded_bert:
        while len(qe) < max_question_length_bert:
            qe.append(pad_token_bert)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_encoded_bert = np.asarray(questions_encoded_bert, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(questions_encoded_bert.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('questions_bert', data=questions_encoded_bert)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
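# Hedged sketch: an argument parser consistent with the args.* attributes read
# by the main() above. Flag names mirror the attribute names; types and
# defaults are assumptions, except where a comparison in the code implies them.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_questions_json', required=True)
parser.add_argument('--input_vocab_json', default='')
parser.add_argument('--output_vocab_json', default='')
parser.add_argument('--expand_vocab', type=int, default=0)     # compared to 1
parser.add_argument('--unk_threshold', type=int, default=1)    # min_token_count
parser.add_argument('--encode_unk', type=int, default=0)       # compared to 1
parser.add_argument('--mode', default='prefix',
                    choices=['chain', 'prefix', 'postfix'])
parser.add_argument('--output_h5_file', required=True)
# main(parser.parse_args())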
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', default='prefix',
                        choices=['chain', 'prefix', 'postfix'])
    parser.add_argument('--shapes_data', type=str,
                        help="Path to the SHAPES dataset")
    parser.add_argument('--size', type=str,
                        help="Which version of the training set to use")
    args = parser.parse_args()

    parts = ['train', 'val', 'test']
    part_prefixes = ['train.' + args.size, 'val', 'test']
    part_prefixes = [
        os.path.join(args.shapes_data, prefix) for prefix in part_prefixes
    ]

    for part, prefix in zip(parts, part_prefixes):
        image_path = prefix + '.input.npy'
        images = numpy.load(image_path)

        questions_path = prefix + '.query_str.txt'
        questions_encoded = []
        with open(questions_path) as src:
            questions = [str_ for str_ in src]
        if part == 'train':
            question_vocab = build_vocab(questions, delim=None)
        for qe in questions:
            tkn = tokenize(qe, delim=None)
            questions_encoded.append(
                encode(tkn, question_vocab, allow_unk=True))
        max_question_length = max(len(x) for x in questions_encoded)
        for qe in questions_encoded:
            while len(qe) < max_question_length:
                qe.append(question_vocab['<NULL>'])

        answers_path = prefix + '.output'
        with open(answers_path) as src:
            answers = [1 if w.strip() == 'true' else 0 for w in src]

        programs_path = prefix + '.query'
        all_program_strs = []
        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                if program_str is not None:
                    all_program_strs.append(program_str)
        if part == 'train':
            program_vocab = build_vocab(all_program_strs)

        programs_encoded = []
        programs_arities = []
        programs_depths = []
        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                program_tokens = tokenize(program_str, delim=None)
                program_encoded = encode(program_tokens, program_vocab,
                                         allow_unk=True)
                programs_encoded.append(program_encoded)
                programs_arities.append(program_to_arity(program, args.mode))
                programs_depths.append(program_to_depth(program, args.mode))

        if len(programs_encoded) > 0:
            max_program_length = max(len(x) for x in programs_encoded)
            for pe in programs_encoded:
                while len(pe) < max_program_length:
                    pe.append(program_vocab['<NULL>'])

            max_program_arity_length = max(len(x) for x in programs_arities)
            for ar in programs_arities:
                while len(ar) < max_program_arity_length:
                    ar.append(-1)

            max_program_depth_length = max(len(x) for x in programs_depths)
            for de in programs_depths:
                while len(de) < max_program_depth_length:
                    de.append(-1)

            assert (max_program_length == max_program_arity_length) and (
                max_program_length == max_program_depth_length)

        # Create h5 file
        print('Writing output')
        questions_encoded = numpy.asarray(questions_encoded, dtype=numpy.int32)
        programs_encoded = numpy.asarray(programs_encoded, dtype=numpy.int32)
        programs_arities = numpy.asarray(programs_arities, dtype=numpy.int32)
        programs_depths = numpy.asarray(programs_depths, dtype=numpy.int32)
        print(questions_encoded.shape)
        print(programs_encoded.shape)
        print(programs_arities.shape)
        print(programs_depths.shape)

        with h5py.File(part + '_features.h5', 'w') as f:
            features = images.transpose(0, 3, 1, 2) / 255.0
            features_dataset = f.create_dataset('features', (features.shape),
                                                dtype=numpy.float32)
            features_dataset[:] = features

        with h5py.File(part + '_questions.h5', 'w') as f:
            f.create_dataset('questions', data=questions_encoded)
            image_idxs_dataset = f.create_dataset('image_idxs',
                                                  (len(questions_encoded), ),
                                                  dtype=numpy.int32)
            image_idxs_dataset[:] = range(len(questions_encoded))
            if len(programs_encoded) > 0:
                f.create_dataset('programs', data=programs_encoded)
                f.create_dataset('programs_arities', data=programs_arities)
                f.create_dataset('programs_depths', data=programs_depths)
            if len(answers) > 0:
                f.create_dataset('answers', data=numpy.asarray(answers))

    with open('vocab.json', 'w') as f:
        json.dump(
            {
                'question_token_to_idx': question_vocab,
                'program_token_to_idx': program_vocab,
                'answer_token_to_idx': {
                    'false': 0,
                    'true': 1
                }
            }, f)
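# Hedged usage sketch: reading back the files the SHAPES main() above writes.
# File names follow the part + '_features.h5' / part + '_questions.h5' pattern
# and the dataset names come from the create_dataset calls; everything else
# (the helper name, the default part) is illustrative.
import h5py

def load_shapes_part(part='train'):
    with h5py.File(part + '_features.h5', 'r') as f:
        features = f['features'][:]        # channel-first images scaled to [0, 1]
    with h5py.File(part + '_questions.h5', 'r') as f:
        questions = f['questions'][:]      # int32, padded with question_vocab['<NULL>']
        image_idxs = f['image_idxs'][:]    # simply 0..N-1 for SHAPES
        programs = f['programs'][:]        # padded with program_vocab['<NULL>']
        arities = f['programs_arities'][:]  # padded with -1
        depths = f['programs_depths'][:]    # padded with -1
        answers = f['answers'][:]           # 1 = 'true', 0 = 'false'
    return features, questions, image_idxs, programs, arities, depths, answers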
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data from', args.input_questions_json)
    if args.q_family_shift and len(args.q_family_shift):
        if len(args.q_family_shift) != len(args.input_questions_json):
            raise ValueError("shift must be provided for each question file")
        q_family_shifts = args.q_family_shift
    else:
        q_family_shifts = [0] * len(args.input_questions_json)
    questions = []
    for q_file, shift in zip(args.input_questions_json, q_family_shifts):
        print(q_file)
        with open(q_file, 'r') as f:
            more_questions = json.load(f)['questions']
            for q in more_questions:
                q['question_family_index'] += shift
            questions.extend(more_questions)

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

        def arity(name):
            if name == 'scene':
                return 0
            if 'equal' in name or name in [
                    'union', 'intersect', 'less_than', 'greater_than'
            ]:
                return 2
            return 1

        vocab['program_token_arity'] = {
            name: arity(name) for name in program_token_to_idx
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
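# Hedged sketch of only the options that differ from the single-file variant:
# this main() zips args.input_questions_json with args.q_family_shift, so both
# would be declared as list-valued options (nargs='+'). Flag names mirror the
# attribute names; defaults are assumptions. The remaining options
# (--input_vocab_json, --output_vocab_json, --expand_vocab, --unk_threshold,
# --encode_unk, --mode, --output_h5_file) would follow the parser sketch after
# the first main() above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_questions_json', nargs='+', required=True)
parser.add_argument('--q_family_shift', nargs='+', type=int, default=[])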
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = []
        for line in f:
            questions.append(json.loads(line))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'label' in questions[0]:
            answer_token_to_idx = build_vocab((q['label'] for q in questions))
        question_token_to_idx = build_vocab((q['sentence'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []

    for orig_idx, q in enumerate(questions):
        question = q['sentence']

        orig_idxs.append(orig_idx)
        if "LEFT" in q["image_attention"]:
            # LEFT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img0")
                            if c in digits)))
        else:
            # RIGHT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img1")
                            if c in digits)))
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'label' in q:
            answers.append(vocab['answer_token_to_idx'][q['label']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i
    print(mapping)
    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        print(image_idxs)
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
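# Hedged illustration of the image-index derivation in the NLVR main() above:
# every digit of the identifier plus the '-img0'/'-img1' suffix is concatenated
# into a single integer, so the left and right image of the same example differ
# only in their final digit. The identifier below is made up for the example;
# real identifiers follow the dataset's own naming scheme.
from string import digits

example_id = 'train-1023-3-1'                   # hypothetical identifier
left_idx = int(''.join(c for c in (example_id + '-img0') if c in digits))
right_idx = int(''.join(c for c in (example_id + '-img1') if c in digits))
print(left_idx, right_idx)                      # 1023310 1023311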