# Shared imports for the examples below. BertTokenizer is assumed to come from
# the Hugging Face transformers package; helpers such as build_vocab, tokenize,
# encode, program_to_str, program_to_arity, program_to_depth, parse_tree,
# layout_from_parsing and layout_tree are assumed to come from the project's
# own preprocessing utilities.
import argparse
import json
import os
from string import digits

import h5py
import numpy
import numpy as np
from transformers import BertTokenizer


def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Test splits may not carry answers; fall back to an empty answer vocab.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    questions_encoded_bert = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        questions_encoded_bert.append(bert_tokenizer.encode(question.lower()))

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    max_question_length_bert = max(len(x) for x in questions_encoded_bert)
    pad_token_bert = 0
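    # Note: 0 is the [PAD] token id for bert-base-uncased; using
    # bert_tokenizer.pad_token_id here would avoid hard-coding it.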
    for qe in questions_encoded_bert:
        while len(qe) < max_question_length_bert:
            qe.append(pad_token_bert)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_encoded_bert = np.asarray(questions_encoded_bert, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(questions_encoded_bert.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i

    print(mapping)

    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('questions_bert', data=questions_encoded_bert)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
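The helpers build_vocab, tokenize, and encode used throughout these examples are not defined in the snippets; below is a minimal sketch of plausible implementations, assuming the usual <NULL>/<START>/<END>/<UNK> special tokens (an illustration, not the project's actual code):

SPECIAL_TOKENS = {'<NULL>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3}


def tokenize(s, delim=' ', punct_to_keep=None, punct_to_remove=None):
    # Split a string into tokens, optionally keeping or stripping punctuation.
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, ' %s' % p)
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    return s.split(delim)


def build_vocab(sequences, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None):
    # Count tokens over all sequences and keep those seen often enough.
    counts = {}
    for seq in sequences:
        for token in tokenize(seq, delim=delim, punct_to_keep=punct_to_keep,
                              punct_to_remove=punct_to_remove):
            counts[token] = counts.get(token, 0) + 1
    token_to_idx = dict(SPECIAL_TOKENS)
    for token in sorted(counts):
        if counts[token] >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx


def encode(tokens, token_to_idx, allow_unk=False):
    # Map tokens to indices, wrapping the sequence with <START> and <END>.
    seq_idx = [token_to_idx['<START>']]
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('Token "%s" not in vocab' % token)
            token = '<UNK>'
        seq_idx.append(token_to_idx[token])
    seq_idx.append(token_to_idx['<END>'])
    return seq_idx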
# Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode',
                        default='prefix',
                        choices=['chain', 'prefix', 'postfix'])
    parser.add_argument('--shapes_data',
                        type=str,
                        help="Path to the SHAPES dataset")
    parser.add_argument('--size',
                        type=str,
                        help="Which version of the training set to use")

    args = parser.parse_args()
    parts = ['train', 'val', 'test']
    part_prefixes = ['train.' + args.size, 'val', 'test']
    part_prefixes = [
        os.path.join(args.shapes_data, prefix) for prefix in part_prefixes
    ]

    for part, prefix in zip(parts, part_prefixes):
        image_path = prefix + '.input.npy'
        images = numpy.load(image_path)

        questions_path = prefix + '.query_str.txt'
        questions_encoded = []
        with open(questions_path) as src:
            questions = [str_ for str_ in src]
            if part == 'train':
                question_vocab = build_vocab(questions, delim=None)
            for qe in questions:
                tkn = tokenize(qe, delim=None)
                questions_encoded.append(
                    encode(tkn, question_vocab, allow_unk=True))
        max_question_length = max(len(x) for x in questions_encoded)
        for qe in questions_encoded:
            while len(qe) < max_question_length:
                qe.append(question_vocab['<NULL>'])

        answers_path = prefix + '.output'
        with open(answers_path) as src:
            answers = [1 if w.strip() == 'true' else 0 for w in src]

        programs_path = prefix + '.query'
        all_program_strs = []
        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                if program_str is not None:
                    all_program_strs.append(program_str)
        if part == 'train':
            program_vocab = build_vocab(all_program_strs)

        programs_encoded = []
        programs_arities = []
        programs_depths = []

        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                program_tokens = tokenize(program_str, delim=None)
                program_encoded = encode(program_tokens,
                                         program_vocab,
                                         allow_unk=True)
                programs_encoded.append(program_encoded)

                programs_arities.append(program_to_arity(program, args.mode))
                programs_depths.append(program_to_depth(program, args.mode))

        if len(programs_encoded) > 0:
            max_program_length = max(len(x) for x in programs_encoded)
            for pe in programs_encoded:
                while len(pe) < max_program_length:
                    pe.append(program_vocab['<NULL>'])

            max_program_arity_length = max(len(x) for x in programs_arities)
            for ar in programs_arities:
                while len(ar) < max_program_arity_length:
                    ar.append(-1)

            max_program_depth_length = max(len(x) for x in programs_depths)
            for de in programs_depths:
                while len(de) < max_program_depth_length:
                    de.append(-1)

            assert (max_program_length == max_program_arity_length) and (
                max_program_length == max_program_depth_length)

        # Create h5 file
        print('Writing output')
        questions_encoded = numpy.asarray(questions_encoded, dtype=numpy.int32)
        programs_encoded = numpy.asarray(programs_encoded, dtype=numpy.int32)
        programs_arities = numpy.asarray(programs_arities, dtype=numpy.int32)
        programs_depths = numpy.asarray(programs_depths, dtype=numpy.int32)
        print(questions_encoded.shape)
        print(programs_encoded.shape)
        print(programs_arities.shape)
        print(programs_depths.shape)

        with h5py.File(part + '_features.h5', 'w') as f:
            features = images.transpose(0, 3, 1, 2) / 255.0
            features_dataset = f.create_dataset('features', (features.shape),
                                                dtype=numpy.float32)
            features_dataset[:] = features

        with h5py.File(part + '_questions.h5', 'w') as f:
            f.create_dataset('questions', data=questions_encoded)

            image_idxs_dataset = f.create_dataset('image_idxs',
                                                  (len(questions_encoded), ),
                                                  dtype=numpy.int32)
            image_idxs_dataset[:] = range(len(questions_encoded))

            if len(programs_encoded) > 0:
                f.create_dataset('programs', data=programs_encoded)
                f.create_dataset('programs_arities', data=programs_arities)
                f.create_dataset('programs_depths', data=programs_depths)

            if len(answers) > 0:
                f.create_dataset('answers', data=numpy.asarray(answers))

    with open('vocab.json', 'w') as f:
        json.dump(
            {
                'question_token_to_idx': question_vocab,
                'program_token_to_idx': program_vocab,
                'answer_token_to_idx': {
                    'false': 0,
                    'true': 1
                }
            }, f)
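A minimal sketch of reading the SHAPES files written above back in (file names follow the part + '_features.h5' / part + '_questions.h5' pattern used by the script):

import json

import h5py

with h5py.File('train_features.h5', 'r') as f:
    features = f['features'][:]      # (N, C, H, W) float32, scaled to [0, 1]

with h5py.File('train_questions.h5', 'r') as f:
    questions = f['questions'][:]    # (N, max_question_length) int32, <NULL>-padded
    answers = f['answers'][:]        # (N,), 1 = 'true', 0 = 'false'

with open('vocab.json') as f:
    vocab = json.load(f)

print(features.shape, questions.shape, answers.shape,
      len(vocab['question_token_to_idx']))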
# Example 3
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data from', args.input_questions_json)
    if args.q_family_shift and len(args.q_family_shift):
        if len(args.q_family_shift) != len(args.input_questions_json):
            raise ValueError("shift must be provided for each question file")
        q_family_shifts = args.q_family_shift
    else:
        q_family_shifts = [0] * len(args.input_questions_json)
    questions = []
    for q_file, shift in zip(args.input_questions_json, q_family_shifts):
        print(q_file)
        with open(q_file, 'r') as f:
            more_questions = json.load(f)['questions']
            for q in more_questions:
                q['question_family_index'] += shift
            questions.extend(more_questions)

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Test splits may not carry answers; fall back to an empty answer vocab.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

        def arity(name):
            if name == 'scene':
                return 0
            if 'equal' in name or name in [
                    'union', 'intersect', 'less_than', 'greater_than'
            ]:
                return 2
            return 1

        vocab['program_token_arity'] = {
            name: arity(name)
            for name in program_token_to_idx
        }
    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i

    print(mapping)

    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
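Example 3 reads several command-line flags that are not defined in the snippet; below is a minimal argparse setup consistent with how main(args) uses them (flag names follow the code above, the defaults are assumptions):

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_questions_json', nargs='+', required=True)
    parser.add_argument('--q_family_shift', nargs='+', type=int, default=[])
    parser.add_argument('--input_vocab_json', default='')
    parser.add_argument('--output_vocab_json', default='')
    parser.add_argument('--expand_vocab', type=int, default=0)
    parser.add_argument('--unk_threshold', type=int, default=1)
    parser.add_argument('--encode_unk', type=int, default=0)
    parser.add_argument('--mode', default='prefix',
                        choices=['chain', 'prefix', 'postfix'])
    parser.add_argument('--output_h5_file', required=True)
    main(parser.parse_args())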
# Example 4
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = []
        for line in f:
            questions.append(json.loads(line))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Unlabeled splits may not carry labels; fall back to an empty answer vocab.
        answer_token_to_idx = {}
        if 'label' in questions[0]:
            answer_token_to_idx = build_vocab((q['label'] for q in questions))
        question_token_to_idx = build_vocab((q['sentence'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []
    for orig_idx, q in enumerate(questions):
        question = q['sentence']

        orig_idxs.append(orig_idx)
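        # Build a numeric image index from the digits of q['identifier'],
        # with a trailing 0 (left image) or 1 (right image) contributed by
        # the appended "-img0"/"-img1" suffix.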
        if "LEFT" in q["image_attention"]:  # LEFT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img0")
                            if c in digits)))
        else:  # RIGHT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img1")
                            if c in digits)))
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'label' in q:
            answers.append(vocab['answer_token_to_idx'][q['label']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i

    print(mapping)

    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        print(image_idxs)
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
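A minimal sketch of consuming the HDF5 file these scripts write and decoding one padded question back into tokens (the file names below stand in for whatever was passed as --output_vocab_json and --output_h5_file):

import json

import h5py

with open('vocab.json') as f:                 # the --output_vocab_json file
    vocab = json.load(f)
idx_to_token = {i: t for t, i in vocab['question_token_to_idx'].items()}

with h5py.File('questions.h5', 'r') as f:     # the --output_h5_file file
    questions = f['questions'][:]
    image_idxs = f['image_idxs'][:]

# Decode the first padded question, dropping <NULL> padding.
tokens = [idx_to_token[int(i)] for i in questions[0]]
print(image_idxs[0], ' '.join(t for t in tokens if t != '<NULL>'))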