Example 1
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = None  # stays None for splits without answers
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

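    # When expanding the vocab, stash the freshly built one, load the existing
    # vocab from disk, then append any question tokens it is missing.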
    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(question,
                                                    punct_to_keep=[';', ','],
                                                    punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    utils.mkdirs(os.path.dirname(args.output_h5_file))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
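
Example 1 assumes a preprocess_utils module that is not shown. Below is a minimal sketch of what those helpers might look like, with signatures inferred from the call sites above; the special tokens and the exact counting logic are assumptions, not the project's confirmed implementation.

# Hypothetical sketch of the assumed preprocess_utils helpers; signatures are
# inferred from the call sites above, and the special tokens are assumptions.
from collections import Counter

SPECIAL_TOKENS = ['<NULL>', '<START>', '<END>', '<UNK>']

def tokenize(s, delim=' ', add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    # Isolate punctuation we keep, strip punctuation we drop, then split.
    for p in (punct_to_keep or []):
        s = s.replace(p, '%s%s' % (delim, p))
    for p in (punct_to_remove or []):
        s = s.replace(p, '')
    tokens = [t for t in s.split(delim) if t]
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens

def build_vocab(sequences, min_token_count=1,
                punct_to_keep=None, punct_to_remove=None):
    # Count tokens across all sequences, then keep those above the threshold.
    counts = Counter()
    for seq in sequences:
        counts.update(tokenize(seq, punct_to_keep=punct_to_keep,
                               punct_to_remove=punct_to_remove,
                               add_start_token=False, add_end_token=False))
    token_to_idx = {t: i for i, t in enumerate(SPECIAL_TOKENS)}
    for token in sorted(t for t, c in counts.items() if c >= min_token_count):
        token_to_idx[token] = len(token_to_idx)
    return token_to_idx

def encode(tokens, token_to_idx, allow_unk=False):
    # Map tokens to indices, optionally falling back to <UNK>.
    idxs = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('Token "%s" not in vocab' % token)
            token = '<UNK>'
        idxs.append(token_to_idx[token])
    return idxs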
Example 2
def main(args):
    print('Loading captions')
    with open(args.input_captions_json, 'r') as f:
        captions = json.load(f)
    with open(args.input_neg_captions_json, 'r') as f:
        neg_captions = json.load(f)
    with open(args.split_json, 'r') as f:
        splits = json.load(f)
    all_imgs = sorted(os.listdir(args.input_image_dir))
    captioned_imgs = list(captions.keys())
    all_captions = []
    for caps in captions.values():
        all_captions.extend(caps)
    all_neg_captions = []
    for caps in neg_captions.values():
        all_neg_captions.extend(caps)

    # Extract train data points
    train_split = splits['train']
    train_imgs = [all_imgs[idx] for idx in train_split]
    train_captions = []
    train_neg_captions = []
    for img in train_imgs:
        cap = captions[img]
        neg_cap = neg_captions[img]
        train_captions.extend(cap)
        train_neg_captions.extend(neg_cap)

    N = len(all_imgs)
    N_captioned = len(captions)
    M = len(all_captions)
    M_neg = len(all_neg_captions)
    print('Total images: %d' % N)
    print('Total captioned images: %d' % N_captioned)
    print('Total captions: %d' % M)
    print('Total negative captions: %d' % M_neg)
    print('Total train images: %d' % len(train_imgs))
    print('Total train captions: %d' % len(train_captions))
    print('Total train neg captions: %d' % len(train_neg_captions))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '':
        print('Building vocab')
        word_to_idx = build_vocab(train_captions + train_neg_captions,
                                  min_token_count=args.word_count_threshold,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
    else:
        print('Loading vocab')
        with open(args.input_vocab_json, 'r') as f:
            word_to_idx = json.load(f)
    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(word_to_idx, f)

    # Encode all captions
    # First, figure out max length of captions
    all_cap_tokens = []
    max_length = -1
    cap_keys = sorted(captions.keys())
    for img in cap_keys:
        caps = captions[img]
        n = len(caps)
        assert n > 0, 'error: some image has no caption'
        tokens_list = []
        for cap in caps:
            cap_tokens = tokenize(cap,
                                  add_start_token=True,
                                  add_end_token=False,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            max_length = max(max_length, len(cap_tokens))
        all_cap_tokens.append((img, tokens_list))

    all_neg_cap_tokens = []
    cap_keys = sorted(neg_captions.keys())
    for img in cap_keys:
        neg_caps = neg_captions[img]
        neg_n = len(neg_caps)
        assert neg_n > 0, 'error: some image has no negative caption'
        neg_tokens_list = []
        for neg_cap in neg_caps:
            neg_cap_tokens = tokenize(neg_cap,
                                      add_start_token=True,
                                      add_end_token=False,
                                      punct_to_keep=[';', ','],
                                      punct_to_remove=['?', '.'])
            neg_tokens_list.append(neg_cap_tokens)
        all_neg_cap_tokens.append((img, neg_tokens_list))

    print('Encoding captions')
    label_arrays = []
    label_start_idx = -np.ones(N, dtype=np.int64)
    label_end_idx = -np.ones(N, dtype=np.int64)
    label_length = np.zeros(M, dtype=np.int64)
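    # Layout: the captions of all images are stacked row-wise into one flat
    # (M, max_length) matrix; label_start_idx[i] / label_end_idx[i] give the
    # inclusive row range belonging to image i, and label_length holds each
    # row's unpadded token count. Images with no captions keep -1 pointers.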
    caption_counter = 0
    counter = 0

    # Then encode
    for img, tokens_list in all_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            label_length[caption_counter] = len(tokens)
            caption_counter += 1
            tokens_encoded = encode(tokens,
                                    word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        label_arrays.append(Li)
        label_start_idx[i] = counter
        label_end_idx[i] = counter + n - 1

        counter += n

    L = np.concatenate(label_arrays, axis=0)  # put all labels together
    assert L.shape[0] == M, "lengths don't match?"
    assert np.all(label_length > 0), 'error: some captions have no word?'

    print('Encoding negative captions')
    neg_label_arrays = []
    neg_label_start_idx = -np.ones(N, dtype=np.int64)
    neg_label_end_idx = -np.ones(N, dtype=np.int64)
    neg_label_length = np.zeros(M_neg, dtype=np.int64)
    neg_caption_counter = 0
    neg_counter = 0

    # Then encode
    for img, tokens_list in all_neg_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            neg_label_length[neg_caption_counter] = len(tokens)
            neg_caption_counter += 1
            tokens_encoded = encode(tokens,
                                    word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        neg_label_arrays.append(Li)
        neg_label_start_idx[i] = neg_counter
        neg_label_end_idx[i] = neg_counter + n - 1

        neg_counter += n

    neg_L = np.concatenate(neg_label_arrays, axis=0)  # put all labels together
    assert neg_L.shape[0] == M_neg, "lengths don't match?"
    assert np.all(neg_label_length > 0), 'error: some captions have no word?'

    # Create h5 file
    print('Writing output')
    print('Encoded captions array size: ', L.shape)
    print('Encoded negative captions array size: ', neg_L.shape)
    with h5py.File(args.output_h5, 'w') as f:
        f.create_dataset('labels', data=L)
        f.create_dataset('label_start_idx', data=label_start_idx)
        f.create_dataset('label_end_idx', data=label_end_idx)
        f.create_dataset('label_length', data=label_length)
        f.create_dataset('neg_labels', data=neg_L)
        f.create_dataset('neg_label_start_idx', data=neg_label_start_idx)
        f.create_dataset('neg_label_end_idx', data=neg_label_end_idx)
        f.create_dataset('neg_label_length', data=neg_label_length)
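
For reference, a small hypothetical consumer of the file written above, showing how the pointer arrays index into the flat label matrix; the path and the image index are placeholders, not names from the source.

# Hypothetical reader for the h5 layout written above; the path and the
# image index below are placeholders.
import h5py

with h5py.File('captions.h5', 'r') as f:
    labels = f['labels'][...]          # (M, max_length) padded token matrix
    start = f['label_start_idx'][...]  # first caption row per image
    end = f['label_end_idx'][...]      # last caption row per image (inclusive)

i = 0  # image index; start[i] stays -1 if image i has no captions
if start[i] >= 0:
    captions_i = labels[start[i]:end[i] + 1]  # all caption rows of image i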
Example 3
def main(args):
    """
    Save nx.graph (Gss, Gts,...) and corresponding torch_geometric.data.PairData
    (via clevr_parse embedder api).
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        logger.info(
            'Must give one of --input_vocab_json or --output_vocab_json')
        return
    graph_parser = clevr_parser.Parser(
        backend='spacy',
        model=args.parser_lm,
        has_spatial=True,
        has_matching=True).get_backend(identifier='spacy')
    embedder = clevr_parser.Embedder(
        backend='torch', parser=graph_parser).get_backend(identifier='torch')
    is_directed_graph = args.is_directed_graph  # Parse graphs as nx.MultiDiGraph

    out_dir, out_f_prefix = _get_out_dir_and_file_prefix(args)
    checkpoint_dir = f"{out_dir}/checkpoints"
    utils.mkdirs(checkpoint_dir)

    questions, img_scenes = get_questions_and_parsed_scenes(
        args.input_questions_json, args.input_parsed_img_scenes_json)
    if args.is_debug:
        set_default_level(10)
        questions = questions[:128]  # default BSZ is 64; keep enough for batch iteration
        logger.debug(
            f"In DEBUG mode, sampling {len(questions)} questions only...")
    # Process Vocab #
    vocab = _process_vocab(args, questions)

    # Encode all questions and programs
    logger.info('Encoding data')
    questions_encoded, programs_encoded, answers, image_idxs = [], [], [], []
    question_families = []
    orig_idxs = []

    # Graphs and Embeddings #
    data_s_list = []  # List [torch_geometric.data.Data]
    data_t_list = []  # List [torch_geometric.data.Data]
    num_samples = 0  # Counter for keeping track of processed samples
    num_skipped = 0  # Counter for tracking num of samples skipped
    for orig_idx, q in enumerate(questions):
        # First, see if Gss and Gts can be extracted; if not (e.g., edge
        # cases like plurality), skip the data sample.
        img_idx = q['image_index']
        img_fn = q['image_filename']
        logger.debug(f"\tProcessing Image - {img_idx}: {img_fn} ...")
        # q_idx = q['question_index']
        # q_fam_idx = q['question_family_index']
        ## 1: Ensure both Gs and Gt are parseable for this question sample; otherwise skip
        img_scene = list(
            filter(lambda x: x['image_index'] == img_idx, img_scenes))[0]
        try:
            Gt, t_doc = graph_parser.get_doc_from_img_scene(
                img_scene, is_directed_graph=is_directed_graph)
            X_t, ei_t, e_attr_t = embedder.embed_t(
                img_idx, args.input_parsed_img_scenes_json)
        except AssertionError as ae:
            logger.warning(f"AssertionError Encountered: {ae}")
            logger.warning(f"[{img_fn}] Excluding images with > 10 objects")
            num_skipped += 1
            continue
        if Gt is None and ("SKIP" in t_doc):
            # If the derendering pipeline failed, then just skip the
            # scene, don't process the labels (and text_scenes) for the image
            print(f"Got None img_doc at image_index: {img_idx}")
            print(f"Skipping all text_scenes for imgage idx: {img_idx}")
            num_skipped += 1
            continue
        s = q['question']
        orig_idx = q['question_index']
        try:
            Gs, s_doc = graph_parser.parse(s,
                                           return_doc=True,
                                           is_directed_graph=is_directed_graph)
            X_s, ei_s, e_attr_s = embedder.embed_s(s)
        except ValueError as ve:
            logger.warning(f"ValueError Encountered: {ve}")
            logger.warning(f"Skipping question: {s} for {img_fn}")
            num_skipped += 1
            continue
        if Gs is None and ("SKIP" in s_doc):
            logger.warning(
                "Got None for Gs with 'SKIP' in s_doc (likely a plural with "
                "CLEVR_OBJS label)")
            logger.warning(
                f"Skipping question: {s} for {img_fn} at image idx {img_idx}")
            num_skipped += 1
            continue

        # Using ClevrData gives us a debug-friendly extension of Data
        data_s = ClevrData(x=X_s, edge_index=ei_s, edge_attr=e_attr_s)
        data_t = ClevrData(x=X_t, edge_index=ei_t, edge_attr=e_attr_t)
        data_s_list.append(data_s)
        data_t_list.append(data_t)

        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(img_idx)
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(question,
                                                    punct_to_keep=[';', ','],
                                                    punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        has_prog_seq = 'program' in q
        if has_prog_seq:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            ans = q['answer']
            answers.append(vocab['answer_token_to_idx'][ans])

        num_samples += 1
        logger.info("-" * 50)
        logger.info(f"Samples processed count = {num_samples}")
        if has_prog_seq:
            logger.info(f"\n[{orig_idx}]: question: {question} \n"
                        f"\tprog_str: {program_str} \n"
                        f"\tanswer: {ans}")
        logger.info("-" * 50)

        # ---- CHECKPOINT ---- #
        if num_samples % args.checkpoint_every == 0:
            logger.info(f"Checkpointing at {num_samples}")
            checkpoint_fn_prefix = f"{out_f_prefix}_{num_samples}"
            _out_dir = f"{checkpoint_dir}/{out_f_prefix}_{num_samples}"
            utils.mkdirs(_out_dir)
            out_fpp = f"{_out_dir}/{checkpoint_fn_prefix}"
            # ------------ Checkpoint .H5 ------------#
            logger.info(
                f"CHECKPOINT: Saving checkpoint files at directory: {out_fpp}")
            save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs,
                    orig_idxs, programs_encoded, question_families, answers)
            # ------------ Checkpoint GRAPH DATA ------------#
            save_graph_pairdata(out_fpp,
                                data_s_list,
                                data_t_list,
                                is_directed_graph=is_directed_graph)
            logger.info(f"-------------- CHECKPOINT: COMPLETED --------")

        if (args.max_sample > 0) and (num_samples >= args.max_sample):
            logger.info(f"len(questions_encoded = {len(questions_encoded)}")
            logger.info("args.max_sample reached: Completing ... ")
            break

    logger.debug(f"Total samples skipped = {num_skipped}")
    logger.debug(f"Total samples processed = {num_samples}")
    out_fpp = f"{out_dir}/{out_f_prefix}"
    ## SAVE .H5: Baseline {dataset}_h5.h5 file (q,p,ans,img_idx) as usual
    logger.info(f"Saving baseline (processed) data in: {out_fpp}.h5")
    save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs, orig_idxs,
            programs_encoded, question_families, answers)
    ## ------------  SAVE GRAPH DATA ------------ ##
    ## N.B. Ensure the lengths of these lists are all equal
    save_graph_pairdata(out_fpp,
                        data_s_list,
                        data_t_list,
                        is_directed_graph=is_directed_graph)
    logger.info(f"Saved Graph Data in: {out_fpp}_*.[h5|.gpickle|.npz|.pt] ")