def main(args):
    """Encode a questions JSON file into an HDF5 file, building or loading a vocab.

    The vocab is built from the questions when ``args.input_vocab_json`` is
    empty or ``args.expand_vocab == 1``; otherwise it is loaded from
    ``args.input_vocab_json``. With ``expand_vocab == 1`` and an input vocab,
    question tokens missing from the loaded vocab are appended to it. The
    resulting vocab is written to ``args.output_vocab_json`` when that is set.

    Args:
        args: Namespace providing ``input_vocab_json``, ``output_vocab_json``,
            ``input_questions_json``, ``expand_vocab``, ``unk_threshold``,
            ``mode``, ``encode_unk`` and ``output_h5_file``.

    Returns:
        None. Returns early (after printing a message) when neither an input
        nor an output vocab path is given.

    Side effects:
        Writes ``args.output_h5_file`` with datasets ``questions``,
        ``image_idxs`` and ``orig_idxs``, plus ``programs``,
        ``question_families`` and ``answers`` when those are non-empty.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        # Default to an empty answer vocab: splits without answers used to
        # leave this name unbound and crash with NameError below.
        answer_token_to_idx = {}
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            # Keep the freshly built vocab around so its new words can be merged
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            question_vocab = vocab['question_token_to_idx']
            for word in new_vocab['question_token_to_idx']:
                if word not in question_vocab:
                    print('Found new word %s' % word)
                    # New words get the next free index
                    question_vocab[word] = len(question_vocab)
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(
            question,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program_str = program_to_str(q['program'], args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs to a common length with <NULL>
    max_question_length = max(len(x) for x in questions_encoded)
    question_null = vocab['question_token_to_idx']['<NULL>']
    for qe in questions_encoded:
        qe.extend([question_null] * (max_question_length - len(qe)))

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        program_null = vocab['program_token_to_idx']['<NULL>']
        for pe in programs_encoded:
            pe.extend([program_null] * (max_program_length - len(pe)))

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    utils.mkdirs(os.path.dirname(args.output_h5_file))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
def _tokenize_caption_groups(caps_by_img, img_keys):
    """Tokenize the captions of each image in ``img_keys`` order.

    Returns a ``(groups, longest)`` pair: ``groups`` is a list of
    ``(img, tokens_list)`` tuples and ``longest`` is the length of the longest
    token list seen (``-1`` when there are no captions at all).
    """
    groups = []
    longest = -1
    for img in img_keys:
        caps = caps_by_img[img]
        assert len(caps) > 0, 'error: some image has no caption'
        tokens_list = []
        for cap in caps:
            cap_tokens = tokenize(cap,
                                  add_start_token=True,
                                  add_end_token=False,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            longest = max(longest, len(cap_tokens))
        groups.append((img, tokens_list))
    return groups, longest


def _encode_caption_groups(groups, word_to_idx, allow_unk, num_images,
                           num_captions, max_length):
    """Encode tokenized captions into a zero-padded matrix plus index arrays.

    Returns ``(labels, start_idx, end_idx, lengths)``. ``start_idx`` and
    ``end_idx`` are indexed by the integer parsed from the image file name
    and hold the first/last row of that image's captions in ``labels``;
    entries stay ``-1`` for images that have no captions.
    """
    # Builtin int is the dtype the removed ``np.int`` alias referred to.
    start_idx = -np.ones(num_images, dtype=int)
    end_idx = -np.ones(num_images, dtype=int)
    lengths = np.zeros(num_captions, dtype=int)
    label_arrays = []
    caption_counter = 0
    row_counter = 0
    for img, tokens_list in groups:
        # Image index is the trailing "_<number>" of the file name stem
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=int)
        for j, tokens in enumerate(tokens_list):
            lengths[caption_counter] = len(tokens)
            caption_counter += 1
            tokens_encoded = encode(tokens, word_to_idx, allow_unk=allow_unk)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w
        # captions are padded with zeros
        label_arrays.append(Li)
        start_idx[i] = row_counter
        end_idx[i] = row_counter + n - 1
        row_counter += n

    labels = np.concatenate(label_arrays, axis=0)  # put all labels together
    assert labels.shape[0] == num_captions, "lengths don't match?"
    assert np.all(lengths > 0), 'error: some captions have no word?'
    return labels, start_idx, end_idx, lengths


def main(args):
    """Encode positive and negative image captions into an HDF5 file.

    Loads the captions, negative captions and split JSON files, builds the
    vocab from the train split (or loads it from ``args.input_vocab_json``),
    optionally saves the vocab, then encodes every caption and writes the
    label matrices and their index arrays to ``args.output_h5``.

    Args:
        args: Namespace providing ``input_captions_json``,
            ``input_neg_captions_json``, ``split_json``, ``input_image_dir``,
            ``input_vocab_json``, ``output_vocab_json``,
            ``word_count_threshold``, ``allow_unk`` and ``output_h5``.

    Raises:
        AssertionError: if an image has no captions, a caption has no tokens,
            or the number of encoded rows differs from the caption count.
    """
    print('Loading captions')
    with open(args.input_captions_json, 'r') as f:
        captions = json.load(f)
    with open(args.input_neg_captions_json, 'r') as f:
        neg_captions = json.load(f)
    with open(args.split_json, 'r') as f:
        splits = json.load(f)

    all_imgs = sorted(os.listdir(args.input_image_dir))

    # Extract train data points
    train_imgs = [all_imgs[idx] for idx in splits['train']]
    train_captions = []
    train_neg_captions = []
    for img in train_imgs:
        train_captions.extend(captions[img])
        train_neg_captions.extend(neg_captions[img])

    N = len(all_imgs)
    N_captioned = len(captions)
    M = sum(len(caps) for caps in captions.values())
    M_neg = sum(len(caps) for caps in neg_captions.values())
    print('Total images: %d' % N)
    print('Total captioned images: %d' % N_captioned)
    print('Total captions: %d' % M)
    print('Total negative captions: %d' % M_neg)
    print('Total train images: %d' % len(train_imgs))
    print('Total train captions: %d' % len(train_captions))
    print('Total train neg captions: %d' % len(train_neg_captions))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '':
        print('Building vocab')
        word_to_idx = build_vocab(
            train_captions + train_neg_captions,
            min_token_count=args.word_count_threshold,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
    else:
        print('Loading vocab')
        with open(args.input_vocab_json, 'r') as f:
            word_to_idx = json.load(f)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(word_to_idx, f)

    # Encode all captions
    # First, tokenize and figure out max length of captions. Both caption
    # sets are walked in the order of the positive captions' image keys.
    cap_keys = sorted(captions)
    all_cap_tokens, max_length = _tokenize_caption_groups(captions, cap_keys)
    all_neg_cap_tokens, neg_max_length = _tokenize_caption_groups(
        neg_captions, cap_keys)
    # The padded width must fit the longest caption of either set; sizing it
    # from the positives alone overflowed on a longer negative caption.
    max_length = max(max_length, neg_max_length)

    allow_unk = args.allow_unk == 1

    # Then encode
    print('Encoding captions')
    L, label_start_idx, label_end_idx, label_length = _encode_caption_groups(
        all_cap_tokens, word_to_idx, allow_unk, N, M, max_length)

    print('Encoding negative captions')
    (neg_L, neg_label_start_idx, neg_label_end_idx,
     neg_label_length) = _encode_caption_groups(
        all_neg_cap_tokens, word_to_idx, allow_unk, N, M_neg, max_length)

    # Create h5 file
    print('Writing output')
    print('Encoded captions array size: ', L.shape)
    print('Encoded negative captions array size: ', neg_L.shape)
    with h5py.File(args.output_h5, 'w') as f:
        f.create_dataset('labels', data=L)
        f.create_dataset('label_start_idx', data=label_start_idx)
        f.create_dataset('label_end_idx', data=label_end_idx)
        f.create_dataset('label_length', data=label_length)
        f.create_dataset('neg_labels', data=neg_L)
        f.create_dataset('neg_label_start_idx', data=neg_label_start_idx)
        f.create_dataset('neg_label_end_idx', data=neg_label_end_idx)
        f.create_dataset('neg_label_length', data=neg_label_length)
def main(args):
    """
    Save nx.graph (Gss, Gts,...) and corresponding
    torch_geometric.data.PairData (via clevr_parse embedder api).

    For every question, the image scene and the question text are parsed and
    embedded; samples for which either step fails are skipped and counted.
    Surviving samples are also token-encoded with the vocab. Intermediate
    results are checkpointed every ``args.checkpoint_every`` processed
    samples, and the final ``.h5`` and graph data are written under the
    output directory.

    Args:
        args: Namespace providing ``input_vocab_json``, ``output_vocab_json``,
            ``parser_lm``, ``is_directed_graph``, ``input_questions_json``,
            ``input_parsed_img_scenes_json``, ``is_debug``, ``encode_unk``,
            ``mode``, ``checkpoint_every`` and ``max_sample`` (plus whatever
            ``_get_out_dir_and_file_prefix`` and ``_process_vocab`` read).

    Raises:
        IndexError: if a question refers to an ``image_index`` that has no
            entry in the parsed image scenes.
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        logger.info(
            'Must give one of --input_vocab_json or --output_vocab_json')
        return
    graph_parser = clevr_parser.Parser(
        backend='spacy', model=args.parser_lm,
        has_spatial=True,
        has_matching=True).get_backend(identifier='spacy')
    embedder = clevr_parser.Embedder(
        backend='torch', parser=graph_parser).get_backend(identifier='torch')
    is_directed_graph = args.is_directed_graph  # Parse graphs as nx.MultiDiGraph

    out_dir, out_f_prefix = _get_out_dir_and_file_prefix(args)
    checkpoint_dir = f"{out_dir}/checkpoints"
    utils.mkdirs(checkpoint_dir)

    questions, img_scenes = get_questions_and_parsed_scenes(
        args.input_questions_json, args.input_parsed_img_scenes_json)
    if args.is_debug:
        set_default_level(10)
        # default BSZ is 64 ensuring enough for batch iter
        questions = questions[:128]
        logger.debug(
            f"In DEBUG mode, sampling {len(questions)} questions only..")

    # Process Vocab #
    vocab = _process_vocab(args, questions)

    # Index the scenes once instead of scanning the whole list per question.
    # setdefault keeps the first scene per image_index, matching the earlier
    # "first match" lookup.
    scene_by_img_idx = {}
    for scene in img_scenes:
        scene_by_img_idx.setdefault(scene['image_index'], scene)

    # Encode all questions and programs
    logger.info('Encoding data')
    questions_encoded, programs_encoded, answers, image_idxs = [], [], [], []
    question_families = []
    orig_idxs = []

    # Graphs and Embeddings #
    data_s_list = []  # List [torch_geometric.data.Data]
    data_t_list = []  # List [torch_geometric.data.Data]
    num_samples = 0   # Counter for keeping track of processed samples
    num_skipped = 0   # Counter for tracking num of samples skipped
    for q in questions:
        # First See if Gss, Gts are possible to extract.
        # If not (for e.g., some edges cases like plurality, skip data sample
        img_idx = q['image_index']
        img_fn = q['image_filename']
        logger.debug(f"\tProcessing Image - {img_idx}: {img_fn} ...")

        ## 1: Ensure both Gs,Gt is parseable for this question sample, o.w. skip
        if img_idx not in scene_by_img_idx:
            raise IndexError(
                f"No parsed scene found for image_index {img_idx} ({img_fn})")
        img_scene = scene_by_img_idx[img_idx]
        try:
            Gt, t_doc = graph_parser.get_doc_from_img_scene(
                img_scene, is_directed_graph=is_directed_graph)
            X_t, ei_t, e_attr_t = embedder.embed_t(
                img_idx, args.input_parsed_img_scenes_json)
        except AssertionError as ae:
            logger.warning(f"AssertionError Encountered: {ae}")
            logger.warning(f"[{img_fn}] Excluding images with > 10 objects")
            num_skipped += 1
            continue
        if Gt is None and ("SKIP" in t_doc):
            # If the derendering pipeline failed, then just skip the
            # scene, don't process the labels (and text_scenes) for the image
            logger.warning(f"Got None img_doc at image_index: {img_idx}")
            logger.warning(
                f"Skipping all text_scenes for imgage idx: {img_idx}")
            num_skipped += 1
            continue

        question = q['question']
        # The dataset's own question_index is what gets stored in orig_idxs
        orig_idx = q['question_index']
        try:
            Gs, s_doc = graph_parser.parse(
                question, return_doc=True,
                is_directed_graph=is_directed_graph)
            X_s, ei_s, e_attr_s = embedder.embed_s(question)
        except ValueError as ve:
            logger.warning(f"ValueError Encountered: {ve}")
            logger.warning(f"Skipping question: {question} for {img_fn}")
            num_skipped += 1
            continue
        if Gs is None and ("SKIP" in s_doc):
            logger.warning(
                "Got None as Gs and 'SKIP' in Gs_embd. (likely plural with CLEVR_OBJS label) ")
            logger.warning(
                f"SKIPPING processing {question} for {img_fn} and at {img_idx}")
            num_skipped += 1
            continue

        # Using ClevrData allows us a debug extension to Data
        data_s = ClevrData(x=X_s, edge_index=ei_s, edge_attr=e_attr_s)
        data_t = ClevrData(x=X_t, edge_index=ei_t, edge_attr=e_attr_t)
        data_s_list.append(data_s)
        data_t_list.append(data_t)

        orig_idxs.append(orig_idx)
        image_idxs.append(img_idx)
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(
            question,
            punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens,
            vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        has_prog_seq = 'program' in q
        if has_prog_seq:
            program_str = program_to_str(q['program'], args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        # Reset per sample so the log below never reports a stale or
        # unbound answer when this question has a program but no answer.
        ans = None
        if 'answer' in q:
            ans = q['answer']
            answers.append(vocab['answer_token_to_idx'][ans])

        num_samples += 1
        logger.info("-" * 50)
        logger.info(f"Samples processed count = {num_samples}")
        if has_prog_seq:
            logger.info(f"\n[{orig_idx}]: question: {question} \n"
                        f"\tprog_str: {program_str} \n"
                        f"\tanswer: {ans}")
        logger.info("-" * 50)

        # ---- CHECKPOINT ---- #
        # A falsy checkpoint_every (e.g. 0) disables checkpointing rather
        # than dividing by zero.
        if args.checkpoint_every and num_samples % args.checkpoint_every == 0:
            logger.info(f"Checkpointing at {num_samples}")
            checkpoint_fn_prefix = f"{out_f_prefix}_{num_samples}"
            _out_dir = f"{checkpoint_dir}/{out_f_prefix}_{num_samples}"
            utils.mkdirs(_out_dir)
            out_fpp = f"{_out_dir}/{checkpoint_fn_prefix}"
            # ------------ Checkpoint .H5 ------------#
            logger.info(
                f"CHECKPOINT: Saving checkpoint files at directory: {out_fpp}")
            save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs,
                    orig_idxs, programs_encoded, question_families, answers)
            # ------------ Checkpoint GRAPH DATA ------------#
            save_graph_pairdata(out_fpp, data_s_list, data_t_list,
                                is_directed_graph=is_directed_graph)
            logger.info("-------------- CHECKPOINT: COMPLETED --------")

        if (args.max_sample > 0) and (num_samples >= args.max_sample):
            logger.info(f"len(questions_encoded = {len(questions_encoded)}")
            logger.info("args.max_sample reached: Completing ... ")
            break

    logger.debug(f"Total samples skipped = {num_skipped}")
    logger.debug(f"Total samples processed = {num_samples}")

    out_fpp = f"{out_dir}/{out_f_prefix}"
    ## SAVE .H5: Baseline {dataset}_h5.h5 file (q,p,ans,img_idx) as usual
    logger.info(f"Saving baseline (processed) data in: {out_fpp}.h5")
    save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs, orig_idxs,
            programs_encoded, question_families, answers)

    ## ------------ SAVE GRAPH DATA ------------ ##
    ## N.b. Ensure the len of theses lists are all equals
    save_graph_pairdata(out_fpp, data_s_list, data_t_list,
                        is_directed_graph=is_directed_graph)
    logger.info(f"Saved Graph Data in: {out_fpp}_*.[h5|.gpickle|.npz|.pt] ")