def run_visual_sim(imfeat_dir, bbdf_dir, this_corp, n_most=200):
    print_timestamped_message('Computing vis sims for %s' % (this_corp))
    if this_corp == 'visgen':
        featfilename = 'vgregdf_rsn50-flatten_1-fi.npz'
        outfilename = bbdf_dir + '/visgen_vis_sim'
    elif this_corp == 'mscoco':
        featfilename = 'mscoco_bbdf_rsn50-flatten_1-fi.npz'
        outfilename = bbdf_dir + '/mscoco_vis_sim'
    else:
        print('Unknown corpus (%s). ABORTING.' % (this_corp))
        return
    if isfile(outfilename + '.npz'):
        print('%s exists. Will not overwrite. ABORTING.'
              % (outfilename + '.npz'))
        return
    print_timestamped_message('Loading up X')
    X_full = np.load(imfeat_dir + '/' + featfilename)['arr_0']
    X_ids = X_full[:, :ID_FEATS]
    X = X_full[:, ID_FEATS:]
    # Note: whole-image Xs do not have pos feats at the end.
    vissim_most_sim_out = batched_similarity_computation(X, MAX_ROWS_IMG,
                                                         n_most)
    np.savez_compressed(outfilename,
                        vissim_most_sim_out,
                        dict((n, v) for n, v in enumerate(X_ids[:, ImID_Feat])))
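# A minimal read-back sketch (not part of the pipeline; function name is an
# assumption): np.savez_compressed above stores its positional arguments as
# 'arr_0' (the per-batch similarity rankings) and 'arr_1' (the row-to-image-id
# mapping). Since the mapping is a dict, it comes back as a 0-d object array,
# which needs allow_pickle=True and .item() to unwrap.
def load_visual_sim(path):
    with np.load(path, allow_pickle=True) as npz:
        most_sim = npz['arr_0']
        row2imid = npz['arr_1'].item()
    return most_sim, row2imid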
def batched_similarity_computation(X, max_row, n_most):
    objsim_most_sim_out = []
    for i in range(0, len(X), max_row):
        print_timestamped_message('... ... batch %d' % (i // max_row + 1))
        objsim_pdist = pdist(X[i:i+max_row], 'cosine')
        objsim = squareform(objsim_pdist)
        # argsort on cosine *distances*: smallest first, so each row starts
        # with the element itself, followed by its nearest neighbours
        objsim_most_sim = np.apply_along_axis(
            lambda x: np.argsort(x)[:n_most], 1, objsim)
        objsim_most_sim_out.append(objsim_most_sim)
    return objsim_most_sim_out
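# A toy usage sketch (assumed data; the helper name is hypothetical): rank
# the 5 most similar rows for each row of a random feature matrix, in
# batches of 100. Note that the returned indices are local to each batch,
# so the batch offset must be added back to get global row numbers.
def _demo_batched_similarity():
    rng = np.random.RandomState(0)
    X_demo = rng.rand(250, 32)
    batches = batched_similarity_computation(X_demo, max_row=100, n_most=5)
    print([b.shape for b in batches])  # [(100, 5), (100, 5), (50, 5)]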
def get_sim_mat(df, id_col='image_id', cat_col='cat', n_dims=False,
                max_row=MAX_ROWS_OBJ, n_most=200):
    print_timestamped_message('... compiling objects per image list')
    im2ob = defaultdict(list)
    df.apply(lambda row: im2ob[row[id_col]].append(row[cat_col]), axis=1)
    im2ob = {key: Counter(val) for key, val in im2ob.items()}
    # relies on dicts preserving insertion order (Python 3.7+), so the rows
    # of X below line up with row2imid
    row2imid = {n: iid for n, iid in enumerate(im2ob.keys())}

    print_timestamped_message('... vectorizing')
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(im2ob.values())
    if n_dims:
        print_timestamped_message('... reducing dimensionality')
        svd = TruncatedSVD(n_components=n_dims, n_iter=7, random_state=42)
        X = svd.fit_transform(X)

    print_timestamped_message('... and finally, computing similarities')
    objsim_most_sim_out = batched_similarity_computation(X, max_row, n_most)
    return objsim_most_sim_out, row2imid
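# A toy end-to-end sketch (assumed data; the helper name is hypothetical):
# two images sharing the same object categories should rank each other as
# nearest neighbours in the category-count space.
def _demo_get_sim_mat():
    df_demo = pd.DataFrame({
        'image_id': [1, 1, 2, 2, 3],
        'cat': ['cat', 'dog', 'cat', 'dog', 'tree'],
    })
    most_sim, row2imid = get_sim_mat(df_demo, max_row=10, n_most=3)
    # most_sim[0] holds, per row, the indices of the 3 most similar rows;
    # translate them back to image ids via row2imid
    print([[row2imid[r] for r in row] for row in most_sim[0]])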
def compute_feats(config, bbdf, model, preproc,
                  xs=224, ys=224, batch_size=100):
    full_image = config.get('runtime', 'full_image') == 'True'
    filename = config.get('runtime', 'out_dir') +\
        '/%s_%s' % (config.get('runtime', 'this_bbdf'),
                    config.get('runtime', 'model'))
    if full_image:
        filename += '-fi'
    if isfile(filename + '.npz'):
        print('%s exists. Will not overwrite. ABORTING.'
              % (filename + '.npz'))
        return

    X_pos = []
    X_i = []
    ids = []
    file_counter = 1
    prev_iid, prev_img = (None, None)
    X_out = []

    if full_image:
        bbdf = bbdf.drop_duplicates(subset='image_id')
        bbdf = bbdf.reset_index()

    reg_col = 'region_id'             # default
    if 'obj_id' in bbdf.columns:      # some visgen bbdfs
        reg_col = 'obj_id'
    if 'subregion_id' in bbdf.columns:  # Flickr30k
        subreg = True
    else:
        subreg = False

    # FIXME, for debugging only! Reduced size or starting with offset
    # bbdf = bbdf[:100]
    for n, row in tqdm(bbdf.iterrows(), total=len(bbdf)):
        this_icorpus = row['i_corpus']
        this_image_id = row['image_id']
        this_region_id = row[reg_col]
        if subreg:  # this means that we are reading in Flickr30k...
            this_region_id = row[reg_col] + row['subregion_id'] / 100
        this_bb = row['bb']
        if full_image:
            this_bb = None
            this_region_id = 0

        # When extracting feats for imagenet regions, must
        # - create combined filename out of image_id and region_id
        # - neutralise positional features, by setting bb given
        #   to pos feat computation to 0,0,w,h. So that all ImageNet
        #   regions end up with same positions.
        if code_icorpus[this_icorpus] == 'image_net':
            this_image_id_mod = join_imagenet_id(this_image_id,
                                                 this_region_id)
            this_bb_mod = [0, 0, this_bb[2], this_bb[3]]
        else:
            this_image_id_mod = this_image_id
            this_bb_mod = this_bb

        if this_bb_mod and np.min(this_bb_mod[2:]) <= 0:
            print('skipping over this image (%s,%d). Negative bb! %s'
                  % (code_icorpus[this_icorpus], this_image_id,
                     str(this_bb_mod)))
            continue

        (prev_iid, prev_img), img_resized = \
            get_image_part(config, (prev_iid, prev_img),
                           this_icorpus, this_image_id_mod, this_bb,
                           xs=xs, ys=ys)

        if len(prev_img.shape) != 3 or \
           (len(prev_img.shape) == 3 and prev_img.shape[2] != 3):
            print('skipping over this image (%s,%d). b/w?'
                  % (code_icorpus[this_icorpus], this_image_id))
            continue

        # If we continue below this line, getting the region worked
        X_i.append(img_resized)
        this_pos_feats = compute_posfeats(prev_img, this_bb_mod)
        X_pos.append(this_pos_feats)
        ids.append(np.array([this_icorpus, this_image_id, this_region_id]))

        # is it time to do the actual extraction on this batch?
        if (n+1) % batch_size == 0 or n+1 == len(bbdf):
            print_timestamped_message('new batch! (%d %d) Extracting!...'
                                      % (file_counter, n), indent=4)
            try:
                X_i = np.array(X_i)
                # print X_i.shape
                X = model.predict(preproc(X_i.astype('float64')))
            except ValueError:
                print('Exception! But why? Skipping this whole batch..')
                X_i = []
                ids = []
                X_pos = []
                continue
            X_ids = np.array(ids)
            X_pos = np.array(X_pos)
            print(X_ids.shape, X.shape, X_pos.shape)
            if full_image:
                X_out.append(np.hstack([X_ids, X]))
            else:
                X_out.append(np.hstack([X_ids, X, X_pos]))
            ids = []
            X_pos = []
            X_i = []
            file_counter += 1
        # ... and back to the for loop

    X_out = np.concatenate(X_out, axis=0)
    print_timestamped_message('Made it through! Writing out..', indent=4)
    print(X_out.shape)
    np.savez_compressed(filename, X_out)
config.set('runtime', 'model', args.model)

if arch == 'vgg19':
    from keras.applications.vgg19 import VGG19
    from keras.applications.vgg19 import preprocess_input as preproc
    base_model = VGG19(weights='imagenet')
    model = Model(inputs=base_model.input,
                  outputs=base_model.get_layer(layer).output)
if arch == 'rsn50':
    from keras.applications.resnet50 import ResNet50
    from keras.applications.resnet50 import preprocess_input as preproc
    base_model = ResNet50(weights='imagenet')
    model = Model(inputs=base_model.input,
                  outputs=base_model.get_layer(layer).output)

print_timestamped_message('starting to extract, using %s %s...'
                          % (arch, layer))

for this_bbdf in args.bbdf:
    print_timestamped_message('... %s' % (this_bbdf), indent=4)
    this_bbdf_base = bbdf_dir + '/' + this_bbdf + '.json'
    if isfile(this_bbdf_base + '.gz'):
        this_bbdf_path = this_bbdf_base + '.gz'
        bbdf = pd.read_json(this_bbdf_path, orient='split',
                            compression='gzip')
    else:
        this_bbdf_path = this_bbdf_base
        if not isfile(this_bbdf_base):
            print('bbdf file (%s) not found. Aborting.' % (this_bbdf_path))
            sys.exit(1)
        bbdf = pd.read_json(this_bbdf_base, orient='split')
def main(config):
    basename = os.path.splitext(os.path.basename(__file__))[0]
    print_timestamped_message('Starting to train model %s' % (basename))
    outfile_base = config.get('runtime', 'out_dir') + '/' + basename
    if isfile(outfile_base + '.npz'):
        print('%s exists. Will not overwrite. ABORTING.'
              % (outfile_base + '.npz'))
        return

    # Model description:
    model = {
        'rcorp': 'refcoco',          # ref corpus
        'cnn': 'rsn50-flatten_1',    # CNN used for vision feats
        'rel': 'excl',               # exclude relational expressions
        'wrdl': 'min',               # wordlist: minimal n occurrences...
        'wprm': 40,                  # ... 40 times
        'clsf': 'logreg-l1',         # logistic regression, l1 regularized
        'nneg': 20000,               # maximally 20k neg instances
        'nsrc': 'randmax',           # ... randomly selected
        'notes': ''
    }
    classifier = linear_model.LogisticRegression
    classf_params = {'penalty': 'l1', 'warm_start': True}

    dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')
    preproc_path = dsgv_home + '/Preproc/PreprocOut/'
    feats_path = dsgv_home + '/ExtractFeats/ExtractOut/'

    # ========================= DATA =================================
    print_timestamped_message('loading up data.', indent=4)
    with open(preproc_path + 'refcoco_splits.json', 'r') as f:
        rc_splits = json.load(f)

    # Image features
    X = np.load(feats_path + 'mscoco_bbdf_rsn50-flatten_1.npz')['arr_0']
    X_t = filter_X_by_filelist(X, rc_splits['train'])

    # Referring expressions
    refcoco_refdf = pd.read_json(preproc_path + 'refcoco_refdf.json.gz',
                                 typ='frame', orient='split',
                                 compression='gzip')
    refdf_train = filter_refdf_by_filelist(refcoco_refdf, rc_splits['train'])
    refdf_train = filter_relational_expr(refdf_train)

    # ======================= Intermediate ==============================
    print_timestamped_message('creating intermediate data structures',
                              indent=4)
    word2den = create_word2den(refdf_train)
    X_idx = make_X_id_index(X_t)
    mask_matrix = make_mask_matrix(X_t, X_idx, word2den, word2den.keys())

    # ======================= Wordlist ==============================
    print_timestamped_message('selecting words to train models for',
                              indent=4)
    min_freq = model['wprm']
    counts = mask_matrix.sum(axis=1)
    wordlist = np.array(list(word2den.keys()))[counts > min_freq]

    # ======================= TRAIN ==============================
    print_timestamped_message('and training the %d WACs!' % (len(wordlist)),
                              indent=4)
    wacs = Parallel(n_jobs=N_JOBS, require='sharedmem', prefer='threads')(
        delayed(train_this_word)(X_t, word2den, mask_matrix,
                                 model['nneg'], classifier, classf_params,
                                 this_word)
        for this_word in wordlist)
    print('')  # newline, because train_this_word prints . as progress bar

    # ======================= SAVE ==============================
    print_timestamped_message('writing to disk', indent=4)
    weight_matrix = np.stack([np.append(this_wac.coef_, this_wac.intercept_)
                              for this_wac in [w[3] for w in wacs]])
    wordinfo = [e[:-1] for e in wacs]
    with open(outfile_base + '.json', 'w') as f:
        json.dump((model, wordinfo), f)
    np.savez_compressed(outfile_base + '.npz', weight_matrix)
    print_timestamped_message('DONE!')
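# A minimal application sketch (assumed usage; function and variable names
# are hypothetical): each row of weight_matrix stacks a word classifier's
# coefficients with the intercept appended, so word applicability scores for
# a feature matrix X_feats (vision features only, without the ID columns)
# are the sigmoid of an affine map.
def apply_wacs(X_feats, weight_matrix):
    coefs = weight_matrix[:, :-1]
    intercepts = weight_matrix[:, -1]
    logits = X_feats.dot(coefs.T) + intercepts
    return 1 / (1 + np.exp(-logits))  # (n_regions, n_words) probabilities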
def run_objects_sim(bbdf_dir, this_corp):
    if this_corp == 'mscoco':
        print_timestamped_message('Computing sims for MSCOCO')
        outfilename = bbdf_dir + '/mscoco_sim'
        if isfile(outfilename + '.npz'):
            print('%s exists. Will not overwrite. ABORTING.'
                  % (outfilename + '.npz'))
            return
        print_timestamped_message('... loading up bbdf')
        mscoco_bbdf = pd.read_json(bbdf_dir + '/mscoco_bbdf.json.gz',
                                   typ='frame', orient='split',
                                   compression='gzip')
        sim_sq, row2imid = get_sim_mat(mscoco_bbdf)
        print_timestamped_message('... compressing and writing to disk')
        np.savez_compressed(outfilename, sim_sq, row2imid)
    if this_corp == 'visgen':
        print_timestamped_message('Computing sims for Visual Genome')
        outfilename = bbdf_dir + '/visgen_sim'
        if isfile(outfilename + '.npz'):
            print('%s exists. Will not overwrite. ABORTING.'
                  % (outfilename + '.npz'))
            return
        print_timestamped_message('... loading up bbdf')
        visgen_objdf = pd.read_json(bbdf_dir + '/vgobjdf.json.gz',
                                    typ='frame', orient='split',
                                    compression='gzip')
        visgen_objdf = visgen_objdf[~visgen_objdf['obj_id'].duplicated()]
        visgen_objdf = visgen_objdf[~visgen_objdf['syn'].isnull()]
        sim_sq, row2imid = get_sim_mat(visgen_objdf, cat_col='syn',
                                       n_dims=50, max_row=MAX_ROWS_OBJ)
        print_timestamped_message('... compressing and writing to disk')
        np.savez_compressed(outfilename, sim_sq, row2imid)
try:
    with codecs.open(args.config_file, 'r', encoding='utf-8') as f:
        config.read_file(f)
except IOError:
    print('no config file found at %s' % (args.config_file))
    sys.exit(1)

if args.bbdf_dir:
    bbdf_dir = args.bbdf_dir
elif config.has_option('DSGV-PATHS', 'bbdf_dir'):
    bbdf_dir = config.get('DSGV-PATHS', 'bbdf_dir')
else:
    bbdf_dir = '../Preproc/PreprocOut'

if args.imfeat_dir:
    imfeat_dir = args.imfeat_dir
elif config.has_option('DSGV-PATHS', 'imfeat_dir'):
    imfeat_dir = config.get('DSGV-PATHS', 'imfeat_dir')
else:
    imfeat_dir = '../ExtractFeats/ExtractOut'

if args.mode == 'objects':
    for this_corp in args.corp:
        run_objects_sim(bbdf_dir, this_corp)
if args.mode == 'visual':
    for this_corp in args.corp:
        run_visual_sim(imfeat_dir, bbdf_dir, this_corp)

print_timestamped_message('Done!')
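# A minimal sketch of the argument parser this fragment presupposes (an
# assumption: the real script may define more options and different flags;
# attribute names are taken from the uses of `args` above).
def _build_argparser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file', default='default.cfg')
    parser.add_argument('--bbdf_dir')
    parser.add_argument('--imfeat_dir')
    parser.add_argument('--mode', choices=['objects', 'visual'])
    parser.add_argument('corp', nargs='+')
    return parser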
def compute_feats(config, args, bbdf, model, preproc,
                  xs=224, ys=224, batch_size=100):
    full_image = args.full_image
    filename = config.get('runtime', 'out_dir') + \
        '/%s_%s' % (config.get('runtime', 'this_bbdf'),
                    config.get('runtime', 'model'))
    if full_image:
        filename += '-fi'
    # if isfile(filename + '.npz'):
    if len(glob(filename + '*')) != 0:
        print('Output for %s exists. Will not overwrite. ABORTING.'
              % (filename))
        return

    X_pos = []
    X_i = []
    ids = []
    file_counter = 1
    prev_iid, prev_img = (None, None)
    X_out = []
    write_flag = False
    write_count = 1
    minibatch_size = args.write_batch
    checkpts = minibatch_size

    # FIXME, for debugging only! Reduced size or starting with offset
    # bbdf = bbdf[:100]
    if args.bbdf_slice:
        s, e = [int(e) for e in args.bbdf_slice.split(':')]
        bbdf = bbdf[s:e]

    size_flag = len(bbdf) > args.max_singlefile

    if full_image:
        bbdf = bbdf.drop_duplicates(subset='image_id')
        bbdf = bbdf.reset_index()

    reg_col = 'region_id'               # default
    if 'obj_id' in bbdf.columns:        # some visgen bbdfs
        reg_col = 'obj_id'
    if 'subregion_id' in bbdf.columns:  # Flickr30k
        subreg = True
        subreg_column = 'subregion_id'
    elif 'level' in bbdf.columns:       # ADE20k
        subreg = True
        subreg_column = 'level'
    else:
        subreg = False

    for n, row in tqdm(bbdf.iterrows(), total=len(bbdf)):
        this_icorpus = row['i_corpus']
        this_image_id = row['image_id']
        if full_image:
            this_bb = None
            this_region_id = 0
        else:
            this_bb = row['bb']
            this_region_id = row[reg_col]
            if subreg:  # this means that we are reading in Flickr30k...
                # .. or ADE20k
                this_region_id = row[reg_col] + row[subreg_column] / 100

        # When extracting feats for imagenet regions, must
        # - create combined filename out of image_id and region_id
        # - neutralise positional features, by setting bb given
        #   to pos feat computation to 0,0,w,h. So that all ImageNet
        #   regions end up with same positions.
        if code_icorpus[this_icorpus] == 'image_net':
            this_image_id_mod = join_imagenet_id(this_image_id,
                                                 this_region_id)
            this_bb_mod = [0, 0, this_bb[2], this_bb[3]]
        elif code_icorpus[this_icorpus] == 'ade_20k':
            # somewhat regrettably, ade20k wasn't preprocessed to
            # use our normal format. this is coming back to haunt
            # us here, as we need to create the image id from
            # other rows.. this will only work on ade_imgdf, not
            # on ade_objdf
            this_image_id_mod = (row['split'], row['image_cat'],
                                 row['filename'])
            this_bb_mod = this_bb
        elif code_icorpus[this_icorpus] == 'cub_birds':
            this_image_id_mod = row['image_path']
            this_bb_mod = this_bb
        else:
            this_image_id_mod = this_image_id
            this_bb_mod = this_bb

        if this_bb_mod and np.min(this_bb_mod[2:]) <= 0:
            print('skipping over this image (%s,%d). Negative bb! %s'
                  % (code_icorpus[this_icorpus], this_image_id,
                     str(this_bb_mod)))
            continue

        try:
            (prev_iid, prev_img), img_resized = \
                get_image_part(config, (prev_iid, prev_img),
                               this_icorpus, this_image_id_mod, this_bb_mod,
                               xs=xs, ys=ys)
        except ValueError:
            print('skipping over this image (%s,%d). corrupted??'
                  % (code_icorpus[this_icorpus], this_image_id))
            continue  # without this, stale prev_img data would be processed

        if len(prev_img.shape) != 3 or \
           (len(prev_img.shape) == 3 and prev_img.shape[2] != 3):
            print('skipping over this image (%s,%d). b/w?'
                  % (code_icorpus[this_icorpus], this_image_id))
            continue

        # If we continue below this line, getting the region worked
        X_i.append(img_resized)
        this_pos_feats = compute_posfeats(prev_img, this_bb_mod)
        X_pos.append(this_pos_feats)
        ids.append(np.array([this_icorpus, this_image_id, this_region_id]))

        # is it time to do the actual extraction on this batch?
        if (n+1) % batch_size == 0 or n+1 == len(bbdf):
            print_timestamped_message('new batch! (%d %d) Extracting!...'
                                      % (file_counter, n), indent=4)
            try:
                X_i = np.array(X_i)
                # print X_i.shape
                X = model.predict(preproc(X_i.astype('float64')))
            except ValueError:
                print('Exception! But why? Skipping this whole batch..')
                X_i = []
                ids = []
                X_pos = []
                continue
            X_ids = np.array(ids)
            X_pos = np.array(X_pos)
            print(X_ids.shape, X.shape, X_pos.shape)
            if full_image:
                X_out_buff = da.from_array(np.hstack([X_ids, X]),
                                           chunks=(1000, 1000))
            else:
                X_out_buff = da.from_array(np.hstack([X_ids, X, X_pos]),
                                           chunks=(1000, 1000))
            X_out.append(X_out_buff)
            ids = []
            X_pos = []
            X_i = []
            file_counter += 1

        # testing out mini-batch extractions
        if n >= checkpts or n+1 == len(bbdf):
            write_flag = True
            checkpts += minibatch_size
        if write_flag and size_flag and (not args.dry_run or args.write_dummy):
            write_flag = False
            write_buffer = np.concatenate(X_out, axis=0)
            # np.savez_compressed(filename + "_" + str(write_count),
            #                     write_buffer)
            # uncompressed hdf5, after all?
            outfilename = filename + "_" + str(write_count) + ".hdf5"
            with h5py.File(outfilename, 'w') as f:
                f.create_dataset('img_feats', data=write_buffer)
            # write_buffer = da.concatenate(X_out, axis=0)
            # da.to_hdf5(filename + "_" + str(write_count) + ".hdf5",
            #            'img_feats', write_buffer,
            #            compression="gzip", compression_opts=9,
            #            shuffle=True, chunks=True)
            write_count += 1
            X_out = []
        # ... and back to the for loop

    if not size_flag and (not args.dry_run or args.write_dummy):
        # X_out = da.concatenate(X_out, axis=0)
        X_out = np.concatenate(X_out, axis=0)
        print_timestamped_message('Made it through! Writing out..', indent=4)
        print(X_out.shape)
        with h5py.File(filename + '.hdf5', 'w') as f:
            f.create_dataset('img_feats', data=X_out)
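# A minimal read-back sketch (function name is an assumption): both the
# single-file output and the sharded outputs ('_1.hdf5', '_2.hdf5', ...)
# store one dataset called 'img_feats', so they can be loaded uniformly.
def load_img_feats(path):
    with h5py.File(path, 'r') as f:
        return f['img_feats'][:]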