Example 1
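The examples below are excerpted from a larger codebase and do not carry their own imports. As a hedged sketch, they appear to rely on roughly the following module-level imports; repo-specific helpers (print_timestamped_message, get_image_part, compute_posfeats, join_imagenet_id, code_icorpus, filter_X_by_filelist, filter_refdf_by_filelist, filter_relational_expr, create_word2den, make_X_id_index, make_mask_matrix, train_this_word) and constants (ID_FEATS, ImID_Feat, MAX_ROWS_IMG, MAX_ROWS_OBJ, N_JOBS) are assumed to be defined elsewhere in the repository.

# Assumed imports (a sketch; not part of the original snippets).
import codecs
import json
import os
import sys
from collections import Counter, defaultdict
from glob import glob
from os.path import isfile

import dask.array as da
import h5py
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from keras.models import Model
from scipy.spatial.distance import pdist, squareform
from sklearn import linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from tqdm import tqdm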
def run_visual_sim(imfeat_dir, bbdf_dir, this_corp, n_most=200):
    print_timestamped_message('Computing vis sims for %s' % (this_corp))

    if this_corp == 'visgen':
        featfilename = 'vgregdf_rsn50-flatten_1-fi.npz'
        outfilename = bbdf_dir + '/visgen_vis_sim'
    elif this_corp == 'mscoco':
        featfilename = 'mscoco_bbdf_rsn50-flatten_1-fi.npz'
        outfilename = bbdf_dir + '/mscoco_vis_sim'
    else:
        raise ValueError('unknown corpus: %s' % (this_corp))

    if isfile(outfilename + '.npz'):
        print('%s exists. Will not overwrite. ABORTING.' % (outfilename + '.npz'))
        return

    print_timestamped_message('Loading up X')

    X_full = np.load(imfeat_dir + '/' + featfilename)['arr_0']
    X_ids = X_full[:, :ID_FEATS]
    X = X_full[:, ID_FEATS:]
    # Note: Whole image Xs do not have pos feats at end.

    vissim_most_sim_out = batched_similarity_computation(
        X, MAX_ROWS_IMG, n_most)
    np.savez_compressed(
        outfilename, vissim_most_sim_out,
        dict((n, v) for n, v in enumerate(X_ids[:, ImID_Feat])))
Example 2
def batched_similarity_computation(X, max_row, n_most):
    objsim_most_sim_out = []
    # NB: pdist is computed per batch, so the most similar rows are
    # searched within each batch of max_row rows only.
    for i in range(0, len(X), max_row):
        print_timestamped_message('... ... batch %d' % (i // max_row + 1))
        objsim_pdist = pdist(X[i:i+max_row], 'cosine')
        objsim = squareform(objsim_pdist)
        # indices of the n_most most similar rows (smallest cosine
        # distance; position 0 is the row itself, at distance 0)
        objsim_most_sim = np.apply_along_axis(
            lambda x: np.argsort(x)[:n_most], 1, objsim)
        objsim_most_sim_out.append(objsim_most_sim)

    return objsim_most_sim_out
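A minimal usage sketch (hypothetical data; shapes chosen to show the batching):

# Hypothetical usage: 1000 random 2048-d feature vectors, batched into
# chunks of 400 rows, keeping the 5 most similar rows for each row.
X_demo = np.random.rand(1000, 2048)
batches = batched_similarity_computation(X_demo, max_row=400, n_most=5)
print([b.shape for b in batches])  # [(400, 5), (400, 5), (200, 5)]
# entries are row indices *within* the respective batch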
Example 3
def get_sim_mat(df,
                id_col='image_id',
                cat_col='cat',
                n_dims=False,
                max_row=MAX_ROWS_OBJ,
                n_most=200):
    print_timestamped_message('... compiling objects per image list')
    # map each image to the bag (Counter) of object categories it contains
    im2ob = defaultdict(list)
    df.apply(lambda row: im2ob[row[id_col]].append(row[cat_col]), axis=1)
    im2ob = {key: Counter(val) for key, val in im2ob.items()}
    # remember which row of X corresponds to which image_id
    row2imid = {n: iid for n, iid in enumerate(im2ob.keys())}

    print_timestamped_message('... vectorizing')
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(im2ob.values())

    if n_dims:
        print_timestamped_message('... reducing dimensionality')
        svd = TruncatedSVD(n_components=n_dims, n_iter=7, random_state=42)
        X = svd.fit_transform(X)

    print_timestamped_message('... and finally, computing similarities')

    objsim_most_sim_out = batched_similarity_computation(X, max_row, n_most)

    return objsim_most_sim_out, row2imid
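A toy illustration of what get_sim_mat vectorizes (hypothetical data; print_timestamped_message assumed available):

# Hypothetical toy input: two images, three annotated objects.
df_demo = pd.DataFrame({'image_id': [1, 1, 2],
                        'cat': ['dog', 'dog', 'cat']})
sims, row2imid_demo = get_sim_mat(df_demo, max_row=10, n_most=2)
# the vectorized X is a 2x2 count matrix over categories:
#   image 1 -> {'dog': 2, 'cat': 0}, image 2 -> {'dog': 0, 'cat': 1}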
Example 4
def compute_feats(config, bbdf, model, preproc,
                  xs=224, ys=224, batch_size=100):

    full_image = config.get('runtime', 'full_image') == 'True'

    filename = config.get('runtime', 'out_dir') +\
        '/%s_%s' % (
            config.get('runtime', 'this_bbdf'),
            config.get('runtime', 'model'))
    if full_image:
        filename += '-fi'
    if isfile(filename + '.npz'):
        print('%s exists. Will not overwrite. ABORTING.' % (filename + '.npz'))
        return

    X_pos = []
    X_i = []
    ids = []
    file_counter = 1
    prev_iid, prev_img = (None, None)

    X_out = []

    if full_image:
        bbdf = bbdf.drop_duplicates(subset='image_id')
        bbdf = bbdf.reset_index()

    reg_col = 'region_id'  # default
    if 'obj_id' in bbdf.columns:  # some visgen bbdfs
        reg_col = 'obj_id'
    if 'subregion_id' in bbdf.columns:  # Flickr30k
        subreg = True
    else:
        subreg = False

    # FIXME, for debugging only! Reduced size or starting with offset
    # bbdf = bbdf[:100]

    for n, row in tqdm(bbdf.iterrows(), total=len(bbdf)):
        this_icorpus = row['i_corpus']
        this_image_id = row['image_id']
        this_region_id = row[reg_col]
        if subreg:  # this means that we are reading in Flickr30k...
            this_region_id = row[reg_col] + row['subregion_id'] / 100
        this_bb = row['bb']

        if full_image:
            this_bb = None
            this_region_id = 0

        #  When extracting feats for imagenet regions, must
        #  - create combined filename out of image_id and region_id
        #  - neutralise positional features, by setting bb given
        #    to pos feat computation to 0,0,w,h. So that all ImageNet
        #    regions end up with same positions.
        if code_icorpus[this_icorpus] == 'image_net':
            this_image_id_mod = join_imagenet_id(this_image_id,
                                                 this_region_id)
            this_bb_mod = [0, 0, this_bb[2], this_bb[3]]
        else:
            this_image_id_mod = this_image_id
            this_bb_mod = this_bb

        if this_bb_mod and np.min(this_bb_mod[2:]) <= 0:
            print('skipping over this image (%s,%d). Negative bb! %s' %
                  (code_icorpus[this_icorpus], this_image_id, str(this_bb_mod)))
            continue

        (prev_iid, prev_img), img_resized = \
            get_image_part(config, (prev_iid, prev_img),
                           this_icorpus, this_image_id_mod, this_bb,
                           xs=xs, ys=ys)

        if prev_img.ndim != 3 or prev_img.shape[2] != 3:
            print('skipping over this image (%s,%d). b/w?' %
                  (code_icorpus[this_icorpus], this_image_id))
            continue

        # If we continue below this line, getting region worked
        X_i.append(img_resized)
        this_pos_feats = compute_posfeats(prev_img, this_bb_mod)
        X_pos.append(this_pos_feats)
        ids.append(np.array([this_icorpus, this_image_id, this_region_id]))

        # is it time to do the actual extraction on this batch
        if (n+1) % batch_size == 0 or n+1 == len(bbdf):
            print_timestamped_message('new batch! (%d %d) Extracting!...' %
                                      (file_counter, n), indent=4)

            try:
                X_i = np.array(X_i)
                # print(X_i.shape)
                X = model.predict(preproc(X_i.astype('float64')))
            except ValueError as e:
                print('Exception! But why? Skipping this whole batch..')
                X_i = []
                ids = []
                X_pos = []
                continue
                # raise e

            X_ids = np.array(ids)
            X_pos = np.array(X_pos)
            print(X_ids.shape, X.shape, X_pos.shape)
            if full_image:
                X_out.append(np.hstack([X_ids, X]))
            else:
                X_out.append(np.hstack([X_ids, X, X_pos]))

            ids = []
            X_pos = []
            X_i = []
            file_counter += 1
    # after the for loop: stack all batches
    X_out = np.concatenate(X_out, axis=0)

    print_timestamped_message('Made it through! Writing out..', indent=4)
    print(X_out.shape)

    np.savez_compressed(filename, X_out)
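Each output row is laid out as [id columns | CNN features | positional features], with the id columns first, which is what run_visual_sim in Example 1 relies on. A minimal sketch of splitting a saved matrix back up, assuming ID_FEATS id columns:

# Hypothetical: split a saved feature matrix back into its parts.
X_full = np.load('mscoco_bbdf_rsn50-flatten_1.npz')['arr_0']
X_ids = X_full[:, :ID_FEATS]    # (i_corpus, image_id, region_id)
X_feats = X_full[:, ID_FEATS:]  # CNN features (+ pos feats, if regions)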
Example 5
    config.set('runtime', 'model', args.model)

    if arch == 'vgg19':
        from keras.applications.vgg19 import VGG19
        from keras.applications.vgg19 import preprocess_input as preproc
        base_model = VGG19(weights='imagenet')
    elif arch == 'rsn50':
        from keras.applications.resnet50 import ResNet50
        from keras.applications.resnet50 import preprocess_input as preproc
        base_model = ResNet50(weights='imagenet')
    else:
        raise ValueError('unknown architecture: %s' % (arch))
    model = Model(inputs=base_model.input,
                  outputs=base_model.get_layer(layer).output)

    print_timestamped_message('starting to extract, using %s %s...' %
                              (arch, layer))
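Given the feature-file names used elsewhere ('rsn50-flatten_1'), layer would be 'flatten_1' in the rsn50 case. A minimal sketch of running the truncated model (hypothetical input):

# Hypothetical: one 224x224 RGB image through the truncated network.
dummy = np.random.rand(1, 224, 224, 3).astype('float64')
feats = model.predict(preproc(dummy))
print(feats.shape)  # e.g. (1, 2048) for ResNet50's flatten_1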

    for this_bbdf in args.bbdf:
        print_timestamped_message('... %s' % (this_bbdf), indent=4)
        this_bbdf_base = bbdf_dir + '/' + this_bbdf + '.json'
        if isfile(this_bbdf_base + '.gz'):
            this_bbdf_path = this_bbdf_base + '.gz'
            bbdf = pd.read_json(this_bbdf_path,
                                orient='split',
                                compression='gzip')
        else:
            this_bbdf_path = this_bbdf_base
            if not isfile(this_bbdf_base):
                print "bbdf file (%s) not found. Aborting." % (this_bbdf_path)
                sys.exit(1)
            bbdf = pd.read_json(this_bbdf_base,
                                orient='split')
Example 6
def main(config):
    basename = os.path.splitext(os.path.basename(__file__))[0]
    print_timestamped_message('Starting to train model %s' % (basename))

    outfile_base = config.get('runtime', 'out_dir') + '/' + basename
    if isfile(outfile_base + '.npz'):
        print('%s exists. Will not overwrite. ABORTING.' %
              (outfile_base + '.npz'))
        return

    # Model description:
    model = {
        'rcorp': 'refcoco',  # ref corpus
        'cnn': 'rsn50-flatten_1',  # CNN used for vision feats
        'rel': 'excl',  # exclude relational expressions
        'wrdl': 'min',  # wordlist: minimal n occurrences...
        'wprm': 40,  # ... 40 times
        'clsf': 'logreg-l1',  # logistic regression, l1 regularized
        'nneg': 20000,  # maximally 20k neg instances
        'nsrc': 'randmax',  # ... randomly selected
        'notes': ''
    }
    classifier = linear_model.LogisticRegression
    classf_params = {'penalty': 'l1', 'warm_start': True}

    dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')
    preproc_path = dsgv_home + '/Preproc/PreprocOut/'
    feats_path = dsgv_home + '/ExtractFeats/ExtractOut/'

    # ========================= DATA =================================
    print_timestamped_message('loading up data.', indent=4)

    with open(preproc_path + 'refcoco_splits.json', 'r') as f:
        rc_splits = json.load(f)

    # Image features
    X = np.load(feats_path + 'mscoco_bbdf_rsn50-flatten_1.npz')['arr_0']
    X_t = filter_X_by_filelist(X, rc_splits['train'])

    # Referring expressions
    refcoco_refdf = pd.read_json(preproc_path + 'refcoco_refdf.json.gz',
                                 typ='frame',
                                 orient='split',
                                 compression='gzip')
    refdf_train = filter_refdf_by_filelist(refcoco_refdf, rc_splits['train'])

    refdf_train = filter_relational_expr(refdf_train)

    # ======================= Intermediate ==============================
    print_timestamped_message('creating intermediate data structures',
                              indent=4)
    word2den = create_word2den(refdf_train)
    X_idx = make_X_id_index(X_t)
    mask_matrix = make_mask_matrix(X_t, X_idx, word2den, word2den.keys())

    # ======================= Wordlist ==============================
    print_timestamped_message('selecting words to train models for', indent=4)
    min_freq = model['wprm']
    counts = mask_matrix.sum(axis=1)
    wordlist = np.array(list(word2den.keys()))[counts > min_freq]

    # ======================= TRAIN ==============================
    print_timestamped_message('and training the %d WACs!' % (len(wordlist)),
                              indent=4)

    wacs = Parallel(n_jobs=N_JOBS, require='sharedmem', prefer='threads')\
                   (delayed(train_this_word)(X_t, word2den, mask_matrix,
                                             model['nneg'],
                                             classifier, classf_params,
                                             this_word)
                    for this_word in wordlist)
    print('')  # newline, because train_this_word prints . as progress bar

    # ======================= SAVE ==============================
    print_timestamped_message('writing to disk', indent=4)
    weight_matrix = np.stack([
        np.append(this_wac.coef_, this_wac.intercept_)
        for this_wac in [w[3] for w in wacs]
    ])
    wordinfo = [e[:-1] for e in wacs]
    with open(outfile_base + '.json', 'w') as f:
        json.dump((model, wordinfo), f)
    np.savez_compressed(outfile_base + '.npz', weight_matrix)

    print_timestamped_message('DONE!')
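Each row of weight_matrix holds one word's logistic-regression coefficients with the intercept appended, so applying a trained WAC to a feature vector is a dot product plus sigmoid. A minimal sketch (hypothetical feature vector x):

# Hypothetical: score a feature vector with one row of weight_matrix.
def wac_score(weight_row, x):
    # weight_row = [coef_1, ..., coef_d, intercept]
    z = np.dot(weight_row[:-1], x) + weight_row[-1]
    return 1.0 / (1.0 + np.exp(-z))  # P(word | region features)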
Example 7
def run_objects_sim(bbdf_dir, this_corp):
    if this_corp == 'mscoco':
        print_timestamped_message('Computing sims for MSCOCO')

        outfilename = bbdf_dir + '/mscoco_sim'
        if isfile(outfilename + '.npz'):
            print('%s exists. Will not overwrite. ABORTING.' % (outfilename + '.npz'))
            return

        print_timestamped_message('... loading up bbdf')
        mscoco_bbdf = pd.read_json(bbdf_dir + '/mscoco_bbdf.json.gz',
                                   typ='frame',
                                   orient='split',
                                   compression='gzip')

        sim_sq, row2imid = get_sim_mat(mscoco_bbdf)

        print_timestamped_message('... compressing and writing to disk')
        np.savez_compressed(outfilename, sim_sq, row2imid)

    if this_corp == 'visgen':
        print_timestamped_message('Computing sims for Visual Genome')

        outfilename = bbdf_dir + '/visgen_sim'
        if isfile(outfilename + '.npz'):
            print('%s exists. Will not overwrite. ABORTING.' % (outfilename + '.npz'))
            return

        print_timestamped_message('... loading up bbdf')
        visgen_objdf = pd.read_json(bbdf_dir + '/vgobjdf.json.gz',
                                    typ='frame',
                                    orient='split',
                                    compression='gzip')
        visgen_objdf = visgen_objdf[~visgen_objdf['obj_id'].duplicated()]
        visgen_objdf = visgen_objdf[~visgen_objdf['syn'].isnull()]

        sim_sq, row2imid = get_sim_mat(visgen_objdf,
                                       cat_col='syn',
                                       n_dims=50,
                                       max_row=MAX_ROWS_OBJ)

        print_timestamped_message('... compressing and writing to disk')
        np.savez_compressed(outfilename, sim_sq, row2imid)
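Since a list of per-batch arrays and a Python dict are stored in the .npz, reading it back needs allow_pickle; a hedged sketch:

# Hypothetical: reload the similarity output written above.
loaded = np.load(bbdf_dir + '/mscoco_sim.npz', allow_pickle=True)
sim_sq_loaded = loaded['arr_0']           # per-batch neighbour arrays
row2imid_loaded = loaded['arr_1'].item()  # row index -> image_id dict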
Example 8
    try:
        with codecs.open(args.config_file, 'r', encoding='utf-8') as f:
            config.read_file(f)  # readfp() is deprecated in Python 3
    except IOError:
        print('no config file found at %s' % (args.config_file))
        sys.exit(1)

    if args.bbdf_dir:
        bbdf_dir = args.bbdf_dir
    elif config.has_option('DSGV-PATHS', 'bbdf_dir'):
        bbdf_dir = config.get('DSGV-PATHS', 'bbdf_dir')
    else:
        bbdf_dir = '../Preproc/PreprocOut'

    if args.imfeat_dir:
        imfeat_dir = args.imfeat_dir
    elif config.has_option('DSGV-PATHS', 'imfeat_dir'):
        imfeat_dir = config.get('DSGV-PATHS', 'imfeat_dir')
    else:
        imfeat_dir = '../ExtractFeats/ExtractOut'

    if args.mode == 'objects':
        for this_corp in args.corp:
            run_objects_sim(bbdf_dir, this_corp)
    if args.mode == 'visual':
        for this_corp in args.corp:
            run_visual_sim(imfeat_dir, bbdf_dir, this_corp)

    print_timestamped_message('Done!')
Example 9
def compute_feats(config, args, bbdf, model, preproc,
                  xs=224, ys=224, batch_size=100):

    full_image = args.full_image

    filename = config.get('runtime', 'out_dir') + \
        '/%s_%s' % (
            config.get('runtime', 'this_bbdf'),
            config.get('runtime', 'model'))
    if full_image:
        filename += '-fi'
    # if isfile(filename + '.npz'):
    if len(glob(filename + '*')) != 0:
        print('Output for %s exists. Will not overwrite. ABORTING.' % (filename))
        return

    X_pos = []
    X_i = []
    ids = []
    file_counter = 1
    prev_iid, prev_img = (None, None)

    X_out = []
    write_flag = False
    write_count = 1
    minibatch_size = args.write_batch
    checkpts = minibatch_size
    
    # FIXME, for debugging only! Reduced size or starting with offset
    # bbdf = bbdf[:100]
    if args.bbdf_slice:
        s, e = [int(e) for e in args.bbdf_slice.split(':')]
        bbdf = bbdf[s:e]

    if len(bbdf) > args.max_singlefile:
        size_flag = True
    else:
        size_flag = False

    if full_image:
        bbdf = bbdf.drop_duplicates(subset='image_id')
        bbdf = bbdf.reset_index()

    # if 'region_id' in bbdf.columns:  # default
    reg_col = 'region_id'
    if 'obj_id' in bbdf.columns:  # some visgen bbdfs
        reg_col = 'obj_id'
    if 'subregion_id' in bbdf.columns:  # Flickr30k
        subreg = True
        subreg_column = 'subregion_id'
    elif 'level' in bbdf.columns:  # ADE20k
        subreg = True
        subreg_column = 'level'
    else:
        subreg = False

    for n, row in tqdm(bbdf.iterrows(), total=len(bbdf)):
        this_icorpus = row['i_corpus']
        this_image_id = row['image_id']

        if full_image:
            this_bb = None
            this_region_id = 0
        else:
            this_bb = row['bb']
            this_region_id = row[reg_col]

        if subreg:  # this means that we are reading in Flickr30k...
            # .. or ADE20k
            this_region_id = row[reg_col] + row[subreg_column] / 100

        #  When extracting feats for imagenet regions, must
        #  - create combined filename out of image_id and region_id
        #  - neutralise positional features, by setting bb given
        #    to pos feat computation to 0,0,w,h. So that all ImageNet
        #    regions end up with same positions.
        if code_icorpus[this_icorpus] == 'image_net':
            this_image_id_mod = join_imagenet_id(this_image_id,
                                                 this_region_id)
            this_bb_mod = [0, 0, this_bb[2], this_bb[3]]
        elif code_icorpus[this_icorpus] == 'ade_20k':
            # somewhat regrettably, ade20k wasn't preprocessed to
            # use our normal format. this is coming back to haunt
            # us here, as we need to create the image id from
            # other columns.. this will only work on ade_imgdf, not on ade_objdf
            this_image_id_mod = (row['split'], row['image_cat'], row['filename'])
            this_bb_mod = this_bb
        elif code_icorpus[this_icorpus] == 'cub_birds':
            this_image_id_mod = row['image_path']
            this_bb_mod = this_bb
        else:
            this_image_id_mod = this_image_id
            this_bb_mod = this_bb

        if this_bb_mod and np.min(this_bb_mod[2:]) <= 0:
            print('skipping over this image (%s,%d). Negative bb! %s' %
                  (code_icorpus[this_icorpus], this_image_id, str(this_bb_mod)))
            continue

        try:
            (prev_iid, prev_img), img_resized = \
                get_image_part(config, (prev_iid, prev_img),
                               this_icorpus, this_image_id_mod, this_bb_mod,
                               xs=xs, ys=ys)
        except ValueError:
            print('skipping over this image (%s,%d). corrupted??' %
                  (code_icorpus[this_icorpus], this_image_id))
            continue

        if prev_img.ndim != 3 or prev_img.shape[2] != 3:
            print('skipping over this image (%s,%d). b/w?' %
                  (code_icorpus[this_icorpus], this_image_id))
            continue

        # If we continue below this line, getting region worked
        X_i.append(img_resized)
        this_pos_feats = compute_posfeats(prev_img, this_bb_mod)
        X_pos.append(this_pos_feats)
        ids.append(np.array([this_icorpus, this_image_id, this_region_id]))

        # is it time to do the actual extraction on this batch
        if (n+1) % batch_size == 0 or n+1 == len(bbdf):
            print_timestamped_message('new batch! (%d %d) Extracting!...' %
                                      (file_counter, n), indent=4)

            try:
                X_i = np.array(X_i)
                # print X_i.shape
                X = model.predict(preproc(X_i.astype('float64')))
            except ValueError:
                print('Exception! But why? Skipping this whole batch..')
                X_i = []
                ids = []
                X_pos = []
                continue
                # raise e

            X_ids = np.array(ids)
            X_pos = np.array(X_pos)
            print(X_ids.shape, X.shape, X_pos.shape)
            if full_image:
                stacked = np.hstack([X_ids, X])
            else:
                stacked = np.hstack([X_ids, X, X_pos])
            # buffer the batch as a chunked dask array
            X_out.append(da.from_array(stacked, chunks=(1000, 1000)))

            ids = []
            X_pos = []
            X_i = []
            file_counter += 1
    
        # testing out mini-batch extractions
        if n >= checkpts or n+1 == len(bbdf):
            write_flag = True
            checkpts += minibatch_size

        if write_flag and size_flag and (not args.dry_run or args.write_dummy):
            write_flag = False

            write_buffer = np.concatenate(X_out, axis=0)
            # np.savez_compressed(filename + "_" + str(write_count), write_buffer)
            # uncompressed hdf5, after all?
            outfilename = filename + "_" + str(write_count) + ".hdf5"
            with h5py.File(outfilename, 'w') as f:
                f.create_dataset('img_feats', data=write_buffer)

            # write_buffer = da.concatenate(X_out, axis=0)
            # da.to_hdf5(filename + "_" + str(write_count) + ".hdf5",
            #            'img_feats', write_buffer,
            #            compression="gzip", compression_opts=9,
            #            shuffle=True, chunks=True)

            write_count += 1
            X_out = []

    # after the for loop: if everything fit in one file, write it out now
    if not size_flag and (not args.dry_run or args.write_dummy):
        # X_out = da.concatenate(X_out, axis=0)
        X_out = np.concatenate(X_out, axis=0)

        print_timestamped_message('Made it through! Writing out..', indent=4)
        print(X_out.shape)

        with h5py.File(filename + '.hdf5', 'w') as f:
            f.create_dataset('img_feats', data=X_out)
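A minimal sketch of reading the written features back (single-file case):

# Hypothetical: load the extracted features back from disk.
with h5py.File(filename + '.hdf5', 'r') as f:
    X_loaded = f['img_feats'][:]
print(X_loaded.shape)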