def main(params):
    """Preprocess a captioning dataset and dump labels plus ResNet features.

    Steps, in order: load the dataset description, preprocess the captions,
    build a 1-indexed vocabulary, assign splits, encode the captions, run
    every image through a ResNet on the GPU, and write the results.

    Files written (``<tag>`` is the ResNet variant name used below):

    * ``params['output_h5'] + '_<tag>_label.h5'`` -- uint32 datasets
      ``labels``, ``label_start_ix``, ``label_end_ix``, ``label_length``.
    * ``params['output_h5'] + '_<tag>_fc.h5'`` -- float32 dataset ``fc`` of
      shape ``(N, 2048)``.
    * ``params['output_h5'] + '_<tag>_att.h5'`` -- float32 dataset ``att`` of
      shape ``(N, 14, 14, 2048)``.
    * ``params['output_json']`` -- ``ix_to_word`` table and, per image, its
      ``split`` plus ``filepath`` / ``id`` when present.

    Args:
        params: dict. The keys read directly here are ``input_json``,
            ``images_root``, ``output_h5`` and ``output_json``; the dict is
            also handed to ``build_vocab``, ``assign_splits`` and
            ``encode_captions``, which may read further keys.

    Every image entry must provide ``filepath`` and ``filename``. A CUDA
    device is required (the model and every image are moved to the GPU).
    """
    # Use a context manager so the input handle is closed deterministically.
    with open(params['input_json'], 'r') as f_in:
        data = json.load(f_in)

    seed(123)  # make reproducible
    imgs = data["images"]
    prepro_captions(imgs)

    # create the vocab
    vocab = build_vocab(imgs, params)
    itow = {i + 1: w for i, w in enumerate(vocab)}  # a 1-indexed vocab translation table
    wtoi = {w: i + 1 for i, w in enumerate(vocab)}  # inverse table

    # assign the splits
    assign_splits(imgs, params)

    # encode captions in large arrays, ready to ship to hdf5 file
    L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)

    import misc.resnet as resnet
    # The tag is embedded in the output file names, so it must name the
    # architecture that is actually loaded. 'resnet151' is not a ResNet
    # variant: it fell through to the ResNet-152 branch while labelling the
    # outputs with a model that does not exist.
    resnet_type = 'resnet152'
    # Bind the model to its own name instead of overwriting the module alias.
    if resnet_type == 'resnet101':
        net = resnet.resnet101()
        net.load_state_dict(torch.load('resnet/resnet101.pth'))
    else:
        net = resnet.resnet152()
        net.load_state_dict(torch.load('resnet/resnet152.pth'))
    my_resnet = myResnet(net)
    my_resnet.cuda()
    my_resnet.eval()

    # create output h5 files; `with` guarantees they are closed (and flushed)
    # even if an image fails to load half-way through the dataset.
    N = len(imgs)
    h5_prefix = params['output_h5'] + '_' + resnet_type
    with h5py.File(h5_prefix + '_label.h5', "w") as f_lb:
        f_lb.create_dataset("labels", dtype='uint32', data=L)
        f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
        f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
        f_lb.create_dataset("label_length", dtype='uint32', data=label_length)

    ### extract features
    with h5py.File(h5_prefix + '_fc.h5', "w") as f_fc:
        with h5py.File(h5_prefix + '_att.h5', "w") as f_att:
            dset_fc = f_fc.create_dataset("fc", (N, 2048), dtype='float32')
            dset_att = f_att.create_dataset("att", (N, 14, 14, 2048), dtype='float32')
            for i, img in enumerate(imgs):
                # load the image; joining the components directly keeps
                # images_root in the path even when 'filepath' is empty
                # (filepath + "/" + filename would become absolute there).
                I = skimage.io.imread(
                    os.path.join(params['images_root'], img['filepath'], img['filename']))
                # handle grayscale input images: replicate the single channel
                if len(I.shape) == 2:
                    I = I[:, :, np.newaxis]
                    I = np.concatenate((I, I, I), axis=2)

                # assumes 8-bit pixel values, scaled here to [0, 1] -- TODO confirm
                I = I.astype('float32') / 255.0
                # channels-last -> channels-first, then onto the GPU
                I = torch.from_numpy(I.transpose([2, 0, 1])).cuda()
                I = Variable(preprocess(I), volatile=True)
                tmp_fc, tmp_att = my_resnet(I)
                # write to h5
                dset_fc[i] = tmp_fc.data.cpu().float().numpy()
                dset_att[i] = tmp_att.data.cpu().float().numpy()
                if i % 1000 == 0:
                    print('processing %d/%d (%.2f%% done)' % (i, N, i * 100.0 / N))
    print('wrote  %s' % params['output_h5'])

    # create output json file
    out = {}
    out['ix_to_word'] = itow  # encode the (1-indexed) vocab
    out['images'] = []
    for img in imgs:
        jimg = {}
        jimg['split'] = img['split']
        if 'filepath' in img:
            jimg['filepath'] = img['filepath']  # copy it over, might need
        if 'id' in img:
            jimg['id'] = img['id']  # copy over & maintain an id, if present (e.g. coco ids, useful)
        out['images'].append(jimg)

    # Close the output explicitly so the JSON is fully flushed to disk.
    with open(params['output_json'], 'w') as f_out:
        json.dump(out, f_out)
    print('wrote  %s' % params['output_json'])
Beispiel #2
0
import torch
from torch.autograd import Variable
import skimage
import skimage.io
import scipy.misc

from torchvision import transforms as trn
# Per-channel normalisation applied to an image tensor before it is fed to
# the network. The pipeline deliberately contains no trn.ToTensor() step.
# NOTE(review): the constants look like the usual ImageNet RGB mean/std --
# confirm they match the pretrained weights in use.
preprocess = trn.Compose(
    [trn.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

from misc.resnet_utils import myResnet
import misc.resnet as resnet

# Build the feature extractor once, at import time.
# NOTE: this rebinds the module alias `resnet` (misc.resnet) to the model
# instance, so the module is no longer reachable under that name afterwards.
resnet = resnet.resnet101()
# Weights are read from a hard-coded absolute path on the local machine.
resnet.load_state_dict(
    torch.load('/datadrive/resnet_pretrianed_t7/resnet101.pth'))
# Wrap the network with the project's myResnet helper, move it to the GPU
# (a CUDA device is required just to import this module) and switch it to
# evaluation mode.
my_resnet = myResnet(resnet)
my_resnet.cuda()
my_resnet.eval()


class DataLoaderRaw():
    """Loader configured from a dict-like options object.

    Attributes set by the constructor:
        opt: the options object, stored as given.
        coco_json: ``opt['coco_json']`` or ``''`` when absent.
        folder_path: ``opt['folder_path']`` or ``''`` when absent.
        batch_size: ``opt['batch_size']`` or ``1`` when absent.
        seq_per_img: always ``1``.
    """

    def __init__(self, opt):
        """Copy the recognised options from *opt*, falling back to defaults."""
        self.opt = opt

        # (option name, value used when the option is missing)
        option_defaults = (
            ('coco_json', ''),
            ('folder_path', ''),
            ('batch_size', 1),
        )
        for option, fallback in option_defaults:
            setattr(self, option, opt.get(option, fallback))

        self.seq_per_img = 1
def main(params):
    """Preprocess a captioning dataset and dump labels plus ResNet features.

    Loads the dataset description, preprocesses the captions, builds a
    1-indexed vocabulary, assigns splits, encodes the captions, extracts
    features for every image with a ResNet on the GPU, and writes:

    * ``params['output_h5'] + '_resnet101_label.h5'`` -- uint32 datasets
      ``labels``, ``label_start_ix``, ``label_end_ix``, ``label_length``.
    * ``params['output_h5'] + '_resnet101_fc.h5'`` -- float32 dataset ``fc``
      of shape ``(N, 2048)``.
    * ``params['output_h5'] + '_resnet101_att.h5'`` -- float32 dataset
      ``att`` of shape ``(N, 14, 14, 2048)``.
    * ``params['output_json']`` -- ``ix_to_word`` table and, per image, its
      ``split`` plus ``filepath`` / ``id`` when present.

    Args:
        params: dict. The keys read directly here are ``input_json``,
            ``images_root``, ``output_h5`` and ``output_json``; the dict is
            also handed to ``build_vocab``, ``assign_splits`` and
            ``encode_captions``, which may read further keys.

    Every image entry must provide ``filepath`` and ``filename``. A CUDA
    device is required (the model and every image are moved to the GPU).
    """
    # Use a context manager so the input handle is closed deterministically.
    with open(params['input_json'], 'r') as f_in:
        data = json.load(f_in)

    seed(123)  # make reproducible
    imgs = data["images"]
    prepro_captions(imgs)

    # create the vocab
    vocab = build_vocab(imgs, params)
    itow = {i + 1: w
            for i, w in enumerate(vocab)
            }  # a 1-indexed vocab translation table
    wtoi = {w: i + 1 for i, w in enumerate(vocab)}  # inverse table

    # assign the splits
    assign_splits(imgs, params)

    # encode captions in large arrays, ready to ship to hdf5 file
    L, label_start_ix, label_end_ix, label_length = encode_captions(
        imgs, params, wtoi)

    import misc.resnet as resnet
    # The tag selects the architecture and is embedded in the output names.
    resnet_type = 'resnet101'
    # Bind the model to its own name instead of overwriting the module alias.
    if resnet_type == 'resnet101':
        net = resnet.resnet101()
        net.load_state_dict(torch.load('resnet/resnet101.pth'))
    else:
        net = resnet.resnet152()
        net.load_state_dict(torch.load('resnet/resnet152.pth'))
    my_resnet = myResnet(net)
    my_resnet.cuda()
    my_resnet.eval()

    # create output h5 files; `with` guarantees they are closed (and flushed)
    # even if an image fails to load half-way through the dataset.
    N = len(imgs)
    h5_prefix = params['output_h5'] + '_' + resnet_type
    with h5py.File(h5_prefix + '_label.h5', "w") as f_lb:
        f_lb.create_dataset("labels", dtype='uint32', data=L)
        f_lb.create_dataset("label_start_ix", dtype='uint32',
                            data=label_start_ix)
        f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
        f_lb.create_dataset("label_length", dtype='uint32', data=label_length)

    ### extract features
    with h5py.File(h5_prefix + '_fc.h5', "w") as f_fc:
        with h5py.File(h5_prefix + '_att.h5', "w") as f_att:
            dset_fc = f_fc.create_dataset("fc", (N, 2048), dtype='float32')
            dset_att = f_att.create_dataset("att", (N, 14, 14, 2048),
                                            dtype='float32')
            for i, img in enumerate(imgs):
                # load the image; joining the components directly keeps
                # images_root in the path even when 'filepath' is empty
                # (filepath + "/" + filename would become absolute there).
                image_path = os.path.join(params['images_root'],
                                          img['filepath'], img['filename'])
                I = skimage.io.imread(image_path)
                # handle grayscale input images: replicate the single channel
                if len(I.shape) == 2:
                    I = I[:, :, np.newaxis]
                    I = np.concatenate((I, I, I), axis=2)

                # assumes 8-bit pixel values, scaled here to [0, 1] -- TODO confirm
                I = I.astype('float32') / 255.0
                # channels-last -> channels-first, then onto the GPU
                I = torch.from_numpy(I.transpose([2, 0, 1])).cuda()
                I = Variable(preprocess(I), volatile=True)
                tmp_fc, tmp_att = my_resnet(I)
                # write to h5
                dset_fc[i] = tmp_fc.data.cpu().float().numpy()
                dset_att[i] = tmp_att.data.cpu().float().numpy()
                if i % 1000 == 0:
                    print('processing %d/%d (%.2f%% done)' %
                          (i, N, i * 100.0 / N))
    print('wrote  %s' % params['output_h5'])

    # create output json file
    out = {}
    out['ix_to_word'] = itow  # encode the (1-indexed) vocab
    out['images'] = []
    for img in imgs:
        jimg = {}
        jimg['split'] = img['split']
        if 'filepath' in img:
            jimg['filepath'] = img['filepath']  # copy it over, might need
        if 'id' in img:
            # copy over & maintain an id, if present (e.g. coco ids, useful)
            jimg['id'] = img['id']
        out['images'].append(jimg)

    # Close the output explicitly so the JSON is fully flushed to disk.
    with open(params['output_json'], 'w') as f_out:
        json.dump(out, f_out)
    print('wrote  %s' % params['output_json'])