Example #1
# flickr doesn't need to be split at the root node
def iterate_data(h5_file):
    for x in h5_file.root:
        yield x


f_nodes = [node for node in iterate_data(data_file)]

# split the database into train, test and validation sets; by default this uses
# the json file with the karpathy split
train, test, val = split_data_flickr(f_nodes, args.split_loc)
#####################################################
# network modules
img_net = img_encoder(image_config)
cap_net = text_rnn_encoder(char_config)

# create a trainer with just the evaluator for the purpose of testing a pretrained model
trainer = flickr_trainer(img_net, cap_net, args.visual, args.cap)
trainer.set_raw_text_batcher()
# optionally use cuda
if cuda:
    trainer.set_cuda()
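# the list below presumably holds the recall@k cut-offs for the retrieval evaluation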
trainer.set_evaluator([1, 5, 10])

# list all the trained model parameters
models = os.listdir(args.results_loc)
caption_models = [x for x in models if 'caption' in x]
img_models = [x for x in models if 'image' in x]

# run the image and caption retrieval and create an ensemble
img_models.sort()
caption_models.sort()
Example #2
# embed a batch of captions with a pretrained sentence embedder
# (the original function header was lost; name and signature reconstructed)
def embed_captions(params, sent, lengths):
    # sort the captions by length, longest first, presumably for packed rnn batching
    sort = np.argsort(-np.array(lengths))
    sent = sent[sort]
    lengths = np.array(lengths)[sort]
    sent = torch.autograd.Variable(torch.cuda.FloatTensor(sent))
    # embed the captions
    embeddings = params.sent_embedder(sent, lengths)
    embeddings = embeddings.data.cpu().numpy()
    # restore the original caption order
    embeddings = embeddings[np.argsort(sort)]
    return embeddings

# create config dictionaries with all the parameters for your encoders
text_config = {'embed':{'num_chars': 101, 'embedding_dim': 20, 'sparse': False, 'padding_idx': 0}, 
               'rnn':{'input_size': 20, 'hidden_size': 1024, 'num_layers': 1, 'batch_first': True,
               'bidirectional': True, 'dropout': 0}, 'att':{'in_size': 2048, 'hidden_size': 128, 'heads': 1}}
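# note: the attention in_size (2048) equals the bidirectional rnn output, 2 * 1024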
# create encoder
encoder = text_rnn_encoder(text_config)
for p in encoder.parameters():
    p.requires_grad = False
encoder.cuda()

models = os.listdir(PATH_TO_ENC)
models = [x for x in models if 'caption_model' in x]

for model in models:
    print(model)
    # load pretrained model
    encoder_state = torch.load(os.path.join(PATH_TO_ENC, model))
    encoder.load_state_dict(encoder_state)
    for p in encoder.parameters():
        p.requires_grad = False
    encoder.cuda()
Example #3
# flickr doesn't need to be split at the root node
def iterate_data(h5_file):
    for x in h5_file.root:
        yield x


f_nodes = [node for node in iterate_data(data_file)]

# split the database into train, test and validation sets; by default this uses
# the json file with the karpathy split
train, test, val = split_data_flickr(f_nodes, args.split_loc)
#####################################################
# network modules
img_net = img_encoder(image_config)
cap_net = text_rnn_encoder(token_config)

# list all the trained model parameters
models = os.listdir(args.results_loc)
caption_models = [x for x in models if 'caption' in x]
img_models = [x for x in models if 'image' in x]

# run the image and caption retrieval
img_models.sort()
caption_models.sort()
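# sorting the file names should line up each image model with the caption model
# saved at the same epoch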

# create a trainer with just the evaluator for the purpose of testing a pretrained model
trainer = flickr_trainer(img_net, cap_net, args.visual, args.cap)
trainer.set_token_batcher()
# optionally use cuda
if cuda:
    trainer.set_cuda()
Example #4
def create_encoders(preset_name, dict_size):
    if preset_name == 'rnn':
        # create config dictionaries with all the parameters for your encoders
        audio_config = {
            'conv': {
                'in_channels': 39,
                'out_channels': 64,
                'kernel_size': 6,
                'stride': 2,
                'padding': 0,
                'bias': False
            },
            'rnn': {
                'input_size': [64],
                'hidden_size': [1024],
                'n_layers': [4],
                'batch_first': True,
                'bidirectional': True,
                'dropout': 0,
                'max_len': 1024
            },
            'att': {
                'in_size': 2048,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 0,
                'n_embs': [],
                'emb_dim': []
            },
            'app_order': [0]
        }
        # calculate the required output size of the image encoder
        out_size = audio_config['rnn']['hidden_size'][-1] * 2 ** \
                   audio_config['rnn']['bidirectional'] * audio_config['att']['heads']
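        # e.g. 1024 * 2 ** True * 1 = 2048: the boolean exponent doubles the
        # width for a bidirectional rnn, matching the attention in_size above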
        image_config = {
            'linear': {
                'in_size': 2048,
                'out_size': out_size
            },
            'norm': True
        }
        img_net = img_encoder(image_config)
        cap_net = audio_rnn_encoder(audio_config)

    elif preset_name == 'rnn_VQ':
        # create config dictionaries with all the parameters for your encoders
        audio_config = {
            'conv': {
                'in_channels': 39,
                'out_channels': 64,
                'kernel_size': 6,
                'stride': 2,
                'padding': 0,
                'bias': False
            },
            'rnn': {
                'input_size': [64, 2048, 2048],
                'hidden_size': [1024, 1024, 1024],
                'n_layers': [1, 1, 2],
                'batch_first': True,
                'bidirectional': True,
                'dropout': 0,
                'max_len': 1024
            },
            'att': {
                'in_size': 2048,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 2,
                'n_embs': [128, 2048],
                'emb_dim': [2048, 2048]
            },
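            # app_order presumably gives the order in which the sub-modules are
            # applied: 0 picks the next rnn block, 1 the next VQ layer (three 0s
            # and two 1s, matching the three rnn blocks and two VQ layers above)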
            'app_order': [0, 1, 0, 1, 0],
        }
        # calculate the required output size of the image encoder
        out_size = audio_config['rnn']['hidden_size'][-1] * 2 ** \
                   audio_config['rnn']['bidirectional'] * audio_config['att']['heads']
        image_config = {
            'linear': {
                'in_size': 2048,
                'out_size': out_size
            },
            'norm': True
        }
        img_net = img_encoder(image_config)
        cap_net = audio_rnn_encoder(audio_config)

    elif preset_name == 'rnn_pack':
        audio_config = {
            'conv': {
                'in_channels': 39,
                'out_channels': 64,
                'kernel_size': 6,
                'stride': 2,
                'padding': 0,
                'bias': False
            },
            'rnn': {
                'input_size': [64, 1024],
                'hidden_size': [1024, 1024],
                'n_layers': [1, 3],
                'batch_first': True,
                'bidirectional': [True, True],
                'dropout': 0,
                'max_len': 1024
            },
            'rnn_pack': {
                'input_size': [2048],
                'hidden_size': [1024]
            },
            'att': {
                'in_size': 2048,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 1,
                'n_embs': [64],
                'emb_dim': [2048]
            },
            'app_order': ['rnn', 'VQ', 'rnn_pack', 'rnn'],
        }

        out_size = audio_config['rnn']['hidden_size'][-1] * 2 ** \
                   audio_config['rnn']['bidirectional'][-1] * audio_config['att']['heads']
        image_config = {
            'linear': {
                'in_size': 2048,
                'out_size': out_size
            },
            'norm': True
        }
        img_net = img_encoder(image_config)
        cap_net = rnn_pack_encoder(audio_config)

    elif preset_name == 'conv_VQ':

        audio_config = {
            'conv_init': {
                'in_channels': 39,
                'out_channels': 128,
                'kernel_size': 1,
                'stride': 1,
                'padding': 0
            },
            'conv': {
                'in_channels': [128, 128, 256, 512],
                'out_channels': [128, 256, 512, 1024],
                'kernel_size': [9, 9, 9, 9],
                'stride': [2, 2, 2, 2],
                'n_layers': 4
            },
            'att': {
                'in_size': 1024,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 2,
                'n_embs': [1024, 1024],
                'emb_dim': [128, 256]
            },
            'max_len': 1024,
            'app_order': [0, 1, 0, 1, 0, 0]
        }
        # get the required output size of the img encoder from audio_config
        out_size = audio_config['conv']['out_channels'][-1]
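        # here out_size is 1024: the width of the final conv layer, which also
        # matches the attention in_size above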
        image_config = {
            'linear': {
                'in_size': 2048,
                'out_size': out_size
            },
            'norm': True
        }
        img_net = img_encoder(image_config)
        cap_net = conv_VQ_encoder(audio_config)

    elif preset_name == 'conv':

        audio_config = {
            'conv_init': {
                'in_channels': 39,
                'out_channels': 128,
                'kernel_size': 1,
                'stride': 1,
                'padding': 0
            },
            'conv': {
                'in_channels': [128, 128, 256, 512],
                'out_channels': [128, 256, 512, 1024],
                'kernel_size': [9, 9, 9, 9],
                'stride': [2, 2, 2, 2],
                'n_layers': 4
            },
            'att': {
                'in_size': 1024,
                'hidden_size': 128,
                'heads': 1
            },
            'VQ': {
                'n_layers': 0,
                'n_embs': [],
                'emb_dim': []
            },
            'max_len': 1024,
            'app_order': [0, 0, 0, 0]
        }
        # get the required output size of the img encoder from audio_config
        out_size = audio_config['conv']['out_channels'][-1]
        image_config = {
            'linear': {
                'in_size': 2048,
                'out_size': out_size
            },
            'norm': True
        }
        img_net = img_encoder(image_config)
        cap_net = conv_VQ_encoder(audio_config)

    elif preset_name == 'rnn_text':
        # create config dictionaries with all the parameters for your encoders
        char_config = {
            'embed': {
                'num_chars': dict_size,
                'embedding_dim': 1024,
                'sparse': False,
                'padding_idx': 0
            },
            'rnn': {
                'input_size': 1024,
                'hidden_size': 1024,
                'n_layers': 1,
                'batch_first': True,
                'bidirectional': True,
                'dropout': 0,
                'max_len': 1024
            },
            'att': {
                'in_size': 2048,
                'hidden_size': 128,
                'heads': 1
            }
        }
        # calculate the required output size of the image encoder
        out_size = char_config['rnn']['hidden_size'] * 2 ** \
                   char_config['rnn']['bidirectional'] * char_config['att']['heads']
        image_config = {
            'linear': {
                'in_size': 2048,
                'out_size': out_size
            },
            'norm': True
        }
        img_net = img_encoder(image_config)
        cap_net = text_rnn_encoder(char_config)

    else:
        raise ValueError('unknown preset name: {}'.format(preset_name))

    return img_net, cap_net
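
A minimal usage sketch (assuming create_encoders and the encoder classes live in
a module named encoders, and that dict_size=101 is a placeholder vocabulary size):

import torch

from encoders import create_encoders  # hypothetical module name

# build the paired encoders for the character-level text preset
img_net, cap_net = create_encoders('rnn_text', dict_size=101)
# move both networks to the gpu if one is available, as the examples above do
if torch.cuda.is_available():
    img_net.cuda()
    cap_net.cuda()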