def main(model_name, model_type, feats_name, img_savepath, query_file, vocab_file, cuda):
    """Generate outfits for the image (and optional text) queries of a JSON file.

    For every query the first image seeds a forward and a backward LSTM pass
    (``run_forward_lstm`` / ``run_backward_lstm``). When the query holds at
    least two images, the gap between the query items is filled with
    ``run_fill_lstm`` and both passes are run again on the filled set. If the
    query has a text part, every generated (non-query) item is additionally
    replaced through ``nn_search`` using the text embedding. The resulting
    outfits are rendered with ``create_img_outfit``.

    Args:
        model_name: path of the saved state dict, handed to ``torch.load``.
        model_type: one of 'inception', 'vgg' or 'squeezenet'.
        feats_name: path of the HDF5 file; its 'filenames' and 'features'
            datasets are read in lockstep.
        img_savepath: directory in which the outfit images are written.
        query_file: path of the JSON file with the queries; each query is
            read through its 'image_query' and 'text_query' keys.
        vocab_file: path of the JSON vocabulary handed to ``get_one_hot``.
        cuda: truthy to move the model and the feature tensors to the GPU.

    Returns:
        None. An unknown ``model_type`` prints a message and returns before
        any file is opened.
    """
    # Validate the model type first: a wrong argument is reported without
    # opening (and leaking) any data file.
    if model_type == 'inception':
        model = inception(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'vgg':
        model = vgg(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'squeezenet':
        model = squeezenet(512, 512, 2480, batch_first=True, dropout=0.7)
    else:
        print("Please, specify a valid model type: inception, vgg, squeezenet "
              "instead of %s" % model_type)
        return

    with open(query_file) as query_fp:
        queries = json.load(query_fp)
    with open(vocab_file) as vocab_fp:
        vocab = json.load(vocab_fp)

    with h5py.File(feats_name, 'r') as data:
        data_dict = dict(zip(data['filenames'], data['features']))
        # Add a zero vector to the features data for start/stop.
        data_dict['zeros'] = np.zeros_like(data['features'][0])
    zero_idx = list(data_dict.keys()).index('zeros')

    # Load the model weights.
    if cuda:
        model = model.cuda()
    model.load_state_dict(torch.load(model_name))
    model.eval()

    txt_trf = TextTransforms()

    def load_feats(items):
        """Stack the stored features of ``items`` and L2-normalise every row."""
        feats = torch.from_numpy(np.array([data_dict[item] for item in items]))
        feats = torch.nn.functional.normalize(feats, p=2, dim=1)
        return feats.cuda() if cuda else feats

    # ``dict.values()`` is a view on Python 3; it must be materialised,
    # otherwise numpy wraps the view object itself in a 0-d object array.
    answers_feats = torch.from_numpy(np.array(list(data_dict.values())))
    answers_feats = torch.nn.functional.normalize(answers_feats, p=2, dim=1)
    if cuda:
        answers_feats = answers_feats.cuda()

    for nq, query in enumerate(queries):
        image_query = query['image_query']

        # Generate an outfit from the first image (forward and backward
        # prediction until start/stop).
        query_feats = load_feats(image_query)
        first_prod = query_feats[0].unsqueeze(0)
        forward_seq = run_forward_lstm(model, first_prod, answers_feats,
                                       data_dict, zero_idx, cuda)
        backward_seq = run_backward_lstm(model, first_prod, answers_feats,
                                         data_dict, zero_idx, cuda)

        # If there are more images, substitute the nearest one by the query
        # and recompute.
        if len(image_query) >= 2:
            # Full sequence (backward + first product + forward).
            first_sequence = backward_seq + [image_query[0]] + forward_seq
            seq_feats = load_feats(first_sequence)

            positions = [len(backward_seq)]  # Position of the first query item.
            for i in range(len(image_query) - 1):
                # Nearest neighbour of the next query item inside the sequence.
                dists = torch.mm(query_feats[i + 1].unsqueeze(0),
                                 seq_feats.permute(1, 0))
                _, idx = torch.max(dists, 1)
                positions.append(idx)

            # NOTE(review): the original layout of this function was lost, so
            # placing the code below after the loop (rather than inside it) is
            # an assumption; both layouts behave the same for two-image
            # queries. With three or more images neither branch is guaranteed
            # to match, which leaves start/end undefined - confirm the
            # intended behaviour before relying on such queries.
            start_pos = np.min(positions)
            end_pos = np.max(positions)
            if start_pos == positions[0]:
                start_feats = query_feats[0].unsqueeze(0)
                end_feats = query_feats[i + 1].unsqueeze(0)
                start_item = image_query[0]
                end_item = image_query[i + 1]
            elif end_pos == positions[0]:
                start_feats = query_feats[i + 1].unsqueeze(0)
                end_feats = query_feats[0].unsqueeze(0)
                start_item = image_query[i + 1]
                end_item = image_query[0]

            blanks = run_fill_lstm(model, start_feats, end_feats,
                                   end_pos - start_pos - 1, answers_feats,
                                   data_dict, zero_idx, cuda)
            sets = [start_item] + blanks + [end_item]
            sets_feats = load_feats(sets)

            # Run the bidirectional LSTM again on the filled set.
            forward_seq = run_forward_lstm(model, sets_feats, answers_feats,
                                           data_dict, zero_idx, cuda)
            backward_seq = run_backward_lstm(model, sets_feats, answers_feats,
                                             data_dict, zero_idx, cuda)
            sets = backward_seq + sets + forward_seq
            positions = [len(backward_seq), len(sets) - len(forward_seq) - 1]
        else:
            sets = backward_seq + image_query + forward_seq
            # The single query item sits right after the backward sequence.
            # The original left ``positions`` unset (or stale) on this path.
            positions = [len(backward_seq)]

        create_img_outfit(sets, positions,
                          os.path.join(img_savepath, "%d.jpg" % nq))

        # The text variant only exists when a text query was given; the
        # original referenced ``sets_text`` / ``text_query`` unconditionally.
        if query['text_query']:
            text_query = txt_trf.normalize(query['text_query'])
            texts = torch.stack([get_one_hot(word, vocab)
                                 for word in text_query.split()])
            texts = torch.autograd.Variable(texts)
            if cuda:
                texts = texts.cuda()
            text_query_feat = model.textn(texts)
            text_query_feat = torch.mean(
                text_query_feat.view(len(text_query_feat), -1), 0)
            text_query_feat = torch.nn.functional.normalize(
                text_query_feat.unsqueeze(0), p=2, dim=1)

            # Keep the query images, replace every generated item.
            sets_text = sets[:]
            for pos, item in enumerate(sets):
                if item not in image_query:
                    sets_text[pos] = nn_search(item, text_query_feat, data_dict,
                                               answers_feats, cuda)
            create_img_outfit(sets_text, positions,
                              os.path.join(img_savepath,
                                           "%d_%s.jpg" % (nq, text_query)))
def config(net_params, data_params, opt_params, cuda_params):
    """Get parameters to configure the experiment and prepare the needed variables.

    Args:
        - net_params: sequence unpacked, in this order, into:
            model type (str): 'inception', 'vgg' or 'squeezenet'
            input_dimension for LSTM (int)
            hidden_dimension for LSTM (int)
            margin for contrastive loss (float)
            size of the vocabulary (int)
            load_path for loading weights (str) (None to skip loading)
            freeze (bool) whether or not freezing the cnn layers
        - data_params: dictionary with keys:
            'img_dir': path to the directory where images are (string)
            'json_dir': path to the directory where jsons are (string)
            'json_files': names of the train, test and validation jsons
                (dictionary with keys 'train', 'test' and 'val')
            'batch_size': batch_size (int)
            'batch_first': batch_first (bool) for the LSTM sequences
        - opt_params: dictionary with keys:
            'learning_rate': learning rate value (float)
            'weight_decay': weight decay value (float)
        - cuda_params: dictionary with keys:
            'cuda': (bool): whether to use GPU or not
            'multigpu': (list of int): indices of GPUs to use

    Returns:
        - model: pytorch model to train
        - dataloaders: data iterators for 'train', 'test' and 'val'
        - optimizer: SGD optimizer over the trainable parameters
        - criterion: LSTM loss to train
        - contrastive_criterion: contrastive loss built with the margin

        None is returned (after printing a message) when the model type is
        not one of the supported ones.
    """
    model_type, input_dim, hidden_dim, margin, vocab_size, load_path, freeze = net_params

    # The three networks only differ in constructor, input resolution and in
    # whether the image tensor is normalised after ``ToTensor``.
    if model_type == 'inception':
        build_model, img_size, use_norm = inception, 299, False
    elif model_type == 'vgg':
        build_model, img_size, use_norm = vgg, 224, True
    elif model_type == 'squeezenet':
        build_model, img_size, use_norm = squeezenet, 227, True
    else:
        print("Please, specify a valid model type: inception, vgg or squeezenet "
              "instead of %s" % model_type)
        return None

    model = build_model(input_dim, hidden_dim, vocab_size, data_params['batch_first'],
                        dropout=0.7, freeze=freeze)

    to_tensor = torchvision.transforms.ToTensor()
    norm_trf = None
    if use_norm:
        norm_trf = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])
    train_trf = ImageTransforms(img_size + 6, 5, img_size, 0.5)
    test_trf = ImageTransforms(img_size)

    def finish(img):
        """Convert an image to a tensor and normalise it when required."""
        tensor = to_tensor(img)
        return tensor if norm_trf is None else norm_trf(tensor)

    def img_train_tf(img):
        """Training pipeline: resize, random flip, random rotation, random crop."""
        img = train_trf.resize(img)
        img = train_trf.random_horizontal_flip(img)
        img = train_trf.random_rotation(img)
        img = train_trf.random_crop(img)
        return finish(img)

    def img_test_val_tf(img):
        """Evaluation pipeline: resize only."""
        return finish(test_trf.resize(img))

    def txt_train_tf(text):
        """Training text pipeline: normalise, then randomly delete."""
        return TXT_TRF.random_delete(TXT_TRF.normalize(text))

    img_transforms = {'train': img_train_tf,
                      'test': img_test_val_tf,
                      'val': img_test_val_tf}
    txt_transforms = {'train': txt_train_tf,
                      'test': TXT_TEST_VAL_TF,
                      'val': TXT_TEST_VAL_TF}

    if load_path is not None:
        print("Loading weights from %s" % load_path)
        model.load_state_dict(torch.load(load_path))

    if cuda_params['cuda']:
        print("Switching model to gpu")
        model.cuda()

    if cuda_params['multigpu']:
        print("Switching model to multigpu")
        # The original evaluated ``ast.literal_eval(multigpu[0])`` here: it
        # read an undefined name (NameError on every multi-GPU run) and its
        # result was never used, so the statement is dropped. The device ids
        # are taken directly from ``cuda_params['multigpu']``.
        model.cuda()
        model = nn.DataParallel(model, device_ids=cuda_params['multigpu'])

    dataloaders = {}
    for split in ['train', 'test', 'val']:
        dataset = PolyvoreDataset(
            os.path.join(data_params['json_dir'], data_params['json_files'][split]),
            data_params['img_dir'],
            img_transform=img_transforms[split],
            txt_transform=txt_transforms[split])
        dataloaders[split] = torch.utils.data.DataLoader(
            dataset,
            batch_size=data_params['batch_size'],
            shuffle=True,
            num_workers=24,
            collate_fn=collate_seq,
            pin_memory=True)

    # Optimize only the layers with requires_grad = True, not the frozen layers:
    trainable = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.SGD(trainable,
                          lr=opt_params['learning_rate'],
                          weight_decay=opt_params['weight_decay'])
    criterion = LSTMLosses(data_params['batch_first'], cuda_params['cuda'])
    contrastive_criterion = SBContrastiveLoss(margin)

    return model, dataloaders, optimizer, criterion, contrastive_criterion
def main(model_name, model_type, feats_name, img_savepath, cuda,
         outfit_file='data/label/fill_in_blank_test.json'):
    """Evaluate a model on the fill-in-the-blank (FITB) task.

    For every outfit the question items are fed to ``model.lstm``; the
    forward and/or backward hidden state next to the blank is compared with
    the candidate answers through ``predict_single_direction`` or
    ``predict_multi_direction``. A prediction is counted as correct when the
    predicted answer index is 0. An image per outfit is written with
    ``create_img_fitb`` and the overall accuracy is printed.

    Args:
        model_name: path of the saved state dict, handed to ``torch.load``.
        model_type: one of 'inception', 'vgg' or 'squeezenet'.
        feats_name: path of the HDF5 file; its 'filenames' and 'features'
            datasets are read in lockstep.
        img_savepath: directory in which the result images are written.
        cuda: truthy to move the model and the feature tensors to the GPU.
        outfit_file: path of the FITB JSON file; each entry is read through
            its 'question', 'answers' and 'blank_position' keys. Defaults to
            the path that used to be hard-coded.

    Returns:
        None. An unknown ``model_type`` prints a message and returns before
        any file is opened.
    """
    # Validate the model type first: a wrong argument is reported without
    # opening (and leaking) any data file.
    if model_type == 'inception':
        model = inception(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'vgg':
        model = vgg(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'squeezenet':
        model = squeezenet(512, 512, 2480, batch_first=True, dropout=0.7)
    else:
        print("Please, specify a valid model type: inception, vgg or squeezenet "
              "instead of %s" % model_type)
        return

    with open(outfit_file) as outfit_fp:
        fitb = json.load(outfit_fp)

    with h5py.File(feats_name, 'r') as data:
        data_dict = dict(zip(data['filenames'], data['features']))

    # Load the model weights.
    if cuda:
        model = model.cuda()
    model.load_state_dict(torch.load(model_name))
    model.eval()

    def load_feats(items):
        """Stack the stored features of ``items`` and L2-normalise every row."""
        feats = torch.from_numpy(np.array([data_dict[item] for item in items]))
        feats = torch.nn.functional.normalize(feats, p=2, dim=1)
        return feats.cuda() if cuda else feats

    def lstm_states(feats):
        """Run ``feats`` through the LSTM as a batch of one; return the raw output data."""
        out, _ = model.lstm(torch.autograd.Variable(feats).unsqueeze(0))
        return out.data

    n_outfits = len(fitb)
    scores = []
    tic = time.time()
    for i, outfit in enumerate(fitb):
        remaining = (time.time() - tic) / (i + 1) * (n_outfits - i)
        sys.stdout.write('Outfit %d/%d - %2.f secs remaining\r'
                         % (i, n_outfits, remaining))
        sys.stdout.flush()

        question_feats = load_feats(outfit['question'])
        answers_feats = load_feats(outfit['answers'])
        answers_var = torch.autograd.Variable(answers_feats)

        # 'blank_position' is shifted down by one before it is used as an index.
        position = outfit['blank_position'] - 1
        if position == 0:
            # Blank at the beginning: only the second half of the LSTM output
            # (named "bw" by the original code), taken at the first step.
            out = lstm_states(question_feats)
            bw_hidden = out[0, :question_feats.size(0), out.size(2) // 2:][0].view(1, -1)
            pred = predict_single_direction(torch.autograd.Variable(bw_hidden),
                                            answers_var)
        elif position == len(question_feats):
            # Blank at the end: only the first half of the LSTM output
            # (named "fw"), taken at the last step.
            out = lstm_states(question_feats)
            fw_hidden = out[0, :question_feats.size(0), :out.size(2) // 2][-1].view(1, -1)
            pred = predict_single_direction(torch.autograd.Variable(fw_hidden),
                                            answers_var)
        else:
            # Blank in the middle: "fw" state of the items before it and
            # "bw" state of the items after it.
            prev = question_feats[:position]
            prev_out = lstm_states(prev)
            fw_hidden = prev_out[0, :prev.size(0), :prev_out.size(2) // 2][-1].view(1, -1)

            post = question_feats[position:]
            post_out = lstm_states(post)
            bw_hidden = post_out[0, :post.size(0), post_out.size(2) // 2:][0].view(1, -1)

            pred = predict_multi_direction(torch.autograd.Variable(fw_hidden),
                                           torch.autograd.Variable(bw_hidden),
                                           answers_var)

        create_img_fitb(
            outfit, pred[0].data[0],
            os.path.join(img_savepath,
                         "%d_score%.2f.jpg" % (i, pred[1].data[0] * 100)))
        scores.append(pred)

    print("\n")
    # A prediction is correct when the chosen answer index is 0.
    acc = np.sum([score[0].data[0] == 0 for score in scores]) / float(len(scores))
    print("\033[0;31m\nModel: %s\033[0m" % model_name)
    print("\033[1;30mFITB accuracy for %d outfits: %.2f%%\033[0m" % (len(fitb), acc * 100))