def extract_feats(model_path, data_path=None, split='dev', fold5=False,
                  save_dir='/media/sounak/4tbdisk/VSRN'):
    """Encode a dataset split with a trained VSRN model and save the features.

    Loads the checkpoint (which must contain 'opt' and 'model' entries),
    rebuilds the model and vocabulary, encodes the split and dumps the
    image/caption embeddings to .npy files.

    Args:
        model_path: path to a checkpoint produced by save_checkpoint.
        data_path: optional override for the dataset root stored in the
            checkpoint options.
        split: dataset split to encode ('dev' or 'test'); 'dev' saves the
            1K files, anything else saves the 5K files.
        fold5: unused here; kept for signature parity with evalrank.
        save_dir: directory the .npy files are written to. Defaults to the
            historical hard-coded location for backward compatibility —
            pass your own path on other machines.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path,
                           '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model and restore trained weights
    model = VSRN(opt)
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    # each image appears 5 times (one per caption), hence the /5
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    # SAVE SCAN FEATS — 'dev' is the 1K protocol, everything else the 5K one
    suffix = '1K' if split == 'dev' else '5K'
    np.save(os.path.join(save_dir, 'img_embs_%s.npy' % suffix), img_embs)
    np.save(os.path.join(save_dir, 'cap_embs_%s.npy' % suffix), cap_embs)
    return
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """Evaluate a trained model on either dev or test.

    If `fold5=True`, 5 fold cross-validation is done (only for MSCOCO).
    Otherwise, the full data is used for evaluation.

    Args:
        model_path: path to a checkpoint containing 'opt' and 'model'.
        data_path: optional override for the dataset root in the options.
        split: dataset split to evaluate.
        fold5: run 5x1K cross-validated evaluation instead of the full set.

    Side effects: prints recall metrics and saves the rank tensors to
    'ranks.pth.tar' in the current working directory.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path,
                           '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model and restore trained weights
    model = VSRN(opt)
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    # each image appears 5 times (one per caption), hence the /5
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure,
                    return_ranks=True)
        ri, rti = t2i(img_embs, cap_embs, measure=opt.measure,
                      return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.2f" % rsum)
        print("Average i2t Recall: %.2f" % ar)
        print("Image to text: %.2f %.2f %.2f %.2f %.2f" % r)
        print("Average t2i Recall: %.2f" % ari)
        print("Text to image: %.2f %.2f %.2f %.2f %.2f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO (5 blocks of 5000 captions)
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         cap_embs[i * 5000:(i + 1) * 5000],
                         measure=opt.measure, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           cap_embs[i * 5000:(i + 1) * 5000],
                           measure=opt.measure, return_ranks=True)
            if i == 0:
                # keep the rank arrays of the first fold for saving below
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            # row layout: r (indices 0-4), ri (5-9), ar (10), ari (11),
            # rsum (12) — the summary prints below depend on this order
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        # BUG FIX: the original printed mean_metrics[10] * 6 (= 6*ar, not
        # rsum) as "rsum", [11] (ari, the t2i average) as the i2t average
        # and [12] (rsum) as the t2i average. Read each statistic from the
        # index it was stored at, matching the single-run branch above.
        print("rsum: %.1f" % mean_metrics[12])
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
def main():
    """Train a VSRN model: parse hyper-parameters, build data loaders and
    model, optionally resume from a checkpoint, then run the
    train/validate loop, saving the checkpoint with the best rsum."""
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=2048, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')

    ### AM Parameters
    parser.add_argument('--text_number', default=15, type=int,
                        help='Number of ocr tokens used (max. 20).')
    parser.add_argument('--text_dim', default=300, type=int,
                        help='Dimension of scene text embedding - default 300')

    ### caption parameters
    parser.add_argument(
        '--dim_vid', type=int, default=2048,
        help='dim of features of video frames')
    parser.add_argument(
        '--dim_hidden', type=int, default=512,
        help='size of the rnn hidden layer')
    parser.add_argument(
        "--bidirectional", type=int, default=0,
        help="0 for disable, 1 for enable. encoder/decoder bidirectional.")
    parser.add_argument(
        '--input_dropout_p', type=float, default=0.2,
        help='strength of dropout in the Language Model RNN')
    parser.add_argument(
        '--rnn_type', type=str, default='gru',
        help='lstm or gru')
    parser.add_argument(
        '--rnn_dropout_p', type=float, default=0.5,
        help='strength of dropout in the Language Model RNN')
    parser.add_argument(
        '--dim_word', type=int, default=300,  # 512
        help='the encoding size of each token in the vocabulary, and the video.'
    )
    parser.add_argument(
        "--max_len", type=int, default=60,
        help='max length of captions(containing <sos>,<eos>)')

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    # BUG FIX: the file handle was opened inline and never closed; use a
    # context manager so it is released deterministically.
    with open(os.path.join(opt.vocab_path,
                           '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSRN(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    # NOTE(review): best_rsum loaded from the checkpoint above is discarded
    # here and training restarts at epoch 0 — matches the historical
    # behavior, but confirm this is intended when resuming.
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        best_rsum = train(opt, train_loader, model, epoch, val_loader,
                          best_rsum)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, prefix=opt.logger_name + '/')
def main():
    """Set up training for the JSON-vocabulary variant: parse
    hyper-parameters, load vocabulary and stopwords, build loaders and
    model, and optionally resume/validate from a checkpoint.

    Note: this variant stops after the (optional) resume + validation step;
    it does not contain a training loop.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.05, type=float,
                        help='loss margin.')
    parser.add_argument('--temperature', default=14, type=int,
                        help='loss temperature.')
    parser.add_argument('--num_epochs', default=9, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=2048, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=4, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--seed', default=1, type=int,
                        help='random seed.')
    parser.add_argument('--use_atten', action='store_true',
                        help='use_atten')
    parser.add_argument('--lambda_softmax', default=9., type=float,
                        help='Attention softmax temperature.')
    parser.add_argument('--use_box', action='store_true',
                        help='use_box')
    parser.add_argument('--use_label', action='store_true',
                        help='use_label')
    parser.add_argument('--use_mmd', action='store_true',
                        help='use_mmd')
    parser.add_argument('--score_path', default='../user_data/score.npy',
                        type=str)

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    # Load Vocabulary Wrapper (JSON serialized, not the pickle wrapper)
    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # Load stopwords, one word per line.
    # BUG FIX: the file handle was opened and never closed; use a context
    # manager and build the list with a comprehension.
    stoppath = os.path.join(opt.vocab_path, 'stopwords.txt')
    with open(stoppath, 'r') as f_stop:
        stopwords = [sw.strip() for sw in f_stop.readlines()]

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                stopwords, opt.batch_size,
                                                opt.workers, opt, True)

    # Construct the model
    model = VSRN(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
def main():
    """Train the rerank-aware variant: parse hyper-parameters, load the
    JSON vocabulary and stopwords, choose the fine-tuning loaders when
    resuming, then run the train/validate loop tracking both the plain and
    the reranked rsum, checkpointing the best of each."""
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.05, type=float,
                        help='loss margin.')
    parser.add_argument('--temperature', default=14, type=int,
                        help='loss temperature.')
    parser.add_argument('--num_epochs', default=7, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=2048, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=4, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--seed', default=1, type=int,
                        help='random seed.')
    parser.add_argument('--use_atten', action='store_true',
                        help='use_atten')
    parser.add_argument('--use_box', action='store_true',
                        help='use_box')
    parser.add_argument('--use_label', action='store_true',
                        help='use_label')
    parser.add_argument('--lambda_softmax', default=9., type=float,
                        help='Attention softmax temperature.')
    parser.add_argument('--use_mmd', action='store_true',
                        help='use_mmd')
    parser.add_argument('--score_path', default='../user_data/score.npy',
                        type=str)

    opt = parser.parse_args()
    print(opt)
    set_seed(opt.seed)

    # ROBUSTNESS FIX: os.path.exists + os.mkdir was race-prone and fails for
    # nested paths such as the default 'runs/runX'; makedirs with
    # exist_ok=True handles both.
    os.makedirs(opt.logger_name, exist_ok=True)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    # Load Vocabulary Wrapper (JSON serialized)
    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # Load stopwords, one word per line.
    # BUG FIX: the file handle was opened and never closed; use a context
    # manager and build the list with a comprehension.
    stoppath = os.path.join(opt.vocab_path, 'stopwords.txt')
    with open(stoppath, 'r') as f_stop:
        stopwords = [sw.strip() for sw in f_stop.readlines()]

    # Load data loaders: resuming switches to the fine-tuning pipeline
    if opt.resume:
        train_loader, val_loader = data_finetune.get_loaders(
            opt.data_name, vocab, stopwords, opt.batch_size, opt.workers, opt)
    else:
        train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                    stopwords, opt.batch_size,
                                                    opt.workers, opt)

    # Construct the model
    model = VSRN(opt)

    # optionally resume from a checkpoint
    start_epoch = 0
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            # NOTE(review): the resume epoch is hard-coded to 4 instead of
            # checkpoint['epoch'] — presumably a fine-tuning schedule;
            # confirm before reusing for general resumption.
            start_epoch = 4
            #start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' epoch {}".format(
                opt.resume, start_epoch))
            #validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    # NOTE(review): best_rsum loaded from the checkpoint above is discarded
    # here — matches the historical behavior; confirm when resuming.
    best_rsum = 0
    best_rerank_rsum = 0
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch (dataset is re-sampled each epoch)
        train_loader.dataset.initial()
        best_rsum, best_rerank_rsum = train(opt, train_loader, model, epoch,
                                            val_loader, best_rsum,
                                            best_rerank_rsum)

        # evaluate on validation set
        rsum, rerank_rsum = validate(opt, val_loader, model)

        # remember best R@ sum (plain and reranked) and save checkpoint
        is_best = rsum > best_rsum
        rerank_is_best = rerank_rsum > best_rerank_rsum
        best_rsum = max(rsum, best_rsum)
        best_rerank_rsum = max(rerank_rsum, best_rerank_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'best_rerank_rsum': best_rerank_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            }, is_best, rerank_is_best, prefix=opt.logger_name + '/')