def eval_model(model, dataset_config, image_size, device):
    # extract query feature
    query = get_test_loader(root=os.path.join(dataset_config.root,
                                              dataset_config.query),
                            batch_size=512,
                            image_size=image_size,
                            num_workers=16)

    query_feat = []
    query_label = []
    query_cam_id = []
    for data, label, cam_id, _ in query:
        feat = model(data.cuda(non_blocking=True))

        query_feat.append(feat.data.cpu().numpy())
        query_label.append(label.data.cpu().numpy())
        query_cam_id.append(cam_id.data.cpu().numpy())

    query_feat = np.concatenate(query_feat, axis=0)
    query_label = np.concatenate(query_label, axis=0)
    query_cam_id = np.concatenate(query_cam_id, axis=0)

    # extract gallery feature
    gallery = get_test_loader(root=os.path.join(dataset_config.root,
                                                dataset_config.gallery),
                              batch_size=512,
                              image_size=image_size,
                              num_workers=16)

    gallery_feat = []
    gallery_label = []
    gallery_cam_id = []
    for data, label, cam_id, _ in gallery:
        feat = model(data.cuda(non_blocking=True))

        gallery_feat.append(feat.data.cpu().numpy())
        gallery_label.append(label.data.cpu().numpy())
        gallery_cam_id.append(cam_id.data.cpu().numpy())

    gallery_feat = np.concatenate(gallery_feat, axis=0)
    gallery_label = np.concatenate(gallery_label, axis=0)
    gallery_cam_id = np.concatenate(gallery_cam_id, axis=0)

    mAP, r1, r5, r10 = eval_feature(query_feat, gallery_feat, query_label,
                                    query_cam_id, gallery_label,
                                    gallery_cam_id, device)
    print(
        'mAP = %f , r1 precision = %f , r5 precision = %f , r10 precision = %f'
        % (mAP, r1, r5, r10))
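# A minimal usage sketch for eval_model (not part of the original snippet;
# the paths, the SimpleNamespace config, and the trained `model` are
# assumptions for illustration -- dataset_config only needs .root, .query
# and .gallery attributes here):
#
#     from types import SimpleNamespace
#     market_config = SimpleNamespace(root='/data/market1501',
#                                     query='query',
#                                     gallery='bounding_box_test')
#     eval_model(model, market_config, image_size=(256, 128), device='cuda:0')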
Example #2
def debug_show_similarity_with_manually_created_examples(
        model_path, data_path=None, split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    img_embs = img_embs[:100]
    cap_embs = cap_embs[:100]
    data_loader_ex_0 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 0)
    encoding_0 = encode_data(model, data_loader_ex_0)[1]
    data_loader_ex_1 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 1)
    encoding_1 = encode_data(model, data_loader_ex_1)[1]
    print('Computing results...')

    # compute similarity
    result = list()
    result_0 = list()
    result_1 = list()

    npts = img_embs.shape[0] // 5
    for index in range(npts):
        # Get query image
        im = img_embs[5 * index].reshape(1, img_embs.shape[1])

        # Compute scores
        if opt.measure == 'order':
            raise Exception('Measure order not supported.')
        else:
            result.append(numpy.dot(im, cap_embs.T).flatten())
            result_0.append(numpy.dot(im, encoding_0.T).flatten())
            result_1.append(numpy.dot(im, encoding_1.T).flatten())
    torch.save({
        'orig': result,
        'Tete': result_0,
        'Haoyue': result_1
    }, 'shy_runs/debug.pt')
Example #3
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path
    # load vocabulary used by the model
    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = Local_Alignment(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.batch_size,
                                  opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0], cap_embs.shape[0]))

    img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
    print('Images: ', img_embs.shape)
    print('Captions: ', cap_embs.shape)

    start = time.time()
    sims = compute_sims(img_embs, cap_embs, cap_lens, opt, shard_size=128)
    print(sims[:20, :4])
    end = time.time()
    print("calculate similarity time:", end - start)

    print('Saving results...')
    sio.savemat('%s_relation.mat' % opt.data_name, {'similarity': sims})
    print('Saving success...')

    r, rt = i2t(img_embs, cap_embs, cap_lens, sims, return_ranks=True)
    ri, rti = t2i(img_embs, cap_embs, cap_lens, sims, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    print("rsum: %.1f" % rsum)
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    print("Average t2i Recall: %.1f" % ari)
    print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
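# Note (added): like most snippets on this page, the evalrank above calls
# torch.load(model_path) with no map_location, so tensors are restored to
# the devices they were saved from. On a CPU-only machine the standard
# PyTorch idiom is:
#
#     checkpoint = torch.load(model_path, map_location='cpu')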
Example #4
def eval_with_single_extended(model_path,
                              data_path=None,
                              data_name=None,
                              split='test',
                              backup_vec_ex=None):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    if backup_vec_ex is None:
        cap_embs_ex = list()
        for i in range(img_embs.shape[0]):
            data_loader_ex = get_text_loader(split, opt.data_name, vocab,
                                             opt.batch_size, opt.workers, opt,
                                             'ex/%d' % i)
            encoding = encode_data(model, data_loader_ex)[1]
            if encoding is not None:
                cap_embs_ex.append(encoding.copy())
            else:
                cap_embs_ex.append(np.zeros(cap_embs[:1].shape))
            print('Caption Embedding: %d' % i)
        # torch.save(cap_embs_ex, 'data/coco_precomp/cap_embs_ex.pth')
    else:
        cap_embs_ex = torch.load(backup_vec_ex)
    print('Computing results...')

    r, rt = i2t_split(img_embs,
                      cap_embs,
                      cap_embs_ex,
                      measure=opt.measure,
                      return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt}, model_path[:model_path.find('model_best')] +
               'ranks_single_extended.pth.tar')
Example #5
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='./data/',
                        help='path to datasets')
    parser.add_argument('--model_path',
                        default='./data/',
                        help='path to model')
    parser.add_argument('--split', default='test', help='val/test')
    parser.add_argument('--gpuid', default='0', type=str, help='gpuid')
    parser.add_argument('--fold5', action='store_true', help='fold5')
    opts = parser.parse_args()

    print("use GPU:", opts.gpuid)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpuid)
    torch.cuda.set_device(0)  # device 0 within the visible set
    # load model and options
    checkpoint = torch.load(opts.model_path)
    opt = checkpoint['opt']
    opt.loss_verbose = False
    opt.split = opts.split
    opt.data_path = opts.data_path
    opt.fold5 = opts.fold5

    # load vocabulary used by the model
    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = SCAN(opt)
    model.cuda()
    model = nn.DataParallel(model)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = data.get_test_loader(opt.split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt)

    print(opt)
    print('Computing results...')

    evaluation.evalrank(model.module,
                        data_loader,
                        opt,
                        split=opt.split,
                        fold5=opt.fold5)
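# Design note (added): evaluation.evalrank receives model.module, i.e. the
# raw SCAN network unwrapped from nn.DataParallel. DataParallel only proxies
# forward(), so any custom methods the evaluation code calls must be reached
# through .module.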
Example #6
def eval_with_manually_extended(model_path, data_path=None, split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    img_embs = img_embs[:100]
    cap_embs = cap_embs[:100]
    cap_embs_ex = list()
    data_loader_ex_0 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 0)
    encoding_0 = encode_data(model, data_loader_ex_0)[1]
    data_loader_ex_1 = get_text_loader(split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt,
                                       'manually_ex_%d' % 1)
    encoding_1 = encode_data(model, data_loader_ex_1)[1]
    for i in range(100):
        cap_emb = np.concatenate(
            (encoding_0[i * 2:i * 2 + 2], encoding_1[i * 2:i * 2 + 2]), axis=0)
        cap_embs_ex.append(cap_emb)
    print('Computing results...')

    r, rt = i2t_split(img_embs,
                      cap_embs,
                      cap_embs_ex,
                      measure=opt.measure,
                      return_ranks=True)
    # r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    torch.save({'rt': rt}, model_path[:model_path.find('model_best')] +
               'ranks_manually_extended_1.pth.tar')
Example #7
def online_test(data_path, model_dir_list,
                logits_save_path,
                test_img_size=128):
    models = []
    device = torch.device("cuda:0")
    for model_dir in model_dir_list:
        if 'se50' in model_dir:
            model = GermanNetSE50().to(device)
        elif 'xcep' in model_dir:
            model = GermanNetXcep().to(device)
        else:
            model = GermanNetIncepRes().to(device)
        model_path = os.path.join(model_dir, 'model_best.pth')
        print(model_path)
        model.load_state_dict(torch.load(model_path))
        model.eval()
        models.append(model)

    dataloader = get_test_loader(data_path, bsize=32, img_size=test_img_size)
    pred_npy = np.zeros((len(dataloader.dataset), 17), np.float32)
    print('online test predicting...')
    utils.create_dir(config_dict['commit_outdir'])
    submit_csv = config_dict['commit_outdir'] + \
                 datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + "_submit.csv"
    fout = open(submit_csv, 'w')
    inx = 0
    for data, label in tqdm.tqdm(dataloader):
        data = data.to(device)
        pred = models[0](data)
        for i in range(1, len(models)):
            pred += models[i](data)
        pred = pred.data.cpu().numpy()
        pred_npy[inx:inx + pred.shape[0], :] = pred
        inx += pred.shape[0]
        pred = np.argmax(pred, 1)

        for i in range(pred.shape[0]):
            one_hot = [0] * 17
            one_hot[pred[i]] = 1
            for j in range(16):
                fout.write(str(one_hot[j]) + ',')
            fout.write(str(one_hot[16]) + '\n')
    fout.close()
    np.save(logits_save_path, pred_npy)
    print('pred logits saved in: ' + logits_save_path)
    print('submit csv saved in: ' + submit_csv)
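# The per-sample one-hot loop above can be vectorized; a sketch assuming the
# same 17 classes (equivalent output, not the original code):
#
#     one_hot_rows = np.eye(17, dtype=int)[pred]      # (batch, 17) 0/1 rows
#     np.savetxt(fout, one_hot_rows, fmt='%d', delimiter=',')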
Example #8
def eval_with_extended(model_path,
                       data_path=None,
                       data_name=None,
                       split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = True
    opt.negative_number = 5
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    r, rt = i2t_text_only(img_embs,
                          cap_embs,
                          measure=opt.measure,
                          return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt}, model_path[:model_path.find('model_best')] +
               'ranks_extended.pth.tar')
Example #9
def evalrank(model_path, data_path=None, split='dev', fold5=False, lang=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # Never use undersample when testing.
    opt.undersample = False
    print(opt)

    # Load vocabulary used by the model
    if opt.data_name != "m30k":
        with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
                  'rb') as f:
            vocab = pickle.load(f)
            opt.vocab_size = len(vocab)
    else:
        vocab = pickle.load(
            open(os.path.join(opt.logger_name, 'vocab.pkl'), 'rb'))

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])
    print('Loading dataset')
    if lang is not None:
        opt.lang = lang
    langs = opt.lang.split('-')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    if len(langs) > 1:
        for loader, loader_lang in zip(data_loader, langs):
            run_eval(model, loader, fold5, opt, loader_lang)
    else:
        run_eval(model, data_loader, fold5, opt, opt.lang)
Example #10
def extract_feats(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSRN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    # SAVE SCAN FEATS
    if split == 'dev':
        np.save('/media/sounak/4tbdisk/VSRN/img_embs_1K.npy', img_embs)
        np.save('/media/sounak/4tbdisk/VSRN/cap_embs_1K.npy', cap_embs)
    else:
        np.save('/media/sounak/4tbdisk/VSRN/img_embs_5K.npy', img_embs)
        np.save('/media/sounak/4tbdisk/VSRN/cap_embs_5K.npy', cap_embs)
    return
Example #11
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    print(opt)

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    scores = np.sum(np.multiply(img_embs, cap_embs), -1)
    print(scores.shape)
    print('scores:', np.mean(scores))
Example #12
def evalrank(model_path, data_path=None, split='dev', fold5=False, region_bbox_file=None, feature_path=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    if region_bbox_file is not None:
        opt.region_bbox_file = region_bbox_file
    if feature_path is not None:
        opt.feature_path = feature_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path,
                           '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)
    print(opt)

    # construct model
    model = VSE(opt)
    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    time_sim_start = time.time()

    if not fold5:
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())

        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())
        sims_T = sims_T.cpu().numpy()

        sims = sims.cpu().numpy()
        np.save('sims_f.npy', sims)
        np.save('sims_f_T.npy', sims_T)

        print('Images: %d, Captions: %d' %
              (img_embs.shape[0] // 5, cap_embs.shape[0]))

        r = simrank(sims)

        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims
    else: # fold5-especially for coco
        print('5k---------------')
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())

        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())

        sims = sims.cpu().numpy()
        sims_T = sims_T.cpu().numpy()

        np.save('sims_full_5k.npy', sims)
        np.save('sims_full_T_5k.npy', sims_T)
        print('Images: %d, Captions: %d' %
              (img_embs.shape[0] // 5, cap_embs.shape[0]))

        r = simrank(sims)

        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims, sims_T
        print('1k---------------')
        r_ = [0, 0, 0, 0, 0, 0, 0]
        for i in range(5):
            print(i)
            img_emb_new = img_embs[i * 5000 : int(i * 5000 + img_embs.size(0)/5):5]
            cap_emb_new = cap_embs[i * 5000 : int(i * 5000 + cap_embs.size(0)/5)]

            sims = torch.mm(img_emb_new, cap_emb_new.t())
            sims_T = torch.mm(cap_emb_new, cap_emb_new.t())
            sims_T = sims_T.cpu().numpy()
            sims = sims.cpu().numpy()
            np.save('sims_full_%d.npy' % i, sims)
            np.save('sims_full_T_%d.npy' % i, sims_T)

            print('Images: %d, Captions: %d' %
                  (img_emb_new.size(0), cap_emb_new.size(0)))

            r = simrank(sims)
            r_ = np.array(r_) + np.array(r)

            del sims
            print('--------------------')
        r_ = tuple(r_/5)
        print('I2T:%.1f %.1f %.1f' % r_[0:3])
        print('T2I:%.1f %.1f %.1f' % r_[3:6])
        print('Rsum:%.1f' % r_[-1])
Example #13
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = BFAN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.batch_size,
                                  opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
        start = time.time()

        sims = shard_xattn(img_embs, cap_embs, cap_lens, opt, shard_size=128)

        end = time.time()
        print("calculate similarity time:", end - start)

        batch_size = img_embs.shape[0]
        r, rt = i2t(batch_size, sims, return_ranks=True)
        ri, rti = t2i(batch_size, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            cap_lens_shard = cap_lens[i * 5000:(i + 1) * 5000]
            start = time.time()
            sims = shard_xattn(img_embs_shard,
                               cap_embs_shard,
                               cap_lens_shard,
                               opt,
                               shard_size=128)

            end = time.time()
            print("calculate similarity time:", end - start)

            batch_size = img_embs_shard.shape[0]
            r, rt0 = i2t(batch_size, sims, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(batch_size, sims, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            if i == 0:
                rt, rti = rt0, rti0
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
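# Added note: the fold5 slicing above relies on the MSCOCO test convention of
# 5 captions per image. img_embs[i * 5000:(i + 1) * 5000:5] picks the 1000
# unique images of fold i, while the caption arrays keep all 5000 rows.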
Example #14
    logger.info(pprint.pformat(customized_cfg))

    # data loader
    train_loader = get_train_loader(root=os.path.join(cfg.root, cfg.train),
                                    batch_size=cfg.batch_size,
                                    image_size=cfg.image_size,
                                    random_crop=cfg.random_crop,
                                    random_erase=cfg.random_erase,
                                    random_mirror=cfg.random_mirror,
                                    num_workers=4)

    query_loader = None
    gallery_loader = None
    if cfg.validate_interval > 0:
        query_loader = get_test_loader(root=os.path.join(cfg.root, cfg.query),
                                       batch_size=512,
                                       image_size=cfg.image_size,
                                       num_workers=4)

        gallery_loader = get_test_loader(root=os.path.join(cfg.root,
                                                           cfg.gallery),
                                         batch_size=512,
                                         image_size=cfg.image_size,
                                         num_workers=4)

    # model
    model = PCBModel(num_class=cfg.num_id,
                     num_parts=cfg.num_parts,
                     bottleneck_dims=cfg.bottleneck_dims,
                     pool_type=cfg.pool_type,
                     share_embed=cfg.share_embed)
Example #15
def main():
    # parse options
    parser = TrainOptions()
    opts = parser.parse()

    # data loader
    print('\n--- load dataset ---')
    vocab = pickle.load(
        open(os.path.join(opts.vocab_path, '%s_vocab.pkl' % opts.data_name),
             'rb'))
    vocab_size = len(vocab)
    opts.vocab_size = vocab_size
    torch.backends.cudnn.enabled = False
    # Load data loaders
    train_loader, val_loader = data.get_loaders(opts.data_name, vocab,
                                                opts.crop_size,
                                                opts.batch_size, opts.workers,
                                                opts)
    test_loader = data.get_test_loader('test', opts.data_name, vocab,
                                       opts.crop_size, opts.batch_size,
                                       opts.workers, opts)
    # model
    print('\n--- load subspace ---')
    subspace = model_2.VSE(opts)
    subspace.setgpu()
    print('\n--- load model ---')
    model = DRIT(opts)
    model.setgpu(opts.gpu)
    if opts.resume is None:  # no checkpoint saved previously
        model.initialize()
        ep0 = -1
        total_it = 0
    else:
        ep0, total_it = model.resume(opts.resume)
    model.set_scheduler(opts, last_ep=ep0)
    ep0 += 1
    print('start the training at epoch %d' % (ep0))

    # saver for display and output
    saver = Saver(opts)

    # train
    print('\n--- train ---')
    max_it = 500000
    score = 0.0
    subspace.train_start()
    for ep in range(ep0, opts.pre_iter):
        print('-----ep:{} --------'.format(ep))
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()

            img, cap = subspace.train_emb(images,
                                          captions,
                                          lengths,
                                          ids,
                                          pre=True)  #[b,1024]

            subspace.pre_optimizer.zero_grad()
            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)

            model.pretrain_ae(img, cap)

            if opts.grad_clip > 0:
                clip_grad_norm(subspace.params, opts.grad_clip)

            subspace.pre_optimizer.step()

    for ep in range(ep0, opts.n_ep):
        subspace.train_start()
        adjust_learning_rate(opts, subspace.optimizer, ep)
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()

            img, cap = subspace.train_emb(images, captions, lengths,
                                          ids)  #[b,1024]

            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)

            subspace.optimizer.zero_grad()

            for p in model.disA.parameters():
                p.requires_grad = True
            for p in model.disB.parameters():
                p.requires_grad = True
            for p in model.disA_attr.parameters():
                p.requires_grad = True
            for p in model.disB_attr.parameters():
                p.requires_grad = True

            for i in range(opts.niters_gan_d):  #5
                model.update_D(img, cap)

            for p in model.disA.parameters():
                p.requires_grad = False
            for p in model.disB.parameters():
                p.requires_grad = False
            for p in model.disA_attr.parameters():
                p.requires_grad = False
            for p in model.disB_attr.parameters():
                p.requires_grad = False

            for i in range(opts.niters_gan_enc):
                model.update_E(img, cap)  # use the new content loss

            subspace.optimizer.step()

            print('total_it: %d (ep %d, it %d), lr %09f' %
                  (total_it, ep, it, model.gen_opt.param_groups[0]['lr']))
            total_it += 1

        # decay learning rate
        if opts.n_ep_decay > -1:
            model.update_lr()

        # save result image
        #saver.write_img(ep, model)
        if (ep + 1) % opts.n_ep == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/final_model.pth' % (filename), ep, total_it)
            torch.save(subspace.state_dict(),
                       '%s/final_subspace.pth' % (filename))
        elif (ep + 1) % 10 == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/%s_model.pth' % (filename, str(ep + 1)), ep,
                       total_it)
            torch.save(subspace.state_dict(),
                       '%s/%s_subspace.pth' % (filename, str(ep + 1)))

        if (ep + 1) % opts.model_save_freq == 0:
            a = None
            b = None
            c = None
            d = None
            subspace.val_start()
            for it, (images, captions, lengths, ids) in enumerate(test_loader):
                if it >= opts.val_iter:
                    break
                images = images.cuda(opts.gpu).detach()
                captions = captions.cuda(opts.gpu).detach()

                img_emb, cap_emb = subspace.forward_emb(images,
                                                        captions,
                                                        lengths,
                                                        volatile=True)

                img = img_emb.view(images.size(0), -1, 32, 32)
                cap = cap_emb.view(images.size(0), -1, 32, 32)
                image1, text1 = model.test_model2(img, cap)
                img2 = image1.view(images.size(0), -1)
                cap2 = text1.view(images.size(0), -1)

                if a is None:
                    a = np.zeros(
                        (opts.val_iter * opts.batch_size, img_emb.size(1)))
                    b = np.zeros(
                        (opts.val_iter * opts.batch_size, cap_emb.size(1)))

                    c = np.zeros(
                        (opts.val_iter * opts.batch_size, img2.size(1)))
                    d = np.zeros(
                        (opts.val_iter * opts.batch_size, cap2.size(1)))

                a[ids] = img_emb.data.cpu().numpy().copy()
                b[ids] = cap_emb.data.cpu().numpy().copy()

                c[ids] = img2.data.cpu().numpy().copy()
                d[ids] = cap2.data.cpu().numpy().copy()

            aa = torch.from_numpy(a)
            bb = torch.from_numpy(b)

            cc = torch.from_numpy(c)
            dd = torch.from_numpy(d)

            (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medr, r1, r5, r10))

            (r1i, r5i, r10i, medri, meanr) = t2i(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medri, r1i, r5i, r10i))

            (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1, r2, r3, r4))

            (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1i, r2i, r3i, r4i))

            curr = r2 + r3 + r4 + r2i + r3i + r4i

            if curr > score:
                score = curr
                print('save model')
                filename = os.path.join(opts.result_dir, opts.name)
                model.save('%s/best_model.pth' % (filename), ep, total_it)
                torch.save(subspace.state_dict(),
                           '%s/subspace.pth' % (filename))

            a = None
            b = None
            c = None
            d = None

            for it, (images, captions, lengths, ids) in enumerate(test_loader):

                images = images.cuda(opts.gpu).detach()
                captions = captions.cuda(opts.gpu).detach()

                img_emb, cap_emb = subspace.forward_emb(images,
                                                        captions,
                                                        lengths,
                                                        volatile=True)

                img = img_emb.view(images.size(0), -1, 32, 32)
                cap = cap_emb.view(images.size(0), -1, 32, 32)
                image1, text1 = model.test_model2(img, cap)
                img2 = image1.view(images.size(0), -1)
                cap2 = text1.view(images.size(0), -1)

                if a is None:
                    a = np.zeros((len(test_loader.dataset), img_emb.size(1)))
                    b = np.zeros((len(test_loader.dataset), cap_emb.size(1)))

                    c = np.zeros((len(test_loader.dataset), img2.size(1)))
                    d = np.zeros((len(test_loader.dataset), cap2.size(1)))

                a[ids] = img_emb.data.cpu().numpy().copy()
                b[ids] = cap_emb.data.cpu().numpy().copy()

                c[ids] = img2.data.cpu().numpy().copy()
                d[ids] = cap2.data.cpu().numpy().copy()

            aa = torch.from_numpy(a)
            bb = torch.from_numpy(b)

            cc = torch.from_numpy(c)
            dd = torch.from_numpy(d)

            (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
            print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medr, r1, r5, r10))

            (r1i, r5i, r10i, medri, meanr) = t2i(aa, bb, measure=opts.measure)
            print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medri, r1i, r5i, r10i))

            (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
            print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1, r2, r3, r4))

            (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
            print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1i, r2i, r3i, r4i))

    return
Example #16
        '--model',
        type=str,
        default='model.pt',
        metavar='M',
        help="the model file to be evaluated. (default: model.pt)")
    parser.add_argument(
        '--outfile',
        type=str,
        default='visualize_stn.png',
        metavar='O',
        help=
        "visualize the STN transformation on some input batch (default: visualize_stn.png)"
    )

    args = parser.parse_args()

    # Load model checkpoint
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(args.model, map_location=device)

    # Neural Network and Loss Function
    model = TrafficSignNet().to(device)
    model.load_state_dict(checkpoint)
    model.eval()
    criterion = nn.CrossEntropyLoss()

    # Data Initialization and Loading
    test_loader = get_test_loader(args.data, device)
    evaluate(model, criterion, test_loader)
    visualize_stn(test_loader, args.outfile)
Example #17
def evalrank(model_path, data_path=None, data_name=None, data_name_vocab=None, split='dev', fold5=False,
            VSE_model=None, data_loader=None, concept_path=None, transfer_test=False, concept_name=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
          .format(opt.resume, start_epoch, best_rsum))

    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:   
        opt.data_name = data_name   

    # Judge whether to use transfer-based testing results
    if transfer_test:
        opt.attribute_path = concept_path
    if concept_name is not None:
        opt.concept_name = concept_name
    if 'coco' in opt.data_name:
        fuse_weight = 0.9
    elif 'f30k' in opt.data_name:
        fuse_weight = 0.85
    else:
        raise ValueError('no fuse_weight defined for data_name %s'
                         % opt.data_name)

    print(opt)
    print("=> loading checkpoint '{}'".format(opt.resume))

    with open(os.path.join(opt.vocab_path,
                           '%s_vocab.pkl' % data_name_vocab), 'rb') as f:   
        vocab = pickle.load(f)

    opt.vocab_size = len(vocab)
    word2idx = vocab.word2idx

    # construct model
    model = VSE_model(word2idx, opt)  # if with channel attention
    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, opt.workers, transfer_test, opt)      
    print('Computing results...')
    img_embs, cap_embs, img_emb_cons, cap_emb_cons, concept_labels = encode_data(model=model, data_loader=data_loader, alpha=fuse_weight)

    '''2). Label completion'''
    ind_cap_complete = label_complete(concept_label=concept_labels, img_embs=img_embs, cap_embs=cap_embs, data_name=opt.data_name)

    img_embs, cap_embs, img_emb_cons, cap_emb_cons, completion_labels = encode_data_KNN_rerank(model=model, data_loader=data_loader,
                                                                                             index_KNN_neighbour=ind_cap_complete, concept_labels=concept_labels,
                                                                                             alpha=fuse_weight)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] // 5, cap_embs.shape[0]), " for testing")

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t_sep_sim(img_embs, cap_embs, img_emb_cons, cap_emb_cons, opt.data_name,   
                            weight_fused=0.95,
                            measure=opt.measure, return_ranks=True)
        ri, rti = t2i_sep_sim(img_embs, cap_embs, img_emb_cons, cap_emb_cons, opt.data_name,   
                            weight_fused=0.95,
                            measure=opt.measure, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3       
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)

    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t_sep_sim(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000],
                                img_emb_cons[i * 5000:(i + 1) * 5000], cap_emb_cons[i * 5000:(i + 1) * 5000],
                                opt.data_name, 
                                weight_fused=0.95, measure=opt.measure, return_ranks=True) 
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i_sep_sim(img_embs[i * 5000:(i + 1) * 5000],  cap_embs[i * 5000:(i + 1) * 5000],
                           img_emb_cons[i * 5000:(i + 1) * 5000], cap_emb_cons[i * 5000:(i + 1) * 5000],
                           opt.data_name, 
                           weight_fused=0.95, measure=opt.measure, return_ranks=True)
            if i == 0:  
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)   

            ar = (r[0] + r[1] + r[2]) / 3   
            ari = (ri[0] + ri[1] + ri[2]) / 3   
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]   
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))        
            results += [list(r) + list(ri) + [ar, ari, rsum]]           

        print("-----------------------------------")    
        print("Mean metrics: ") 
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())  
        print("rsum: %.1f" % (mean_metrics[10] * 6))    
        print("Average i2t Recall: %.1f" % mean_metrics[11])    
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %   
              mean_metrics[:5]) 
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
Example #18
is_cuda = torch.cuda.is_available()
print('is cuda : ', is_cuda)
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# with CUDA_VISIBLE_DEVICES='1', the single visible GPU is indexed as cuda:0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    code_intent_pair = Code_Intent_Pairs()
    path = 'vocab/'
    code_intent_pair.load_dict(path)
    special_symbols = code_intent_pair.get_special_symbols()
    word_size = code_intent_pair.get_word_size()
    code_size = code_intent_pair.get_code_size()

    test_path = 'processed_corpus/test.json'
    test_entries = code_intent_pair.load_entries(test_path)
    testloader = get_test_loader(test_entries)

    model = Seq2Seq(word_size, code_size, hyperP)
    if hyperP['load_pretrain_code_embed']:
        model.decoder.embed[0].load_state_dict(
            torch.load('./pretrain_code_lm/embedding-1556211835.t7'))
        if hyperP['freeze_embed']:
            for param in model.decoder.embed[0].parameters():
                param.requires_grad = False

    model.load('model_100.t7')
    beam_decoder = Decoder(model)
    if is_cuda:
        model.to(device)
        # beam_decoder.to(device)
    model.eval()
Example #19
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/home/dcsaero01/data/datasets/vsepp/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='minicsdv2',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=20, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--dropout_value', default=0, type=float,
                        help='Probability value for dropout after linear layer')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/minicsdv2/',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--text_dim', default=500, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--seq_length', default=10, type=int,
                        help='Max sentence sequence length of the GRU')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='resnet152',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--test_mode', action='store_true', default=False,
                        help='Set this flag to run the script in testing mode')
    parser.add_argument('--skip_model', action='store_true',
                        help='Whether to train with Skipthoughts RNN Model')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    if opt.data_name in ('coco_st_precomp', 'coco_st_ner_precomp',
                         'csd_ner_precomp', 'breakingnews_precomp'):
        vocab = None
        opt.vocab_size = 0
    elif opt.data_name == 'csd_skip_precomp':
        opt.vocab_size = -1
        vocab = None
    else:
        vocab = pickle.load(open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
        opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))


    if opt.test_mode:
        # Test the model
        test_loader = data.get_test_loader('test', opt.data_name, vocab,
                                           opt.crop_size, opt.batch_size,
                                           opt.workers, opt)
        validate(opt, test_loader, model)

    else:

        # Train the Model
        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model)

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint({
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            }, is_best, prefix=opt.logger_name + '/')
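# save_checkpoint is called above but not defined in this snippet; a minimal
# sketch of a compatible implementation (an assumption based on the call site
# and common VSE-style training scripts, not the original):

import shutil

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', prefix=''):
    # always write the latest state, and copy it when it is the best so far
    torch.save(state, prefix + filename)
    if is_best:
        shutil.copyfile(prefix + filename, prefix + 'model_best.pth.tar')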
Example #20
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='./data/',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='precomp',
                        help='{coco,f30k}_precomp')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=64,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--learning_rate',
                        default=.0001,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=10,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=1000,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='./runs/runX/log',
                        help='Path to save Tensorboard log.')
    parser.add_argument('--model_name',
                        default='./runs/runX/checkpoint',
                        help='Path to save the model.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        default=True,
                        action='store_true',
                        help='Use max instead of sum in the rank loss '
                        '(with default=True this flag is effectively always on).')
    parser.add_argument('--img_dim',
                        default=2048,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--final_dims',
                        default=256,
                        type=int,
                        help='dimension of final codes.')
    parser.add_argument('--max_words',
                        default=32,
                        type=int,
                        help='maximum number of words in a sentence.')
    parser.add_argument(
        "--bert_path",
        default='/media/ling/datum/Datasets/word_embeddings/uncased_L-12_H-768_A-12/',
        type=str,
        help="The BERT model path.")
    parser.add_argument("--txt_stru",
                        default='cnn',
                        help="Whether to use pooling or cnn or rnn")
    parser.add_argument(
        "--trans_cfg",
        default='/media/ling/datum/workspace/SCAN/image_bert.json',
        help="config file for image transformer")

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
    opt.logger_name = opt.logger_name + TIMESTAMP
    tb_logger.configure(opt.logger_name, flush_secs=5)

    with open(opt.logger_name + "opt.txt", 'w') as f:
        f.write(str(opt))

    opt.vocab_file = opt.bert_path + 'vocab.txt'
    opt.bert_config_file = opt.bert_path + 'bert_config.json'
    opt.init_checkpoint = opt.bert_path + 'pytorch_model.bin'
    opt.do_lower_case = True

    # Load data loaders
    test_loader = data.get_test_loader('test', opt.data_name, opt.batch_size,
                                       opt.workers, opt)

    # Construct the model
    model = SAEM(opt)

    # optionally resume from a checkpoint
    # NOTE: this hard-coded path overrides the --resume command-line argument
    opt.resume = 'runs/f30k/log/model_best.pth.tar'
    # opt.resume = 'runs/f30k/log/checkpoint_27.pth.tar'
    if os.path.isfile(opt.resume):
        print("=> loading checkpoint '{}'".format(opt.resume))
        checkpoint = torch.load(opt.resume)
        start_epoch = checkpoint['epoch']
        best_rsum = checkpoint['best_rsum']
        model.load_state_dict(checkpoint['model'])
        # Eiters is used to show logs as the continuation of another
        # training
        model.Eiters = checkpoint['Eiters']
        print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
            opt.resume, start_epoch, best_rsum))
        validate(opt, test_loader, model)
    else:
        print("=> no checkpoint found at '{}'".format(opt.resume))
Example #21
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    captions_w = np.load(opt.caption_np + 'caption_np.npy')
    captions_w = torch.from_numpy(captions_w)

    captions_w = captions_w.cuda()

    model = SCAN(opt, captions_w)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.batch_size,
                                  opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:

        img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
        start = time.time()
        # NOTE: only the 'all' branch below produces `label`; initialise it
        # so the i2t/t2i calls further down do not raise a NameError for
        # the 't2i' / 'i2t' modes
        label = None
        if opt.cross_attn == 't2i':
            sims = shard_xattn_t2i(img_embs,
                                   cap_embs,
                                   cap_lens,
                                   opt,
                                   shard_size=128)
        elif opt.cross_attn == 'i2t':
            sims = shard_xattn_i2t(img_embs,
                                   cap_embs,
                                   cap_lens,
                                   opt,
                                   shard_size=128)
        elif opt.cross_attn == 'all':
            sims, label = shard_xattn_all(model,
                                          img_embs,
                                          cap_embs,
                                          cap_lens,
                                          opt,
                                          shard_size=128)
        else:
            raise NotImplementedError
        end = time.time()
        print("calculate similarity time:", end - start)
        np.save('sim_stage1', sims)

        r, rt = i2t(label,
                    img_embs,
                    cap_embs,
                    cap_lens,
                    sims,
                    return_ranks=True)
        ri, rti = t2i(label,
                      img_embs,
                      cap_embs,
                      cap_lens,
                      sims,
                      return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:

        results = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            cap_lens_shard = cap_lens[i * 5000:(i + 1) * 5000]
            start = time.time()
            if opt.cross_attn == 't2i':
                sims = shard_xattn_t2i(img_embs_shard,
                                       cap_embs_shard,
                                       cap_lens_shard,
                                       opt,
                                       shard_size=128)
            elif opt.cross_attn == 'i2t':
                sims = shard_xattn_i2t(img_embs_shard,
                                       cap_embs_shard,
                                       cap_lens_shard,
                                       opt,
                                       shard_size=128)
            else:
                raise NotImplementedError
            end = time.time()
            print("calculate similarity time:", end - start)

            r, rt0 = i2t(img_embs_shard,
                         cap_embs_shard,
                         cap_lens_shard,
                         sims,
                         return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs_shard,
                           cap_embs_shard,
                           cap_lens_shard,
                           sims,
                           return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            if i == 0:
                rt, rti = rt0, rti0
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
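
The shard_xattn_* helpers compute the full image-caption similarity matrix in chunks so the cross-attention never has to hold all pairs in memory at once. As a simplified, self-contained illustration of that sharding pattern (with a plain dot product standing in for the cross-attention score):

import numpy as np


def shard_similarity(img_embs, cap_embs, shard_size=128):
    # fill the (n_img, n_cap) similarity matrix shard by shard to bound
    # peak memory, mirroring shard_xattn_t2i / shard_xattn_i2t
    n_img, n_cap = img_embs.shape[0], cap_embs.shape[0]
    sims = np.zeros((n_img, n_cap), dtype=np.float32)
    for i in range(0, n_img, shard_size):
        for j in range(0, n_cap, shard_size):
            im = img_embs[i:i + shard_size]
            ca = cap_embs[j:j + shard_size]
            sims[i:i + im.shape[0], j:j + ca.shape[0]] = im.dot(ca.T)
    return sims
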
Example #22
def main(opt):
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Construct the model
    model = UDAG(opt)

    # evaluation-only mode: load a checkpoint and validate
    if opt.evaluation:
        val_loader = data.get_test_loader(opt.data_name, opt.batch_size,
                                          opt.workers, opt)

        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            _, sims = validate(opt, val_loader, model)
            np.save(opt.data_name + '_sims', sims)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    else:
        # Train the Model
        # Load data loaders
        train_loader, val_loader = data.get_loaders(opt.data_name,
                                                    opt.batch_size,
                                                    opt.workers, opt)
        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model)

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'best_rsum': best_rsum,
                    'opt': opt,
                    'Eiters': model.Eiters,
                },
                is_best,
                prefix=opt.logger_name + '_' + opt.model_name + '/')
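
adjust_learning_rate is not reproduced on this page. VSE-style training code conventionally uses a step decay every opt.lr_update epochs; a sketch under that assumption:

def adjust_learning_rate(opt, optimizer, epoch):
    # assumed schedule: divide the base learning rate by 10 every
    # opt.lr_update epochs
    lr = opt.learning_rate * (0.1 ** (epoch // opt.lr_update))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
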
Example #23
def evalrank(model, args, split='test'):
    print('Loading dataset')
    # NOTE: `vocab` is not defined in this function and must come from the
    # enclosing module scope; the `split` argument is unused here
    data_loader = get_test_loader(args, vocab)

    print('Computing results... (eval_on_gpu={})'.format(args.eval_on_gpu))
    img_embs, txt_embs = encode_data(model, data_loader, args.eval_on_gpu)
    n_samples = img_embs.shape[0]

    nreps = 5 if args.data_name == 'coco' else 1
    print('Images: %d, Sentences: %d' %
          (img_embs.shape[0] / nreps, txt_embs.shape[0]))

    # 5fold cross-validation, only for MSCOCO
    mean_metrics = None
    if args.data_name == 'coco':
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         txt_embs[i * 5000:(i + 1) * 5000],
                         nreps=nreps,
                         return_ranks=True,
                         order=args.order,
                         use_gpu=args.eval_on_gpu)
            r = (r[0], r[1], r[2], r[3], r[3] / n_samples, r[4],
                 r[4] / n_samples)
            print("Image to text: %.2f, %.2f, %.2f, %.2f (%.2f), %.2f (%.2f)" %
                  r)

            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           txt_embs[i * 5000:(i + 1) * 5000],
                           nreps=nreps,
                           return_ranks=True,
                           order=args.order,
                           use_gpu=args.eval_on_gpu)
            if i == 0:
                rt, rti = rt0, rti0
            ri = (ri[0], ri[1], ri[2], ri[3], ri[3] / n_samples, ri[4],
                  ri[4] / n_samples)
            print("Text to image: %.2f, %.2f, %.2f, %.2f (%.2f), %.2f (%.2f)" %
                  ri)

            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.2f ar: %.2f ari: %.2f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())

        print("-----------------------------------")
        print("Mean metrics from 5-fold evaluation: ")
        print("rsum: %.2f" % (mean_metrics[-1] * 6))
        print("Average i2t Recall: %.2f" % mean_metrics[-3])
        print("Image to text: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" %
              mean_metrics[:7])
        print("Average t2i Recall: %.2f" % mean_metrics[-2])
        print("Text to image: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" %
              mean_metrics[7:14])

    # full evaluation on the whole split (always run, also after the
    # 5-fold loop for coco)
    r, rt = i2t(img_embs,
                txt_embs,
                nreps=nreps,
                return_ranks=True,
                use_gpu=args.eval_on_gpu)
    ri, rti = t2i(img_embs,
                  txt_embs,
                  nreps=nreps,
                  return_ranks=True,
                  use_gpu=args.eval_on_gpu)
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    r = (r[0], r[1], r[2], r[3], r[3] / n_samples, r[4], r[4] / n_samples)
    ri = (ri[0], ri[1], ri[2], ri[3], ri[3] / n_samples, ri[4],
          ri[4] / n_samples)
    print("rsum: %.2f" % rsum)
    print("Average i2t Recall: %.2f" % ar)
    print("Image to text: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" % r)
    print("Average t2i Recall: %.2f" % ari)
    print("Text to image: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" % ri)

    return mean_metrics
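
encode_data, used throughout these examples, is the usual embedding-extraction loop. Below is a compact sketch of what such a helper typically does; the batch layout and the forward_emb method are assumptions, not this repository's exact API:

import numpy as np
import torch


@torch.no_grad()
def encode_data(model, data_loader):
    # embed every batch and stack the results as numpy arrays; the
    # (images, captions, lengths, ids) batch layout is an assumption
    img_embs, txt_embs = [], []
    for images, captions, lengths, ids in data_loader:
        img_emb, txt_emb = model.forward_emb(images, captions, lengths)
        img_embs.append(img_emb.cpu().numpy())
        txt_embs.append(txt_emb.cpu().numpy())
    return np.concatenate(img_embs), np.concatenate(txt_embs)
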
Example #24
def train(cfg):
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        # rank and master address are expected to come from the environment
        # (e.g. set by torch.distributed.launch / torchrun)
        torch.distributed.init_process_group(backend="nccl",
                                             world_size=num_gpus)

    # set logger
    log_dir = os.path.join("logs/", cfg.source_dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="a")

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)

    # writer = SummaryWriter(log_dir, purge_step=0)

    # only rank 0 keeps its logger in distributed mode
    if dist.is_initialized() and dist.get_rank() != 0:
        logger = writer = None
    else:
        logger.info(pprint.pformat(cfg))

    # training data loader
    if not cfg.joint_training:  # single domain
        train_loader = get_train_loader(root=os.path.join(
            cfg.source.root, cfg.source.train),
                                        batch_size=cfg.batch_size,
                                        image_size=cfg.image_size,
                                        random_flip=cfg.random_flip,
                                        random_crop=cfg.random_crop,
                                        random_erase=cfg.random_erase,
                                        color_jitter=cfg.color_jitter,
                                        padding=cfg.padding,
                                        num_workers=4)
    else:  # cross domain
        source_root = os.path.join(cfg.source.root, cfg.source.train)
        target_root = os.path.join(cfg.target.root, cfg.target.train)

        train_loader = get_cross_domain_train_loader(
            source_root=source_root,
            target_root=target_root,
            batch_size=cfg.batch_size,
            random_flip=cfg.random_flip,
            random_crop=cfg.random_crop,
            random_erase=cfg.random_erase,
            color_jitter=cfg.color_jitter,
            padding=cfg.padding,
            image_size=cfg.image_size,
            num_workers=8)

    # evaluation data loader
    query_loader = None
    gallery_loader = None
    if cfg.eval_interval > 0:
        query_loader = get_test_loader(root=os.path.join(
            cfg.target.root, cfg.target.query),
                                       batch_size=512,
                                       image_size=cfg.image_size,
                                       num_workers=4)

        gallery_loader = get_test_loader(root=os.path.join(
            cfg.target.root, cfg.target.gallery),
                                         batch_size=512,
                                         image_size=cfg.image_size,
                                         num_workers=4)

    # model
    num_classes = cfg.source.num_id
    num_cam = cfg.source.num_cam + cfg.target.num_cam
    if cfg.joint_training:
        cam_ids = train_loader.dataset.target_dataset.cam_ids
        num_instances = len(train_loader.dataset.target_dataset)
    else:
        cam_ids = train_loader.dataset.cam_ids
        num_instances = None

    model = Model(num_classes=num_classes,
                  drop_last_stride=cfg.drop_last_stride,
                  joint_training=cfg.joint_training,
                  num_instances=num_instances,
                  cam_ids=cam_ids,
                  num_cam=num_cam,
                  neighbor_mode=cfg.neighbor_mode,
                  neighbor_eps=cfg.neighbor_eps,
                  scale=cfg.scale,
                  mix=cfg.mix,
                  alpha=cfg.alpha)

    model.cuda()

    # optimizer
    ft_params = model.backbone.parameters()
    new_params = [
        param for name, param in model.named_parameters()
        if not name.startswith("backbone.")
    ]
    param_groups = [{
        'params': ft_params,
        'lr': cfg.ft_lr
    }, {
        'params': new_params,
        'lr': cfg.new_params_lr
    }]

    optimizer = optim.SGD(param_groups, momentum=0.9, weight_decay=cfg.wd)

    # convert model for mixed precision distributed training

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=cfg.fp16,
                                      opt_level="O2")
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)

    if dist.is_initialized():
        model = parallel.DistributedDataParallel(model, delay_allreduce=True)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.source_dataset,
                                  cfg.prefix)
    engine = get_trainer(
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        logger=logger,
        # writer=writer,
        non_blocking=True,
        log_period=cfg.log_period,
        save_interval=10,
        save_dir=checkpoint_dir,
        prefix=cfg.prefix,
        eval_interval=cfg.eval_interval,
        query_loader=query_loader,
        gallery_loader=gallery_loader)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)

    if dist.is_initialized():
        dist.destroy_process_group()
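
amp.initialize and parallel.DistributedDataParallel above come from NVIDIA apex, which has since been deprecated in favour of the built-in torch.cuda.amp API. For reference, a rough equivalent of one mixed-precision training step with the built-in API (not the code this example uses):

import torch

scaler = torch.cuda.amp.GradScaler()


def training_step(model, optimizer, criterion, data, target):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = criterion(model(data), target)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)         # unscale gradients, then step
    scaler.update()
    return loss.item()
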
Example #25
def extract_query_features(model_path, dataset, dataset_config, image_size, args):
    # NOTE: the original snippet starts mid-function; this header is a
    # reconstruction from the names used below, not the original signature
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    model = Baseline(eval=True, drop_last_stride=True, dual_path=False)
    # model = MixNet(eval=True, drop_last_stride=True)

    state_dict = torch.load(model_path)

    model.load_state_dict(state_dict, strict=False)
    model.float()
    model.eval()
    model.cuda()

    # extract test feature
    gallery_loader, query_loader = get_test_loader(
        dataset=dataset,
        root=dataset_config.data_root,
        batch_size=512,
        image_size=image_size,
        num_workers=16)
    # extract query features
    feats = []
    labels = []
    cam_ids = []
    img_paths = []
    for data, label, cam_id, img_path, _ in query_loader:
        with torch.no_grad():
            feat = model(data.cuda(non_blocking=True), cam_ids=cam_id)

        feats.append(feat.data.cpu().numpy())
        labels.append(label.data.cpu().numpy())
        cam_ids.append(cam_id.data.cpu().numpy())
        img_paths.extend(img_path)
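
The snippet breaks off after collecting the query features; the gallery side is gathered with the same loop, after which retrieval reduces to a distance matrix between L2-normalised features. A minimal sketch of that ranking step (names are illustrative):

import numpy as np


def rank_gallery(query_feats, gallery_feats):
    # cosine distance between L2-normalised features; smaller is closer
    q = query_feats / np.linalg.norm(query_feats, axis=1, keepdims=True)
    g = gallery_feats / np.linalg.norm(gallery_feats, axis=1, keepdims=True)
    dist = 1.0 - q.dot(g.T)
    # gallery indices sorted from best to worst match, per query
    return np.argsort(dist, axis=1)
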
Example #26
def evalstack(model_path, data_path=None, split='dev', fold5=False, is_sparse=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.is_sparse = is_sparse
    print(opt)
    if data_path is not None:
        opt.data_path = data_path
        # NOTE: hard-coded vocabulary location
        opt.vocab_path = "/media/ubuntu/data/chunxiao/vocab"

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(
        opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = GSMN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, bbox, depends, cap_lens = encode_data(
        model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
        start = time.time()
        sims = shard_xattn(model, img_embs, cap_embs, bbox,
                           depends, cap_lens, opt, shard_size=80)
        end = time.time()
        print("calculate similarity time:", end - start)

        return sims

    else:
        # 5fold cross-validation, only for MSCOCO
        sims_a = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            cap_lens_shard = cap_lens[i * 5000:(i + 1) * 5000]
            bbox_shard = bbox[i * 5000:(i + 1) * 5000:5]
            depend_shard = depends[i * 5000:(i + 1) * 5000]
            start = time.time()
            sims = shard_xattn(model, img_embs_shard, cap_embs_shard,
                               bbox_shard, depend_shard, cap_lens_shard, opt, shard_size=80)
            end = time.time()
            print("calculate similarity time:", end - start)

            sims_a.append(sims)

        return sims_a
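
evalstack only returns the similarity matrices, so the caller is expected to fuse them (for example with matrices from another model or stage) before ranking. One plausible, purely illustrative way to blend two such matrices:

import numpy as np


def fuse_sims(sims_a, sims_b, weight=0.5):
    # z-score each matrix so the two score scales are comparable, then blend
    za = (sims_a - sims_a.mean()) / sims_a.std()
    zb = (sims_b - sims_b.mean()) / sims_b.std()
    return weight * za + (1.0 - weight) * zb
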
Example #27
import model_2
import pickle
"""
f = open('./test_recall.log', 'a')
sys.stdout = f
sys.stderr = f
"""
parser = TestOptions()
opts = parser.parse()

vocab = pickle.load(
    open(os.path.join(opts.vocab_path, '%s_vocab.pkl' % opts.data_name), 'rb'))
opts.vocab_size = len(vocab)

test_loader = data.get_test_loader('test', opts.data_name, vocab,
                                   opts.crop_size, opts.batch_size,
                                   opts.workers, opts)

subspace = model_2.VSE(opts)
subspace.setgpu()
subspace.load_state_dict(torch.load(opts.resume2))
subspace.val_start()

# model
print('\n--- load model ---')
model = DRIT(opts)
model.setgpu(opts.gpu)
model.resume(opts.resume, train=False)
model.eval()

a = None
Example #28
def train(cfg):
    # set logger
    log_dir = os.path.join("logs/", cfg.dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="a")

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)

    logger.info(pprint.pformat(cfg))

    # training data loader
    train_loader = get_train_loader(dataset=cfg.dataset,
                                    root=cfg.data_root,
                                    sample_method=cfg.sample_method,
                                    batch_size=cfg.batch_size,
                                    p_size=cfg.p_size,
                                    k_size=cfg.k_size,
                                    random_flip=cfg.random_flip,
                                    random_crop=cfg.random_crop,
                                    random_erase=cfg.random_erase,
                                    color_jitter=cfg.color_jitter,
                                    padding=cfg.padding,
                                    image_size=cfg.image_size,
                                    num_workers=8)

    # evaluation data loader
    gallery_loader, query_loader = None, None
    if cfg.eval_interval > 0:
        gallery_loader, query_loader = get_test_loader(
            dataset=cfg.dataset,
            root=cfg.data_root,
            batch_size=512,
            image_size=cfg.image_size,
            num_workers=4)

    # model
    model = Baseline(num_classes=cfg.num_id,
                     dual_path=cfg.dual_path,
                     drop_last_stride=cfg.drop_last_stride,
                     triplet=cfg.triplet,
                     classification=cfg.classification)

    model.cuda()

    # optimizer
    assert cfg.optimizer in ['adam', 'sgd']
    if cfg.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=cfg.lr,
                               weight_decay=cfg.wd)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=cfg.lr,
                              momentum=0.9,
                              weight_decay=cfg.wd)

    # convert model for mixed precision training
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=cfg.fp16,
                                      opt_level="O2")
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.dataset, cfg.prefix)
    engine = get_trainer(model=model,
                         optimizer=optimizer,
                         lr_scheduler=lr_scheduler,
                         logger=logger,
                         non_blocking=True,
                         log_period=cfg.log_period,
                         save_dir=checkpoint_dir,
                         prefix=cfg.prefix,
                         eval_interval=cfg.eval_interval,
                         gallery_loader=gallery_loader,
                         query_loader=query_loader,
                         dataset=cfg.dataset)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)
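
The p_size and k_size options above suggest identity-balanced PK sampling (P identities times K images each per batch), a common re-identification convention. A self-contained sketch of such a sampler, as an illustration rather than this repository's implementation:

import random
from collections import defaultdict


def pk_batches(labels, p_size, k_size):
    # group sample indices by identity, then draw P identities and K
    # samples per identity for each batch (sampling with replacement
    # when an identity has fewer than K images)
    by_id = defaultdict(list)
    for idx, pid in enumerate(labels):
        by_id[pid].append(idx)
    ids = list(by_id)
    while True:
        batch = []
        for pid in random.sample(ids, p_size):
            pool = by_id[pid]
            if len(pool) >= k_size:
                batch.extend(random.sample(pool, k_size))
            else:
                batch.extend(random.choices(pool, k=k_size))
        yield batch
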
Example #29

def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    opt.vocab_path = 'vocab'
    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = XRN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ri, rti = t2i(img_embs,
                      cap_embs,
                      measure=opt.measure,
                      return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         cap_embs[i * 5000:(i + 1) * 5000],
                         measure=opt.measure,
                         return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           cap_embs[i * 5000:(i + 1) * 5000],
                           measure=opt.measure,
                           return_ranks=True)
            if i == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
Example #30
def evalrank(model_path, data_path=None, split='dev', fold5=False, return_ranks=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    # load vocabulary used by the model

    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)
    opt.distributed = False
    opt.use_all = True
    opt.instance_loss = False
    opt.attention = False

    print(opt)
    # construct model
    model = VSE(opt)

    if "cnn.classifier.1.weight" in checkpoint['model'][0]:
        checkpoint['model'][0]["cnn.classifier.0.weight"] = checkpoint['model'][0].pop("cnn.classifier.1.weight")
        checkpoint['model'][0]["cnn.classifier.0.bias"] = checkpoint['model'][0].pop("cnn.classifier.1.bias")
        checkpoint['model'][0]["cnn.classifier.3.weight"] = checkpoint['model'][0].pop("cnn.classifier.4.weight")
        checkpoint['model'][0]["cnn.classifier.3.bias"] = checkpoint['model'][0].pop("cnn.classifier.4.bias")

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ri, rti = t2i(img_embs, cap_embs,
                      measure=opt.measure, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         cap_embs[i * 5000:(i + 1) * 5000],
                         measure=opt.measure,
                         return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           cap_embs[i * 5000:(i + 1) * 5000],
                           measure=opt.measure,
                           return_ranks=True)
            if i == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])
    if return_ranks:
        return rt, rti
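
For reference, the i2t numbers printed by these evalrank variants come from ranking, for each image, its five ground-truth captions in the similarity-sorted caption list, and reporting Recall@{1,5,10} plus median and mean rank. A compact sketch, assuming a deduplicated (n_images, 5 * n_images) similarity matrix with captions grouped in fives:

import numpy as np


def i2t_recall(sims):
    # sims[i, j]: similarity of image i to caption j; captions 5i..5i+4
    # are the ground truth for image i
    n_images = sims.shape[0]
    ranks = np.zeros(n_images)
    for i in range(n_images):
        order = np.argsort(sims[i])[::-1]  # best caption first
        ranks[i] = min(np.where(order == c)[0][0]
                       for c in range(5 * i, 5 * i + 5))
    r1 = 100.0 * np.mean(ranks < 1)
    r5 = 100.0 * np.mean(ranks < 5)
    r10 = 100.0 * np.mean(ranks < 10)
    return r1, r5, r10, np.median(ranks) + 1, ranks.mean() + 1
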