Beispiel #1
0
def build_eig_estimator(args,
                        vocab,
                        optim_function=True,
                        method_override=None,
                        model_batch_size=None):
    """
    Build the EIG estimator together with its optimizer and loss.

    Parameters
    ----------
    args : argparse.Namespace
        Experiment arguments. Reads ``dataset``, ``eig_text_encoder``,
        ``acquisition``, ``ensemble`` and ``cuda``; the ``"eig_xy"`` method
        additionally reads ``backbone`` and ``eig_img_encoder``.
    vocab : object
        Vocabulary handed to the text encoder. When ``args.dataset`` is
        ``"wmt14"`` the ``"de"`` entry of ``vocab`` is used instead.
    optim_function : bool
        When true, the second return value is a function mapping parameters
        to an Adam optimizer; otherwise it is an Adam optimizer already built
        over the estimator's parameters.
    method_override : str or None
        Acquisition method to use instead of ``args.acquisition``.
    model_batch_size : object
        Forwarded to ``BatchedEIGEstimator`` (``"eig_y"`` method only).

    Returns
    -------
    None or tuple
        ``None`` when the acquisition method is neither ``"eig_y"`` nor
        ``"eig_xy"``; otherwise ``(eig_model, eig_optimizer, eig_loss)``.

    Raises
    ------
    NotImplementedError
        If ``args.eig_text_encoder`` is neither ``"rnn"`` nor ``"bow"``.
    """
    if args.dataset == "wmt14":
        vocab = vocab["de"]

    # Reject unknown encoder names first, then pick between the two known ones.
    if args.eig_text_encoder not in ("rnn", "bow"):
        raise NotImplementedError(f"text encoder = {args.eig_text_encoder}")
    if args.eig_text_encoder == "rnn":
        text_encoder = models.RNNTextEncoder(vocab)
    else:
        text_encoder = models.BOWTextEncoder(vocab)

    acquisition = args.acquisition if method_override is None else method_override
    if acquisition not in ("eig_y", "eig_xy"):
        return None

    if acquisition == "eig_y":
        # Estimator that depends on the captions only.
        eig_model = eig.estimators.BatchedEIGEstimator(
            args.ensemble, text_encoder, model_batch_size)
    else:
        # Estimator that depends on both the image and the captions.
        # NOTE(review): the input dim is keyed on args.backbone while the
        # encoder itself uses args.eig_img_encoder -- confirm this is intended.
        coco_mlp = args.backbone == "mlp" and args.dataset == "coco"
        img_encoder = models.ImageEncoder(
            backbone=args.eig_img_encoder,
            pretrained=False,
            grayscale="mnist" in args.dataset,
            backbone_input_dim=512 if coco_mlp else None,
        )
        eig_model = eig.estimators.EIGEstimatorWithImage(
            args.ensemble, text_encoder, img_encoder)

    def make_optimizer(params):
        return optim.Adam(params)

    if optim_function:
        eig_optimizer = make_optimizer
    else:
        # Caller asked for a ready-made optimizer instead of the factory.
        eig_optimizer = make_optimizer(eig_model.parameters())

    # Per-element losses are kept because EIG evaluation may weight them.
    eig_loss = nn.CrossEntropyLoss(reduction="none")

    if args.cuda:
        eig_model, eig_loss = eig_model.cuda(), eig_loss.cuda()

    return eig_model, eig_optimizer, eig_loss
Beispiel #2
0
    def __init__(self):
        """Create the datasets, mask set, networks, losses and optimizers.

        The mask encoder is always restored from ``MASK_ENCODER_DIR`` and
        frozen. The image encoder, decoder and discriminator are restored
        from ``OUTPUT_DIR`` when a decoder checkpoint is found there.
        """
        self.images = storage.ImageSet("../datasets/celeba_160/",
                                       is_uint8=False,
                                       require_cwh=True)
        self.val_images = storage.ImageSet("../datasets/celeba_160_val/",
                                           is_uint8=False,
                                           require_cwh=True)
        self.masks = mask_generator.MaskSet(320,
                                            mask_size=(160, 128),
                                            num_holes=2,
                                            size_holes=24,
                                            border=32)

        self.mask_encoder = models.MaskEncoder().to(DEVICE)
        self.image_encoder = models.ImageEncoder().to(DEVICE)
        self.image_decoder = models.ImageDecoder().to(DEVICE)
        self.image_discriminator = models.ImageDiscriminator().to(DEVICE)

        # map_location lets checkpoints saved on a GPU load when DEVICE is
        # the CPU (and vice versa).
        self.mask_encoder.load_state_dict(
            torch.load(MASK_ENCODER_DIR + "mask_encoder.params",
                       map_location=DEVICE))
        # Only the decoder file is checked; the encoder and discriminator
        # files are expected to sit next to it.
        if os.path.exists(OUTPUT_DIR + "image_decoder.params"):
            self.image_encoder.load_state_dict(
                torch.load(OUTPUT_DIR + "image_encoder.params",
                           map_location=DEVICE))
            self.image_decoder.load_state_dict(
                torch.load(OUTPUT_DIR + "image_decoder.params",
                           map_location=DEVICE))
            self.image_discriminator.load_state_dict(
                torch.load(OUTPUT_DIR + "image_discriminator.params",
                           map_location=DEVICE))

        # The mask encoder is frozen: none of its parameters receive grads.
        for param in self.mask_encoder.parameters():
            param.requires_grad = False

        # Generator = image encoder followed by image decoder.
        gen_params = (list(self.image_encoder.parameters()) +
                      list(self.image_decoder.parameters()))
        dis_params = list(self.image_discriminator.parameters())

        # Use DEVICE like every other module here instead of a hard-coded
        # .cuda() call.
        self.mse_loss = nn.MSELoss().to(DEVICE)
        self.bce_loss = nn.BCELoss().to(DEVICE)
        # Both generator optimizers update the same parameters but with
        # different learning rates.
        self.gen_mse_opti = torch.optim.Adam(gen_params, lr=1e-2)
        self.gen_bce_opti = torch.optim.Adam(gen_params, lr=2e-3)
        self.dis_opti = torch.optim.Adam(dis_params, lr=2e-3)

        # Constant (BATCH_SIZE, 1) tensors of ones and zeros.
        # NOTE(review): presumably the real/fake targets for bce_loss --
        # confirm against the training step.
        self.ones = torch.ones([BATCH_SIZE, 1]).to(DEVICE)
        self.zeros = torch.zeros([BATCH_SIZE, 1]).to(DEVICE)
def main(args):
    """Train the image and sentence encoders with a pairwise ranking loss.

    Run settings come from ``args`` (``gpu``, ``config``, ``vocab``,
    ``img2vec``, ``train_json``, ``name``, ``save``, ``print_every``,
    ``save_every``). Model and hyper parameters come from the
    ``modelparams`` and ``hyperparams`` sections of the config file.

    Every ``args.save_every`` epochs the state dicts of both encoders are
    written to ``<args.save>/<args.name>/``. The sentence encoder is saved
    without its ``embed.weight`` entry.
    """
    gpu = args.gpu
    config_path = args.config
    vocab_path = args.vocab
    img2vec_path = args.img2vec
    train_json_path = args.train_json
    name = args.name
    save_path = args.save

    print("[args] gpu=%d" % gpu)
    print("[args] config_path=%s" % config_path)
    print("[args] word2vec_path=%s" % vocab_path)
    print("[args] img2vec_path=%s" % img2vec_path)
    print("[args] train_json_path=%s" % train_json_path)
    print("[args] name=%s" % name)
    print("[args] save_path=%s" % save_path)
    print()

    device = torch.device("cuda:" +
                          str(gpu) if torch.cuda.is_available() else "cpu")

    # `config` is a module-level object defined outside this function.
    config.read(config_path)

    # Model parameters
    modelparams = config["modelparams"]
    sentence_encoder_name = modelparams.get("sentence_encoder")
    n_layers = modelparams.getint("n_layers")
    n_head = modelparams.getint("n_head")
    d_k = modelparams.getint("d_k")
    d_v = modelparams.getint("d_v")
    d_inner = modelparams.getint("d_inner")
    d_img = modelparams.getint("d_img")
    d_model = modelparams.getint("d_model")

    print("[modelparames] sentence_encoder_name=%s" % sentence_encoder_name)
    # These entries are only printed when set to a truthy value.
    if n_layers:
        print("[modelparames] n_layers=%d" % n_layers)
    if n_head:
        print("[modelparames] n_head=%d" % n_head)
    if d_k:
        print("[modelparames] d_k=%d" % d_k)
    if d_v:
        print("[modelparames] d_v=%d" % d_v)
    if d_inner:
        print("[modelparames] d_inner=%d" % d_inner)
    print("[modelparames] d_img=%d" % d_img)
    print("[modelparames] d_model=%d" % d_model)
    print()

    # Hyper parameters
    hyperparams = config["hyperparams"]
    margin = hyperparams.getfloat("margin")
    weight_decay = hyperparams.getfloat("weight_decay")
    grad_clip = hyperparams.getfloat("grad_clip")
    lr = hyperparams.getfloat("lr")
    batch_size = hyperparams.getint("batch_size")
    n_epochs = hyperparams.getint("n_epochs")
    n_negatives = hyperparams.getint("n_negatives")

    print("[hyperparames] margin=%f" % margin)
    print("[hyperparames] weight_decay=%f" % weight_decay)
    print("[hyperparames] grad_clip=%f" % grad_clip)
    print("[hyperparames] lr=%f" % lr)
    print("[hyperparames] batch_size=%d" % batch_size)
    print("[hyperparames] n_epochs=%d" % n_epochs)
    print("[hyperparames] n_negatives=%d" % n_negatives)
    print()

    # Data preparation
    print("[info] Loading vocabulary ...")
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only point --vocab at files you trust.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    dataloader_train = datasets.coco.get_loader(img2vec_path, train_json_path,
                                                vocab, batch_size)

    # Model preparation
    img_encoder = models.ImageEncoder(d_img, d_model).to(device)
    sen_encoder = models.SentenceEncoder(vocab, sentence_encoder_name, d_model,
                                         n_layers, n_head, d_k, d_v,
                                         d_inner).to(device)

    img_optimizer = optim.Adam(img_encoder.parameters(),
                               lr=lr,
                               weight_decay=weight_decay)
    sen_optimizer = optim.Adam(sen_encoder.parameters(),
                               lr=lr,
                               weight_decay=weight_decay)

    criterion = PairwiseRankingLoss(margin=margin)

    # Checkpoint directory; created lazily the first time we save.
    save_dir = os.path.join(save_path, name)

    # Train
    print("[info] Training ...")
    for epoch in range(n_epochs):
        pbar = tqdm(dataloader_train)
        # The description is constant within an epoch, so set it once rather
        # than forcing a progress-bar refresh on every batch.
        pbar.set_description('epoch %3d / %d' % (epoch + 1, n_epochs))
        running_loss = 0.0

        for i, (images, src_seq, src_pos, _, _) in enumerate(pbar):
            images = images.to(device)
            src_seq = src_seq.to(device)
            src_pos = src_pos.to(device)
            img_embedded = img_encoder(images)
            sen_embedded = sen_encoder(src_seq, src_pos)

            img_optimizer.zero_grad()
            sen_optimizer.zero_grad()

            # Average the ranking loss over n_negatives random permutations
            # of the batch's image embeddings, which serve as negatives.
            loss = 0.0
            for _ in range(n_negatives):
                perm = torch.randperm(len(img_embedded))
                img_shuffled = img_embedded[perm]
                loss += criterion(sen_embedded, img_embedded, img_shuffled)
            loss /= n_negatives
            loss.backward()

            nn.utils.clip_grad_value_(img_encoder.parameters(), grad_clip)
            nn.utils.clip_grad_value_(sen_encoder.parameters(), grad_clip)
            img_optimizer.step()
            sen_optimizer.step()

            # Show the mean loss of the last print_every batches.
            running_loss += loss.item()
            if (i + 1) % args.print_every == 0:
                pbar.set_postfix(loss=running_loss / args.print_every)
                running_loss = 0

        if (epoch + 1) % args.save_every == 0:
            # makedirs also creates a missing save_path and tolerates an
            # existing directory, unlike the isdir()/mkdir() pair.
            os.makedirs(save_dir, exist_ok=True)

            sen_dict = sen_encoder.state_dict()
            # The embedding matrix is left out of the checkpoint.
            sen_dict.pop('embed.weight')
            img_dict = img_encoder.state_dict()
            torch.save(
                sen_dict,
                os.path.join(save_dir,
                             'sentence_encoder-{}.pth'.format(epoch + 1)))
            torch.save(
                img_dict,
                os.path.join(save_dir,
                             'image_encoder-{}.pth'.format(epoch + 1)))
Beispiel #4
0
def build_model(args, vocab):
    """
    Assemble the captioning model, an optimizer factory, and the loss.

    Parameters
    ----------
    args : argparse.Namespace
        Experiment arguments
    vocab : dict
        Vocabulary handed to the decoder

    Returns
    -------
    model : torch.nn.Module
        Captioner wrapping the image encoder and the decoder
    optimizer_func : () -> torch.optim.Optimizer
        Zero-argument factory building an Adam optimizer over the collected
        parameter groups (a factory so the optimizer can be re-created)
    loss : torch.nn.Module
        Cross-entropy loss
    """
    # Image encoder; only the MLP backbone on COCO gets an explicit input dim.
    coco_mlp = args.backbone == "mlp" and args.dataset == "coco"
    encoder = models.ImageEncoder(
        backbone=args.backbone,
        pretrained=args.pretrained_backbone,
        grayscale="mnist" in args.dataset,
        backbone_input_dim=512 if coco_mlp else None,
    )
    encoder.fine_tune(not args.freeze_encoder)

    # Parameter groups, each with its own learning rate.
    param_groups = []
    if not args.freeze_encoder:
        trainable_encoder = [
            p for p in encoder.parameters() if p.requires_grad
        ]
        param_groups.append({
            "params": trainable_encoder,
            "lr": args.encoder_lr,
        })

    # NOTE(review): the attention decoder receives the vocabulary as
    # ``vocab_size`` while the plain decoder receives it as ``vocab`` --
    # confirm both constructors expect the same object.
    if args.attention:
        decoder = models.DecoderWithAttention(
            embed_dim=args.emb_dim,
            decoder_dim=args.decoder_dim,
            vocab_size=vocab,
            encoder_dim=encoder.output_dim,
            dropout=args.dropout,
            attention_dim=args.attention_dim,
        )
    else:
        decoder = models.Decoder(
            embed_dim=args.emb_dim,
            decoder_dim=args.decoder_dim,
            vocab=vocab,
            encoder_dim=encoder.output_dim,
            dropout=args.dropout,
        )

    trainable_decoder = [p for p in decoder.parameters() if p.requires_grad]
    param_groups.append({
        "params": trainable_decoder,
        "lr": args.decoder_lr,
    })

    criterion = nn.CrossEntropyLoss()
    captioner = models.Captioner(encoder,
                                 decoder,
                                 encoder_dropout=args.dropout)

    def make_optimizer():
        return optim.Adam(param_groups)

    if args.cuda:
        captioner, criterion = captioner.cuda(), criterion.cuda()

    return captioner, make_optimizer, criterion
Beispiel #5
0
import models
import storage
import mask_generator
import numpy as np
import matplotlib.pyplot as plt
import torch
import utilities
import os

# Directory name for evaluation artefacts (not used in the code visible here).
EVALUATE_DIR = "../evaluate_min/"
# Directory holding the trained encoder/decoder checkpoints.
IMAGE_ENCODER_DIR = "../outputs/train_image_encoder_min/"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    image_encoder = models.ImageEncoder().to(DEVICE)
    image_decoder = models.ImageDecoderMin().to(DEVICE)

    # map_location keeps the CPU fallback working for checkpoints that were
    # saved from a GPU.
    image_encoder.load_state_dict(
        torch.load(IMAGE_ENCODER_DIR + "image_encoder.params",
                   map_location=DEVICE))
    image_decoder.load_state_dict(
        torch.load(IMAGE_ENCODER_DIR + "image_decoder.params",
                   map_location=DEVICE))

    test_set = storage.ImageSet("../datasets/celeba_160_test_min/",
                                is_uint8=False,
                                require_cwh=True)

    # Mask generated with the settings below; expand_dim is disabled.
    mask = mask_generator.generate_mask(mask_size=(160, 128),
                                        num_holes=2,
                                        size_holes=24,
                                        border=32,
                                        expand_dim=False)
Beispiel #6
0
def main(args):
    """Evaluate trained image and sentence encoders on the validation set.

    Run settings come from ``args`` (``gpu``, ``config``, ``vocab``,
    ``img2vec``, ``val_json``, ``sentence_encoder``, ``image_encoder``,
    ``name``). Model parameters and the batch size come from the config
    file. Prints recall at 5, 10 and 20 for both retrieval directions as
    returned by ``evaluate``.
    """
    gpu = args.gpu
    config_path = args.config
    vocab_path = args.vocab
    img2vec_path = args.img2vec
    val_json_path = args.val_json
    sentence_encoder_path = args.sentence_encoder
    image_encoder_path = args.image_encoder
    name = args.name

    print("[args] gpu=%d" % gpu)
    print("[args] config_path=%s" % config_path)
    print("[args] word2vec_path=%s" % vocab_path)
    print("[args] img2vec_path=%s" % img2vec_path)
    print("[args] val_json_path=%s" % val_json_path)
    print("[args] sentence_encoder_path=%s" % sentence_encoder_path)
    print("[args] image_encoder_path=%s" % image_encoder_path)
    print("[args] name=%s" % name)
    print()

    device = torch.device("cuda:" + str(gpu) if torch.cuda.is_available() else "cpu")

    # `config` is a module-level object defined outside this function.
    config.read(config_path)

    # Model parameters
    modelparams = config["modelparams"]
    sentence_encoder_name = modelparams.get("sentence_encoder")
    n_layers = modelparams.getint("n_layers")
    n_head = modelparams.getint("n_head")
    d_k = modelparams.getint("d_k")
    d_v = modelparams.getint("d_v")
    d_inner = modelparams.getint("d_inner")
    d_img = modelparams.getint("d_img")
    d_model = modelparams.getint("d_model")

    print("[modelparames] sentence_encoder_name=%s" % sentence_encoder_name)
    # These entries are only printed when set to a truthy value.
    if n_layers:
        print("[modelparames] n_layers=%d" % n_layers)
    if n_head:
        print("[modelparames] n_head=%d" % n_head)
    if d_k:
        print("[modelparames] d_k=%d" % d_k)
    if d_v:
        print("[modelparames] d_v=%d" % d_v)
    if d_inner:
        print("[modelparames] d_inner=%d" % d_inner)
    print("[modelparames] d_img=%d" % d_img)
    print("[modelparames] d_model=%d" % d_model)
    print()

    hyperparams = config["hyperparams"]
    batch_size = hyperparams.getint("batch_size")

    print("[hyperparames] batch_size=%d" % batch_size)
    print()

    print("[info] Loading vocabulary ...")
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only point --vocab at files you trust.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    dataloader_val = datasets.coco.get_loader(img2vec_path, val_json_path, vocab, batch_size)

    # Model preparation
    img_encoder = models.ImageEncoder(d_img, d_model).to(device)
    sen_encoder = models.SentenceEncoder(vocab, sentence_encoder_name, d_model, n_layers, n_head, d_k, d_v, d_inner).to(device)

    # Load params. map_location keeps the CPU fallback working for
    # checkpoints that were saved from a GPU.
    img_encoder.load_state_dict(
        torch.load(image_encoder_path, map_location=device))
    # strict=False tolerates keys missing from the sentence-encoder checkpoint.
    sen_encoder.load_state_dict(
        torch.load(sentence_encoder_path, map_location=device), strict=False)
    img_encoder.eval()
    sen_encoder.eval()

    # Evaluate
    print("[info] Evaluating on the validation set ...")
    s2i, i2s = evaluate(sen_encoder, img_encoder, dataloader_val, device)
    print(
        "[validation] s2i[R@5=%.02f, R@10=%.02f, R@20=%.02f], i2s[R@5=%.02f, R@10=%.02f, R@20=%.02f]" % \
        (s2i["recall"][5], s2i["recall"][10], s2i["recall"][20],
         i2s["recall"][5], i2s["recall"][10], i2s["recall"][20]))
Beispiel #7
0
def main(args):
    """Interactively retrieve images for a caption or captions for an image.

    Run settings come from ``args`` (``gpu``, ``config``, ``vocab``,
    ``img2vec``, ``val_json``, ``sentence_encoder``, ``image_encoder``,
    ``name``, ``mode``). After encoding all validation candidates, the user
    is prompted for an id. The candidates of the other modality are ranked
    by dot product with the query vector, and up to 9 best matches are
    printed.

    ``mode`` is ``'s2i'`` (caption id -> image ids) or ``'i2s'`` (image id
    -> captions); any other value performs no retrieval.

    Raises
    ------
    ValueError
        If the entered id is not an integer, or is not among the encoded
        candidates.
    """
    gpu = args.gpu
    config_path = args.config
    vocab_path = args.vocab
    img2vec_path = args.img2vec
    val_json_path = args.val_json
    sentence_encoder_path = args.sentence_encoder
    image_encoder_path = args.image_encoder
    name = args.name
    mode = args.mode

    print("[args] gpu=%d" % gpu)
    print("[args] config_path=%s" % config_path)
    print("[args] word2vec_path=%s" % vocab_path)
    print("[args] img2vec_path=%s" % img2vec_path)
    print("[args] val_json_path=%s" % val_json_path)
    print("[args] sentence_encoder_path=%s" % sentence_encoder_path)
    print("[args] image_encoder_path=%s" % image_encoder_path)
    print("[args] name=%s" % name)
    print("[args] mode=%s" % mode)
    print()

    device = torch.device("cuda:" +
                          str(gpu) if torch.cuda.is_available() else "cpu")

    # `config` is a module-level object defined outside this function.
    config.read(config_path)

    # Model parameters
    modelparams = config["modelparams"]
    sentence_encoder_name = modelparams.get("sentence_encoder")
    n_layers = modelparams.getint("n_layers")
    n_head = modelparams.getint("n_head")
    d_k = modelparams.getint("d_k")
    d_v = modelparams.getint("d_v")
    d_inner = modelparams.getint("d_inner")
    d_img = modelparams.getint("d_img")
    d_model = modelparams.getint("d_model")

    print("[modelparames] sentence_encoder_name=%s" % sentence_encoder_name)
    # These entries are only printed when set to a truthy value.
    if n_layers:
        print("[modelparames] n_layers=%d" % n_layers)
    if n_head:
        print("[modelparames] n_head=%d" % n_head)
    if d_k:
        print("[modelparames] d_k=%d" % d_k)
    if d_v:
        print("[modelparames] d_v=%d" % d_v)
    if d_inner:
        print("[modelparames] d_inner=%d" % d_inner)
    print("[modelparames] d_img=%d" % d_img)
    print("[modelparames] d_model=%d" % d_model)
    print()

    hyperparams = config["hyperparams"]
    batch_size = hyperparams.getint("batch_size")

    print("[hyperparames] batch_size=%d" % batch_size)
    print()

    # Data preparation
    print("[info] Loading vocabulary ...")
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only point --vocab at files you trust.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    dataloader_val = datasets.coco.get_loader(img2vec_path, val_json_path,
                                              vocab, batch_size)

    # Model preparation
    img_encoder = models.ImageEncoder(d_img, d_model).to(device)
    sen_encoder = models.SentenceEncoder(vocab, sentence_encoder_name, d_model,
                                         n_layers, n_head, d_k, d_v,
                                         d_inner).to(device)

    # Load params. map_location keeps the CPU fallback working for
    # checkpoints that were saved from a GPU.
    img_encoder.load_state_dict(
        torch.load(image_encoder_path, map_location=device))
    # strict=False tolerates keys missing from the sentence-encoder checkpoint.
    sen_encoder.load_state_dict(
        torch.load(sentence_encoder_path, map_location=device),
        strict=False)
    img_encoder.eval()
    sen_encoder.eval()

    # Evaluate
    print("[info] Encoding candidate ...")
    s_vectors, s_ids, i_vectors, i_ids = encode_candidate(
        sen_encoder, img_encoder, dataloader_val, device)

    # Number of best matches to print; slicing below copes with fewer
    # candidates instead of raising IndexError.
    top_k = 9

    if mode == 's2i':
        print('[info] Retrieving image')
        caption_id = input("input caption id: ")
        caption_id = int(caption_id)
        coco = dataloader_val.dataset.coco
        print("Caption: %s" % coco.anns[caption_id]['caption'])
        target = s_vectors[s_ids == caption_id]
        target = target.flatten()
        if len(target) == 0:
            # Without this check the dot product below fails with an
            # unhelpful shape-mismatch error.
            raise ValueError(
                "caption id %d not found among encoded candidates" %
                caption_id)

        # Rank images by descending dot-product score.
        scores = i_vectors.dot(target)
        sorted_ids = i_ids[np.argsort(scores)[::-1]]
        for ranked_image_id in sorted_ids[:top_k]:
            print(ranked_image_id)

    elif mode == 'i2s':
        print('[info] Retrieving caption')
        image_id = input("input image id: ")
        image_id = int(image_id)
        coco = dataloader_val.dataset.coco
        target = i_vectors[i_ids == image_id]
        target = target.flatten()
        if len(target) == 0:
            # Without this check the dot product below fails with an
            # unhelpful shape-mismatch error.
            raise ValueError(
                "image id %d not found among encoded candidates" % image_id)

        # Rank captions by descending dot-product score.
        scores = s_vectors.dot(target)
        sorted_ids = s_ids[np.argsort(scores)[::-1]]
        for ranked_caption_id in sorted_ids[:top_k]:
            print("Caption: %s" % coco.anns[ranked_caption_id]['caption'])