def build_eig_estimator(args, vocab, optim_function=True, method_override=None,
                        model_batch_size=None):
    if args.dataset == "wmt14":
        vocab = vocab["de"]

    if args.eig_text_encoder == "rnn":
        text_encoder = models.RNNTextEncoder(vocab)
    elif args.eig_text_encoder == "bow":
        text_encoder = models.BOWTextEncoder(vocab)
    else:
        raise NotImplementedError(f"text encoder = {args.eig_text_encoder}")

    if method_override is None:
        acquisition = args.acquisition
    else:
        acquisition = method_override

    if acquisition == "eig_y":
        # Depends on captions only
        eig_model = eig.estimators.BatchedEIGEstimator(
            args.ensemble, text_encoder, model_batch_size)
    elif acquisition == "eig_xy":
        if args.backbone == "mlp" and args.dataset == "coco":
            backbone_input_dim = 512
        else:
            backbone_input_dim = None
        img_encoder = models.ImageEncoder(
            backbone=args.eig_img_encoder,
            pretrained=False,
            grayscale="mnist" in args.dataset,
            backbone_input_dim=backbone_input_dim,
        )
        # Depends on both the image and the captions
        eig_model = eig.estimators.EIGEstimatorWithImage(
            args.ensemble, text_encoder, img_encoder)
    else:
        return None

    eig_optimizer = lambda p: optim.Adam(p)
    if not optim_function:
        # Return an optimizer instance rather than the factory
        eig_optimizer = eig_optimizer(eig_model.parameters())

    # No reduction: individual loss terms can be weighted during EIG evaluation
    eig_loss = nn.CrossEntropyLoss(reduction="none")

    if args.cuda:
        eig_model = eig_model.cuda()
        eig_loss = eig_loss.cuda()

    return eig_model, eig_optimizer, eig_loss
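
# --- Illustrative usage (not from the original source): a minimal sketch of
# calling build_eig_estimator. The argparse fields mirror the attribute
# accesses in the function above; the concrete values are hypothetical.
def _example_build_eig_estimator(vocab):
    from argparse import Namespace
    args = Namespace(dataset="coco", eig_text_encoder="rnn",
                     acquisition="eig_y", ensemble=5,
                     eig_img_encoder="resnet", backbone="resnet", cuda=False)
    built = build_eig_estimator(args, vocab, optim_function=False)
    if built is None:
        # The chosen acquisition method has no EIG estimator
        return None
    eig_model, eig_optimizer, eig_loss = built
    return eig_model, eig_optimizer, eig_loss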
def __init__(self):
    self.images = storage.ImageSet("../datasets/celeba_160/",
                                   is_uint8=False, require_cwh=True)
    self.val_images = storage.ImageSet("../datasets/celeba_160_val/",
                                       is_uint8=False, require_cwh=True)
    self.masks = mask_generator.MaskSet(320, mask_size=(160, 128),
                                        num_holes=2, size_holes=24, border=32)

    self.mask_encoder = models.MaskEncoder().to(DEVICE)
    self.image_encoder = models.ImageEncoder().to(DEVICE)
    self.image_decoder = models.ImageDecoder().to(DEVICE)
    self.image_discriminator = models.ImageDiscriminator().to(DEVICE)

    self.mask_encoder.load_state_dict(
        torch.load(MASK_ENCODER_DIR + "mask_encoder.params"))
    # Resume from previous checkpoints if they exist
    if os.path.exists(OUTPUT_DIR + "image_decoder.params"):
        self.image_encoder.load_state_dict(
            torch.load(OUTPUT_DIR + "image_encoder.params"))
        self.image_decoder.load_state_dict(
            torch.load(OUTPUT_DIR + "image_decoder.params"))
        self.image_discriminator.load_state_dict(
            torch.load(OUTPUT_DIR + "image_discriminator.params"))

    # The mask encoder is pretrained and kept frozen
    for param in self.mask_encoder.parameters():
        param.requires_grad = False

    gen_params = (list(self.image_encoder.parameters()) +
                  list(self.image_decoder.parameters()))
    dis_params = list(self.image_discriminator.parameters())

    self.mse_loss = nn.MSELoss().to(DEVICE)
    self.bce_loss = nn.BCELoss().to(DEVICE)
    self.gen_mse_opti = torch.optim.Adam(gen_params, lr=1e-2)
    self.gen_bce_opti = torch.optim.Adam(gen_params, lr=2e-3)
    self.dis_opti = torch.optim.Adam(dis_params, lr=2e-3)

    # Constant real/fake targets for the discriminator's BCE loss
    self.ones = torch.ones([BATCH_SIZE, 1]).to(DEVICE)
    self.zeros = torch.zeros([BATCH_SIZE, 1]).to(DEVICE)
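
# --- Illustrative adversarial step (an assumption, not from the original
# file): a minimal sketch of how the optimizers and constant targets set up
# above are typically combined. The method name, the decoder's call
# signature, and the use of a single generator optimizer are hypothetical;
# the trainer above keeps separate MSE and BCE generator optimizers.
def _example_train_step(self, real_images, masks):
    # Encode the mask with the frozen, pretrained mask encoder
    mask_codes = self.mask_encoder(masks)
    # Reconstruct from the masked image (assuming the mask zeroes out holes)
    fake_images = self.image_decoder(
        self.image_encoder(real_images * masks), mask_codes)

    # Discriminator update: real -> 1, reconstruction -> 0
    dis_loss = (
        self.bce_loss(self.image_discriminator(real_images), self.ones) +
        self.bce_loss(self.image_discriminator(fake_images.detach()),
                      self.zeros))
    self.dis_opti.zero_grad()
    dis_loss.backward()
    self.dis_opti.step()

    # Generator update: reconstruct the input and fool the discriminator
    gen_loss = (
        self.mse_loss(fake_images, real_images) +
        self.bce_loss(self.image_discriminator(fake_images), self.ones))
    self.gen_bce_opti.zero_grad()
    gen_loss.backward()
    self.gen_bce_opti.step()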
def main(args):
    gpu = args.gpu
    config_path = args.config
    vocab_path = args.vocab
    img2vec_path = args.img2vec
    train_json_path = args.train_json
    name = args.name
    save_path = args.save

    print("[args] gpu=%d" % gpu)
    print("[args] config_path=%s" % config_path)
    print("[args] vocab_path=%s" % vocab_path)
    print("[args] img2vec_path=%s" % img2vec_path)
    print("[args] train_json_path=%s" % train_json_path)
    print("[args] name=%s" % name)
    print("[args] save_path=%s" % save_path)
    print()

    device = torch.device("cuda:" + str(gpu)
                          if torch.cuda.is_available() else "cpu")

    config.read(config_path)

    # Model parameters
    modelparams = config["modelparams"]
    sentence_encoder_name = modelparams.get("sentence_encoder")
    n_layers = modelparams.getint("n_layers")
    n_head = modelparams.getint("n_head")
    d_k = modelparams.getint("d_k")
    d_v = modelparams.getint("d_v")
    d_inner = modelparams.getint("d_inner")
    d_img = modelparams.getint("d_img")
    d_model = modelparams.getint("d_model")

    print("[modelparams] sentence_encoder_name=%s" % sentence_encoder_name)
    # Transformer-specific settings are optional; print them only if set
    if n_layers:
        print("[modelparams] n_layers=%d" % n_layers)
    if n_head:
        print("[modelparams] n_head=%d" % n_head)
    if d_k:
        print("[modelparams] d_k=%d" % d_k)
    if d_v:
        print("[modelparams] d_v=%d" % d_v)
    if d_inner:
        print("[modelparams] d_inner=%d" % d_inner)
    print("[modelparams] d_img=%d" % d_img)
    print("[modelparams] d_model=%d" % d_model)
    print()

    # Hyperparameters
    hyperparams = config["hyperparams"]
    margin = hyperparams.getfloat("margin")
    weight_decay = hyperparams.getfloat("weight_decay")
    grad_clip = hyperparams.getfloat("grad_clip")
    lr = hyperparams.getfloat("lr")
    batch_size = hyperparams.getint("batch_size")
    n_epochs = hyperparams.getint("n_epochs")
    n_negatives = hyperparams.getint("n_negatives")

    print("[hyperparams] margin=%f" % margin)
    print("[hyperparams] weight_decay=%f" % weight_decay)
    print("[hyperparams] grad_clip=%f" % grad_clip)
    print("[hyperparams] lr=%f" % lr)
    print("[hyperparams] batch_size=%d" % batch_size)
    print("[hyperparams] n_epochs=%d" % n_epochs)
    print("[hyperparams] n_negatives=%d" % n_negatives)
    print()

    # Data preparation
    print("[info] Loading vocabulary ...")
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)
    dataloader_train = datasets.coco.get_loader(img2vec_path, train_json_path,
                                                vocab, batch_size)

    # Model preparation
    img_encoder = models.ImageEncoder(d_img, d_model).to(device)
    sen_encoder = models.SentenceEncoder(vocab, sentence_encoder_name, d_model,
                                         n_layers, n_head, d_k, d_v,
                                         d_inner).to(device)
    img_optimizer = optim.Adam(img_encoder.parameters(), lr=lr,
                               weight_decay=weight_decay)
    sen_optimizer = optim.Adam(sen_encoder.parameters(), lr=lr,
                               weight_decay=weight_decay)
    criterion = PairwiseRankingLoss(margin=margin)

    # Train
    print("[info] Training ...")
    for epoch in range(n_epochs):
        pbar = tqdm(dataloader_train)
        running_loss = 0.0
        for i, (images, src_seq, src_pos, _, _) in enumerate(pbar):
            pbar.set_description("epoch %3d / %d" % (epoch + 1, n_epochs))
            images = images.to(device)
            src_seq = src_seq.to(device)
            src_pos = src_pos.to(device)

            img_embedded = img_encoder(images)
            sen_embedded = sen_encoder(src_seq, src_pos)

            img_optimizer.zero_grad()
            sen_optimizer.zero_grad()

            # Average the ranking loss over several random negative samples
            loss = 0.0
            for _ in range(n_negatives):
                perm = torch.randperm(len(img_embedded))
                img_shuffled = img_embedded[perm]
                loss += criterion(sen_embedded, img_embedded, img_shuffled)
            loss /= n_negatives

            loss.backward()
            nn.utils.clip_grad_value_(img_encoder.parameters(), grad_clip)
            nn.utils.clip_grad_value_(sen_encoder.parameters(), grad_clip)
            img_optimizer.step()
            sen_optimizer.step()

            running_loss += loss.item()
            if (i + 1) % args.print_every == 0:
                pbar.set_postfix(loss=running_loss / args.print_every)
                running_loss = 0.0

        if (epoch + 1) % args.save_every == 0:
            save_dir = os.path.join(save_path, name)
            if not os.path.isdir(save_dir):
                os.mkdir(save_dir)
            # Exclude the embedding weights from the checkpoint; they are
            # restored separately and loaded with strict=False at eval time
            sen_dict = sen_encoder.state_dict()
            sen_dict.pop("embed.weight")
            img_dict = img_encoder.state_dict()
            torch.save(
                sen_dict,
                os.path.join(save_dir,
                             "sentence_encoder-{}.pth".format(epoch + 1)))
            torch.save(
                img_dict,
                os.path.join(save_dir,
                             "image_encoder-{}.pth".format(epoch + 1)))
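
# --- Hedged sketch of PairwiseRankingLoss (an assumption; the class is
# imported from elsewhere and its definition is not shown here). A standard
# margin-based hinge loss consistent with the three-argument call in the
# training loop above; the real implementation may differ, e.g. in the
# similarity function or the reduction.
import torch
import torch.nn as nn

class PairwiseRankingLossSketch(nn.Module):
    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, sentences, images, images_negative):
        # Dot-product similarity of matched and mismatched pairs
        pos = (sentences * images).sum(dim=1)
        neg = (sentences * images_negative).sum(dim=1)
        # Hinge: mismatched pairs must score at least `margin` below matches
        return torch.clamp(self.margin - pos + neg, min=0.0).mean()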
def build_model(args, vocab):
    """
    Build the model, optimizer, and loss according to experiment args

    Parameters
    ----------
    args : argparse.Namespace
        Experiment arguments
    vocab : dict
        Vocab for the language model

    Returns
    -------
    model : torch.nn.Module
        The model to be trained
    optimizer_func : () -> torch.optim.Optimizer
        A function that creates an optimizer (a function because the
        optimizer may need to be re-initialized)
    loss : torch.nn.Module
        The loss function
    """
    params_to_optimize = []

    # Build the encoder according to the given arguments
    if args.backbone == "mlp" and args.dataset == "coco":
        backbone_input_dim = 512
    else:
        backbone_input_dim = None
    encoder = models.ImageEncoder(
        backbone=args.backbone,
        pretrained=args.pretrained_backbone,
        grayscale="mnist" in args.dataset,
        backbone_input_dim=backbone_input_dim,
    )
    encoder.fine_tune(not args.freeze_encoder)
    if not args.freeze_encoder:
        params_to_optimize.append({
            "params": [p for p in encoder.parameters() if p.requires_grad],
            "lr": args.encoder_lr,
        })

    if args.attention:
        decoder = models.DecoderWithAttention(
            embed_dim=args.emb_dim,
            decoder_dim=args.decoder_dim,
            vocab_size=vocab,
            encoder_dim=encoder.output_dim,
            dropout=args.dropout,
            attention_dim=args.attention_dim,
        )
    else:
        decoder = models.Decoder(
            embed_dim=args.emb_dim,
            decoder_dim=args.decoder_dim,
            vocab=vocab,
            encoder_dim=encoder.output_dim,
            dropout=args.dropout,
        )
    params_to_optimize.append({
        "params": [p for p in decoder.parameters() if p.requires_grad],
        "lr": args.decoder_lr,
    })

    loss = nn.CrossEntropyLoss()
    model = models.Captioner(encoder, decoder, encoder_dropout=args.dropout)
    optimizer_func = lambda: optim.Adam(params_to_optimize)

    if args.cuda:
        model = model.cuda()
        loss = loss.cuda()

    return model, optimizer_func, loss
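
# --- Illustrative usage (not from the original source): a sketch of why
# build_model returns an optimizer *factory* rather than an optimizer.
# Re-invoking optimizer_func() discards stale Adam moment estimates when the
# model is retrained from the same parameter groups, e.g. after each data
# acquisition round. The loop below is hypothetical.
def _example_retraining_loop(args, vocab, n_rounds):
    model, optimizer_func, loss = build_model(args, vocab)
    for _ in range(n_rounds):
        optimizer = optimizer_func()  # fresh optimizer state every round
        # ... acquire data, then train `model` with `optimizer` and `loss` ...
    return model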
import os

import matplotlib.pyplot as plt
import numpy as np
import torch

import mask_generator
import models
import storage
import utilities

EVALUATE_DIR = "../evaluate_min/"
IMAGE_ENCODER_DIR = "../outputs/train_image_encoder_min/"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    image_encoder = models.ImageEncoder().to(DEVICE)
    image_decoder = models.ImageDecoderMin().to(DEVICE)
    image_encoder.load_state_dict(
        torch.load(IMAGE_ENCODER_DIR + "image_encoder.params"))
    image_decoder.load_state_dict(
        torch.load(IMAGE_ENCODER_DIR + "image_decoder.params"))

    test_set = storage.ImageSet("../datasets/celeba_160_test_min/",
                                is_uint8=False, require_cwh=True)
    mask = mask_generator.generate_mask(mask_size=(160, 128), num_holes=2,
                                        size_holes=24, border=32,
                                        expand_dim=False)
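
# --- Illustrative helper (an assumption; the original script's body past
# this point is not shown). A minimal sketch of scoring one reconstruction:
# zero out the masked region, encode/decode, and measure the MSE against the
# original. The decoder's call signature and the mask/image shapes are
# assumptions.
def _example_reconstruction_error(image_encoder, image_decoder, image, mask):
    with torch.no_grad():
        masked = image * torch.as_tensor(mask, dtype=image.dtype,
                                         device=image.device)
        recon = image_decoder(image_encoder(masked.unsqueeze(0)))
        return torch.mean((recon.squeeze(0) - image) ** 2).item()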
def main(args):
    gpu = args.gpu
    config_path = args.config
    vocab_path = args.vocab
    img2vec_path = args.img2vec
    val_json_path = args.val_json
    sentence_encoder_path = args.sentence_encoder
    image_encoder_path = args.image_encoder
    name = args.name

    print("[args] gpu=%d" % gpu)
    print("[args] config_path=%s" % config_path)
    print("[args] vocab_path=%s" % vocab_path)
    print("[args] img2vec_path=%s" % img2vec_path)
    print("[args] val_json_path=%s" % val_json_path)
    print("[args] sentence_encoder_path=%s" % sentence_encoder_path)
    print("[args] image_encoder_path=%s" % image_encoder_path)
    print("[args] name=%s" % name)
    print()

    device = torch.device("cuda:" + str(gpu)
                          if torch.cuda.is_available() else "cpu")

    config.read(config_path)

    # Model parameters
    modelparams = config["modelparams"]
    sentence_encoder_name = modelparams.get("sentence_encoder")
    n_layers = modelparams.getint("n_layers")
    n_head = modelparams.getint("n_head")
    d_k = modelparams.getint("d_k")
    d_v = modelparams.getint("d_v")
    d_inner = modelparams.getint("d_inner")
    d_img = modelparams.getint("d_img")
    d_model = modelparams.getint("d_model")

    print("[modelparams] sentence_encoder_name=%s" % sentence_encoder_name)
    # Transformer-specific settings are optional; print them only if set
    if n_layers:
        print("[modelparams] n_layers=%d" % n_layers)
    if n_head:
        print("[modelparams] n_head=%d" % n_head)
    if d_k:
        print("[modelparams] d_k=%d" % d_k)
    if d_v:
        print("[modelparams] d_v=%d" % d_v)
    if d_inner:
        print("[modelparams] d_inner=%d" % d_inner)
    print("[modelparams] d_img=%d" % d_img)
    print("[modelparams] d_model=%d" % d_model)
    print()

    # Hyperparameters
    hyperparams = config["hyperparams"]
    batch_size = hyperparams.getint("batch_size")
    print("[hyperparams] batch_size=%d" % batch_size)
    print()

    # Data preparation
    print("[info] Loading vocabulary ...")
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)
    dataloader_val = datasets.coco.get_loader(img2vec_path, val_json_path,
                                              vocab, batch_size)

    # Model preparation
    img_encoder = models.ImageEncoder(d_img, d_model).to(device)
    sen_encoder = models.SentenceEncoder(vocab, sentence_encoder_name, d_model,
                                         n_layers, n_head, d_k, d_v,
                                         d_inner).to(device)

    # Load params (strict=False because the saved sentence encoder omits the
    # embedding weights)
    img_encoder.load_state_dict(torch.load(image_encoder_path))
    sen_encoder.load_state_dict(torch.load(sentence_encoder_path),
                                strict=False)
    img_encoder.eval()
    sen_encoder.eval()

    # Evaluate
    print("[info] Evaluating on the validation set ...")
    s2i, i2s = evaluate(sen_encoder, img_encoder, dataloader_val, device)
    print("[validation] s2i[R@5=%.02f, R@10=%.02f, R@20=%.02f], "
          "i2s[R@5=%.02f, R@10=%.02f, R@20=%.02f]"
          % (s2i["recall"][5], s2i["recall"][10], s2i["recall"][20],
             i2s["recall"][5], i2s["recall"][10], i2s["recall"][20]))
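
# --- Illustrative metric (an assumption; evaluate() is defined elsewhere).
# A sketch of the recall@K numbers reported above: for each query, check
# whether the ground-truth candidate appears among the K highest-scoring
# ones. The function name and signature are hypothetical.
import numpy as np

def _example_recall_at_k(scores, gt_indices, ks=(5, 10, 20)):
    """scores: (n_queries, n_candidates) similarity matrix;
    gt_indices: (n_queries,) index of each query's correct candidate."""
    ranking = np.argsort(-scores, axis=1)  # best candidate first
    recall = {}
    for k in ks:
        hits = (ranking[:, :k] == gt_indices[:, None]).any(axis=1)
        recall[k] = 100.0 * hits.mean()  # reported as a percentage
    return recall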
def main(args):
    gpu = args.gpu
    config_path = args.config
    vocab_path = args.vocab
    img2vec_path = args.img2vec
    val_json_path = args.val_json
    sentence_encoder_path = args.sentence_encoder
    image_encoder_path = args.image_encoder
    name = args.name
    mode = args.mode

    print("[args] gpu=%d" % gpu)
    print("[args] config_path=%s" % config_path)
    print("[args] vocab_path=%s" % vocab_path)
    print("[args] img2vec_path=%s" % img2vec_path)
    print("[args] val_json_path=%s" % val_json_path)
    print("[args] sentence_encoder_path=%s" % sentence_encoder_path)
    print("[args] image_encoder_path=%s" % image_encoder_path)
    print("[args] name=%s" % name)
    print("[args] mode=%s" % mode)
    print()

    device = torch.device("cuda:" + str(gpu)
                          if torch.cuda.is_available() else "cpu")

    config.read(config_path)

    # Model parameters
    modelparams = config["modelparams"]
    sentence_encoder_name = modelparams.get("sentence_encoder")
    n_layers = modelparams.getint("n_layers")
    n_head = modelparams.getint("n_head")
    d_k = modelparams.getint("d_k")
    d_v = modelparams.getint("d_v")
    d_inner = modelparams.getint("d_inner")
    d_img = modelparams.getint("d_img")
    d_model = modelparams.getint("d_model")

    print("[modelparams] sentence_encoder_name=%s" % sentence_encoder_name)
    # Transformer-specific settings are optional; print them only if set
    if n_layers:
        print("[modelparams] n_layers=%d" % n_layers)
    if n_head:
        print("[modelparams] n_head=%d" % n_head)
    if d_k:
        print("[modelparams] d_k=%d" % d_k)
    if d_v:
        print("[modelparams] d_v=%d" % d_v)
    if d_inner:
        print("[modelparams] d_inner=%d" % d_inner)
    print("[modelparams] d_img=%d" % d_img)
    print("[modelparams] d_model=%d" % d_model)
    print()

    # Hyperparameters
    hyperparams = config["hyperparams"]
    batch_size = hyperparams.getint("batch_size")
    print("[hyperparams] batch_size=%d" % batch_size)
    print()

    # Data preparation
    print("[info] Loading vocabulary ...")
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)
    dataloader_val = datasets.coco.get_loader(img2vec_path, val_json_path,
                                              vocab, batch_size)

    # Model preparation
    img_encoder = models.ImageEncoder(d_img, d_model).to(device)
    sen_encoder = models.SentenceEncoder(vocab, sentence_encoder_name, d_model,
                                         n_layers, n_head, d_k, d_v,
                                         d_inner).to(device)

    # Load params (strict=False because the saved sentence encoder omits the
    # embedding weights)
    img_encoder.load_state_dict(torch.load(image_encoder_path))
    sen_encoder.load_state_dict(torch.load(sentence_encoder_path),
                                strict=False)
    img_encoder.eval()
    sen_encoder.eval()

    # Encode all validation captions and images once; retrieval then reduces
    # to ranking by dot-product similarity
    print("[info] Encoding candidate ...")
    s_vectors, s_ids, i_vectors, i_ids = encode_candidate(
        sen_encoder, img_encoder, dataloader_val, device)

    if mode == "s2i":
        print("[info] Retrieving image")
        caption_id = int(input("input caption id: "))
        coco = dataloader_val.dataset.coco
        print("Caption: %s" % coco.anns[caption_id]["caption"])
        target = s_vectors[s_ids == caption_id].flatten()
        scores = i_vectors.dot(target)
        sorted_ids = i_ids[np.argsort(scores)[::-1]]
        # Show the ids of the nine highest-scoring images
        for i in range(9):
            print(sorted_ids[i])
    elif mode == "i2s":
        print("[info] Retrieving caption")
        image_id = int(input("input image id: "))
        coco = dataloader_val.dataset.coco
        target = i_vectors[i_ids == image_id].flatten()
        scores = s_vectors.dot(target)
        sorted_ids = s_ids[np.argsort(scores)[::-1]]
        # Show the nine highest-scoring captions
        for i in range(9):
            print("Caption: %s" % coco.anns[sorted_ids[i]]["caption"])
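
# --- Illustrative helper (an assumption; encode_candidate is defined
# elsewhere). A sketch of what it plausibly does: run both encoders over the
# validation loader once and collect embeddings plus their COCO ids as numpy
# arrays. The batch layout mirrors the training loop; the id fields and any
# deduplication of repeated images are assumptions.
def _example_encode_candidate(sen_encoder, img_encoder, dataloader, device):
    s_vectors, s_ids, i_vectors, i_ids = [], [], [], []
    with torch.no_grad():
        for images, src_seq, src_pos, img_id_batch, cap_id_batch in dataloader:
            i_vectors.append(img_encoder(images.to(device)).cpu().numpy())
            s_vectors.append(
                sen_encoder(src_seq.to(device),
                            src_pos.to(device)).cpu().numpy())
            i_ids.append(np.asarray(img_id_batch))
            s_ids.append(np.asarray(cap_id_batch))
    return (np.concatenate(s_vectors), np.concatenate(s_ids),
            np.concatenate(i_vectors), np.concatenate(i_ids))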