def main():
    """Train a VQA model on pre-extracted image features.

    Parses CLI args, builds the train/val dataloaders (vocab and all
    word/answer index maps always come from the *training* set), constructs
    the model, and runs the train/evaluate loop for the requested epochs.
    """
    args = parse_args()

    # Set the GPU to use
    torch.cuda.set_device(args.gpu)

    annotations = osp.expanduser(args.annotations)
    questions = osp.expanduser(args.questions)
    vqa_loader = dataset.get_train_dataloader(annotations, questions,
                                              args.images, args)

    # We always use the vocab from the training set
    vocab = vqa_loader.dataset.vocab
    maps = {
        "word_to_wid": vqa_loader.dataset.word_to_wid,
        "wid_to_word": vqa_loader.dataset.wid_to_word,
        "ans_to_aid": vqa_loader.dataset.ans_to_aid,
        "aid_to_ans": vqa_loader.dataset.aid_to_ans,
    }

    # Validation reuses the training vocab/maps so indices stay consistent.
    val_loader = dataset.get_val_dataloader(osp.expanduser(args.val_annotations),
                                            osp.expanduser(args.val_questions),
                                            args.val_images, args,
                                            maps=maps, vocab=vocab,
                                            shuffle=False)

    arch = Models[args.arch].value
    model = arch(len(vocab), output_dim=args.top_answer_limit)
    if torch.cuda.is_available():
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           betas=tuple(args.betas),
                           weight_decay=args.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.decay_interval,
                                    gamma=args.lr_decay)

    vis = visualize.Visualizer(args.port)

    print("Beginning training")
    print("#" * 80)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.train(model, vqa_loader, criterion, optimizer, epoch, args,
                      vis=vis)
        trainer.evaluate(model, val_loader, criterion, epoch, args, vis=vis)
        # PyTorch >= 1.1 requires scheduler.step() *after* the epoch's
        # optimizer updates; stepping before training (as before) shifts
        # the decay schedule and triggers a runtime warning.
        scheduler.step()

    print("Training complete!")
def main():
    """Train a VQA model end-to-end from raw images.

    Same flow as the feature-based trainer, but builds torchvision-style
    augmentation pipelines for train (random crop/flip) and val (fixed
    resize), and supports resuming from a saved checkpoint via --resume.
    """
    args = parse_args()

    # Set the GPU to use
    torch.cuda.set_device(args.gpu)

    # Standard ImageNet augmentation + normalization for training.
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    vqa_loader = dataset.get_train_dataloader(osp.expanduser(args.annotations),
                                              osp.expanduser(args.questions),
                                              args.images, args,
                                              raw_images=args.raw_images,
                                              transforms=transform)

    # We always use the vocab from the training set
    vocab = vqa_loader.dataset.vocab
    maps = {
        "vocab": vocab,
        "word_to_wid": vqa_loader.dataset.word_to_wid,
        "wid_to_word": vqa_loader.dataset.wid_to_word,
        "ans_to_aid": vqa_loader.dataset.ans_to_aid,
        "aid_to_ans": vqa_loader.dataset.aid_to_ans,
    }

    # Deterministic resize (no augmentation) for validation.
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    val_loader = dataset.get_val_dataloader(osp.expanduser(args.val_annotations),
                                            osp.expanduser(args.val_questions),
                                            args.val_images, args,
                                            raw_images=args.raw_images,
                                            maps=maps, vocab=vocab,
                                            shuffle=False,
                                            transforms=val_transform)

    arch = Models[args.arch].value
    model = arch(len(vocab), output_dim=args.top_answer_limit,
                 raw_images=args.raw_images)

    if args.resume:
        state = torch.load(args.resume)
        model.load_state_dict(state["model"])

    # Guard the CUDA move so a CPU-only machine doesn't crash
    # (the unconditional .cuda() here raised on hosts without a GPU).
    if torch.cuda.is_available():
        model.cuda()
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           betas=tuple(args.betas),
                           weight_decay=args.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.decay_interval,
                                    gamma=args.lr_decay)

    if args.visualize:
        vis = visualize.Visualizer(args.port)
    else:
        vis = None

    print("Beginning training")
    print("#" * 80)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.train(model, vqa_loader, criterion, optimizer, epoch, args,
                      vis=vis)
        trainer.evaluate(model, val_loader, criterion, epoch, args, vis=vis)
        # PyTorch >= 1.1 requires scheduler.step() *after* the epoch's
        # optimizer updates; stepping before training (as before) shifts
        # the decay schedule and triggers a runtime warning.
        scheduler.step()

    print("Training complete!")
def main():
    """Evaluate a (optionally pre-trained) VQA model on the validation set.

    Builds the train dataloader only to recover the training vocab/maps,
    loads checkpoint weights if --resume is given, runs inference without
    gradients, and prints per-answer-type accuracy.
    """
    args = parse_args()

    # Set the GPU to use
    torch.cuda.set_device(args.gpu)

    vqa_loader = dataset.get_train_dataloader(osp.expanduser(args.annotations),
                                              osp.expanduser(args.questions),
                                              args.images, args,
                                              raw_images=args.raw_images,
                                              transforms=None)

    # We always use the vocab from the training set
    vocab = vqa_loader.dataset.vocab
    maps = {
        "vocab": vocab,
        "word_to_wid": vqa_loader.dataset.word_to_wid,
        "wid_to_word": vqa_loader.dataset.wid_to_word,
        "ans_to_aid": vqa_loader.dataset.ans_to_aid,
        "aid_to_ans": vqa_loader.dataset.aid_to_ans,
    }

    # Deterministic resize (no augmentation) for evaluation.
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    val_loader = dataset.get_val_dataloader(osp.expanduser(args.val_annotations),
                                            osp.expanduser(args.val_questions),
                                            args.val_images, args,
                                            raw_images=args.raw_images,
                                            maps=maps, vocab=vocab,
                                            shuffle=False,
                                            transforms=val_transform)

    arch = Models[args.arch].value
    model = arch(len(vocab), output_dim=args.top_answer_limit,
                 raw_images=args.raw_images)

    if args.resume:
        state = torch.load(args.resume)
        model.load_state_dict(state["model"])
    else:
        print(
            "No trained model weights provided. Don't expect the answers to be meaningful."
        )

    if torch.cuda.is_available():
        model.cuda()

    # Switch dropout/batch-norm to inference mode; without this the model
    # evaluated with training-time stochastic behavior under no_grad().
    model.eval()

    with torch.no_grad():
        results = evaluate(model, val_loader)

    for k in results.keys():
        results[k] = np.asarray(results[k])
        # .mean() gives the scalar accuracy; the previous
        # `sum() / results[k].shape` divided by a *tuple*, yielding a
        # one-element array instead of a number.
        acc = results[k].mean()
        print("Accuracy for {0} type answers: \t\t{1}".format(k, acc))