def test(model, data, device):
    """Run inference over `data` and decode predictions to answer tokens.

    Args:
        model: VQA network returning (logits, loss_time) for a batch input.
        data: data loader; must expose `vocab['answer_idx_to_token']`.
        device: target device for `todevice`.

    Returns:
        (results, question_ids): predicted answer tokens and the matching
        question ids, aligned by position.
    """
    model.eval()
    results = []
    question_ids = []
    # Pure inference: disable autograd so no graph is built per batch
    # (the original omitted this, risking needless memory growth).
    with torch.no_grad():
        for batch in tqdm(data, total=len(data)):
            coco_ids, q_ids, answers, *batch_input = [todevice(x, device) for x in batch]
            logits, loss_time = model(*batch_input)
            # Argmax over the answer dimension -> predicted answer index.
            predicts = torch.max(logits, dim=1)[1]
            for predict in predicts:
                results.append(data.vocab['answer_idx_to_token'][predict.item()])
            for q_id in q_ids:
                question_ids.append(q_id.item())
    return results, question_ids
def val_with_acc(model, data, device):
    """Compute per-sample accuracies over `data` via `batch_accuracy`.

    Args:
        model: VQA network returning (logits, loss_time) for a batch input.
        data: data loader yielding (coco_ids, q_ids, answers, *inputs).
        device: target device for `todevice`.

    Returns:
        (accs, question_ids): one float accuracy and one int question id
        per sample, aligned by position.
    """
    model.eval()
    question_ids = []
    accs = []
    # Pure evaluation: no_grad avoids building the autograd graph.
    # (Removed dead locals `total_acc`, `count`, and unused `predicts`
    # from the original.)
    with torch.no_grad():
        for batch in tqdm(data, total=len(data)):
            coco_ids, q_ids, answers, *batch_input = [todevice(x, device) for x in batch]
            logits, loss_time = model(*batch_input)
            batch_acc = batch_accuracy(logits, answers)
            for q_id in q_ids:
                question_ids.append(q_id.item())
            for acc in batch_acc:
                accs.append(acc.item())
    return accs, question_ids
def validate(model, data, device):
    """Mean accuracy over `data`, scored with `batch_accuracy`.

    NOTE(review): a second `validate` defined later in this file shadows
    this one at import time — confirm which definition is intended.

    Args:
        model: VQA network returning (logits, loss_time) for a batch input.
        data: data loader yielding (coco_ids, q_ids, answers, *inputs).
        device: target device for `todevice`.

    Returns:
        Mean accuracy (float) over all samples.
    """
    model.eval()
    print('validate...')
    # Removed the original's redundant `count, correct = 0, 0`, which was
    # immediately superseded by this initialization.
    total_acc, count = 0, 0
    with torch.no_grad():
        for batch in tqdm(data, total=len(data)):
            coco_ids, q_ids, answers, *batch_input = [todevice(x, device) for x in batch]
            batch_input = [x.detach() for x in batch_input]
            logits, loss_time = model(*batch_input)
            acc = batch_accuracy(logits, answers)
            total_acc += acc.sum().data.item()
            count += answers.size(0)
    acc = total_acc / count
    return acc
def validate(model, data, device, detail=False):
    """Top-1 accuracy of argmax predictions over `data`.

    This definition shadows the earlier `validate` in this file.

    Args:
        model: VQA network returning (logits, loss_t) for a batch input.
        data: data loader yielding (orig_idx, image_idx, answers, *inputs).
        device: target device for `todevice`.
        detail: accepted for interface compatibility; currently unused.

    Returns:
        Fraction of exact-match predictions (float).
    """
    count, correct = 0, 0
    model.eval()
    print('validate...')
    # Pure evaluation: no_grad avoids building the autograd graph.
    # (Removed the original's unused `beta = 1.` local.)
    with torch.no_grad():
        for batch in tqdm(data, total=len(data)):
            orig_idx, image_idx, answers, *batch_input = [
                todevice(x, device) for x in batch
            ]
            logits, loss_t = model(*batch_input)
            predicts = logits.max(1)[1]
            correct += torch.eq(predicts, answers).long().sum().item()
            count += answers.size(0)
    acc = correct / count
    return acc
def train(args):
    """End-to-end training loop: build loaders, model, optimizer; train with
    LR warmup + MultiStepLR decay; optionally validate each epoch and
    checkpoint on improved validation accuracy.

    Args:
        args: parsed CLI namespace providing paths (train/val question pt,
            vocab json, feature h5, save_dir), model hyperparameters, lr,
            batch_size, num_epoch, and the `val`/`restore` flags.
    """
    logging.info("Create train_loader and val_loader.........")
    train_loader_kwargs = {
        'question_pt': args.train_question_pt,
        'vocab_json': args.vocab_json,
        'feature_h5': args.train_feature_h5,
        'batch_size': args.batch_size,
        'num_workers': 4,
        'shuffle': True
    }
    train_loader = CLEVRDataLoader(**train_loader_kwargs)
    if args.val:
        val_loader_kwargs = {
            'question_pt': args.val_question_pt,
            'vocab_json': args.vocab_json,
            'feature_h5': args.val_feature_h5,
            'batch_size': args.batch_size,
            'num_workers': 2,
            'shuffle': False
        }
        val_loader = CLEVRDataLoader(**val_loader_kwargs)

    logging.info("Create model.........")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_kwargs = {
        'vocab': train_loader.vocab,
        'dim_word': args.dim_word,
        'dim_hidden': args.hidden_size,
        'dim_vision': args.dim_vision,
        'state_size': args.state_size,
        'mid_size': args.mid_size,
        'dropout_prob': args.dropout,
        'glimpses': args.glimpses,
        'dim_edge': args.dim_edge
    }
    # Vocab is dropped from the saved kwargs; it is restored separately.
    model_kwargs_tosave = {k: v for k, v in model_kwargs.items() if k != 'vocab'}
    model = Net(**model_kwargs)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model).to(device)  # Support multiple GPUS
    else:
        model = model.to(device)
    logging.info(model)
    ################################################################
    # Optimize only trainable parameters (also reused for grad clipping).
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adamax(parameters, args.lr, weight_decay=0)

    start_epoch = 0
    if args.restore:
        print("Restore checkpoint and optimizer...")
        ckpt = os.path.join(args.save_dir, 'model.pt')
        ckpt = torch.load(ckpt, map_location={'cuda:0': 'cpu'})
        # NOTE(review): resume epoch is hard-coded to 4 rather than read
        # from the checkpoint — confirm this is intentional.
        start_epoch = 4
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(ckpt['state_dict'])
        else:
            model.load_state_dict(ckpt['state_dict'])
        # optimizer.load_state_dict(ckpt['optimizer'])
    # scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.5**(1 / args.lr_halflife))
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
        milestones=[8, 12, 15, 17, 19, 22], gamma=0.5)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5)

    # Linear LR warmup over the first 4 epochs; scheduler takes over after.
    gradual_warmup_steps = [0.25 * args.lr, 0.5 * args.lr, 0.75 * args.lr, 1.0 * args.lr]
    criterion = nn.CrossEntropyLoss().to(device)
    last_acc = 0.  # best validation accuracy seen so far (checkpoint gate)

    logging.info("Start training........")
    for epoch in range(start_epoch, args.num_epoch):
        model.train()
        if epoch < len(gradual_warmup_steps):
            utils.set_lr(optimizer, gradual_warmup_steps[epoch])
        else:
            scheduler.step()
        # Read back the effective LR for logging.
        for p in optimizer.param_groups:
            lr_rate = p['lr']
        logging.info("Learning rate: %6f" % (lr_rate))

        for i, batch in enumerate(train_loader):
            progress = epoch + i / len(train_loader)
            orig_idx, image_idx, answers, *batch_input = [todevice(x, device) for x in batch]
            batch_input = [x.detach() for x in batch_input]
            logits, loss_time = model(*batch_input)
            ##################### loss #####################
            ce_loss = criterion(logits, answers)
            # Auxiliary time loss from the model, down-weighted by 0.01.
            loss_time = 0.01 * loss_time.mean()
            loss = ce_loss + loss_time
            ################################################
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_value_(parameters, clip_value=0.25)
            optimizer.step()
            # Log ~20 times per epoch.
            # NOTE(review): integer division — raises ZeroDivisionError if
            # len(train_loader) < 20; confirm loaders are always larger.
            if (i + 1) % (len(train_loader) // 20) == 0:
                logging.info("Progress %.3f ce_loss = %.3f time_loss = %.3f"
                             % (progress, ce_loss.item(), loss_time.item()))

        # Free the last batch's tensors before validation/checkpointing.
        # NOTE(review): placement inferred from mangled source — could also
        # have been inside the batch loop; confirm against the original.
        del answers, batch_input, logits
        torch.cuda.empty_cache()
        # save_checkpoint(epoch, model, optimizer, model_kwargs_tosave, os.path.join(args.save_dir, 'model.pt'))
        logging.info(' >>>>>> save to %s <<<<<<' % (args.save_dir))
        if args.val:
            if epoch % 1 == 0:
                valid_acc = validate(model, val_loader, device)
                logging.info('\n ~~~~~~ Valid Accuracy: %.4f ~~~~~~~\n' % valid_acc)
                # Checkpoint only when validation accuracy does not regress.
                if valid_acc >= last_acc:
                    last_acc = valid_acc
                    save_checkpoint(epoch, model, optimizer, model_kwargs_tosave,
                                    os.path.join(args.save_dir, 'model.pt'))