def train(args): """ Train a VQA model using the training set """ # set random seed torch.manual_seed(1000) if torch.cuda.is_available(): torch.cuda.manual_seed(1000) else: raise SystemExit('No CUDA available, script requires cuda') # Load the VQA training set print('Loading data') dataset = VQA_Dataset(args.data_dir, args.emb) loader = DataLoader(dataset, batch_size=args.bsize, shuffle=True, num_workers=5, collate_fn=collate_fn) # Load the VQA validation set val_dataset = VQA_Dataset(args.data_dir, args.emb, train=False) val_loader = DataLoader(val_dataset, batch_size=args.bsize, shuffle=False, num_workers=4, collate_fn=collate_fn) n_batches = len(dataset) // args.bsize # Print data and model parameters print('Parameters:\n\t' 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, dataset.feat_dim, args.hid, dataset.n_answers)) print('Initializing model') model = Model(vocab_size=dataset.q_words, emb_dim=args.emb, feat_dim=dataset.feat_dim, hid_dim=args.hid, out_dim=dataset.n_answers, dropout=args.dropout, neighbourhood_size=args.neighbourhood_size, pretrained_wemb=dataset.pretrained_wemb) criterion = nn.MultiLabelSoftMarginLoss() # Move it to GPU #model = model.cuda() model = nn.DataParallel(model).cuda() criterion = criterion.cuda() # Define the optimiser optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) logger = Logger(os.path.join('save/', 'log.txt')) # Continue training from saved model start_ep = 0 if args.model_path and os.path.isfile(args.model_path): print('Resuming from checkpoint %s' % (args.model_path)) ckpt = torch.load(args.model_path) start_ep = ckpt['epoch'] model.load_state_dict(ckpt['state_dict']) optimizer.load_state_dict(ckpt['optimizer']) # Update the learning rate for param_group in optimizer.param_groups: param_group['lr'] = args.lr # Learning rate scheduler scheduler = MultiStepLR(optimizer, milestones=[5, 10, 15], gamma=0.1) scheduler.last_epoch = start_ep - 1 # Train iterations print('Start training.') for ep in range(start_ep, start_ep + args.ep): scheduler.step() ep_loss = 0.0 ep_correct = 0.0 ave_loss = 0.0 ave_correct = 0.0 losses = [] for step, next_batch in tqdm(enumerate(loader)): model.train() # Move batch to cuda q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ batch_to_cuda(next_batch) # forward pass output = model(q_batch, i_batch, k_batch, qlen_batch) #loss = criterion(output[0], a_batch) loss = criterion(output[0], a_batch) + criterion( output[1], a_batch) output = output[0] + output[1] # Compute batch accuracy based on vqa evaluation correct = total_vqa_score(output, vote_batch) ep_correct += correct ep_loss += loss.data.item() ave_correct += correct ave_loss += loss.data.item() losses.append(loss.cpu().data.item()) # This is a 40 step average if step % TRAIN_REPORT_INTERVAL == 0 and step != 0: logger.write( ' Epoch %02d(%03d/%03d), avg loss: %.7f, avg accuracy: %.2f%%' % (ep + 1, step, n_batches, ave_loss / TRAIN_REPORT_INTERVAL, ave_correct * 100 / (args.bsize * TRAIN_REPORT_INTERVAL))) ave_correct = 0 ave_loss = 0 # Compute gradient and do optimisation step optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() # save model and compute accuracy for epoch epoch_loss = ep_loss / n_batches epoch_acc = ep_correct * 100 / (n_batches * args.bsize) print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % (ep + 1, epoch_loss, epoch_acc)) logger.write( 'Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % (ep + 1, 
epoch_loss, epoch_acc)) #Compute validation accuracy at the end of epoch model.eval() save(model, optimizer, ep, epoch_loss, epoch_acc, dir=args.save_dir, name=args.name + '_' + str(ep + 1)) with torch.no_grad(): test_correct = 0 for data in val_loader: q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ batch_to_cuda(data) # forward pass output = model(q_batch, i_batch, k_batch, qlen_batch) output = output[0] + output[1] test_correct += total_vqa_score(output, vote_batch) acc = test_correct / len(val_dataset) * 100 logger.write("Validation accuracy: {:.2f} %".format(acc))
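# The loops above rely on a few project helpers that are not shown. The
# sketches below are hedged reconstructions from how they are called, not
# the repository's actual code: batch_to_cuda is assumed to move every
# tensor in the collated batch to the GPU, and total_vqa_score is assumed to
# apply the standard VQA accuracy, min(#matching human votes / 3, 1), summed
# over the batch.
def batch_to_cuda(batch, volatile=False):
    # Move each tensor in the batch tuple to the GPU; `volatile` is kept
    # only for signature compatibility with the older call sites above.
    return tuple(t.cuda() if torch.is_tensor(t) else t for t in batch)


def total_vqa_score(output, vote_batch):
    # Take the highest-scoring answer per question, then score it with the
    # VQA metric: an answer is worth min(votes / 3, 1) points.
    pred = output.detach().argmax(dim=1)
    votes = vote_batch.gather(1, pred.unsqueeze(1)).squeeze(1)
    return torch.clamp(votes.float() / 3.0, max=1.0).sum().item()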
def trainval(args):
    """ Train a VQA model using the combined training + validation set. """
    # Set the random seed for reproducibility
    torch.manual_seed(1000)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(1000)
    else:
        raise SystemExit('No CUDA available, script requires CUDA.')

    # Load the train+val set for training
    print('Loading data')
    dataset = VQA_Dataset_Test(args.data_dir, args.emb)
    loader = DataLoader(dataset,
                        batch_size=args.bsize,
                        shuffle=True,
                        num_workers=5,
                        collate_fn=collate_fn)
    n_batches = len(dataset) // args.bsize

    # Print data and model parameters
    print('Parameters:\n\t'
          'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d'
          '\n\thidden dim: %d\n\toutput dim: %d' %
          (dataset.q_words, args.emb, dataset.feat_dim, args.hid,
           dataset.n_answers))

    print('Initializing model')
    model = Model(vocab_size=dataset.q_words,
                  emb_dim=args.emb,
                  feat_dim=dataset.feat_dim,
                  hid_dim=args.hid,
                  out_dim=dataset.n_answers,
                  dropout=args.dropout,
                  neighbourhood_size=args.neighbourhood_size,
                  pretrained_wemb=dataset.pretrained_wemb)
    criterion = nn.MultiLabelSoftMarginLoss()

    # Move model and loss to the GPU
    model = model.cuda()
    criterion = criterion.cuda()

    # Define the optimiser
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Continue training from a saved model
    start_ep = 0
    if args.model_path and os.path.isfile(args.model_path):
        print('Resuming from checkpoint %s' % args.model_path)
        ckpt = torch.load(args.model_path)
        start_ep = ckpt['epoch']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        # Override the checkpoint's learning rate with the requested one
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr

    # Learning rate scheduler
    scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5)
    scheduler.last_epoch = start_ep - 1

    # Train iterations
    print('Start training.')
    for ep in range(start_ep, start_ep + args.ep):
        ep_loss = 0.0
        ep_correct = 0.0
        ave_loss = 0.0
        ave_correct = 0.0
        for step, next_batch in tqdm(enumerate(loader)):
            model.train()
            # Move batch to cuda
            q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \
                batch_to_cuda(next_batch)

            # Forward pass
            output = model(q_batch, i_batch, k_batch, qlen_batch)
            loss = criterion(output, a_batch)

            # Compute batch accuracy based on the VQA evaluation metric
            correct = total_vqa_score(output, vote_batch)
            ep_correct += correct
            ep_loss += loss.item()
            ave_correct += correct
            ave_loss += loss.item()

            # 40-step running average
            if step % 40 == 0 and step != 0:
                print('  Epoch %02d(%03d/%03d), ave loss: %.7f, ave accuracy: %.2f%%' %
                      (ep + 1, step, n_batches, ave_loss / 40,
                       ave_correct * 100 / (args.bsize * 40)))
                ave_correct = 0
                ave_loss = 0

            # Compute gradient and do an optimisation step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Step the scheduler once per epoch
        scheduler.step()

        # Save the model and report epoch statistics
        epoch_loss = ep_loss / n_batches
        epoch_acc = ep_correct * 100 / (n_batches * args.bsize)
        save(model, optimizer, ep, epoch_loss, epoch_acc,
             dir=args.save_dir, name=args.name + '_' + str(ep + 1))
        print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' %
              (ep + 1, epoch_loss, epoch_acc))
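# `save` is also project-local; below is a minimal sketch consistent with
# the resume logic above (which reads the 'epoch', 'state_dict' and
# 'optimizer' keys), with the file layout assumed for illustration.
def save(model, optimizer, ep, epoch_loss, epoch_acc, dir='save', name='model'):
    # Persist everything the resume branch expects to find in the checkpoint
    torch.save({
        'epoch': ep + 1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'loss': epoch_loss,
        'accuracy': epoch_acc,
    }, os.path.join(dir, name + '.pth.tar'))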
# Create the network
net = WSDDN(base_net=args.base_net)

# Optionally resume from a saved epoch state
if OFFSET != 0:
    state_path = os.path.join(BASE_DIR, "states", f"epoch_{OFFSET}.pt")
    net.load_state_dict(torch.load(state_path))
    tqdm.write(f"Loaded epoch {OFFSET}'s state.")

net.to(DEVICE)
net.train()

# Set the optimizer and learning rate schedule
optimizer = optim.Adam(net.parameters(), lr=LR, weight_decay=WD)
scheduler = MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)
scheduler.last_epoch = OFFSET

# Train the model
for epoch in tqdm(range(OFFSET + 1, EPOCHS + 1), "Total"):
    epoch_loss = 0.0

    for (
        batch_img_ids,
        batch_imgs,
        batch_boxes,
        batch_scores,
        batch_target,
    ) in tqdm(train_dl, f"Epoch {epoch}"):
        optimizer.zero_grad()
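        # The snippet above ends mid-batch. What follows is a hedged sketch
        # of the rest of a typical WSDDN step, assuming the batch tensors are
        # already on DEVICE, that the network's forward takes (imgs, boxes,
        # scores) and returns per-class probabilities summed over regions,
        # and that batch_target holds multi-hot image-level labels -- the
        # actual repository may differ on all of these.
        combined_scores = net(batch_imgs, batch_boxes, batch_scores)
        # WSDDN trains with binary cross-entropy against image-level labels
        loss = F.binary_cross_entropy(
            combined_scores.clamp(min=1e-6, max=1 - 1e-6),  # numerical safety
            batch_target.float(),
        )
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # One scheduler step per epoch, then report the running loss
    scheduler.step()
    tqdm.write(f"Epoch {epoch} loss: {epoch_loss / len(train_dl):.6f}")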
def train(args): """ Train a VQG model using the training set and validate on val set. """ # Load the VQA training set print('Loading data...') dataset = VQA_Dataset(args.data_dir, args.emb) loader = DataLoader(dataset, batch_size=args.bsize, shuffle=True, num_workers=0, collate_fn=collate_fn) # Load the VQA validation set dataset_test = VQA_Dataset(args.data_dir, args.emb, train=False) loader_val = DataLoader(dataset_test, batch_size=args.bsize, shuffle=False, num_workers=0, collate_fn=collate_fn) n_batches = len(dataset) // args.bsize question_vocab = pickle.load( open('/mnt/data/xiaojinhui/wangtan_MM/vqa-project/data/train_q_dict.p', 'rb')) # Print data and model parameters print('Parameters:\n\t' 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.n_answers, args.emb, dataset.feat_dim, args.hid, dataset.n_answers)) print('Initializing model') model_gcn = conditional_GCN(nfeat=options['gcn']['nfeat'], nhid=options['gcn']['nhid'], nclass=options['gcn']['nclass'], emb=options['gcn']['fliter_emb'], dropout=options['gcn']['dropout']) # model_gcn_nofinding = layer_vqg.conditional_GCN_1(nfeat=options['gcn']['nfeat'], # nhid=options['gcn']['nhid'], # nclass=options['gcn']['nclass'], # emb = options['gcn']['fliter_emb'], # dropout=options['gcn']['dropout']) model_vqg = question_gen(vocab=question_vocab['wtoi'], vocab_i2t=question_vocab['itow'], opt=options['vqg']) # no_finding = layer_vqg.no_finding_area_top(in_feature=2652, hidden_feature=512, dropout=options['gcn']['dropout']) criterion = nn.CrossEntropyLoss() # Move it to GPU model_gcn = model_gcn.cuda() model_vqg = model_vqg.cuda() # model_gcn_nofinding = model_gcn_nofinding.cuda() # no_finding = no_finding.cuda() criterion = criterion.cuda() # Define the optimiser optimizer = torch.optim.Adam([{ 'params': model_gcn.parameters() }, { 'params': model_vqg.parameters() }], lr=args.lr) # Continue training from saved model start_ep = 0 if args.model_path and os.path.isfile(args.model_path): print('Resuming from checkpoint %s' % (args.model_path)) ckpt = torch.load(args.model_path) start_ep = ckpt['epoch'] model_gcn.load_state_dict(ckpt['state_dict_gcn']) model_vqg.load_state_dict(ckpt['state_dict_vqg']) optimizer.load_state_dict(ckpt['optimizer']) # Update the learning rate for param_group in optimizer.param_groups: param_group['lr'] = args.lr # Learning rate scheduler scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5) scheduler.last_epoch = start_ep - 1 # Train iterations print('Start training.') bleu_best = [0.0, 0.0, 0.0, 0.0] cider_best = 0.0 meteor_best = 0.0 rouge_best = 0.0 for ep in range(start_ep, start_ep + args.ep): adjust_learning_rate(optimizer, ep) scheduler.step() ep_loss = 0.0 ep_top3 = 0.0 ep_top1 = 0.0 ave_loss = 0.0 ave_top3 = 0.0 ave_top1 = 0.0 iter_time_all = 0.0 for step, next_batch in enumerate(loader): model_gcn.train() model_vqg.train() # Move batch to cuda target_q, an_feat, img_feat, adj_mat = \ utils.batch_to_cuda(next_batch, volatile=True) # forward pass torch.cuda.synchronize() start = time.time() # img_feat_no = torch.mul(img_feat[:,:,None,:], img_feat[:,None,:,:]).view(-1, 2652) # adj_mat = no_finding(img_feat_no).view(-1,36,36) # adj_mat += torch.eye(36).cuda() # adj_mat = torch.clamp(adj_mat, max=1) feat_gcn, adj_new = model_gcn(img_feat, adj_mat) # feat_gcn, adj_new = model_gcn_nofinding(img_feat, adj_mat) output = model_vqg(feat_gcn, an_feat, target_q) # for i in range(256): # dataset.drawarea[i]['adj'] = 
adj_new[i].detach().cpu().numpy().tolist() # dataset.drawarea[i]['adj_diag'] = np.diag(adj_new[i].detach().cpu().numpy()).tolist() # # json.dump(dataset.drawarea, open('/mnt/data/xiaojinhui/wangtan_MM/vqa-project/draw/new_adj_t.json', 'w')) # output_bs = model_vqg.beam_search(feat_gcn[0].unsqueeze(0), # an_feat[0].unsqueeze(0)) target_q = target_q[:, 1:].contiguous() loss = criterion( output.view(output.size(0) * output.size(1), output.size(2)), target_q.view(target_q.size(0) * target_q.size(1))) # Compute batch accu top1 = utils.accuracy(output, target_q, 1) top3 = utils.accuracy(output, target_q, 3) ep_top1 += top1 ep_top3 += top3 ep_loss += loss.item() ave_top1 += top1 ave_top3 += top3 ave_loss += loss.item() # This is a 40 step average if step % 40 == 0 and step != 0: print( ' Epoch %02d(%03d/%03d), ave loss: %.7f, top1: %.2f%%, top3: %.2f%%, iter time: %.4fs' % (ep + 1, step, n_batches, ave_loss / 40, ave_top1 / 40, ave_top3 / 40, iter_time_all / 40)) ave_top1 = 0 ave_top3 = 0 ave_loss = 0 iter_time_all = 0 # Compute gradient and do optimisation step optimizer.zero_grad() loss.backward() # clip_grad_norm_(model_gcn.parameters(), 2.) # clip_grad_norm_(model_gcn.parameters(), 2.) # clip_grad_norm_(no_finding.parameters(), 2.) optimizer.step() end = time.time() iter_time = end - start iter_time_all += iter_time # save model and compute validation accuracy every 400 steps if step == 0: with torch.no_grad(): epoch_loss = ep_loss / n_batches epoch_top1 = ep_top1 / n_batches epoch_top3 = ep_top3 / n_batches # compute validation accuracy over a small subset of the validation set model_gcn.train(False) model_vqg.train(False) model_gcn.eval() model_vqg.eval() output_all = [] output_all_bs = {} ref_all = [] flag_val = 0 for valstep, val_batch in tqdm(enumerate(loader_val)): # test_batch = next(loader_test) target_q, an_feat, img_feat, adj_mat = \ utils.batch_to_cuda(val_batch, volatile=True) # img_feat_no = torch.mul(img_feat[:, :, None, :], img_feat[:, None, :, :]).view(-1, 2652) # adj_mat = no_finding(img_feat_no).view(-1, 36, 36) # adj_mat += torch.eye(36).cuda() # adj_mat = torch.clamp(adj_mat, max=1) # feat_gcn, _ = model_gcn_nofinding(img_feat, adj_mat) feat_gcn, adj_new = model_gcn(img_feat, adj_mat) output = model_vqg.generate(feat_gcn, an_feat) for j in range(feat_gcn.size(0)): output_bs = model_vqg.beam_search( feat_gcn[j].unsqueeze(0), an_feat[j].unsqueeze(0)) output_all_bs[flag_val] = output_bs flag_val += 1 output_all.append(output.cpu().numpy()) ref_all.append(target_q[:, :-1].cpu().numpy()) gen, ref = utils.idx2question( np.concatenate(output_all, 0), np.concatenate(ref_all, 0), question_vocab['itow']) print(gen.values()[:10]) # save the best bleu, cider, meteor, rouge = main.main(ref, gen) bleu_best, cider_best, meteor_best, rouge_best, choice = utils.save_the_best( bleu, cider, meteor, rouge, bleu_best, cider_best, meteor_best, rouge_best) if choice: utils.save(model_gcn, model_vqg, optimizer, ep, epoch_loss, epoch_top1, dir=args.save_dir, name=args.name + '_' + str(ep + 1)) print('use beam search...') bleu, cider, meteor, rouge = main.main(ref, output_all_bs) bleu_best, cider_best, meteor_best, rouge_best, choice = utils.save_the_best( bleu, cider, meteor, rouge, bleu_best, cider_best, meteor_best, rouge_best) if choice: utils.save(model_gcn, model_vqg, optimizer, ep, epoch_loss, epoch_top1, dir=args.save_dir, name=args.name + '_' + str(ep + 1)) print( 'the best bleu: %s, cider: %.6s, meteor: %.6s, rouge: %.6s' % (bleu_best, cider_best, meteor_best, rouge_best)) 
print(output_all_bs.values()[:10]) model_gcn.train(True) model_vqg.train(True)
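# utils.accuracy above is not shown either; the helper below is a plausible
# top-k token accuracy for the VQG loop, assuming output holds
# [batch, seq, vocab] logits and target_q holds [batch, seq] token indices.
# It is hypothetical, for illustration only.
def topk_token_accuracy(output, target, k):
    # Count target tokens that appear among the k highest-scoring predictions
    _, topk = output.topk(k, dim=-1)                   # [batch, seq, k]
    hits = (topk == target.unsqueeze(-1)).any(dim=-1)  # [batch, seq]
    return 100.0 * hits.float().mean().item()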
def train(model, train_loader, eval_loader, opt):
    utils.create_dir(opt.output)
    optim = torch.optim.Adam(model.parameters(),
                             lr=opt.learning_rate,
                             betas=(0.9, 0.999),
                             eps=1e-08,
                             weight_decay=opt.weight_decay)
    logger = utils.Logger(os.path.join(opt.output, 'log.txt'))
    utils.print_model(model, logger)

    for param_group in optim.param_groups:
        param_group['lr'] = opt.learning_rate

    scheduler = MultiStepLR(optim, milestones=[100], gamma=0.8)
    scheduler.last_epoch = opt.s_epoch

    best_eval_score = 0
    for epoch in range(opt.s_epoch, opt.num_epochs):
        total_loss = 0
        total_norm = 0
        count_norm = 0
        train_score = 0
        t = time.time()
        N = len(train_loader.dataset)

        for i, (v, b, a, _, qa_text, _, _, q_t, bias) in enumerate(train_loader):
            v = v.cuda()
            b = b.cuda()
            a = a.cuda()
            bias = bias.cuda()
            qa_text = qa_text.cuda()

            # Shuffle the candidate answers so their order carries no signal
            rand_index = random.sample(range(0, opt.train_candi_ans_num),
                                       opt.train_candi_ans_num)
            qa_text = qa_text[:, rand_index, :]
            a = a[:, rand_index]
            bias = bias[:, rand_index]

            if opt.lp == 0:
                # Plain multi-label BCE loss
                logits = model(qa_text, v, b, epoch, 'train')
                loss = instance_bce_with_logits(logits, a, reduction='mean')
            elif opt.lp == 1:
                # BCE plus a self-supervised loss on shuffled (negative) images
                logits = model(qa_text, v, b, epoch, 'train')
                loss_pos = instance_bce_with_logits(logits, a, reduction='mean')
                index = random.sample(range(0, v.shape[0]), v.shape[0])
                v_neg = v[index]
                b_neg = b[index]
                logits_neg = model(qa_text, v_neg, b_neg, epoch, 'train')
                self_loss = compute_self_loss(logits_neg, a)
                loss = loss_pos + opt.self_loss_weight * self_loss
            elif opt.lp == 2:
                # The model computes its own (LMH-debiased) loss
                logits, loss = model(qa_text, v, b, epoch, 'train', bias, a)
            else:
                raise ValueError('unknown loss option: opt.lp = %d' % opt.lp)

            loss.backward()
            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   opt.grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            score = compute_score_with_logits(logits, a.data).sum()
            train_score += score.item()
            total_loss += loss.item() * v.size(0)

            if i != 0 and i % 100 == 0:
                print('training: %d/%d, train_loss: %.6f, train_acc: %.6f' %
                      (i, len(train_loader), total_loss / (i * v.size(0)),
                       100 * train_score / (i * v.size(0))))

        scheduler.step()
        total_loss /= N

        if eval_loader is not None:
            model.train(False)
            eval_score, bound = evaluate(model, eval_loader, opt)
            model.train(True)

        logger.write('\nlr: %.7f' % optim.param_groups[0]['lr'])
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))

        # Keep the checkpoint with the best validation score
        if eval_loader is not None and eval_score > best_eval_score:
            if opt.lp == 0:
                model_path = os.path.join(
                    opt.output,
                    'SAR_top' + str(opt.train_candi_ans_num) + '_best_model.pth')
            elif opt.lp == 1:
                model_path = os.path.join(
                    opt.output,
                    'SAR_SSL_top' + str(opt.train_candi_ans_num) + '_best_model.pth')
            elif opt.lp == 2:
                model_path = os.path.join(
                    opt.output,
                    'SAR_LMH_top' + str(opt.train_candi_ans_num) + '_best_model.pth')
            utils.save_model(model_path, model, epoch, optim)
            best_eval_score = eval_score
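# instance_bce_with_logits and compute_score_with_logits come from the
# bottom-up-attention VQA codebase family; the forms below are the common
# ones, not necessarily this repository's exact code.
def instance_bce_with_logits(logits, labels, reduction='mean'):
    # BCE over soft answer scores, rescaled by the number of answer classes
    # so the magnitude matches a per-instance sum
    loss = F.binary_cross_entropy_with_logits(logits, labels,
                                              reduction=reduction)
    if reduction == 'mean':
        loss = loss * labels.size(1)
    return loss


def compute_score_with_logits(logits, labels):
    # Soft VQA score of the argmax answer: one-hot the prediction and read
    # off its target score
    pred = torch.max(logits, 1)[1].view(-1, 1)
    one_hots = torch.zeros_like(labels)
    one_hots.scatter_(1, pred, 1)
    return one_hots * labels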
def train(model, train_loader, eval_loader, opt):
    utils.create_dir(opt.output)
    optim = torch.optim.Adam(model.parameters(),
                             lr=opt.learning_rate,
                             betas=(0.9, 0.999),
                             eps=1e-08,
                             weight_decay=opt.weight_decay)
    logger = utils.Logger(os.path.join(opt.output, 'log.txt'))
    utils.print_model(model, logger)

    # Load a snapshot if one was given
    if opt.checkpoint_path is not None:
        print('loading %s' % opt.checkpoint_path)
        model_data = torch.load(opt.checkpoint_path)
        model.load_state_dict(model_data.get('model_state', model_data))
        optim.load_state_dict(model_data.get('optimizer_state', model_data))
        opt.s_epoch = model_data['epoch'] + 1

    for param_group in optim.param_groups:
        param_group['lr'] = opt.learning_rate

    scheduler = MultiStepLR(optim, milestones=[10, 15, 20, 25, 30, 35],
                            gamma=0.5)
    scheduler.last_epoch = opt.s_epoch

    best_eval_score = 0
    for epoch in range(opt.s_epoch, opt.num_epochs):
        total_loss = 0
        total_bce_loss = 0
        self_loss = 0
        total_self_loss = 0
        train_score_pos = 0
        train_score_neg = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)

        for i, (v, b, q, a, _) in enumerate(train_loader):
            v = v.cuda()
            q = q.cuda()
            a = a.cuda()

            if epoch < opt.pretrain_epoches:
                # Pretraining: labeled samples only
                logits_pos, _ = model(q, v, False)
                if opt.ml_loss:
                    # Multi-label loss
                    bce_loss_pos = instance_bce_with_logits(logits_pos, a,
                                                            reduction='mean')
                else:
                    # Cross-entropy loss
                    bce_loss_pos = instance_bce(logits_pos, a)
                loss = bce_loss_pos
            else:
                # Fine-tuning: add the self-supervised loss on negative pairs
                logits_pos, logits_neg, _, _ = model(q, v, True)
                if opt.ml_loss:
                    bce_loss_pos = instance_bce_with_logits(logits_pos, a,
                                                            reduction='mean')
                else:
                    bce_loss_pos = instance_bce(logits_pos, a)
                self_loss = compute_self_loss(logits_neg, a)
                loss = bce_loss_pos + opt.self_loss_weight * self_loss

            loss.backward()
            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   opt.grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            score_pos = compute_score_with_logits(logits_pos, a.data).sum()
            train_score_pos += score_pos.item()
            total_loss += loss.item() * v.size(0)
            total_bce_loss += bce_loss_pos.item() * v.size(0)

            if epoch < opt.pretrain_epoches:
                # Pretraining: no negative statistics to accumulate
                total_self_loss = 0
                train_score_neg = 0
            else:
                # Fine-tuning
                score_neg = compute_score_with_logits(logits_neg, a.data).sum()
                total_self_loss += self_loss.item() * v.size(0)
                train_score_neg += score_neg.item()

            if i != 0 and i % 100 == 0:
                print('training: %d/%d, train_loss: %.6f, bce_loss: %.6f, '
                      'self_loss: %.6f, neg_train_acc: %.6f, pos_train_acc: %.6f' %
                      (i, len(train_loader), total_loss / (i * v.size(0)),
                       total_bce_loss / (i * v.size(0)),
                       total_self_loss / (i * v.size(0)),
                       100 * train_score_neg / (i * v.size(0)),
                       100 * train_score_pos / (i * v.size(0))))

        scheduler.step()
        total_loss /= N
        total_bce_loss /= N
        total_self_loss /= N
        train_score_pos = 100 * train_score_pos / N

        if eval_loader is not None:
            model.train(False)
            eval_score, bound, entropy = evaluate(model, eval_loader)
            model.train(True)

        logger.write('\nlr: %.7f' % optim.param_groups[0]['lr'])
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score_pos))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))
            if entropy is not None:
                logger.write('\tentropy: %.2f' % entropy)

        # Keep the checkpoint with the best validation score
        if eval_loader is not None and eval_score > best_eval_score:
            model_path = os.path.join(opt.output, 'best_model.pth')
            utils.save_model(model_path, model, epoch, optim)
            best_eval_score = eval_score
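# compute_self_loss implements the self-supervised term used by both loops
# above. The sketch below follows the published SSL-VQA formulation --
# penalise confidence on the ground-truth answer when the image has been
# swapped -- but the exact repository code may differ.
def compute_self_loss(logits_neg, a):
    # Probability mass the model puts on the ground-truth answer for the
    # shuffled (question, wrong image) pair; driving it down teaches the
    # model that the answer is not predictable from the question alone.
    _, top_ans_ind = torch.topk(F.softmax(a, dim=-1), k=1, dim=-1)
    neg_top_k = torch.gather(F.softmax(logits_neg, dim=-1), 1,
                             top_ans_ind).sum(1)
    return neg_top_k.mean()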
def train(args): """ Train a VQA model using the training set """ # set random seed torch.manual_seed(1000) if torch.cuda.is_available(): torch.cuda.manual_seed(1000) else: raise SystemExit('No CUDA available, script requires cuda') # Load the VQA training set print('Loading data') dataset = VQA_Dataset(args.data_dir, args.emb) loader = DataLoader(dataset, batch_size=args.bsize, shuffle=True, num_workers=5, collate_fn=collate_fn) # Load the VQA validation set dataset_test = VQA_Dataset(args.data_dir, args.emb, train=False) test_sampler = RandomSampler(dataset_test) loader_test = iter( DataLoader(dataset_test, batch_size=args.bsize, sampler=test_sampler, shuffle=False, num_workers=4, collate_fn=collate_fn)) n_batches = len(dataset) // args.bsize # Print data and model parameters print('Parameters:\n\t' 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, dataset.feat_dim, args.hid, dataset.n_answers)) print('Initializing model') model = Model(vocab_size=dataset.q_words, emb_dim=args.emb, feat_dim=dataset.feat_dim, hid_dim=args.hid, out_dim=dataset.n_answers, dropout=args.dropout, neighbourhood_size=args.neighbourhood_size, pretrained_wemb=dataset.pretrained_wemb) criterion = nn.MultiLabelSoftMarginLoss() # Move it to GPU model = model.cuda() criterion = criterion.cuda() # Define the optimiser optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # Continue training from saved model start_ep = 0 if args.model_path and os.path.isfile(args.model_path): print('Resuming from checkpoint %s' % (args.model_path)) ckpt = torch.load(args.model_path) start_ep = ckpt['epoch'] model.load_state_dict(ckpt['state_dict']) optimizer.load_state_dict(ckpt['optimizer']) # Update the learning rate for param_group in optimizer.param_groups: param_group['lr'] = args.lr # Learning rate scheduler scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5) scheduler.last_epoch = start_ep - 1 # Split incoming data across gpus net = nn.DataParallel(model.train(True)) # Train iterations print('Start training.') for ep in range(start_ep, start_ep + args.ep): scheduler.step() ep_loss = 0.0 ep_correct = 0.0 ave_loss = 0.0 ave_correct = 0.0 losses = [] for step, next_batch in tqdm(enumerate(loader)): model.train() # Move batch to cuda q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ batch_to_cuda(next_batch) # forward pass output, adjacency_matrix = net(q_batch, i_batch, k_batch, qlen_batch) loss = criterion(output, a_batch) # Compute batch accuracy based on vqa evaluation correct = total_vqa_score(output, vote_batch) ep_correct += correct ep_loss += loss.data[0] ave_correct += correct ave_loss += loss.data[0] losses.append(loss.cpu().data[0]) # This is a 40 step average if step % 40 == 0 and step != 0: print( ' Epoch %02d(%03d/%03d), ave loss: %.7f, ave accuracy: %.2f%%' % (ep + 1, step, n_batches, ave_loss / 40, ave_correct * 100 / (args.bsize * 40))) ave_correct = 0 ave_loss = 0 # Compute gradient and do optimisation step optimizer.zero_grad() loss.backward() optimizer.step() # save model and compute validation accuracy every 400 steps if step % 400 == 0: epoch_loss = ep_loss / n_batches epoch_acc = ep_correct * 100 / (n_batches * args.bsize) save(model, optimizer, ep, epoch_loss, epoch_acc, dir=args.save_dir, name=args.name + '_' + str(ep + 1)) # compute validation accuracy over a small subset of the validation set test_correct = 0 net = nn.DataParallel(model.train(False)) for i in range(10): test_batch = next(loader_test) 
q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ batch_to_cuda(test_batch, volatile=True) output, _ = net(q_batch, i_batch, k_batch, qlen_batch) test_correct += total_vqa_score(output, vote_batch) net = nn.DataParallel(model.train(True)) acc = test_correct / (10 * args.bsize) * 100 print("Validation accuracy: {:.2f} %".format(acc)) # save model and compute accuracy for epoch epoch_loss = ep_loss / n_batches epoch_acc = ep_correct * 100 / (n_batches * args.bsize) save(model, optimizer, ep, epoch_loss, epoch_acc, dir=args.save_dir, name=args.name + '_' + str(ep + 1)) print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % (ep + 1, epoch_loss, epoch_acc))
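# A hypothetical entry point wiring up the arguments the train() functions
# above actually read (data_dir, emb, bsize, hid, dropout,
# neighbourhood_size, lr, ep, model_path, save_dir, name); the defaults are
# illustrative, not the repositories' own.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a VQA model')
    parser.add_argument('--data_dir', type=str, default='data')
    parser.add_argument('--emb', type=int, default=300, help='word embedding dim')
    parser.add_argument('--bsize', type=int, default=256, help='batch size')
    parser.add_argument('--hid', type=int, default=1024, help='hidden dim')
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--neighbourhood_size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--ep', type=int, default=20, help='epochs to train')
    parser.add_argument('--model_path', type=str, default='',
                        help='checkpoint to resume from')
    parser.add_argument('--save_dir', type=str, default='save')
    parser.add_argument('--name', type=str, default='model')
    train(parser.parse_args())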