def _make_dataset(opt, split):
    """Build the dataset for *split* ("train", "val" or "trainval").

    Picks RCNN_Dataset when opt.use_rcnn is set, otherwise the plain
    image Dataset (which additionally needs img_name/size_scale/use_h5py).
    """
    if opt.use_rcnn:
        return RCNN_Dataset(opt.data_path, opt.data_name, split, opt.seq_per_img)
    return Dataset(opt.data_path, opt.data_name, split, opt.seq_per_img,
                   opt.img_name, opt.size_scale, use_h5py=opt.use_h5py)


def _make_loader(opt, dataset):
    """Wrap *dataset* in a DataLoader configured entirely from opt."""
    return DataLoader(dataset, batch_size=opt.batch_size, shuffle=opt.shuffle,
                      num_workers=opt.num_workers, pin_memory=opt.pin_memory,
                      drop_last=opt.drop_last, use_thread=opt.use_thread)


def main(opt):
    """Entry point: build data loaders, model, optimizer, then train.

    opt is the parsed options namespace (attribute access). Side effects:
    seeds RNGs, moves the model to opt.gpus, and launches trainModel,
    printing progress along the way.
    """
    Initializer.manual_seed(opt.seed)

    print("Constructing the dataset...")
    if opt.trainval == 0:
        # Separate train/val splits.
        trainset = _make_dataset(opt, "train")
        trainLoader = _make_loader(opt, trainset)
        valset = _make_dataset(opt, "val")
        valLoader = _make_loader(opt, valset)
    else:
        # Train on train+val combined; no held-out validation set.
        trainset = _make_dataset(opt, "trainval")
        trainLoader = _make_loader(opt, trainset)
        valset = None
        valLoader = None

    idx2word = trainset.idx2word
    ans_pool = trainset.ans_pool
    ans_pool = torch.from_numpy(ans_pool)

    print("Building model...")
    # Word embeddings are kept outside the model so DataParallel does not
    # replicate them; they are attached to the model after wrapping (below).
    word_embedded = LargeEmbedding(len(idx2word), 300, padding_idx=0,
                                   devices=opt.gpus)
    word_embedded.load_pretrained_vectors(opt.word_vectors)

    if opt.predict_type in ["sum_attn", "cat_attn", "prod_attn"]:
        # Attention-based prediction heads only need the answer count.
        num_ans = ans_pool.size(0)
        model = DCN(opt, num_ans) if not opt.use_rcnn else DCNWithRCNN(opt, num_ans)
    else:
        # Answer-embedding heads get the embedded answer pool (detached via
        # .data; volatile=True since no gradients flow through this lookup)
        # plus a mask of the non-padding answer tokens.
        ans = word_embedded(Variable(ans_pool.cuda(opt.gpus[0]),
                                     volatile=True)).data
        ans_mask = ans_pool.ne(0).float()
        model = DCNWithAns(opt, ans, ans_mask) if not opt.use_rcnn else \
            DCNWithRCNNAns(opt, ans, ans_mask)

    criterion = BinaryLoss()
    evaluation = Accuracy()

    dict_checkpoint = opt.train_from
    if dict_checkpoint:
        print("Loading model from checkpoint at %s" % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        model.load_state_dict(checkpoint["model"])

    if len(opt.gpus) >= 1:
        model.cuda(opt.gpus[0])
    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, opt.gpus, dim=0)
    model.word_embedded = word_embedded

    # Only parameters with requires_grad are optimized (frozen backbones and
    # the answer buffers are excluded).
    optimizer = Adam(
        list(filter(lambda x: x.requires_grad, model.parameters())),
        lr=opt.lr, weight_decay=opt.weight_decay, record_step=opt.record_step)
    scheduler = lr_scheduler.StepLR(optimizer, opt.step_size, gamma=opt.gamma)
    optim_wrapper = OptimWrapper(optimizer, scheduler)

    # Count trainable parameters, excluding the (frozen) resnet backbone,
    # the word embeddings, and the answer buffers.
    nparams = []
    named_parameters = model.module.named_parameters() if len(
        opt.gpus) > 1 else model.named_parameters()
    for name, param in named_parameters:
        if not (name.startswith("resnet") or name.startswith("word_embedded")
                or name.startswith("ans")):
            nparams.append(param.numel())
    print("* Number of parameters: %d" % sum(nparams))

    # Drop the checkpoint reference so its tensors can be freed before training.
    checkpoint = None

    timer = Timer()
    timer.tic()
    try:
        with torch.cuda.device(opt.gpus[0]):
            trainModel(trainLoader, valLoader, model, criterion, evaluation,
                       optim_wrapper, opt)
    except KeyboardInterrupt:
        print("It took %.2f hours to train the network" % (timer.toc() / 3600))
        sys.exit("Training interrupted")
    print("It took %.2f hours to train the network" % (timer.toc() / 3600))
def trainEpoch(epoch, dataloader, model, criterion, evaluation, optim, opt, writer):
    """Run one training epoch and return (mean loss, mean accuracy).

    Logs per-iteration statistics to *writer* (a TensorBoard-style
    SummaryWriter) and console; *optim* is the OptimWrapper built in main.
    Three Meter pairs track loss/accuracy at three horizons: [0] reset
    every 10 iters (TB "iter/" scalars), [1] reset every log_interval
    iters (console), [2] accumulated over the whole epoch ("epoch/" scalars
    and the return value).
    """
    model.train()
    loss_record = [Meter() for _ in range(3)]
    accuracy_record = [Meter() for _ in range(3)]
    timer = Timer()
    timer.tic()
    # Advance the LR scheduler once per epoch.
    optim.step_epoch()
    for i, batch in enumerate(dataloader):
        # RCNN batches carry an extra image mask; the `_` slot is an unused field.
        if not opt.use_rcnn:
            img, ques, ques_mask, _, ans_idx = batch
        else:
            img, ques, img_mask, ques_mask, _, ans_idx = batch
        # Legacy (pre-0.4) autograd: non-RCNN image features are wrapped
        # volatile — presumably the CNN backbone is frozen and the model
        # re-wraps its output before the trainable layers; TODO confirm.
        img = Variable(img) if opt.use_rcnn else Variable(img, volatile=True)
        img_mask = Variable(img_mask) if opt.use_rcnn else None
        ques = Variable(ques, volatile=True)
        ques_mask = Variable(ques_mask)
        ans_idx = Variable(ans_idx)
        img, img_mask, ques, ques_mask, ans_idx = \
            move_to_cuda((img, img_mask, ques, ques_mask, ans_idx),
                         devices=opt.gpus)
        # Embed the question, then re-wrap .data in a fresh Variable: this
        # detaches the embedding lookup so no gradients reach word_embedded.
        ques = model.word_embedded(ques)
        ques = Variable(ques.data)
        optim.zero_grad()
        score = model(img, ques, img_mask, ques_mask) if opt.use_rcnn else \
            model(img, ques, img_mask, ques_mask, is_train=True)
        loss = criterion(score, ans_idx)
        loss.backward()
        # Evaluate on detached, volatile copies so accuracy never builds a graph.
        accuracy = evaluation(Variable(score.data, volatile=True),
                              Variable(ans_idx.data, volatile=True))
        # Custom Adam: returns update/parameter statistics every record_step.
        _, ratio, updates, params = optim.step()
        for j in range(3):
            loss_record[j].update((loss.data[0] / opt.batch_size))
            accuracy_record[j].update(accuracy.data[0])
        if ratio is not None:
            writer.add_scalar("statistics/update_to_param_ratio", ratio,
                              global_step=(epoch * len(dataloader) + i))
            writer.add_scalar("statistics/absolute_updates", updates,
                              global_step=(epoch * len(dataloader) + i))
            writer.add_scalar("statistics/absolute_params", params,
                              global_step=(epoch * len(dataloader) + i))
        # Short-horizon TB logging every 10 iterations.
        if (i + 1) % 10 == 0:
            writer.add_scalar("iter/train_loss", loss_record[0].avg,
                              global_step=(epoch * len(dataloader) + i))
            writer.add_scalar("iter/train_accuracy", accuracy_record[0].avg,
                              global_step=(epoch * len(dataloader) + i))
            loss_record[0].reset()
            accuracy_record[0].reset()
        # Console logging every opt.log_interval iterations.
        if (i + 1) % opt.log_interval == 0:
            print(
                "Epoch %5d; iter %6i; loss: %8.2f; accuracy: %8.2f; %6.0fs elapsed"
                % (epoch, i + 1, loss_record[1].avg,
                   accuracy_record[1].avg, timer.toc(average=False)))
            loss_record[1].reset()
            accuracy_record[1].reset()
            timer.tic()
    # Epoch-level summaries from the never-reset third meters.
    writer.add_scalar("epoch/train_loss", loss_record[2].avg, global_step=epoch)
    writer.add_scalar("epoch/train_accuracy", accuracy_record[2].avg,
                      global_step=epoch)
    return loss_record[2].avg, accuracy_record[2].avg
lr=opt['lr'], weight_decay=opt['weight_decay'], record_step=opt['record_step']) scheduler = lr_scheduler.StepLR(optimizer, opt['step_size'], gamma=opt['gamma']) optim_wrapper = OptimWrapper(optimizer, scheduler) nparams = [] named_parameters = model.module.named_parameters() if len( opt['gpus']) > 1 else model.named_parameters() for name, param in named_parameters: if not (name.startswith("resnet") or name.startswith("word_embedded") or name.startswith("ans")): nparams.append(param.numel()) print("* Number of parameters: %d" % sum(nparams)) checkpoint = None timer = Timer() timer.tic() try: with torch.cuda.device(opt['gpus'][0]): print('Training model....') trainModel(trainLoader, valLoader, model, criterion, evaluation, optim_wrapper, opt) except KeyboardInterrupt: print("It took %.2f hours to train the network" % (timer.toc() / 3600)) sys.exit("Training interrupted") print("It toke %.2f hours to train the network" % (timer.toc() / 3600))