def training_loop_lars(model, criterion, train_loader, valid_loader, epochs, device, print_every=1):
    '''
    Function defining the entire training loop
    '''
    # set objects for storing metrics
    train_losses = []
    valid_losses = []
    train_accuracy = []
    valid_accuracy = []

    # Train model
    for epoch in range(0, epochs):
        # training
        if epoch < 10:
            optimizer = LARS(model.parameters(), lr=0.1 * (epoch + 1) / 10, momentum=0.9)
        elif epoch < 15:
            optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
        else:
            optimizer = LARS(model.parameters(), lr=0.1 * (0.95 ** (epoch - 15)), momentum=0.9)
        model, optimizer, train_loss = train(train_loader, model, criterion, optimizer, device)
        train_losses.append(train_loss)

        # validation
        with torch.no_grad():
            model, valid_loss = validate(valid_loader, model, criterion, device)
            valid_losses.append(valid_loss)

        if epoch % print_every == (print_every - 1):
            train_acc = get_accuracy(model, train_loader, device=device)
            valid_acc = get_accuracy(model, valid_loader, device=device)
            train_accuracy.append(float(train_acc))
            valid_accuracy.append(float(valid_acc))
            print(f'{datetime.now().time().replace(microsecond=0)} --- '
                  f'Epoch: {epoch}\t'
                  f'Train loss: {train_loss:.4f}\t'
                  f'Valid loss: {valid_loss:.4f}\t'
                  f'Train accuracy: {100 * train_acc:.2f}\t'
                  f'Valid accuracy: {100 * valid_acc:.2f}\n')

    # plot_losses(train_losses, valid_losses)
    return model, [train_losses, valid_losses, train_accuracy, valid_accuracy]
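# Note on the block above: rebuilding the LARS optimizer every epoch is a simple way to
# change the learning rate, but it also discards the momentum buffers each time.
# A minimal alternative sketch (not from the original code): keep one optimizer and express
# the same 10-epoch linear warmup, 5-epoch hold, and 0.95 exponential decay as a LambdaLR
# multiplier on a base lr of 0.1. Demonstrated with SGD on a dummy parameter; a LARS class
# exposing the standard torch.optim.Optimizer interface could be dropped in the same way.
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR


def warmup_hold_decay(epoch):
    """Multiplicative lr factor reproducing the schedule above (base lr = 0.1)."""
    if epoch < 10:
        return (epoch + 1) / 10      # linear warmup over the first 10 epochs
    if epoch < 15:
        return 1.0                   # hold at the base lr
    return 0.95 ** (epoch - 15)      # exponential decay afterwards


param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=0.1, momentum=0.9)
scheduler = LambdaLR(optimizer, lr_lambda=warmup_hold_decay)

for epoch in range(20):
    # ... one epoch of training with `optimizer` goes here ...
    optimizer.step()
    scheduler.step()  # sets the lr for the next epoch; epoch 0 starts at 0.1 * 1/10 = 0.01
    print(epoch, optimizer.param_groups[0]['lr'])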
def load_model(args):
    model = SimCLR(backbone=args.backbone,
                   projection_dim=args.projection_dim,
                   pretrained=args.pretrained,
                   normalize=args.normalize)
    if args.inference:
        model.load_state_dict(
            torch.load("SimCLR_{}_epoch90.pth".format(args.backbone)))
    model = model.to(args.device)

    scheduler = None
    if args.optimizer == "Adam":
        optimizer = Adam(model.parameters(), lr=3e-4)  # TODO: LARS
    elif args.optimizer == "LARS":
        # optimized using LARS with linear learning rate scaling
        # (i.e. LearningRate = 0.3 * BatchSize / 256) and weight decay of 1e-6
        learning_rate = 0.3 * args.batch_size / 256
        optimizer = LARS(
            model.parameters(),
            lr=learning_rate,
            weight_decay=args.weight_decay,
            exclude_from_weight_decay=["batch_normalization", "bias"],
        )
        # "decay the learning rate with the cosine decay schedule without restarts"
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               args.epochs,
                                                               eta_min=0,
                                                               last_epoch=-1)
    else:
        raise NotImplementedError
    return model, optimizer, scheduler
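# For reference, the linear scaling rule quoted above (LearningRate = 0.3 * BatchSize / 256)
# evaluated at a few batch sizes; the numbers follow directly from the formula.
for batch_size in (256, 512, 1024, 4096):
    print(batch_size, 0.3 * batch_size / 256)
# 256 -> 0.3, 512 -> 0.6, 1024 -> 1.2, 4096 -> 4.8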
def build_byol_optimizer(hparams: AttrDict, model: nn.Module) -> Optimizer:
    """Build optimizer for BYOL self-supervised network, including backbone."""
    regular_parameters = []
    excluded_parameters = []
    for name, parameter in model.named_parameters():
        if parameter.requires_grad is False:
            continue
        if any(x in name for x in [".bn", ".bias"]):
            excluded_parameters.append(parameter)
        else:
            regular_parameters.append(parameter)

    param_groups = [
        {"params": regular_parameters, "use_lars": True},
        {"params": excluded_parameters, "use_lars": False, "weight_decay": 0},
    ]
    return LARS(
        param_groups,
        lr=hparams.self_supervised.learning_rate.base,
        eta=hparams.self_supervised.lars_eta,
        momentum=hparams.self_supervised.momentum,
        weight_decay=hparams.self_supervised.weight_decay,
    )
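# A quick, self-contained check of the name filter used above: parameter names containing
# ".bn" or ".bias" land in the group that skips LARS adaptation and weight decay. ToyBlock
# and ToyNet are made-up modules for illustration only; note the filter relies on BatchNorm
# layers living inside a submodule (a top-level attribute named "bn" would produce the
# name "bn.weight", which does not contain ".bn").
import torch.nn as nn


class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 3)
        self.bn1 = nn.BatchNorm2d(8)


class ToyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block = ToyBlock()
        self.fc = nn.Linear(8, 2)


regular, excluded = [], []
for name, p in ToyNet().named_parameters():
    (excluded if any(x in name for x in [".bn", ".bias"]) else regular).append(name)
print("LARS + weight decay:", regular)   # ['block.conv.weight', 'fc.weight']
print("excluded:", excluded)             # ['block.conv.bias', 'block.bn1.weight', 'block.bn1.bias', 'fc.bias']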
def configure_optimizers(self):
    # keep only the parameters that require grad (i.e. the weights being trained)
    params = filter(lambda p: p.requires_grad, self.model.parameters())
    if self.hparams.optimizer == "SGD":
        self.optimizer = torch.optim.SGD(params,
                                         self.hparams.lr,
                                         momentum=self.hparams.momentum,
                                         weight_decay=self.hparams.wd)
    elif self.hparams.optimizer == "LARS":
        self.optimizer = LARS(params,
                              lr=self.hparams.lr,
                              momentum=self.hparams.momentum,
                              weight_decay=self.hparams.wd,
                              max_epoch=self.hparams.epochs)
    self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer,
                                                         self.hparams.lr,
                                                         epochs=self.hparams.epochs,
                                                         steps_per_epoch=1,
                                                         pct_start=self.hparams.pct_start)
    sched_dict = {'scheduler': self.scheduler}
    return [self.optimizer], [sched_dict]
def do_training(args):
    trainloader, testloader = build_dataset(args.dataset,
                                            dataroot=args.dataroot,
                                            batch_size=args.batch_size,
                                            eval_batch_size=args.eval_batch_size,
                                            num_workers=2)
    model = build_model(args.arch, num_classes=num_classes(args.dataset))
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    num_chunks = max(1, args.batch_size // args.max_samples_per_gpu)

    optimizer = LARS(params=model.parameters(),
                     lr=args.lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay,
                     eta=args.eta,
                     max_epoch=args.epochs)
    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        train_loss, train_acc = train(trainloader, model, criterion, optimizer,
                                      epoch, args.cuda, num_chunks=num_chunks)
        test_loss, test_acc = test(testloader, model, criterion, epoch, args.cuda)
        track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f '
                    '| test loss %.3f | test acc %.3f' %
                    (epoch, train_loss, train_acc, test_loss, test_acc))
        # Save model
        model_fname = os.path.join(track.trial_dir(), "model{}.ckpt".format(epoch))
        torch.save(model, model_fname)
        if test_acc > best_acc:
            best_acc = test_acc
            best_fname = os.path.join(track.trial_dir(), "best.ckpt")
            track.debug("New best score! Saving model")
            torch.save(model, best_fname)
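# The eta argument above is the LARS trust coefficient. Following the LARS paper, each
# layer gets a local learning rate eta * ||w|| / (||grad|| + weight_decay * ||w||), which
# the (possibly decayed) global lr then multiplies. A tiny numeric illustration of that
# ratio, independent of any particular LARS implementation:
import torch

w = torch.randn(256, 128)            # pretend layer weights
g = 0.01 * torch.randn(256, 128)     # pretend gradient for that layer
eta, weight_decay = 1e-3, 5e-4
local_lr = eta * w.norm() / (g.norm() + weight_decay * w.norm())
print(float(local_lr))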
class ApexDistributeModel(object):

    def __init__(self, model, criterion, optimizer, args, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU id should be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # convert BatchNorm layers to synchronized BatchNorm
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign a specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # initialize model and optimizer with apex amp
        self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
                                                         opt_level=self.opt_level)
        # apex parallel (use exactly one DistributedDataParallel wrapper)
        self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # self.model = apex.parallel.DistributedDataParallel(self.model)
        # self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[self.gpu])
        return self.model, self.criterion, self.optimizer

    def lars(self):
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """
        Must be called after convert().
        :param epoch:
        :param train_loader:
        :return:
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        self.model.train()
        train_log_file = 'train_log'
        end = time.time()
        epoch_batch = epoch * (len(train_loader.dataset) / self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            if train_index != -1 and i < train_index:
                continue
            adjust_learning_rate_epoch_poly(self.optimizer, self.args,
                                            burn_in=Darknet53.burn_in,
                                            power=Darknet53.power,
                                            batch_num=i + epoch_batch,
                                            max_batches=max_batches)
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)

            # compute output
            output = self.model(inputs)
            loss = self.criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            # loss.backward()
            time1 = time.time()
            with apex.amp.scale_loss(loss, self.optimizer) as scale_loss:
                scale_loss.backward()
            time2 = time.time()
            self.optimizer.step()
            time3 = time.time()
            print("step cost time: {}, backward cost time: {}".format(time3 - time2, time2 - time1))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch, i, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses, top1=top1, top5=top5))
            if i % 1000 == 0 and i:
                save_checkpoint({
                    'epoch': epoch,
                    'index': i,
                    'arch': "Darknet_53",
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                }, False, filename='darknet_53_init_55_pytorch_train_tmp.pth.tar')

        with open(train_log_file, 'a+') as log_file:
            log_file.write('Epoch:{0}, Loss:{loss.avg:.4f}, Top1:{top1.avg:.3f}, Top5:{top5.avg:.3f}\n'.format(
                epoch, loss=losses, top1=top1, top5=top5))
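# In the class above LARS is used as a wrapper around an already-constructed optimizer
# (self.optimizer = LARS(self.optimizer)) rather than as a drop-in torch optimizer.
# A minimal sketch of what such a wrapper typically does, written from the layer-wise
# trust-ratio idea in the LARS paper and NOT taken from this repository's implementation:
# rescale each parameter's gradient by trust_coefficient * ||w|| / ||grad|| before
# delegating to the wrapped optimizer's step().
import torch


class SimpleLARSWrapper:
    """Illustrative only; real implementations also handle weight decay and clipping."""

    def __init__(self, optimizer, trust_coefficient=0.001, eps=1e-8):
        self.optim = optimizer
        self.trust_coefficient = trust_coefficient
        self.eps = eps

    def zero_grad(self, set_to_none=True):
        self.optim.zero_grad(set_to_none=set_to_none)

    @torch.no_grad()
    def step(self):
        for group in self.optim.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                w_norm = torch.norm(p)
                g_norm = torch.norm(p.grad)
                if w_norm > 0 and g_norm > 0:
                    local_lr = self.trust_coefficient * w_norm / (g_norm + self.eps)
                    p.grad.mul_(local_lr)  # layer-wise scaling; the inner optimizer applies the base lr
        self.optim.step()


# usage sketch:
# optimizer = SimpleLARSWrapper(torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9))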
def main(lr=0.1):
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10', train=True,
                                            download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                              shuffle=True, num_workers=2)
    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10', train=False,
                                           download=True, transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                             shuffle=False, num_workers=2)
    classes = ('plane', 'car', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')

    # Model
    print('==> Building model..')
    # net = VGG('VGG19')
    # net = ResNet18()
    # net = PreActResNet18()
    # net = GoogLeNet()
    # net = DenseNet121()
    # net = ResNeXt29_2x64d()
    # net = MobileNet()
    # net = MobileNetV2()
    # net = DPN92()
    # net = ShuffleNetG2()
    # net = SENet18()
    # net = ShuffleNetV2(1)
    # net = EfficientNetB0()
    # net = RegNetX_200MF()
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()

    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=args.lr,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'sgdwm':
        optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(), lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        from radam import RAdam
        optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lars':  # no tensorboardX
        from lars import LARS
        optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lamb':
        from lamb import Lamb
        optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'novograd':
        from novograd import NovoGrad
        optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)

    # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
    # lr_scheduler = LambdaLR(optimizer, lrs)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)

    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)
        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            if args.optimizer.lower() == 'sgd':
                optimizer = optim.SGD(net.parameters(), lr=args.lr,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'sgdwm':
                optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adam':
                optimizer = optim.Adam(net.parameters(), lr=args.lr,
                                       weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'rmsprop':
                optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adagrad':
                optimizer = optim.Adagrad(net.parameters(), lr=args.lr,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'radam':
                from radam import RAdam
                optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'lars':  # no tensorboardX
                optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                                 weight_decay=args.weight_decay, dampening=args.damping)
            elif args.optimizer.lower() == 'lamb':
                optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'novograd':
                optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            else:
                optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                                      weight_decay=args.weight_decay)
        train(epoch)
        test(epoch)

    file = open(args.optimizer + str(lr) + 'log.json', 'w+')
    json.dump([train_acc, valid_acc], file)
    return best_acc
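# The milestone handling above reloads the best checkpoint and rebuilds the optimizer at
# every epoch listed in args.lr_decay. If only the x0.1 step decay is wanted, the
# commented-out MultiStepLR line does the same thing against a single optimizer instance;
# a minimal self-contained sketch (the milestones here are example values):
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=0.1, momentum=0.9)
scheduler = MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)

for epoch in range(200):
    # train(epoch); test(epoch)
    optimizer.step()
    scheduler.step()
print(optimizer.param_groups[0]['lr'])  # 0.1 -> 0.01 at epoch 100 -> 0.001 at epoch 150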
                          lr=args.base_lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.base_lr,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    from lars import LARS
    optimizer = LARS(model.parameters(), lr=args.base_lr, momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(model.parameters(), lr=args.base_lr, weight_decay=args.weight_decay)

lr_scheduler = [
    optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader), 1e-4)
]
epochs = 400
stop_at_epoch = 100
batch_size = 64
image_size = (92, 92)

train_loader, mem_loader, test_loader = get_train_mem_test_dataloaders(
    batch_size=batch_size)
train_transform, test_transform = gpu_transformer(image_size)

loss_ls = []
acc_ls = []

model = BYOL().to(device)

optimizer = LARS(model.named_modules(), lr=lr, momentum=momentum, weight_decay=weight_decay)

scheduler = LR_Scheduler(optimizer,
                         warmup_epochs, warmup_lr * batch_size / 8,
                         epochs, lr * batch_size / 8, final_lr * batch_size / 8,
                         len(train_loader),
                         constant_predictor_lr=True)

min_loss = np.inf
accuracy = 0

# start training
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)

# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer, lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)

train_acc = []
valid_acc = []
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr, momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'dyna':
    from dyna import Dyna
    optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)

# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer, lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
def lars(self):
    self.optimizer = LARS(self.optimizer)
class ApexDistributeModel(object):

    def __init__(self, model, criterion, optimizer, config, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.config = config
        self.sync_bn = self.config['sync_bn']
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU id should be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # convert BatchNorm layers to synchronized BatchNorm
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign a specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # initialize model and optimizer with apex amp
        self.model, self.optimizer = apex.amp.initialize(
            self.model, self.optimizer, opt_level=self.opt_level)
        # apex parallel
        self.model = apex.parallel.DistributedDataParallel(
            self.model, delay_allreduce=True)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader):
        """
        Must be called after convert().
        :param epoch:
        :param train_loader:
        :return:
        """
        self.model.train()
        print("Epoch is {}".format(epoch))
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        start_time = time.time()
        while inputs is not None:
            step += 1
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)

            output = self.model(inputs)
            loss = self.criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))

            self.optimizer.zero_grad()
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            self.optimizer.step()

            inputs, target = next(train_iter, (None, None))
            if step % 10 == 0:
                end_time = time.time()
                print("Step is {}, cost time: {}, loss: {}, acc1: {}, acc5: {}".format(
                    step, (end_time - start_time), loss.item(), acc1.item(), acc5.item()))
                start_time = time.time()
class ApexDistributeModel(object):

    def __init__(self, model, criterion, optimizer, args, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU id should be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # convert BatchNorm layers to synchronized BatchNorm
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign a specific gpu
        self.model = self.model.cuda(self.gpu)
        # self.criterion = self.criterion.cuda(self.gpu)
        # initialize model and optimizer with apex amp
        # self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
        #                                                  opt_level=self.opt_level)
        # apex parallel
        # self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # self.model = apex.parallel.DistributedDataParallel(self.model)
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[self.gpu],
                                                         bucket_cap_mb=10)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """
        Must be called after convert().
        :param epoch:
        :param train_loader:
        :return:
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        self.model.train()
        train_log_file = 'train_log'
        end = time.time()
        epoch_batch = epoch * (len(train_loader.dataset) / self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            data_time.update(time.time() - end)
            print("data iteration cost time: {}ms".format(data_time.val * 1000))
            # print(target)

            gpu_1 = time.time()
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            gpu_2 = time.time()
            print("convert datasets to gpu cost time: {}ms".format((gpu_2 - gpu_1) * 1000))

            inference_1 = time.time()
            output = self.model(inputs)
            inference_2 = time.time()
            print("inference time cost: {}ms".format((inference_2 - inference_1) * 1000))

            loss_1 = time.time()
            loss = self.criterion(output, target)
            loss_2 = time.time()
            print("loss cost time: {}ms".format((loss_2 - loss_1) * 1000))

            zero_1 = time.time()
            self.optimizer.zero_grad()
            zero_2 = time.time()
            print("zero cost time: {}ms".format((zero_2 - zero_1) * 1000))

            backward_1 = time.time()
            loss.backward()
            backward_2 = time.time()
            print("backward cost time: {}ms".format((backward_2 - backward_1) * 1000))

            step_1 = time.time()
            self.optimizer.step()
            step_2 = time.time()
            print("step cost time: {}ms".format((step_2 - step_1) * 1000))

            batch_time.update(time.time() - end)
            print("total cost time is: {}s".format(batch_time.val))
            # item_1 = time.time()
            # print("loss is {}".format(loss.item()))
            # item_2 = time.time()
            # print("loss item cost: {}s".format(item_2 - item_1))
            print("==================================")
            end = time.time()
# we are going to try to find the sum(n_relevant_orders)
# important predictors. first compute all X possible products
X_aug = poly_order_of_cols(X, orders, include_cross_terms)
print 'shape of X_aug ', X_aug.shape
B = linear_regression(X_aug, Y[:, 0], True)
r_sq = lin_reg_coefficient_of_determination(X_aug, Y[:, 0], B)
print 'r_squared for brute force fit ', r_sq
lin_reg_statistics(X_aug, Y[:, 0], B, False)
print 'full linreg coefs'
print B

print 'beginning LARS test'
lars_obj = LARS(X_aug, Y[:, 0], 0)
lars_obj.train()

print 'beginning LASSO test'
lars_obj = LARS(X_aug, Y[:, 0], 1)
lars_obj.train()

# Cross Validation
# cvn = 10
# print 'performing cross validation CV(' + str(cvn) + ') for linear regression'
# cvn_lr = n_cross_validation(X_aug, Y, linear_regression_train_test, None, cvn)
# print 'linear regression cvn mse is ', cvn_lr

# Now we will compute cvn for LASSO with a number of lambdas
# if m == 1:
if False:
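# Unlike the other snippets in this section, the LARS object above is least-angle
# regression (the trailing 0/1 flag appears to switch between LARS and LASSO modes),
# not the layer-wise optimizer. For comparison, scikit-learn ships both estimators;
# a small sketch on synthetic data:
import numpy as np
from sklearn.linear_model import Lars, LassoLars

rng = np.random.RandomState(0)
X = rng.randn(100, 10)
y = X @ rng.randn(10) + 0.1 * rng.randn(100)

print(Lars().fit(X, y).coef_)
print(LassoLars(alpha=0.01).fit(X, y).coef_)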
def lars(self):
    print("Enable LARS Optimizer Algorithm")
    self.optimizer = LARS(self.optimizer)
if len(sys.argv) == 1:
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
elif sys.argv[1] == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=0.01)
elif sys.argv[1] == 'radam':
    optimizer = RAdam(model.parameters())
elif sys.argv[1] == 'lars':  # no tensorboardX
    optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
elif sys.argv[1] == 'lamb':
    optimizer = Lamb(model.parameters())
elif sys.argv[1] == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)

schedular = optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader), 1e-4)


def train(train_loader, model, criterion, optimizer, schedular, device):
    '''
    Function for the training step of the training loop
    '''
    model.train()
    running_loss = 0
def main():
    args = parse_args()

    if args.name is None:
        args.name = '%s_WideResNet%s-%s_%d' % (args.dataset, args.depth, args.width, args.batch_size)
        if args.linear_scaling:
            args.name += '_wLS'
        if args.lars:
            args.name += '_wLARS'

    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    criterion = nn.CrossEntropyLoss().cuda()
    cudnn.benchmark = True

    # data loading code
    if args.dataset == 'cifar10':
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
        train_set = datasets.CIFAR10(root='~/data', train=True, download=True,
                                     transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size,
                                                   shuffle=True, num_workers=8)
        test_set = datasets.CIFAR10(root='~/data', train=False, download=True,
                                    transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=128,
                                                  shuffle=False, num_workers=8)
        num_classes = 10
    elif args.dataset == 'cifar100':
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])
        train_set = datasets.CIFAR100(root='~/data', train=True, download=True,
                                      transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=128,
                                                   shuffle=True, num_workers=8)
        test_set = datasets.CIFAR100(root='~/data', train=False, download=True,
                                     transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=128,
                                                  shuffle=False, num_workers=8)
        num_classes = 100

    # create model
    model = WideResNet(args.depth, args.width, num_classes=num_classes)
    model = model.cuda()

    if args.lars:
        optimizer = LARS(filter(lambda p: p.requires_grad, model.parameters()),
                         lr=args.lr, momentum=args.momentum,
                         weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)

    if args.linear_scaling:
        scheduler = WarmupMultiStepLR(
            optimizer,
            milestones=[int(e) for e in args.milestones.split(',')],
            target_lr=args.lr * args.batch_size / base_batch_size,
            gamma=args.gamma)
    else:
        scheduler = lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[int(e) for e in args.milestones.split(',')],
            gamma=args.gamma)

    log = pd.DataFrame(index=[],
                       columns=['epoch', 'lr', 'loss', 'acc', 'val_loss', 'val_acc'])

    best_acc = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch + 1, args.epochs))

        scheduler.step()

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer, epoch)
        # evaluate on validation set
        val_log = validate(args, test_loader, model, criterion)

        print('loss %.4f - acc %.4f - val_loss %.4f - val_acc %.4f'
              % (train_log['loss'], train_log['acc'], val_log['loss'], val_log['acc']))

        tmp = pd.Series([
            epoch,
            scheduler.get_lr()[0],
            train_log['loss'],
            train_log['acc'],
            val_log['loss'],
            val_log['acc'],
        ], index=['epoch', 'lr', 'loss', 'acc', 'val_loss', 'val_acc'])

        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        if val_log['acc'] > best_acc:
            torch.save(model.state_dict(), 'models/%s/model.pth' % args.name)
            best_acc = val_log['acc']
            print("=> saved best model")
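# Note on the logging above: DataFrame.append was removed in pandas 2.0, so with a recent
# pandas the same row-append can be written with pd.concat; a small equivalent sketch
# (the numbers are placeholders):
import pandas as pd

log = pd.DataFrame(columns=['epoch', 'lr', 'loss', 'acc', 'val_loss', 'val_acc'])
row = pd.Series([0, 0.1, 1.0, 0.5, 1.1, 0.45],
                index=['epoch', 'lr', 'loss', 'acc', 'val_loss', 'val_acc'])
log = pd.concat([log, row.to_frame().T], ignore_index=True)
print(log)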
args = parser.parse_args()

if args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  # no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=0.01)

optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'
# log = open(optname + 'log.txt', 'w+')
log = None
criterion = nn.CrossEntropyLoss()
def main(is_distributed, rank, ip, sync_bn):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl', init_method=ip,
                                             world_size=world_size, rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    print("Connect")

    # set hyper parameters
    batch_size = 128
    lr = 0.01  # based on batch size 256
    momentum = 0.9
    weight_decay = 0.0001
    epoch = 100

    # recompute lr
    lr = lr * world_size

    # create model
    model = AlexNet(10)
    # synchronized batch normalization
    if sync_bn:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()
    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0')
    optimizer = LARS(optimizer)

    if is_distributed:
        # for distributed training
        model = nn.parallel.DistributedDataParallel(model)
        # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=4, pin_memory=True,
                              sampler=train_sampler,
                              collate_fn=collate_fn)

    for epoch in range(100):
        # for distributed training
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        # train_iter = iter(train_loader)
        # inputs, target = next(train_iter)
        prefetcher = DataPrefetcher(train_loader)
        inputs, target = prefetcher.next()
        step = 0
        print("Epoch is {}".format(epoch))
        while inputs is not None:
            step += 1
            print("Step is {}".format(step))

            time_model_1 = time.time()
            output = model(inputs)
            time_model_2 = time.time()
            print("model time: {}".format(time_model_2 - time_model_1))

            time_loss_1 = time.time()
            # `async=True` is no longer valid Python 3 syntax; use non_blocking instead
            loss = criterion(output, target.cuda(non_blocking=True))
            time_loss_2 = time.time()
            print("loss time: {}".format(time_loss_2 - time_loss_1))

            optimizer.zero_grad()

            time_back_1 = time.time()
            # loss.backward()
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            time_back_2 = time.time()
            print("back time: {}".format(time_back_2 - time_back_1))

            optimizer.step()
            # if step % 10 == 0:
            #     print("loss is : {}", loss.item())

            # inputs, target = next(train_iter, (None, None))
            inputs, target = prefetcher.next()