class ApexDistributeModel(object):
    """Wrap a model/criterion/optimizer for apex mixed-precision distributed training.

    NOTE(review): helpers such as ``AverageMeter``, ``accuracy``, ``LARS``,
    ``adjust_learning_rate_epoch_poly``, ``Darknet53`` and ``save_checkpoint``
    are defined elsewhere in this project.
    """

    def __init__(self, model, criterion, optimizer, args, gpu=None):
        """
        :param model: network to train
        :param criterion: loss module
        :param optimizer: optimizer instance
        :param args: parsed arguments; must expose ``sync_bn``, ``batch_size``
            and ``print_freq``
        :param gpu: local GPU index (int) or None
        """
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            # BUGFIX: corrected the grammar of the assertion message.
            assert isinstance(self.gpu, int), "GPU should be an int type."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        """Move model/criterion to the GPU, initialize apex amp and wrap with DDP.

        :param opt_level: apex amp optimization level ('O0'..'O3')
        :return: tuple (model, criterion, optimizer) after conversion
        """
        self.opt_level = opt_level
        if self.sync_bn:
            # synchronize batch-norm statistics across processes
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex amp
        self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
                                                         opt_level=self.opt_level)
        # BUGFIX: the original wrapped the model twice — first with
        # apex.parallel.DistributedDataParallel and then again with
        # nn.parallel.DistributedDataParallel — which duplicates the gradient
        # all-reduce.  Keep a single torch-native DDP wrapper (the outermost
        # one in the original, so the final wrapper type is unchanged).
        # self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[self.gpu])
        return self.model, self.criterion, self.optimizer

    def lars(self):
        """Wrap the optimizer with the LARS layer-wise adaptive-rate algorithm."""
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """Run one training epoch. You must run it after the 'convert' function.

        :param epoch: current epoch index (used for LR schedule and logging)
        :param train_loader: training DataLoader
        :param max_batches: total batch budget for the poly LR schedule
        :param train_index: batch index to resume from; -1 disables skipping
        :return: None (appends an epoch summary line to the 'train_log' file)
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        self.model.train()
        train_log_file = 'train_log'
        end = time.time()
        # number of batches already consumed by previous epochs
        epoch_batch = epoch * (len(train_loader.dataset) / self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            # skip already-trained batches when resuming mid-epoch
            if train_index != -1 and i < train_index:
                continue
            adjust_learning_rate_epoch_poly(self.optimizer, self.args,
                                            burn_in=Darknet53.burn_in,
                                            power=Darknet53.power,
                                            batch_num=i + epoch_batch,
                                            max_batches=max_batches)
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            # compute output
            output = self.model(inputs)
            loss = self.criterion(output, target)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))
            # compute gradient and do SGD step; amp scales the loss so fp16
            # gradients do not underflow
            self.optimizer.zero_grad()
            time1 = time.time()
            with apex.amp.scale_loss(loss, self.optimizer) as scale_loss:
                scale_loss.backward()
            time2 = time.time()
            self.optimizer.step()
            time3 = time.time()
            print("step cost time: {}, backward cost time: {}".format(time3-time2, time2-time1))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % self.args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch, i, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses, top1=top1, top5=top5))
            # periodic mid-epoch checkpoint so a crash can resume from `index`
            if i % 1000 == 0 and i:
                save_checkpoint({
                    'epoch': epoch,
                    'index': i,
                    'arch': "Darknet_53",
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                }, False, filename='darknet_53_init_55_pytorch_train_tmp.pth.tar')
        # end-of-epoch summary appended to the log file
        with open(train_log_file, 'a+') as log_file:
            log_file.write('Epoch:{0}, Loss{loss.avg:.4f}, Top1:{top1.avg:.3f},Top5:{top5.avg:.3f}\n'.format(
                epoch, loss=losses, top1=top1, top5=top5))
class ApexDistributeModel(object):
    """Distributed-training wrapper (profiling variant).

    This revision disables apex amp and apex DDP (both commented out below)
    and uses torch-native DistributedDataParallel; its ``train`` loop prints
    per-stage wall-clock timings for every batch instead of tracking
    accuracy/loss statistics.
    """

    def __init__(self, model, criterion, optimizer, args, gpu=None):
        # :param model: network to train
        # :param criterion: loss module
        # :param optimizer: optimizer instance
        # :param args: parsed arguments; must expose ``sync_bn`` and ``batch_size``
        # :param gpu: local GPU index (int) or None
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU should is a int type."
        self.criterion = criterion
        self.optimizer = optimizer
        # amp optimization level; set by convert()
        self.opt_level = None

    def convert(self, opt_level='O0'):
        # Move the model to the GPU and wrap it with torch-native DDP.
        # NOTE(review): apex amp initialization and the criterion .cuda() move
        # are deliberately commented out in this revision — training below
        # uses a plain loss.backward().
        self.opt_level = opt_level
        if self.sync_bn:
            # synchronization batch normal
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        # self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex
        # self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
        #                                                  opt_level=self.opt_level)
        # apex parallel
        # self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # self.model = apex.parallel.DistributedDataParallel(self.model)
        # bucket_cap_mb=10 shrinks the gradient all-reduce buckets (default 25MB)
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[self.gpu],
                                                         bucket_cap_mb=10)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        # Wrap the optimizer with the LARS adaptive-rate algorithm.
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """ you must run it after the 'convert' function.

        :param epoch: current epoch index
        :param train_loader: training DataLoader
        :return: None

        NOTE(review): ``max_batches`` and ``train_index`` are unused in this
        profiling variant; they are kept for signature compatibility with the
        other revision of this class.  Each pipeline stage (data load, H2D
        copy, forward, loss, zero_grad, backward, step) is timed and printed
        per batch.
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        self.model.train()
        train_log_file = 'train_log'
        end = time.time()
        # number of batches already consumed by previous epochs
        epoch_batch = epoch * (len(train_loader.dataset) / self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            # time spent waiting for the DataLoader
            data_time.update(time.time() - end)
            print("data iteration cost time: {}ms".format(data_time.val * 1000))
            # print(target)
            # host-to-device transfer timing
            gpu_1 = time.time()
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            gpu_2 = time.time()
            print("convert datasets to gpu cost time: {}ms".format(
                (gpu_2 - gpu_1) * 1000))
            # forward pass timing
            inference_1 = time.time()
            output = self.model(inputs)
            inference_2 = time.time()
            print("inference time cost: {}ms".format(
                (inference_2 - inference_1) * 1000))
            # loss computation timing
            loss_1 = time.time()
            loss = self.criterion(output, target)
            loss_2 = time.time()
            print("loss cost time: {}ms".format((loss_2 - loss_1) * 1000))
            # gradient reset timing
            zero_1 = time.time()
            self.optimizer.zero_grad()
            zero_2 = time.time()
            print("zero cost time: {}ms".format((zero_2 - zero_1) * 1000))
            # backward pass timing (plain backward — amp is disabled here)
            backward_1 = time.time()
            loss.backward()
            backward_2 = time.time()
            print("backward cost time: {}ms".format(
                (backward_2 - backward_1) * 1000))
            # optimizer step timing
            step_1 = time.time()
            self.optimizer.step()
            step_2 = time.time()
            print("step cost time: {}ms".format((step_2 - step_1) * 1000))
            batch_time.update(time.time() - end)
            print("total cost time is: {}s".format(batch_time.val))
            # item_1 = time.time()
            # print("loss is {}".format(loss.item()))
            # item_2 = time.time()
            # print("loss item cost: {}s".format(item_2 - item_1))
            print("==================================")
            end = time.time()
class ApexDistributeModel(object):
    """Distributed mixed-precision training wrapper driven by a config dict.

    Unlike the argparse-based variants elsewhere in this file, this revision
    reads ``sync_bn`` from a ``config`` mapping and uses apex's own
    DistributedDataParallel for gradient synchronization.
    """

    def __init__(self, model, criterion, optimizer, config, gpu=None):
        """
        :param model: network to train
        :param criterion: loss module
        :param optimizer: optimizer instance
        :param config: mapping; must contain a 'sync_bn' key
        :param gpu: local GPU index (int) or None
        """
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.config = config
        self.sync_bn = self.config['sync_bn']
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU should is a int type."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        """Prepare model/criterion for GPU execution under apex amp + DDP.

        :param opt_level: apex amp optimization level ('O0'..'O3')
        :return: tuple (model, criterion, optimizer) after conversion
        """
        self.opt_level = opt_level
        # cross-process batch-norm synchronization, when requested
        if self.sync_bn:
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # place both the network and the loss on the assigned device
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # let amp patch the model/optimizer pair for mixed precision
        self.model, self.optimizer = apex.amp.initialize(
            self.model, self.optimizer, opt_level=self.opt_level)
        # apex DDP; delay_allreduce batches gradient reduction until backward ends
        self.model = apex.parallel.DistributedDataParallel(
            self.model, delay_allreduce=True)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        """Wrap the optimizer with the LARS adaptive-rate algorithm."""
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader):
        """ you must run it after the 'convert' function.

        :param epoch: current epoch index (logging only)
        :param train_loader: training DataLoader
        :return: None
        """
        self.model.train()
        print("Epoch is {}".format(epoch))
        # manual iterator so the loop can terminate on a (None, None) sentinel
        batches = iter(train_loader)
        images, labels = next(batches)
        step = 0
        tic = time.time()
        while images is not None:
            step += 1
            images = images.cuda(self.gpu, non_blocking=True)
            labels = labels.cuda(self.gpu, non_blocking=True)
            logits = self.model(images)
            loss = self.criterion(logits, labels)
            acc1, acc5 = accuracy(logits, labels, topk=(1, 5))
            self.optimizer.zero_grad()
            # amp scales the loss so fp16 gradients do not underflow
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            self.optimizer.step()
            # fetch the next batch; the default sentinel ends the loop cleanly
            images, labels = next(batches, (None, None))
            # periodic progress report (every 10 steps)
            if step % 10 == 0:
                toc = time.time()
                print("Step is {}, cost time: {}, loss: {}, acc1: {}, acc5:{}".
                      format(step, (toc - tic), loss.item(),
                             acc1.item(), acc5.item()))
                tic = time.time()
def main(is_distributed, rank, ip, sync_bn):
    """Train AlexNet on CIFAR-10, optionally across two distributed workers.

    :param is_distributed: when truthy, initialize a 2-process NCCL group
    :param rank: this process's rank within the group
    :param ip: init_method URL for torch.distributed (e.g. 'tcp://host:port')
    :param sync_bn: when truthy, convert BatchNorm layers to apex SyncBN
    :return: None

    NOTE(review): ``AlexNet``, ``LoadClassifyDataSets``, ``DataPrefetcher``,
    ``collate_fn`` and ``LARS`` are project-local helpers defined elsewhere.
    """
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl', init_method=ip,
                                             world_size=world_size, rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    print("Connect")
    # set hyper parameters
    batch_size = 128
    lr = 0.01  # base on batch size 256
    momentum = 0.9
    weight_decay = 0.0001
    # BUGFIX: renamed from ``epoch`` — the old name was shadowed by the loop
    # variable in ``for epoch in range(...)`` below.
    epochs = 100
    # recompute lr: scale linearly with the number of workers
    lr = lr * world_size
    # create model
    model = AlexNet(10)
    # synchronization batch normal
    if sync_bn:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()
    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()
    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)
    model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0')
    optimizer = LARS(optimizer)
    if is_distributed:
        # for distribute training
        model = nn.parallel.DistributedDataParallel(model)
        # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, batch_size, shuffle=(train_sampler is None),
                              num_workers=4, pin_memory=True, sampler=train_sampler,
                              collate_fn=collate_fn)
    for epoch in range(epochs):
        # for distribute: reshuffle shards deterministically per epoch
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        # train_iter = iter(train_loader)
        # inputs, target = next(train_iter)
        prefetcher = DataPrefetcher(train_loader)
        inputs, target = prefetcher.next()
        step = 0
        print("Epoch is {}".format(epoch))
        while inputs is not None:
            step += 1
            print("Step is {}".format(step))
            time_model_1 = time.time()
            output = model(inputs)
            time_model_2 = time.time()
            print("model time: {}".format(time_model_2 - time_model_1))
            time_loss_1 = time.time()
            # BUGFIX: ``target.cuda(async=True)`` is a SyntaxError on
            # Python 3.7+ (``async`` became a reserved keyword) and the
            # parameter was renamed in PyTorch — use ``non_blocking`` instead.
            loss = criterion(output, target.cuda(non_blocking=True))
            time_loss_2 = time.time()
            print("loss time: {}".format(time_loss_2 - time_loss_1))
            optimizer.zero_grad()
            time_back_1 = time.time()
            # loss.backward()
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            time_back_2 = time.time()
            print("back time: {}".format(time_back_2 - time_back_1))
            optimizer.step()
            # if step % 10 == 0:
            #     print("loss is : {}", loss.item())
            # inputs, target = next(train_iter, (None, None))
            inputs, target = prefetcher.next()