class ApexDistributeModel(object):
    def __init__(self, model, criterion, optimizer, args, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "gpu should be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # convert BatchNorm layers to synchronized BatchNorm across processes
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex
        self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
                                                         opt_level=self.opt_level)
        # apex parallel: wrap the model exactly once; enabling both the apex and
        # the native DDP wrappers here would double-wrap the model.
        self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # alternative: the native PyTorch wrapper
        # self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[self.gpu])
        return self.model, self.criterion, self.optimizer

    def lars(self):
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """
        you must run it after the 'convert' function.
        :param epoch:
        :param train_loader:
        :return:
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        self.model.train()

        train_log_file = 'train_log'

        end = time.time()
        epoch_batch = epoch * (len(train_loader.dataset) / self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            if train_index != -1 and i < train_index:
                continue
            adjust_learning_rate_epoch_poly(self.optimizer, self.args, burn_in=Darknet53.burn_in, power=Darknet53.power,
                                            batch_num=i + epoch_batch, max_batches=max_batches)
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)

            # compute output
            output = self.model(inputs)
            loss = self.criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            # loss.backward()
            time1 = time.time()
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            time2 = time.time()
            self.optimizer.step()
            time3 = time.time()
            print("step cost time: {}, backward cost time: {}".format(time3-time2, time2-time1))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       epoch, i, len(train_loader), batch_time=batch_time,
                       data_time=data_time, loss=losses, top1=top1, top5=top5))

            if i % 1000 == 0 and i:
                save_checkpoint({
                    'epoch': epoch,
                    'index': i,
                    'arch': "Darknet_53",
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                }, False, filename='darknet_53_init_55_pytorch_train_tmp.pth.tar')

        with open(train_log_file, 'a+') as log_file:
            log_file.write('Epoch:{0}, Loss:{loss.avg:.4f}, Top1:{top1.avg:.3f}, Top5:{top5.avg:.3f}\n'.format(
                epoch, loss=losses, top1=top1, top5=top5))
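
# A minimal usage sketch for the class above, not taken from the original source:
# the process-group address, the torchvision model, and the hyper-parameters are
# illustrative assumptions; only ApexDistributeModel and its convert/lars/train
# methods come from the example itself. 'args' must carry the fields the class
# reads (sync_bn, batch_size, print_freq).
import torch
import torch.nn as nn
import torchvision

def run_worker(rank, world_size, gpu, args, train_loader, max_batches):
    # one process per GPU; NCCL is the usual backend for multi-GPU training
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='tcp://127.0.0.1:23456',
                                         world_size=world_size,
                                         rank=rank)
    torch.cuda.set_device(gpu)

    model = torchvision.models.resnet18(num_classes=10)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    wrapper = ApexDistributeModel(model, criterion, optimizer, args, gpu=gpu)
    # convert() must run before train(): it moves model and loss to the GPU,
    # initializes apex.amp, and wraps the model for distributed training.
    model, criterion, optimizer = wrapper.convert(opt_level='O1')
    wrapper.lars()  # optional LARS wrapping, as in the examples below

    for epoch in range(90):
        # with a DistributedSampler, call set_epoch(epoch) on it here
        wrapper.train(epoch, train_loader, max_batches, train_index=-1)
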
Example #2
class ApexDistributeModel(object):
    def __init__(self, model, criterion, optimizer, args, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "gpu should be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # convert BatchNorm layers to synchronized BatchNorm across processes
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        # self.criterion = self.criterion.cuda(self.gpu)
        # apex.amp initialization and the apex DDP wrapper are disabled in this
        # variant; the native DistributedDataParallel below (with a 10 MB
        # gradient bucket) is used instead.
        # self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
        #                                                  opt_level=self.opt_level)
        # self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # self.model = apex.parallel.DistributedDataParallel(self.model)

        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[self.gpu],
                                                         bucket_cap_mb=10)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """
        you must run it after the 'convert' function.
        :param epoch:
        :param train_loader:
        :return:
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        self.model.train()

        train_log_file = 'train_log'

        end = time.time()
        epoch_batch = epoch * (len(train_loader.dataset) /
                               self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            data_time.update(time.time() - end)
            print("data iteration cost time: {}ms".format(data_time.val *
                                                          1000))
            # print(target)
            gpu_1 = time.time()
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            gpu_2 = time.time()
            print("convert datasets to gpu cost time: {}ms".format(
                (gpu_2 - gpu_1) * 1000))

            inference_1 = time.time()
            output = self.model(inputs)
            inference_2 = time.time()
            print("inference time cost: {}ms".format(
                (inference_2 - inference_1) * 1000))

            loss_1 = time.time()
            loss = self.criterion(output, target)
            loss_2 = time.time()
            print("loss cost time: {}ms".format((loss_2 - loss_1) * 1000))

            zero_1 = time.time()
            self.optimizer.zero_grad()
            zero_2 = time.time()
            print("zero cost time: {}ms".format((zero_2 - zero_1) * 1000))

            backward_1 = time.time()
            loss.backward()
            backward_2 = time.time()
            print("backward cost time: {}ms".format(
                (backward_2 - backward_1) * 1000))

            step_1 = time.time()
            self.optimizer.step()
            step_2 = time.time()
            print("step cost time: {}ms".format((step_2 - step_1) * 1000))

            batch_time.update(time.time() - end)
            print("total cost time is: {}s".format(batch_time.val))
            # item_1 = time.time()
            # print("loss is {}".format(loss.item()))
            # item_2 = time.time()
            # print("loss item cost: {}s".format(item_2 - item_1))
            print("==================================")

            end = time.time()
Example #3
class ApexDistributeModel(object):
    def __init__(self, model, criterion, optimizer, config, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.config = config
        self.sync_bn = self.config['sync_bn']
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "gpu should be an int."

        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # convert BatchNorm layers to synchronized BatchNorm across processes
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex
        self.model, self.optimizer = apex.amp.initialize(
            self.model, self.optimizer, opt_level=self.opt_level)
        # apex parallel
        self.model = apex.parallel.DistributedDataParallel(
            self.model, delay_allreduce=True)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader):
        """
        you must run it after the 'convert' function.
        :param epoch:
        :param train_loader:
        :return:
        """
        self.model.train()
        print("Epoch is {}".format(epoch))
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        start_time = time.time()
        while inputs is not None:
            step += 1
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            output = self.model(inputs)
            loss = self.criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            self.optimizer.step()
            inputs, target = next(train_iter, (None, None))
            if step % 10 == 0:
                end_time = time.time()
                print("Step is {}, cost time: {}, loss: {}, acc1: {}, acc5:{}".
                      format(step, (end_time - start_time), loss.item(),
                             acc1.item(), acc5.item()))
                start_time = time.time()
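
# A short sketch, not from the original source, of constructing this variant:
# the config dict only needs the 'sync_bn' key that the constructor reads, and
# the opt_level strings are apex.amp's documented casting policies. The model,
# criterion, and optimizer arguments are assumed to be built elsewhere.
def build_mixed_precision_wrapper(model, criterion, optimizer, gpu):
    config = {'sync_bn': True}
    wrapper = ApexDistributeModel(model, criterion, optimizer, config, gpu=gpu)
    # apex.amp opt levels: 'O0' pure FP32, 'O1' mixed precision around
    # whitelisted ops, 'O2' FP16 weights with FP32 master weights, 'O3' pure FP16.
    model, criterion, optimizer = wrapper.convert(opt_level='O1')
    wrapper.lars()
    return wrapper
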
Example #4
def main(is_distributed, rank, ip, sync_bn):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=ip,
                                             world_size=world_size,
                                             rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    print("Connect")
    # set hyper parameters
    batch_size = 128
    lr = 0.01  # based on a batch size of 256; scaled by world_size below
    momentum = 0.9
    weight_decay = 0.0001
    epochs = 100

    # recompute lr
    lr = lr * world_size

    # create model
    model = AlexNet(10)

    # synchronization batch normal
    if sync_bn:
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(),
                                lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0')
    optimizer = LARS(optimizer)

    if is_distributed:
        # for distribute training
        model = nn.parallel.DistributedDataParallel(model)
        # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set,
                              batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=4,
                              pin_memory=True,
                              sampler=train_sampler,
                              collate_fn=collate_fn)

    for epoch in range(epochs):
        # for distribute
        if is_distributed:
            train_sampler.set_epoch(epoch)

        model.train()
        # train_iter = iter(train_loader)
        # inputs, target = next(train_iter)
        prefetcher = DataPrefetcher(train_loader)
        inputs, target = prefetcher.next()

        step = 0
        print("Epoch is {}".format(epoch))
        while inputs is not None:
            step += 1
            print("Step is {}".format(step))

            time_model_1 = time.time()
            output = model(inputs)
            time_model_2 = time.time()
            print("model time: {}".format(time_model_2 - time_model_1))
            time_loss_1 = time.time()
            # 'async' is a reserved word in Python 3.7+; use non_blocking instead
            loss = criterion(output, target.cuda(non_blocking=True))
            time_loss_2 = time.time()
            print("loss time: {}".format(time_loss_2 - time_loss_1))
            optimizer.zero_grad()
            time_back_1 = time.time()
            # loss.backward()
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            time_back_2 = time.time()
            print("back time: {}".format(time_back_2 - time_back_1))
            optimizer.step()
            # if step % 10 == 0:
            #     print("loss is : {}", loss.item())
            # inputs, target = next(train_iter, (None, None))
            inputs, target = prefetcher.next()
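
# A hedged sketch, not from the original source, of how main() above might be
# launched: the CLI flags and the TCP address are illustrative assumptions;
# main() itself expects is_distributed, rank, an init_method URL, and sync_bn,
# and hard-codes a world size of 2 when distributed.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--distributed', action='store_true')
    parser.add_argument('--rank', type=int, default=0,
                        help='rank of this process (0 or 1)')
    parser.add_argument('--ip', type=str, default='tcp://127.0.0.1:23456',
                        help='init_method URL pointing at the rank-0 host')
    parser.add_argument('--sync-bn', action='store_true')
    cli = parser.parse_args()

    # One process per rank, e.g. on two machines (or two shells on one machine):
    #   python train.py --distributed --rank 0 --ip tcp://10.0.0.1:23456
    #   python train.py --distributed --rank 1 --ip tcp://10.0.0.1:23456
    main(cli.distributed, cli.rank, cli.ip, cli.sync_bn)
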