Example #1
def main():
    global args, rank, world_size
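    # args.dist == 1 runs multi-GPU training through the distributed backend;
    # otherwise run as a single process.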
    if args.dist == 1:
        rank, world_size = dist_init()
    else:
        rank = 0
        world_size = 1

    DATA_DIR = './data'

    train_set_raw = torchvision.datasets.CIFAR10(root=DATA_DIR,
                                                 train=True,
                                                 download=True)
    test_set_raw = torchvision.datasets.CIFAR10(root=DATA_DIR,
                                                train=False,
                                                download=True)

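    # Piecewise-linear LR schedule: ramp from 0 to 0.4 * lr_scale over the
    # first 5 epochs, then decay back to 0 by epoch 24.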
    lr_schedule = PiecewiseLinear([0, 5, 24], [0, 0.4 * args.lr_scale, 0])
    train_transforms = [Crop(32, 32), FlipLR(), Cutout(8, 8)]

    model = TorchGraph(union(net(), losses)).cuda()
    if args.half == 1:
        model = model.half()
    if args.double == 1:
        model = model.double()
    if args.dist == 1:
        model = DistModule(model)
    opt = torch.optim.SGD(model.parameters(),
                          lr=0.0,
                          momentum=args.momentum,
                          weight_decay=5e-4 * args.batch_size,
                          nesterov=True)

    t = Timer()

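    # Preprocess the datasets once in memory: pad the training images by 4 px,
    # normalise, transpose to CHW, and pair each image with its target.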
    train_set = list(
        zip(transpose(normalise(pad(train_set_raw.data, 4))),
            train_set_raw.targets))
    test_set = list(
        zip(transpose(normalise(test_set_raw.data)), test_set_raw.targets))
    dataset_len = len(train_set)
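    # Convert warm-up epochs into iterations, using the global batch size
    # (per-rank batch size times world size).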
    args.warm_up_iter = math.ceil(dataset_len * args.warm_up_epoch /
                                  (world_size * args.batch_size))

    TSV = TSVLogger()
    train(model,
          lr_schedule,
          opt,
          Transform(train_set, train_transforms),
          test_set,
          args=args,
          batch_size=args.batch_size,
          num_workers=args.workers,
          loggers=(TableLogger(rank), TSV),
          timer=t,
          test_time_in_total=False,
          drop_last=True)
Example #2
def main():
    global args, best_prec1, min_loss
    args = parser.parse_args()

    rank, world_size = dist_init(args.port)
    print("world_size is: {}".format(world_size))
    assert (args.batch_size % world_size == 0)
    assert (args.workers % world_size == 0)
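    # Split the global batch size and worker count evenly across ranks.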
    args.batch_size = args.batch_size // world_size
    args.workers = args.workers // world_size

    # create model
    print("=> creating model '{}'".format("inceptionv4"))
    print("save_path is: {}".format(args.save_path))

    image_size = 341
    input_size = 299
    model = get_model('inceptionv4', pretrained=True)
    # print("model is: {}".format(model))
    model.cuda()
    model = DistModule(model)

    # optionally resume from a checkpoint
    if args.load_path:
        if args.resume_opt:
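            # NOTE: `optimizer` is only assigned later, inside the epoch loop,
            # so resuming optimizer state here would raise an UnboundLocalError.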
            best_prec1, start_epoch = load_state(args.load_path,
                                                 model,
                                                 optimizer=optimizer)
        else:
            # print('load weights from', args.load_path)
            load_state(args.load_path, model)

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

    train_dataset = McDataset(
        args.train_root, args.train_source,
        transforms.Compose([
            transforms.Resize(image_size),
            transforms.RandomCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            ColorAugmentation(),
            normalize,
        ]))
    val_dataset = McDataset(
        args.val_root, args.val_source,
        transforms.Compose([
            transforms.Resize(image_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = DistributedSampler(train_dataset)
    val_sampler = DistributedSampler(val_dataset)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=False,
                              sampler=train_sampler)

    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=False,
                            sampler=val_sampler)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    lr = 0
    patience = 0
    for epoch in range(args.start_epoch, args.epochs):
        # adjust_learning_rate(optimizer, epoch)
        train_sampler.set_epoch(epoch)

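        # Fine-tuning schedule: epoch 0 trains only the new last layers at
        # lr=1e-3; from epoch 1 the whole network trains at lr=3e-5; whenever
        # validation loss stalls for 2 epochs, reload the best checkpoint and
        # divide the learning rate by 10.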
        if epoch == 1:
            lr = 0.00003
        if patience == 2:
            patience = 0
            checkpoint = load_checkpoint(args.save_path + '_best.pth.tar')
            model.load_state_dict(checkpoint['state_dict'])
            print("Loading checkpoint_best.............")
            # model.load_state_dict(torch.load('checkpoint_best.pth.tar'))
            lr = lr / 10.0

        if epoch == 0:
            lr = 0.001
            for name, param in model.named_parameters():
                # print("name is: {}".format(name))
                if (name not in last_layer_names):
                    param.requires_grad = False
            optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad,
                                                   model.parameters()),
                                            lr=lr)
            # optimizer = torch.optim.Adam(
            #     filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
        else:
            for param in model.parameters():
                param.requires_grad = True
            optimizer = torch.optim.RMSprop(model.parameters(),
                                            lr=lr,
                                            weight_decay=0.0001)
            # optimizer = torch.optim.Adam(
            #     model.parameters(), lr=lr, weight_decay=0.0001)
        print("lr is: {}".format(lr))
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        val_prec1, val_losses = validate(val_loader, model, criterion)
        print("val_losses is: {}".format(val_losses))
        # remember best prec@1 and save checkpoint
        if rank == 0:
            # remember best prec@1 and save checkpoint
            if val_losses < min_loss:
                is_best = True
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': 'inceptionv4',
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, args.save_path)
                # torch.save(model.state_dict(), 'best_val_weight.pth')
                print(
                    'val score improved from {:.5f} to {:.5f}. Saved!'.format(
                        min_loss, val_losses))

                min_loss = val_losses
                patience = 0
            else:
                patience += 1
        if 1 <= rank <= 7:
            if val_losses < min_loss:
                min_loss = val_losses
                patience = 0
            else:
                patience += 1
        print("patience is: {}".format(patience))
        print("min_loss is: {}".format(min_loss))
    print("min_loss is: {}".format(min_loss))
Example #3
def main():
    global args
    args = parser.parse_args()

    # TODO model arguments module should be more easy to write and read
    if args.approach == 'lwf':
        approach = lwf
        assert (args.memory_size is None)
        assert (args.memory_mini_batch_size is None)
    elif args.approach == 'joint_train':
        approach = joint_train
        assert (args.memory_size is None)
        assert (args.memory_mini_batch_size is None)
    elif args.approach == 'fine_tuning':
        approach = fine_tuning
        assert (args.memory_size is None)
        assert (args.memory_mini_batch_size is None)
    elif args.approach == 'gem':
        approach = gem
        assert (args.memory_size is not None)
        assert (args.memory_mini_batch_size is None)
    else:
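        # An unrecognised approach leaves `approach` as None, so constructing
        # Appr below would fail.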
        approach = None

    rank, world_size = dist_init('27777')

    if rank == 0:
        print('=' * 100)
        print('Arguments = ')
        for arg in vars(args):
            print('\t' + arg + ':', getattr(args, arg))
        print('=' * 100)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print('[CUDA unavailable]')
        sys.exit()

    # Generate Tasks
    args.batch_size = args.batch_size // world_size
    Tasks = generator.GetTasks(args.approach,
                               args.batch_size,
                               world_size,
                               memory_size=args.memory_size,
                               memory_mini_batch_size=args.memory_mini_batch_size)
    # Network
    net = network.resnet50(pretrained=True).cuda()
    net = DistModule(net)
    # Approach
    Appr = approach.Approach(net, args, Tasks)

    # Solve tasks incrementally
    for t in range(len(Tasks)):
        task = Tasks[t]

        if rank == 0:
            print('*' * 100)
            print()
            print('Task {:d}: {:d} classes ({:s})'.format(
                t, task['class_num'], task['description']))
            print()
            print('*' * 100)

        Appr.solve(t, Tasks)

        if rank == 0:
            print('*' * 100)
            print('Task {:d}: {:d} classes Finished.'.format(
                t, task['class_num']))
            print('*' * 100)
Example #4
def main():
    global args, config, best_prec1
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    config = EasyDict(config['common'])
    config.save_path = os.path.dirname(args.config)

    rank, world_size = dist_init()

    # create model
    bn_group_size = config.model.kwargs.bn_group_size
    bn_var_mode = config.model.kwargs.get('bn_var_mode', 'L2')
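    # Split ranks into groups of bn_group_size for synchronised batch norm;
    # the GPUs in each group share BN statistics.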
    if bn_group_size == 1:
        bn_group = None
    else:
        assert world_size % bn_group_size == 0
        bn_group = simple_group_split(world_size, rank,
                                      world_size // bn_group_size)

    config.model.kwargs.bn_group = bn_group
    config.model.kwargs.bn_var_mode = (link.syncbnVarMode_t.L1 if bn_var_mode
                                       == 'L1' else link.syncbnVarMode_t.L2)
    model = model_entry(config.model)
    if rank == 0:
        print(model)

    model.cuda()

    if config.optimizer.type in ('FP16SGD', 'FusedFP16SGD'):
        args.fp16 = True
    else:
        args.fp16 = False

    if args.fp16:
        # if you have modules that must use fp32 parameters, and need fp32 input
        # try use link.fp16.register_float_module(your_module)
        # if you only need fp32 parameters set cast_args=False when call this
        # function, then call link.fp16.init() before call model.half()
        if config.optimizer.get('fp16_normal_bn', False):
            print('using normal bn for fp16')
            link.fp16.register_float_module(link.nn.SyncBatchNorm2d,
                                            cast_args=False)
            link.fp16.register_float_module(torch.nn.BatchNorm2d,
                                            cast_args=False)
            link.fp16.init()
        model.half()

    model = DistModule(model, args.sync)

    # create optimizer
    opt_config = config.optimizer
    opt_config.kwargs.lr = config.lr_scheduler.base_lr
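    # config.no_wd builds parameter groups so that selected parameters receive
    # no weight decay.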
    if config.get('no_wd', False):
        param_group, type2num = param_group_no_wd(model)
        opt_config.kwargs.params = param_group
    else:
        opt_config.kwargs.params = model.parameters()

    optimizer = optim_entry(opt_config)

    # optionally resume from a checkpoint
    last_iter = -1
    best_prec1 = 0
    if args.load_path:
        if args.recover:
            best_prec1, last_iter = load_state(args.load_path,
                                               model,
                                               optimizer=optimizer)
        else:
            load_state(args.load_path, model)

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # augmentation
    aug = [
        transforms.RandomResizedCrop(config.augmentation.input_size),
        transforms.RandomHorizontalFlip()
    ]

    for k in config.augmentation.keys():
        assert k in [
            'input_size', 'test_resize', 'rotation', 'colorjitter', 'colorold'
        ]
    rotation = config.augmentation.get('rotation', 0)
    colorjitter = config.augmentation.get('colorjitter', None)
    colorold = config.augmentation.get('colorold', False)

    if rotation > 0:
        aug.append(transforms.RandomRotation(rotation))

    if colorjitter is not None:
        aug.append(transforms.ColorJitter(*colorjitter))

    aug.append(transforms.ToTensor())

    if colorold:
        aug.append(ColorAugmentation())

    aug.append(normalize)

    # train
    train_dataset = McDataset(config.train_root,
                              config.train_source,
                              transforms.Compose(aug),
                              fake=args.fake)

    # val
    val_dataset = McDataset(
        config.val_root, config.val_source,
        transforms.Compose([
            transforms.Resize(config.augmentation.test_resize),
            transforms.CenterCrop(config.augmentation.input_size),
            transforms.ToTensor(),
            normalize,
        ]), args.fake)

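    # Iteration-driven sampler: yields indices for max_iter steps and can
    # resume from last_iter after recovering from a checkpoint.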
    train_sampler = DistributedGivenIterationSampler(
        train_dataset,
        config.lr_scheduler.max_iter,
        config.batch_size,
        last_iter=last_iter)
    val_sampler = DistributedSampler(val_dataset, round_up=False)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=False,
                              num_workers=config.workers,
                              pin_memory=True,
                              sampler=train_sampler)

    val_loader = DataLoader(val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            num_workers=config.workers,
                            pin_memory=True,
                            sampler=val_sampler)

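    # FP16SGD wraps an inner fp32 optimizer; the LR scheduler must drive the
    # wrapped optimizer, not the FP16 wrapper.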
    config.lr_scheduler['optimizer'] = optimizer.optimizer if isinstance(
        optimizer, FP16SGD) else optimizer
    config.lr_scheduler['last_iter'] = last_iter
    lr_scheduler = get_scheduler(config.lr_scheduler)

    if rank == 0:
        tb_logger = SummaryWriter(config.save_path + '/events')
        logger = create_logger('global_logger', config.save_path + '/log.txt')
        logger.info('args: {}'.format(pprint.pformat(args)))
        logger.info('config: {}'.format(pprint.pformat(config)))
    else:
        tb_logger = None

    if args.evaluate:
        if args.fusion_list is not None:
            validate(val_loader,
                     model,
                     fusion_list=args.fusion_list,
                     fuse_prob=args.fuse_prob)
        else:
            validate(val_loader, model)
        link.finalize()
        return

    train(train_loader, val_loader, model, optimizer, lr_scheduler,
          last_iter + 1, tb_logger)

    link.finalize()
Example #5
def validate(val_loader, model, fusion_list=None, fuse_prob=False):
    batch_time = AverageMeter(0)
    losses = AverageMeter(0)
    top1 = AverageMeter(0)
    top5 = AverageMeter(0)

    # switch to evaluate mode
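    # With fusion_list, load one model per checkpoint and ensemble their
    # outputs, optionally averaging softmax probabilities instead of logits.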
    if fusion_list is not None:
        model_list = []
        for i in range(len(fusion_list)):
            model_list.append(model_entry(config.model))
            model_list[i].cuda()
            model_list[i] = DistModule(model_list[i], args.sync)
            load_state(fusion_list[i], model_list[i])
            model_list[i].eval()
        if fuse_prob:
            softmax = nn.Softmax(dim=1)
    else:
        model.eval()

    rank = link.get_rank()
    world_size = link.get_world_size()

    logger = logging.getLogger('global_logger')

    criterion = nn.CrossEntropyLoss()

    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input = input.cuda() if not args.fp16 else input.half().cuda()
            target = target.cuda()
            # compute output
            if fusion_list is not None:
                output_list = []
                for model_idx in range(len(fusion_list)):
                    output = model_list[model_idx](input)
                    if fuse_prob:
                        output = softmax(output)
                    output_list.append(output)
                output = torch.stack(output_list, 0)
                output = torch.mean(output, 0)
            else:
                output = model(input)

            # measure accuracy and record loss
            # loss should not be scaled by world_size here; it is reduced later
            loss = criterion(output, target)
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            num = input.size(0)
            losses.update(loss.item(), num)
            top1.update(prec1.item(), num)
            top5.update(prec5.item(), num)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0 and rank == 0:
                logger.info(
                    'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(i, len(val_loader), batch_time=batch_time))

    # gather final results
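    # Each rank contributes weighted sums (avg * count); all-reduce them and
    # divide by the global sample count to recover the true averages.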
    total_num = torch.Tensor([losses.count])
    loss_sum = torch.Tensor([losses.avg * losses.count])
    top1_sum = torch.Tensor([top1.avg * top1.count])
    top5_sum = torch.Tensor([top5.avg * top5.count])
    link.allreduce(total_num)
    link.allreduce(loss_sum)
    link.allreduce(top1_sum)
    link.allreduce(top5_sum)
    final_loss = loss_sum.item() / total_num.item()
    final_top1 = top1_sum.item() / total_num.item()
    final_top5 = top5_sum.item() / total_num.item()

    if rank == 0:
        logger.info(
            ' * Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.
            format(final_top1, final_top5, final_loss, total_num.item()))

    model.train()

    return final_loss, final_top1, final_top5
Example #6
def main():
    global args, rank, world_size, best_prec1, dataset_len

    if args.dist == 1:
        rank, world_size = dist_init()
    else:
        rank = 0
        world_size = 1

    model = LeNet()
    model.cuda()
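    # Keep a full-precision (float or double) master copy of the parameters
    # for the optimizer; the model itself may be cast to half below.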
    if args.double == 1:
        param_copy = [
            param.clone().type(torch.cuda.DoubleTensor).detach()
            for param in model.parameters()
        ]
    else:
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]

    for param in param_copy:
        param.requires_grad = True

    if args.double == 1:
        model = model.double()
    if args.half == 1:
        model = model.half()
    if args.dist == 1:
        model = DistModule(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(param_copy,
                                args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    last_iter = -1

    # Data loading code
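    # MNIST is expected to already exist under ./data (download=False).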
    train_dataset = datasets.MNIST(root='./data',
                                   train=True,
                                   transform=transforms.ToTensor(),
                                   download=False)
    val_dataset = datasets.MNIST(root='./data',
                                 train=False,
                                 transform=transforms.ToTensor(),
                                 download=False)

    dataset_len = len(train_dataset)
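    # Total number of training iterations for the requested epochs, given the
    # per-rank batch size and world size.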
    args.max_iter = math.ceil(
        (dataset_len * args.epoch) / (world_size * args.batch_size))

    if args.dist == 1:
        train_sampler = DistributedGivenIterationSampler(train_dataset,
                                                         args.max_iter,
                                                         args.batch_size,
                                                         last_iter=last_iter)
        val_sampler = DistributedSampler(val_dataset, round_up=False)
    else:
        train_sampler = DistributedGivenIterationSampler(train_dataset,
                                                         args.max_iter,
                                                         args.batch_size,
                                                         world_size=1,
                                                         rank=0,
                                                         last_iter=last_iter)
        val_sampler = None

    # pin_memory if true, will copy the tensor to cuda pinned memory
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=True,
                              sampler=train_sampler)

    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True,
                            sampler=val_sampler)

    train(train_loader, val_loader, model, criterion, optimizer, param_copy)
Example #7
def main():
    global args, best_prec1, timer
    args = parser.parse_args()
    rank, world_size = dist_init(args.port)
    assert (args.batch_size % world_size == 0)
    assert (args.workers % world_size == 0)
    args.batch_size = args.batch_size // world_size
    args.workers = args.workers // world_size

    # step1: create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch.startswith('inception_v3'):
        print('inception_v3 without aux_logits!')
        image_size = 341
        input_size = 299
        model = models.__dict__[args.arch](aux_logits=False)
    elif args.arch.startswith('ir18'):
        image_size = 640
        input_size = 448
        model = IR18()
    else:
        image_size = 256
        input_size = 224
        model = models.__dict__[args.arch]()

    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        if os.path.isfile(args.pretrained):
            print("=> loading pretrained_model '{}'".format(args.pretrained))
            pretrained_model = torch.load(args.pretrained)
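            # strict=False tolerates missing or unexpected keys, e.g. a
            # classifier head of a different size.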
            model.load_state_dict(pretrained_model['state_dict'], strict=False)
            print("=> loaded pretrained_model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))
    model.cuda()
    model = DistModule(model)

    # step2: define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # step3: Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = McDataset(
        args.train_root,
        args.train_source,
        transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            # ColorAugmentation(),
            # normalize,
        ]))
    val_dataset = McDataset(
        args.val_root,
        args.val_source,
        transforms.Compose([
            transforms.Resize(image_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            # normalize,
        ]))

    train_sampler = DistributedSampler(train_dataset)
    val_sampler = DistributedSampler(val_dataset)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=False,
                              sampler=train_sampler)

    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=False,
                            sampler=val_sampler)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return
    timer = Timer(
        len(train_loader) + len(val_loader), args.epochs - args.start_epoch)
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)
        train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        if rank == 0:
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.save_path)
            print('* Best Prec 1: {best:.3f}'.format(best=best_prec1))
Example #8
def parse_rev_args(receive_msg):
    """ parse reveive msgs to global variable
    """
    global trainloader
    global testloader
    global trainsampler
    global testsampler
    global net
    global criterion
    global optimizer
    global rank, world_size

    # Loading Data
    if rank == 0:
        logger.debug("Preparing data..")

    transform_train, transform_test = utils.data_transforms_cifar10(args)

    dataPath = os.environ["HOME"] + "/mountdir/data/"
    trainset = torchvision.datasets.CIFAR10(root=dataPath,
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    #
    # trainsampler = DistributedSampler(trainset)
    #
    # trainloader = torch.utils.data.DataLoader(
    #     trainset, batch_size=args.batch_size_per_gpu, shuffle=False, num_workers=args.workers,
    #     pin_memory=False, sampler=trainsampler
    # )
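    # NOTE: the commented-out block above is the only place the globals
    # `trainsampler` and `trainloader` would be assigned in this function.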

    testset = torchvision.datasets.CIFAR10(root=dataPath,
                                           train=False,
                                           download=True,
                                           transform=transform_test)

    testsampler = DistributedSampler(testset)

    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=False,
                                             sampler=testsampler)
    if rank == 0:
        print("len(trainset)=" + str(len(trainset)))
        print("len(testset)=" + str(len(testset)))

    # Model
    if rank == 0:
        logger.debug("Building model..")
    net = build_graph_from_json(receive_msg)

    net = net.to(device)
    net = DistModule(net)
    criterion = nn.CrossEntropyLoss()

    # if args.optimizer == "SGD":
    #     optimizer = optim.SGD(
    #         net.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=5e-4
    #     )
    # if args.optimizer == "Adadelta":
    #     optimizer = optim.Adadelta(net.parameters(), lr=args.learning_rate)
    # if args.optimizer == "Adagrad":
    #     optimizer = optim.Adagrad(net.parameters(), lr=args.learning_rate)
    # if args.optimizer == "Adam":
    #     optimizer = optim.Adam(net.parameters(), lr=args.learning_rate)
    # if args.optimizer == "Adamax":
    #     optimizer = optim.Adamax(net.parameters(), lr=args.learning_rate)
    # if args.optimizer == "RMSprop":
    #     optimizer = optim.RMSprop(net.parameters(), lr=args.learning_rate)
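    # NOTE: every optimizer option above is commented out, so the global
    # `optimizer` is not assigned here.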

    cudnn.benchmark = True

    return 0