Code example #1
def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    #    args.gpu = 0
    if args.distributed:
        #       args.gpu = args.rank % torch.cuda.device_count()
        #      torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()
    if args.fp16: model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
        #args.lr *= n_dev
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev
        #args.lr *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = (
        torch.utils.data.distributed.DistributedSampler(train_dataset)
        if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(int(args.sz * 1.14)),
            transforms.CenterCrop(args.sz),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed: train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof: break
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
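
The fp32 `param_copy` built above is the master-weights half of the usual mixed-precision recipe: the model computes fp16 gradients, but the optimizer steps on fp32 copies. A minimal sketch of how such a master copy is typically consumed inside the training step follows; the helper name and the `scale` argument are illustrative assumptions, not code from the project above.

def fp16_optimizer_step(model, param_copy, optimizer, loss, scale=1.0):
    # backward pass on the fp16 model (optionally with static loss scaling)
    model.zero_grad()
    (loss * scale).backward()
    # copy fp16 gradients into the fp32 master params and undo the scaling
    for master, param in zip(param_copy, model.parameters()):
        if param.grad is None:
            continue
        if master.grad is None:
            master.grad = master.data.new(*master.data.size())
        master.grad.data.copy_(param.grad.data)
        master.grad.data.div_(scale)
    # step in fp32, then push the updated master weights back into the model
    optimizer.step()
    for master, param in zip(param_copy, model.parameters()):
        param.data.copy_(master.data)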
Code example #2
def main():
    print("~~epoch\thours\ttop1Accuracy\n")
    start_time = datetime.now()
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained: model = models.__dict__[args.arch](pretrained=True)
    else: model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()
    if args.fp16: model = network_to_half(model)
    if args.distributed: model = DDP(model)
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    best_prec1 = 0
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    train_loader, val_loader, train_sampler = get_loaders(traindir, valdir)
    val_sampler = None

    if args.evaluate:
        return validate(val_loader, model, criterion, args.start_epoch, start_time)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed: train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)
        if epoch == args.epochs - 6:
            args.sz = 288
            args.batch_size = 128
            train_loader, val_loader, train_sampler, val_sampler = get_loaders(
                traindir, valdir, use_val_sampler=False, min_scale=0.5)

        if args.distributed and val_sampler is not None:
            val_sampler.set_epoch(epoch)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            train(train_loader, model, criterion, optimizer, epoch)

        if args.prof: break
        prec1 = validate(val_loader, model, criterion, epoch, start_time)

        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
Code example #3
File: train_adam.py  Project: ypwhs/doodle
                                    transform=transform,
                                    num_workers=num_workers)

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())

epoch = 0
if args.checkpoint:
    load_checkpoint(model, optimizer, args.checkpoint)
    tag = args.checkpoint.split('_')[-1].split('.')[0]
    if tag.isnumeric():
        epoch = int(tag)

if args.half:
    model = network_to_half(model)
model = model.cuda()
model.name = f'{args.model}_{args.tag}'

hvd.broadcast_parameters(model.state_dict(), root_rank=0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                            step_size=40,
                                            gamma=0.5623)

# training
for i in range(epoch):
    scheduler.step()

for i in range(epoch, 400):
    epoch += 1
    scheduler.step()
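
The two loops above simply fast-forward the StepLR schedule to the resumed epoch: replaying `scheduler.step()` n times reproduces lr * gamma ** (n // step_size). A tiny self-contained illustration (the dummy parameter and the resume epoch of 120 are made up):

import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD(params, lr=0.1)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=40, gamma=0.5623)
for _ in range(120):              # pretend the checkpoint was saved at epoch 120
    sched.step()
print(opt.param_groups[0]['lr'])  # ~0.1 * 0.5623 ** 3, i.e. three decays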
Code example #4
elif args.net == 'mobilenet':
    net = MobileNetV2()
# net = DPN92()
elif args.net == 'shufflenet':
    net = ShuffleNetv2()
elif args.net == 'efficientnet':
    net = Efficientnet()
else:
    print("{} not found").format(args.net)

net = net.to(device)
print(net)

if args.fp16:
    from fp16util import network_to_half
    net = network_to_half(net)

if device == 'cuda':
    # net = torch.nn.DataParallel(net) # make parallel
    """ can't use dataparallel for onnx..
    see https://github.com/pytorch/pytorch/issues/13397 """
    cudnn.benchmark = True

if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/{}-ckpt.t7'.format(args.net))
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']
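
Several of these snippets rely on an fp16util-style `network_to_half` helper. A hedged sketch of the usual recipe (not copied from this project): prepend a module that casts inputs to half precision and convert everything except BatchNorm, whose weights and running statistics stay in fp32 for numerical stability.

import torch.nn as nn

class tofp16(nn.Module):
    """Cast incoming tensors to half precision."""
    def forward(self, x):
        return x.half()

def BN_convert_float(module):
    # keep BatchNorm layers (and their running stats) in fp32
    if isinstance(module, nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module

def network_to_half(network):
    return nn.Sequential(tofp16(), BN_convert_float(network.half()))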
Code example #5
File: micro_bench.py  Project: mila-iqia/training
def run_benchmarking(net,
                     batch_size,
                     iterations,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
        num_devices = len(
            device_ids) if device_ids is not None else torch.cuda.device_count(
            )

    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=device_ids)
        num_devices = len(
            device_ids) if device_ids is not None else torch.cuda.device_count(
            )

    else:
        num_devices = 1

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")

    if run_fp16:
        inp = inp.half()

    target = torch.randint(
        0, 1, size=(batch_size, ),
        device='cuda')  # torch.arange(batch_size, device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)

    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)

    torch.cuda.synchronize()

    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations
    rank = (distributed_parameters or {}).get('rank', -1)
    world_size = (distributed_parameters or {}).get('world_size', 1)

    process_report = {
        'model': net,
        'rank': rank,
        'num_device': num_devices,
        'batch_size': batch_size,
        'batch_time': time_per_batch,
        'speed': batch_size / time_per_batch
    }

    with open(f'{tmp}/process_report_{rank}.json', 'w') as report:
        json.dump(process_report, report)

    if rank == 0:
        overall_report = {
            'world_size': world_size,
            'batch_size': batch_size * world_size,
            'batch_time': time_per_batch,
            'speed': batch_size * world_size / time_per_batch
        }
        with open(f'{tmp}/overall_report.json', 'w') as report:
            json.dump(overall_report, report)
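
The benchmark repeatedly calls a `forwardbackward` helper defined elsewhere in the file; a minimal sketch consistent with how it is invoked above (the cross-entropy criterion is an assumption):

import torch.nn.functional as F

def forwardbackward(inp, optimizer, network, target):
    optimizer.zero_grad()
    output = network(inp)
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()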
Code example #6
File: main.py  Project: Henley13/imagenet-fast
def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()
    if args.fp16: model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
        #args.lr *= n_dev
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev
        #args.lr *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [param.clone().type(torch.cuda.FloatTensor).detach() for param in model.parameters()]
        for param in param_copy: param.requires_grad = True
    else: param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else: print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset)
                     if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(int(args.sz*1.14)),
            transforms.CenterCrop(args.sz),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed: train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof: break
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
Code example #7
File: main.py  Project: lcskrishna/examples
def main():
    global args, best_prec1
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    args.distributed = args.world_size > 1
    print ("INFO: args.distributed values is : {} and value of worldsize is {}".format(args.distributed, args.world_size))
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.gpu is not None:
        model = model.cuda(args.gpu)
    elif args.distributed:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    if args.fp16:
        model = network_to_half(model)
    
    global param_copy
    if args.fp16:
        param_copy = [param.clone().type(torch.cuda.FloatTensor).detach() for param in model.parameters()]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(param_copy, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)
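
`adjust_learning_rate` is referenced but not shown; the upstream PyTorch ImageNet example, which this script follows, decays the initial learning rate by a factor of 10 every 30 epochs:

def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs."""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr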
Code example #8
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--fp16',
                        type=int,
                        default=0,
                        required=False,
                        help='undergo fp16 training')
    parser.add_argument('--scale_factor',
                        type=float,
                        default=1,
                        help='Loss scale factor for fp16 training')

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if args.fp16:
        assert torch.backends.cudnn.enabled

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = Net().to(device)
    if args.fp16:
        print("INFO: training the network for fp16")
        model = network_to_half(model)

    global param_copy
    if args.fp16:
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    optimizer = optim.SGD(param_copy, lr=args.lr, momentum=args.momentum)

    print(model)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
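
`Net` is not part of this snippet; the classic PyTorch MNIST example defines a small convnet along these lines (layer sizes follow that upstream example and are not taken from the code above):

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)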
Code example #9
def run_benchmarking(net,
                     batch_size,
                     iterations,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    print('Total parameters:', count_parameters(network))

    if (run_fp16):
        network = network_to_half(network)

    if (dataparallel):
        network = torch.nn.DataParallel(network, device_ids=device_ids)
        num_devices = len(
            device_ids) if device_ids is not None else torch.cuda.device_count(
            )
    elif (distributed_dataparallel):
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=device_ids)
        num_devices = len(
            device_ids) if device_ids is not None else torch.cuda.device_count(
            )
    else:
        num_devices = 1

    if (net == "inception_v3"):
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if (run_fp16):
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")
    param_copy = network.parameters()
    if (run_fp16):
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
    torch.cuda.synchronize()

    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if (distributed_dataparallel):
        print("--------This process: rank " +
              str(distributed_parameters['rank']) + "--------")
    print("Num devices: {}".format(num_devices))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if (distributed_dataparallel):
        print("")
        print(
            "--------Overall (all ranks) (assuming same num/type devices for each rank)--------"
        )
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(num_devices * world_size))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size /
                                                 time_per_batch))
Code example #10
def run_benchmarking(local_rank, ngpus, net, batch_size, iterations, run_fp16, dataparallel, distributed_dataparallel, device_ids=None, distributed_parameters=None):
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if (run_fp16):
        network = network_to_half(network)

    if (dataparallel):
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print ("INFO: Running dataparallel on devices: {}".format(str(devices_to_run_on)))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif (distributed_dataparallel):
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)]
        print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on)))
        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
        batch_size = int(batch_size / ngpus)

    if (net == "inception_v3"):
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if (run_fp16):
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")
    param_copy = network.parameters()
    if (run_fp16):
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr = 0.01, momentum = 0.9)

    ## warmup.
    print ("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print ("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
        if (i % 10 == 0):
            print (time.asctime(time.localtime(time.time())) + " INFO: iteration " + str(i) + " completed.")
    torch.cuda.synchronize()
    
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print ("OK: finished running benchmark..")
    print ("--------------------SUMMARY--------------------------")
    print ("Microbenchmark for network : {}".format(net))
    if (distributed_dataparallel):
        print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------")
        print ("Num devices: 1")
    else:
        print ("Num devices: {}".format(ngpus))
    print ("Mini batch size [img] : {}".format(batch_size))
    print ("Time per mini-batch : {}".format(time_per_batch))
    print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch))
    if (distributed_dataparallel):
        print ("")
        print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print ("Num devices: {}".format(world_size))
        print ("Mini batch size [img] : {}".format(batch_size*world_size))
        print ("Time per mini-batch : {}".format(time_per_batch))
        print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch))
def run_benchmarking(local_rank,
                     ngpus,
                     net,
                     batch_size,
                     iterations,
                     prof_step,
                     amp_opt_level,
                     run_fp16,
                     dataparallel,
                     distributed_dataparallel,
                     device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if "shufflenet" == net:
        network.apply(weight_init)

    if (run_fp16):
        network = network_to_half(network)

    if (dataparallel):
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print("INFO: Running dataparallel on devices: {}".format(
            str(devices_to_run_on)))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif (distributed_dataparallel):
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [
            (device_ids[local_rank] if device_ids else local_rank)
        ]
        print("INFO: Rank {} running distributed_dataparallel on devices: {}".
              format(distributed_parameters['rank'], str(devices_to_run_on)))
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=devices_to_run_on)
        batch_size = int(batch_size / ngpus)

    if (net == "inception_v3"):
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if (run_fp16):
        inp = inp.half()
    if net in models:
        # number of classes is 1000 for imagenet
        target = torch.randint(0, 1000, (batch_size, ), device="cuda")
    elif net in segmentation_models:
        # number of classes is 21 for segmentation
        target = torch.randint(0, 21, (batch_size, ), device="cuda")
    param_copy = network.parameters()
    if (run_fp16):
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    if (amp_opt_level):
        network, optimizer = apex.amp.initialize(network,
                                                 optimizer,
                                                 opt_level="O%d" %
                                                 amp_opt_level)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target, amp_opt_level)
    forwardbackward(inp, optimizer, network, target, amp_opt_level)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        if i == prof_step:
            forwardbackward(inp, optimizer, network, target, amp_opt_level, i)
        else:
            forwardbackward(inp, optimizer, network, target, amp_opt_level)
    torch.cuda.synchronize()

    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    if run_fp16:
        dtype = 'FP16'
    elif amp_opt_level == 1:
        dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.'
    elif amp_opt_level == 2:
        dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.'
    elif amp_opt_level == 3:
        dtype = 'AMP-O3: Pure FP16 training.'
    elif amp_opt_level == 4:
        dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.'
    elif amp_opt_level == 5:
        dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.'
    else:
        dtype = 'FP32'

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if (distributed_dataparallel):
        print("--------This process: rank " +
              str(distributed_parameters['rank']) + "--------")
        print("Num devices: 1")
    else:
        print("Num devices: {}".format(ngpus))
    print("Dtype: {}".format(dtype))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if (distributed_dataparallel):
        print("")
        print(
            "--------Overall (all ranks) (assuming same num/type devices for each rank)--------"
        )
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(world_size))
        print("Dtype: {}".format(dtype))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size /
                                                 time_per_batch))
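
In this last variant `forwardbackward` also receives `amp_opt_level`; a hedged sketch of how such a helper might branch between a plain backward pass and apex.amp loss scaling (assumes apex is installed; argument names mirror the call sites above):

import apex.amp
import torch.nn.functional as F

def forwardbackward(inp, optimizer, network, target, amp_opt_level, prof_iter=None):
    optimizer.zero_grad()
    loss = F.cross_entropy(network(inp), target)
    if amp_opt_level:
        # apex.amp handles loss scaling once amp.initialize has been called
        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()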