Example no. 1
def main(_run, _config, world_size, rank, init_method, datadir, outdir_suffix,
         batch_size, num_workers, outdir, lr, wd, num_epochs, nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # rank 0 creates experiment observer
    is_master = rank == 0

    # rank joins process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        # this is the parameter that controls whether augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # exp is assumed to be defined at module level (it is not defined in this snippet)
    print('\n experiment name ', exp)
    # outdir stuff
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)

    model = Net(batch_size=batch_size)

    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)

    # loss for autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    trainer = Trainer(model,
                      optimizer,
                      loss,
                      rank,
                      MSELossMetric(),
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)

    print('\n Number of epochs: ', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)

    print("Training complete in: " + str(datetime.now() - start))
Example no. 2
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         val_batch_size, num_workers, outdir, outdir_prefix, lr, wd,
         bn_momentum, bn_correct, warmup, num_epochs, resume, finetune, size,
         nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # rank 0 creates experiment observer
    is_master = rank == 0

    # rank joins process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        size,
        # this is the parameter that controls whether augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # lr is scaled linearly with the world batch size, relative to the reference batch size of 256
    world_batch_size = world_size * batch_size
    k = world_batch_size / 256
    lr = k * lr
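    # worked example of the scaling (arithmetic only, not part of the original snippet):
    # world_size=4, batch_size=64 -> world_batch_size=256, k=1.0, lr unchanged;
    # world_size=8, batch_size=64 -> world_batch_size=512, k=2.0, lr doubled.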

    # outdir stuff
    if outdir is None:
        outdir = pt.join(outdir_prefix, '%dgpu' % (world_size, ))

    model = Net(num_classes=1000, batch_size=batch_size)

    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    #model = Unpacker(model)

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9)
    print('\n policy defined')

    # loss for autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # this loss is for the classifier (defined here but not passed to the Trainer below)
    classifier_loss = CrossEntropyLoss(output_key='probs',
                                       target_key='label').to(device)
    trainer = Trainer(model,
                      optimizer,
                      loss,
                      None,
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=5 if is_master else None,
                      quiet=rank != 0)

    print('\n trainer has been initialized')

    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)

    print("Training complete in: " + str(datetime.now() - start))
Example no. 3
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         num_workers, outdir_suffix, outdir, lr, wd, warmup, num_epochs,
         nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # rank 0 creates experiment observer
    is_master = rank == 0

    # rank joins process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        # this is the parameter that controls whether augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # lr is scaled linearly to original batch size of 256
    # world_batch_size = world_size * batch_size
    # k = world_batch_size / 256
    # lr = k * lr

    # outdir stuff
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)

    model = Net(num_classes=500, batch_size=batch_size)
    # len(list(model.parameters())) counts parameter tensors, not individual weights
    print('\n network parameters ', len(list(model.parameters())))

    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    #model = Unpacker(model)

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)

    # loss for autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # this loss is for the classifier
    classifier_loss = CrossEntropyLoss(output_key='probs',
                                       target_key='label').to(device)
    trainer = Trainer(model,
                      optimizer,
                      loss,
                      classifier_loss,
                      rank,
                      AccuracyMetric(output_key='softmax_output',
                                     target_key='label'),
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)

    print('\n Number of epochs: ', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)

    print("Training complete in: " + str(datetime.now() - start))