def main(_run, _config, world_size, rank, init_method, datadir, outdir_suffix,
         batch_size, num_workers, outdir, lr, wd, num_epochs, nsamples):
    cudnn.benchmark = True
    # device is set by CUDA_VISIBLE_DEVICES
    device = torch.device('cuda:0')
    torch.cuda.set_device(device)
    # rank 0 creates the experiment observer
    is_master = rank == 0
    # each rank joins the process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            init_method=init_method)
    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size, device, world_size, rank, num_workers,
        # this is the parameter based on which augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples,
    )
    print('\n experiment name ', exp)
    # outdir stuff
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)
    model = Net(batch_size=batch_size)
    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)
    # loss for the autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    trainer = Trainer(model, optimizer, loss, rank,
                      MSELossMetric(),
                      policy, None, train, None, outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)
    print('\n number of epochs: ', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)
    print('Training complete in: ' + str(datetime.now() - start))
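# A minimal, self-contained sketch (not part of this repo) of how a per-rank
# entry point like main() above is typically launched: one process per GPU joins
# the same NCCL process group via a shared init_method, binds its own device,
# wraps its model in DistributedDataParallel, and trains on its shard of the
# data. The worker function, toy model, and rendezvous address below are
# illustrative assumptions, not the project's actual launcher.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn


def _demo_worker(rank, world_size, init_method):
    # each rank binds to its own GPU and joins the process group
    torch.cuda.set_device(rank)
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            init_method=init_method)
    model = nn.Linear(32, 32).cuda(rank)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    # ... build loader, optimizer, trainer and call trainer.train() here ...
    dist.destroy_process_group()


def _demo_launch():
    # one process per visible GPU; a file:// or env:// rendezvous works the same way
    world_size = torch.cuda.device_count()
    init_method = 'tcp://127.0.0.1:23456'
    mp.spawn(_demo_worker, args=(world_size, init_method), nprocs=world_size)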
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         val_batch_size, num_workers, outdir, outdir_prefix, lr, wd,
         bn_momentum, bn_correct, warmup, num_epochs, resume, finetune,
         size, nsamples):
    cudnn.benchmark = True
    # device is set by CUDA_VISIBLE_DEVICES
    device = torch.device('cuda:0')
    torch.cuda.set_device(device)
    # rank 0 creates the experiment observer
    is_master = rank == 0
    # each rank joins the process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            init_method=init_method)
    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size, device, world_size, rank, num_workers, size,
        # this is the parameter based on which augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples,
    )
    # lr is scaled linearly relative to the reference batch size of 256
    world_batch_size = world_size * batch_size
    k = world_batch_size / 256
    lr = k * lr
    # outdir stuff
    if outdir is None:
        outdir = pt.join(outdir_prefix, '%dgpu' % (world_size,))
    model = Net(num_classes=1000, batch_size=batch_size)
    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    # model = Unpacker(model)
    optimizer, policy = make_policy(num_epochs, model, lr, 0.9)
    print('\n policy defined')
    # loss for the autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # this loss is for the classifier
    classifier_loss = CrossEntropyLoss(output_key='probs', target_key='label').to(device)
    trainer = Trainer(model, optimizer, loss, None,
                      policy, None, train, None, outdir,
                      snapshot_interval=5 if is_master else None,
                      quiet=rank != 0)
    print('\n trainer has been initialized')
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)
    print('Training complete in: ' + str(datetime.now() - start))
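# The lr rescaling above follows the linear scaling rule: the base learning
# rate (tuned for a batch size of 256) is multiplied by
# k = world_size * batch_size / 256. The `warmup` argument suggests the rate is
# also ramped up gradually, but that schedule lives inside make_policy; the
# helper below is only an illustrative sketch of the combined rule under that
# assumption.
def _scaled_lr(base_lr, world_size, batch_size, epoch, warmup_epochs=5):
    """Illustrative only: linearly scaled learning rate with gradual warmup."""
    k = (world_size * batch_size) / 256.0
    target_lr = k * base_lr
    if warmup_epochs and epoch < warmup_epochs:
        # ramp from base_lr to target_lr over the first warmup_epochs epochs
        return base_lr + (target_lr - base_lr) * (epoch + 1) / warmup_epochs
    return target_lr

# e.g. 4 GPUs x batch 64 -> world batch 256 -> k = 1.0, lr unchanged;
#      8 GPUs x batch 64 -> world batch 512 -> k = 2.0, lr doubled after warmup.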
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         num_workers, outdir_suffix, outdir, lr, wd, warmup, num_epochs, nsamples):
    cudnn.benchmark = True
    # device is set by CUDA_VISIBLE_DEVICES
    device = torch.device('cuda:0')
    torch.cuda.set_device(device)
    # rank 0 creates the experiment observer
    is_master = rank == 0
    # each rank joins the process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            init_method=init_method)
    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size, device, world_size, rank, num_workers,
        # this is the parameter based on which augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples,
    )
    # lr is scaled linearly relative to the reference batch size of 256
    # world_batch_size = world_size * batch_size
    # k = world_batch_size / 256
    # lr = k * lr
    # outdir stuff
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)
    model = Net(num_classes=500, batch_size=batch_size)
    print('\n network parameters ', len(list(model.parameters())))
    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    # model = Unpacker(model)
    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)
    # loss for the autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # this loss is for the classifier
    classifier_loss = CrossEntropyLoss(output_key='probs', target_key='label').to(device)
    trainer = Trainer(model, optimizer, loss, classifier_loss, rank,
                      AccuracyMetric(output_key='softmax_output', target_key='label'),
                      policy, None, train, None, outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)
    print('\n number of epochs: ', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)
    print('Training complete in: ' + str(datetime.now() - start))
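# This variant hands the Trainer both a reconstruction loss (L1 on the decoded
# image) and a classification loss (cross-entropy on the label). How the Trainer
# weights the two terms is internal to this repo; the snippet below is only an
# illustrative sketch of such a combined objective using plain torch losses and
# an assumed weighting factor `cls_weight`.
import torch
import torch.nn.functional as F


def _combined_loss(output_image, target_image, logits, label, cls_weight=1.0):
    """Illustrative only: L1 reconstruction loss plus weighted cross-entropy."""
    recon = F.l1_loss(output_image, target_image)
    cls = F.cross_entropy(logits, label)
    return recon + cls_weight * cls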