Example 1
def main(_run, _config, experiment, experiment_variant, world_size, rank,
         init_method, data_dir, batch_size, val_batch_size, out_dir, momentum,
         wd, lr, epochs, device):

    cudnn.benchmark = True
    torch.cuda.set_device(device)
    is_master = rank == 0
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    out_dir = pt.join(out_dir, experiment_variant)
    if not pt.exists(out_dir):
        os.makedirs(out_dir)

    train = make_loader(pt.join(data_dir, 'train.msgpack'),
                        batch_size,
                        world_size,
                        rank,
                        image_rng=AUGMENTATION_TRAIN)
    val = make_loader(pt.join(data_dir, 'val.msgpack'),
                      val_batch_size,
                      world_size,
                      rank,
                      image_params={'scale': 256 / 224})

    network = get_network(experiment, experiment_variant)
    network = Normalize(module=network).to(device)
    network = nn.parallel.DistributedDataParallel(network, device_ids=[device])
    network = Unpacker(network)

    optimizer, policy = make_policy(epochs, network, lr, momentum, wd)

    loss = CrossEntropyLoss(target_key='label').to(device)

    trainer = Trainer(network,
                      optimizer,
                      loss,
                      AccuracyMetric(output_key='probs'),
                      policy,
                      None,
                      train,
                      val,
                      out_dir,
                      snapshot_interval=5 if is_master else None,
                      quiet=not is_master)

    start = datetime.now()
    with train:
        with val:
            trainer.train(epochs, start_epoch=0)

    print('Total Time taken: ', datetime.now() - start)
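
The examples here receive rank, world_size, and init_method from their (Sacred-style) configuration and then join an NCCL process group. Purely as a point of reference, the minimal sketch below shows one common way to launch one such worker per GPU with torch.multiprocessing.spawn; the worker function and the TCP address are illustrative placeholders, not part of the original code.

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size, init_method):
    # each spawned process gets its own rank and binds to one GPU
    torch.cuda.set_device(rank)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)
    # ... build loaders, model, and Trainer here, as in the example above ...
    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(worker,
             args=(world_size, 'tcp://127.0.0.1:23456'),  # illustrative address
             nprocs=world_size)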
Example 2
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         val_batch_size, num_workers, outdir, outdir_prefix, lr, wd,
         bn_momentum, bn_correct, warmup, num_epochs, resume, finetune, size,
         nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # rank 0 creates experiment observer
    is_master = rank == 0

    # rank joins process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        size,
        # this parameter controls whether augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # lr is scaled linearly relative to the reference batch size of 256
    world_batch_size = world_size * batch_size
    k = world_batch_size / 256
    lr = k * lr
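    # e.g. with world_size=4 and batch_size=64 the world batch size is 256,
    # so k = 1.0 and lr is unchanged; with world_size=8 and batch_size=64 it
    # is 512, so k = 2.0 and lr doubles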

    # outdir stuff
    if outdir is None:
        outdir = pt.join(outdir_prefix, '%dgpu' % (world_size, ))

    model = Net(num_classes=1000, batch_size=batch_size)

    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    #model = Unpacker(model)

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9)
    print('\n policy defined')

    # loss for autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # loss for the classifier (defined but not passed to the Trainer in this example)
    classifier_loss = CrossEntropyLoss(output_key='probs',
                                       target_key='label').to(device)
    trainer = Trainer(model,
                      optimizer,
                      loss,
                      None,
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=5 if is_master else None,
                      quiet=rank != 0)

    print('\n trainer has been initialized')

    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)

    print("Training complete in: " + str(datetime.now() - start))
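
make_policy itself is not shown in these examples. The sketch below is a hypothetical stand-in, assuming it returns a plain SGD optimizer plus a warmup-then-linear-decay schedule built with torch.optim.lr_scheduler.LambdaLR; the project's actual function may differ.

import torch


def make_policy(num_epochs, model, lr, momentum, wd=1e-4, warmup=5):
    # SGD over all parameters; lr is assumed to be pre-scaled to the world batch size
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                momentum=momentum,
                                weight_decay=wd)

    def schedule(epoch):
        # linear warmup for the first `warmup` epochs, then linear decay
        if epoch < warmup:
            return (epoch + 1) / warmup
        return max(0.0, 1.0 - (epoch - warmup) / max(1, num_epochs - warmup))

    policy = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=schedule)
    return optimizer, policy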
Example 3
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         num_workers, outdir_suffix, outdir, lr, wd, warmup, num_epochs,
         nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # rank 0 creates experiment observer
    is_master = rank == 0

    # rank joins process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        # this parameter controls whether augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # lr is scaled linearly to original batch size of 256
    # world_batch_size = world_size * batch_size
    # k = world_batch_size / 256
    # lr = k * lr

    # outdir stuff
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)

    model = Net(num_classes=500, batch_size=batch_size)
    print('\n network parameters ', len(list(model.parameters())))

    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    #model = Unpacker(model)

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)

    # loss for autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # this loss is for the classifier
    classifier_loss = CrossEntropyLoss(output_key='probs',
                                       target_key='label').to(device)
    trainer = Trainer(model,
                      optimizer,
                      loss,
                      classifier_loss,
                      rank,
                      AccuracyMetric(output_key='softmax_output',
                                     target_key='label'),
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)

    print('\n Number of epochs: ', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)

    print("Training complete in: " + str(datetime.now() - start))
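
Example 3 hands both the L1 reconstruction loss and the cross-entropy classifier loss to the Trainer; how the Trainer combines them is not shown. Purely as an illustration (and assuming the 'probs' entry holds raw logits, as F.cross_entropy expects), a weighted sum of the two terms could look like this:

import torch.nn.functional as F


def combined_loss(output, target_image, logits, label, alpha=1.0):
    # reconstruction term: autoencoder output vs. the target image
    recon = F.l1_loss(output, target_image)
    # classification term: logits vs. integer class labels
    clf = F.cross_entropy(logits, label)
    return recon + alpha * clf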
Example 4
		# only freeze the conv layers, which form the first child of the network
		ct = 0
		for child in vgg.children():
			if ct < 1:
				for name, param in child.named_parameters():
					param.requires_grad = False

			ct += 1

	# wrap the vgg network with Normalize and Unpacker
	vgg = Normalize(module=vgg).to(device)
	vgg = nn.parallel.DistributedDataParallel(vgg, device_ids=[device])
	vgg = Unpacker(vgg)

	# define the optimizer over only the parameters that require gradients
	optimizer, policy = make_policy(epochs, vgg, lr, momentum, wd)

	loss = CrossEntropyLoss(target_key='label').to(device)

	trainer = Trainer(vgg, optimizer, loss, AccuracyMetric(output_key='probs'),
		policy, None, train, val, out_dir,
		snapshot_interval=5 if is_master else None,
		quiet=not is_master)

	start = datetime.now()
	with train:
		with val:
			trainer.train(epochs, start_epoch=0)

	print('Total Time taken: ', datetime.now() - start)
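
A minimal, self-contained sketch of the freeze-then-train pattern in this last example: take a torchvision VGG, freeze its convolutional feature extractor (the first child module), and build an SGD optimizer over only the parameters that still require gradients. The optimizer settings are illustrative; the example's own make_policy may do more.

import torch
import torchvision

vgg = torchvision.models.vgg16(pretrained=True)  # newer torchvision versions use the weights= argument

# freeze the convolutional feature extractor, i.e. the first child module ('features')
for param in next(vgg.children()).parameters():
    param.requires_grad = False

# optimize only the parameters that still require gradients (the classifier head)
trainable = [p for p in vgg.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(trainable, lr=0.01, momentum=0.9, weight_decay=1e-4)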