def test_already_has_grad():
    """balance_by_time raises ValueError when a parameter already holds a gradient."""
    net = nn.Sequential(nn.Conv2d(3, 3, 1))
    batch = torch.rand(1, 3, 32, 32)

    # Run one backward pass so the convolution's parameters carry .grad
    # before profiling is attempted.
    loss = net(batch).norm()
    loss.backward()

    expected_message = 'some parameter already has gradient'
    with pytest.raises(ValueError, match=expected_message):
        balance_by_time(1, net, batch, device='cpu')
def test_balance_by_time_tuple():
    """balance_by_time runs on a model whose layers exchange a tuple of tensors."""

    class Twin(nn.Module):
        # Emits the input together with a detached copy of it.
        def forward(self, tensor):
            detached = tensor.detach()
            return tensor, detached

    class Add(nn.Module):
        # Consumes the pair produced by Twin and sums its two members.
        def forward(self, pair):
            left, right = pair
            return left + right

    layers = [Twin(), Add()]
    net = nn.Sequential(*layers)
    batch = torch.rand(1, requires_grad=True)

    # The test passes as long as profiling completes without raising.
    balance_by_time(1, net, batch, device='cpu')
def test_sandbox_during_profiling(device):
    """The model's state_dict is unchanged after balance_by_time profiles it."""
    net = nn.Sequential(nn.BatchNorm2d(3))

    # Clone every entry so later comparisons are against independent copies
    # rather than tensors the module itself may update.
    snapshot = {}
    for name, tensor in net.state_dict().items():
        snapshot[name] = tensor.clone()

    batch = torch.rand(1, 3, 10, 10)
    balance_by_time(1, net, batch, device=device)

    current = net.state_dict()
    assert snapshot.keys() == current.keys()
    for name in snapshot:
        assert torch.allclose(current[name], snapshot[name]), name
def test_not_training():
    """A model in eval mode is profiled in training mode and is left in eval mode."""

    class AssertTraining(nn.Module):
        # Fails the forward pass unless the module is in training mode.
        def forward(self, tensor):
            assert self.training
            return tensor

    # Module.eval() returns the module itself, so it can be chained.
    net = nn.Sequential(AssertTraining()).eval()
    assert not net.training

    batch = torch.rand(1)
    balance_by_time(1, net, batch, device='cpu')

    # The caller's eval mode must survive profiling.
    assert not net.training
def test_balance_by_time_loop_resets_input():
    """Conv -> flatten -> linear on a 10x3x8x8 sample is balanced as [1, 2]."""

    # A local stand-in is used because nn.Flatten was introduced at
    # PyTorch 1.2.0.
    class Flatten(nn.Module):
        def forward(self, tensor):
            return torch.flatten(tensor, 1)

    layers = [
        nn.Conv2d(3, 2, 1),
        Flatten(),
        nn.Linear(128, 10),
    ]
    net = nn.Sequential(*layers)
    batch = torch.rand(10, 3, 8, 8)

    partition_sizes = balance_by_time(2, net, batch, device='cpu')
    assert partition_sizes == [1, 2]
def test_balance_by_time(device):
    """Six layers sleeping 0.01..0.06 s are split into partitions of 4 and 2 layers."""

    class Delay(nn.Module):
        # Identity layer whose forward pass blocks for a fixed duration.
        def __init__(self, seconds):
            super().__init__()
            self.seconds = seconds

        def forward(self, tensor):
            time.sleep(self.seconds)
            return tensor

    # Layer k (1-based) sleeps k / 100 seconds.
    net = nn.Sequential(*(Delay(step / 100) for step in range(1, 7)))
    batch = torch.rand(1)

    partition_sizes = balance_by_time(2, net, batch, device=device)
    assert partition_sizes == [4, 2]
def main():
    """Run the D-DNN ImageNet benchmark: parse options, pipeline the model, train, report.

    Steps, in order:
      1. Parse the command line (architecture, SGD hyper-parameters, data mode).
      2. Seed torch, enable cudnn benchmarking and move the chosen model to GPU.
      3. Profile the model with balance_by_time over all visible CUDA devices
         and wrap it in GPipe.
      4. Build the SGD optimizer and the train/validation ImageFolder loaders.
      5. Run `epochs` training epochs, evaluate, and print the validation
         accuracy with the per-epoch average throughput and elapsed time.
    """
    cli = argparse.ArgumentParser(description='D-DNN imagenet benchmark')
    cli.add_argument(
        '-a', '--arch',
        metavar='ARCH',
        default='resnet50',
        choices=model_names,
        help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet50)',
    )
    cli.add_argument(
        '--lr', '--learning-rate',
        default=0.1,
        type=float,
        metavar='LR',
        help='initial learning rate',
        dest='lr',
    )
    cli.add_argument(
        '--momentum',
        default=0.9,
        type=float,
        metavar='M',
        help='momentum',
    )
    cli.add_argument(
        '--wd', '--weight-decay',
        default=1e-4,
        type=float,
        metavar='W',
        help='weight decay (default: 1e-4)',
        dest='weight_decay',
    )
    # The meaning of args.synthetic_data may look inverted: the value is
    # forwarded from bash, where 0 means true and anything else means false.
    # -1 is additionally used below to select the high-resolution data set.
    cli.add_argument(
        '-s', '--synthetic_data',
        type=int,
        default=0,
        help="Use synthetic data",
    )
    args = cli.parse_args()

    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    cudnn.benchmark = True

    highres = args.synthetic_data == -1

    # ---- Model: move to GPU, profile, and split across devices ------------
    print("=> creating model '{}'".format(args.arch))
    model = model_names[args.arch].cuda()

    # The profiling sample matches the image size of the selected data set.
    side = 512 if highres else 224
    sample = torch.empty(batch_size, 3, side, side)
    balance = balance_by_time(torch.cuda.device_count(), model, sample)
    model = GPipe(model, balance, chunks=microbatches)

    # Inputs go to the first pipeline device, outputs come from the last.
    pipeline_devices = list(model.devices)
    in_device, out_device = pipeline_devices[0], pipeline_devices[-1]
    torch.cuda.set_device(in_device)

    # ---- Optimizer ---------------------------------------------------------
    optimizer = torch.optim.SGD(
        model.parameters(),
        args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # ---- Data --------------------------------------------------------------
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if highres:
        # High-resolution images are used as stored: no crop, resize or flip.
        root = datadir + '/HIGHRES'
        train_comp = [transforms.ToTensor(), normalize]
        val_comp = [transforms.ToTensor(), normalize]
    else:
        # Non-zero selects the normal data directly under datadir; zero
        # (the default) selects the synthetic data under IMAGENET.
        root = datadir if args.synthetic_data else datadir + '/IMAGENET'
        train_comp = [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
        val_comp = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    traindir = root + '/train'
    valdir = root + '/val'

    def build_loader(folder, steps):
        # Both loaders share the same batching, shuffling and worker settings.
        dataset = datasets.ImageFolder(folder, transforms.Compose(steps))
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=cores_gpu,
                                           pin_memory=True)

    train_loader = build_loader(traindir, train_comp)
    val_loader = build_loader(valdir, val_comp)

    # ---- Training ----------------------------------------------------------
    throughputs = []
    elapsed_times = []
    for epoch in range(epochs):
        epoch_throughput, epoch_elapsed = run_epoch(train_loader, val_loader,
                                                    model, optimizer, epoch,
                                                    args, in_device, out_device)
        throughputs.append(epoch_throughput)
        elapsed_times.append(epoch_elapsed)

    # NOTE(review): evaluation is assumed to run once after the last epoch
    # (the n > 0 guard below implies the loop may not run) -- confirm.
    _, valid_accuracy = evaluate(val_loader, model, args, in_device, out_device)

    # ---- Report ------------------------------------------------------------
    n = len(throughputs)
    if n > 0:
        mean_throughput = sum(throughputs) / n
        mean_elapsed = sum(elapsed_times) / n
    else:
        mean_throughput = 0.0
        mean_elapsed = 0.0
    report = 'valid accuracy: %.4f | %.3f samples/sec, %.3f sec/epoch (average)'
    print(report % (valid_accuracy, mean_throughput, mean_elapsed))
]), loader=grayloader, ), batch_size=batch_size, shuffle=True, num_workers=cores_gpu, pin_memory=True) #--------------------------------------------------------------------------------- # Move model to GPU. print("=> creating model '{}'".format(args.arch)) model = model_names[args.arch].cuda() partitions = torch.cuda.device_count() sample = torch.empty(batch_size, 1, 28, 28) balance = balance_by_time(partitions, model, sample) model = GPipe(model, balance, chunks=microbatches) #--------------------------------------------------------------------------------- devices = list(model.devices) in_device = devices[0] out_device = devices[-1] torch.cuda.set_device(in_device) throughputs = [] elapsed_times = [] optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(epochs):