def baseline(devices: List[int]) -> Stuffs:
    L, D = 18, 208
    model = amoebanetd(num_classes=1000, num_layers=L, num_filters=D)
    device = devices[0]
    model.to(device)
    return model, L, D, [torch.device(device)]
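# The experiment functions in this section return a shared `Stuffs` tuple of
# (model, num_layers, num_filters, devices). A minimal sketch of that alias and
# of the imports these functions rely on; the module path for `amoebanetd`
# (here `amoebanet`) is an assumption, not taken from this excerpt.
from typing import List, Tuple, cast

import torch
from torch import nn

from amoebanet import amoebanetd  # assumed local benchmark package
from torchgpipe import GPipe

Stuffs = Tuple[nn.Module, int, int, List[torch.device]]  # (model, L, D, devices)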
def pipeline8(devices: List[int]) -> Stuffs:
    L, D = 72, 512
    # Number of layers assigned to each of the 8 pipeline partitions.
    balance = [23, 16, 11, 6, 6, 5, 5, 6]
    model: nn.Module = amoebanetd(num_classes=1000, num_layers=L, num_filters=D)
    model = cast(nn.Sequential, model)
    model = GPipe(model, balance, devices=devices, chunks=128)
    return model, L, D, list(model.devices)
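# The CLI entry points below look experiments up by name in an `EXPERIMENTS`
# registry. A minimal sketch of how the two experiment functions above might be
# registered; the key names are assumptions. Note that the CLI functions below
# unpack a different experiment signature (taking the model and returning a
# batch size), so those scripts define their own `Experiment` type and
# `EXPERIMENTS` table.
EXPERIMENTS = {
    'baseline': baseline,
    'pipeline-8': pipeline8,
}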
def cli(
    ctx: click.Context,
    experiment: str,
    epochs: int,
    skip_epochs: int,
    devices: List[int],
) -> None:
    """AmoebaNet-D (18, 256) Speed Benchmark"""
    if skip_epochs >= epochs:
        ctx.fail('--skip-epochs=%d must be less than --epochs=%d' % (skip_epochs, epochs))

    model: nn.Module = amoebanetd(num_classes=1000, num_layers=18, num_filters=256)

    f: Experiment = EXPERIMENTS[experiment]
    try:
        model, batch_size, _devices = f(model, devices)
    except ValueError as exc:
        # Examples:
        #   ValueError: too few devices to hold given partitions (devices: 1, partitions: 2)
        ctx.fail(str(exc))

    optimizer = SGD(model.parameters(), lr=0.1)

    in_device = _devices[0]
    out_device = _devices[-1]
    torch.cuda.set_device(in_device)

    # This experiment cares about only training speed, rather than accuracy.
    # To eliminate any overhead due to data loading, we use fake random 224x224
    # images over 1000 labels.
    dataset_size = 10000

    input = torch.rand(batch_size, 3, 224, 224, device=in_device)
    target = torch.randint(1000, (batch_size,), device=out_device)
    data = [(input, target)] * (dataset_size // batch_size)

    if dataset_size % batch_size != 0:
        last_input = input[:dataset_size % batch_size]
        last_target = target[:dataset_size % batch_size]
        data.append((last_input, last_target))

    # HEADER ======================================================================================

    title = f'{experiment}, {skip_epochs+1}-{epochs} epochs'
    click.echo(title)

    if isinstance(model, GPipe):
        click.echo(f'batch size: {batch_size}, chunks: {model.chunks}, '
                   f'balance: {model.balance}, checkpoint: {model.checkpoint}')
    else:
        click.echo(f'batch size: {batch_size}')

    click.echo('torchgpipe: %s, python: %s, torch: %s, cudnn: %s, cuda: %s, gpu: %s' % (
        torchgpipe.__version__,
        platform.python_version(),
        torch.__version__,
        torch.backends.cudnn.version(),
        torch.version.cuda,
        torch.cuda.get_device_name(in_device)))

    # TRAIN =======================================================================================

    global BASE_TIME
    BASE_TIME = time.time()

    def run_epoch(epoch: int) -> Tuple[float, float]:
        torch.cuda.synchronize(in_device)
        tick = time.time()

        data_trained = 0
        for i, (input, target) in enumerate(data):
            data_trained += input.size(0)

            output = model(input)
            loss = F.cross_entropy(output, target)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # 00:01:02 | 1/20 epoch (42%) | 200.000 samples/sec (estimated)
            percent = (i + 1) / len(data) * 100
            throughput = data_trained / (time.time() - tick)
            log('%d/%d epoch (%d%%) | %.3f samples/sec (estimated)'
                '' % (epoch + 1, epochs, percent, throughput), clear=True, nl=False)

        torch.cuda.synchronize(in_device)
        tock = time.time()

        # 00:02:03 | 1/20 epoch | 200.000 samples/sec, 123.456 sec/epoch
        elapsed_time = tock - tick
        throughput = dataset_size / elapsed_time
        log('%d/%d epoch | %.3f samples/sec, %.3f sec/epoch'
            '' % (epoch + 1, epochs, throughput, elapsed_time), clear=True)

        return throughput, elapsed_time

    throughputs = []
    elapsed_times = []

    hr()
    for epoch in range(epochs):
        throughput, elapsed_time = run_epoch(epoch)

        if epoch < skip_epochs:
            continue

        throughputs.append(throughput)
        elapsed_times.append(elapsed_time)
    hr()

    # RESULT ======================================================================================

    # pipeline-4, 2-10 epochs | 200.000 samples/sec, 123.456 sec/epoch (average)
    n = len(throughputs)
    throughput = sum(throughputs) / n
    elapsed_time = sum(elapsed_times) / n
    click.echo('%s | %.3f samples/sec, %.3f sec/epoch (average)'
               '' % (title, throughput, elapsed_time))
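# The training loops in this section report progress through `hr` and `log`
# helpers and a module-level `BASE_TIME`. A minimal sketch of what these
# helpers might look like, matching the "00:01:02 | ..." format shown in the
# comments above; the exact implementation is an assumption, not taken from
# this excerpt.
import shutil
import time

import click

BASE_TIME: float = 0.0


def hr() -> None:
    """Prints a horizontal rule as wide as the terminal."""
    width = shutil.get_terminal_size().columns
    click.echo('-' * width)


def log(msg: str, clear: bool = False, nl: bool = True) -> None:
    """Prints a message prefixed with the time elapsed since BASE_TIME."""
    if clear:
        # Return to the start of the line and blank it before overwriting.
        width = shutil.get_terminal_size().columns
        click.echo('\r' + ' ' * width + '\r', nl=False)

    t = int(time.time() - BASE_TIME)
    h, m, s = t // 3600, (t % 3600) // 60, t % 60
    click.echo('%02d:%02d:%02d | %s' % (h, m, s, msg), nl=nl)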
def cli(
    ctx: click.Context,
    experiment: str,
    epochs: int,
    skip_epochs: int,
    devices: List[int],
) -> None:
    """AmoebaNet-D Speed Benchmark"""
    if skip_epochs >= epochs:
        ctx.fail('--skip-epochs=%d must be less than --epochs=%d' % (skip_epochs, epochs))

    model: nn.Module = amoebanetd(num_classes=10)

    f = EXPERIMENTS[experiment]
    try:
        model, batch_size, _devices = f(model, devices)
    except ValueError as exc:
        # Examples:
        #   ValueError: too few devices to hold given partitions (devices: 1, partitions: 2)
        ctx.fail(str(exc))

    optimizer = SGD(model.parameters(), lr=0.1)

    in_device = _devices[0]
    out_device = _devices[-1]

    # This experiment cares about only training speed, rather than accuracy. To
    # eliminate any overhead due to data loading, we use a fake dataset with
    # random 224x224 images over 10 labels.
    dataset = RandomDataset()
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        drop_last=False,
    )

    # HEADER ======================================================================================

    title = '%s, %d-%d epochs' % (experiment, skip_epochs + 1, epochs)
    click.echo(title)
    click.echo('python: %s, torch: %s, cudnn: %s, cuda: %s, gpu: %s' % (
        platform.python_version(),
        torch.__version__,
        torch.backends.cudnn.version(),
        torch.version.cuda,
        torch.cuda.get_device_name(in_device)))

    # TRAIN =======================================================================================

    global BASE_TIME
    BASE_TIME = time.time()

    def run_epoch(epoch: int) -> Tuple[float, float]:
        torch.cuda.synchronize(in_device)
        tick = time.time()

        data_trained = 0
        for i, (input, target) in enumerate(loader):
            data_trained += len(input)

            input = input.to(in_device, non_blocking=True)
            target = target.to(out_device, non_blocking=True)

            output = model(input)
            loss = F.cross_entropy(output, target)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # 00:01:02 | 1/20 epoch (42%) | 200.000 samples/sec (estimated)
            percent = i / len(loader) * 100
            throughput = data_trained / (time.time() - tick)
            log('%d/%d epoch (%d%%) | %.3f samples/sec (estimated)'
                '' % (epoch + 1, epochs, percent, throughput), clear=True, nl=False)

        torch.cuda.synchronize(in_device)
        tock = time.time()

        # 00:02:03 | 1/20 epoch | 200.000 samples/sec, 123.456 sec/epoch
        elapsed_time = tock - tick
        throughput = len(dataset) / elapsed_time
        log('%d/%d epoch | %.3f samples/sec, %.3f sec/epoch'
            '' % (epoch + 1, epochs, throughput, elapsed_time), clear=True)

        return throughput, elapsed_time

    throughputs = []
    elapsed_times = []

    hr()
    for epoch in range(epochs):
        throughput, elapsed_time = run_epoch(epoch)

        if epoch < skip_epochs:
            continue

        throughputs.append(throughput)
        elapsed_times.append(elapsed_time)
    hr()

    # RESULT ======================================================================================

    # pipeline-4, 2-10 epochs | 200.000 samples/sec, 123.456 sec/epoch (average)
    n = len(throughputs)
    throughput = sum(throughputs) / n
    elapsed_time = sum(elapsed_times) / n
    click.echo('%s | %.3f samples/sec, %.3f sec/epoch (average)'
               '' % (title, throughput, elapsed_time))
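# The benchmark above feeds a `RandomDataset` through a DataLoader. A minimal
# sketch of such a dataset, matching the comment in the code (fake random
# 224x224 images over 10 labels); the dataset length of 10,000 is an
# assumption borrowed from the in-memory variant above, not taken from this
# excerpt.
import torch
from torch.utils.data import Dataset


class RandomDataset(Dataset):
    """Fake dataset of random 224x224 RGB images with labels in [0, 10)."""

    def __init__(self, size: int = 10000) -> None:
        self.size = size

    def __len__(self) -> int:
        return self.size

    def __getitem__(self, index: int):
        input = torch.rand(3, 224, 224)
        target = int(torch.randint(10, (1,)).item())
        return input, target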