def test_reuse_workers(DatasetType):
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               DatasetType(shape, num_tensors),
                               batch_size=1,
                               num_workers=2)
    data_no_reuse = poptorch.DataLoader(opts,
                                        DatasetType(shape, num_tensors),
                                        batch_size=1,
                                        persistent_workers=False,
                                        num_workers=2)
    loader = poptorch.AsynchronousDataAccessor(data)
    loader_no_reuse = poptorch.AsynchronousDataAccessor(data_no_reuse)

    start = None
    # Workers will be created while fetching the first element
    # so start the timer after the first element is fetched.
    num_tensors = 0
    for _ in loader_no_reuse:
        num_tensors += 1
        if start is None:
            start = time.perf_counter()
    end = time.perf_counter()
    print(f"First epoch no reuse: {end - start} {num_tensors}")

    for _ in range(3):
        start = time.perf_counter()
        for _ in loader_no_reuse:
            num_tensors += 1
        end = time.perf_counter()
        print(f"Other epoch no reuse: {end - start} {num_tensors}")

    start = None
    # Workers will be created while fetching the first element
    # so start the timer after the first element is fetched.
    num_tensors_reuse = 0
    for _ in loader:
        num_tensors_reuse += 1
        if start is None:
            start = time.perf_counter()
    end = time.perf_counter()
    print(f"First epoch: {end - start} {num_tensors_reuse}")

    for _ in range(3):
        start = time.perf_counter()
        for _ in loader:
            num_tensors_reuse += 1
        end = time.perf_counter()
        print(f"Other epoch: {end - start} {num_tensors_reuse}")
def _run_dataset_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      host_id=0,
                      num_hosts=1):
    shape = shape or [2, 3]
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)
    opts.Distributed.configureProcessId(host_id, num_hosts)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)
    offset = host_id * (num_tensors // num_hosts)

    assert len(data) == num_tensors // (device_iterations * batch_size *
                                        replication_factor * num_hosts)

    for it, d in enumerate(loader):
        expected = torch.from_numpy(
            numpy.stack([
                numpy.full(shape, offset + i, dtype=numpy.float32)
                for i in range(data.combinedBatchSize * it,
                               data.combinedBatchSize * (it + 1))
            ]))
        diff = torch.sum(torch.sum(d - expected))
        numpy.testing.assert_array_equal(diff.numpy(), [0.])
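# `IncrementDataset` is defined elsewhere in the test suite. A minimal
# sketch consistent with the assertions above (item i is a `shape`-sized
# float32 tensor filled with the value i) might look like this; the real
# helper may differ:
#
# class IncrementDataset(torch.utils.data.Dataset):
#     def __init__(self, shape, length):
#         self._shape = shape
#         self._length = length
#
#     def __len__(self):
#         return self._length
#
#     def __getitem__(self, index):
#         return torch.full(self._shape, index, dtype=torch.float32)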
def _run_process_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      num_runs=1):
    shape = shape or [2, 3]
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors // (device_iterations * batch_size *
                                          replication_factor)

    model = poptorch.inferenceModel(DoubleData(), opts)

    for _ in range(0, num_runs):
        for it, d in enumerate(loader):
            out = model(d)
            expected = torch.stack([
                torch.full(shape, i * 2, dtype=torch.float32)
                for i in range(data.combinedBatchSize * it,
                               data.combinedBatchSize * (it + 1))
            ])
            assert torch.equal(expected, out)
def setupInference(model, args):
    """
    Set up an inference run using the CIFAR-10 test dataset.

    Uses poptorch.DataLoader so that each iteration executed on the IPU
    will incorporate:

        * (mini-)batch size
        * device iterations
        * replication factor
        * gradient accumulation factor

    Wrapping the loader in a poptorch.AsynchronousDataAccessor loads the
    dataset on a separate thread. This reduces the host/IPU communication
    overhead by using the time that the IPU is running to load the next
    batch on the CPU.
    """
    opts = setupOptions(args, train=False)
    inference_model = poptorch.inferenceModel(model, opts)

    dataset = cifar10(args.data_dir, train=False)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=args.test_batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=8)
    loader = poptorch.AsynchronousDataAccessor(loader)

    return inference_model, loader
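# A hedged usage sketch for setupInference. `run_inference_example` is a
# hypothetical helper: it assumes the wrapped model returns class logits and
# that the loader yields (data, labels) batches, as the CIFAR-10 dataset
# above suggests.
def run_inference_example(model, args):
    inference_model, loader = setupInference(model, args)
    for data, labels in loader:
        output = inference_model(data)
        # Compare the predicted class against the ground-truth labels.
        predictions = output.argmax(dim=1)
        accuracy = (predictions == labels).float().mean()
        print(f"Batch accuracy: {accuracy.item():.3f}")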
def test_broken_dataset():
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               BrokenDataset(num_tensors),
                               batch_size=1,
                               num_workers=32)

    with pytest.raises(RuntimeError, match="worker thread failed to start"):
        poptorch.AsynchronousDataAccessor(data)
def test_iterable_dataset():
    shape = [2, 3]
    num_tensors = 100

    loader = poptorch.AsynchronousDataAccessor(
        IncrementIterableDataset(shape, num_tensors))

    for _, _ in enumerate(loader):
        continue

    # Make sure it works for more than 1 epoch
    for _, _ in enumerate(loader):
        continue
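# `IncrementIterableDataset` is defined elsewhere in the test suite. A
# minimal sketch consistent with its use here (an IterableDataset yielding
# `length` tensors of the given shape, with no __len__) might be:
#
# class IncrementIterableDataset(torch.utils.data.IterableDataset):
#     def __init__(self, shape, length):
#         self._shape = shape
#         self._length = length
#
#     def __iter__(self):
#         for index in range(self._length):
#             yield torch.full(self._shape, index, dtype=torch.float32)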
def test_len():
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementIterableDataset(shape, num_tensors),
                               batch_size=None,
                               drop_last=False,
                               num_workers=1)
    loader = poptorch.AsynchronousDataAccessor(data)
    with pytest.raises(TypeError,
                       match="'IncrementIterableDataset' has no len()"):
        len(loader)

    data = poptorch.DataLoader(opts,
                               IncrementIterableDatasetWithLen(
                                   shape, num_tensors),
                               batch_size=None,
                               drop_last=False,
                               num_workers=1)
    loader = poptorch.AsynchronousDataAccessor(data)
    len(loader)
def test_single_epoch():
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=1,
                               num_workers=32)
    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors

    for _, _ in enumerate(loader):
        continue
def test_interrupt_async_loader():
    """Make sure the worker processes are stopped cleanly even when the end
    of the dataset is not reached."""
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=1,
                               num_workers=1)
    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors

    for _, _ in enumerate(loader):
        break
def test_iterable_dataloader():
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementIterableDataset(shape, num_tensors),
                               batch_size=1,
                               num_workers=1)
    loader = poptorch.AsynchronousDataAccessor(data)

    for _, t in enumerate(loader):
        assert t.shape == torch.Size([1, 2, 3])

    # Make sure it works for more than 1 epoch
    for _, _ in enumerate(loader):
        continue
def _run_process_label_test(shape=None,
                            num_tensors=100,
                            batch_size=1,
                            num_workers=0,
                            device_iterations=1,
                            replication_factor=1):
    shape = shape or [2, 3]
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDatasetWithLabels(
                                   shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors // (device_iterations * batch_size *
                                          replication_factor)

    model = poptorch.inferenceModel(DoubleDataLabel(), opts)

    total = torch.zeros(shape)
    label_out = torch.zeros(1, dtype=torch.int)
    for _, (data, label) in enumerate(loader):
        out, label = model(data, label)
        total += torch.sum(out, dim=0)
        label_out += torch.sum(label, dim=0)

    actual = 0
    for i in range(0, num_tensors):
        actual += i * 2

    numpy.testing.assert_array_equal(total[0][0].numpy(), [actual])
    numpy.testing.assert_array_equal(label_out[0].item(), [actual])
def run_data_loader_example():
    model_batch_size = 2

    # replication_start
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 100 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(100)

    # Duplicate the model over 4 replicas.
    opts.replicationFactor(4)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data: each host-side batch combines
    # 100 device iterations x batch size 2 x 4 replicas = 800 samples.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 100 iteration loop of batch size 2 across
        # 4 replicas. "output" and "loss" will be the respective output and
        # loss of the final batch of each replica (the default AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # replication_end

    # gradient_acc_start
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 400 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(400)

    # Accumulate the gradient 8 times before applying it.
    opts.Training.gradientAccumulation(8)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data: each host-side batch combines
    # 400 device iterations x batch size 2 x 8 accumulations = 6400 samples.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 400 iteration loop of batch size 2,
        # accumulating the gradient over 8 batches. "output" and "loss" will
        # be the respective output and loss of the final batch (the default
        # AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # gradient_acc_end

    # Not displayed: just to keep the linter happy
    shape = [3, 2]
    num_tensors = 100
    batch_size = 1
    num_workers = 0
    device_iterations = 1
    replication_factor = 1

    # Example starts here:
    # data_accessor_start
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               ExampleDataset(shape=shape,
                                              length=num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)

    poptorch_model = poptorch.inferenceModel(model, opts)

    for it, (data, _) in enumerate(loader):
        out = poptorch_model(data)
    # data_accessor_end

    # distributed_execution_start
    def process(process_id=0, num_processes=1):
        # Create a poptorch.Options instance to override default options
        opts = poptorch.Options()

        # Run a 400 iteration loop on the IPU, fetching a new batch each time
        opts.deviceIterations(400)

        # Replicate the graph across 2 IPUs in each process.
        opts.replicationFactor(2)

        # Set the id of the current process and the total number of processes.
        opts.Distributed.configureProcessId(process_id, num_processes)

        # Accumulate the gradient 8 times before applying it.
        opts.Training.gradientAccumulation(8)

        # Optional: all the processes must use the same seed if shuffle=True
        # is used for the DataLoader.
        opts.randomSeed(42)

        training_data = poptorch.DataLoader(opts,
                                            dataset=ExampleDataset(
                                                shape=[3, 2], length=100000),
                                            batch_size=model_batch_size,
                                            shuffle=True,
                                            drop_last=True)

        # Wrap the model in a PopTorch training wrapper
        poptorch_model = poptorch.trainingModel(model, options=opts)

        # Run over the training data: each host-side batch combines
        # 400 device iterations x batch size 2 x 2 replicas
        # x 8 accumulations = 12800 samples per process.
        for batch_number, (data, labels) in enumerate(training_data):
            # Execute the device with a 400 iteration loop of batch size 2
            # across 2 replicas, accumulating the gradient over 8 batches.
            # "output" and "loss" will be the respective output and loss of
            # the final batch of each replica (the default AnchorMode).
            output, loss = poptorch_model(data, labels)
            print(f"{batch_number} {labels[-1]}, {output}, {loss}")
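    # The snippet above only defines `process`; launching it is not shown
    # here. A hedged sketch (assuming two processes on a single host, each
    # managing its own pair of replicas) might be:
    #
    #     from multiprocessing import Process
    #     procs = [Process(target=process, args=(i, 2)) for i in range(2)]
    #     for p in procs:
    #         p.start()
    #     for p in procs:
    #         p.join()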
def get_data(opts, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a data loader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in
    a dataloader.
    """
    if opts.precision[:3] == "16.":
        half_precision = True
    elif opts.precision[:3] == "32.":
        half_precision = False
    transform = get_preprocessing_pipeline(
        train, models.available_models[opts.model]["input_shape"],
        half_precision)
    # Determine the size of the small datasets
    if hasattr(opts, "iterations"):
        dataset_size = opts.batch_size * \
                       model_opts.device_iterations * \
                       model_opts.replication_factor * \
                       model_opts.Training.gradient_accumulation * \
                       opts.iterations

    # Select the right dataset
    if opts.data == "synthetic":
        if hasattr(opts, "iterations"):
            dataset = SynthDataset(
                models.available_models[opts.model]["input_shape"],
                size=dataset_size,
                half_precision=half_precision)
        else:
            dataset = SynthDataset(
                models.available_models[opts.model]["input_shape"],
                half_precision=half_precision)
    elif opts.data == "real":
        data_path = Path(__file__).parent.absolute().joinpath("images")
        if hasattr(opts, "iterations"):
            dataset = SampleDataset(img_dir=data_path,
                                    transform=transform,
                                    size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif opts.data == "imagenet":
        data_folder = 'train' if train else 'validation'
        dataset = torchvision.datasets.ImageFolder(os.path.join(
            opts.imagenet_data_path, data_folder),
                                                   transform=transform)
    elif opts.data == "cifar10":
        data_path = Path(__file__).parent.absolute().joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path,
                                               train=train,
                                               download=True,
                                               transform=transform)

    num_loader_workers = min(32, multiprocessing.cpu_count())
    dataloader = poptorch.DataLoader(model_opts,
                                     dataset,
                                     batch_size=opts.batch_size,
                                     num_workers=num_loader_workers,
                                     shuffle=train,
                                     drop_last=True)
    if async_dataloader:
        return poptorch.AsynchronousDataAccessor(dataloader,
                                                 load_indefinitely=True)
    return dataloader
def get_data(opts, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a data loader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in
    a dataloader.
    """
    if opts.precision[:3] == "16.":
        half_precision = True
    elif opts.precision[:3] == "32.":
        half_precision = False
    transform = get_preprocessing_pipeline(
        train, models.available_models[opts.model]["input_shape"][-1],
        half_precision, opts.normalization_location == "host")
    # Determine the size of the small datasets
    if hasattr(opts, "iterations"):
        dataset_size = opts.batch_size * \
                       model_opts.device_iterations * \
                       model_opts.replication_factor * \
                       model_opts.Training.gradient_accumulation * \
                       opts.iterations

    # Select the right dataset
    if opts.data in ["synthetic", "generated"]:
        if hasattr(opts, "iterations"):
            dataset = GeneratedDataset(
                models.available_models[opts.model]["input_shape"],
                size=dataset_size,
                half_precision=half_precision)
        else:
            dataset = GeneratedDataset(
                models.available_models[opts.model]["input_shape"],
                half_precision=half_precision)
    elif opts.data == "real":
        data_path = Path(__file__).parent.parent.absolute().joinpath(
            "data").joinpath("images")
        if hasattr(opts, "iterations"):
            dataset = SampleDataset(img_dir=data_path,
                                    transform=transform,
                                    size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif opts.data == "imagenet":
        if os.path.exists(
                os.path.join(opts.imagenet_data_path, 'metadata.json')):
            # WebDataset format
            dataset = get_webdataset(opts,
                                     model_opts,
                                     train,
                                     half_precision,
                                     transform=transform)
        else:
            # Original ImageNet format
            data_folder = 'train' if train else 'validation'
            dataset = torchvision.datasets.ImageFolder(os.path.join(
                opts.imagenet_data_path, data_folder),
                                                       transform=transform)
    elif opts.data == "cifar10":
        data_path = Path(__file__).parent.parent.absolute().joinpath(
            "data").joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path,
                                               train=train,
                                               download=True,
                                               transform=transform)

    is_iterable = isinstance(dataset, torch.utils.data.IterableDataset)
    mode = (poptorch.DataLoaderMode.Async
            if async_dataloader and not is_iterable else
            poptorch.DataLoaderMode.Sync)
    dataloader = poptorch.DataLoader(
        model_opts,
        dataset,
        batch_size=None if is_iterable else opts.batch_size,
        num_workers=opts.dataloader_worker,
        shuffle=train and not is_iterable,
        drop_last=not is_iterable,
        persistent_workers=True,
        auto_distributed_partitioning=not is_iterable,
        worker_init_fn=None,
        mode=mode,
        async_options={'load_indefinitely': True})

    if is_iterable:
        global_batch_size = (opts.batch_size * model_opts.device_iterations *
                             model_opts.replication_factor *
                             model_opts.Training.gradient_accumulation)
        if async_dataloader:
            dataloader._accessor = poptorch.AsynchronousDataAccessor(
                DatasetRebatch(dataloader, global_batch_size),
                load_indefinitely=True)
        else:
            dataloader = DatasetRebatch(dataloader, global_batch_size)
    return dataloader
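# `DatasetRebatch` is defined elsewhere in this repository; get_data relies
# on it to regroup the output of an IterableDataset loader into the global
# batch size. A minimal sketch of the idea, assuming each element is a
# single tensor and any trailing partial batch is dropped (the real helper
# also handles tuples of tensors and exposes a length), might be:
#
# class DatasetRebatch:
#     def __init__(self, dataloader, batch_size):
#         self.dataloader = dataloader
#         self.batch_size = batch_size
#
#     def __iter__(self):
#         buffer = []
#         for item in self.dataloader:
#             buffer.append(item)
#             if len(buffer) == self.batch_size:
#                 yield torch.stack(buffer)
#                 buffer = []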