Example #1
def build_iterator(self, is_train=True):
    if not self.ipu_options:
        self.build_ipu_options()
    self.build_dataset()
    if is_train:
        self.is_spec_aug = self.args['train_dataset']['is_spec_aug']
        collate_fn = CollateFn(self.vocab.sos_id, self.vocab.eos_id,
                               self.is_spec_aug,
                               self.args['train_dataset']['dtype'])
    else:
        self.is_spec_aug = False
        collate_fn = CollateFn(self.vocab.sos_id, self.vocab.eos_id,
                               self.is_spec_aug,
                               self.args['val_dataset']['dtype'])
    self.train_iterator = poptorch.DataLoader(
        self.ipu_options,
        dataset=self.train_dataset,
        collate_fn=collate_fn,
        mode=poptorch.DataLoaderMode.Async,
        shuffle=True,
        **self.args['train_iterator'],
    )
    if not self.use_generate:
        self.val_iterator = poptorch.DataLoader(
            self.ipu_options,
            dataset=self.val_dataset,
            collate_fn=collate_fn,
            mode=poptorch.DataLoaderMode.Async,
            shuffle=True,
            **self.args['val_iterator'],
        )
Example #2
def test_reuse_workers(DatasetType):
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               DatasetType(shape, num_tensors),
                               batch_size=1,
                               num_workers=2)
    data_no_reuse = poptorch.DataLoader(opts,
                                        DatasetType(shape, num_tensors),
                                        batch_size=1,
                                        persistent_workers=False,
                                        num_workers=2)

    loader = poptorch.AsynchronousDataAccessor(data)
    loader_no_reuse = poptorch.AsynchronousDataAccessor(data_no_reuse)

    start = None
    # Workers will be created while fetching the first element
    # so start the timer after the first element is fetched.
    num_tensors = 0
    for _ in loader_no_reuse:
        num_tensors += 1
        if start is None:
            start = time.perf_counter()

    end = time.perf_counter()
    print(f"First epoch no reuse: {end - start} {num_tensors}")

    for _ in range(3):
        start = time.perf_counter()
        for _ in loader_no_reuse:
            num_tensors += 1
        end = time.perf_counter()
        print(f"Other epoch no reuse: {end - start}  {num_tensors}")

    start = None
    # Workers will be created while fetching the first element
    # so start the timer after the first element is fetched.
    num_tensors_reuse = 0
    for _ in loader:
        num_tensors_reuse += 1
        if start is None:
            start = time.perf_counter()
    end = time.perf_counter()
    print(f"First epoch: {end - start} {num_tensors_reuse}")

    for _ in range(3):
        start = time.perf_counter()
        for _ in loader:
            num_tensors_reuse += 1
        end = time.perf_counter()
        print(f"Other epoch: {end - start} {num_tensors_reuse}")
Example #3
def _run_dataset_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      host_id=0,
                      num_hosts=1):
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)
    opts.Distributed.configureProcessId(host_id, num_hosts)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)

    offset = host_id * (num_tensors // num_hosts)
    assert len(data) == num_tensors // (device_iterations * batch_size *
                                        replication_factor * num_hosts)
    for it, d in enumerate(loader):
        expected = torch.from_numpy(
            numpy.stack([
                numpy.full(shape, offset + i, dtype=numpy.float32)
                for i in range(data.combinedBatchSize *
                               it, data.combinedBatchSize * (it + 1))
            ]))
        diff = torch.sum(torch.sum(d - expected))

    numpy.testing.assert_array_equal(diff.numpy(), [0.])
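IncrementDataset is not shown in this listing; a minimal hypothetical stand-in, consistent with the checks above (item i is a float32 tensor of the given shape filled with the value i), could look like this:

import torch


class IncrementDataset(torch.utils.data.Dataset):
    # Hypothetical sketch: item `i` is a float32 tensor of `shape` filled with `i`.
    def __init__(self, shape, length):
        self._shape = shape
        self._length = length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return torch.full(self._shape, index, dtype=torch.float32)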
Example #4
def _run_process_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      num_runs=1):
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors // (device_iterations * batch_size *
                                          replication_factor)

    model = poptorch.inferenceModel(DoubleData(), opts)

    for _ in range(0, num_runs):
        for it, d in enumerate(loader):
            out = model(d)

            expected = torch.stack([
                torch.full(shape, i * 2, dtype=torch.float32)
                for i in range(data.combinedBatchSize *
                               it, data.combinedBatchSize * (it + 1))
            ])

            assert torch.equal(expected, out)
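DoubleData is likewise not defined in the snippet; a hypothetical model matching the i * 2 expectation above:

import torch


class DoubleData(torch.nn.Module):
    # Hypothetical sketch: simply doubles the input batch.
    def forward(self, x):
        return x * 2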
Example #5
    def _convert_to_poptorch_loader(
            self, dataloader: Union[Iterable, DataLoader],
            opts: 'poptorch.Options') -> Union[Iterable, DataLoader]:
        skip_keys = ('sampler', 'batch_sampler', 'dataset_kind')

        attrs = {
            k: v
            for k, v in vars(dataloader).items() if not k.startswith("_")
        }

        params = set(inspect.signature(dataloader.__init__).parameters)
        contains_dataset = True

        if type(dataloader) is not DataLoader:
            contains_dataset = "dataset" in params
            params.update(inspect.signature(DataLoader.__init__).parameters)

        dl_args = {
            name: attrs[name]
            for name in params if name in attrs and name not in skip_keys
        }

        multiprocessing_context = dataloader.multiprocessing_context
        dl_args['multiprocessing_context'] = multiprocessing_context
        if not contains_dataset:
            dl_args.pop('dataset')
        # Override to drop the last uneven batch, as IPUs do not support uneven inputs.
        dl_args['drop_last'] = True

        dataloader = poptorch.DataLoader(**dl_args, options=opts)
        dataloader.multiprocessing_context = multiprocessing_context
        return dataloader
def setupTraining(model, args):
    """
    Set up a training run using the CIFAR-10 training dataset.

    Uses the poptorch.DataLoader so that each training iteration executed on the
    IPU will incorporate:

        * (mini-)batch size
        * device iterations
        * replica factor
        * gradient accumulation factor

    Using poptorch.DataLoaderMode.Async allows loading the dataset on a separate
    thread.  This reduces the host/IPU communication overhead by using the time
    that the IPU is running to load the next batch on the CPU.
    """
    opts = setupOptions(args, train=True)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    training_model = poptorch.trainingModel(model, opts, optimizer)
    dataset = cifar10(args.data_dir, train=True)

    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=8,
                                 mode=poptorch.DataLoaderMode.Async)

    return training_model, loader
def setupInference(model, args):
    """
    Set up an inference run using the CIFAR-10 test dataset.

    Uses the poptorch.DataLoader so that each inference iteration executed on the
    IPU will incorporate:

        * (mini-)batch size
        * device iterations
        * replica factor
        * gradient accumulation factor

    Applying the poptorch.AsynchronousDataAccessor allows loading the dataset on
    a separate thread.  This reduces the host/IPU communication overhead by
    using the time that the IPU is running to load the next batch on the CPU.
    """
    opts = setupOptions(args, train=False)
    inference_model = poptorch.inferenceModel(model, opts)
    dataset = cifar10(args.data_dir, train=False)

    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=args.test_batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=8)
    loader = poptorch.AsynchronousDataAccessor(loader)

    return inference_model, loader
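setupOptions is not included in this listing; a minimal sketch, assuming it only configures the factors named in the docstrings above (the attribute names on args are assumptions):

import poptorch


def setupOptions(args, train=True):
    # Hypothetical sketch: set the options that determine the combined batch
    # size fetched by poptorch.DataLoader on each step.
    opts = poptorch.Options()
    opts.deviceIterations(getattr(args, "device_iterations", 1))
    opts.replicationFactor(getattr(args, "replication_factor", 1))
    if train:
        opts.Training.gradientAccumulation(getattr(args, "gradient_accumulation", 1))
    return opts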
Example #8
def _run_test(shape=None,
              num_tensors=100,
              batch_size=1,
              num_workers=0,
              device_iterations=1,
              replication_factor=1):
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    assert len(data) == num_tensors // (device_iterations * batch_size *
                                        replication_factor)
    model = poptorch.inferenceModel(CheckOrderModel(), opts)
    for it, d in enumerate(data):
        expected = torch.from_numpy(
            numpy.stack([
                numpy.full(shape, i, dtype=numpy.float32)
                for i in range(data.combinedBatchSize *
                               it, data.combinedBatchSize * (it + 1))
            ]))
        diff = torch.sum(model(d, expected))

    numpy.testing.assert_array_equal(diff.numpy(), [0.])
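CheckOrderModel is not shown either; a hypothetical version consistent with the zero-sum assertion above:

import torch


class CheckOrderModel(torch.nn.Module):
    # Hypothetical sketch: returns element-wise differences, which sum to zero
    # when the batches arrive in the expected order.
    def forward(self, data, expected):
        return data - expected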
Example #9
def get_dataloader(batch_size, opts, num_iterations, synthetic=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset, whether
    real or synthetic, and wraps it in a dataloader.
    """
    dataset_size = batch_size * \
        opts.device_iterations * \
        opts.replication_factor * \
        num_iterations

    if synthetic:
        dataset = SynthDataset(size=dataset_size)
    else:
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ])

        dataset = SampleDataset(img_dir='./images',
                                transform=transform,
                                size=dataset_size)

    dataloader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     drop_last=True)
    return dataloader
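The dataset_size arithmetic above sizes the dataset so that, with drop_last=True, the loader yields exactly num_iterations combined batches. A minimal usage sketch (the numbers are illustrative and SynthDataset is assumed to be defined as in the snippet):

opts = poptorch.Options()
opts.deviceIterations(4)
opts.replicationFactor(2)

# dataset_size = 8 * 4 * 2 * 10 = 640 samples; each step consumes
# 8 * 4 * 2 = 64 of them, so the loader yields 640 // 64 = 10 batches.
loader = get_dataloader(batch_size=8, opts=opts, num_iterations=10, synthetic=True)
assert len(loader) == 10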
Example #10
def train_dataloader(self):
    dataloader = super().train_dataloader()
    # save to instance to compare the reference later
    self.poptorch_dataloader = poptorch.DataLoader(model_options,
                                                   dataloader.dataset,
                                                   drop_last=True)
    return self.poptorch_dataloader
Example #11
def test_broken_dataset():
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               BrokenDataset(num_tensors),
                               batch_size=1,
                               num_workers=32)

    with pytest.raises(RuntimeError, match="worker thread failed to start"):
        poptorch.AsynchronousDataAccessor(data)
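BrokenDataset is not shown; a hypothetical dataset whose item accesses always raise, which is enough to make the accessor's worker start-up fail:

import torch


class BrokenDataset(torch.utils.data.Dataset):
    # Hypothetical sketch: every item access raises, so the asynchronous
    # worker can never produce its first batch.
    def __init__(self, length):
        self._length = length

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        raise RuntimeError("this dataset is intentionally broken")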
Example #12
    def _convert_to_poptorch_loader(
        self, dataloader: DataLoader, sampler, mode: Optional[RunningStage] = None
    ) -> "poptorch.DataLoader":
        # use full path to avoid circular imports
        dl_kwargs = pl.trainer.trainer.TrainerDataLoadingMixin._get_dataloader_init_kwargs(dataloader, sampler)
        # Override to drop the last uneven batch, as IPUs do not support uneven inputs.
        dl_kwargs["drop_last"] = True

        opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts
        dataloader = poptorch.DataLoader(**dl_kwargs, options=opts)
        return dataloader
Example #13
    def _convert_to_poptorch_loader(
            self,
            dataloader: DataLoader,
            sampler,
            mode: Optional[RunningStage] = None) -> "poptorch.DataLoader":
        if isinstance(dataloader, poptorch.DataLoader):
            # the user is returning the `poptorch.DataLoader` directly, don't change anything.
            return dataloader

        dl_kwargs = _get_dataloader_init_kwargs(dataloader, sampler)
        opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts
        dataloader = poptorch.DataLoader(opts, **dl_kwargs)
        return dataloader
Example #14
def test_len():
    shape = [2, 3]
    num_tensors = 10

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementIterableDataset(shape, num_tensors),
                               batch_size=None,
                               drop_last=False,
                               num_workers=1)

    loader = poptorch.AsynchronousDataAccessor(data)
    with pytest.raises(TypeError,
                       match="'IncrementIterableDataset' has no len()"):
        len(loader)
    data = poptorch.DataLoader(opts,
                               IncrementIterableDatasetWithLen(
                                   shape, num_tensors),
                               batch_size=None,
                               drop_last=False,
                               num_workers=1)

    loader = poptorch.AsynchronousDataAccessor(data)
    len(loader)
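Neither iterable dataset is shown in the listing; hypothetical minimal versions, where only the second defines __len__ (which is what lets len(loader) succeed):

import torch


class IncrementIterableDataset(torch.utils.data.IterableDataset):
    # Hypothetical sketch: streams `length` tensors of `shape`, value i at step i.
    def __init__(self, shape, length):
        self._shape = shape
        self._length = length

    def __iter__(self):
        for i in range(self._length):
            yield torch.full(self._shape, i, dtype=torch.float32)


class IncrementIterableDatasetWithLen(IncrementIterableDataset):
    def __len__(self):
        return self._length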
Example #15
def test_single_epoch():
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=1,
                               num_workers=32)

    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors

    for _, _ in enumerate(loader):
        continue
Example #16
def test_interrupt_async_loader():
    """Make sure the worker processes are stopped cleanly even when the end of
    the dataset is not reached."""

    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=1,
                               num_workers=1)

    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors

    for _, _ in enumerate(loader):
        break
Example #17
def test_iterable_dataloader():
    shape = [2, 3]
    num_tensors = 100

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts,
                               IncrementIterableDataset(shape, num_tensors),
                               batch_size=1,
                               num_workers=1)

    loader = poptorch.AsynchronousDataAccessor(data)

    for _, t in enumerate(loader):
        assert t.shape == torch.Size([1, 2, 3])
        continue

    # Make sure it works for more than 1 epoch
    for _, _ in enumerate(loader):
        continue
Example #18
def test_random_raw(random_generator, instances):
    """
    Tests whether all the augmentations are unique.
    """
    class DummyDataset(torch.utils.data.Dataset):
        def __init__(self, size=10, transform=None):
            self.size = size
            self.transform = transform

        def __len__(self):
            return self.size

        def __getitem__(self, index):
            if self.transform == "numpy":
                augment = np.random.random(1)[0]
            elif self.transform == "torch":
                augment = torch.rand(1)[0]
            elif self.transform == "python":
                augment = random.random()
            else:
                augment = 0.0
            return float(index) + augment

    ds = DummyDataset(transform=random_generator)
    augments = []
    elements = []
    for instance_id in range(instances):
        opts = poptorch.Options()
        worker_init = _WorkerInit(42, instance_id, 5)
        if instances > 1:
            opts.Distributed.configureProcessId(instance_id, instances)
        opts = opts.randomSeed(42)
        data_loader = poptorch.DataLoader(opts, ds, batch_size=1, num_workers=5, shuffle=True, worker_init_fn=worker_init)
        for item in data_loader:
            frac = item[0].numpy().tolist() % 1  # Get fraction(augmentation)
            frac = int(10000 * frac)  # avoid rounding error
            augments.append(frac)
            elements.append(int(item))
    assert len(elements) == len(set(elements))
    assert len(augments) == len(set(augments))  # all augmentations must be unique
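_WorkerInit is assumed in this snippet; a hypothetical worker_init_fn that derives a distinct seed for every (instance, worker) pair, which is what keeps the augmentations unique across workers and instances:

import random

import numpy as np
import torch


class _WorkerInit:
    # Hypothetical sketch: seed every worker of every instance differently.
    def __init__(self, seed, instance_id, worker_count):
        self._base_seed = seed + instance_id * worker_count

    def __call__(self, worker_id):
        seed = self._base_seed + worker_id
        random.seed(seed)
        np.random.seed(seed % 2**32)
        torch.manual_seed(seed)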
Example #19
def _run_process_label_test(shape=None,
                            num_tensors=100,
                            batch_size=1,
                            num_workers=0,
                            device_iterations=1,
                            replication_factor=1):
    shape = shape or [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDatasetWithLabels(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    loader = poptorch.AsynchronousDataAccessor(data)
    assert len(loader) == num_tensors // (device_iterations * batch_size *
                                          replication_factor)

    model = poptorch.inferenceModel(DoubleDataLabel(), opts)

    total = torch.zeros(shape)
    label_out = torch.zeros(1, dtype=torch.int)
    for _, (data, label) in enumerate(loader):
        out, label = model(data, label)
        total += torch.sum(out, dim=0)
        label_out += torch.sum(label, dim=0)

    actual = 0
    for i in range(0, num_tensors):
        actual += i * 2

    numpy.testing.assert_array_equal(total[0][0].numpy(), [actual])
    numpy.testing.assert_array_equal(label_out[0].item(), [actual])
    def process(process_id=0, num_processes=1):
        # Create a poptorch.Options instance to override default options
        opts = poptorch.Options()

        # Run a 400 iteration loop on the IPU, fetching a new batch each time
        opts.deviceIterations(400)

        # Replicate the graph across 2 IPUs in each process.
        opts.replicationFactor(2)

        # Set the id of the current process and the total number of processes.
        opts.Distributed.configureProcessId(process_id, num_processes)

        # Accumulate the gradient 8 times before applying it.
        opts.Training.gradientAccumulation(8)

        # Optional: All the processes must use the same seed if shuffle=True is used for the DataLoader.
        opts.randomSeed(42)

        training_data = poptorch.DataLoader(opts,
                                            dataset=ExampleDataset(
                                                shape=[3, 2], length=100000),
                                            batch_size=model_batch_size,
                                            shuffle=True,
                                            drop_last=True)

        # Wrap the model in a PopTorch training wrapper
        poptorch_model = poptorch.trainingModel(model, options=opts)

        # Run over the training data with "batch_size" 200 essentially.
        for batch_number, (data, labels) in enumerate(training_data):
            # Execute the device with a 100 iteration loop of batchsize 2 across
            # 4 IPUs. "output" and "loss" will be the respective output and loss of the
            # final batch of each replica (the default AnchorMode).
            output, loss = poptorch_model(data, labels)
            print(f"{batch_number} {labels[-1]}, {output}, {loss}")
def run_data_loader_example():
    model_batch_size = 2
    # replication_start
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 100 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(100)

    # Duplicate the model over 4 replicas.
    opts.replicationFactor(4)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data with "batch_size" 200 essentially.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 100 iteration loop of batchsize 2 across
        # 4 IPUs. "output" and "loss" will be the respective output and loss of the
        # final batch of each replica (the default AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # replication_end
    # gradient_acc_start
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 400 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(400)

    # Accumulate the gradient 8 times before applying it.
    opts.Training.gradientAccumulation(8)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data with "batch_size" 200 essentially.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 100 iteration loop of batchsize 2 across
        # 4 IPUs. "output" and "loss" will be the respective output and loss of the
        # final batch of each replica (the default AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # gradient_acc_end

    # Not displayed: just to keep the linter happy
    shape = [3, 2]
    num_tensors = 100
    batch_size = 1
    num_workers = 0
    device_iterations = 1
    replication_factor = 1
    # Example starts here:
    # data_accessor_start
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               ExampleDataset(shape=shape, length=num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    loader = poptorch.AsynchronousDataAccessor(data)

    poptorch_model = poptorch.inferenceModel(model, opts)

    for it, (data, _) in enumerate(loader):
        out = poptorch_model(data)
    # data_accessor_end

    # distributed_execution_start
    def process(process_id=0, num_processes=1):
        # Create a poptorch.Options instance to override default options
        opts = poptorch.Options()

        # Run a 400 iteration loop on the IPU, fetching a new batch each time
        opts.deviceIterations(400)

        # Replicate the graph across 2 IPUs in each process.
        opts.replicationFactor(2)

        # Set the id of the current process and the total number of processes.
        opts.Distributed.configureProcessId(process_id, num_processes)

        # Accumulate the gradient 8 times before applying it.
        opts.Training.gradientAccumulation(8)

        # Optional: All the processes must use the same seed if shuffle=True is used for the DataLoader.
        opts.randomSeed(42)

        training_data = poptorch.DataLoader(opts,
                                            dataset=ExampleDataset(
                                                shape=[3, 2], length=100000),
                                            batch_size=model_batch_size,
                                            shuffle=True,
                                            drop_last=True)

        # Wrap the model in a PopTorch training wrapper
        poptorch_model = poptorch.trainingModel(model, options=opts)

        # Run over the training data with "batch_size" 200 essentially.
        for batch_number, (data, labels) in enumerate(training_data):
            # Execute the device with a 100 iteration loop of batchsize 2 across
            # 4 IPUs. "output" and "loss" will be the respective output and loss of the
            # final batch of each replica (the default AnchorMode).
            output, loss = poptorch_model(data, labels)
            print(f"{batch_number} {labels[-1]}, {output}, {loss}")

# Set the batch size in the conventional sense of being the size that
# runs through an operation in the model at any given time
model_batch_size = 2

# Create a poptorch.Options instance to override default options
opts = poptorch.Options()

# Run a 100 iteration loop on the IPU, fetching a new batch each time
opts.deviceIterations(100)

# Set up the DataLoader to load that much data at each iteration
training_data = poptorch.DataLoader(opts,
                                    dataset=ExampleDataset(shape=[3, 2],
                                                           length=10000),
                                    batch_size=model_batch_size,
                                    shuffle=True,
                                    drop_last=True)

model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
# Wrap the model in a PopTorch training wrapper
poptorch_model = poptorch.trainingModel(model, options=opts)

# Run over the training data with "batch_size" 200 essentially.
for batch_number, (data, labels) in enumerate(training_data):
    # Execute the device with a 100 iteration loop of batchsize 2.
    # "output" and "loss" will be the respective output and loss of the final
    # batch (the default AnchorMode).

    output, loss = poptorch_model(data, labels)
    print(f"{labels[-1]}, {output}, {loss}")
            self._all_labels.append(label)

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return self._all_data[index], self._all_labels[index]


# simple_ipu_start
# Set up the PyTorch DataLoader to load that much data at each iteration
opts = poptorch.Options()
opts.deviceIterations(10)
training_data = poptorch.DataLoader(options=opts,
                                    dataset=ExampleDataset(shape=[1],
                                                           length=20000),
                                    batch_size=10,
                                    shuffle=True,
                                    drop_last=True)

model = ExampleModelWithLoss()
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Wrap the model in a PopTorch training wrapper
poptorch_model = poptorch.trainingModel(model,
                                        options=opts,
                                        optimizer=optimizer)

momentum_loss = None
Example #24
opts = poptorch.Options()
# Device "step"
opts.deviceIterations(20)

# How many IPUs to replicate over.
opts.replicationFactor(4)

opts.randomSeed(42)

# Load MNIST normally.
training_data = poptorch.DataLoader(
    opts,
    torchvision.datasets.MNIST('mnist_data/',
                               train=True,
                               download=True,
                               transform=torchvision.transforms.Compose([
                                   torchvision.transforms.ToTensor(),
                                   torchvision.transforms.Normalize((0.1307, ),
                                                                    (0.3081, ))
                               ])),
    batch_size=training_batch_size,
    shuffle=True)

# Load MNIST normally.
val_options = poptorch.Options()
validation_data = poptorch.DataLoader(
    val_options,
    torchvision.datasets.MNIST('mnist_data/',
                               train=True,
                               download=True,
                               transform=torchvision.transforms.Compose([
                                   torchvision.transforms.ToTensor(),
Example #25
def get_data(configs, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in a dataloader.
    """
    if configs.precision.startswith("16."):
        half_precision = True
    elif configs.precision.startswith("32."):
        half_precision = False
    transform = get_preprocessing_pipeline(
        train, 224, half_precision, configs.normalization_location == "host")
    # Determine the size of the small datasets
    if hasattr(configs, "iterations"):
        dataset_size = configs.micro_batch_size * \
            model_opts.device_iterations * \
            model_opts.replication_factor * \
            model_opts.Training.gradient_accumulation * \
            configs.iterations

    rebatched_worker_size = None

    # Select the right dataset
    if configs.dataset in ["synthetic", "generated"]:
        if hasattr(configs, "iterations"):
            dataset = GeneratedDataset((3, 224, 224),
                                       size=dataset_size,
                                       half_precision=half_precision)
        else:
            dataset = GeneratedDataset((3, 224, 224),
                                       half_precision=half_precision)

    elif configs.dataset in ["imagenet1k", "imagenet21k"]:
        dataset = torchvision.datasets.ImageFolder(os.path.join(
            configs.dataset_path, "train" if train else "validation"),
                                                   transform=transform)
        if train:
            rebatched_worker_size = 128

    elif configs.dataset == "cifar10":
        dataset = torchvision.datasets.CIFAR10(root=configs.dataset_path,
                                               train=train,
                                               download=True,
                                               transform=transform)
        if train:
            rebatched_worker_size = 256
    else:
        raise Exception('Dataset type not recognized: %s' % configs.dataset)

    mode = poptorch.DataLoaderMode.AsyncRebatched if async_dataloader else poptorch.DataLoaderMode.Sync
    dataloader = poptorch.DataLoader(
        model_opts,
        dataset,
        batch_size=configs.micro_batch_size
        if not (isinstance(dataset, IterableDataset)) else None,
        num_workers=configs.dataloader_workers,
        shuffle=train and not (isinstance(dataset, IterableDataset)),
        drop_last=not (isinstance(dataset, IterableDataset)),
        persistent_workers=True,
        auto_distributed_partitioning=not isinstance(dataset, IterableDataset),
        worker_init_fn=None,
        mode=mode,
        rebatched_worker_size=rebatched_worker_size,
        async_options={
            'load_indefinitely': True,
            "buffer_size": 8
        })
    return dataloader
Example #26
def get_data(args, opts, train=True, async_dataloader=False, return_remaining=False, fine_tuning=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in a dataloader.
    """
    logging.info("Loading the data")
    input_shape = models.model_input_shape(args, train)
    if args.precision[:3] == "16.":
        half_precision = True
    elif args.precision[:3] == "32.":
        half_precision = False
    use_bbox_info = getattr(args, "use_bbox_info", False)

    if args.data in ["real", "imagenet", "cifar10"]:
        transform = get_preprocessing_pipeline(train, input_shape[-1],
                                               half_precision, args.normalization_location == "host", eightbit = args.eight_bit_io,
                                               use_bbox_info=use_bbox_info, fine_tuning=fine_tuning)
    # Determine the size of the small datasets
    if hasattr(args, "iterations"):
        dataset_size = args.batch_size * \
                       opts.device_iterations * \
                       opts.replication_factor * \
                       opts.Training.gradient_accumulation * \
                       args.iterations

    # Select the right dataset
    if args.data in ["synthetic", "generated"]:
        if hasattr(args, "iterations"):
            dataset = GeneratedDataset(input_shape, size=dataset_size, half_precision=half_precision, eightbit=args.eight_bit_io)
        else:
            dataset = GeneratedDataset(input_shape, half_precision=half_precision, eightbit=args.eight_bit_io)
    elif args.data == "real":
        data_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("images")
        if hasattr(args, "iterations"):
            dataset = SampleDataset(img_dir=data_path, transform=transform, size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif args.data == "imagenet":
        assert os.path.exists(args.imagenet_data_path), f"{args.imagenet_data_path} does not exist!"
        if os.path.exists(os.path.join(args.imagenet_data_path, 'metadata.json')):
            # WebDataset format
            dataset = get_webdataset(args, opts, train, transform=transform, use_bbox_info=use_bbox_info)
        else:
            data_folder = 'train' if train else 'validation'
            data_folder = os.path.join(args.imagenet_data_path, data_folder)
            if os.path.exists(data_folder):
                # Original ImageNet format
                bboxes = os.path.join(args.imagenet_data_path, 'imagenet_2012_bounding_boxes.csv') if use_bbox_info and train else None   # use bboxes only for training
                dataset = ImageNetDataset(data_folder, transform=transform, bbox_file=bboxes)
            else:
                # TFRecord format
                dataset = get_tfrecord(args, opts, train, transform=transform, use_bbox_info=use_bbox_info)
    elif args.data == "cifar10":
        data_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path, train=train, download=True, transform=transform)
    global_batch_size = args.batch_size * opts.device_iterations * opts.replication_factor * opts.Training.gradient_accumulation
    if async_dataloader:
        if global_batch_size == 1:
            # Avoid rebatch overhead
            mode = poptorch.DataLoaderMode.Async
        else:
            mode = poptorch.DataLoaderMode.AsyncRebatched
    else:
        mode = poptorch.DataLoaderMode.Sync
    worker_initialization = _WorkerInit(args.seed, opts.Distributed.processId, args.dataloader_worker) if hasattr(args, 'seed') else None
    rebatch_size = getattr(args, "dataloader_rebatch_size", None)
    rebatch_size = rebatch_size if rebatch_size is not None else min(1024, global_batch_size) // opts.Distributed.numProcesses
    # Make sure rebatch size is smaller than global batch size
    rebatch_size = min(rebatch_size, global_batch_size)
    dataloader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.dataloader_worker,
                                     shuffle=train and not isinstance(dataset, torch.utils.data.IterableDataset),
                                     drop_last=not return_remaining and not isinstance(dataset, torch.utils.data.IterableDataset),
                                     persistent_workers=True,
                                     auto_distributed_partitioning=not isinstance(dataset, torch.utils.data.IterableDataset),
                                     worker_init_fn=worker_initialization,
                                     mode=mode,
                                     rebatched_worker_size=rebatch_size,
                                     async_options={'load_indefinitely': True})
    if isinstance(dataset, torch.utils.data.IterableDataset):
        dataloader = DatasetRebatch(dataloader, global_batch_size, len(dataset), not(return_remaining))
    return dataloader
Example #27
def main():
    config = transformers.BertConfig(**(vars(parse_bert_args())))
    if not config.pretrained_checkpoint:
        logger(
            "[warning] --pretrained-checkpoint was not specified; training with uninitialized BERT..."
        )
    # Warnings for configs where embeddings may not fit
    if config.embedding_serialization_factor == 1:
        if config.replication_factor == 1:
            logger(
                "[warning] With replication_factor == 1 you may need to set "
                "embedding_serialization_factor > 1 for the model to fit")
        elif not config.replicated_tensor_sharding:
            logger(
                "[warning] With replicated_tensor_sharding=False you may need to set "
                "embedding_serialization_factor > 1 for the model to fit")
    samples_per_step = config.batches_per_step * config.micro_batch_size * \
        config.gradient_accumulation * config.replication_factor
    do_training = config.squad_do_training
    do_validation = config.squad_do_validation
    opts = get_options(config)
    opts.outputMode(poptorch.OutputMode.All)

    logger("Loading Dataset...")
    datasets = load_dataset("squad")
    train_dataset = datasets["train"]

    # Create train features from dataset
    logger("Tokenizing Train Dataset...")
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=1,
        remove_columns=train_dataset.column_names,
        load_from_cache_file=True,
    )

    # Create validation features from dataset
    logger("Tokenizing Validation Dataset...")
    validation_features = datasets["validation"].map(
        prepare_validation_features,
        batched=True,
        num_proc=1,
        remove_columns=datasets["validation"].column_names,
        load_from_cache_file=True,
    )

    # W&B
    if config.wandb and (not config.use_popdist or config.popdist_rank == 0):
        wandb.init(project="torch-bert",
                   settings=wandb.Settings(console="wrap"))
        wandb_config = vars(config)
        wandb_config['sdk_version'] = get_sdk_version()
        wandb.config.update(wandb_config)

    # Create the model
    if config.pretrained_checkpoint:
        model_ipu = PipelinedBertForQuestionAnswering.from_pretrained(
            config.pretrained_checkpoint, config=config).parallelize().half()
    else:
        model_ipu = PipelinedBertForQuestionAnswering(
            config).parallelize().half()

    if do_training:
        train_dl = poptorch.DataLoader(
            opts,
            train_dataset,
            batch_size=config.micro_batch_size,
            shuffle=True,
            drop_last=False,
            collate_fn=PadCollate(
                samples_per_step, {
                    "input_ids": 0,
                    "attention_mask": 0,
                    "token_type_ids": 0,
                    "start_positions": config.sequence_length,
                    "end_positions": config.sequence_length
                }))
        optimizer = get_optimizer(config, model_ipu)
        model_ipu.train()
        training_model = poptorch.trainingModel(model_ipu, opts, optimizer)

        sample_batch = next(iter(train_dl))
        logger("Compiling Model...")
        start_compile = time.perf_counter()
        training_model.compile(sample_batch["input_ids"],
                               sample_batch["attention_mask"],
                               sample_batch["token_type_ids"],
                               sample_batch["start_positions"],
                               sample_batch["end_positions"])

        duration_compilation = time.perf_counter() - start_compile
        logger(f"Compiled/Loaded model in {duration_compilation} secs")

        if config.compile_only:
            sys.exit()

        # Train
        scheduler = get_lr_scheduler(optimizer, "linear", config.lr_warmup,
                                     config.num_epochs * len(train_dl))
        logger("Training...")
        for epoch in range(config.num_epochs):
            for step, batch in enumerate(train_dl):
                start_step = time.perf_counter()
                outputs = training_model(batch["input_ids"],
                                         batch["attention_mask"],
                                         batch["token_type_ids"],
                                         batch["start_positions"],
                                         batch["end_positions"])

                scheduler.step()
                training_model.setOptimizer(optimizer)
                step_length = time.perf_counter() - start_step
                step_throughput = samples_per_step / step_length
                loss = outputs[0].mean().item()
                logger(
                    f"Epoch: {epoch}, Step:{step}, LR={scheduler.get_last_lr()[0]:.2e}, loss={loss:3.3f}, throughput={step_throughput:3.3f} samples/s"
                )

                if config.wandb:
                    wandb.log({
                        "Loss": loss,
                        "LR": scheduler.get_last_lr()[0],
                        "Step": step,
                        "Throughput": step_throughput
                    })
        training_model.detachFromDevice()

    if do_validation:
        config.micro_batch_size = 2
        config.batches_per_step = 16
        config.gradient_accumulation = 1
        config.replication_factor = 1
        samples_per_step = config.batches_per_step * config.micro_batch_size * \
            config.gradient_accumulation * config.replication_factor
        opts = get_options(config)
        opts.outputMode(poptorch.OutputMode.All)
        val_dl = poptorch.DataLoader(opts,
                                     validation_features.remove_columns(
                                         ['example_id', 'offset_mapping']),
                                     batch_size=config.micro_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     collate_fn=default_data_collator)
        raw_predictions = [[], []]
        model_ipu.eval()
        inference_model = poptorch.inferenceModel(model_ipu, opts)
        sample_batch = next(iter(val_dl))
        logger("Compiling Inference Model...")
        inference_model.compile(sample_batch["input_ids"],
                                sample_batch["attention_mask"],
                                sample_batch["token_type_ids"])

        if config.compile_only:
            sys.exit()

        logger("Validating...")
        for step, batch in enumerate(val_dl):
            start_step = time.perf_counter()
            outputs = inference_model(batch["input_ids"],
                                      batch["attention_mask"],
                                      batch["token_type_ids"])
            step_length = time.perf_counter() - start_step
            step_throughput = samples_per_step / step_length
            raw_predictions[0].append(outputs[0])
            raw_predictions[1].append(outputs[1])
            logger(f"Step:{step}, throughput={step_throughput} samples/s")

        raw_predictions[0] = torch.vstack(raw_predictions[0]).float().numpy()
        raw_predictions[1] = torch.vstack(raw_predictions[1]).float().numpy()
        final_predictions = postprocess_qa_predictions(datasets["validation"],
                                                       validation_features,
                                                       raw_predictions)
        metric = load_metric("squad")
        formatted_predictions = [{
            "id": k,
            "prediction_text": v
        } for k, v in final_predictions.items()]
        references = [{
            "id": ex["id"],
            "answers": ex["answers"]
        } for ex in datasets["validation"]]
        metrics = metric.compute(predictions=formatted_predictions,
                                 references=references)
        logger(metrics)
        if config.wandb:
            for k, v in metrics.items():
                wandb.run.summary[k] = v
    # Setup a Poptorch training model
    training_model = poptorch.trainingModel(
        model, opts,
        poptorch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9))

    # Create a dataset from random data
    features = torch.randn([10000, 1, 128, 128])
    labels = torch.empty([10000], dtype=torch.long).random_(10)
    dataset = torch.utils.data.TensorDataset(features, labels)
    print("Dataset size: ", len(dataset))

    # Poptorch Dataloader
    training_data = poptorch.DataLoader(opts,
                                        dataset=dataset,
                                        batch_size=bs,
                                        shuffle=True,
                                        drop_last=True,
                                        num_workers=num_workers,
                                        mode=poptorch.DataLoaderMode.Async,
                                        async_options={"early_preload": True})

    # Number of steps necessary to consume the whole dataset
    steps = len(training_data)

    # Assess asynchronous dataloader throughput on CPU
    print("Evaluating Dataloader: ", steps, "steps")
    t0 = time.time()
    for data, labels in training_data:
        pass
    t1 = time.time()
    total_time = t1 - t0
    print("Total execution Time:", total_time, "s")
Example #29
model = ClassificationModel()

# **NOTE**: `self.training` is inherited from `torch.nn.Module` which
# initialises its value to `True`. Use `model.eval()` to set it to `False` and
# `model.train()` to switch it back to `True`.

# ### Prepare training for IPUs
# The compilation and execution on the IPU can be controlled using
# `poptorch.Options`. These options are used by PopTorch's wrappers such as
# `poptorch.DataLoader` and `poptorch.trainingModel`.
opts = poptorch.Options()

train_dataloader = poptorch.DataLoader(opts,
                                       train_dataset,
                                       batch_size=16,
                                       shuffle=True,
                                       num_workers=20)

# ### Train the model
# We will need another component in order to train our model: an optimizer. Its
# role is to apply the computed gradients to the model's weights to optimize
# (usually, minimize) the loss function using a specific algorithm. Not all
# PyTorch's ops are available at the moment, and for optimizers there are 4
# choices already: SGD, AdamW, LAMB and RMSProp.

# We'll use SGD as it's a very popular algorithm.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# We now introduce the `poptorch.trainingModel` wrapper, which will handle the
# training. It takes an instance of a `torch.nn.Module`, such as our custom
Example #30
def get_data(opts, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in a dataloader.
    """
    if opts.precision[:3] == "16.":
        half_precision = True
    elif opts.precision[:3] == "32.":
        half_precision = False
    transform = get_preprocessing_pipeline(
        train, models.available_models[opts.model]["input_shape"],
        half_precision)
    # Determine the size of the small datasets
    if hasattr(opts, "iterations"):
        dataset_size = opts.batch_size * \
                       model_opts.device_iterations * \
                       model_opts.replication_factor * \
                       model_opts.Training.gradient_accumulation * \
                       opts.iterations

    # Select the right dataset
    if opts.data == "synthetic":
        if hasattr(opts, "iterations"):
            dataset = SynthDataset(
                models.available_models[opts.model]["input_shape"],
                size=dataset_size,
                half_precision=half_precision)
        else:
            dataset = SynthDataset(
                models.available_models[opts.model]["input_shape"],
                half_precision=half_precision)
    elif opts.data == "real":
        data_path = Path(__file__).parent.absolute().joinpath("images")
        if hasattr(opts, "iterations"):
            dataset = SampleDataset(img_dir=data_path,
                                    transform=transform,
                                    size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif opts.data == "imagenet":
        if train:
            data_folder = 'train'
        else:
            data_folder = 'validation'
        dataset = torchvision.datasets.ImageFolder(os.path.join(
            opts.imagenet_data_path, data_folder),
                                                   transform=transform)
    elif opts.data == "cifar10":
        data_path = Path(__file__).parent.absolute().joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path,
                                               train=train,
                                               download=True,
                                               transform=transform)

    num_loader_workers = min(32, multiprocessing.cpu_count())
    dataloader = poptorch.DataLoader(model_opts,
                                     dataset,
                                     batch_size=opts.batch_size,
                                     num_workers=num_loader_workers,
                                     shuffle=train,
                                     drop_last=True)
    if async_dataloader:
        return poptorch.AsynchronousDataAccessor(dataloader,
                                                 load_indefinitely=True)
    else:
        return dataloader