Beispiel #1
0
    def __init__(
        self,
        device_iterations: int = 1,
        autoreport: bool = False,
        autoreport_dir: Optional[str] = None,
        parallel_devices: Optional[List[torch.device]] = None,
        cluster_environment: Optional[ClusterEnvironment] = None,
        checkpoint_io: Optional[CheckpointIO] = None,
        training_opts: Optional["poptorch.Options"] = None,
        inference_opts: Optional["poptorch.Options"] = None,
    ) -> None:
        """
        Arguments:

            device_iterations: Number of iterations to run on device at once before returning to host.
                This can be used as an optimization to speed up training.
                https://docs.graphcore.ai/projects/poptorch-user-guide/en/0.1.67/batching.html
            autoreport: Enable auto-reporting for IPUs using PopVision
                https://docs.graphcore.ai/projects/graphcore-popvision-user-guide/en/latest/graph/graph.html
            autoreport_dir: Optional directory to store autoReport output. The directory is created
                if it does not exist. Only used when ``autoreport`` is enabled.
            parallel_devices: Forwarded unchanged to the parent class constructor.
            cluster_environment: Forwarded unchanged to the parent class constructor.
            checkpoint_io: Forwarded unchanged to the parent class constructor.
            training_opts: Optional ``poptorch.Options`` to override the default created options for training.
            inference_opts: Optional ``poptorch.Options`` to override the default
                created options for validation/testing and predicting.

        Raises:
            MisconfigurationException: If PopTorch is not installed or no IPU hardware is available.
        """
        super().__init__(
            parallel_devices=parallel_devices,
            cluster_environment=cluster_environment,
            checkpoint_io=checkpoint_io,
        )
        # The first operand short-circuits, so ``poptorch`` is only touched when it is importable.
        if not _POPTORCH_AVAILABLE or not poptorch.ipuHardwareIsAvailable():
            raise MisconfigurationException(
                "The IPU Accelerator requires IPU devices to run. "
                "Learn more or get started with IPUs at https://www.graphcore.ai/getstarted"
            )

        self.device_iterations = device_iterations
        self.autoreport = autoreport
        self.autoreport_dir = autoreport_dir
        self.poptorch_models = {}
        self._original_accumulate_grad_batches = None
        self._training_opts = training_opts
        self._inference_opts = inference_opts

        if self.autoreport:
            options = {"autoReport.all": self.autoreport}
            if self.autoreport_dir:
                # Normalise once to ``str``: path-like objects (e.g. ``pathlib.Path``) are not
                # JSON serialisable and would make ``json.dumps`` below raise ``TypeError``.
                report_dir = str(self.autoreport_dir)
                self._fs = get_filesystem(report_dir)
                self._fs.makedirs(report_dir, exist_ok=True)
                options["autoReport.directory"] = report_dir
            # NOTE: this replaces any ``POPLAR_ENGINE_OPTIONS`` value already present in the environment.
            os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options)
Beispiel #2
0
# Availability flags for optional dependencies and features, evaluated once
# at import time.
_GROUP_AVAILABLE = (
    not _IS_WINDOWS and _module_available("torch.distributed.group")
)
_HOROVOD_AVAILABLE = _module_available("horovod.torch")
_HYDRA_AVAILABLE = _module_available("hydra")
_HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental")
# The version flag is tested first so ``torch.profiler`` is only accessed
# when that flag is truthy.
_KINETO_AVAILABLE = (
    _TORCH_GREATER_EQUAL_1_8_1 and torch.profiler.kineto_available()
)
_NATIVE_AMP_AVAILABLE = (
    _module_available("torch.cuda.amp")
    and hasattr(torch.cuda.amp, "autocast")
)
_OMEGACONF_AVAILABLE = _module_available("omegaconf")
_POPTORCH_AVAILABLE = _module_available("poptorch")
# True when at least one supported quantization engine is not 'none'.
_TORCH_QUANTIZE_AVAILABLE = any(
    engine != "none"
    for engine in torch.backends.quantized.supported_engines
)
_TORCHTEXT_AVAILABLE = _module_available("torchtext")
_TORCHVISION_AVAILABLE = _module_available("torchvision")
_TORCHMETRICS_LOWER_THAN_0_3 = _compare_version(
    "torchmetrics", operator.lt, "0.3.0"
)
_TORCHMETRICS_GREATER_EQUAL_0_3 = _compare_version(
    "torchmetrics", operator.ge, "0.3.0"
)
_XLA_AVAILABLE = _module_available("torch_xla")

# Imported after the flags above are defined (hence the E402 suppression).
from pytorch_lightning.utilities.xla_device import XLADeviceUtils  # noqa: E402

_TPU_AVAILABLE = XLADeviceUtils.tpu_device_exists()

# Default to "no IPU"; only query the hardware when poptorch is importable.
_IPU_AVAILABLE = False
if _POPTORCH_AVAILABLE:
    import poptorch
    _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable()
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
# iterations_start
from functools import reduce
from operator import mul

import sys
import torch
import poptorch

# Stop the script early, with a success exit status, when no IPU hardware is
# detected; the printed message states the reason.
if not poptorch.ipuHardwareIsAvailable():
    print("Replicated top level graphs are not supported on the IPU model")
    sys.exit(0)


class ExampleModelWithLoss(torch.nn.Module):
    """Single linear layer over flattened inputs, bundled with its loss.

    Arguments:
        data_shape: Per-sample shape; the product of its entries is the
            number of input features of the linear layer.
        num_classes: Number of output features of the linear layer.
    """

    def __init__(self, data_shape, num_classes):
        super().__init__()

        in_features = reduce(mul, data_shape)
        self.fc = torch.nn.Linear(in_features, num_classes)
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, x, target=None):
        """Return the linear output, plus the cross-entropy loss if ``target`` is given."""
        # Keep the leading (batch) dimension and flatten everything else.
        batch_size = x.shape[0]
        logits = self.fc(x.reshape([batch_size, -1]))

        if target is None:
            return logits
        return logits, self.loss(logits, target)


class ExampleDataset(torch.utils.data.Dataset):
Beispiel #4
0
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

import os
import tempfile
import pytest
import poptorch
import torch
import helpers


@pytest.mark.skipif(not poptorch.ipuHardwareIsAvailable(),
                    reason="Hardware IPU needed")
def test_ExecutableCaching(capfd):
    """Compile a trivial model with executable caching enabled and check
    that something was written into the cache directory."""
    poptorch.setLogLevel(1)  # Force debug logging

    class Model(torch.nn.Module):
        def forward(self, x):
            return x * 6

    with tempfile.TemporaryDirectory() as cache:
        opts = poptorch.Options()
        opts.enableExecutableCaching(cache)
        m = poptorch.inferenceModel(Model(), opts)
        m.compile(torch.rand(2, 3))
        m.destroy()
        log = helpers.LogChecker(capfd)
        log.assert_contains("set enableEngineCaching to value true")
        # Inspect the cache directory itself: a bare os.listdir() would list
        # the current working directory and pass regardless of caching.
        assert os.listdir(cache), "No executable saved in the cache"

        n = poptorch.inferenceModel(Model(), opts)
Beispiel #5
0
def example():
    """Train a small convolutional network on MNIST with PopTorch, then
    report its accuracy on the held-out MNIST test split.

    Exits the process with status 0 (after logging a warning) when no IPU
    hardware is available. Downloads MNIST into ``mnist_data/`` if needed.
    """
    # pylint: disable=import-outside-toplevel
    import sys
    import poptorch
    if not poptorch.ipuHardwareIsAvailable():
        poptorch.logger.warn("This examples requires IPU hardware to run")
        sys.exit(0)

    # pylint: disable=unused-variable, wrong-import-position, reimported, ungrouped-imports, wrong-import-order, import-outside-toplevel
    # mnist_start
    import torch
    import torch.nn as nn
    import torchvision
    import poptorch

    # Normal pytorch batch size
    training_batch_size = 20
    validation_batch_size = 100

    opts = poptorch.Options()
    # Device "step"
    opts.deviceIterations(20)

    # How many IPUs to replicate over.
    opts.replicationFactor(4)

    opts.randomSeed(42)

    # Load MNIST normally.
    training_data = poptorch.DataLoader(
        opts,
        torchvision.datasets.MNIST('mnist_data/',
                                   train=True,
                                   download=True,
                                   transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307, ), (0.3081, ))
                                   ])),
        batch_size=training_batch_size,
        shuffle=True)

    # Load the held-out MNIST test split so validation is not measured on
    # the samples the model was trained on.
    val_options = poptorch.Options()
    validation_data = poptorch.DataLoader(
        val_options,
        torchvision.datasets.MNIST('mnist_data/',
                                   train=False,
                                   download=True,
                                   transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307, ), (0.3081, ))
                                   ])),
        batch_size=validation_batch_size,
        shuffle=True,
        drop_last=True)

    # A helper block to build convolution-pool-relu blocks.
    class Block(nn.Module):
        def __init__(self, in_channels, num_filters, kernel_size, pool_size):
            super(Block, self).__init__()
            self.conv = nn.Conv2d(in_channels,
                                  num_filters,
                                  kernel_size=kernel_size)
            self.pool = nn.MaxPool2d(kernel_size=pool_size)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.conv(x)
            x = self.pool(x)
            x = self.relu(x)
            return x

    # Define the network using the above blocks.
    class Network(nn.Module):
        def __init__(self):
            super().__init__()
            self.layer1 = Block(1, 10, 5, 2)
            self.layer2 = Block(10, 20, 5, 2)
            self.layer3 = nn.Linear(320, 256)
            self.layer3_act = nn.ReLU()
            self.layer4 = nn.Linear(256, 10)

            self.softmax = nn.LogSoftmax(1)
            self.loss = nn.NLLLoss(reduction="mean")

        def forward(self, x, target=None):
            x = self.layer1(x)
            x = self.layer2(x)
            x = x.view(-1, 320)

            x = self.layer3_act(self.layer3(x))
            x = self.layer4(x)
            x = self.softmax(x)

            if target is not None:
                loss = self.loss(x, target)
                return x, loss
            return x

    # Create our model.
    model = Network()

    # Create model for training which will run on IPU.
    training_model = poptorch.trainingModel(model, training_data.options)

    # Same model as above, they will share weights (in 'model') which once training is finished can be copied back.
    inference_model = poptorch.inferenceModel(model, validation_data.options)

    def train():
        for batch_number, (data, labels) in enumerate(training_data):
            output, losses = training_model(data, labels)

            if batch_number % 10 == 0:
                print(f"PoptorchIPU loss at batch: {batch_number} is {losses}")

                # Pick the highest probability.
                _, ind = torch.max(output, 1)
                assert training_data.options.anchor_mode in (
                    poptorch.AnchorMode.All, poptorch.AnchorMode.Final
                ), "Only 'Final' and 'All' AnchorMode supported"
                # If we're using Final: only keep the last labels, no-op if using All
                num_labels = ind.shape[0]
                labels = labels[-num_labels:]

                # Count matches directly. Deriving the count from
                # torch.unique reported 0% when every prediction was correct,
                # because only one unique value is returned in that case.
                num_correct = torch.eq(ind, labels).sum().item()
                acc = (num_correct / num_labels) * 100.0

                print(
                    f"Training accuracy: {acc}% from batch of size {num_labels}"
                )
        print("Done training")

    def test():
        correct = 0
        total = 0
        with torch.no_grad():
            for (data, labels) in validation_data:
                output = inference_model(data)

                # Argmax the probabilities to get the highest.
                _, ind = torch.max(output, 1)

                # Compare it against the ground truth for this batch and
                # count the number of matches.
                correct += torch.eq(ind, labels).sum().item()

                # Use the actual number of samples in this batch.
                total += labels.shape[0]
        print("Validation: of " + str(total) + " samples we got: " +
              str((correct / total) * 100.0) + "% correct")

    # Train on IPU.
    train()

    test()
Beispiel #6
0
def disableSmallModel():
    """Return environment overrides: ``POPTORCH_IPU_MODEL=1`` when no IPU
    hardware is available, otherwise an empty dict."""
    env_overrides = {}
    if not poptorch.ipuHardwareIsAvailable():
        # POPTORCH_IPU_MODEL takes precedence over POPTORCH_SMALL_IPU_MODEL
        env_overrides["POPTORCH_IPU_MODEL"] = "1"
    return env_overrides