Example #1
def test_training_attributes():
    def custom_loss(output, target):
        # Mean squared error with a scale
        loss = output - target
        loss = loss * loss * 5
        return poptorch.identity_loss(loss, reduction="mean")

    class Model(torch.nn.Module):
        def __init__(self, attr):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))
            self.attr = attr

        def getAttr(self):
            return self.attr

        def forward(self, x, target):
            x += 1
            x = poptorch.ipu_print_tensor(x) + self.bias
            return x, custom_loss(x, target)

    model = Model("MyAttr")
    input = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    poptorch_model = poptorch.trainingModel(model)

    poptorch_model(input, target)

    assert poptorch_model.getAttr() == poptorch_model.attr
    assert poptorch_model.attr == "MyAttr"
Example #2
def setupTraining(model, args):
    """
    Setup a training run using the CIFAR-10 training dataset.

    Uses the poptorch.DataLoader so that each training iteration executed on the
    IPU will incorporate:

        * (mini-)batch size
        * device iterations
        * replica factor
        * gradient accumulation factor

    Using poptorch.DataLoaderMode.Async allows loading the dataset on a separate
    thread.  This reduces the host/IPU communication overhead by using the time
    that the IPU is running to load the next batch on the CPU.
    """
    opts = setupOptions(args, train=True)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    training_model = poptorch.trainingModel(model, opts, optimizer)
    dataset = cifar10(args.data_dir, train=True)

    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=8,
                                 mode=poptorch.DataLoaderMode.Async)

    return training_model, loader
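The docstring above lists the factors that poptorch.DataLoader folds into every host-side batch. A minimal sketch of that arithmetic, using hypothetical values rather than the ones carried in `args`:

import poptorch

opts = poptorch.Options()
opts.deviceIterations(4)                  # device iterations
opts.replicationFactor(2)                 # replica factor
opts.Training.gradientAccumulation(8)     # gradient accumulation factor

batch_size = 16                           # (mini-)batch size
# Each item yielded by a poptorch.DataLoader built with these options carries
# batch_size * device_iterations * replicas * gradient_accumulation samples,
# i.e. 16 * 4 * 2 * 8 = 1024, all consumed by one call to the training model.
combined_batch = batch_size * 4 * 2 * 8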
Example #3
    def pre_dispatch(self) -> None:
        precision = self.lightning_module.trainer.precision
        model = LightningIPUModule(self.lightning_module, precision)
        self.model = model

        # reset the backup
        self.poptorch_models = {}

        # Separate models are instantiated for different stages, but they share the same weights on host.
        # When validation/test models are run, weights are synced first.
        trainer_fn = self.lightning_module.trainer.state.fn
        if trainer_fn in (TrainerFn.FITTING, TrainerFn.TUNING):
            # Create model for training and validation which will run on fit
            training_opts = self.training_opts
            inference_opts = self.inference_opts
            optimizer = self.lightning_module.trainer.optimizers[0]
            model = poptorch.trainingModel(model=model, options=training_opts, optimizer=optimizer)
            self.poptorch_models[RunningStage.TRAINING] = model

            if self.lightning_module.trainer.enable_validation:
                model = poptorch.inferenceModel(model=model, options=inference_opts)
                self.poptorch_models[RunningStage.VALIDATING] = model
        elif trainer_fn == TrainerFn.VALIDATING:
            model = poptorch.inferenceModel(model=model, options=self.inference_opts)
            self.poptorch_models[RunningStage.VALIDATING] = model
        elif trainer_fn == TrainerFn.TESTING:
            model = poptorch.inferenceModel(model=model, options=self.inference_opts)
            self.poptorch_models[RunningStage.TESTING] = model
        elif trainer_fn == TrainerFn.PREDICTING:
            model = poptorch.inferenceModel(model=model, options=self.inference_opts)
            self.poptorch_models[RunningStage.PREDICTING] = model
Example #4
def convert_to_ipu_model(model, opts, optimizer):
    model_opts = create_model_opts(opts)
    # PopART settings
    if opts.enable_stochastic_rounding:
        model_opts.Popart.set("enableStochasticRounding", True)
    if opts.data == "synthetic":
        model_opts.Popart.set("syntheticDataMode", 2)
    if opts.half_partial:
        model_opts.Popart.set("partialsTypeMatMuls", "half")
        model_opts.Popart.set("convolutionOptions", {'partialsType': 'half'})

    if opts.enable_pipeline_recompute and len(opts.pipeline_splits) > 0:
        model_opts.Popart.set("autoRecomputation", 3)

    # disable prefetch to save memory
    if opts.replicas > 1:
        model_opts.Popart.set("enablePrefetchDatastreams", False)
    model_opts.Popart.set("disableGradAccumulationTensorStreams", True)

    num_stages = len(opts.pipeline_splits)+1
    if len(opts.available_memory_proportion) == 1:
        model_opts.setAvailableMemoryProportion({f'IPU{i}': opts.available_memory_proportion[0] for i in range(num_stages)})
    elif len(opts.available_memory_proportion) > 1:
        model_opts.setAvailableMemoryProportion({f'IPU{i}': amp for i, amp in enumerate(opts.available_memory_proportion)})

    # Scale the loss to be the same as bs=1 on a single IPU training.
    loss_scaling_factor = (1.0 / opts.batch_size)
    model_with_loss = TrainingModelWithLoss(model, loss_scaling_factor)
    training_model = poptorch.trainingModel(model_with_loss, model_opts, optimizer=optimizer)
    return training_model
Example #5
    def pre_dispatch(self) -> None:
        self._handle_gradient_accumulation_steps()
        if self.convert_model_to_half:
            log.info(
                'Using full 16bit precision, converting LightningModule weights to FP16.'
            )
            self.model = self.model.half()
        precision = self.lightning_module.trainer.precision
        precision = 16 if self.convert_model_to_half else precision

        model = LightningIPUModule(self.lightning_module, precision)
        self.model = model

        # Separate models are instantiated for different stages, but they share the same weights on host.
        # When validation/test models are run, weights are synced first.

        if self.lightning_module.trainer.state.stage is RunningStage.TRAINING:
            # Create model for training which will run training.
            optimizer = self.lightning_module.trainer.optimizers[0]
            model = poptorch.trainingModel(model=model,
                                           options=self.training_opts,
                                           optimizer=optimizer)
            self.poptorch_models[RunningStage.TRAINING] = model
        for x in (RunningStage.VALIDATING, RunningStage.TESTING,
                  RunningStage.PREDICTING):
            model = poptorch.inferenceModel(
                model=model,
                options=self.inference_opts,
            )
            self.poptorch_models[x] = model
Example #6
 def test_training(self):
     model = Yolov4Head(self.anchors, num_input_channels=32,
                        num_classes=3, stride=8, calculate_loss=True, precision=torch.half)
     optimizer = torch.optim.SGD(
         model.parameters(), lr=0.01, momentum=0.9, nesterov=False)
     model = trainingModel(model.half(), optimizer=optimizer)
     loss = model(self.input_tensor)
     assert torch.numel(loss) == 1
Example #7
def test_2x2_parallel_phased_execution_opts(capfd):
    poptorch.setLogLevel(1)  # Force debug logging
    N = 3
    size = 10

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weights = []
            for n in range(N * 6):
                weight = torch.nn.Parameter(torch.rand(size, size),
                                            requires_grad=True)
                self.register_parameter(f"w{n}", weight)
                self.weights.append(weight)

        def forward(self, in0, target=None):
            phase = 0
            weight = iter(self.weights)
            with poptorch.Block("phase0_ipu0"):
                ins = torch.split(in0, size)
            for n in range(N * 3):
                out = []
                for ipu in range(2):
                    x = ins[ipu]
                    with poptorch.Block(f"phase{phase}_ipu{ipu}"):
                        x = torch.matmul(next(weight), x)
                        out.append(F.relu(x))
                ins = out[1], out[0]
                # We want 2 matmuls in the same phase
                if n % 3 != 1:
                    phase += 1
            with poptorch.Block(f"phase{N*2-1}_ipu1"):
                res = ins[0] + ins[1]
                if target is None:
                    return res
                return res, torch.nn.L1Loss(reduction="mean")(res, target)

    input = torch.rand(size * 2, 1)
    target = torch.rand(size, 1)
    model = Model()
    opts = poptorch.Options()
    phases = []
    # Alternate between 0-2 and 1-3
    for n in range(N):
        phases.append([
            poptorch.Stage(f"phase{2*n}_ipu0").ipu(0),
            poptorch.Stage(f"phase{2*n}_ipu1").ipu(2)
        ])
        phases.append([
            poptorch.Stage(f"phase{2*n+1}_ipu0").ipu(1),
            poptorch.Stage(f"phase{2*n+1}_ipu1").ipu(3)
        ])
    opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases))
    poptorch_model = poptorch.trainingModel(model, opts)
    poptorch_model.compile(input, target)

    testlog = LogChecker(capfd)
    testlog.validate_2x2_parallel_phased_execution()
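The test above drives the more exotic ParallelPhasedExecution strategy; the same Block/Stage annotations read more easily in a plain two-stage pipeline. A hedged sketch with a made-up TwoStage model (options only, no compilation; a real pipelined training run also needs gradient accumulation configured to keep the pipeline filled):

import torch
import poptorch


class TwoStage(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(10, 10)
        self.l2 = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target):
        # Each Block names the stage its ops belong to.
        with poptorch.Block("stage0"):
            x = self.l1(x)
        with poptorch.Block("stage1"):
            x = self.l2(x)
            loss = self.loss(x, target)
        return x, loss


opts = poptorch.Options()
# Map each named stage to an IPU, just like the Stage(...).ipu(...) calls above.
opts.setExecutionStrategy(
    poptorch.PipelinedExecution(
        poptorch.Stage("stage0").ipu(0),
        poptorch.Stage("stage1").ipu(1)))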
Example #8
 def test_training(self):
     model = Yolov4P5BackBone(3, nn.ReLU(), calculate_loss=True)
     optimizer = torch.optim.SGD(model.parameters(),
                                 lr=0.01,
                                 momentum=0.9,
                                 nesterov=False)
     model = trainingModel(model.half(), optimizer=optimizer)
     _, _, _, loss = model(torch.Tensor(np.random.randn(1, 3, 64, 64)))
     assert torch.numel(loss) == 1
Example #9
def convert_to_ipu_model(model, opts, optimizer):
    model_opts = create_model_opts(opts)
    model_opts = utils.train_settings(opts, model_opts)
    replica_count = opts.replicas * (opts.popdist_size
                                     if opts.use_popdist else 1)
    model_with_loss = TrainingModelWithLoss(
        model, replicas=replica_count, label_smoothing=opts.label_smoothing)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    return training_model
Example #10
def convert_to_ipu_model(model, opts, optimizer):
    model_opts = create_model_opts(opts)
    # PopART settings
    model_opts.Popart.set("enableStochasticRounding",
                          opts.enable_stochastic_rounding)
    if opts.data == "synthetic":
        model_opts.Popart.set("syntheticDataMode",
                              int(popart.SyntheticDataMode.RandomNormal))
    if opts.half_partial:
        model_opts.Popart.set("partialsTypeMatMuls", "half")
        model_opts.Popart.set("convolutionOptions", {'partialsType': 'half'})

    if opts.enable_pipeline_recompute and len(opts.pipeline_splits) > 0:
        model_opts.Popart.set("autoRecomputation",
                              int(popart.RecomputationType.Pipeline))

    # disable prefetch to save memory
    if opts.replicas > 1:
        model_opts.Popart.set("enablePrefetchDatastreams", False)
    model_opts.Popart.set("disableGradAccumulationTensorStreams", True)

    num_stages = len(opts.pipeline_splits) + 1
    if len(opts.available_memory_proportion) == 1:
        model_opts.setAvailableMemoryProportion({
            f'IPU{i}': opts.available_memory_proportion[0]
            for i in range(num_stages)
        })
    elif len(opts.available_memory_proportion) > 1:
        model_opts.setAvailableMemoryProportion({
            f'IPU{i}': amp
            for i, amp in enumerate(opts.available_memory_proportion)
        })

    if opts.reduction == 'mean':
        model_opts.Popart.set('accumulationReductionType',
                              int(popart.ReductionType.Mean))

    if opts.disable_metrics:
        # if not interested in accurate metrics, return only subset of the predictions
        model_opts.anchorMode(poptorch.AnchorMode.Final)
    else:
        model_opts.anchorMode(poptorch.AnchorMode.All)

    # Scale the loss to be the same as bs=1 on a single IPU training.
    loss_scaling = 1.0 / opts.batch_size if opts.reduction == 'sum' else 1.0
    model_with_loss = TrainingModelWithLoss(
        model,
        loss_scaling=loss_scaling,
        label_smoothing=opts.label_smoothing,
        reduction=opts.reduction)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    return training_model
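The loss-scaling comment above can be checked with a line of arithmetic. An illustrative snippet with a hypothetical batch size (not read from `opts`):

import torch

batch_size = 8                              # hypothetical value
per_sample_losses = torch.ones(batch_size)  # pretend every sample's loss is 1.0
sum_loss = per_sample_losses.sum()          # 8.0 under reduction='sum'
loss_scaling = 1.0 / batch_size
# Scaling the summed loss recovers the value a mean-reduced loss would give at
# batch size 1, which is what the "same as bs=1" comment above refers to.
assert sum_loss * loss_scaling == per_sample_losses.mean()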
Example #11
def convert_to_ipu_model(model, args, optimizer):
    opts = create_training_opts(args)
    model_with_loss = TrainingModelWithLoss(
        model,
        label_smoothing=args.label_smoothing,
        use_mixup=args.mixup_enabled,
        use_cutmix=args.cutmix_enabled)
    training_model = poptorch.trainingModel(model_with_loss,
                                            opts,
                                            optimizer=optimizer)
    return training_model
Example #12
 def run_model(opts):
     input_data = torch.ones(4, 1)
     labels_data = torch.ones(4).long()
     model = torch.nn.Linear(1, 2, bias=False)
     model_with_loss = TrainingModelWithLoss(model, 0.1)
     optimizer = SGD(model_with_loss.parameters(), lr=0.1, momentum=0., use_combined_accum=True)
     training_model = poptorch.trainingModel(model_with_loss, opts, optimizer=optimizer)
     for _ in range(3):
         preds, loss, _ = training_model(input_data, labels_data)
     # return the weights of the model
     return list(model_with_loss.model.named_parameters())[0][1], loss
Example #13
 def test_training(self):
     model = Yolov4P5Neck(nn.ReLU(), calculate_loss=True)
     optimizer = torch.optim.SGD(model.parameters(),
                                 lr=0.01,
                                 momentum=0.9,
                                 nesterov=False)
     model = trainingModel(model.half(), optimizer=optimizer)
     x = (torch.Tensor(np.random.randn(1, 1024, 2, 2)),
          torch.Tensor(np.random.randn(1, 512, 4, 4)),
          torch.Tensor(np.random.randn(1, 256, 8, 8)))
     _, _, _, loss = model(x)
     assert torch.numel(loss) == 1
Example #14
    def setup(self, trainer: "pl.Trainer") -> None:
        # set the `accumulate_grad_batches` property as early as possible
        self._handle_gradient_accumulation_steps()

        # patch the dataloader creation function with the custom `poptorch.DataLoader`.
        # this violates the intended control flow for the plugins, but since this is experimental, we have chosen
        # to use the simpler solution before adding abstractions to override the `DataLoader` class
        self._update_dataloader_original = pl.trainer.connectors.data_connector._update_dataloader
        pl.trainer.connectors.data_connector._update_dataloader = self._convert_to_poptorch_loader

        super().setup(trainer)

        # disable the `optimizer_zero_grad` function by setting it to `None`.
        # this is because the IPU zeros the gradients internally
        self._optimizer_zero_grad_original = self.lightning_module.optimizer_zero_grad
        self._disable_zero_grad()

        model = LightningIPUModule(self.lightning_module,
                                   self.precision_plugin.precision)
        self.model = model

        # reset the backup
        self.poptorch_models = {}

        # Separate models are instantiated for different stages, but they share the same weights on host.
        # When validation/test models are run, weights are synced first.
        trainer_fn = self.lightning_module.trainer.state.fn
        if trainer_fn in (TrainerFn.FITTING, TrainerFn.TUNING):
            # Create model for training and validation which will run on fit
            training_opts = self.training_opts
            inference_opts = self.inference_opts
            optimizer = self.lightning_module.trainer.optimizers[0]
            model = poptorch.trainingModel(model=model,
                                           options=training_opts,
                                           optimizer=optimizer)
            self.poptorch_models[RunningStage.TRAINING] = model

            if self.lightning_module.trainer.enable_validation:
                model = poptorch.inferenceModel(model=model,
                                                options=inference_opts)
                self.poptorch_models[RunningStage.VALIDATING] = model
        elif trainer_fn == TrainerFn.VALIDATING:
            model = poptorch.inferenceModel(model=model,
                                            options=self.inference_opts)
            self.poptorch_models[RunningStage.VALIDATING] = model
        elif trainer_fn == TrainerFn.TESTING:
            model = poptorch.inferenceModel(model=model,
                                            options=self.inference_opts)
            self.poptorch_models[RunningStage.TESTING] = model
        elif trainer_fn == TrainerFn.PREDICTING:
            model = poptorch.inferenceModel(model=model,
                                            options=self.inference_opts)
            self.poptorch_models[RunningStage.PREDICTING] = model
Example #15
def test_optimizer_groups_none_args():
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                             torch.nn.Linear(10, 10))
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, X, Y, Z, B=None):  # pylint: disable=unused-argument
            fwd = self.model(X)
            return fwd, self.loss(fwd, Y)

    model = Model()

    input = torch.randn(1, 10)
    target = torch.randint(0, 10, [1])

    # Start the optimizer as zero for both groups.
    poptorch_model = poptorch.trainingModel(
        model,
        optimizer=optim.AdamW([{
            'params': model.model[0].parameters(),
            "lr": 0.0
        }, {
            'params': model.model[1].parameters(),
            "lr": 0.0
        }],
                              lr=0.1))

    poptorch_model.compile(input, target, target)

    # Parameter is a soft copy by default oddly.
    weight1 = model.model[0].weight.clone()
    bias1 = model.model[0].bias.clone()
    weight2 = model.model[1].weight.clone()
    bias2 = model.model[1].bias.clone()

    _, _ = poptorch_model(input, target, target)
    for _ in range(0, 100):
        _, _ = poptorch_model(input, target, target)

    weight1_post, bias1_post = model.model[0].parameters()
    weight2_post, bias2_post = model.model[1].parameters()

    # Nothing should have changed.
    assert torch.equal(weight1, weight1_post)
    assert torch.equal(weight2, weight2_post)
    assert torch.equal(bias1, bias1_post)
    assert torch.equal(bias2, bias2_post)
Example #16
def test_access_scalar_parameter(use_half):
    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            x += 1

            # It is important to make sure the result of the print is used.
            x = poptorch.ipu_print_tensor(x)

            return x + self.bias

    def custom_loss(output, target):
        # Mean squared error with a scale
        loss = output - target
        loss = loss * loss * 5
        return poptorch.identity_loss(loss, reduction="mean")

    class ExampleModelWithCustomLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target=None):
            out = self.model(input)
            if target is not None:
                return out, custom_loss(out, target)
            return out

    model = ExampleModelWithCustomLoss()
    input = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    if use_half:
        model.half()
        input = input.half()
        target = target.half()
    poptorch_model = poptorch.trainingModel(model)
    original_bias = str(poptorch_model.model.model.bias)

    for _ in range(10):
        poptorch_model(input=input, target=target)

    updated_bias = str(poptorch_model.model.model.bias)
    assert original_bias != updated_bias

    poptorch_model.copyWeightsToHost()
    # Bias should already be up to date
    assert updated_bias == str(poptorch_model.model.model.bias)
Example #17
def test_training_model(conv_mode):
    model = ClassificationModel(conv_mode)

    # N, C, H, W
    x = torch.randn(5, 3, 32, 32)
    labels = torch.randint(low=1, high=10, size=(5, ))
    out, loss = model(x, labels)

    pop_model = poptorch.trainingModel(
        model, poptorch.Options(), torch.optim.SGD(model.parameters(),
                                                   lr=0.01))
    pop_out, pop_loss = pop_model(x, labels)
    torch.testing.assert_allclose(out, pop_out)
    torch.testing.assert_allclose(loss, pop_loss)
Example #18
def profile(model, args):
    """
    Profile a single training iteration on the IPU using synthetic data
    """
    opts = setupOptions(args)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    training_model = poptorch.trainingModel(model, opts, optimizer)

    # Generate a random dataset for profiling
    device_batch_size = args.batch_size * args.batches_per_step
    torch.manual_seed(0)
    data = torch.randn(device_batch_size, 3, 32, 32)
    labels = torch.randint(0, 10, (device_batch_size, ))
    _, _ = training_model(data, labels)
Example #19
def test_optimizer_SGD_nesterov():
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                             torch.nn.Linear(10, 10))
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, X, Y):
            fwd = self.model(X)
            return fwd, self.loss(fwd, Y)

    model = Model()

    with pytest.raises(ValueError,
                       match="Nesterov momentum is currently not supported"):
        poptorch.trainingModel(model,
                               optimizer=optim.SGD(model.parameters(),
                                                   nesterov=True,
                                                   momentum=0.1,
                                                   lr=0.001))
Example #20
def test_explicit_deletion(use_half):
    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            x += 1

            # It is important to make sure the result of the print is used.
            x = poptorch.ipu_print_tensor(x)

            return x + self.bias

    def custom_loss(output, target):
        # Mean squared error with a scale
        loss = output - target
        loss = loss * loss * 5
        return poptorch.identity_loss(loss, reduction="mean")

    class ExampleModelWithCustomLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target=None):
            out = self.model(input)
            if target is not None:
                return out, custom_loss(out, target)
            return out

    opts = poptorch.Options()
    # Both models will use the same IPU device.
    opts.useIpuId(1)

    model = ExampleModelWithCustomLoss()
    input = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    if use_half:
        model.half()
        input = input.half()
        target = target.half()
    training_model = poptorch.trainingModel(model, opts)
    inference_model = poptorch.inferenceModel(model, opts)

    training_model(input=input, target=target)
    training_model.destroy()

    inference_model(input)
Example #21
def get_model_and_loader(opt: argparse.ArgumentParser,
                         cfg: yacs.config.CfgNode):
    """Prepares the model and gets a new loader for the model.
    Parameters:
        opt: opt object containing options introduced in the command line
        cfg: yacs object containing the config
    Returns:
        model[Detector]: a torch Detector Model
        loader[DataLoader]: a torch or poptorch DataLoader containing the dataset specified in "cfg"
    """

    # Create model
    model = Yolov4P5(cfg)

    if cfg.model.mode == "train":
        model.train()
    else:
        model.eval()

        # Load weights and fuse some batch normalizations with some convolutions
        if cfg.model.normalization == 'batch':
            if opt.weights:
                print("loading pretrained weights")
                model = load_and_fuse_pretrained_weights(model, opt)
            model.optimize_for_inference()

    # Create the specific ipu options if cfg.model.ipu
    ipu_opts = ipu_options(opt, cfg, model) if cfg.model.ipu else None

    # Creates the loader
    loader = get_loader(opt, cfg, ipu_opts)

    # Calls the poptorch wrapper and compiles the model
    if cfg.model.ipu:
        if cfg.model.mode == "train":
            model = trainingModel(model, ipu_opts)
        else:
            model = inferenceModel(model, ipu_opts)
        try:
            img, _, _, _ = next(iter(loader))
            model.compile(img)
            warm_up_iterations = 100
            for _ in range(warm_up_iterations):
                _ = model(img)
        except Exception as e:
            print(e.args)
            exit(0)

    return model, loader
Example #22
 def _wrap_model(self, type):
     self.logger.info(f'wrapping model.')
     if type == 'train':
         self.torch_model.train()
         self.training_model = trainingModel(
             model=self.torch_model,
             options=self.ipu_options,
             optimizer=self.optimizer,
         )
         self.logger.info(f'wrapped training model.')
     elif type == 'val':
         self.torch_model.eval()
         self.val_model = inferenceModel(model=self.torch_model,
                                         options=self.ipu_options)
         self.logger.info(f'wrapped inference model.')
Example #23
 def train(model, recompute):
     input_data = torch.ones(1, 3, 224, 224)
     labels_data = torch.ones(1).long()
     opts = poptorch.Options()
     if recompute:
         opts._Popart.set("autoRecomputation", int(popart.RecomputationType.Standard))
     opts.outputMode(poptorch.OutputMode.All)
     opts.randomSeed(0)
     opts.Training.gradientAccumulation(1)
     opts.Precision.enableStochasticRounding(False)
     model_with_loss = TrainingModelWithLoss(model)
     optimizer = SGD(model_with_loss.parameters(), lr=0.01, momentum=0., use_combined_accum=True)
     training_model = poptorch.trainingModel(model_with_loss, opts, optimizer=optimizer)
     predictions = []
     for _ in range(3):
         preds, _, _ = training_model(input_data, labels_data)
         predictions.append(preds)
     training_model.destroy()
     return predictions
Example #24
def test_matmul_training():
    N, M, K, C = 100, 9, 7, 5

    class Net(torch.nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            torch.manual_seed(42)
            self.linear = torch.nn.Linear(K, K)
            self.softmax = torch.nn.LogSoftmax(dim=1)
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, x, y, target):
            x = self.linear(x)
            x = torch.matmul(x, y)
            return x, self.loss(x, target)

    torch.manual_seed(42)
    model = Net()
    opts = poptorch.Options()

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    torch.manual_seed(42)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer)
    x = torch.randn(N, M, K)
    y = torch.randn(K, K)
    target = torch.empty(N, M, K, dtype=torch.long).random_(0, C)

    for _ in range(0, 400):
        optimizer.zero_grad()
        poptorch_output, poptorch_loss = poptorch_model(x, y, target)
        native_output, native_loss = model(x, y, target)
        native_loss.backward(retain_graph=True)
        optimizer.step()

    torch.testing.assert_allclose(poptorch_output,
                                  native_output,
                                  rtol=1e-02,
                                  atol=1e-02)
    torch.testing.assert_allclose(poptorch_loss,
                                  native_loss,
                                  rtol=1e-03,
                                  atol=1e-03)
Example #25
 def train(model, recompute):
     input_data = torch.ones(1, 3, 224, 224)
     labels_data = torch.ones(1).long()
     model_opts = poptorch.Options()
     if recompute:
         model_opts.Popart.set("autoRecomputation",
                               int(popart.RecomputationType.Standard))
     model_opts.anchorMode(poptorch.AnchorMode.All)
     model_opts.randomSeed(0)
     model_opts.Training.gradientAccumulation(1)
     model_with_loss = TrainingModelWithLoss(model)
     optimizer = SGD(model_with_loss.parameters(), lr=0.01, momentum=0.)
     training_model = poptorch.trainingModel(model_with_loss,
                                             model_opts,
                                             optimizer=optimizer)
     predictions = []
     for _ in range(3):
         preds, loss = training_model(input_data, labels_data)
         predictions.append(preds)
     return predictions
Example #26
def test_constant_lrschedule():
    """
    Test that lr schedule "constant" results in unchanging LR
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
    --config unit_test
    --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting lr should equal the configured learning rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # Run for some steps
    for _ in range(5):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should be unchanged
    assert poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate
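The scheduler pattern in this test (step the host-side scheduler, then push the new state with setOptimizer) is independent of BERT. A minimal hedged sketch with a made-up SimpleModel and a standard torch scheduler, not taken from the example above:

import torch
import poptorch


class SimpleModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 8)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target):
        out = self.fc(x)
        return out, self.loss(out, target)


model = SimpleModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
poptorch_model = poptorch.trainingModel(model, poptorch.Options(), optimizer=optimizer)

data = torch.randn(4, 8)
target = torch.randn(4, 8)
for _ in range(30):
    poptorch_model(data, target)
    scheduler.step()
    # The scheduler only mutates the host-side torch optimizer; the updated
    # learning rate has to be pushed to the IPU explicitly.
    poptorch_model.setOptimizer(optimizer)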
Example #27
def test_recompute_checkpoint_not_in_ir():
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --weight-decay 0.0
    --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))

    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertForPretraining(config).parallelize().half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    datum = get_generated_datum(config)
    poptorch_model.compile(*datum)
    ir = json.loads(poptorch_model._debugGetPopartIR())
    assert not any(["Checkpoint" in node["name"] for node in ir["maingraph"]
                    ]), ("Popart IR should contain a checkpoint")

    # Stash: 5 inputs, and 1 stash for transformers on ipu1
    exp_num_stash = 5 + 1
    assert sum([
        "Stash" in node["type"] for node in ir["maingraph"]
    ]) == exp_num_stash, ("Both the graph input and the checkpoint(s) "
                          "should be stashed")
    print(sum(["Stash" in node["type"] for node in ir["maingraph"]]))
Example #28
    def process(process_id=0, num_processes=1):
        # Create a poptorch.Options instance to override default options
        opts = poptorch.Options()

        # Run a 400 iteration loop on the IPU, fetching a new batch each time
        opts.deviceIterations(400)

        # Replicate the graph across 2 IPUs in each process.
        opts.replicationFactor(2)

        # Set the id of the current process and the total number of processes.
        opts.Distributed.configureProcessId(process_id, num_processes)

        # Accumulate the gradient 8 times before applying it.
        opts.Training.gradientAccumulation(8)

        # Optional: All the processes must use the same seed if shuffle=True is used for the DataLoader.
        opts.randomSeed(42)

        training_data = poptorch.DataLoader(opts,
                                            dataset=ExampleDataset(
                                                shape=[3, 2], length=100000),
                                            batch_size=model_batch_size,
                                            shuffle=True,
                                            drop_last=True)

        # Wrap the model in a PopTorch training wrapper
        poptorch_model = poptorch.trainingModel(model, options=opts)

        # Run over the training data; each DataLoader batch covers every device
        # iteration, replica and gradient accumulation step for this process.
        for batch_number, (data, labels) in enumerate(training_data):
            # Execute the device with a 400 iteration loop across the two replicas in
            # this process. "output" and "loss" will be the respective output and loss of
            # the final batch of each replica (the default AnchorMode).
            output, loss = poptorch_model(data, labels)
            print(f"{batch_number} {labels[-1]}, {output}, {loss}")
Example #29
 def build_model(self):
     self.get_xs_mask()
     model_ipu = self.model[0]
     model_cpu = self.model[1]
     tensor_ = self.tensor_
     tensors = []
     for t in tensor_:
         self.opts.anchorTensor('model.' + t, 'model.' + t)
         self.opts.anchorTensor('Gradient___model.' + t,
                                'Gradient___model.' + t)
     optimizer = poptorch.optim.SGD(model_ipu.parameters(), lr=0.001)
     training_model = poptorch.trainingModel(model_ipu,
                                             options=self.opts,
                                             optimizer=optimizer)
     model_dict = model_cpu.state_dict()
     model_one_iter = training_model.model.state_dict()
     pretrained_dict_model_one_iter = {
         k: v
         for k, v in model_one_iter.items() if k in model_dict
     }
     model_dict.update(pretrained_dict_model_one_iter)
     model_cpu.load_state_dict(model_dict)
     self.model_cpu = model_cpu
     self.training_model = training_model
Example #30
class ExampleModelWithLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target=None):
        fc = self.fc(x)
        if self.training:
            return fc, self.loss(fc, target)
        return fc


torch.manual_seed(0)
model = ExampleModelWithLoss()

# Wrap the model in our PopTorch annotation wrapper.
poptorch_model = poptorch.trainingModel(model)

# Some dummy inputs.
input = torch.randn(10)
target = torch.randn(10)

# Train on IPU.
for i in range(0, 100):
    # Each call here executes the forward pass, loss calculation, and backward
    # pass in one step.
    # Model input and loss function input are provided together.
    poptorch_out, loss = poptorch_model(input, target)
    print(f"{i}: {loss}")

# Copy the trained weights from the IPU back into the host model.
poptorch_model.copyWeightsToHost()
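Once copyWeightsToHost() has run, the plain `model` object holds the trained parameters, so it can be used for inference on the CPU. A short continuation of the snippet above (in eval mode the forward pass returns only the prediction, no loss):

# Host-side inference with the weights copied back from the IPU.
model.eval()
with torch.no_grad():
    host_prediction = model(input)
print(host_prediction)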