Example 1
def compare_weights(session0, session1, tmpdir):
    ref_path = str(tmpdir / "ref_session.onnx")
    session0.modelToHost(ref_path)
    session0_proto = onnx.load(ref_path)
    session0_weights = {}
    session1_weights = {}
    for init in session0_proto.graph.initializer:
        dtype = mapping.TENSOR_TYPE_TO_NP_TYPE[init.data_type]
        # Allocate a separate buffer per session; sharing one array would
        # make the comparison below vacuous, as the second read would
        # overwrite the first.
        session0_weights[init.name] = np.empty(shape=init.dims, dtype=dtype)
        session1_weights[init.name] = np.empty(shape=init.dims, dtype=dtype)

    session0.weightsToHost()
    session0.readWeights(popart.PyWeightsIO(session0_weights))
    session1.weightsToHost()
    session1.readWeights(popart.PyWeightsIO(session1_weights))

    for init in session0_proto.graph.initializer:
        init_name = init.name
        print("Comparing ", init_name)
        print(session0_weights[init_name])
        print(session1_weights[init_name])
        assert np.array_equal(session0_weights[init_name],
                              session1_weights[init_name])
Example 2
def save_weights_for_decoder(builder, training_session,
                             weight_names_to_checkpt, precision,
                             decoder_weights_fp):
    """ save weights for greedy decoding to numpy files
    Note: weight_names_to_checkpt can be either actual model weights or exp-mov-averaged weights """

    # Initializing decoder parameter dictionary
    decoder_weights_dict = dict()
    for uname, wname in weight_names_to_checkpt[
            "prediction_network"] + weight_names_to_checkpt["joint_network"]:
        original_wname = wname.replace(ema_utils.EMA_PREFIX, '')
        param_shape = builder.getTensorShape(original_wname)
        decoder_weights_dict[wname] = np.empty(param_shape, precision)

    logger.info("Saving decoder weights to {}".format(decoder_weights_fp))
    weightsIo = popart.PyWeightsIO(decoder_weights_dict)
    training_session.readWeights(weightsIo)

    for uname, wname in weight_names_to_checkpt[
            "prediction_network"] + weight_names_to_checkpt["joint_network"]:
        # remove exp_mov_avg_ prefix from uname and add to dict so that validation script works
        original_uname = uname.replace(ema_utils.EMA_PREFIX, '')
        decoder_weights_dict[original_uname] = decoder_weights_dict[wname]
        decoder_weights_dict.pop(wname, None)
    np.save(decoder_weights_fp, decoder_weights_dict)

    # NOTE - these weights can be loaded as:
    # decoder_weights_dict =  np.load(decoder_weights_fp, allow_pickle=True)[()]

    return
Example 3
    def run_test(set_pipeline_stages):
        weights = {}

        def init_builder(builder):
            d0 = builder.addInputTensor(dummy_data, 'data0')
            w0 = builder.addInitializedInputTensor(weight_data)
            weights[w0] = np.empty(shape=weight_data.shape,
                                   dtype=weight_data.dtype)
            if inputType is not None:
                d0_float = builder.aiOnnx.cast([d0], "FLOAT")
                t0 = builder.aiOnnx.matmul([d0_float, w0])
            else:
                t0 = builder.aiOnnx.matmul([d0, w0])
            t1 = builder.aiOnnx.sin([t0])
            t2 = builder.aiOnnx.matmul([t1, w0])
            loss = builder.aiGraphcore.identityloss([t2])

            builder.addOutputTensor(loss)

            if set_pipeline_stages:
                if inputType is not None:
                    builder.pipelineStage(d0_float, 0)
                builder.pipelineStage(t0, 0)
                builder.pipelineStage(t1, 1)
                builder.pipelineStage(t2, 2)
                builder.pipelineStage(loss, 2)

                if inputType is not None:
                    builder.virtualGraph(d0_float, 0)
                builder.virtualGraph(t0, 0)
                builder.virtualGraph(t1, 1)
                builder.virtualGraph(t2, 0)
                builder.virtualGraph(loss, 0)

            return [loss]

        session = PopartTestSession()
        session.mode = 'train'
        session.options.enablePipelining = set_pipeline_stages
        session.device = 'ipu_model'
        if set_pipeline_stages:
            session.numIPUs = 2
            session.options.virtualGraphMode = popart.VirtualGraphMode.Manual
        session.batchesPerStep = bps
        session.options.enableGradientAccumulation = True
        session.options.accumulationFactor = accumulation_factor

        # test a pipeline stage appearing on multiple virtual graphs
        session.prepare(init_builder)

        sessionAnchors = session.run({'data0': data})
        assert len(sessionAnchors) == 1
        sessionAnchor = next(iter(sessionAnchors.values()))

        session._session.weightsToHost()
        weightsIo = popart.PyWeightsIO(weights)
        session._session.readWeights(weightsIo)
        assert len(weights) == 1
        return next(iter(weights.values())), sessionAnchor
Example 4
    def run_test(enable_recomputation):
        weights = {}

        def init_builder(builder):
            d0 = builder.addInputTensor(dummy_data, 'data0')
            w0 = builder.addInitializedInputTensor(weight_data)
            weights[w0] = np.empty(shape=weight_data.shape,
                                   dtype=weight_data.dtype)

            if inputType is not None:
                d0_float = builder.aiOnnx.cast([d0], "FLOAT")
                t0 = builder.aiOnnx.mul([d0_float, w0])
            else:
                t0 = builder.aiOnnx.mul([d0, w0])
            t1 = builder.aiOnnx.sigmoid([t0])
            t2 = builder.aiGraphcore.scale([t1], 2.0)
            loss = builder.aiGraphcore.identityloss([t2])

            if inputType is not None:
                builder.virtualGraph(d0_float, 0)

            for t in (t0, t1, t2):
                builder.virtualGraph(t, 0)

            builder.virtualGraph(loss, 1)

            return [loss]

        session = PopartTestSession()
        session.device = 'ipu_model'
        session.numIPUs = 2
        session.mode = 'train'
        session.options.virtualGraphMode = popart.VirtualGraphMode.Manual
        session.options.enablePipelining = True
        if enable_recomputation:
            session.options.autoRecomputation = popart.RecomputationType.Standard
        session.options.accumulationFactor = accumulationFactor
        session.options.enableGradientAccumulation = True

        session.prepare(init_builder)

        anchors = session.run({'data0': data})

        # return the weights
        session._session.weightsToHost()
        weightsIo = popart.PyWeightsIO(weights)
        session._session.readWeights(weightsIo)
        assert len(weights) == 1
        return next(iter(weights.values()))
Example 5
def test_sgd_with_zero_learning_rate():
    """
    In this test we check that we can run a training step with a zero
    learning rate, and that it behaves as expected (i.e. no weight update)
    """

    # Let's start with an optimizer with a variable, non-zero learning rate
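    # (each value below is a (value, isConst) pair: False marks the
    # parameter as variable, i.e. updatable later via
    # updateOptimizerFromHost)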
    optSettings = {
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.6, False),
        "lossScaling": (10.0, False)
    }
    stepSize = 2
    session, inputsUserSgd = trainSession({}, popart.SGD(optSettings),
                                          stepSize)
    anchorsArrays = session.initAnchorArrays()

    # Get the initial weights:
    fn = "init.onnx"
    session.modelToHost(fn)
    wId = "init_input"
    weights = {wId: np.empty(shape=[2, 2, 3, 3], dtype=np.float32)}
    weightsio = popart.PyWeightsIO(weights)
    session.readWeights(weightsio)
    init_weights = np.copy(weights[wId])

    # Run for a step with non-zero lr, observe that the weights have changed
    stepio = popart.PyStepIO(inputsUserSgd, anchorsArrays)
    session.run(stepio)
    session.weightsToHost()
    session.readWeights(weightsio)
    updated_weights = np.copy(weights[wId])
    assert not np.array_equal(init_weights, updated_weights)

    # Attempt to update the optimizer with a constant zero lr; this is
    # invalid, as a zero lr is only allowed if it is variable
    optSettings["defaultLearningRate"] = (0.0, True)
    with pytest.raises(popart.popart_exception) as e_info:
        session.updateOptimizerFromHost(popart.SGD(optSettings))
    assert e_info.value.args[0].startswith(
        "Constant, zero learning rate in SGD")

    # Run a training step with a variable zero lr, and confirm the weights
    # haven't updated
    optSettings["defaultLearningRate"] = (0.0, False)
    session.updateOptimizerFromHost(popart.SGD(optSettings))
    session.run(stepio)

    session.weightsToHost()
    session.readWeights(weightsio)
    assert np.array_equal(weights[wId], updated_weights)
Example 6
    def runModel(pipeline, recompute):
        builder = popart.Builder()
        in0 = builder.addInputTensor("FLOAT", dshape)
        in1 = builder.addInputTensor("INT32", lshape)
        w0 = builder.addInitializedInputTensor(w0_data)
        with builder.virtualGraph(0), builder.pipelineStage(0):
            x = builder.aiOnnx.matmul([in0, w0])
        with builder.virtualGraph(1), builder.pipelineStage(1):
            x = builder.aiOnnx.sqrt([x])
        with builder.virtualGraph(0), builder.pipelineStage(2):
            x = builder.aiOnnx.add([w0, x])
            loss = builder.aiGraphcore.nllloss([x, in1])

        opts = popart.SessionOptions()
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        opts.enablePipelining = pipeline
        if pipeline:
            opts.enableGradientAccumulation = True
            opts.accumulationFactor = bps
            test_bps = 1
        else:
            test_bps = bps

        if recompute:
            opts.autoRecomputation = popart.RecomputationType.Pipeline

        session = popart.TrainingSession(
            deviceInfo=popart.DeviceManager().createIpuModelDevice(
                {"numIPUs": "2"}),
            dataFlow=popart.DataFlow(test_bps, [loss]),
            fnModel=builder.getModelProto(),
            loss=loss,
            optimizer=popart.ConstSGD(0.1),
            userOptions=opts)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()
        stepio = popart.PyStepIO({in0: in0_data, in1: in1_data}, anchors)
        session.run(stepio)

        weights = {}
        weights[w0] = np.empty(shape=dshape, dtype=np.float32)
        weightsIo = popart.PyWeightsIO(weights)
        session.weightsToHost()
        session.readWeights(weightsIo)
        return weights[w0]
Example 7
    def getWeights(withPipelining):

        device = tu.create_test_device(numIpus=nIPUs, tilesPerIPU=1216)
        userOptions = popart.SessionOptions()
        userOptions.enableOutlining = False
        userOptions.enablePipelining = withPipelining
        userOptions.enableGradientAccumulation = True
        userOptions.accumulationFactor = accumulationFactor
        userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual

        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=dataFlow,
                                         optimizer=popart.SGD({
                                             "defaultLearningRate":
                                             (defaultLearningRate0, False),
                                             "defaultMomentum":
                                             (defaultMomentum0, False),
                                             "defaultDampening":
                                             (defaultDampening0, False)
                                         }),
                                         loss=finalLoss,
                                         userOptions=userOptions,
                                         deviceInfo=device)

        anchorArrays = session.initAnchorArrays()

        session.prepareDevice()
        session.weightsFromHost()

        stepio = popart.PyStepIO({input0: inputVals}, anchorArrays)
        session.run(stepio)
        session.weightsToHost()
        w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
        w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
        weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R})
        session.readWeights(weightsRead)
        return w0R, w1R
Example 8
    lb: trainingDataLables
}, trainingAnchors)

# Copy the weights to the device from the host
trainingSession.weightsFromHost()

# Run the training graph
trainingSession.run(trainingStepio)

# Copy the weights to the host from the device
trainingSession.weightsToHost()

# Prepare the map of weights to read the weights into
weights = {}
weights[w] = np.empty([2, 2], np.float16)
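# The buffer's shape and dtype must match the weight tensor in the model
# (here a FLOAT16 tensor of shape [2, 2]) for readWeights to fill it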
weightsIo = popart.PyWeightsIO(weights)

# Read the weights from the session
trainingSession.readWeights(weightsIo)

# Execute the inference graph
#------------------------------------------------------------------------------

# Generate some random input data
inferenceData = np.random.rand(1, 2).astype(np.float16)
inferenceDataLabels = np.random.rand(1).astype(np.int32)

# Create buffers to receive results from the execution
inferenceAnchors = inferenceSession.initAnchorArrays()
inferenceStepio = popart.PyStepIO({
    ip: inferenceData,
Example 9
def run_model(tmpdir, batches_per_step, accum_factor, replicas, tile_set,
              exchange_strategy):
    size = 64

    proto, inputs, weights, labels, dataFlow, loss, sum = get_model(
        size, batches_per_step, 4, 1, tile_set, exchange_strategy)

    opts = popart.SessionOptions()
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    opts.instrumentWithHardwareCycleCounter = False
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Both True and False should work; testing with False to avoid
    # host-cycle overhead
    opts.rearrangeAnchorsOnHost = False
    opts.rearrangeStreamsOnHost = False

    # Set session options to generate the report
    tu.set_autoreport_options(opts, tmpdir, output_execution_profile=True)

    if accum_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accum_factor

    if tile_set == popart.TileSet.IO:
        opts.numIOTiles = 128
    else:
        opts.numIOTiles = 0

    if replicas > 1:
        opts.enableReplicatedGraphs = True
        opts.replicatedGraphCount = replicas

    pat = popart.Patterns(popart.PatternsLevel.Default)

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        loss=loss,
        optimizer=popart.ConstSGD(1e-6),
        patterns=pat,
        # Trying to use less than all the tiles throws an error like
        #   popart_core.poplar_exception: Trying to access tile 72 on IPU
        #   0 but the virtual graph only covers the following tiles on
        #   that IPU: 0-63
        # The error happens in a call to poplar made by gcl::perIPUTiles.
        deviceInfo=tu.create_test_device(numIpus=replicas,
                                         tilesPerIPU=tu.USE_ALL_TILES))

    anchors = session.initAnchorArrays()

    session.prepareDevice()

    np.random.seed(224488)

    session.weightsFromHost()

    warmup_iterations = 1
    calc_iterations = 1

    for i in range(warmup_iterations + calc_iterations):
        datainputs = {
            input: (np.random.normal(
                0, 0.05, (replicas * batches_per_step * accum_factor, 1, size,
                          size)).astype(np.float32))
            for input in inputs
        }
        datainputs[labels] = np.random.randint(
            0, size, (replicas * batches_per_step * accum_factor, 1, size))
        stepio = popart.PyStepIO(datainputs, anchors)
        session.run(stepio)

    session.weightsToHost()
    weights_data = {
        w: np.zeros((1, size, size), dtype=np.float32)
        for w in weights
    }
    weights_read = popart.PyWeightsIO(weights_data)
    session.readWeights(weights_read)

    for w in weights_data:
        assert np.count_nonzero(np.isnan(weights_data[w])) == 0

    report = session.getReport()

    overlapPercentage = get_compute_io_overlap_percentage(
        report, warmup_iterations)

    return overlapPercentage, weights_data
Example 10
def test_manual_serialization():

    # Basic model:
    #
    #  X: data input of shape (N, C0)
    #  W: weight input of shape (C0, C1)
    #
    #  Y    = matmul(X, W)
    #  Z    = relu(Y)
    #  loss = l1Loss(Z)
    #
    # With array dimensions

    N = 12
    C0 = 244
    C1 = 286

    # In this test, we manually serialise the matmul, converting
    # matmul ((N,C0), (C0,C1))
    #
    # into a sequence of factor-f smaller matmuls
    # matmul ((N,C0/f), (C0/f,C1))
    #
    # repeated and accumulated f times, where f is:

    f = 4
    assert (C0 % f == 0)

    # Constructing the model

    builder = popart.Builder()
    # NOTE: T22702 For some seeds this test fails.
    np.random.seed(0)
    wVals = np.array(npr.randn(C0, C1), dtype=np.float32)
    W = builder.addInitializedInputTensor(wVals)
    xInfo = popart.TensorInfo("FLOAT", [N, C0])
    X = builder.addInputTensor(xInfo)
    axesV = np.array([0, 1]).astype(np.int32)
    axes = builder.addInitializedInputTensor(axesV)
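
    # A quick numerical check of the serialisation identity used below:
    # summing the f slice-matmuls reproduces the full matmul. A private
    # generator is used so the test's global numpy RNG stream is untouched,
    # and float64 keeps the comparison accurate.
    rngDemo = np.random.default_rng(0)
    xDemo = rngDemo.standard_normal((N, C0))
    wDemo = wVals.astype(np.float64)
    yDemo = sum(
        np.matmul(xDemo[:, i * C0 // f:(i + 1) * C0 // f],
                  wDemo[i * C0 // f:(i + 1) * C0 // f, :]) for i in range(f))
    assert np.allclose(yDemo, np.matmul(xDemo, wDemo))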

    for i in range(f):
        # the lower index of the i'th slice
        lwr = int(i * C0 / f)

        # the upper index of the i'th slice
        upp = int((i + 1) * C0 / f)

        # Take a slice of size (N,C0/f) out of X
        s0 = builder.addInitializedInputTensor(
            np.array([0, lwr]).astype(np.int32))
        e0 = builder.addInitializedInputTensor(
            np.array([N, upp]).astype(np.int32))
        X_slice = builder.aiOnnx.slice([X, s0, e0, axes])

        # Take a slice of size (C0/f,C1) out of W
        s1 = builder.addInitializedInputTensor(
            np.array([lwr, 0]).astype(np.int32))
        e1 = builder.addInitializedInputTensor(
            np.array([upp, C1]).astype(np.int32))
        W_slice = builder.aiOnnx.slice([W, s1, e1, axes])

        # Multiply the slices together, and accumulate as necessary
        mm_part = builder.aiOnnx.matmul([X_slice, W_slice])
        if i == 0:
            Y = mm_part

        else:
            Y = builder.aiOnnx.add([mm_part, Y])

    # Finally, the non-linearity
    Z = builder.aiOnnx.relu([Y])

    # This boiler-plate is currently necessary with opset-10 slice
    graph_transformer = popart.GraphTransformer(builder.getModelProto())
    graph_transformer.convertAllFixedPointInitializersToConstants()
    builder = popart.Builder(graph_transformer.getModelProto())

    l1 = builder.aiGraphcore.l1loss([Z], 0.2)
    dataFlow = popart.DataFlow(1, {})
    device = tu.create_test_device()
    userOptions = popart.SessionOptions()

    # To obtain the final dot graph, uncomment this:
    # userOptions.dotChecks = {"Final"}

    patterns = popart.Patterns()

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=dataFlow,
                                     optimizer=popart.SGD(
                                         {"defaultLearningRate": (0.1, True)}),
                                     loss=l1,
                                     patterns=patterns,
                                     userOptions=userOptions,
                                     deviceInfo=device)
    session.prepareDevice()
    session.weightsFromHost()

    inputVals = np.array(npr.randn(1 * N * C0), dtype=np.float32)
    stepio = popart.PyStepIO({X: inputVals}, {})
    session.run(stepio)
    session.weightsToHost()
    w0R = np.array(-777.0 * np.ones(C0 * C1), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({W: w0R})
    session.readWeights(weightsRead)

    # A pytorch version to confirm numerical correctness:
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(wVals.copy()))

        def forward(self, x):
            return torch.relu(torch.matmul(x, self.w0))

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.1)

    out = net(torch.from_numpy(inputVals.reshape([N, C0])))
    loss = 0.2 * torch.mean(torch.abs(out))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    baseline0 = np.sum(
        np.abs(net.w0.detach().numpy().flatten() - wVals.flatten()))
    baseline1 = np.sum(np.abs(w0R - wVals.flatten()))
    error = np.sum(np.abs(net.w0.detach().numpy().flatten() - w0R))
    assert (error / (baseline0 + baseline1) < 1e-6)
Example 11
def test_save_tensors_optimizer_state_externally(optimizerInfo):
    """
    1. Create a training session with momentum, saving initializers
       externally.
    2. Check the file size before session.modelToHost and see that it grows
       afterwards, due to the additional optimizer state tensors being saved.
    3. Read tensors from the file and compare with anchors to verify that
       the additional optimizer state was saved correctly.
    4. Create a new session from the saved onnx model, run both sessions and
       compare outputs to verify that the optimizer state tensors were
       loaded correctly.
    """
    optimizer = optimizerInfo[0]
    extraOptimizerStatePrefs = optimizerInfo[1]

    d1 = np.random.rand(3, 3).astype(np.float32)
    d2 = np.random.rand(3).astype(np.float32)
    builder = popart.Builder()
    i1 = builder.addInitializedInputTensor(d1)
    i2 = builder.addInitializedInputTensor(d2)
    o = builder.aiOnnx.matmul([i1, i2])
    loss = builder.aiGraphcore.identityloss([o])

    with TemporaryDirectory() as tmpdir:
        tmpfile = os.path.join(tmpdir, "model_tensors.onnx")
        builder.saveInitializersExternally([i1, i2], tmpfile)

        # Check file is of expected size: (3 * 3 * 4) + (3 * 4) = 48
        assert os.path.exists(tmpfile)
        assert os.path.getsize(tmpfile) == d1.size * 4 + d2.size * 4

        anchorIds = [o]
        anchorIds.append(popart.reservedGradientPrefix() + i1)
        anchorIds.append(popart.reservedGradientPrefix() + i2)

        session = popart.TrainingSession(
            deviceInfo=popart.DeviceManager().createCpuDevice(),
            fnModel=builder.getModelProto(),
            loss=loss,
            optimizer=optimizer,
            dataFlow=popart.DataFlow(1, anchorIds))

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()
        session.run(popart.PyStepIO({}, anchors))

        session.weightsToHost()
        weightsMap = {}
        weightsMap[i1] = np.ones(d1.size).astype(np.float32)
        weightsMap[i2] = np.ones(d2.size).astype(np.float32)
        for pref in extraOptimizerStatePrefs:
            if pref == popart.reservedStepPrefix():
                size1 = 1
                size2 = 1
            else:
                size1 = d1.size
                size2 = d2.size
            weightsMap[pref + i1] = np.ones(size1).astype(np.float32)
            weightsMap[pref + i2] = np.ones(size2).astype(np.float32)
        session.readWeights(popart.PyWeightsIO(weightsMap))

        tmpfile1 = os.path.join(tmpdir, "model.onnx")
        session.modelToHost(tmpfile1)

        # Extra state for each initializer
        expectedSize = (d1.size * 4) + (d2.size * 4)
        for pref in extraOptimizerStatePrefs:
            if pref == popart.reservedStepPrefix():
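                # the step tensor is a scalar, so the two weights contribute
                # just 2 * 4 bytes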
                expectedSize += (2 * 4)
            else:
                expectedSize += d1.size * 4
                expectedSize += d2.size * 4

        assert os.path.getsize(tmpfile) == expectedSize

        # Compare anchors with external data written to file
        saved_weights = np.fromfile(tmpfile, dtype=np.float32)
        assert np.allclose(saved_weights[0:d1.size], weightsMap[i1].flatten())
        totalSize = d1.size + d2.size
        assert np.allclose(saved_weights[d1.size:totalSize],
                           weightsMap[i2].flatten())

        for pref in extraOptimizerStatePrefs:
            assert np.allclose(saved_weights[totalSize:totalSize + d1.size],
                               weightsMap[pref + i1].flatten())
            totalSize += d1.size
            assert np.allclose(saved_weights[totalSize:totalSize + d2.size],
                               weightsMap[pref + i2].flatten())
            totalSize += d2.size

        # Create new session
        new_session = popart.TrainingSession(
            deviceInfo=popart.DeviceManager().createCpuDevice(),
            fnModel=tmpfile1,
            loss=loss,
            optimizer=optimizer,
            dataFlow=popart.DataFlow(1, anchorIds))
        new_anchors = new_session.initAnchorArrays()
        new_session.prepareDevice()
        new_session.weightsFromHost()

        new_session.run(popart.PyStepIO({}, new_anchors))
        session.run(popart.PyStepIO({}, anchors))

        # Compare output from both sessions to confirm that the optimizer state
        # tensors have been read back in correctly for the new session
        for anchorId in anchorIds:
            assert np.allclose(anchors[anchorId], new_anchors[anchorId])
Example 12
    def test(config, iteration, true_scaling, test_case):
        builder = popart.Builder()

        w0name = "weight_0"
        w1name = "weight_1"
        w2name = "weight_2"

        input0Shape = [1, 1, 1]
        input0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", input0Shape), "input0")

        w0data = np.array([test_case[0][0]], dtype=np.float32)
        w0R = np.empty([1], dtype=np.float32)
        w0Id = builder.addInitializedInputTensor(w0data, w0name)

        w1data = np.array([test_case[1][0]], dtype=np.float32)
        w1R = np.empty([1], dtype=np.float32)
        w1Id = builder.addInitializedInputTensor(w1data, w1name)

        w2data = np.array([test_case[2][0]], dtype=np.float32)
        w2R = np.empty([1], dtype=np.float32)
        w2Id = builder.addInitializedInputTensor(w2data, w2name)

        add0 = builder.aiOnnx.add([w0Id, input0])
        add1 = builder.aiOnnx.add([w1Id, add0])
        add2 = builder.aiOnnx.add([w2Id, add1])
        loss = builder.aiGraphcore.l1loss([add2],
                                          1.0,
                                          debugContext="l1LossVal")
        builder.addOutputTensor(add2)

        proto = builder.getModelProto()
        dataFlow = popart.DataFlow(1, {})
        opts = popart.SessionOptions()
        opts.reportOptions = {"showExecutionSteps": "true"}
        pat = popart.Patterns(popart.PatternsLevel.Default)
        dm = popart.DeviceManager()
        dm.setOnDemandAttachTimeout(int(1e4))
        device = dm.acquireAvailableDevice(
            1,
            connectionType=popart.DeviceConnectionType.OnDemand,
            selectionCriterion=popart.DeviceSelectionCriterion.Random)
        if device is None:
            raise OSError("Failed to acquire IPU.")

        # The stage->tensor map would come from the Bert model in reality
        # (see model.tensors)
        mock_tensor_map = {0: [w0Id], 1: [w1Id], 2: [w2Id]}

        factory = ScheduledOptimizerFactory(config,
                                            iteration,
                                            tensors=mock_tensor_map)
        assert_scaled_lr(factory, true_scaling)

        optimizer_step0 = factory.create()

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=loss,
                                         optimizer=optimizer_step0,
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        input_data = np.array([3.1415], dtype=np.float32)
        stepio = popart.PyStepIO({input0: input_data}, anchors)

        for step in range(iteration.total_steps):
            session.run(stepio)
            session.weightsToHost()
            weightsRead = popart.PyWeightsIO({w0Id: w0R, w1Id: w1R, w2Id: w2R})
            session.readWeights(weightsRead)

            assert (np.isclose(test_case[0][step + 1], w0R))
            assert (np.isclose(test_case[1][step + 1], w1R))
            assert (np.isclose(test_case[2][step + 1], w2R))

            iteration.count += 1

            if factory.should_update(iteration):
                optimizer_step1 = factory.update_and_create(iteration)
                assert_scaled_lr(factory, true_scaling)

                session.updateOptimizerFromHost(optimizer_step1)
Example 13
def compare_against_pytorch(optType, optMaps, batchesPerStep=5, scaled=False):
    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    optkwargs = {}

    if optType == "adam":
        popartOpt = popart.Adam
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "adamw":
        popartOpt = popart.Adam
        optkwargs["weight_decay_mode"] = popart.WeightDecayMode.Decay
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "adamax":
        popartOpt = popart.Adam
        optkwargs["mode"] = popart.AdamMode.AdaMax
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "lamb":
        popartOpt = popart.Adam
        optkwargs["mode"] = popart.AdamMode.Lamb
        optkwargs["weight_decay_mode"] = popart.WeightDecayMode.Decay
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "lambnobias":
        popartOpt = popart.Adam
        optkwargs["mode"] = popart.AdamMode.LambNoBias
        optkwargs["weight_decay_mode"] = popart.WeightDecayMode.Decay
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "adagrad":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.AdaGrad
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "rmsprop":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.RMSProp
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "centeredrmsprop":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.CenteredRMSProp
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "adadelta":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.AdaDelta
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "sgd0":
        popartOpt = popart.SGD
    elif optType == "sgd1":
        popartOpt = popart.SGD
        optkwargs[
            "accumulatorAndMomentum"] = popart.SGDAccumulatorAndMomentum.Combined
    elif optType == "sgd2":
        popartOpt = popart.SGD
        optkwargs[
            "accumulatorAndMomentum"] = popart.SGDAccumulatorAndMomentum.Separate
    else:
        raise "Unknown optType: " + optType

    # L1 loss value
    lambda1 = 1.0

    # tensor dimensions and replications
    height = 2
    numberOfSteps = len(optMaps)
    sampleShape = [height, height]
    replicationFactor = 1
    accumulationFactor = 1
    nVirtualGraphs = 1
    samplesPerBatch = 4
    divvyFactor = replicationFactor * accumulationFactor
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    nIPUs = replicationFactor * nVirtualGraphs
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    stepDataInfo = popart.TensorInfo("FLOAT", stepDataShape)
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = [
        np.array(npr.randn(*stepDataShape), dtype=np.float32)
        for i in range(numberOfSteps)
    ]

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)

    # Model:
    #
    # input  w0     w1
    #     \  |      |
    #      mul  -  add - L1 loss

    mm0 = builder.aiOnnx.mul([input0, w0])
    mm1 = builder.aiOnnx.add([mm0, w1])
    l1 = builder.aiGraphcore.l1loss([mm1], lambda1)
    dataFlow = popart.DataFlow(batchesPerStep, {})
    userOptions = popart.SessionOptions()
    userOptions.enableGradientAccumulation = False
    userOptions.enablePrefetchDatastreams = False

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        userOptions=userOptions,
        loss=l1,
        optimizer=popartOpt(optMaps[0], **optkwargs),
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.weightsFromHost()

    for step in range(numberOfSteps):
        stepio = popart.PyStepIO({input0: inputVals[step]}, anchorArrays)
        session.run(stepio)

        if step < numberOfSteps - 1:
            session.updateOptimizerFromHost(
                popartOpt(optMaps[step + 1], **optkwargs))

    session.weightsToHost()
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))

        def forward(self, x, i):
            mm0 = torch.mul(x, self.w0)
            mm1 = torch.add(mm0, self.w1)
            return mm1

    net = Net()

    for step in range(numberOfSteps):
        if step == 0:
            oldOptimizer = None
        else:
            oldOptimizer = optimizer

        if optType == "adam":
            optimizer = optim.Adam(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                betas=(optMaps[step]["defaultBeta1"][0],
                       optMaps[step]["defaultBeta2"][0]),
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0])
        elif optType == "adamw":
            optimizer = optim.AdamW(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                betas=(optMaps[step]["defaultBeta1"][0],
                       optMaps[step]["defaultBeta2"][0]),
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0])
        elif optType == "adamax":
            optimizer = optim.Adamax(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                betas=(optMaps[step]["defaultBeta1"][0],
                       optMaps[step]["defaultBeta2"][0]),
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0])
        elif optType == "lamb":
            optimizer = torch_lamb.Lamb(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                betas=(optMaps[step]["defaultBeta1"][0],
                       optMaps[step]["defaultBeta2"][0]),
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0])
        elif optType == "lambnobias":
            optimizer = torch_lamb.Lamb(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                betas=(optMaps[step]["defaultBeta1"][0],
                       optMaps[step]["defaultBeta2"][0]),
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0],
                biasCorrection=False)
        elif optType == "adagrad":
            optimizer = optim.Adagrad(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0],
                eps=optMaps[step]["defaultEps"][0])
        elif optType == "rmsprop":
            optimizer = optim.RMSprop(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                alpha=optMaps[step]["defaultAlpha"][0],
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0],
                momentum=optMaps[step]["defaultMomentum"][0]
                if "defaultMomentum" in optMaps[step] else 0.0)
        elif optType == "centeredrmsprop":
            optimizer = optim.RMSprop(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                alpha=optMaps[step]["defaultAlpha"][0],
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0],
                momentum=optMaps[step]["defaultMomentum"][0]
                if "defaultMomentum" in optMaps[step] else 0.0,
                centered=True)
        elif optType == "adadelta":
            optimizer = optim.Adadelta(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                rho=optMaps[step]["defaultAlpha"][0],
                eps=optMaps[step]["defaultEps"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0])
        else:  # Same for SGD1 and SGD2.
            optimizer = optim.SGD(
                net.parameters(),
                lr=optMaps[step]["defaultLearningRate"][0],
                momentum=optMaps[step]["defaultMomentum"][0],
                dampening=optMaps[step]["defaultDampening"][0],
                weight_decay=optMaps[step]["defaultWeightDecay"][0])

        if step == 0:
            # Initialize every state buffer that the various torch
            # optimizers might use to zero, so that their first update
            # matches popart's.
            for group in optimizer.param_groups:
                for p in group['params']:
                    state = optimizer.state[p]
                    for key in ('momentum_buffer', 'exp_avg', 'exp_avg_sq',
                                'exp_inf', 'square_avg', 'grad_avg',
                                'acc_delta', 'sum'):
                        state[key] = p.data * 0
                    state['step'] = 0
        else:
            # Carry the optimizer state over from the previous step's
            # optimizer instance.
            for group, oldGroup in zip(optimizer.param_groups,
                                       oldOptimizer.param_groups):
                for p, oldp in zip(group['params'], oldGroup['params']):
                    for key in ('momentum_buffer', 'exp_avg', 'exp_avg_sq',
                                'exp_inf', 'square_avg', 'grad_avg',
                                'acc_delta', 'sum', 'step'):
                        optimizer.state[p][key] = oldOptimizer.state[p][key]

        for i in range(batchesPerStep):
            out = net(torch.from_numpy(inputVals[step][i]), i)
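            # lambda1 * mean(|out|) mirrors the popart l1loss above, which
            # defaults to mean reduction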
            loss = lambda1 * torch.mean(torch.abs(out))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)

    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
Example 14
def test_rmsprop_tf_mode(use_tf_variant, const_lr, adaptive_mode, momentum,
                         weight_decay, weight_decay_mode, const_weight_decay):
    np.random.seed(0)
    input_dim = 3
    num_steps = 15
    batches_per_step = 10
    samples_per_batch = 8
    # Optimizer parameters.
    learning_rates = np.linspace(0.02, 0.00001, num_steps)
    weight_decays = np.linspace(weight_decay, weight_decay + 0.01, num_steps)
    alpha = 0.95
    eps = 0.001

    # Initial weights and inputs.
    w0_data = np.random.randn(input_dim, input_dim).astype(np.float32)
    w1_data = np.random.randn(input_dim, input_dim).astype(np.float32)
    input_data = [
        np.random.randn(
            batches_per_step,
            samples_per_batch,
            input_dim,
            input_dim,
        ).astype(np.float32) for _ in range(num_steps)
    ]

    # Build the model.
    #
    # input  w0     w1
    #     \  |      |
    #      mul  -  add - L1 loss
    builder = popart.Builder()
    input = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [samples_per_batch, input_dim, input_dim]))
    w0 = builder.addInitializedInputTensor(w0_data)
    w1 = builder.addInitializedInputTensor(w1_data)
    mm0 = builder.aiOnnx.mul([input, w0])
    mm1 = builder.aiOnnx.add([mm0, w1])
    l1 = builder.aiGraphcore.l1loss([mm1], 1.0)

    dataflow = popart.DataFlow(batches_per_step, {})
    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataflow,
        loss=l1,
        optimizer=get_rmsprop(
            learning_rates[0],
            const_lr,
            alpha,
            momentum,
            weight_decay,
            weight_decay_mode,
            const_weight_decay,
            eps,
            adaptive_mode,
            use_tf_variant,
        ),
        deviceInfo=tu.create_test_device(),
    )
    anchor_arrays = session.initAnchorArrays()
    session.prepareDevice()
    session.weightsFromHost()

    # Run popart training and retrieve the weights.
    for step in range(num_steps):
        stepio = popart.PyStepIO({input: input_data[step]}, anchor_arrays)
        session.run(stepio)
        if step < num_steps - 1:
            # Update optimizer from host in case lr or wd are non-const.
            need_to_update_optimizer = False
            lr = learning_rates[0]
            wd = weight_decays[0]
            if not const_lr:
                lr = learning_rates[step + 1]
                need_to_update_optimizer = True
            if not const_weight_decay:
                wd = weight_decays[step + 1]
                need_to_update_optimizer = True
            if need_to_update_optimizer:
                session.updateOptimizerFromHost(
                    get_rmsprop(
                        lr,
                        const_lr,
                        alpha,
                        momentum,
                        wd,
                        weight_decay_mode,
                        const_weight_decay,
                        eps,
                        adaptive_mode,
                        use_tf_variant,
                    ))

    session.weightsToHost()
    w0_popart = np.zeros((input_dim, input_dim), dtype=np.float32)
    w1_popart = np.zeros((input_dim, input_dim), dtype=np.float32)
    weights_read = popart.PyWeightsIO({w0: w0_popart, w1: w1_popart})
    session.readWeights(weights_read)

    # Run numpy training.
    centered = adaptive_mode == popart.AdaptiveMode.CenteredRMSProp
    if weight_decay_mode == popart.WeightDecayMode.L2Regularization:
        wd_mode = 'L2'
    else:
        wd_mode = 'decay'
    w0_np = w0_data.copy()
    w1_np = w1_data.copy()
    mg0 = np.zeros(w0_np.shape, dtype=w0_np.dtype)
    mg1 = np.zeros(w1_np.shape, dtype=w1_np.dtype)
    rms0 = np.ones(w0_np.shape, dtype=w0_np.dtype)
    rms1 = np.ones(w1_np.shape, dtype=w1_np.dtype)
    mom0 = np.zeros(w0_np.shape, dtype=w0_np.dtype)
    mom1 = np.zeros(w1_np.shape, dtype=w1_np.dtype)

    for step in range(num_steps):
        lr = learning_rates[0] if const_lr else learning_rates[step]
        wd = weight_decays[0] if const_weight_decay else weight_decays[step]
        for batch in range(batches_per_step):
            w0_grad = np.zeros(w0_np.shape, dtype=w0_np.dtype)
            w1_grad = np.zeros(w1_np.shape, dtype=w1_np.dtype)

            for sample in range(samples_per_batch):
                x = input_data[step][batch][sample]
                w0_grad_sample, w1_grad_sample = model_grad(w0_np, w1_np, x)
                w0_grad += (1.0 / samples_per_batch) * w0_grad_sample
                w1_grad += (1.0 / samples_per_batch) * w1_grad_sample

            w0_np, mg0, rms0, mom0 = rpnp.rmsprop_update_numpy(
                w0_np,
                w0_grad,
                mg0,
                rms0,
                mom0,
                lr,
                alpha,
                momentum,
                wd,
                wd_mode,
                eps,
                centered,
            )
            w1_np, mg1, rms1, mom1 = rpnp.rmsprop_update_numpy(
                w1_np,
                w1_grad,
                mg1,
                rms1,
                mom1,
                lr,
                alpha,
                momentum,
                wd,
                wd_mode,
                eps,
                centered,
            )

    # Compare the resulting parameters.
    if use_tf_variant:
        np.testing.assert_allclose(w0_popart, w0_np, rtol=1e-02, atol=1e-05)
        np.testing.assert_allclose(w1_popart, w1_np, rtol=1e-02, atol=1e-05)
    else:
        assert not np.allclose(w0_popart, w0_np, rtol=1e-02, atol=1e-05)
        assert not np.allclose(w1_popart, w1_np, rtol=1e-02, atol=1e-05)
Example 15
    def test(opt0, opt1, e0, e1, e2):
        builder = popart.Builder()

        input0Shape = [stepSize, batchSize, sampleDim]
        input0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", input0Shape), "input0")

        w0data = np.array([100.0], dtype=np.float32)
        w0R = np.array([-777.0], dtype=np.float32)
        w0Id = builder.addInitializedInputTensor(w0data, w0name)

        w1data = np.array([200.0], dtype=np.float32)
        w1R = np.array([-777.0], dtype=np.float32)
        w1Id = builder.addInitializedInputTensor(w1data, w1name)

        w2data = np.array([300.0], dtype=np.float32)
        w2R = np.array([-777.0], dtype=np.float32)
        w2Id = builder.addInitializedInputTensor(w2data, w2name)

        add0 = builder.aiOnnx.add([w0Id, input0])
        add1 = builder.aiOnnx.add([w1Id, add0])
        add2 = builder.aiOnnx.add([w2Id, add1])

        l1 = builder.aiGraphcore.l1loss([add2], 1.0)

        proto = builder.getModelProto()

        dataFlow = popart.DataFlow(1, {})

        opts = popart.SessionOptions()
        opts.reportOptions = {"showExecutionSteps": "true"}
        opts.enableGroupedMatmuls = False

        pat = popart.Patterns(popart.PatternsLevel.Default)

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=dataFlow,
            userOptions=opts,
            loss=l1,
            optimizer=opt0,
            patterns=pat,
            deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        input0Data = np.array([3.1415], dtype=np.float32)

        stepio = popart.PyStepIO({input0: input0Data}, anchors)

        session.run(stepio)

        session.updateOptimizerFromHost(opt1)

        session.run(stepio)

        session.weightsToHost()

        weightsRead = popart.PyWeightsIO({w0Id: w0R, w1Id: w1R, w2Id: w2R})

        session.readWeights(weightsRead)

        assert (np.isclose(e0['initalValue'], w0R))
        assert (np.isclose(e1['initalValue'], w1R))
        assert (np.isclose(e2['initalValue'], w2R))
Example 16
def runTest(forceAddOutOfPlace, pipelineRecomputation):
    """
    Test of pipelining with dropout, recomputation, graph replication, 
    gradient accumulation
    """
    # Has dependencies on T12562, T12976, T13098 for full support

    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # L1 loss value
    lambda1 = 1.0

    # optimizer parameters
    defaultLearningRate0 = 0.001
    defaultMomentum0 = 0.01
    defaultDampening0 = 0.5
    lossScaling0 = 10.0
    defaultVelocityScaling0 = 0.15
    defaultWeightDecay0 = 0.01

    # tensor dimensions and replications
    height = 6
    batchesPerStep = 5
    sampleShape = [height, height]
    accumulationFactor = 4
    samplesPerBatch = 48
    divvyFactor = replicationFactor * accumulationFactor
    if (samplesPerBatch % divvyFactor != 0):
        raise RuntimeError("Invalid divvy factor")
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    stepDataInfo = popart.TensorInfo("FLOAT", stepDataShape)
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    w2vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = np.array(npr.randn(*stepDataShape), dtype=np.float32)

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)
    w2 = builder.addInitializedInputTensor(w2vals)

    scaleFactor = 1. / np.sqrt(height + 0.)

    # Model:
    #
    # input  w0                            w1
    #     \  |                            /
    #     matmul - scale -> dropout -> matmul
    #         \                        |
    #         |                       scale
    #         |                        |
    #         |                      dropout
    #         |                        /\
    #       add  -------<---<----<----   \
    #        |                            |
    #     dropout                     scale by 2
    #        |                            |
    #  = = = | = = = = = IPU barrier = = =|= = = = = =
    #        |                            |
    #        |   w2                       |
    #        |  /                         |
    #       matmul                       /
    #        |                          /
    #      scale                       /
    #        |                        /
    #      dropout                   /
    #        |                       |
    #        ------->---->---->---> add -> L1 loss (lambda 2)

    with builder.virtualGraph(0):
        mm0 = builder.aiOnnx.matmul([input0, w0])
        scale0 = builder.aiGraphcore.scale([mm0], scaleFactor)
        ratio0 = 0.35
        [dropout0, mask0] = builder.aiOnnx.dropout([scale0],
                                                   num_outputs=2,
                                                   ratio=ratio0)
        mm1 = builder.aiOnnx.matmul([dropout0, w1])
        scale1 = builder.aiGraphcore.scale([mm1], scaleFactor)
        ratio1 = 0.5
        [dropout1, mask1] = builder.aiOnnx.dropout([scale1],
                                                   num_outputs=2,
                                                   ratio=ratio1)
        dropout1 = builder.aiGraphcore.scale([dropout1], 2.0)
        skipOut = builder.aiOnnx.add([mm0, dropout1])
        # See resolved task T13137
        if forceAddOutOfPlace:
            builder.setInplacePreferences(skipOut, {"AddRhsInplace": -1.0})

        ratioSkip = 0.6
        [dropoutSkip, maskSkip] = builder.aiOnnx.dropout([skipOut],
                                                         num_outputs=2,
                                                         ratio=ratioSkip)

        # see T13142: we do this so that the recomputation does not modify the anchors
        mask0 = builder.aiOnnx.identity([mask0])
        mask1 = builder.aiOnnx.identity([mask1])
        maskSkip = builder.aiOnnx.identity([maskSkip])

    with builder.virtualGraph(1):
        mm2 = builder.aiOnnx.matmul([dropoutSkip, w2])
        scale2 = builder.aiGraphcore.scale([mm2], scaleFactor)
        ratio2 = 0.7
        [dropout2, mask2] = builder.aiOnnx.dropout([scale2],
                                                   num_outputs=2,
                                                   ratio=ratio2)

        out = builder.aiOnnx.add([dropout2, dropout1])
        l1 = builder.aiGraphcore.l1loss([out],
                                        lambda1,
                                        reduction=popart.ReductionType.Sum)

        # see T13142: we do this so that the recomputation does not modify the anchors
        mask2 = builder.aiOnnx.identity([mask2])

    anchors = {
        mask0: popart.AnchorReturnType("All"),
        mask1: popart.AnchorReturnType("All"),
        mask2: popart.AnchorReturnType("All"),
        maskSkip: popart.AnchorReturnType("All"),
    }

    dataFlow = popart.DataFlow(batchesPerStep, anchors)

    device = tu.create_test_device(numIpus=nIPUs)
    assert device

    userOptions = popart.SessionOptions()
    # This requires T12562 to be solved before enabling (TODO)
    userOptions.enableOutlining = False
    userOptions.enablePipelining = True
    userOptions.enableGradientAccumulation = True
    userOptions.accumulationFactor = accumulationFactor

    if pipelineRecomputation:
        userOptions.autoRecomputation = popart.RecomputationType.Pipeline

    if (replicationFactor > 1):
        userOptions.enableReplicatedGraphs = True
        userOptions.replicatedGraphCount = replicationFactor
    userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual

    # TODO https://phabricator.sourcevertex.net/T14035
    userOptions.enablePrefetchDatastreams = False
    #  passes:
    userOptions.engineOptions = {"exchange.streamBufferOverlap": "any"}
    #  fails:
    #  userOptions.engineOptions = {"exchange.streamBufferOverlap" : "hostRearrangeOnly"}

    patterns = popart.Patterns()
    patterns.InPlace = True

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        optimizer=popart.SGD({
            "defaultLearningRate": (defaultLearningRate0, False),
            "defaultMomentum": (defaultMomentum0, False),
            "defaultDampening": (defaultDampening0, False),
            "defaultVelocityScaling": (defaultVelocityScaling0, False),
            "lossScaling": (lossScaling0, True),
            "defaultWeightDecay": (defaultWeightDecay0, True)
        }),
        loss=l1,
        patterns=patterns,
        userOptions=userOptions,
        deviceInfo=device)

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.setRandomSeed(7)
    session.weightsFromHost()

    stepio = popart.PyStepIO({input0: inputVals}, anchorArrays)
    session.run(stepio)
    session.weightsToHost()
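    # read the trained weights back into host buffers; the sentinel value
    # (-777.0) makes it obvious if readWeights fails to overwrite an entry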
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w2R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R, w2: w2R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # merge the replication and accumulation dimensions into the
            # sample dimension
            flattenedShape = [anchorArrays[mask0].shape[0], -1, height, height]
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.mask0 = torch.from_numpy(
                anchorArrays[mask0].reshape(flattenedShape))

            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))
            self.mask1 = torch.from_numpy(
                anchorArrays[mask1].reshape(flattenedShape))

            self.maskSkip = torch.from_numpy(
                anchorArrays[maskSkip].reshape(flattenedShape))

            self.w2 = torch.nn.Parameter(torch.from_numpy(w2vals.copy()))
            self.mask2 = torch.from_numpy(
                anchorArrays[mask2].reshape(flattenedShape))

        def forward(self, x, i):
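            # mirror the popart graph on the host: inverted dropout is
            # reproduced by applying the anchored mask and dividing by
            # (1 - ratio)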
            mm0 = torch.matmul(x, self.w0)
            dr0 = mm0 * scaleFactor * self.mask0[i].type(
                torch.FloatTensor) / (1 - ratio0)

            mm1 = torch.matmul(dr0, self.w1)
            dr1 = mm1 * scaleFactor * self.mask1[i].type(
                torch.FloatTensor) / (1 - ratio1)
            dr1 = 2 * dr1

            drSkip = (dr1 + mm0) * self.maskSkip[i].type(
                torch.FloatTensor) / (1 - ratioSkip)

            mm2 = torch.matmul(drSkip, self.w2)
            dr2 = mm2 * scaleFactor * self.mask2[i].type(
                torch.FloatTensor) / (1 - ratio2)

            out = dr1 + dr2
            return out

    net = Net()

    optimizer = optim.SGD(net.parameters(),
                          lr=defaultLearningRate0,
                          momentum=defaultMomentum0,
                          dampening=defaultDampening0,
                          weight_decay=defaultWeightDecay0)

    # caveat : alternative work-around for TODO T13098
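    # pytorch's SGD skips damping on its very first step (it seeds the
    # momentum buffer with the raw gradient); pre-seeding a zero buffer makes
    # step one apply the damped update, as popart does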
    for group in optimizer.param_groups:
        for p in group['params']:
            param_state = optimizer.state[p]
            param_state['momentum_buffer'] = p.data * 0

    for i in range(batchesPerStep):
        out = net(torch.from_numpy(inputVals[i]), i)
        loss = lambda1 * torch.sum(torch.abs(out))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    delta2 = np.sum(np.abs(net.w2.detach().numpy() - w2vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)
    print("Total moved by w2: ", delta2)

    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    error2 = np.sum(np.abs(w2R - net.w2.detach().numpy())) / delta2
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("Total moved by w2: ", np.sum(np.abs(w2R - w2vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    print("l1 error for w2: ", error2)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
    assert (error2 < 1e-5)
Example no. 17
def test_against_pytorch():
    """
    Comparison of popart and PyTorch optimizers, and the changes needed to PyTorch 
    to match popart. Note that these differences should have no effect on overall 
    training measures, and this discussion is just for those interested in exact 
    reproducility between popart and pytorch 

    The main differences are:
    1)
    pytorch optimizer, in the very first iteration with a new optimizer: 
    initializes velocity tensor with zeros and does not do damping,
    popart optimizer, in the first iteration with a new optimizer:
    retains velocity tensor from previous state with previous optimizer, 
    or sets it to zero if the first round of training

    2)
    popart and pytorch updates the optimizer at a different "phase" 
    a)    v <- v * mm  + (1 - dp) * wd * w
    b)    v <- v + (1 - dp) * g
    c)    w <- w - lr * v

    pytorch goes (abc)(abc)(abc)(abc) 
    popart goes  (abca)(bca)(bca)(bca)

    where changes to the optimizer can be done between periods. For this reason, 
    updates to mm, dp, and wd have different effects. 

    See also sgd_mixed_mode_test_cpp_1_3.cpp
    """

    # optimizer parameters
    defaultLearningRate0 = 0.005
    defaultLearningRate1 = 0.003
    defaultLearningRate2 = 0.001

    defaultMomentum0 = 0.1
    defaultDampening0 = 0.3
    lossScaling0 = 10.0
    defaultVelocityScaling0 = 0.5
    defaultWeightDecay0 = 0.01

    optMap0 = {
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultMomentum": (defaultMomentum0, False),
        "defaultDampening": (defaultDampening0, False),
        "defaultVelocityScaling": (defaultVelocityScaling0, False),
        "lossScaling": (lossScaling0, False),
        "defaultWeightDecay": (defaultWeightDecay0, False)
    }

    optMap1 = {
        "defaultLearningRate": (defaultLearningRate1, False),
        "defaultMomentum": (defaultMomentum0, False),
        "defaultDampening": (defaultDampening0, False),
        "defaultVelocityScaling": (defaultVelocityScaling0, False),
        "lossScaling": (lossScaling0, False),
        "defaultWeightDecay": (defaultWeightDecay0, False)
    }

    optMap2 = {
        "defaultLearningRate": (defaultLearningRate2, False),
        "defaultMomentum": (defaultMomentum0, False),
        "defaultDampening": (defaultDampening0, False),
        "defaultVelocityScaling": (defaultVelocityScaling0, False),
        "lossScaling": (lossScaling0, False),
        "defaultWeightDecay": (defaultWeightDecay0, False)
    }
    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # L1 loss value
    lambda1 = 1.0

    # tensor dimensions and replications
    height = 2
    numberOfSteps = 3
    batchesPerStep = 5
    sampleShape = [height, height]
    replicationFactor = 1
    accumulationFactor = 1
    nVirtualGraphs = 1
    samplesPerBatch = 4
    divvyFactor = replicationFactor * accumulationFactor
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    nIPUs = replicationFactor * nVirtualGraphs
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    stepDataInfo = popart.TensorInfo("FLOAT", stepDataShape)
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = [
        np.array(npr.randn(*stepDataShape), dtype=np.float32)
        for i in range(numberOfSteps)
    ]

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)

    # Model:
    #
    # input  w0     w1
    #     \  |      |
    #      mul  -  add - L1 loss

    mm0 = builder.aiOnnx.mul([input0, w0])
    mm1 = builder.aiOnnx.add([mm0, w1])
    l1 = builder.aiGraphcore.l1loss([mm1], lambda1)
    dataFlow = popart.DataFlow(batchesPerStep, {})
    device = tu.create_test_device(numIpus=nIPUs,
                                   opts={"compileIPUCode": False})
    userOptions = popart.SessionOptions()
    userOptions.enableGradientAccumulation = False
    userOptions.enablePrefetchDatastreams = False

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        userOptions=userOptions,
        loss=l1,
        optimizer=popart.SGD(optMap0),
        deviceInfo=device)

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.weightsFromHost()

    stepio = popart.PyStepIO({input0: inputVals[0]}, anchorArrays)
    session.run(stepio)

    session.updateOptimizerFromHost(popart.SGD(optMap1))

    stepio = popart.PyStepIO({input0: inputVals[1]}, anchorArrays)
    session.run(stepio)

    session.updateOptimizerFromHost(popart.SGD(optMap2))

    stepio = popart.PyStepIO({input0: inputVals[2]}, anchorArrays)
    session.run(stepio)

    session.weightsToHost()
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))

        def forward(self, x, i):
            mm0 = torch.mul(x, self.w0)
            mm1 = torch.add(mm0, self.w1)
            return mm1

    net = Net()

    optMaps = [optMap0, optMap1, optMap2]

    for step in range(3):
        if step == 0:
            oldOptimizer = None
        else:
            oldOptimizer = optimizer

        optimizer = optim.SGD(
            net.parameters(),
            lr=optMaps[step]["defaultLearningRate"][0],
            momentum=optMaps[step]["defaultMomentum"][0],
            dampening=optMaps[step]["defaultDampening"][0],
            weight_decay=optMaps[step]["defaultWeightDecay"][0])

        # see point 1) in the docstring: pre-seed a zero momentum buffer so
        # that damping is applied from the first step, matching popart
        if step == 0:
            for group in optimizer.param_groups:
                for p in group['params']:
                    optimizer.state[p]['momentum_buffer'] = p.data * 0

        else:
            # carry the momentum buffer over from the previous optimizer,
            # as popart retains velocity across optimizer updates
            for group, oldGroup in zip(optimizer.param_groups,
                                       oldOptimizer.param_groups):
                for p, oldp in zip(group['params'], oldGroup['params']):
                    optimizer.state[p]['momentum_buffer'] = \
                        oldOptimizer.state[p]['momentum_buffer']

        for i in range(batchesPerStep):
            out = net(torch.from_numpy(inputVals[step][i]), i)
            loss = lambda1 * torch.mean(torch.abs(out))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)

    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
Example no. 18
        def run(self,
                init_builder,
                reference,
                step_type='infer',
                opsets=None,
                optimizer=popart.ConstSGD(0.01),
                seed=None):
            assert step_type in ('infer', 'train')

            bld = Builder(opsets=opsets)

            anchors = {}

            # Allows additional arguments to be passed to init_builder, if
            # required by the specific init_builder function implementation.
            kwargs = {}
            kwargs = tu.filter_dict(kwargs, init_builder)
            anchorIds = init_builder(bld, **kwargs)

            for anchorId in anchorIds:
                if anchorId not in bld._init_input_map:
                    anchors[anchorId] = popart.AnchorReturnType("All")

            dataFlow = popart.DataFlow(1, anchors)

            self.options.logDir = self.logging_dir

            device = tu.create_test_device(numIpus=self.numIPUs)
            print(f"Created device {device} with {self.numIPUs} IPUs")

            self.patterns.InPlace = self.inplacing
            if step_type == 'infer':
                session = popart.InferenceSession(fnModel=bld.getModelProto(),
                                                  dataFlow=dataFlow,
                                                  deviceInfo=device,
                                                  patterns=self.patterns,
                                                  userOptions=self.options)
            else:
                assert step_type == 'train'
                # Apply reduction to output (assumed to be the
                # first anchorId) to ensure it is scalar
                lossId = anchorIds[0]
                lossId = bld.aiGraphcore.identityloss(
                    [lossId], reduction=self.lossReduction)

                session = popart.TrainingSession(fnModel=bld.getModelProto(),
                                                 dataFlow=dataFlow,
                                                 loss=lossId,
                                                 optimizer=optimizer,
                                                 deviceInfo=device,
                                                 patterns=self.patterns,
                                                 userOptions=self.options)

            anchor_map = session.initAnchorArrays()

            session.prepareDevice()

            if seed is not None:
                session.setRandomSeed(seed)

            for k, v in bld._input_map.items():
                if not v.flags['C_CONTIGUOUS']:
                    # the input must be made contiguous first, e.g.
                    # `x = np.ascontiguousarray(x)`
                    raise Exception(
                        'Input "{}" to popart.PyStepIO is not C_CONTIGUOUS'.
                        format(k))

            # Add the replication dimension to the inputs
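            # e.g. with replicatedGraphCount=2 and v of shape (4, 4), um is
            # (2, 1, 1) and the tiled input has shape (2, 4, 4)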
            inputs = {}
            for k, v in bld._input_map.items():
                if self.options.replicatedGraphCount > 1:
                    um = (self.options.replicatedGraphCount, )
                    um = um + tuple([1] * np.ndim(v))

                    # we add this offset to ensure that samples on devices are distinct
                    offset = 1 * np.arange(
                        self.options.replicatedGraphCount).astype(
                            v.dtype).reshape(um)

                    inputs[k] = np.tile(v, um) + offset

                else:
                    inputs[k] = v

            stepio = popart.PyStepIO(inputs, anchor_map)

            if (step_type == 'train'):
                session.weightsFromHost()

            session.run(stepio)

            if (step_type == 'train'):
                session.weightsToHost()

            ref_out = reference(RefData(bld._outputs, anchor_map))

            def fix_type(t):
                if isinstance(t, torch.Tensor):
                    return t.data.numpy()
                elif isinstance(t, np.ndarray):
                    return t
                elif isinstance(t, np.float32):
                    return t
                elif isinstance(t, np.float16):
                    return t
                elif t is None:
                    return None
                else:
                    raise Exception('unexpected type', type(t))

            ref_out = [fix_type(i) for i in ref_out]
            for index, key in enumerate(anchorIds):
                if key in anchors:
                    if ref_out[index] is not None:
                        print('Testing anchor "{}"...'.format(key))
                        self.verifyTensor(anchor_map[key], ref_out[index])
                    else:
                        print('Not Testing anchor "{}" as it is None'.format(
                            key))
                elif key in bld._init_input_map:
                    if ref_out[index] is not None:
                        print('Testing weight "{}"...'.format(key))
                        weightInfo = session.getInfo(key)
                        print('Weight info shape:{} type:{}'.format(
                            weightInfo.shape(), weightInfo.data_type_lcase()))
                        weights = {}
                        weights[key] = np.empty(
                            shape=weightInfo.shape(),
                            dtype=weightInfo.data_type_lcase())
                        weightsIo = popart.PyWeightsIO(weights)
                        session.readWeights(weightsIo)

                        self.verifyTensor(weights[key], ref_out[index])

                    else:
                        print('Not Testing weight "{}" as it is None'.format(
                            key))

            return session
Example no. 19
def _run_popart_test_model(data,
                           weights,
                           clipInfo,
                           pipelineGroups=None,
                           accumulationFactor=None,
                           optimizerType=None,
                           enablePipelining=False):
    # make sure the weights are not accidentally modified in this function
    weights = [np.copy(i) for i in weights]
    bld = popart.Builder()
    d0 = bld.addInputTensor(popart.TensorInfo(data))
    # consistently name the weights so we can refer to them later
    weightIds = [
        bld.addInitializedInputTensor(w, f'weight{i}')
        for i, w in enumerate(weights)
    ]

    # Get a pipeline stage for each weight
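    # e.g. a hypothetical pipelineGroups=[(0, 1), (2,)] places weights 0 and
    # 1 on pipeline stage 0 and weight 2 on stage 1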
    if pipelineGroups:
        pipelineStages = {}
        maxPipelineStage = len(pipelineGroups) - 1
        for pipelineStage, indices in enumerate(pipelineGroups):
            for index in indices:
                pipelineStages[index] = pipelineStage

    x = d0
    for i, weightId in enumerate(weightIds):
        x = bld.aiOnnxOpset9.conv([x, weightId],
                                  dilations=[1, 1],
                                  pads=[1, 1, 1, 1],
                                  strides=[1, 1])
        if pipelineGroups:
            bld.pipelineStage(x, pipelineStages[i])
            bld.virtualGraph(x, pipelineStages[i])

    out = bld.aiGraphcore.l1loss([x], 1.0)
    if pipelineGroups:
        bld.pipelineStage(out, maxPipelineStage)
        bld.virtualGraph(out, maxPipelineStage)

    bld.addOutputTensor(out)

    proto = bld.getModelProto()

    dataFlow = popart.DataFlow(1, {
        x: popart.AnchorReturnType("All"),
        out: popart.AnchorReturnType("All")
    })

    if pipelineGroups:
        device = popart.DeviceManager().createIpuModelDevice(
            {"numIPUs": maxPipelineStage + 1})
    else:
        device = popart.DeviceManager().createIpuModelDevice({"numIPUs": 1})

    clipNormSettings = []
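    # clipInfo is a list of (weightIndices, maxNorm) pairs; each entry builds
    # one ClipNormSettings over the named weights, e.g. [([0, 1], 4.0)]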
    for weightIndices, maxNorm in clipInfo:
        clipNormSettings.append(
            popart.ClipNormSettings.clipWeights(
                [weightIds[i] for i in weightIndices], maxNorm))
    opts = popart.SessionOptions()
    opts.enableOutlining = False
    if pipelineGroups:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        opts.accumulationFactor = accumulationFactor
        opts.enableGradientAccumulation = True
        opts.accumulateOuterFragmentSettings.schedule = popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized
        opts.enablePipelining = enablePipelining

    sess = popart.TrainingSession(proto,
                                  dataFlow=dataFlow,
                                  loss=out,
                                  optimizer=_get_popart_optimizer(
                                      optimizerType, clipNormSettings),
                                  deviceInfo=device,
                                  userOptions=opts)

    sess.prepareDevice()

    sess.weightsFromHost()

    anchors = sess.initAnchorArrays()
    if pipelineGroups:
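        # with gradient accumulation the session consumes accumulationFactor
        # micro batches per step, so stack copies of the batch accordingly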
        data = np.array([data] * accumulationFactor)
    stepio = popart.PyStepIO({d0: data}, anchors)
    sess.run(stepio)

    result = anchors[x]

    sess.weightsToHost()

    resultWeights = {
        weightIds[i]: np.empty(weights[i].shape, dtype=weights[i].dtype)
        for i in range(len(weights))
    }

    weightsio = popart.PyWeightsIO(resultWeights)
    sess.readWeights(weightsio)
    return result, resultWeights