Example #1
def test_embedding_bwd(custom_ops):
    #  ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        micro_batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        update_embedding_dict=True)

    popart_model = Bert(config)
    # Prevent virtualGraph attributes being added to the ops

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = popart_model.builder.addInputTensor(sequence_info)
    positions = popart_model.builder.addInputTensor(sequence_info)
    segments = popart_model.builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(
            0, config.vocab_length,
            (config.micro_batch_size * config.sequence_length)).astype(
                np.uint32),
        positions:
        np.random.randint(
            0, config.max_positional_length,
            (config.micro_batch_size * config.sequence_length)).astype(
                np.uint32),
        segments:
        np.random.randint(
            0, 2, (config.micro_batch_size * config.sequence_length)).astype(
                np.uint32)
    }

    optimizer = popart.ConstSGD(0.01)

    l1_lambda = 0.1
    with popart_model.builder.nameScope("Embedding"):
        output = popart_model.embedding(indices, positions, segments)
    l1 = popart_model.builder.aiGraphcore.l1loss(
        [output],
        l1_lambda,
        debugContext="l1LossVal",
        reduction=popart.ReductionType.Sum)

    num_reps = 5
    proto = popart_model.builder.getModelProto()
    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 ipus=1,
                                 loss=l1,
                                 num_reps=num_reps,
                                 optimizer=optimizer)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.micro_batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    #  ------------------- PyTorch -------------------------

    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        update_embedding_dict=config.update_embedding_dict))
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {})

    optim = torch.optim.SGD(torch_model.parameters(), 0.01)
    for _ in range(num_reps):
        torch_output = torch_model(
            *[torch.from_numpy(t).long() for t in inputs])
        torch_loss = l1_lambda * torch.norm(torch_output, 1)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    torch_outputs = [torch_output.detach().numpy()]

    check_tensors(torch_outputs, outputs, margin=7e-6)

    check_model(torch_model, post_proto, TORCH_TO_ONNX, {}, margin=7e-06)
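
The helpers used above (BertConfig, Bert, run_py, check_tensors, check_model, TORCH_TO_ONNX) come from the surrounding test utilities and are not shown. The loss equivalence the test relies on can be checked in isolation: aiGraphcore.l1loss with ReductionType.Sum computes l1_lambda * sum(|x|), which is exactly what the PyTorch line l1_lambda * torch.norm(torch_output, 1) reproduces. A minimal standalone sketch:

import numpy as np
import torch

x = np.random.randn(4, 3).astype(np.float32)
l1_lambda = 0.1
# Sum-reduced L1 loss, as computed by aiGraphcore.l1loss(..., ReductionType.Sum)
popart_style = l1_lambda * np.abs(x).sum()
torch_style = float(l1_lambda * torch.norm(torch.from_numpy(x), 1))
assert np.isclose(popart_style, torch_style)
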
    def run_test(index, options):
        per_replica_batch_size = batch_size / options["replication"]
        model_input_shape = input_shape[:]
        model_input_shape[0] = int(model_input_shape[0] /
                                   options["replication"])
        model_mask_shape = mask_shape[:]
        model_mask_shape[0] = int(model_mask_shape[0] / options["replication"])

        stride = 2 // options["stages"]
        if "stride" in options and options["stride"]:
            stride = options["stride"]

        builder = popart.Builder(opsets={
            "ai.onnx": 9,
            "ai.onnx.ml": 1,
            "ai.graphcore": 1
        })

        mask = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_mask_shape), "mask")
        x_in = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_input_shape), "x_in")

        anchors = {}
        x = x_in
        for i in range(options["numLayers"]):
            qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{i}")
            anchors[popart.reservedGradientPrefix() +
                    qkv] = popart.AnchorReturnType("All")

            vgid = (i % options["stages"]) if options["phasedExecution"] else i

            with builder.virtualGraph(vgid), builder.executionPhase(i *
                                                                    stride):
                x = builder.aiOnnx.matmul([x, qkv])
                x = attention_onnx(builder, x, mask, per_replica_batch_size,
                                   sequence_length, hidden_size,
                                   attention_heads, qkv_length)

        vgid = ((options["numLayers"] - 1) % options["stages"]
                ) if options["phasedExecution"] else options["numLayers"] - 1

        with builder.virtualGraph(vgid), builder.executionPhase(
            (options["numLayers"] - 1) * stride):
            l1 = builder.aiGraphcore.l1loss([x], 0.2, popart.ReductionType.Sum)

        proto = builder.getModelProto()

        gradient_keys = list(anchors.keys())
        anchors[x] = popart.AnchorReturnType("All")

        dataFlow = popart.DataFlow(batches_per_step, anchors)

        opts = popart.SessionOptions()
        opts.executionPhaseSettings.stages = options["stages"]

        opts.executionPhaseSettings.phases = (
            options["numLayers"] * stride if options["phasedExecution"] else 0)
        opts.enableOutlining = options["outlining"]

        if "phaseSchedule" in options:
            opts.executionPhaseSettings.schedule = options["phaseSchedule"]

        # Phased execution currently does its own recompute annotations
        opts.autoRecomputation = (popart.RecomputationType.Standard
                                  if options["explicitRecomputation"] else
                                  popart.RecomputationType.NoRecompute)

        opts.outlineThreshold = -np.inf
        opts.enableOutliningCopyCostPruning = False
        opts.virtualGraphMode = (popart.VirtualGraphMode.ExecutionPhases
                                 if options["phasedExecution"] else
                                 popart.VirtualGraphMode.Manual)
        opts.explicitRecomputation = options["explicitRecomputation"]
        opts.aliasZeroCopy = options["aliasZeroCopy"]

        opts.batchSerializationSettings.factor = options["batchSerialize"]
        if "batchSchedule" in options:
            opts.batchSerializationSettings.batchSchedule = options[
                "batchSchedule"]
        if "batchConcat" in options:
            # Do not concatenate the batch across phases and virtual graphs
            # (causes more, smaller transfers but allows for individual sub-batch
            # elements to be transferred)
            opts.batchSerializationSettings.concatOnVirtualGraphChange = options[
                "batchConcat"]
            opts.batchSerializationSettings.concatOnExecutionPhaseChange = options[
                "batchConcat"]
            # Wait with loading activations until they are required
            opts.executionPhaseSettings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand

        if "tensorLocationSettings" in options and options[
                "tensorLocationSettings"]:
            opts.activationTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.weightTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.optimizerStateTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.accumulatorTensorLocationSettings = options[
                "tensorLocationSettings"]
        if "weightTensorLocationSettings" in options and options[
                "weightTensorLocationSettings"]:
            opts.weightTensorLocationSettings = options[
                "weightTensorLocationSettings"]
        if options["replication"] > 1:
            opts.replicatedGraphCount = options["replication"]
            opts.enableReplicatedGraphs = True
        if "ioTiles" in options:
            opts.numIOTiles = options["ioTiles"]

        pat = popart.Patterns(popart.PatternsLevel.Default)
        if options["phasedExecution"]:
            numIpus = options["stages"]
        else:
            numIpus = options["numLayers"] + 1
        if options["replication"] > 1:
            numIpus = numIpus * options["replication"]
        device = tu.create_test_device(numIpus,
                                       pattern=popart.SyncPattern.Full)

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.ConstSGD(0.1),
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()
        for k, v in anchors.items():
            print(f"anchor_before {k}={v.shape}")

        inputs = {x_in: input_data, mask: mask_data}
        stepio = popart.PyStepIO(inputs, anchors)

        for __ in range(10):
            session.run(stepio)

        session.modelToHost(
            str(tmpdir / f"streamingmemory_attention_{index}.onnx"))

        if options["replication"] > 1:
            for k, v in anchors.items():
                if k in gradient_keys:
                    # The gradient anchors will have an additional replication axis.
                    anchors[k] = np.sum(v, 1 if batches_per_step > 1 else 0)
                else:
                    # Output tensor needs reshaping.
                    anchors[k] = np.reshape(anchors[k], [
                        batches_per_step, sequence_length * batch_size,
                        hidden_size
                    ])
            for k, v in anchors.items():
                print(f"anchor_after {k}={v.shape}")

        return anchors
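
run_test reads several required keys from its options dict (plus optional ones such as "stride", "phaseSchedule", "batchSchedule" and "ioTiles"), and also relies on module-level values (batch_size, input_shape, mask_shape, qkv_data, sequence_length, batches_per_step, input_data, mask_data, tmpdir) that are not part of this snippet. A plausible minimal configuration (an assumption; the real test parametrises these) would be:

options = {
    "replication": 1,            # no replicated graphs
    "stages": 2,                 # executionPhaseSettings.stages
    "numLayers": 2,              # attention layers to build
    "phasedExecution": True,     # VirtualGraphMode.ExecutionPhases
    "outlining": False,
    "explicitRecomputation": False,
    "aliasZeroCopy": False,
    "batchSerialize": 1,
}
# anchors = run_test(0, options)
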
Example #3
def test_pipelined_recomputed_dropout():
    dsize = 10
    ratio = 0.5
    ipus = 4
    layers = 4
    batches_per_step = 7

    # Ensure inputs are in the range [1.0, 2.0] so that comparing with 0 is valid
    ip_shape = [dsize]
    ip_data = np.full([batches_per_step] + ip_shape, 1).astype(np.float32)

    dropouts = []
    dropoutGrads = []
    dropoutInputs = []
    dropoutOutputs = []

    builder = popart.Builder()
    ip = builder.addInputTensor(popart.TensorInfo("FLOAT", ip_shape))

    def add_layer(layer_input, vgraph_num):
        # The identity gives us a clean handle on the dropout's output in the
        # bwd pass; the gradient of the layer input (d_next_layer_in) would
        # also include the gradient of the AddOp.
        identity0 = builder.aiOnnx.identity([layer_input])
        builder.virtualGraph(identity0, vgraph_num)

        [dropout0] = builder.aiOnnx.dropout([identity0],
                                            num_outputs=1,
                                            ratio=ratio)
        builder.virtualGraph(dropout0, vgraph_num)

        # the input to the forward pass dropout
        dropoutInputs.append(identity0)
        # the input to the backward pass dropout
        dropoutInputs.append(popart.reservedGradientPrefix() + dropout0)
        # the output of the backward pass dropout
        dropoutGrads.append(popart.reservedGradientPrefix() + identity0)
        # the output of the forward pass dropout
        dropouts.append(dropout0)

        relu0 = builder.aiOnnx.relu([dropout0])
        builder.virtualGraph(relu0, vgraph_num)

        # This ensures that all input elements to the dropouts, in both
        # the forward and backward passes, will be non-zero.
        add0 = builder.aiOnnx.add([layer_input, dropout0])
        builder.virtualGraph(add0, vgraph_num)

        return add0

    # construct a graph of `layers` number of layers
    # with each layer on a different IPU.
    next_layer_in = ip
    for vgraph in range(layers):
        next_layer_in = add_layer(next_layer_in, vgraph)
    out = next_layer_in

    # TODO: use the tu.requires_ipu decorator
    if tu.ipu_available(ipus):
        device = tu.create_test_device(numIpus=ipus)
    else:
        pytest.skip("Test needs to run on IPU, but none are available")

    dfAnchors = {}
    for x in dropouts + dropoutGrads + dropoutInputs:
        dfAnchors[x] = popart.AnchorReturnType("All")

    dataFlow = popart.DataFlow(batches_per_step, dfAnchors)

    loss = builder.aiGraphcore.identityloss([out])
    builder.virtualGraph(loss, layers - 1)

    userOptions = popart.SessionOptions()
    userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual
    userOptions.enablePipelining = True
    userOptions.autoRecomputation = popart.RecomputationType.Pipeline

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=dataFlow,
                                     optimizer=popart.ConstSGD(0.1),
                                     loss=loss,
                                     userOptions=userOptions,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    stepio = popart.PyStepIO({ip: ip_data}, anchors)

    session.run(stepio)

    print(anchors.keys())

    # Check that none of the elements of the dropout inputs are zero
    for tid in dropoutInputs:
        x = anchors[tid]
        print(f'{tid}: {x}')
        zero = np.zeros(x.shape)
        assert not np.any(np.equal(x, zero)), \
               f'Some elements of dropout input {tid} are zero'

    print()

    # For each dropout, check that the masked out elements are the same
    # in the forward and backward passes.
    for fwdId, bwdId in zip(dropouts, dropoutGrads):
        print(f'{fwdId}:\n{np.sign(anchors[fwdId])}')
        print(f'{bwdId}:\n{np.sign(anchors[bwdId])}')
        lhs = np.sign(anchors[fwdId])
        rhs = np.sign(anchors[bwdId])
        assert np.array_equal(lhs, rhs), \
               f'{fwdId} and {bwdId} did not use the same dropout mask'
        print()
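
The mask comparison above works because every dropout input is strictly positive, so np.sign reduces both the forward output and the backward gradient to a {0, 1} mask. A standalone numpy illustration:

import numpy as np

x = np.array([1.5, 2.0, 1.2, 1.8], dtype=np.float32)  # strictly positive inputs
mask = np.array([1, 0, 1, 0], dtype=np.float32)       # one dropout mask
fwd = x * mask                                        # forward dropout output
bwd = np.full_like(x, 0.5) * mask                     # a positive gradient through the same mask
assert np.array_equal(np.sign(fwd), np.sign(bwd))     # masked positions agree
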
def test_virtual_graph4():

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o1l1 = builder.aiGraphcore.l1loss([o1], 0.1)
        o2 = builder.aiOnnx.add([i3, o1])
        o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.mul([i1, i3])
        o3l1 = builder.aiGraphcore.l1loss([o3], 0.1)

    with builder.virtualGraph(3):
        loss = builder.aiOnnx.sum([o1l1, o2l1, o3l1])

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o1: popart.AnchorReturnType("All"),
            o2: popart.AnchorReturnType("All"),
            o3: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3:
            popart.AnchorReturnType("All")
        })

    optimizer = popart.ConstSGD(0.01)

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=loss,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))

    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3}
    stepio = popart.PyStepIO(inputs, anchors)

    # Weights must be written to the device before running a TrainingSession
    s.weightsFromHost()
    s.run(stepio)
def test_virtual_graph3():

    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i4 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o2 = builder.aiOnnx.add([i3, i4])

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.add([o1, o2])
        o = builder.aiOnnx.add([i1, o3])
        o = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i4:
            popart.AnchorReturnType("All")
        })

    optimizer = popart.SGD({"defaultLearningRate": (0.01, True)})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=o,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))

    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)
    data4 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3, i4: data4}
    stepio = popart.PyStepIO(inputs, anchors)

    # Weights must be written to the device before running a TrainingSession
    s.weightsFromHost()
    s.run(stepio)
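
In both virtual-graph tests the input gradients are anchored under reserved-prefix names: popart.reservedGradientPrefix() returns the string prefix PopART prepends to a tensor id to name its gradient. After s.run(stepio) each gradient can therefore be read back from the anchors dict, e.g.:

grad_i1 = anchors[popart.reservedGradientPrefix() + i1]  # d(loss)/d(i1)
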
Example #6
def test_detach_grad_branches(detach_branch_popart, detach_branch_pytorch):
    # fix the random seed for this test
    np.random.seed(0)
    Batchsize = 8
    Classes = 32

    dshape = [Batchsize, 2, 4, 4]
    lshape = [Batchsize]
    wshape = [2, 2, 3, 3]

    ip_data = np.random.rand(*dshape).astype(np.float32)
    w1_data = np.random.rand(*wshape).astype(np.float32)
    w2_data = np.random.rand(*wshape).astype(np.float32)
    lb_data = np.random.randint(Classes, size=lshape)

    builder = popart.Builder()

    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", dshape),
                                    "input_i1")

    lb = builder.addInputTensor(popart.TensorInfo("INT32", lshape))
    w1 = builder.addInitializedInputTensor(w1_data)
    w2 = builder.addInitializedInputTensor(w2_data)

    conv1 = builder.aiOnnx.conv([input_, w1],
                                dilations=[1, 1],
                                pads=[1, 1, 1, 1],
                                strides=[1, 1],
                                debugPrefix="conv")
    r1 = builder.reshape_const(builder.aiOnnx, [conv1], [Batchsize, Classes])
    conv2 = builder.aiOnnx.conv([input_, w2],
                                dilations=[1, 1],
                                pads=[1, 1, 1, 1],
                                strides=[1, 1],
                                debugPrefix="conv")
    r2 = builder.reshape_const(builder.aiOnnx, [conv2], [Batchsize, Classes])
    if detach_branch_popart:
        r2 = builder.aiGraphcore.detach([r2])

    add = builder.aiOnnx.sum([r1, r2])
    o = builder.aiOnnx.softmax([add], axis=np.size(lb_data.shape))
    loss = builder.aiGraphcore.nllloss([o, lb])

    dataFlow = popart.DataFlow(1, [
        o, loss,
        popart.reservedGradientPrefix() + o,
        popart.reservedGradientPrefix() + input_, w1, w2
    ])

    opts = popart.SessionOptions()
    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        loss=loss,
        optimizer=popart.ConstSGD(LEARNING_RATE, WEIGHT_DECAY),
        userOptions=opts,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}))

    session.prepareDevice()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({input_: ip_data, lb: lb_data}, anchors)
    session.weightsFromHost()

    # Torch

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(2, 2, 3, padding=[1, 1], bias=False)
            self.conv2 = nn.Conv2d(2, 2, 3, padding=[1, 1], bias=False)
            self.conv1.weight.data = torch.tensor(w1_data)
            self.conv2.weight.data = torch.tensor(w2_data)
            # PyTorch nll loss expects logsoftmax input
            self.sm = nn.LogSoftmax(dim=np.size(lb_data.shape))
            self.nll = nn.NLLLoss()

        def forward(self, x, y):
            x1 = self.conv1(x)
            x1 = torch.reshape(x1, [Batchsize, Classes])
            if detach_branch_pytorch:
                with torch.no_grad():
                    x2 = self.conv2(x)
            else:
                x2 = self.conv2(x)

            x2 = torch.reshape(x2, [Batchsize, Classes])
            x = x1 + x2
            x = self.sm(x)
            x = self.nll(x, y)
            return x

    net = Net()
    optimizer = optim.SGD(net.parameters(),
                          lr=LEARNING_RATE,
                          weight_decay=WEIGHT_DECAY)

    input_t = torch.tensor(ip_data, requires_grad=True, dtype=torch.float32)
    label_t = torch.tensor(lb_data, requires_grad=False, dtype=torch.long)

    for step in range(4):
        print(f"Step {step +1}")
        session.run(stepio)

        # Torch
        #
        optimizer.zero_grad()
        loss = net(input_t, label_t)
        loss.backward()
        optimizer.step()

        print(detach_branch_popart, detach_branch_pytorch)
        print("Popart: w1", np.mean(anchors[w1]))
        print("PyTorch: w1", np.mean(net.conv1.weight.data.numpy()))
        print("Popart: w2", np.mean(anchors[w2]))
        print("PyTorch: w2", np.mean(net.conv2.weight.data.numpy()))

        # Check the weights match if the branches are treated the same;
        # if not, make sure the right-hand branch's weights do not match
        if detach_branch_popart == detach_branch_pytorch:
            assert np.allclose(anchors[w1], net.conv1.weight.data.numpy(),
                               1e-4)
            assert np.allclose(anchors[w2], net.conv2.weight.data.numpy(),
                               1e-4)
        else:
            assert not np.allclose(anchors[w2], net.conv2.weight.data.numpy(),
                                   1e-4)
def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):

    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugContext="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugContext="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    classes = np.prod(shape_d0) / (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. The labels are repeated
        # so results stay consistent when input shapes differ between examples.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split the batchesPerStep batches into gradAcclFactor * batchesPerStep
        # micro-batch samples.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    for i in range(6):
        session.run(stepio)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
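
The label reshaping above builds an outer dimension of batchesPerStep * gradAcclFactor. A concrete numpy illustration with small assumed values:

import numpy as np

batchesPerStep, gradAcclFactor, batch_size = 7, 2, 4
micro_batch_size = batch_size // gradAcclFactor
label = np.random.randint(0, 32, size=[batch_size]).astype(np.int32)  # (4,)
label = np.repeat(label[np.newaxis], batchesPerStep, 0)               # (7, 4)
label = label.reshape([gradAcclFactor * batchesPerStep, -1])          # (14, 2)
assert label.shape == (batchesPerStep * gradAcclFactor, micro_batch_size)
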
Example #8
def create_model(batch_size):
    """ Create an ONNX protobuf description of a simple model.
        This function uses the popart library builder functions to create the
        ONNX description directly. An alternative would be to load an
        exported ONNX protobuf from a file.
    """
    builder = popart.Builder()

    input_shape = popart.TensorInfo('FLOAT', [batch_size, 1, ROWS, COLS])
    input_t = builder.addInputTensor(input_shape)
    x = input_t

    init_weights = kaiming_init([20, 1, 5, 5], 1 * 5 * 5)
    W1 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([20], 1 * 5 * 5, 1, 1)
    b1 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.conv([x, W1, b1],
                            dilations=[1, 1],
                            kernel_shape=[5, 5],
                            strides=[1, 1],
                            pads=[0, 0, 0, 0])

    x = builder.aiOnnx.relu([x])
    (x, ) = builder.aiOnnx.maxpool([x],
                                   num_outputs=1,
                                   kernel_shape=[2, 2],
                                   pads=[0, 0, 0, 0],
                                   strides=[2, 2])

    init_weights = kaiming_init([50, 20, 5, 5], 20 * 5 * 5)
    W2 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([50], 20 * 5 * 5, 1, 1)
    b2 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.conv([x, W2, b2],
                            dilations=[1, 1],
                            kernel_shape=[5, 5],
                            strides=[1, 1],
                            pads=[0, 0, 0, 0])

    x = builder.aiOnnx.relu([x])
    (x, ) = builder.aiOnnx.maxpool([x],
                                   num_outputs=1,
                                   kernel_shape=[2, 2],
                                   pads=[0, 0, 0, 0],
                                   strides=[2, 2])

    shape = builder.aiOnnx.constant(np.asarray([batch_size, 50 * 4**2]))
    x = builder.aiOnnx.reshape([x, shape])

    init_weights = kaiming_init([50 * 4**2, 500], 50 * 4**2)
    W3 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([500], 50 * 4**2, 1, 1)
    b3 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.matmul([x, W3])
    x = builder.aiOnnx.add([x, b3])
    x = builder.aiOnnx.relu([x])

    init_weights = kaiming_init([500, 10], 500)
    W4 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([10], 500, 1, 1)
    b4 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.matmul([x, W4])
    output_t = builder.aiOnnx.add([x, b4])

    builder.addOutputTensor(output_t)
    probs = builder.aiOnnx.softmax([output_t])

    label_shape = popart.TensorInfo('INT32', [batch_size])
    label = builder.addInputTensor(label_shape)

    loss = popart.NllLoss(probs, label, 'nllLossVal')

    proto = builder.getModelProto()

    return proto, input_t, label, output_t, loss
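
kaiming_init is a helper from the surrounding example and is not shown. A plausible sketch (an assumption: He-style initialisation where the trailing arguments scale the variance, consistent with the calls above that pass 1, 1 for the biases):

import numpy as np

def kaiming_init(shape, fan_in, a=2.0, scale=1.0):
    # Zero-mean normal with std = scale * sqrt(a / fan_in); the weight calls
    # above use the He default a=2, the bias calls pass a=1, scale=1.
    stddev = scale * np.sqrt(a / fan_in)
    return np.random.normal(0, stddev, shape).astype(np.float32)
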
Example #9
def conv_settings(capfd, operation):
    builder = popart.Builder()

    input_shape = popart.TensorInfo("FLOAT", [1, 2, 4, 4])
    weight_shape = popart.TensorInfo("FLOAT", [3, 2, 3, 3])

    weight_data = np.ones(weight_shape.shape(), np.float32)
    input_ = builder.addInputTensor(input_shape)
    weights = builder.addInitializedInputTensor(weight_data)
    act = builder.aiOnnx.conv([input_, weights],
                              dilations=[1, 1],
                              pads=[1, 1, 1, 1],
                              strides=[1, 1])
    o = builder.aiOnnx.relu([act])
    loss = builder.aiGraphcore.identityloss([o])

    operation(builder, act=act, o=o)

    anchor_names = [
        o,
        popart.reservedGradientPrefix() + input_,
        popart.reservedGradientPrefix() + weights
    ]
    training_dataFlow = popart.DataFlow(
        1, {
            anchor_names[0]: popart.AnchorReturnType("All"),
            anchor_names[1]: popart.AnchorReturnType("All"),
            anchor_names[2]: popart.AnchorReturnType("All")
        })

    opts = popart.SessionOptions()
    opts.constantWeights = False  # Allow the weights to be updated

    # Create the device
    device = tu.create_test_device(1, opts={"compileIPUCode": True})
    device.attach()

    # Prepare the input data
    input_data = np.random.random_sample(input_shape.shape()).astype(
        np.float32)

    # Prepare the Training session
    training_session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                              dataFlow=training_dataFlow,
                                              loss=loss,
                                              optimizer=popart.ConstSGD(0.01),
                                              userOptions=opts,
                                              deviceInfo=device)

    # Compile the training graph
    training_session.prepareDevice()

    # Run the training session
    training_session.weightsFromHost()

    training_anchors = training_session.initAnchorArrays()
    training_inputs = {input_: input_data}

    training_session.run(popart.PyStepIO(training_inputs, training_anchors))

    captured = capfd.readouterr()

    return captured.err
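
conv_settings expects an operation callback that applies per-op settings to the conv output (act) or the relu output (o) before the session is built, and returns the captured log so the caller can assert the settings took effect. One plausible callback (an assumption), using the Builder's per-op available-memory option:

def set_conv_memory(builder, act, o):
    # Cap the temporary memory the conv may use at 30% of tile memory.
    builder.setAvailableMemoryProportion(act, 0.3)

# log = conv_settings(capfd, set_conv_memory)  # capfd is the pytest fixture
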
Example #10
def test_basic():

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 2, 32, 32]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 2, 32, 32]))

    old_o = ""

    o = builder.aiOnnx.abs([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.acos([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.acosh([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.add([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.logical_and([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.asin([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.asinh([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.atan([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.atanh([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.ceil([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.cos([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.cosh([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.div([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.elu([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.equal([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.exp([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.floor([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.greater([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.identity([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.less([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.log([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.logsoftmax([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.max([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.mean([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.min([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.mul([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.neg([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.logical_not([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.logical_or([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.pow([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.reciprocal([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.relu([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.sigmoid([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.sin([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.sinh([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.softsign([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.softmax([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.sqrt([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.sub([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.sum([i1, i2])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.tan([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.tanh([i1])
    assert (old_o != o)
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    old_o = o

    o = builder.aiOnnx.logical_xor([i1, i2])
    assert (builder.getTensorShape(o) == [1, 2, 32, 32])
    assert (old_o != o)

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.abs([])
    assert (e_info.value.args[0].startswith("Abs has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.acos([])
    assert (e_info.value.args[0].startswith("Acos has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.acosh([])
    assert (e_info.value.args[0].startswith("Acosh has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.add([])
    assert (e_info.value.args[0].startswith("Add has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.logical_and([])
    assert (e_info.value.args[0].startswith("And has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.asin([])
    assert (e_info.value.args[0].startswith("Asin has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.asinh([])
    assert (e_info.value.args[0].startswith("Asinh has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.atan([])
    assert (e_info.value.args[0].startswith("Atan has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.atanh([])
    assert (e_info.value.args[0].startswith("Atanh has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.ceil([])
    assert (e_info.value.args[0].startswith("Ceil has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.cos([])
    assert (e_info.value.args[0].startswith("Cos has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.cosh([])
    assert (e_info.value.args[0].startswith("Cosh has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.div([])
    assert (e_info.value.args[0].startswith("Div has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.elu([])
    assert (e_info.value.args[0].startswith("Elu has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.equal([])
    assert (e_info.value.args[0].startswith("Equal has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.exp([])
    assert (e_info.value.args[0].startswith("Exp has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.floor([])
    assert (e_info.value.args[0].startswith("Floor has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.greater([])
    assert (e_info.value.args[0].startswith("Greater has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.identity([])
    assert (e_info.value.args[0].startswith("Identity has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.less([])
    assert (e_info.value.args[0].startswith("Less has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.log([])
    assert (e_info.value.args[0].startswith("Log has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.max([])
    assert (e_info.value.args[0].startswith("Max has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.mean([])
    assert (e_info.value.args[0].startswith("Mean has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.min([])
    assert (e_info.value.args[0].startswith("Min has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.mul([])
    assert (e_info.value.args[0].startswith("Mul has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.neg([])
    assert (e_info.value.args[0].startswith("Neg has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.logical_not([])
    assert (e_info.value.args[0].startswith("Not has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.logical_or([])
    assert (e_info.value.args[0].startswith("Or has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.pow([])
    assert (e_info.value.args[0].startswith("Pow has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.reciprocal([])
    assert (
        e_info.value.args[0].startswith("Reciprocal has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.relu([])
    assert (e_info.value.args[0].startswith("Relu has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.sigmoid([])
    assert (e_info.value.args[0].startswith("Sigmoid has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.sin([])
    assert (e_info.value.args[0].startswith("Sin has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.sinh([])
    assert (e_info.value.args[0].startswith("Sinh has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.softsign([])
    assert (e_info.value.args[0].startswith("Softsign has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.sqrt([])
    assert (e_info.value.args[0].startswith("Sqrt has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.sub([])
    assert (e_info.value.args[0].startswith("Sub has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.sum([])
    assert (e_info.value.args[0].startswith("Sum has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.tan([])
    assert (e_info.value.args[0].startswith("Tan has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.tanh([])
    assert (e_info.value.args[0].startswith("Tanh has invalid number of"))

    with pytest.raises(popart.popart_exception) as e_info:
        builder.aiOnnx.logical_xor([])
    assert (e_info.value.args[0].startswith("Xor has invalid number of"))

    proto = builder.getModelProto()

    assert (len(proto) > 0)
    assert (len(i1) > 0)
    assert (len(i2) > 0)
    assert (len(o) > 0)
    assert (i1 != i2)
    assert (i2 != o)

    with pytest.raises(TypeError) as e_info:
        builder.aiOnnx.add(0, 0)

    assert (e_info.value.args[0].startswith("add(): incompatible function"))
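
Every check in test_basic follows the same three-line pattern, so the repetition could equally be driven by a table of op names via getattr. A compact sketch of that design alternative:

def check_elementwise_ops(builder, i1, i2, shape):
    unary = ["abs", "acos", "relu", "tanh"]  # extend as required
    binary = ["add", "mul", "sub", "pow"]
    old_o = ""
    for name in unary + binary:
        args = [i1] if name in unary else [i1, i2]
        o = getattr(builder.aiOnnx, name)(args)
        assert old_o != o
        assert builder.getTensorShape(o) == shape
        old_o = o

# check_elementwise_ops(builder, i1, i2, [1, 2, 32, 32])
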
Example #11
    op_tester.atol = 1e-06
    op_tester.rtol = 1e-05
    op_tester.run(init_builder, reference, 'train')


if __name__ == "__main__":
    builder = popart.Builder()
    d1 = np.random.randint(0, 20, size=(2, 2, 3)).astype(np.float32)

    input_size = d1.shape[2]  # (2,2,3)
    hidden_size = 7

    d2 = np.random.rand(1, 3 * hidden_size, input_size).astype(np.float32)
    d3 = np.random.rand(1, 3 * hidden_size, hidden_size).astype(np.float32)

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", d1.shape))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", d2.shape))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", d3.shape))
    Y, Y_h = builder.aiOnnx.gru([i1, i2, i3],
                                2,
                                clip=None,
                                direction="bidirectional")
    builder.addOutputTensor(Y)

    dataFlow = popart.DataFlow(1, {Y: popart.AnchorReturnType("All")})

    # Create a session to compile and execute the graph for inference
    #------------------------------------------------------------------------------
    inferenceOptions = popart.SessionOptions()
    # Need to compile the inference graph with variable weights so they can be
    # updated before execution
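    # NOTE: the snippet is truncated here. Based on the session pattern used in
    # the other examples, a plausible continuation (an assumption) would be:
    inferenceOptions.constantWeights = False  # allow the weights to be updated
    inferenceSession = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        userOptions=inferenceOptions,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}))
    inferenceSession.prepareDevice()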
Example #12
def test_is_initializer():
    builder = popart.Builder()
    i0 = builder.addInputTensor(popart.TensorInfo("FLOAT", [10, 9, 8, 7]))
    i1 = builder.addInitializedInputTensor(np.array([1, 6], dtype=np.int64))
    assert not builder.isInitializer(i0)
    assert builder.isInitializer(i1)
Example #13
def popart_result_and_model(config, mode, weight_transposed, is_bwd=False):
    """Run popart model based on config.

    Args:
        config (BertConfig): Popart config.
        mode (ExecutionMode): Execution mode to build the graph for.
        weight_transposed (bool): Construct embedding dict transposed.
        is_bwd (bool, optional): Construct training graph if True,
                                 else inference graph. Defaults to False.

    Returns:
        Tuple: Gathered numpy data, outputs from model, proto, post_proto
    """

    scope_provider = ScopeProvider()
    user_options = {}
    if mode == ExecutionMode.PHASED:
        builder = popart.Builder()

        indices_len = config.batch_size * config.sequence_length
        sequence_info = popart.TensorInfo("UINT32", [indices_len])
        indices = builder.addInputTensor(sequence_info)
        data = {indices: np.random.randint(0, config.vocab_length, (indices_len)).astype(np.uint32)}

        popart_model = EmbeddingSerialised(scope_provider.get_scope('Token'),
                                           input_dim=config.vocab_length,
                                           output_dim=config.hidden_size,
                                           num_splits=config.embedding_serialization_vocab_steps,
                                           custom=True,
                                           dtype=config.dtype,
                                           detach=not config.update_embedding_dict,
                                           weight_transposed=weight_transposed,
                                           builder=builder,
                                           scope_provider=scope_provider)
        user_options = {
            "batchSerializationFactor": 1,
            "executionPhases": popart_model.total_execution_phases
        }
        output = popart_model(indices)
    else:
        popart_model = get_model(config, mode, block="embedding", initializers={})
        builder = popart_model.builder

        indices_len = config.batch_size * config.sequence_length
        sequence_info = popart.TensorInfo("UINT32", [indices_len])
        indices = builder.addInputTensor(sequence_info)
        data = {indices: np.random.randint(0, config.vocab_length, (indices_len)).astype(np.uint32)}
        num_splits = config.embedding_serialization_vocab_steps
        output = popart_model.word_embedding_serialized(indices, num_splits)

    if is_bwd:
        l1_lambda = 0.1
        if mode == ExecutionMode.PHASED:
            loss_scope = scope_provider.get_scope('Loss', 'prev')
            with popart_model.scope_provider(popart_model.builder, loss_scope):
                l1_loss = popart_model.builder.aiGraphcore.l1loss([output],
                                                                  l1_lambda,
                                                                  debugPrefix="l1LossVal",
                                                                  reduction=popart.ReductionType.Sum)
        else:
            l1_loss = popart_model.builder.aiGraphcore.l1loss([output],
                                                              l1_lambda,
                                                              debugPrefix="l1LossVal",
                                                              reduction=popart.ReductionType.Sum)
        proto = builder.getModelProto()
        optimizer = popart.ConstSGD(0.01)
        outputs, post_proto = run_py(proto,
                                     data, (output, l1_loss),
                                     loss=l1_loss,
                                     optimizer=optimizer,
                                     user_options=user_options,
                                     execution_mode=mode)
    else:
        proto = builder.getModelProto()
        outputs, post_proto = run_py(proto, data, output,
                                     user_options=user_options,
                                     execution_mode=mode)

    return [data[indices]], outputs, proto, post_proto
Example #14
def test_embedding_fwd(custom_ops):
    #  ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        micro_batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        inference=True)
    popart_model = Bert(config)

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = popart_model.builder.addInputTensor(sequence_info)
    positions = popart_model.builder.addInputTensor(sequence_info)
    segments = popart_model.builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(
            0, config.vocab_length,
            (config.micro_batch_size * config.sequence_length)).astype(
                np.uint32),
        positions:
        np.random.randint(
            0, config.max_positional_length,
            (config.micro_batch_size * config.sequence_length)).astype(
                np.uint32),
        segments:
        np.random.randint(
            0, 2, (config.micro_batch_size * config.sequence_length)).astype(
                np.uint32)
    }

    user_options = {"enableStochasticRounding": True}
    with popart_model.builder.nameScope("Embedding"):
        output = popart_model.embedding(indices, positions, segments)

    proto = popart_model.builder.getModelProto()
    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 user_options=user_options)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.micro_batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    #  ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps))
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {})
    torch_outputs = run_fwd_model(inputs, torch_model)

    check_tensors(torch_outputs, outputs, margin=5e-7)
Example #15
    def test(config, iteration, true_scaling, test_case):
        builder = popart.Builder()

        w0name = "weight_0"
        w1name = "weight_1"
        w2name = "weight_2"

        input0Shape = [1, 1, 1]
        input0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", input0Shape), "input0")

        w0data = np.array([test_case[0][0]], dtype=np.float32)
        w0R = np.empty([1], dtype=np.float32)
        w0Id = builder.addInitializedInputTensor(w0data, w0name)

        w1data = np.array([test_case[1][0]], dtype=np.float32)
        w1R = np.empty([1], dtype=np.float32)
        w1Id = builder.addInitializedInputTensor(w1data, w1name)

        w2data = np.array([test_case[2][0]], dtype=np.float32)
        w2R = np.empty([1], dtype=np.float32)
        w2Id = builder.addInitializedInputTensor(w2data, w2name)

        add0 = builder.aiOnnx.add([w0Id, input0])
        add1 = builder.aiOnnx.add([w1Id, add0])
        add2 = builder.aiOnnx.add([w2Id, add1])
        loss = builder.aiGraphcore.l1loss([add2],
                                          1.0,
                                          debugContext="l1LossVal")
        builder.addOutputTensor(add2)

        proto = builder.getModelProto()
        dataFlow = popart.DataFlow(1, {})
        opts = popart.SessionOptions()
        opts.reportOptions = {"showExecutionSteps": "true"}
        pat = popart.Patterns(popart.PatternsLevel.Default)
        dm = popart.DeviceManager()
        dm.setOnDemandAttachTimeout(int(1e4))
        device = dm.acquireAvailableDevice(
            1,
            connectionType=popart.DeviceConnectionType.OnDemand,
            selectionCriterion=popart.DeviceSelectionCriterion.Random)
        if device is None:
            raise OSError("Failed to acquire IPU.")

        # The stage->tensor map would come from the Bert model in reality
        # (see model.tensors)
        mock_tensor_map = {0: [w0Id], 1: [w1Id], 2: [w2Id]}

        factory = ScheduledOptimizerFactory(config,
                                            iteration,
                                            tensors=mock_tensor_map)
        assert_scaled_lr(factory, true_scaling)

        optimizer_step0 = factory.create()

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=loss,
                                         optimizer=optimizer_step0,
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        input_data = np.array([3.1415], dtype=np.float32)
        stepio = popart.PyStepIO({input0: input_data}, anchors)

        for step in range(iteration.total_steps):
            session.run(stepio)
            session.weightsToHost()
            weightsRead = popart.PyWeightsIO({w0Id: w0R, w1Id: w1R, w2Id: w2R})
            session.readWeights(weightsRead)

            assert (np.isclose(test_case[0][step + 1], w0R))
            assert (np.isclose(test_case[1][step + 1], w1R))
            assert (np.isclose(test_case[2][step + 1], w2R))

            iteration.count += 1

            if factory.should_update(iteration):
                optimizer_step1 = factory.update_and_create(iteration)
                assert_scaled_lr(factory, true_scaling)

                session.updateOptimizerFromHost(optimizer_step1)
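
For reference, the test_case argument above is indexed as test_case[w][step]: one expected value trajectory per weight. A hypothetical instance (values invented for illustration only):

# Hypothetical test_case: initial value followed by the expected value
# after each optimizer step, one list per weight w0, w1, w2.
test_case = [
    [100.0, 99.9, 99.8],    # weight_0
    [200.0, 199.9, 199.8],  # weight_1
    [300.0, 299.9, 299.8],  # weight_2
]
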
def sparse_mm_infer(sparse_mm_type, lhs_dims, vanilla_rhs_dims, block_size, sparsity_level, transpose_rhs, memory_cycle_ratio, inner_group_size):
    """ """
    if transpose_rhs:
        matmul_dims = [lhs_dims[-2], vanilla_rhs_dims[-1], vanilla_rhs_dims[-2]]
    else:
        matmul_dims = [lhs_dims[-2], vanilla_rhs_dims[-2], vanilla_rhs_dims[-1]]

    lhs = create_dense_matrix(lhs_dims)
    if sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        bsr_rhs, lengths_per_2d_plane, vanilla_rhs, sparsity_mask = create_sparse_matrix(vanilla_rhs_dims, block_size[1:], sparsity_level)

        rhs = bsr_rhs
        rhs_dims = bsr_rhs.shape
    elif sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_DENSE_RHS_SPARSE_OUT']:
        output_dims = lhs_dims[:-1]
        output_dims.append(vanilla_rhs_dims[-1])
        output_block_size = [block_size[0], block_size[2]]

        bsr_output, lengths_per_2d_plane, _, sparsity_mask = create_sparse_matrix(output_dims, output_block_size, sparsity_level)

        rhs_dims = vanilla_rhs_dims
        rhs = create_dense_matrix(rhs_dims)

    # Create a builder and construct a graph
    builder = popart.Builder()

    lhs_tensorInfo = popart.TensorInfo("FLOAT", lhs_dims)
    rhs_tensorInfo = popart.TensorInfo("FLOAT", rhs_dims)

    lhsTensor = builder.addInputTensor(lhs_tensorInfo)
    rhsTensor = builder.addInputTensor(rhs_tensorInfo)

    outTensor = builder.customOp(opName="BSMatMul",
                                 opVersion=1,
                                 domain="ai.graphcore",
                                 inputs=[lhsTensor, rhsTensor],
                                 attributes={
                                     "bsr_rhs_lengths_per_2d_plane": lengths_per_2d_plane.tolist(),
                                     "matrix_dims": matmul_dims,
                                     "block_size": block_size,
                                     "sparsity_mask": sparsity_mask.tolist(),
                                     "bsmatmul_type": sparse_mm_type,
                                     "transpose_rhs": transpose_rhs,
                                     "memory_cycle_ratio": memory_cycle_ratio,
                                     "inner_group_size": inner_group_size,
                                     "in_type": g_input_data_type,
                                     "out_type": g_output_data_type,
                                     "pp_type": g_pp_data_type
                                 })[0]

    builder.addOutputTensor(outTensor)

    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1, {outTensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    rhs = np.array(rhs, dtype=g_input_data_type)

    stepio = popart.PyStepIO({lhsTensor: lhs, rhsTensor: rhs}, anchors)
    session.run(stepio)

    ipuOutput = anchors[outTensor]

    if sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        if transpose_rhs:
            transpose_indices = list(range(len(vanilla_rhs_dims)))
            transpose_indices[-2], transpose_indices[-1] = transpose_indices[-1], transpose_indices[-2]

            vanilla_rhs = vanilla_rhs.transpose(tuple(transpose_indices))
            goldOutput = mm(lhs, vanilla_rhs)
        else:
            goldOutput = mm(lhs, vanilla_rhs)
    else:
        assert len(lhs.shape) == len(rhs.shape)
        if len(lhs.shape) == 2:
            lhs = np.expand_dims(lhs, 0)
            rhs = np.expand_dims(rhs, 0)

        mmOutput = mm(lhs, rhs)

        totalGroupDims = int(np.prod(lhs_dims[:-2]))

        num_rows_sparsity_mask_2d = output_dims[-2] // block_size[0]
        num_cols_sparsity_mask_2d = output_dims[-1] // block_size[2]

        assert sparsity_mask.shape == (totalGroupDims * num_rows_sparsity_mask_2d * num_cols_sparsity_mask_2d,)
        mmOutput = mmOutput.reshape((totalGroupDims, lhs_dims[-2], rhs_dims[-1]))

        goldOutput = []
        for dim in range(totalGroupDims):
            offset = num_rows_sparsity_mask_2d * num_cols_sparsity_mask_2d
            mmOutput_2d = mmOutput[dim]
            sliced_sparsity_mask = sparsity_mask[dim * offset: dim * offset + offset]

            for sparsity_mask_idx in range(len(sliced_sparsity_mask)):
                if sliced_sparsity_mask[sparsity_mask_idx]:
                    mmOutput_2d_row_start = (sparsity_mask_idx // num_cols_sparsity_mask_2d) * block_size[0]
                    mmOutput_2d_row_end = mmOutput_2d_row_start + block_size[0]

                    mmOutput_2d_col_start = (sparsity_mask_idx % num_cols_sparsity_mask_2d) * block_size[2]
                    mmOutput_2d_col_end = mmOutput_2d_col_start + block_size[2]

                    mmOutput_2d_sliced = mmOutput_2d[mmOutput_2d_row_start: mmOutput_2d_row_end, mmOutput_2d_col_start: mmOutput_2d_col_end]
                    goldOutput.append(mmOutput_2d_sliced.reshape(block_size[0] * block_size[2]))

        goldOutput = np.array(goldOutput)

    return ipuOutput, goldOutput
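
A hedged usage sketch for sparse_mm_infer, assuming the module-level helpers and globals (g_sparseMatMulTypeLookup, create_sparse_matrix, mm, the g_*_data_type values) are in scope; the dimensions are illustrative:

import numpy as np

# Dense LHS times block-sparse RHS, dense output; block sizes must divide
# the corresponding matrix dimensions.
ipu_out, gold_out = sparse_mm_infer(
    sparse_mm_type=g_sparseMatMulTypeLookup['DENSE_LHS_SPARSE_RHS_DENSE_OUT'],
    lhs_dims=[8, 16],
    vanilla_rhs_dims=[16, 32],
    block_size=[8, 8, 8],
    sparsity_level=0.5,
    transpose_rhs=False,
    memory_cycle_ratio=1.0,
    inner_group_size=1)
np.testing.assert_allclose(ipu_out, gold_out, rtol=1e-5, atol=1e-5)
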
Beispiel #17
0
    def run(model_file_name, explicit_recompute=True):
        dsize = 10
        builder = popart.Builder()
        ip = builder.addInputTensor(popart.TensorInfo("FLOAT", [dsize, dsize]))
        d__ip = popart.reservedGradientPrefix() + ip

        def add_layer(in_id):
            np.random.seed(1)
            scaler = 0.01
            w = builder.addInitializedInputTensor(
                np.random.randn(dsize, dsize).astype(np.float32) * scaler)
            b = builder.addInitializedInputTensor(
                np.zeros((dsize, 1)).astype(np.float32))
            matmul_id = builder.aiOnnxOpset10.gemm([in_id, w, b])
            return matmul_id

        if explicit_recompute:
            with builder.recomputeOutput(popart.RecomputeType.Recompute):
                m1 = add_layer(ip)
                m2 = add_layer(m1)
                m3 = add_layer(m2)
        else:
            m1 = add_layer(ip)
            m2 = add_layer(m1)
            m3 = add_layer(m2)

        anchorIds = []
        for i in (ip, m1, m2, m3):
            anchorIds.append(popart.reservedGradientPrefix() + i)

        out = builder.aiGraphcore.identityloss([m3])
        builder.addOutputTensor(out)

        device = tu.create_test_device()

        dataflow_anchors = {}
        for anchorId in anchorIds:
            dataflow_anchors.update({anchorId: popart.AnchorReturnType("All")})

        opts = popart.SessionOptions()
        opts.explicitRecomputation = explicit_recompute

        proto = builder.getModelProto()

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, dataflow_anchors),
            optimizer=popart.ConstSGD(0.01),
            loss=out,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        ip_data = np.ones((dsize, dsize), dtype=np.float32)
        stepio = popart.PyStepIO({ip: ip_data}, anchors)

        session.run(stepio)
        session.modelToHost(str(tmpdir / model_file_name))
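
The helper above is presumably called twice and the saved models compared; a sketch, assuming the pytest tmpdir captured by the closure:

import onnx
import numpy as np

run("recompute.onnx", explicit_recompute=True)
run("no_recompute.onnx", explicit_recompute=False)

m0 = onnx.load(str(tmpdir / "recompute.onnx"))
m1 = onnx.load(str(tmpdir / "no_recompute.onnx"))
for t0, t1 in zip(m0.graph.initializer, m1.graph.initializer):
    # Recomputation should not change the numerics of the trained weights.
    np.testing.assert_allclose(np.frombuffer(t0.raw_data, dtype=np.float32),
                               np.frombuffer(t1.raw_data, dtype=np.float32))
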
def sparse_mm_train(sparse_mm_type, lhs_dims, vanilla_rhs_dims, block_size, sparsity_level, transpose_rhs, memory_cycle_ratio, inner_group_size):
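    """Train for one step a model built around the BSMatMul custom op."""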
    if transpose_rhs:
        matmul_dims = [lhs_dims[-2], vanilla_rhs_dims[-1], vanilla_rhs_dims[-2]]
    else:
        matmul_dims = [lhs_dims[-2], vanilla_rhs_dims[-2], vanilla_rhs_dims[-1]]

    lhs = create_dense_matrix(lhs_dims)
    if sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        bsr_rhs, lengths_per_2d_plane, vanilla_rhs, sparsity_mask = create_sparse_matrix(vanilla_rhs_dims, block_size[1:], sparsity_level)

        rhs = bsr_rhs
        rhs_dims = bsr_rhs.shape

    elif sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_DENSE_RHS_SPARSE_OUT']:
        output_dims = lhs_dims[:-1]
        output_dims.append(vanilla_rhs_dims[-1])
        output_block_size = [block_size[0], block_size[2]]

        bsr_output, lengths_per_2d_plane, vanilla_output, sparsity_mask = create_sparse_matrix(output_dims, output_block_size, sparsity_level)

        lhs_inv = np.linalg.inv(lhs)

        rhs = np.matmul(lhs_inv, vanilla_output)
        rhs_dims = vanilla_rhs_dims

    # MODEL CREATION
    builder = popart.Builder()

    lhs_tensorInfo = popart.TensorInfo("FLOAT", lhs_dims)
    lhsTensor = builder.addInputTensor(lhs_tensorInfo)
    rhsTensor = builder.addInitializedInputTensor(rhs)

    outTensor = builder.customOp(opName="BSMatMul",
                                 opVersion=1,
                                 domain="ai.graphcore",
                                 inputs=[lhsTensor, rhsTensor],
                                 attributes={
                                     "bsr_rhs_lengths_per_2d_plane": lengths_per_2d_plane.tolist(),
                                     "matrix_dims": matmul_dims,
                                     "block_size": block_size,
                                     "sparsity_mask": sparsity_mask.tolist(),
                                     "bsmatmul_type": sparse_mm_type,
                                     "transpose_rhs": transpose_rhs,
                                     "memory_cycle_ratio": memory_cycle_ratio,
                                     "inner_group_size": inner_group_size,
                                     "in_type": g_input_data_type,
                                     "out_type": g_output_data_type,
                                     "pp_type": g_pp_data_type
                                 })[0]

    builder.addOutputTensor(outTensor)

    probs = builder.aiOnnx.softmax([outTensor], axis=1)

    if sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        labels_shape = lhs_dims[:-1]
    elif sparse_mm_type == g_sparseMatMulTypeLookup['DENSE_LHS_DENSE_RHS_SPARSE_OUT']:
        labels_shape = [np.sum(sparsity_mask)]

    label_tensorInfo = popart.TensorInfo("INT32", labels_shape)
    labelTensor = builder.addInputTensor(label_tensorInfo)

    loss = builder.aiGraphcore.nllloss([probs, labelTensor], debugPrefix="nllLossVal")

    proto = builder.getModelProto()
    #######################

    # Describe how to run the model
    anchor_desc = {
        outTensor: popart.AnchorReturnType("ALL"),
        loss: popart.AnchorReturnType("ALL")
    }

    dataFlow = popart.DataFlow(1, anchor_desc)

    label_data = g_random_labels.choice(9, labels_shape)

    session = popart.TrainingSession(fnModel=proto,
                                     loss=loss,
                                     deviceInfo=popart.DeviceManager().acquireAvailableDevice(1),
                                     optimizer=popart.ConstSGD(0.01),
                                     dataFlow=dataFlow)

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    # TRAINING
    session.weightsFromHost()

    stepio = popart.PyStepIO({
            lhsTensor: lhs,
            labelTensor: label_data}, anchors)

    session.run(stepio)
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    with builder.virtualGraph(0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_0_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(1 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_1_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(2 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_2_{i}")
            if i == 1:
                w0 = w
            x = builder.aiOnnx.matmul([x, w])
        label = builder.addInputTensor("INT32", [micro_batch_size])
        x = builder.aiGraphcore.nllloss([x, label])

    output = x

    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + x] = art
            anchor_map[popart.reservedRestoredPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    if doSharding is False:
        numIPUs = 1
    else:
        numIPUs = 3

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=output,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between examples.
        outer_dim *= batchesPerStep
        labelArray = np.repeat(labelArray[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide the batchesPerStep batches into
        # gradAcclFactor * batchesPerStep micro batches.
        outer_dim *= gradAcclFactor
        labelArray = labelArray.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labelArray.astype(np.int32)
        }, anchors)

    session.weightsFromHost()

    session.run(stepio)

    return anchors
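
This helper is presumably used to check that sharding and pipelining leave the anchored values unchanged; a sketch, assuming the module-level batch_size and hidden_size globals:

labels = np.random.randint(0, hidden_size, size=batch_size).astype(np.int32)
ref = get_model_anchors_model1(doSharding=False, doPipelining=False,
                               batchesPerStep=4, doTraining=True,
                               labelArray=labels)
sharded = get_model_anchors_model1(doSharding=True, doPipelining=True,
                                   batchesPerStep=4, doTraining=True,
                                   labelArray=labels)
for key in ref:
    assert np.allclose(ref[key], sharded[key])
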
def sparse_softmax(dims, block_size, sparsity_level, inner_group_size):
    """ """

    sparse_input, lengths_per_2d_plane, dense_input, sparsity_mask = create_sparse_matrix(dims, block_size, sparsity_level, -1000)

    # Create a builder and construct a graph
    builder = popart.Builder()

    tensor_info = popart.TensorInfo("FLOAT", sparse_input.shape)
    input_tensor = builder.addInputTensor(tensor_info)

    output_tensor = builder.customOp(opName="BsSoftmax",
                                     opVersion=1,
                                     domain="ai.graphcore",
                                     inputs=[input_tensor],
                                     attributes={
                                         "matrixDims": dims,
                                         "blockSize": block_size,
                                         "sparsity": sparsity_mask.tolist(),
                                         "groupSizes": lengths_per_2d_plane.tolist(),
                                         "innerGroupSize": inner_group_size,
                                         "subBlockMaskPerGroup": "None" * len(lengths_per_2d_plane)
                                     })[0]
    builder.addOutputTensor(output_tensor)

    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1, {output_tensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    sparse_input = np.array(sparse_input, dtype=g_input_data_type)
    stepio = popart.PyStepIO({input_tensor: sparse_input}, anchors)
    session.run(stepio)

    ipu_output = anchors[output_tensor]

    group_dims = dims[:-2]
    mat_dims = dims[-2:]
    blocks_2d = [mat_dims[0] // block_size[0], mat_dims[1] // block_size[1]]
    num_blocks_2d = blocks_2d[0] * blocks_2d[1]
    block_area = block_size[0] * block_size[1]

    total_group_dims = int(np.prod(group_dims))
    assert sparsity_mask.shape == (total_group_dims * num_blocks_2d,)

    cpu_output = softmax(dense_input)

    np.set_printoptions(precision=2)
    np.set_printoptions(suppress=True)

    cpu_output = cpu_output.reshape([total_group_dims, blocks_2d[0], block_size[0], blocks_2d[1], block_size[1]])
    cpu_output = np.transpose(cpu_output, [0, 1, 3, 2, 4])
    cpu_output = cpu_output.reshape(total_group_dims, num_blocks_2d, block_area)

    gold_output = []
    offset = 0
    for g in range(total_group_dims):
        cpu_output_2d = cpu_output[g]

        sliced_sparsity_mask = sparsity_mask[offset: offset + num_blocks_2d]
        offset = offset + num_blocks_2d
        for sparsity_mask_idx in range(num_blocks_2d):
            if sliced_sparsity_mask[sparsity_mask_idx]:
                gold_output.append(cpu_output_2d[sparsity_mask_idx])

    gold_output = np.array(gold_output)
    assert ipu_output.shape == gold_output.shape

    return ipu_output, gold_output
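
A hedged usage sketch for sparse_softmax; the block size must divide the matrix dimensions, and dims are illustrative:

# Two 16x16 matrices in the group dimension, 8x8 blocks, half the blocks kept.
ipu_out, gold_out = sparse_softmax(dims=[2, 16, 16],
                                   block_size=[8, 8],
                                   sparsity_level=0.5,
                                   inner_group_size=1)
np.testing.assert_allclose(ipu_out, gold_out, rtol=1e-5, atol=1e-5)
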
def bwd_graph(popart_model,
              torch_model,
              popart_loss_fn,
              torch_loss_fn,
              mapping=None,
              transform=None):
    np.random.seed(1984)
    random.seed(1984)
    torch.manual_seed(1984)

    #  ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices: np.random.randint(
            0, config.vocab_length, (config.batch_size * config.sequence_length)).astype(np.uint32),
        positions: np.random.randint(
            0, config.sequence_length, (config.batch_size * config.sequence_length)).astype(np.uint32),
        segments: np.random.randint(
            0, 2, (config.batch_size * config.sequence_length)).astype(np.uint32)
    }

    output = popart_model.build_graph(indices, positions, segments)
    proto = builder.getModelProto()

    losses = popart_loss_fn(output)

    optimizer = popart.ConstSGD(0.01)

    outputs, post_proto = run_py(
        proto, data, output, loss=losses, optimizer=optimizer,
        ipus=math.ceil(config.num_layers / config.layers_per_ipu) + popart_model.layer_offset)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids": data[indices].reshape(config.batch_size, config.sequence_length).astype(np.int32),
        "position_ids": data[positions].reshape(config.batch_size, config.sequence_length).astype(np.int32),
        "token_type_ids": data[segments].reshape(config.batch_size, config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)

    transform_weights = get_transform(config, init=transform)

    #  ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto,
                          torch_to_onnx, transform_weights)

    optim = torch.optim.SGD(torch_model.parameters(), 0.01,
                            weight_decay=0.0, momentum=0.0)

    torch_outputs = torch_model(
        **{k: torch.from_numpy(t).long() for k, t in inputs.items()})
    torch_loss = torch_loss_fn(torch_outputs)
    torch_loss.backward()
    optim.step()

    check_tensors([output.detach().numpy() for output in torch_outputs], outputs)

    check_model(torch_model, post_proto,
                torch_to_onnx, transform_weights,
                margin=6e-7)
Beispiel #22
0
    def test(opt0, opt1, e0, e1, e2):
        builder = popart.Builder()

        input0Shape = [stepSize, batchSize, sampleDim]
        input0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", input0Shape), "input0")

        w0data = np.array([100.0], dtype=np.float32)
        w0R = np.array([-777.0], dtype=np.float32)
        w0Id = builder.addInitializedInputTensor(w0data, w0name)

        w1data = np.array([200.0], dtype=np.float32)
        w1R = np.array([-777.0], dtype=np.float32)
        w1Id = builder.addInitializedInputTensor(w1data, w1name)

        w2data = np.array([300.0], dtype=np.float32)
        w2R = np.array([-777.0], dtype=np.float32)
        w2Id = builder.addInitializedInputTensor(w2data, w2name)

        add0 = builder.aiOnnx.add([w0Id, input0])
        add1 = builder.aiOnnx.add([w1Id, add0])
        add2 = builder.aiOnnx.add([w2Id, add1])

        l1 = builder.aiGraphcore.l1loss([add2], 1.0)

        proto = builder.getModelProto()

        dataFlow = popart.DataFlow(1, {})

        opts = popart.SessionOptions()
        opts.reportOptions = {"showExecutionSteps": "true"}
        opts.enableGroupedMatmuls = False

        pat = popart.Patterns(popart.PatternsLevel.Default)

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=dataFlow,
            userOptions=opts,
            loss=l1,
            optimizer=opt0,
            patterns=pat,
            deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        input0Data = np.array([3.1415], dtype=np.float32)

        stepio = popart.PyStepIO({input0: input0Data}, anchors)

        session.run(stepio)

        session.updateOptimizerFromHost(opt1)

        session.run(stepio)

        session.weightsToHost()

        weightsRead = popart.PyWeightsIO({w0Id: w0R, w1Id: w1R, w2Id: w2R})

        session.readWeights(weightsRead)

        assert (np.isclose(e0['initalValue'], w0R))
        assert (np.isclose(e1['initalValue'], w1R))
        assert (np.isclose(e2['initalValue'], w2R))
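
The arguments opt0/opt1 are PopART optimizers, and e0..e2 carry the expected final weight values under the key spelling this test uses. A hypothetical invocation (numbers invented for illustration):

opt0 = popart.SGD({"defaultLearningRate": (0.1, False)})
opt1 = popart.SGD({"defaultLearningRate": (0.05, False)})

# Expected values would be derived from the optimizer schedule; these are
# placeholders only.
e0 = {"initalValue": 99.8}
e1 = {"initalValue": 199.8}
e2 = {"initalValue": 299.8}

test(opt0, opt1, e0, e1, e2)
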
    def run_test(enablePipelining):
        popart.getLogger().setLevel("TRACE")

        builder = popart.Builder()

        i1 = builder.addInputTensor(
            popart.TensorInfo("FLOAT", input_data.shape[1::]))
        w0 = builder.addInitializedInputTensor(weight_data_0)
        w1 = builder.addInitializedInputTensor(weight_data_1)
        w2 = builder.addInitializedInputTensor(weight_data_2)

        o0 = builder.aiOnnx.matmul([i1, w0])
        if enablePipelining:
            builder.virtualGraph(o0, 0)

        o1 = builder.aiOnnx.matmul([o0, w1])
        if enablePipelining:
            builder.virtualGraph(o1, 1)

        o2 = builder.aiOnnx.matmul([o1, w2])
        if enablePipelining:
            builder.virtualGraph(o2, 2)

        o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)
        if enablePipelining:
            builder.virtualGraph(o2l1, 2)

        proto = builder.getModelProto()

        anchorId = popart.reservedDefaultScaledLearningRate0Prefix() + "FLOAT"

        # Need to anchor the output of the backward pass to stop it being pruned
        dataFlow = popart.DataFlow(bps, [anchorId])

        optimizer = popart.SGD({"defaultLearningRate": (1.0, False)})

        opts = popart.SessionOptions()
        if enablePipelining:
            opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        opts.enablePipelining = enablePipelining

        numIPUs = 1
        if enablePipelining:
            numIPUs = 3

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=dataFlow,
            loss=o2l1,
            optimizer=optimizer,
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

        session.prepareDevice()

        anchors = session.initAnchorArrays()

        inputs = {i1: input_data}
        stepio = popart.PyStepIO(inputs, anchors)

        session.weightsFromHost()

        # run 2 steps, changing the optimizer halfway through
        result = []
        session.run(stepio)
        result.append(np.copy(anchors[anchorId]))

        session.updateOptimizerFromHost(
            popart.SGD({"defaultLearningRate": (0.5, False)}))

        session.run(stepio)
        result.append(np.copy(anchors[anchorId]))

        return result
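
The anchored scaled learning rate should not depend on pipelining, so the two configurations are presumably compared along these lines (a sketch):

ref = run_test(enablePipelining=False)
pipelined = run_test(enablePipelining=True)
for r, p in zip(ref, pipelined):
    assert np.allclose(r, p)
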
Beispiel #24
0
def runTest(forceAddOutOfPlace, pipelineRecomputation):
    """
    Test of pipelining with dropout, recomputation, graph replication, 
    gradient accumulation
    """
    #Has dependencies on T12562. T12976, T13098 for full support

    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # L1 loss value
    lambda1 = 1.0

    # optimizer parameters
    defaultLearningRate0 = 0.001
    defaultMomentum0 = 0.01
    defaultDampening0 = 0.5
    lossScaling0 = 10.0
    defaultVelocityScaling0 = 0.15
    defaultWeightDecay0 = 0.01

    # tensor dimensions and replications
    height = 6
    batchesPerStep = 5
    sampleShape = [height, height]
    accumulationFactor = 4
    samplesPerBatch = 48
    divvyFactor = replicationFactor * accumulationFactor
    if (samplesPerBatch % divvyFactor != 0):
        raise RuntimeError("Invalid divvy factor")
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    stepDataInfo = popart.TensorInfo("FLOAT", stepDataShape)
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    w2vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = np.array(npr.randn(*stepDataShape), dtype=np.float32)

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)
    w2 = builder.addInitializedInputTensor(w2vals)

    scaleFactor = 1. / np.sqrt(height + 0.)

    # Model:
    #
    # input  w0                            w1
    #     \  |                            /
    #     matmul - scale -> dropout -> matmul
    #         \                        |
    #         |                       scale
    #         |                        |
    #         |                      dropout
    #         |                        /\
    #       add  -------<---<----<----   \
    #        |                            |
    #     dropout                     scale by 2
    #        |                            |
    #  = = = | = = = = = IPU barrier = = =|= = = = = =
    #        |                            |
    #        |   w2                       |
    #        |  /                         |
    #       matmul                       /
    #        |                          /
    #      scale                       /
    #        |                        /
    #      dropout                   /
    #        |                       |
    #        ------->---->---->---> add -> L1 loss (lambda 2)

    with builder.virtualGraph(0):
        mm0 = builder.aiOnnx.matmul([input0, w0])
        scale0 = builder.aiGraphcore.scale([mm0], scaleFactor)
        ratio0 = 0.35
        [dropout0, mask0] = builder.aiOnnx.dropout([scale0],
                                                   num_outputs=2,
                                                   ratio=ratio0)
        mm1 = builder.aiOnnx.matmul([dropout0, w1])
        scale1 = builder.aiGraphcore.scale([mm1], scaleFactor)
        ratio1 = 0.5
        [dropout1, mask1] = builder.aiOnnx.dropout([scale1],
                                                   num_outputs=2,
                                                   ratio=ratio1)
        dropout1 = builder.aiGraphcore.scale([dropout1], 2.0)
        skipOut = builder.aiOnnx.add([mm0, dropout1])
        # See resolved task T13137
        if forceAddOutOfPlace:
            builder.setInplacePreferences(skipOut, {"AddRhsInplace": -1.0})

        ratioSkip = 0.6
        [dropoutSkip, maskSkip] = builder.aiOnnx.dropout([skipOut],
                                                         num_outputs=2,
                                                         ratio=ratioSkip)

        # see T13142: we do this so that the recomputation does not modify the anchors
        mask0 = builder.aiOnnx.identity([mask0])
        mask1 = builder.aiOnnx.identity([mask1])
        maskSkip = builder.aiOnnx.identity([maskSkip])

    with builder.virtualGraph(1):
        mm2 = builder.aiOnnx.matmul([dropoutSkip, w2])
        scale2 = builder.aiGraphcore.scale([mm2], scaleFactor)
        ratio2 = 0.7
        [dropout2, mask2] = builder.aiOnnx.dropout([scale2],
                                                   num_outputs=2,
                                                   ratio=ratio2)

        out = builder.aiOnnx.add([dropout2, dropout1])
        l1 = builder.aiGraphcore.l1loss([out],
                                        lambda1,
                                        reduction=popart.ReductionType.Sum)

        # see T13142: we do this so that the recomputation does not modify the anchors
        mask2 = builder.aiOnnx.identity([mask2])

    anchors = {
        mask0: popart.AnchorReturnType("All"),
        mask1: popart.AnchorReturnType("All"),
        mask2: popart.AnchorReturnType("All"),
        maskSkip: popart.AnchorReturnType("All"),
    }

    dataFlow = popart.DataFlow(batchesPerStep, anchors)

    device = tu.create_test_device(numIpus=nIPUs)
    assert device

    userOptions = popart.SessionOptions()
    # This requires T12562 to be solved before enabling (TODO)
    userOptions.enableOutlining = False
    userOptions.enablePipelining = True
    userOptions.enableGradientAccumulation = True
    userOptions.accumulationFactor = accumulationFactor

    if pipelineRecomputation:
        userOptions.autoRecomputation = popart.RecomputationType.Pipeline

    if (replicationFactor > 1):
        userOptions.enableReplicatedGraphs = True
        userOptions.replicatedGraphCount = replicationFactor
    userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual

    # TODO https://phabricator.sourcevertex.net/T14035
    userOptions.enablePrefetchDatastreams = False
    #  passes:
    userOptions.engineOptions = {"exchange.streamBufferOverlap": "any"}
    #  fails:
    #  userOptions.engineOptions = {"exchange.streamBufferOverlap" : "hostRearrangeOnly"}

    patterns = popart.Patterns()
    patterns.InPlace = True

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        optimizer=popart.SGD({
            "defaultLearningRate": (defaultLearningRate0, False),
            "defaultMomentum": (defaultMomentum0, False),
            "defaultDampening": (defaultDampening0, False),
            "defaultVelocityScaling": (defaultVelocityScaling0, False),
            "lossScaling": (lossScaling0, True),
            "defaultWeightDecay": (defaultWeightDecay0, True)
        }),
        loss=l1,
        patterns=patterns,
        userOptions=userOptions,
        deviceInfo=device)

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.setRandomSeed(7)
    session.weightsFromHost()

    stepio = popart.PyStepIO({input0: inputVals}, anchorArrays)
    session.run(stepio)
    session.weightsToHost()
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w2R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R, w2: w2R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # merge replication, accumulation
            flattenedShape = [anchorArrays[mask0].shape[0], -1, height, height]
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.mask0 = torch.from_numpy(
                anchorArrays[mask0].reshape(flattenedShape))

            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))
            self.mask1 = torch.from_numpy(
                anchorArrays[mask1].reshape(flattenedShape))

            self.maskSkip = torch.from_numpy(
                anchorArrays[maskSkip].reshape(flattenedShape))

            self.w2 = torch.nn.Parameter(torch.from_numpy(w2vals.copy()))
            self.mask2 = torch.from_numpy(
                anchorArrays[mask2].reshape(flattenedShape))

        def forward(self, x, i):
            mm0 = torch.matmul(x, self.w0)
            dr0 = mm0 * scaleFactor * self.mask0[i].type(
                torch.FloatTensor) / (1 - ratio0)

            mm1 = torch.matmul(dr0, self.w1)
            dr1 = mm1 * scaleFactor * self.mask1[i].type(
                torch.FloatTensor) / (1 - ratio1)
            dr1 = 2 * dr1

            drSkip = (dr1 + mm0) * self.maskSkip[i].type(
                torch.FloatTensor) / (1 - ratioSkip)

            mm2 = torch.matmul(drSkip, self.w2)
            dr2 = mm2 * scaleFactor * self.mask2[i].type(
                torch.FloatTensor) / (1 - ratio2)

            out = dr1 + dr2
            return out

    net = Net()

    optimizer = optim.SGD(net.parameters(),
                          lr=defaultLearningRate0,
                          momentum=defaultMomentum0,
                          dampening=defaultDampening0,
                          weight_decay=defaultWeightDecay0)

    # caveat : alternative work-around for TODO T13098
    for group in optimizer.param_groups:
        for p in group['params']:
            param_state = optimizer.state[p]
            param_state['momentum_buffer'] = p.data * 0

    for i in range(batchesPerStep):
        out = net(torch.from_numpy(inputVals[i]), i)
        loss = lambda1 * torch.sum(torch.abs(out))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    delta2 = np.sum(np.abs(net.w2.detach().numpy() - w2vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)
    print("Total moved by w2: ", delta2)

    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    error2 = np.sum(np.abs(w2R - net.w2.detach().numpy())) / delta2
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("Total moved by w2: ", np.sum(np.abs(w2R - w2vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    print("l1 error for w2: ", error2)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
    assert (error2 < 1e-5)
Beispiel #25
0
# In this example two tensors are anchored:
# the l1 loss "out",
# and the input tensor "image0"
anchors = {
    "out": popart.AnchorReturnType("EveryN", 2),
    "image0": popart.AnchorReturnType("All")
}

dataFlow = popart.DataFlow(batchesPerStep, anchors)

# PopART is non-dynamic. All input Tensor shapes and
# types must be fed into the Session constructor.
# In this example there is 1 streamed input, image0.
inputShapeInfo = popart.InputShapeInfo()
inputShapeInfo.add("image0",
                   popart.TensorInfo("FLOAT", [batchSize, nChans, 32, 32]))

inNames = ["image0"]

# outNames: not the same as anchors;
# these are the outputs of the onnx
# model. In training this is the
# scalar loss on which 'backward'
# is called
outNames = ["out"]

# cifar training data loader: at index 0: image, at index 1: label.
cifarInIndices = {"image0": 0}


class Module0(torch.nn.Module):
def fwd_graph(popart_model, torch_model, mode, mapping=None, transform=None, replication_factor=1, replicated_tensor_sharding = False):
    #  ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices: np.random.randint(
            0, config.vocab_length, (replication_factor, config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        positions: np.random.randint(
            0, config.sequence_length, (replication_factor, config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        segments: np.random.randint(
            0, 2, (replication_factor, config.micro_batch_size * config.sequence_length)).astype(np.uint32)
    }


    user_options = {}
    if mode == ExecutionMode.PHASED:
        user_options = {
            "batchSerializationFactor": 1,
            "executionPhases": popart_model.total_execution_phases
        }
        output = popart_model(indices, positions, segments)
        ipus = 2
    else:
        output = popart_model.build_graph(indices, positions, segments)
        ipus = popart_model.total_ipus

    proto = builder.getModelProto()

    outputs, _ = run_py(proto,
                        data,
                        output,
                        user_options=user_options,
                        execution_mode=mode,
                        replication_factor=replication_factor,
                        replicated_tensor_sharding=replicated_tensor_sharding,
                        ipus=ipus)


    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids": data[indices].reshape(replication_factor * config.micro_batch_size,
                                           config.sequence_length).astype(np.int32),
        "position_ids": data[positions].reshape(replication_factor * config.micro_batch_size,
                                                config.sequence_length).astype(np.int32),
        "token_type_ids": data[segments].reshape(replication_factor * config.micro_batch_size,
                                                 config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)

    transform_weights = get_transform(config, init=transform)

    #  ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()
    copy_weights_to_torch(torch_model, proto,
                          torch_to_onnx, transform_weights)

    torch_outputs = run_fwd_model(inputs, torch_model)

    check_tensors(torch_outputs, outputs)
Beispiel #27
0
def test_train(tmpdir, capfd):
    filt_data = np.array([1., 2., 1., 2.], dtype=np.float32)
    filt_data = np.reshape(filt_data, [1, 1, 2, 2])
    input_data = np.array([1., 2., 3., 4.], dtype=np.float32)
    input_data = np.reshape(input_data, [1, 1, 2, 2])

    builder = popart.Builder()

    shape = popart.TensorInfo("FLOAT", input_data.shape)
    i1 = builder.addInputTensor(shape, "data")

    i2 = builder.addInitializedInputTensor(filt_data, "filter")

    # both i2 and d__i2 will be printed
    p1 = builder.aiGraphcore.printtensor([i2])

    c1 = builder.aiOnnx.conv([i1, p1],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[2, 2])

    # c1 will be printed, but d__c1 will not
    o = builder.aiGraphcore.printtensor([c1], print_gradient=0)
    l1 = builder.aiGraphcore.l1loss([o],
                                    0.1,
                                    reduction=popart.ReductionType.Sum)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enableOutliningCopyCostPruning = False

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     optimizer=popart.ConstSGD(0.1),
                                     loss=l1,
                                     deviceInfo=tu.create_test_device())

    session.prepareDevice()

    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    inputs = {i1: input_data}
    stepio = popart.PyStepIO(inputs, anchors)

    capfd.readouterr()

    session.run(stepio)

    captured = capfd.readouterr()
    output = captured.err

    # Remove ESC characters
    output = re.sub(chr(27), '', output)

    # Remove termcolor sequences
    output = re.sub(r'\[\d\dm', '', output)

    # Remove popart log lines
    output = re.sub(r'\[\d\d\d\d-\d\d-\d\d .*?\n', '', output)

    # Remove all whitespace
    output = re.sub(r'\s+', '', output)

    pattern = 'name:{{{{float,float},{float,float}}}}'
    pattern = re.sub('name', r'[\\w:]+', pattern)
    pattern = re.sub('float', r'\\d(?:\\.\\d+)?', pattern)

    matches = re.findall(pattern, output)

    d__i2 = popart.reservedGradientPrefix() + i2

    assert len(matches) == 3
    assert matches[0] == i2 + ":{{{{1,2},{1,2}}}}"
    assert matches[1] == c1 + ":{{{{2,2},{6,4}}}}"
    assert matches[2] == d__i2 + ":{{{{0.4,0.3},{0.2,0.1}}}}"
def bwd_graph(popart_model,
              torch_model,
              mode,
              popart_loss_fn,
              torch_loss_fn,
              mapping=None,
              transform=None,
              replication_factor=1,
              replicated_tensor_sharding=False,
              opt_type="SGD"):
    np.random.seed(1984)
    random.seed(1984)
    torch.manual_seed(1984)

    #  ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices: np.random.randint(
            0, config.vocab_length, (replication_factor, config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        positions: np.random.randint(
            0, config.sequence_length, (replication_factor, config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        segments: np.random.randint(
            0, 2, (replication_factor, config.micro_batch_size * config.sequence_length)).astype(np.uint32)
    }
    num_reps = 5
    user_options = {}
    if mode == ExecutionMode.PHASED:
        user_options = {
            "batchSerializationFactor": 1,
            "executionPhases": popart_model.total_execution_phases
        }
        output = popart_model(indices, positions, segments)
        ipus = 2
    else:
        output = popart_model.build_graph(indices, positions, segments)
        ipus = popart_model.total_ipus

    loss = popart_loss_fn(output)

    proto = builder.getModelProto()

    if opt_type == "SGD":
        optimizer = popart.ConstSGD(1e-3)
    elif opt_type == "LAMB":
        optMap = {
            "defaultLearningRate": (1e-3, True),
            "defaultBeta1": (0.9, True),
            "defaultBeta2": (0.999, True),
            "defaultWeightDecay": (0.0, True),
            "maxWeightNorm": (10.0, True),
            "defaultEps": (1e-8, True),
            "lossScaling": (1.0, True),
        }
        optimizer = popart.Adam(optMap,
                                mode=popart.AdamMode.Lamb)
    elif opt_type == "LAMB_NO_BIAS":
        optMap = {
            "defaultLearningRate": (1, False),
            "defaultBeta1": (0, False),
            "defaultBeta2": (0, False),
            "defaultWeightDecay": (0.0, False),
            "defaultEps": (1e-8, False),
            "lossScaling": (1.0, False),
        }
        optimizer = popart.Adam(optMap,
                                mode=popart.AdamMode.LambNoBias)
    else:
        raise ValueError(f"Unknown opt_type={opt_type}")

    patterns = popart.Patterns()

    if mode == ExecutionMode.PHASED:
        patterns.enablePattern("TiedGatherPattern", False)
        patterns.enablePattern("SparseAccumulatePattern", False)

    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 loss=loss,
                                 optimizer=optimizer,
                                 user_options=user_options,
                                 execution_mode=mode,
                                 patterns=patterns,
                                 replication_factor=replication_factor,
                                 replicated_tensor_sharding=replicated_tensor_sharding,
                                 ipus=ipus,
                                 num_reps=num_reps)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids": data[indices].reshape(replication_factor * config.micro_batch_size, config.sequence_length).astype(np.int32),
        "position_ids": data[positions].reshape(replication_factor * config.micro_batch_size, config.sequence_length).astype(np.int32),
        "token_type_ids": data[segments].reshape(replication_factor * config.micro_batch_size, config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)

    transform_weights = get_transform(config, init=transform)

    #  ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto,
                          torch_to_onnx, transform_weights)

    if opt_type == "SGD":
        optim = torch.optim.SGD(torch_model.parameters(), 1e-3,
                                weight_decay=0.0, momentum=0.0)
    elif opt_type == "LAMB":
        optim = torch_lamb.Lamb(torch_model.parameters(),
                                lr=1e-3, weight_decay=0.0, biasCorrection=True)

    for _ in range(num_reps):
        torch_outputs = torch_model(
            **{k: torch.from_numpy(t).long() for k, t in inputs.items()})
        torch_loss = torch_loss_fn(torch_outputs)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    check_tensors([output.detach().numpy()
                   for output in torch_outputs], outputs, margin=1.5e-06)

    check_model(torch_model, post_proto,
                torch_to_onnx, transform_weights,
                margin=5e-5)
Beispiel #29
0
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      doProfiling=False,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 2
    shape_d0 = [batchSize, 2, 4, 4]
    shape_l0 = [batchSize]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [batchSize, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        shape_l0.insert(0, batchesPerStep)
    data = np.random.uniform(low=-10.0, high=10.0,
                             size=shape_d0).astype(np.float32)
    classes = np.prod(shape_d0) / (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
Beispiel #30
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import argparse
import ctypes
import os

import numpy as np
import popart


# Define a function to build and run the erf graph with
# specified input tensor data
def build_and_run_graph(input_data, run_on_ipu):
    builder = popart.Builder()
    input_len = len(input_data)

    input_tensor = builder.addInputTensor(popart.TensorInfo("FLOAT", [input_len]))
    print("Shape of {}: {}".format(input_tensor,
                                   builder.getTensorShape(input_tensor)))

    output_tensor = builder.customOp(opName="Erf",
                                     opVersion=1,
                                     domain="ai.graphcore",
                                     inputs=[input_tensor],
                                     attributes={})[0]

    print("Inputs: {}".format(builder.getInputTensorIds()))
    print("Outputs: {}".format(builder.getOutputTensorIds()))
    print("Values: {}".format(builder.getValueTensorIds()))
    print("Shape of {}: {}".format(output_tensor,
                                   builder.getTensorShape(output_tensor)))

    builder.addOutputTensor(output_tensor)

    proto = builder.getModelProto()

    anchors = {output_tensor: popart.AnchorReturnType("FINAL")}
    dataFlow = popart.DataFlow(1, anchors)

    if run_on_ipu:
        device = popart.DeviceManager().acquireAvailableDevice(1)
        print("IPU hardware device acquired")
    else:
        device = popart.DeviceManager().createIpuModelDevice({})
        print("Running on IPU Model")

    session = popart.InferenceSession(proto, dataFlow, device)
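
The listing truncates here. A hedged sketch of the likely remainder of build_and_run_graph, following the usual PopART inference pattern seen elsewhere in this file (compile, bind buffers, run, read the anchor):

    # Assumed continuation, not from the original listing.
    session.prepareDevice()
    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(
        {input_tensor: np.array(input_data, dtype=np.float32)}, anchors)
    session.run(stepio)

    return anchors[output_tensor]
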