def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):
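    """Build a sin -> exp -> conv -> softmax model with an NLL loss,
    optionally sharded over 3 IPUs and/or pipelined, run it for several
    steps, and return the anchored tensors."""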

    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    # Allocate host buffers for the anchored tensors and compile the graph.
    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Upper bound on label values: valid class indices for the softmax output.
    classes = np.prod(shape_d0) // (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of size batchesPerStep. The labels are
        # repeated so that values stay consistent even when examples use
        # differently shaped inputs.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split each step's batch into gradAcclFactor micro-batches, giving
        # gradAcclFactor * batchesPerStep slices along the outer dimension.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Prepend the combined outer dimension to the input shape.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    for i in range(6):
        session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
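

# --- Illustrative usage (not from the original file) -------------------------
# The helper above is typically used to compare anchors between runs, e.g. a
# plain single-IPU run against a sharded, pipelined run. A hypothetical sketch
# of such a check:
def _check_pipelined_matches_single_ipu():
    ref = get_model_anchors_model2(doSharding=False, doPipelining=False,
                                   batchesPerStep=5, doTraining=True)
    pip = get_model_anchors_model2(doSharding=True, doPipelining=True,
                                   batchesPerStep=5, doTraining=True)
    for name in ref:
        assert np.allclose(ref[name], pip[name])
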
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
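    """Build a stack of six matmuls with an NLL loss, optionally sharded
    over 3 IPUs and/or pipelined, run a single step with labels taken from
    labelArray, and return the anchored tensors."""
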
    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    with builder.virtualGraph(0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_0_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(1 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_1_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(2 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_2_{i}")
            if i == 1:
                w0 = w
            x = builder.aiOnnx.matmul([x, w])
        label = builder.addInputTensor(
            popart.TensorInfo("INT32", [micro_batch_size]))
        x = builder.aiGraphcore.nllloss([x, label])

    output = x

    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + x] = art
            anchor_map[popart.reservedRestoredPrefix() + w0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    if doSharding is False:
        numIPUs = 1
    else:
        numIPUs = 3

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=output,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of size batchesPerStep. The labels are
        # repeated so that values stay consistent even when examples use
        # differently shaped inputs.
        outer_dim *= batchesPerStep
        labelArray = np.repeat(labelArray[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split each step's batch into gradAcclFactor micro-batches, giving
        # gradAcclFactor * batchesPerStep slices along the outer dimension.
        outer_dim *= gradAcclFactor
        labelArray = labelArray.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Prepend the combined outer dimension to the input shape.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labelArray.astype(np.int32)
        }, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    return anchors
def test_gradient_accumulation_anchors(tmpdir):
    """
    Check that the accumulated gradients with gradient accumulation match
    the gradients without gradient accumulation enabled.
    """

    label_array = np.random.randint(0, hidden_size, batch_size)

    #TODO T11866 larger batches-per-step, first without weight decay, then with weight decay
    batches_per_step = 1

    accl_initial_proto, accl_proto_filename, accl_anchor_arrays = run_mm_graph(
        sgd_optimizer,
        label_array=label_array,
        accum_factor=4,
        enable_accum=True,
        batches_per_step=batches_per_step,
        number_of_steps=1,
        final_proto_filename=os.path.join(tmpdir,
                                          "accl5batches3stepsAnchorsTest"),
        enable_multi_ipu=False,
        full_anchorage=True,
        inference_mode=False)

    no_accl_initial_proto, no_accl_proto_filename, no_accl_anchor_arrays = run_mm_graph(
        sgd_optimizer,
        label_array=label_array,
        accum_factor=1,
        enable_accum=False,
        batches_per_step=batches_per_step,
        number_of_steps=1,
        final_proto_filename=os.path.join(tmpdir,
                                          "noAccl5batches3stepsAnchorsTest"),
        enable_multi_ipu=False,
        full_anchorage=True,
        inference_mode=False)

    w0_tensor = onnx.load_from_string(accl_initial_proto).graph.initializer[0]
    w0_name = w0_tensor.name

    full_batch_grad = no_accl_anchor_arrays[popart.reservedGradientPrefix() +
                                            w0_name]
    accl_grad = accl_anchor_arrays[popart.reservedAcclToUpdatePrefix() +
                                   w0_name]

    print("full batch grad shape is ")
    print(full_batch_grad.shape)

    print("accl grad shape is ")
    print(accl_grad.shape)

    if batches_per_step > 1:
        #TODO T11866
        raise RuntimeError("batches per step > 1 needs investigation")
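        # NOTE: the loop below is unreachable until T11866 is resolved; it
        # sketches the intended per-batch comparison.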

        for i in range(batches_per_step):
            print("\nbatch %d" % (i, ))
            print("Absolute accl grad  %.3f" % (np.sum(np.abs(accl_grad[i]))))
            print("Absolute no accl g  %.3f" %
                  (np.sum(np.abs(full_batch_grad[i]))))
            print("Absolute difference %.3f" %
                  (np.sum(np.abs(full_batch_grad[i] - accl_grad[i]))))

            print("Absolute difference %.3f" %
                  (np.sum(np.abs(full_batch_grad[i] - adjusted_accl_grad[i]))))

    else:
        accl_grad_abs_sum = np.sum(np.abs(accl_grad))
        print("Absolute accl grad  %.3f" % (accl_grad_abs_sum))

        # Initialised as per the update equations. When velocity scaling != 1
        # this may need changing (T12001).
        adjusted_accl_grad = accl_grad[-1].flatten().copy()
        for i, v in enumerate(w0_tensor.float_data):
            adjusted_accl_grad[i] -= wd * v

        adjusted_accl_grad_abs_sum = np.sum(np.abs(adjusted_accl_grad))
        print("Absolute accl grad, adjusted for weight decay %.3f" %
              (adjusted_accl_grad_abs_sum))

        full_batch_abs_sum = np.sum(np.abs(full_batch_grad))
        print("Absolute no accl g  %.3f" % (full_batch_abs_sum))

        abs_diff = np.sum(
            np.abs(full_batch_grad.flatten() - adjusted_accl_grad))
        print("Absolute difference %.3f" % (abs_diff))

        assert (abs_diff / (full_batch_abs_sum + accl_grad_abs_sum) < 1e-5)
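

# --- Illustrative sketch (not part of the original tests) --------------------
# Why the check above should hold: for a loss that sums per-sample terms, the
# full-batch gradient equals the sum of the micro-batch gradients. A minimal
# numpy demonstration for a linear model with an L2 loss:
def _demo_grad_accumulation_equivalence():
    rng = np.random.RandomState(0)
    w = rng.rand(4)
    x = rng.rand(8, 4)
    t = rng.rand(8)

    def grad(xb, tb):
        # d/dw of 0.5 * ||xb @ w - tb||^2
        return xb.T @ (xb @ w - tb)

    full_batch = grad(x, t)
    accumulated = sum(grad(x[i:i + 2], t[i:i + 2]) for i in range(0, 8, 2))
    assert np.allclose(full_batch, accumulated)
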
def test_accumulators_names_dont_clash():
    np.random.seed(1984)

    builder = popart.Builder()

    input_data = np.random.rand(4, 4).astype(np.float32)
    weights = ['weight1', 'weight2', 'weight3']

    d0 = builder.addInputTensor(popart.TensorInfo('FLOAT', [4, 4]), 'data0')
    x = builder.aiOnnx.add([
        d0,
        builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weights[0])
    ])
    x = builder.aiOnnx.add([
        x,
        builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weights[1])
    ])
    x = builder.aiOnnx.add([
        x,
        builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weights[2])
    ])

    l1 = builder.aiGraphcore.l1loss([x], 1.0)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {})

    opt = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.9, True),
        "defaultDampening": (0, True)
    })
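    # With momentum enabled, PopART creates accumulator tensors for each
    # weight; the assertions below check that their names are the reserved
    # prefixes prepended to the weight names.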

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        loss=l1,
        optimizer=opt,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    ops = ir["maingraph"]

    tensors = set()
    for op in ops:
        for i in op["inputs"]:
            tensors.add(i["name"])
        for o in op["outputs"]:
            tensors.add(o["name"])

    prefixes = [
        popart.reservedAcclPrefix(),
        popart.reservedAcclToUpdatePrefix(),
        popart.reservedAcclFinalOutPrefix()
    ]
    for prefix, weight in itertools.product(prefixes, weights):
        assert prefix + weight in tensors
def run_graph(optimizer, input_shape, initial_onnx_model, input_tensor_name,
              output_tensor_name, label_tensor_name, label_array, accum_factor,
              enable_accum, batches_per_step, number_of_steps,
              final_proto_filename, enable_multi_ipu, full_anchorage,
              inference_mode):
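    """Build a TrainingSession from initial_onnx_model, run number_of_steps
    steps with the given optimizer (optionally with gradient accumulation
    and/or multiple IPUs), save the final model, and return the proto
    filename together with the anchor arrays."""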

    art = popart.AnchorReturnType("All")
    anchorNames = {output_tensor_name: art}

    if full_anchorage:
        w0 = onnx.load_from_string(
            initial_onnx_model).graph.initializer[0].name

        anchorNames[popart.reservedGradientPrefix() + w0] = art

        if enable_accum:
            anchorNames[popart.reservedAcclPrefix() + w0] = art
            anchorNames[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = enable_accum
    opts.accumulationFactor = accum_factor
    opts.enableOutlining = False

    if enable_multi_ipu:
        device = tu.create_test_device(numIpus=num_ipus,
                                       tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    else:
        device = tu.create_test_device(tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Off

    # For test purposes only: inference with gradient accumulation is not
    # supported, so constructing this session is expected to fail.
    if inference_mode:
        popart.InferenceSession(fnModel=initial_onnx_model,
                                dataFlow=popart.DataFlow(
                                    batches_per_step, anchorNames),
                                userOptions=opts,
                                deviceInfo=device)

    session = popart.TrainingSession(fnModel=initial_onnx_model,
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, anchorNames),
                                     deviceInfo=device,
                                     loss=output_tensor_name,
                                     optimizer=optimizer,
                                     userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()

    anchor_arrays = session.initAnchorArrays()

    outer_dim = 1
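    # As in the model helpers above: repeat labels over batches_per_step and
    # split them per accumulation factor to build the host-side outer dim.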
    if batches_per_step > 1:
        outer_dim *= batches_per_step
        label_array = np.repeat(label_array[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        outer_dim *= accum_factor
        label_array = label_array.reshape(
            [accum_factor * batches_per_step, -1])
    if outer_dim > 1:
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_tensor_name:
            (1.0 - xi * npr.rand(*input_shape)).astype(np.float32),
            label_tensor_name:
            label_array.astype(np.int32)
        }, anchor_arrays)

    for i in range(number_of_steps):
        session.run(stepio)

    # Write the trained model to disk; callers reload it via the returned name.
    session.modelToHost(final_proto_filename)

    return final_proto_filename, anchor_arrays