def check_models(model_init, modelA_fn, modelB_fn):
    """
    Compare the weights of two trained models against each other.

    for each weight tensor, check the relative error. That is,
    | model_accl - model_no_accl |_1 / | model_accl - model_initial|_1

    Initializers of model A whose name contains a popart reserved
    accl / accl1 / accl2 / step / accum prefix are skipped. The matching
    initializers of model B and of the initial model are looked up by the
    position of the initializer in model A, and only the values stored in
    ``float_data`` are compared.

    Args:
        model_init: serialized initial model (given to onnx.load_from_string).
        modelA_fn: file name of model A (given to onnx.load).
        modelB_fn: file name of model B (given to onnx.load).

    Raises:
        AssertionError: if a weight of A or B moved by no more than 1e-3
            (L1 norm) from its initial value, or if the relative error of a
            weight is not below 1e-5.
    """
    modelA = onnx.load(modelA_fn)
    modelB = onnx.load(modelB_fn)

    # the initial model
    modelC = onnx.load_from_string(model_init)

    # We need to avoid the gradient accl initializers as these won't be present
    # in the non grad accl models.
    reserved_prefixes = (
        popart.reservedAcclPrefix(),
        popart.reservedAccl1Prefix(),
        popart.reservedAccl2Prefix(),
        popart.reservedStepPrefix(),
        popart.reservedAccumPrefix(),
    )

    for w_i, weightA in enumerate(modelA.graph.initializer):
        if any(prefix in weightA.name for prefix in reserved_prefixes):
            continue

        valuesB = modelB.graph.initializer[w_i].float_data
        valuesC = modelC.graph.initializer[w_i].float_data

        # where A, B, C are weight tensors,
        # |A - B|_1
        l1AB = 0
        # |B - C|_1
        l1BC = 0
        # |A - C|_1
        l1AC = 0
        for d_i, dataA in enumerate(weightA.float_data):
            dataB = valuesB[d_i]
            dataC = valuesC[d_i]

            # abs diff of 2 floats
            l1AB += np.abs(dataA - dataB)
            l1BC += np.abs(dataB - dataC)
            l1AC += np.abs(dataA - dataC)

        # Guard the division: when A has not moved at all the denominator is
        # zero, and the "moved enough" assertion below is the error we want
        # to surface rather than a ZeroDivisionError / nan.
        relative_error = l1AB / l1AC if l1AC > 0 else float("inf")
        print(
            f"{weightA.name}: l1AB = {l1AB:.2e},  l1AC = {l1AC:.2e}, "
            f"l1BC = {l1BC:.2e}, relative error = {relative_error:.2e}")

        # check that the weights have moved enough for this to be a valid
        # comparison
        assert l1AC > 1e-3, "change since start of A = %.5f" % (l1AC, )
        assert l1BC > 1e-3, "change since start of B = %.5f" % (l1BC, )

        # relative error assertion
        assert 1e-5 > relative_error, "Relative error {}".format(
            relative_error)
# Example #2
# 0
def load_initializers_from_onnx(model_path, load_optimizer=False):
    """Read the initializers of an onnx checkpoint into numpy arrays.

    Args:
        model_path (str): Path to onnx file.
        load_optimizer (bool): When False, initializers whose name contains
            one of the popart reserved accl1 / accl2 / accl / accum / step
            prefixes are left out of the result.

    Returns:
        Dict: Mapping of popart weight names to numpy values, after it has
            been passed through handle_split_qkv and
            handle_split_word_embedding.
    """
    # onnx.load would by default look for external initializers in the same
    # dir as the onnx model, whereas builder.saveIntializersExternally takes a
    # real path (or a path relative to the run dir) and stores that in the
    # onnxproto. So load the bare proto first and resolve external data after.
    model = onnx.load(model_path, load_external_data=False)

    if any(is_external_weight(tensor) for tensor in model.graph.initializer):
        load_external_data_for_model(model, '')

    reserved_prefixes = (popart.reservedAccl1Prefix(),
                         popart.reservedAccl2Prefix(),
                         popart.reservedAcclPrefix(),
                         popart.reservedAccumPrefix(),
                         popart.reservedStepPrefix())

    loaded = {}
    for tensor in model.graph.initializer:
        optimizer_state = any(
            prefix in tensor.name for prefix in reserved_prefixes)
        if optimizer_state and not load_optimizer:
            continue

        if (not is_external_weight(tensor)
                and tensor.data_type == onnx.TensorProto.FLOAT16):
            # In-proto FLOAT16 tensors are read from int32_data and
            # reinterpreted as float16.
            packed = np.asarray(tensor.int32_data, np.int32)
            values = packed.view(dtype=np.float16).reshape(tensor.dims)
        else:
            values = numpy_helper.to_array(tensor)

        # Optimizer state is always handed back as float32.
        loaded[tensor.name] = (values.astype(np.float32)
                               if optimizer_state else values)

    return handle_split_word_embedding(handle_split_qkv(loaded))
# Example #3
# 0
def test_anchor_output():
    """
    Run one fixed configuration (replication 2, accumulation 4, pipelining
    on) and check the anchored weights and accumulated gradients.
    This should catch any slicing issues.
    """
    settings = {
        "ReplicationFactor": 2,
        # Accl factor must divide batch size
        "AccumulationFactor": 4,
        "Pipelining": True,
        "ReturnType": "ALL"
    }
    labels = np.ones([BATCH_SIZE]).astype(np.int32)

    micro_batch_size = BATCH_SIZE // (settings["AccumulationFactor"] *
                                      settings["ReplicationFactor"])

    builder = popart.Builder()
    feed_shape = [micro_batch_size, CHANNELS, DATA_LEN, DATA_LEN]

    # The weight is created before the two streamed inputs.
    w = builder.addInitializedInputTensor(
        np.random.random_sample(feed_shape).astype(np.float32))
    ip = builder.addInputTensor(popart.TensorInfo("FLOAT", feed_shape))
    lb = builder.addInputTensor(
        popart.TensorInfo("INT32", [micro_batch_size]))

    # matmul -> flatten -> relu -> softmax -> nll loss
    matmul_out = builder.aiOnnx.matmul([ip, w])
    probs = builder.reshape_const(
        builder.aiOnnx, [matmul_out],
        [micro_batch_size, CHANNELS * DATA_LEN * DATA_LEN])
    probs = builder.aiOnnx.relu([probs])
    probs = builder.aiOnnx.softmax([probs])
    nll = builder.aiGraphcore.nllloss([probs, lb])

    grad_id = popart.reservedGradientPrefix() + w
    accl_id = popart.reservedAcclPrefix() + w
    return_all = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(
        BATCHES_PER_STEP, {
            probs: return_all,
            matmul_out: return_all,
            ip: return_all,
            w: return_all,
            grad_id: return_all,
            accl_id: return_all
        })

    opts, device = return_options(settings)

    if device is None:
        pytest.skip("Test needs to run on IPU, but none are available")

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=nll,
                                     optimizer=popart.ConstSGD(LEARNING_RATE),
                                     userOptions=opts,
                                     deviceInfo=device)

    session.prepareDevice()

    # Grow the host-side input shape (and fold the labels) for replication
    # first, then accumulation, then batches per step.
    for key in ("ReplicationFactor", "AccumulationFactor"):
        factor = settings[key]
        if factor > 1:
            feed_shape = [factor] + feed_shape
            labels = labels.reshape([factor, -1])
    if BATCHES_PER_STEP > 1:
        feed_shape = [BATCHES_PER_STEP] + feed_shape
        labels = np.repeat(labels[np.newaxis], BATCHES_PER_STEP, 0)

    anchors = session.initAnchorArrays()
    feed = np.random.random_sample(feed_shape).astype(np.float32)

    stepio = popart.PyStepIO({ip: feed, lb: labels}, anchors)
    session.weightsFromHost()

    session.run(stepio)

    # Returned anchors will be of shape
    # [bps, grad_accl_factor, repl_factor, micro_batch_size, channels, data_len, data_len]
    # Explicit full slices for the trailing dimensions.
    tail5 = (slice(None), ) * 5
    tail4 = (slice(None), ) * 4

    for step in range(anchors[w].shape[0]):
        for accl_step in range(anchors[w].shape[1]):
            # Weights should not change over the gradient accumulation
            # dimension - only after gradAccl steps.
            first = anchors[w][(step, 0) + tail5]
            current = anchors[w][(step, accl_step) + tail5]
            assert np.allclose(first, current)

    # Check that the accumulated gradient plus the weights for the current batch
    # equals the weights for the next batch.
    # We will need to multiply by this adjustment factor as with most
    # implementations of replication.
    scale = 1 / settings["ReplicationFactor"]
    for step in range(anchors[w].shape[0] - 1):
        for replica in range(anchors[w].shape[2]):
            # This replica's last weight tensor in the accumulation loop,
            # minus the accumulated gradients summed across replicas ...
            summed_accl = np.sum(anchors[accl_id][(step, -1) + tail5],
                                 axis=0)
            expected = anchors[w][(step, -1, replica) +
                                  tail4] - scale * summed_accl
            # ... should match the same replica's last weight tensor of the
            # next batch.
            following = anchors[w][(step + 1, -1, replica) + tail4]
            assert np.allclose(expected, following)
# Example #4
# 0
def test_reset_host_weights_with_extra_tensor_in_onnx_model():
    """
    1. Create a training session, and a corresponding validation session
    2. The training session must contain some feature that means when writing
       the ONNX model back to the host, it contains extra initializers compared
       with the original (builder-generated) model. Here that feature is an
       SGD optimizer with momentum.
    3. Try resetting the weights of the validation session using the ONNX model
       with the additional momentum tensor (call resetHostWeights)
    4. Observe that a PopART exception is thrown
    5. Try again, but with ignoreWeightsInModelWithoutCorrespondingHostWeight.
    6. Observe that it succeeds
    """

    def build_random_model():
        # input x randomly initialised weight -> matmul -> l1 loss (output)
        model_builder = popart.Builder()
        shape = [2, 2]
        act = model_builder.addInputTensor(popart.TensorInfo("FLOAT", shape))
        weight = model_builder.addInitializedInputTensor(
            np.random.rand(*shape).astype(np.float32))
        product = model_builder.aiOnnx.matmul([act, weight])
        model_builder.addOutputTensor(
            model_builder.aiGraphcore.l1loss([product], 0.1))
        return model_builder

    device = tu.create_test_device()
    train_builder = build_random_model()
    loss_id = train_builder.getOutputTensorIds()[0]

    # 1. & 2.
    # Training
    momentum_sgd = popart.SGD({"defaultMomentum": (0.01, True)})
    train_session = popart.TrainingSession(
        fnModel=train_builder.getModelProto(),
        dataFlow=popart.DataFlow(1, []),
        loss=loss_id,
        optimizer=momentum_sgd,
        deviceInfo=device)
    train_session.prepareDevice()

    with TemporaryDirectory() as tmpdir:
        saved_model = os.path.join(tmpdir, "tr_model.onnx")
        train_session.modelToHost(saved_model)

        # Validation (with different model proto weights)
        valid_builder = build_random_model()
        valid_opts = popart.SessionOptions()
        valid_opts.constantWeights = False
        valid_session = popart.InferenceSession(
            fnModel=valid_builder.getModelProto(),
            dataFlow=popart.DataFlow(1, [loss_id]),
            deviceInfo=device,
            userOptions=valid_opts)
        valid_session.prepareDevice()

        # 3. Try reset validation weights with training weights
        initializer_ids = list(
            filter(valid_builder.isInitializer,
                   valid_builder.getInputTensorIds()))
        missing_tensor_name = popart.reservedAcclPrefix() + initializer_ids[0]
        expected_message = ("resetWeights, no tensor '" +
                            missing_tensor_name + "' in tensors")
        with pytest.raises(popart.popart_exception) as e_info:
            valid_session.resetHostWeights(saved_model)
        # 4.
        assert e_info.value.args[0] == expected_message

        # 5. & 6. Try again, but this time ignore the missing tensor
        valid_session.resetHostWeights(
            saved_model,
            ignoreWeightsInModelWithoutCorrespondingHostWeight=True)
def test_accumulators_names_dont_clash():
    """
    Build a chain of three adds, each with its own named weight, train it
    with SGD with momentum, and check that the serialized IR contains one
    accl, accl-to-update and accl-final-out tensor for every weight.
    """
    np.random.seed(1984)

    builder = popart.Builder()

    # Never fed to the session; the draw is kept so the random weight values
    # generated below stay the same.
    input_data = np.random.rand(4, 4).astype(np.float32)
    weight_names = ['weight1', 'weight2', 'weight3']

    # data0 + weight1 + weight2 + weight3, one add op per weight.
    running = builder.addInputTensor(popart.TensorInfo('FLOAT', [4, 4]),
                                     'data0')
    for weight_name in weight_names:
        weight_id = builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weight_name)
        running = builder.aiOnnx.add([running, weight_id])

    loss = builder.aiGraphcore.l1loss([running], 1.0)

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(1, {}),
        loss=loss,
        optimizer=popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            "defaultDampening": (0, True)
        }),
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    # Every tensor name that appears as an input or output of a main-graph op.
    seen_names = {
        tensor["name"]
        for op in ir["maingraph"] for side in ("inputs", "outputs")
        for tensor in op[side]
    }

    accumulator_prefixes = [
        popart.reservedAcclPrefix(),
        popart.reservedAcclToUpdatePrefix(),
        popart.reservedAcclFinalOutPrefix()
    ]
    for prefix in accumulator_prefixes:
        for weight_name in weight_names:
            assert prefix + weight_name in seen_names
# Size constant; presumably the spatial length of the test data (confirm
# against the code that builds the input shape).
DATA_LEN = 5
# Candidate values for each session setting. Every key maps to the list of
# values to try; presumably expanded into one dict per combination with
# dict_product (confirm against the callers).
ANCHOR_TYPES = {
    "ReplicationFactor": [1],  # TODO: Enable replication once T12001 done.
    # Exception: Accl factor must divide batch size
    "AccumulationFactor": [4, 1],
    "Pipelining": [True, False],
    "ReturnType": ["FINAL", "ALL"]
}
# Learning rate 1 for easy comparison.
LEARNING_RATE = 1.0
# Strings for the anchors.
INPUT = "input"
WEIGHTS = "init_input"
ACTIVATION = "Reshape:0"
# Gradient / accl tensor names: the popart reserved prefix followed by the
# weight tensor name.
GRADIENT = popart.reservedGradientPrefix() + WEIGHTS
ACCL = popart.reservedAcclPrefix() + WEIGHTS


def dict_product(d):
    """Yield one dict per combination of the value lists in ``d``.

    ``d`` maps each key to an iterable of candidate values. Every yielded
    dict has the same keys as ``d``, each bound to one candidate; the
    combinations come out in itertools.product order (last key varies
    fastest).
    """
    names = d.keys()
    for combination in itertools.product(*d.values()):
        yield {name: value for name, value in zip(names, combination)}


def return_options(anchorDict):
    opts = popart.SessionOptions()

    if anchorDict["Pipelining"]:
        opts.enablePipelining = True

    if anchorDict["AccumulationFactor"] > 1:
def run_graph(optimizer, input_shape, initial_onnx_model, input_tensor_name,
              output_tensor_name, label_tensor_name, label_array, accum_factor,
              enable_accum, batches_per_step, number_of_steps,
              final_proto_filename, enable_multi_ipu, full_anchorage,
              inference_mode):
    """Train ``initial_onnx_model`` for a number of steps and save the result.

    Args:
        optimizer: optimizer handed to the popart TrainingSession.
        input_shape: list with the shape of one input sample set; an outer
            dimension of batches_per_step * accum_factor is prepended when
            that product is larger than 1.
        initial_onnx_model: serialized model used as ``fnModel``.
        input_tensor_name: id of the data input tensor.
        output_tensor_name: id of the tensor used both as loss and as anchor.
        label_tensor_name: id of the label input tensor.
        label_array: numpy array of labels; repeated batches_per_step times
            and reshaped to one row per (batch, accumulation) step as needed.
        accum_factor: gradient accumulation factor set on the session options.
        enable_accum: whether gradient accumulation is enabled.
        batches_per_step: batches per step of the DataFlow.
        number_of_steps: how many times ``session.run`` is called.
        final_proto_filename: file name given to ``session.modelToHost``.
        enable_multi_ipu: use ``num_ipus`` IPUs with manual virtual graphs
            instead of a single IPU with virtual graphs off.
        full_anchorage: also anchor the gradient (and, with accumulation, the
            accl and accl-to-update tensors) of the first initializer.
        inference_mode: additionally construct an InferenceSession first.

    Returns:
        Tuple ``(final_proto_filename, anchor_arrays)``.
    """
    art = popart.AnchorReturnType("All")
    anchorNames = {output_tensor_name: art}

    if full_anchorage:
        w0 = onnx.load_from_string(
            initial_onnx_model).graph.initializer[0].name

        anchorNames[popart.reservedGradientPrefix() + w0] = art

        if enable_accum:
            anchorNames[popart.reservedAcclPrefix() + w0] = art
            anchorNames[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = enable_accum
    opts.accumulationFactor = accum_factor
    opts.enableOutlining = False
    opts.virtualGraphMode = (popart.VirtualGraphMode.Manual
                             if enable_multi_ipu else
                             popart.VirtualGraphMode.Off)

    if enable_multi_ipu:
        device = tu.create_test_device(numIpus=num_ipus,
                                       tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
    else:
        device = tu.create_test_device(tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})

    # only for test purposes, inference with gradient_accumulation should never work
    if inference_mode:
        popart.InferenceSession(fnModel=initial_onnx_model,
                                dataFlow=popart.DataFlow(
                                    batches_per_step, anchorNames),
                                userOptions=opts,
                                deviceInfo=device)

    session = popart.TrainingSession(fnModel=initial_onnx_model,
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, anchorNames),
                                     deviceInfo=device,
                                     loss=output_tensor_name,
                                     optimizer=optimizer,
                                     userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()

    anchor_arrays = session.initAnchorArrays()

    # One outer entry per (batch, accumulation step) pair fed in a single
    # session.run call.
    outer_dim = 1
    if batches_per_step > 1:
        outer_dim *= batches_per_step
        label_array = np.repeat(label_array[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        outer_dim *= accum_factor
        label_array = label_array.reshape(
            [accum_factor * batches_per_step, -1])
    if outer_dim > 1:
        input_shape = [outer_dim] + input_shape

    # The same randomly generated input is reused for every step.
    stepio = popart.PyStepIO(
        {
            input_tensor_name:
            (1.0 - xi * npr.rand(*input_shape)).astype(np.float32),
            label_tensor_name:
            label_array.astype(np.int32)
        }, anchor_arrays)

    for _ in range(number_of_steps):
        session.run(stepio)

    # The model is written to final_proto_filename exactly as given; no
    # extension is appended.
    session.modelToHost(final_proto_filename)

    return final_proto_filename, anchor_arrays
# Number of tiles per IPU requested when creating a test device.
testTilesPerIPU = 1216

# SGD optimizer with the module-level learning rate and weight decay.
# NOTE(review): the second element of each tuple is presumably popart's
# "is constant" flag — confirm against the popart optimizer documentation.
sgd_optimizer = popart.SGD({
    "defaultLearningRate": (lr, False),
    "defaultWeightDecay": (wd, False)
})

# Adam optimizer built from the same learning rate and weight decay plus the
# module-level beta values; eps is fixed to 1e-6.
adam_optimizer = popart.Adam({
    "defaultLearningRate": (lr, False),
    "defaultBeta1": (b1, False),
    "defaultBeta2": (b2, False),
    "defaultWeightDecay": (wd, False),
    "defaultEps": (1e-6, True),
})

# Reserved tensor-name prefix popart uses for accl tensors.
grad_accl_prefix = popart.reservedAcclPrefix()


def get_micro_batch_size(accum_factor):
    """
    no data replication, so micro batch size = batch size / accumulation factor

    Args:
        accum_factor: gradient accumulation factor; must divide the
            module-level ``batch_size`` exactly.

    Returns:
        ``batch_size // accum_factor``.

    Raises:
        RuntimeError: if accum_factor is not a factor of batch_size.
    """
    # Compare by value: an identity test against the literal 0 only works by
    # accident of int caching and fails for non-int zeros.
    if batch_size % accum_factor != 0:
        raise RuntimeError("accum_factor is not a factor of batch_size")

    return batch_size // accum_factor


def get_mm_model(accum_factor, enable_multi_ipu):
    """
Exemple #9
0
        # Verify that the file containing tensor data has also been updated
        for layer in range(numLayers):
            saved_weights = np.fromfile(tmpfile_weights,
                                        dtype=np.float32,
                                        count=elms,
                                        offset=layer * elms * 4)
            assert np.array_equal(anchors[weightsIds[layer]].flatten(),
                                  saved_weights)


# Pairs of (optimizer, list of popart reserved tensor-name prefixes that go
# with that optimizer).
optimizerInfos = [
    # 1. SGD with momentum
    (
        popart.SGD({
            "defaultLearningRate": (0.2, True),
            "defaultMomentum": (0.5, True)
        }),
        [popart.reservedAcclPrefix()],
    ),
    # 2. Adam
    (
        popart.Adam({
            "defaultLearningRate": (0.2, True),
            "defaultBeta1": (0.1, True),
            "defaultBeta2": (0.1, True),
            "defaultWeightDecay": (0.5, True),
            "defaultEps": (1e-5, True),
            "lossScaling": (2, True)
        }),
        [
            popart.reservedAccl1Prefix(),
            popart.reservedAccl2Prefix(),
            popart.reservedStepPrefix()
        ],
    ),
]
# 3. Adaptive
optimizerInfos.append(