def check_models(model_init, modelA_fn, modelB_fn):
    """
    For each weight tensor, check the relative error. That is,
    | model_accl - model_no_accl |_1 / | model_accl - model_initial |_1

    Args:
        model_init: Serialized onnx ModelProto (bytes) of the initial model.
        modelA_fn (str): Path to the saved onnx model A (grad accumulation).
        modelB_fn (str): Path to the saved onnx model B (no grad accumulation).

    Raises:
        AssertionError: If the weights have not moved sufficiently since
            initialization, or if the relative error exceeds 1e-5.
    """
    modelA = onnx.load(modelA_fn)
    modelB = onnx.load(modelB_fn)

    # The initial model.
    modelC = onnx.load_from_string(model_init)

    # Gradient-accumulation state initializers only exist in the grad-accl
    # model, so they must be skipped when comparing against the other models.
    state_prefixes = (popart.reservedAcclPrefix(),
                      popart.reservedAccl1Prefix(),
                      popart.reservedAccl2Prefix(),
                      popart.reservedStepPrefix(),
                      popart.reservedAccumPrefix())

    for w_i, weightA in enumerate(modelA.graph.initializer):
        if any(prefix in weightA.name for prefix in state_prefixes):
            continue

        # Hoist the repeated per-element initializer lookups out of the loop.
        dataB = modelB.graph.initializer[w_i].float_data
        dataC = modelC.graph.initializer[w_i].float_data

        # where A, B, C are weight tensors:
        # |A - B|_1, |B - C|_1, |A - C|_1
        l1AB = 0.0
        l1BC = 0.0
        l1AC = 0.0
        for d_i, dataA in enumerate(weightA.float_data):
            # abs diff of 2 floats
            l1AB += np.abs(dataA - dataB[d_i])
            l1BC += np.abs(dataB[d_i] - dataC[d_i])
            l1AC += np.abs(dataA - dataC[d_i])

        # Check that the weights have moved enough for this to be a valid
        # comparison. Asserting before the division also guards against a
        # ZeroDivisionError when l1AC == 0 (weights identical to initial).
        assert l1AC > 1e-3, "change since start of A = %.5f" % (l1AC, )
        assert l1BC > 1e-3, "change since start of B = %.5f" % (l1BC, )

        relative_error = l1AB / l1AC
        print(f"{weightA.name}: l1AB = {l1AB:.2e},  l1AC = {l1AC:.2e}, "
              f"l1BC = {l1BC:.2e}, relative error = {relative_error:.2e}")

        # relative error assertion
        assert 1e-5 > relative_error, "Relative error {}".format(
            relative_error)
# Example 2
def load_initializers_from_onnx(model_path, load_optimizer=False):
    """Load initial weights from an onnx checkpoint.

    Args:
        model_path (str): Path to onnx file.
        load_optimizer (bool): If True, also load optimizer state tensors
            (accumulators / step counters), converted to float32.

    Returns:
        Dict: Mapping of popart weight names to numpy values.
    """
    # By default onnx.load will look for initializers in the same dir as onnx model.
    # However builder.saveIntializersExternally takes real path or path relative to run dir
    # and stores it in the onnxproto.
    model = onnx.load(model_path, load_external_data=False)

    if any(is_external_weight(w) for w in model.graph.initializer):
        load_external_data_for_model(model, '')

    state_prefixes = (popart.reservedAccl1Prefix(),
                      popart.reservedAccl2Prefix(),
                      popart.reservedAcclPrefix(),
                      popart.reservedAccumPrefix(),
                      popart.reservedStepPrefix())

    initializers = {}
    for weight in model.graph.initializer:
        is_state = any(prefix in weight.name for prefix in state_prefixes)
        if is_state and not load_optimizer:
            continue

        if (weight.data_type == onnx.TensorProto.FLOAT16
                and not is_external_weight(weight)):
            # fp16 data is stored packed in the int32_data field; reinterpret
            # the raw bytes as float16 and restore the tensor's shape.
            raw = np.asarray(weight.int32_data, np.int32)
            value = raw.view(dtype=np.float16).reshape(weight.dims)
        else:
            value = numpy_helper.to_array(weight)

        # Optimizer state is kept in float32.
        initializers[weight.name] = (value.astype(np.float32)
                                     if is_state else value)

    return handle_split_word_embedding(handle_split_qkv(initializers))
def test_adam_loading_saved_gradient_accumulationt_tesors(tmpdir):
    """
    1. Build a model with matmuls, no grad accumulation
    2. Write out onnx model, verify initializers contain no accum tensors
    3. Create session with model, verify accl tensors initialised correctly
    4. Do session.run(), write out model, verify accl tensors have been updated
    5. Create new session with same model. This time before run, write out model
       and check tensors are still there, with the same value
    """

    # 1.
    accum_factor = 4
    [onnx_model, input_name, output_name,
     lb_name] = get_mm_model(accum_factor=accum_factor, enable_multi_ipu=False)

    # 2. The freshly built model must not contain accumulation initializers.
    model = onnx.load_from_string(onnx_model)
    names = [t.name for t in model.graph.initializer]
    for name in names:
        assert popart.reservedAccumPrefix() not in name

    def getTrainingSession(fn):
        # Helper: build a prepared training session with grad accumulation
        # enabled and weights already written to the device.
        opts = popart.SessionOptions()
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accum_factor
        opts.disableGradAccumulationTensorStreams = False
        sess = popart.TrainingSession(
            fnModel=fn,
            dataFlow=popart.DataFlow(1, {}),
            deviceInfo=tu.create_test_device(tilesPerIPU=testTilesPerIPU),
            loss=output_name,
            optimizer=adam_optimizer,
            userOptions=opts)
        sess.prepareDevice()

        sess.weightsFromHost()
        return sess

    # 3. Before any run, all optimizer state tensors must be zero.
    sess = getTrainingSession(onnx_model)
    fn = os.path.join(tmpdir, "withInitZeroAccumTensors.onnx")
    sess.modelToHost(fn)
    model = onnx.load(fn)
    weights = {}
    optstates = {}
    for t in model.graph.initializer:
        if (popart.reservedAccumPrefix() in t.name
                or popart.reservedAccl1Prefix() in t.name
                or popart.reservedAccl2Prefix() in t.name
                or popart.reservedStepPrefix() in t.name):
            optstates[t.name] = t.float_data
            assert np.allclose(np.asarray(t.float_data), 0.0)
        else:
            weights[t.name] = t.float_data

    # 4. Run one step, then verify the optimizer state has changed.
    input_shape = [accum_factor] + sess.getInfo(input_name).shape()
    stepio = popart.PyStepIO(
        {
            input_name: npr.rand(*input_shape).astype(np.float32),
            lb_name: np.ones(batch_size).astype(np.int32),
        }, sess.initAnchorArrays())
    sess.run(stepio)
    fn = os.path.join(tmpdir, "withUpdatedAcclTensors.onnx")
    sess.modelToHost(fn)
    model = onnx.load(fn)
    for t in model.graph.initializer:
        if (popart.reservedAccl1Prefix() in t.name
                or popart.reservedAccl2Prefix() in t.name
                or popart.reservedStepPrefix() in t.name):
            # Nonzero, updated accl1, accl2 and step tensors.
            # (Use `not` rather than `is False`: relying on identity with the
            # False singleton is fragile if allclose returns a numpy bool.)
            assert not np.allclose(np.asarray(t.float_data), optstates[t.name])
            optstates[t.name] = np.asarray(t.float_data)
        elif popart.reservedAccumPrefix() in t.name:
            # Because the accumulator is always set to zero after being applied
            # to accl1 and accl2
            assert np.allclose(np.asarray(t.float_data), 0.0)
            optstates[t.name] = np.asarray(t.float_data)

    # 5. Reloading the saved model must preserve the optimizer state exactly.
    sess = getTrainingSession(fn)
    fn = os.path.join(tmpdir, "withUpdatedAcclTensors_check.onnx")
    sess.modelToHost(fn)
    model = onnx.load(fn)
    for t in model.graph.initializer:
        if (popart.reservedAccumPrefix() in t.name
                or popart.reservedAccl1Prefix() in t.name
                or popart.reservedAccl2Prefix() in t.name
                or popart.reservedStepPrefix() in t.name):
            assert np.array_equal(optstates[t.name], np.asarray(t.float_data))
def test_adam_gradient_accumulation_model_proto(tmpdir):
    """Check that the saved proto contains a full set of Adam state tensors
    (accum, accl1, accl2, step) for every weight, zero-initialized before any
    run and with step counters advanced after running."""
    batches_per_step = 5
    for steps in [0, 3]:
        np.random.seed(1234)
        label_array = np.random.randint(0, hidden_size, batch_size)
        accl_initial_proto, accl_proto_filename, accl_anchor_arrays = run_mm_graph(
            adam_optimizer,
            label_array=label_array,
            accum_factor=4,
            enable_accum=True,
            batches_per_step=batches_per_step,
            number_of_steps=steps,
            final_proto_filename=os.path.join(tmpdir, "accl5batches3steps"),
            enable_multi_ipu=False,
            full_anchorage=False)

        model = onnx.load(accl_proto_filename)

        # Bucket initializer names by the optimizer-state prefix they carry.
        weight_names = []
        accum_names = []
        accl1_names = []
        accl2_names = []
        step_names = []
        prefix_buckets = ((popart.reservedAccumPrefix(), accum_names),
                          (popart.reservedAccl1Prefix(), accl1_names),
                          (popart.reservedAccl2Prefix(), accl2_names),
                          (popart.reservedStepPrefix(), step_names))

        for name in (t.name for t in model.graph.initializer):
            for prefix, bucket in prefix_buckets:
                if prefix in name:
                    bucket.append(name)
                    break
            else:
                # Prefix checks must come first: state-tensor names embed the
                # weight name, so "weight" alone would mis-classify them.
                if "weight" in name:
                    weight_names.append(name)

        # Model should have 6 weight tensors, each with full Adam state.
        assert len(weight_names) == 6
        assert len(accum_names) == len(weight_names)
        assert len(accl1_names) == len(weight_names)
        assert len(accl2_names) == len(weight_names)
        assert len(step_names) == len(weight_names)

        tensor_mapping = {t.name: t for t in model.graph.initializer}

        # Every weight has a matching state tensor of each kind.
        for w_name in weight_names:
            assert popart.reservedAccumPrefix() + w_name in accum_names
            assert popart.reservedAccl1Prefix() + w_name in accl1_names
            assert popart.reservedAccl2Prefix() + w_name in accl2_names
            assert popart.reservedStepPrefix() + w_name in step_names

        if steps == 0:
            # All Adam states are initialized to zero
            for name in accum_names + accl1_names + accl2_names + step_names:
                assert np.allclose(tensor_mapping[name].float_data, 0.0)
        else:
            # Steps counted correctly
            for name in step_names:
                assert tensor_mapping[name].float_data[
                    0] == steps * batches_per_step
# Example 5
# Each entry pairs an optimizer with the state-tensor prefixes it creates.
# 1. SGD with momentum
sgd_info = (popart.SGD({
    "defaultLearningRate": (0.2, True),
    "defaultMomentum": (0.5, True)
}), [popart.reservedAcclPrefix()])

# 2. Adam
adam_info = (popart.Adam({
    "defaultLearningRate": (0.2, True),
    "defaultBeta1": (0.1, True),
    "defaultBeta2": (0.1, True),
    "defaultWeightDecay": (0.5, True),
    "defaultEps": (1e-5, True),
    "lossScaling": (2, True)
}), [
    popart.reservedAccl1Prefix(),
    popart.reservedAccl2Prefix(),
    popart.reservedStepPrefix()
])

# 3. Adaptive
adaptive_info = (popart.Adaptive(
    {"defaultLearningRate": (0.2, True)},
    mode=popart.AdaptiveMode.CenteredRMSProp),
    [popart.reservedAccl1Prefix(),
     popart.reservedAccl2Prefix()])

optimizerInfos.extend([sgd_info, adam_info, adaptive_info])


@pytest.mark.parametrize("optimizerInfo", optimizerInfos)
def test_save_tensors_optimizer_state_externally(optimizerInfo):
    """
    # 1. create training session with momentum, save initializers externally
    # 2. check file size before session.modelToHost, see it grows after