def getOptimizers():
    optimizers = []

    # SGD
    sgd0 = popart.SGD({
        "lossScaling": (10.0, False),
        "defaultMomentum": (0.5, False),
        "defaultVelocityScaling": (0.5, False),
        "defaultDampening": (0.5, False),
        "defaultWeightDecay": (0.5, False)
    })
    sgd1 = popart.SGD({
        "lossScaling": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultVelocityScaling": (0.2, False),
        "defaultDampening": (0.2, False),
        "defaultWeightDecay": (0.2, False)
    })
    optimizers.append([sgd0, sgd1])

    # Adam
    adam0 = popart.Adam({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultBeta1": (0.5, False),
        "defaultBeta2": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adam1 = popart.Adam({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultBeta1": (0.2, False),
        "defaultBeta2": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adam0, adam1])

    # Adaptive
    adaptive0 = popart.Adaptive({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultAlpha": (0.5, False),
        "defaultMomentum": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adaptive1 = popart.Adaptive({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultAlpha": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adaptive0, adaptive1])

    return optimizers
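
# A minimal sketch (not part of getOptimizers above) of the (value, isConst) pair
# convention these dictionaries use: the first element is the parameter value and
# the second marks whether it is constant. Non-const parameters can later be
# replaced, e.g. by passing a rebuilt optimizer to
# TrainingSession.updateOptimizerFromHost (method name assumed, not shown above).
import popart

sketch_optimizer = popart.Adam({
    "defaultLearningRate": (0.001, False),  # non-const: may be changed between runs
    "lossScaling": (128.0, True),           # const: baked into the compiled program
})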
Example #2
    def create(self):
        self.iteration.learning_rate = self.option_values["defaultLearningRate"]

        if self.opt_type == "SGD":
            optimizer = popart.SGD(self.optimizer_options)
        elif self.opt_type == "ADAM":
            optimizer = popart.Adam(self.optimizer_options,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
        elif self.opt_type == "ADAM_NO_BIAS":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.AdamNoBias,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
        elif self.opt_type == "LAMB":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.Lamb,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
        elif self.opt_type == "LAMB_NO_BIAS":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.LambNoBias,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)


        weight_decay_tensor_list = []

        for stage, tensors in self.tensors.items():
            for tensor_id in tensors:
                params = self.option_values.copy()

                if self.include_for_weight_decay(tensor_id):
                    params["weightDecay"] = self.weight_decay
                    weight_decay_tensor_list.append(tensor_id)
                else:
                    params["weightDecay"] = 0

                if self.disable_lamb(tensor_id):
                    params["maxWeightNorm"] = 0

                for transform in self.transforms:
                    params = transform(tensor_id, params, stage)

                specific_params = {
                    k: v for k, v in params.items() if k not in self.option_values
                }
                if specific_params:
                    p = self._make_tuple_options(specific_params)
                    optimizer.insertSpecific(tensor_id, p)

        if len(weight_decay_tensor_list) != 0:
            logger.debug(f" Weight decay of {self.weight_decay} applied to: {weight_decay_tensor_list}")

        return optimizer
Example #3
def test_replicated_adam_weight_update(tmpdir):

    optimizer_dict = {
        "defaultLearningRate": (0.005, True),
        "defaultBeta1": (0.7, True),
        "defaultBeta2": (0.8, True),
        "defaultWeightDecay": (0.1, True),
        "defaultEps": (1e-6, True),
        "lossScaling": (10.0, True),
    }

    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated_rws.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipRtsLocation)

    phased = onnx.load(str(tmpdir / 'phased.onnx'))
    phased_replicated = onnx.load(str(tmpdir / 'phased_replicated.onnx'))
    phased_replicated_rws = onnx.load(
        str(tmpdir / 'phased_replicated_rws.onnx'))

    check_model(phased, phased_replicated)
    check_model(phased, phased_replicated_rws)
def test_adam_mixed_mode_0(tmpdir):

    # Optimizer parameters
    defaultLearningRate = 0.005
    defaultBeta1 = 0.7
    defaultBeta2 = 0.8
    defaultWeightDecay = 0.1
    defaultEps = 1e-6
    lossScaling = 10.0

    optMaps = [{
        0:
        popart.Adam({
            "defaultLearningRate": (defaultLearningRate, True),
            "defaultBeta1": (defaultBeta1, True),
            "defaultBeta2": (defaultBeta2, True),
            "defaultWeightDecay": (defaultWeightDecay, True),
            "defaultEps": (defaultEps, True),
            "lossScaling": (lossScaling, True),
        })
    }]
    outlining = [False]

    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultBeta1": (defaultBeta1, i != 1),
            "defaultBeta2": (defaultBeta2, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultEps": (defaultEps, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.Adam(optMap)}]
        outlining = outlining + [False]

    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultBeta1": (defaultBeta1, i != 1),
            "defaultBeta2": (defaultBeta2, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultEps": (defaultEps, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.Adam(optMap)}]
        outlining = outlining + [True]

    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float32)
    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float16)
def session(splits=1):
    proto, data, x, loss = model(splits)

    user_options = {
        "enableOutlining": False,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "optimizerStateTensorLocationSettings": popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0)
    }

    optimizer = popart.Adam({
        "defaultLearningRate": (0.1, True),
        "defaultBeta1": (0.1, True),
        "defaultBeta2": (0.1, True)
    }, mode=popart.AdamMode.LambNoBias)  # NoBias to increase the error of incorrect gradients

    return run_py(
        proto,
        data=data,
        outputs=x,
        loss=loss,
        optimizer=optimizer,
        patterns=popart.Patterns(),
        user_options=user_options,
        skip_execution=False)
Example #6
def session(train=False, skip_execution=False, include_patterns=True, splits=1, outline=False, optim="Sgd"):
    proto, data, x, loss = model(splits=splits)
    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy": popart.MeanReductionStrategy.Running
    }

    if optim == "Lamb":
        optimizer = popart.Adam({
            "defaultLearningRate": (0.1, False),
            "defaultWeightDecay": (0.1, True),
            "defaultBeta1": (0.1, True),
            "defaultBeta2": (0.1, True),
            "lossScaling": (20, True),
        }, mode=popart.AdamMode.LambNoBias)  # NoBias to increase the error of incorrect gradients
        user_options["optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
            popart.TensorLocation(
                popart.TensorStorage.OffChip,
                popart.ReplicatedTensorSharding.On),
            0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        ipus = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            "defaultDampening": (0, True),  # 0 dampening to increase the error of incorrect gradients
            "lossScaling": (20, True)})
        ipus = 1

    if train:
        return run_py(
            proto,
            data=data,
            outputs=x,
            loss=loss,
            optimizer=optimizer,
            patterns=patterns,
            user_options=user_options,
            skip_execution=skip_execution)
    else:
        return run_py(
            proto,
            data=data,
            outputs=x,
            patterns=patterns,
            user_options={
                "enableOutlining": outline,
                "constantWeights": False
            },
            skip_execution=skip_execution)
Example #7
def _get_popart_optimizer(optType, clipNormSettings):
    if optType == 'sgd':
        return popart.SGD({"defaultLearningRate": (0.1, True)},
                          clipNormSettings)
    elif optType == 'adam':
        return popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "lossScaling": (20, False),
            },
            weight_decay_mode=popart.WeightDecayMode.L2Regularization,
            mode=popart.AdamMode.Adam,
            clip_norm_settings=clipNormSettings)
    elif optType == 'lamb':
        return popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.Lamb,
            clip_norm_settings=clipNormSettings)
    elif optType == 'lambnobias':
        return popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, True),
            },
            scaled_optimizer_state=False,
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias,
            clip_norm_settings=clipNormSettings)
    else:
        raise Exception(f"Unrecognized optimizer type: '{optType}'")
Example #8
    def update_and_create(self, step, epoch):
        """ updates the learning rate and returns a new popart optimizer object:
        the learning-rate schedule used here is same as for RNN-T reference model
        """

        new_lr = self.get_new_lr(step, epoch)

        logger.info("Setting learning-rate to {}".format(new_lr))
        self.optimizer_options["defaultLearningRate"] = (new_lr, False)

        if self.optimizer_type == 'SGD':
            optimizer = popart.SGD(self.optimizer_options)
        elif self.optimizer_type == 'LAMB':
            if self.gradient_clipping_norm is None:
                optimizer = popart.Adam(self.optimizer_options, mode=popart.AdamMode.Lamb)
            else:
                optimizer = popart.Adam(self.optimizer_options, mode=popart.AdamMode.Lamb,
                                        clip_norm_settings=[popart.ClipNormSettings.clipAllWeights(self.gradient_clipping_norm)])

        self.current_lr = new_lr

        return optimizer
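
# Hedged sketch of the gradient-clipping setup used in update_and_create above:
# ClipNormSettings.clipAllWeights(norm) builds a setting that clips the norm of
# all weight gradients to `norm` before the optimizer update (behaviour inferred
# from the call above; the values here are illustrative only).
import popart

clip_settings = [popart.ClipNormSettings.clipAllWeights(1.0)]
clipped_lamb = popart.Adam({"defaultLearningRate": (0.001, False)},
                           mode=popart.AdamMode.Lamb,
                           clip_norm_settings=clip_settings)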
def test_final_stage_recompute_0():
    np.random.seed(0)

    gradient_accumulation = 5
    batch_size = 1
    hidden_size = 16

    input_shape = [batch_size, hidden_size]
    weight_data = np.random.normal(0, 0.02, [hidden_size, hidden_size]).astype(
        np.float32)
    input_data = np.random.normal(
        0, 0.02, [gradient_accumulation] + input_shape).astype(np.float32)

    builder = popart.Builder()

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    with builder.virtualGraph(0), builder.pipelineStage(0):
        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
        x = builder.aiOnnx.matmul([x_in, weight_1])

    with builder.virtualGraph(1), builder.pipelineStage(1):
        weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
        x_recomp = builder.aiOnnx.matmul([x, weight_2])
        # This MatMul should be recomputed
        x = builder.checkpointOutput([x_recomp])[0]

        weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")
        # This MatMul should not be recomputed
        x_no_recomp = builder.aiOnnx.matmul([x, weight_3])
        l1 = builder.aiGraphcore.l1loss([x_no_recomp], 0.1)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, [l1])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    opts.autoRecomputation = popart.RecomputationType.Pipeline
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=tu.create_test_device(
                                         numIpus=2,
                                         opts={"compileIPUCode": False}))
    ''' Verify that the matmuls in the main graph are correct '''
    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    for op in ir["maingraph"]:
        if x_recomp in map(lambda out: out["name"], op["outputs"]):
            assert op["attributes"]["recompute"] == "YES"
        elif x_no_recomp in map(lambda out: out["name"], op["outputs"]):
            assert op["attributes"]["recompute"] == "NO"
    def run_test(mode=None, verify=None):
        builder = popart.Builder()

        x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                      "x_in")

        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")

        # We want a bwd pass that looks like:
        #
        # restore, op1, restore, op2, restore, op3
        #
        # Where op1, op2 & op3 are gradient operations that
        # have implicit recompute inputs.

        with builder.virtualGraph(0), builder.pipelineStage(0):
            x = builder.aiOnnx.matmul([x_in, weight_1])
            x = builder.checkpointOutput([x])[0]

            x = builder.aiOnnx.add([x, x])
            # Gelu is a unary operation that takes the fwd input
            # activation. This satisfies our requirement above
            # of needing an implicit recompute input.
            x = builder.aiGraphcore.gelu([x])

            x = builder.checkpointOutput([x])[0]

            x = builder.aiOnnx.add([x, x])
            x = builder.aiGraphcore.gelu([x])

            x = builder.checkpointOutput([x])[0]
            o = x

        with builder.virtualGraph(1), builder.pipelineStage(1):
            l1 = builder.aiGraphcore.l1loss([o], 0.1)

        proto = builder.getModelProto()

        dataFlow = popart.DataFlow(1, [
            o,
            popart.reservedGradientPrefix() + weight_1,
        ])

        opts = popart.SessionOptions()
        opts.enableOutlining = False
        opts.enablePipelining = True
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = gradient_accumulation
        opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
        if mode is not None:
            opts.autoRecomputation = mode
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.Adam({}),
                                         deviceInfo=tu.create_test_device(
                                             numIpus=2,
                                             opts={"compileIPUCode": False}))

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        inputs = {x_in: input_data}
        stepio = popart.PyStepIO(inputs, anchors)

        for _ in range(10):
            session.run(stepio)

        if verify is not None:
            verify(session)

        return anchors
def test_adam_mixed_mode_1(tmpdir):

    # Optimizer parameters
    defaultLearningRate0 = 0.005
    defaultLearningRate5 = 0.0025

    defaultBeta1 = 0.7
    defaultBeta2 = 0.8
    defaultWeightDecay = 0.1
    defaultEps = 1e-6
    lossScaling = 10.0

    adam00 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, True),
        "defaultWeightDecay": (defaultWeightDecay, True),
        "defaultEps": (defaultEps, True),
        "lossScaling": (lossScaling, True),
    })

    adam00.insertSpecific("w_0", {"beta1": (0.9, True), "beta2": (0.99, True)})
    adam00.insertSpecific("b_0", {"beta1": (0.9, True), "beta2": (0.99, True)})

    adam05 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate5, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, True),
        "defaultWeightDecay": (defaultWeightDecay, True),
        "defaultEps": (defaultEps, True),
        "lossScaling": (lossScaling, True),
    })

    adam05.insertSpecific("w_0", {"beta1": (0.9, True), "beta2": (0.99, True)})
    adam05.insertSpecific("b_0", {"beta1": (0.9, True), "beta2": (0.99, True)})

    adam10 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultBeta1": (defaultBeta1, False),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })

    adam10.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })
    adam10.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })

    adam15 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate5, False),
        "defaultBeta1": (defaultBeta1, False),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })

    adam15.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })
    adam15.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })

    adam20 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })

    adam20.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })
    adam20.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })

    adam25 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate5, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })

    adam25.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })
    adam25.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })

    # Change Adam optimizer after 0 and 5 steps
    optMaps = [{
        0: adam00,
        5: adam05
    }, {
        0: adam10,
        5: adam15
    }, {
        0: adam20,
        5: adam25
    }]

    outlining = [True, True, True]

    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float32)
    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float16)
Example #12
                                        offset=layer * elms * 4)
            assert np.array_equal(anchors[weightsIds[layer]].flatten(),
                                  saved_weights)


optimizerInfos = []
# 1. SGD with momentum
optimizerInfos.append((popart.SGD({
    "defaultLearningRate": (0.2, True),
    "defaultMomentum": (0.5, True)
}), [popart.reservedAcclPrefix()]))
# 2. Adam
optimizerInfos.append((popart.Adam({
    "defaultLearningRate": (0.2, True),
    "defaultBeta1": (0.1, True),
    "defaultBeta2": (0.1, True),
    "defaultWeightDecay": (0.5, True),
    "defaultEps": (1e-5, True),
    "lossScaling": (2, True)
}), [
    popart.reservedAccl1Prefix(),
    popart.reservedAccl2Prefix(),
    popart.reservedStepPrefix()
]))
# 3. Adaptive
optimizerInfos.append(
    (popart.Adaptive({"defaultLearningRate": (0.2, True)},
                     mode=popart.AdaptiveMode.CenteredRMSProp),
     [popart.reservedAccl1Prefix(),
      popart.reservedAccl2Prefix()]))
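
# Note (added for clarity, not in the original test): each optimizerInfos entry
# pairs an optimizer with the reserved prefixes of the optimizer-state tensors it
# creates (momentum / Adam accumulators / step counters), so the surrounding test
# can check that exactly those state tensors are saved and restored for each weight.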

def run_model(tmpdir,
              model_file_name,
              schedule=popart.ExecutionPhaseSchedule.Interleaving,
              enable_outlining=False,
              stride=1,
              num_layers=5,
              dsize=128,
              batch_size=4,
              batch_serialize=1,
              batch_schedule=popart.BatchSerializationBatchSchedule.Isomorphic,
              num_iterations=5,
              num_replicas=2,
              optimizer=popart.Adam({"defaultLearningRate": (0.1, False)})):

    np.random.seed(52125)

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        w = builder.addInitializedInputTensor(
            np.random.rand(dsize, dsize).astype(np.float32), f"W{index}")
        matmul_id = builder.aiOnnx.matmul([in_id, w])
        return matmul_id

    out = ip
    l1 = ""
    final_loss = ""

    for i in range(num_layers):
        vgid = 0
        with builder.executionPhase(i * stride), builder.virtualGraph(vgid):
            for j in range(3):
                out = add_layer(i, out)

        if i == num_layers - 1:
            with builder.executionPhase(i * stride), builder.virtualGraph(vgid):
                l1 = builder.aiGraphcore.l1loss([out], 0.1,
                                                popart.ReductionType.Sum)
                final_loss = builder.aiGraphcore.identityloss([l1])

    anchorIds = []

    builder.addOutputTensor(out)

    num_ipus = 1

    dfAnchors = {}
    for anchorId in anchorIds:
        dfAnchors.update({anchorId: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()

    # Cycle counting
    opts.instrumentWithHardwareCycleCounter = True

    # Outlining
    opts.enableOutlining = enable_outlining
    opts.enableOutliningCopyCostPruning = False
    opts.outlineThreshold = -np.inf
    opts.aliasZeroCopy = enable_outlining

    # Replicated graphs
    opts.replicatedGraphCount = num_replicas
    opts.enableReplicatedGraphs = num_replicas > 1

    # IO tiles
    opts.numIOTiles = 192

    # Phased execution
    opts.executionPhaseSettings.phases = num_layers * stride
    opts.executionPhaseSettings.stages = 1
    opts.executionPhaseSettings.schedule = schedule
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases

    # Recomputation
    opts.autoRecomputation = popart.RecomputationType.Standard
    opts.explicitRecomputation = True

    # Batch serialization
    if batch_serialize > 1:
        opts.batchSerializationSettings.factor = batch_serialize
        opts.batchSerializationSettings.concatOnVirtualGraphChange = False
        opts.batchSerializationSettings.concatOnExecutionPhaseChange = False
        opts.batchSerializationSettings.concatOnPipelineStageChange = False
        opts.batchSerializationSettings.batchSchedule = batch_schedule
        # Related execution phase setting
        opts.executionPhaseSettings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand

    # Streaming memory
    offChipLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    offChipRtsLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    opts.activationTensorLocationSettings = offChipLocation
    opts.weightTensorLocationSettings = offChipRtsLocation
    opts.optimizerStateTensorLocationSettings = offChipRtsLocation

    proto = builder.getModelProto()

    with tu.create_test_device(num_replicas * num_ipus,
                               pattern=popart.SyncPattern.Full) as device:

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, dfAnchors),
            optimizer=optimizer,
            loss=final_loss,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        for i in range(num_iterations):
            ip_data = np.random.rand(num_replicas, batch_size, dsize,
                                     dsize).astype(np.float32)
            stepio = popart.PyStepIO({ip: ip_data}, anchors)
            session.run(stepio)

        cycles = session.getCycleCount()

        print("anchors:")
        print(anchors)
        session.modelToHost(str(tmpdir / model_file_name))

        return cycles
    def run_test(compute_batch, batch_serialization_factor,
                 accumulation_factor, replication_factor, explicit_loops):

        proto, data, xs, loss = model(compute_batch,
                                      batch_serialization_factor,
                                      accumulation_factor, replication_factor)

        options = popart.SessionOptions()
        patterns = popart.Patterns(popart.PatternsLevel.All)

        if optim is "SGD":
            optimizer = popart.SGD({
                "defaultLearningRate": (0.1, False),
                "lossScaling": (20, False)
            })
        elif optim is "SGDM1":
            optimizer = popart.SGD(
                {
                    "defaultLearningRate": (0.1, False),
                    "defaultMomentum": (0.9, False),
                    "defaultDampening": (0.1, False),  # to increase errors
                    "lossScaling": (20, False),
                },
                accumulatorAndMomentum=popart.SGDAccumulatorAndMomentum.Combined)
        elif optim == "SGDM2":
            optimizer = popart.SGD(
                {
                    "defaultLearningRate": (0.1, False),
                    "defaultMomentum": (0.9, False),
                    "defaultDampening": (0.1, False),  # to increase errors
                    "lossScaling": (20, False),
                },
                accumulatorAndMomentum=popart.SGDAccumulatorAndMomentum.Separate)
        elif optim == "ADAM":
            optimizer = popart.Adam(
                {
                    "defaultLearningRate": (0.1, False),
                    "defaultBeta1": (0.9, False),
                    "defaultBeta2": (0.999, False),
                    "lossScaling": (20, False),
                },
                mode=popart.AdamMode.AdamNoBias)  # to increase errors

        if explicit_loops:
            options.enableExplicitMainLoops = True
            options.aliasZeroCopy = True
            options.explicitRecomputation = True
            options.useHostCopyOps = True

        options.batchSerializationSettings.factor = batch_serialization_factor

        if batch_serialization_factor > 1 and batchserial == "Loop":
            options.batchSerializationSettings.method = popart.BatchSerializationMethod.Loop
            options.batchSerializationSettings.transformContext = popart.BatchSerializationTransformContext.Bwd

        options.accumulationAndReplicationReductionType = reduction

        if accumulation_factor > 1:
            options.enableGradientAccumulation = True
            options.accumulationFactor = accumulation_factor
            if reduction_type == "MeanRunning":
                options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Running
            if reduction_type == "MeanPost":
                options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Post
        if replication_factor > 1:
            options.enableReplicatedGraphs = True
            options.replicatedGraphCount = replication_factor

        device = tu.create_test_device(replication_factor,
                                       pattern=popart.SyncPattern.Full)

        dataFlow = popart.DataFlow(
            batches_per_step, {x: popart.AnchorReturnType("ALL")
                               for x in xs})

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        file_path = str(tmpdir / "model_test.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

        device.detach()

        return [anchors[x] for x in xs], post_proto
Example #15
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1

    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    builder = popart.Builder()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []

    for i in range(
            0, batches_per_step * num_local_replicas * accumulation_factor *
            compute_batch):
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]

    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    builder = popart.Builder()

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(popart.TensorInfo("UINT32", (compute_batch, )),
                                "l0")

    data = {}

    data[d0] = input_data.reshape((batches_per_step, num_local_replicas,
                                   accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])

    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL")
         for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(
            f"Setting RTS: {userOption}, num_total_replicas: {num_total_replicas} num_local_replicas: {num_local_replicas}"
        )
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        if num_total_replicas > num_local_replicas:
            locationSetting.location.shardingDomain = popart.CommGroup(
                popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     deviceInfo=deviceInfo,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer)

    session.prepareDevice()

    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    tmp_path = Path(args.tmpdir)
    tmp_path.mkdir(parents=True, exist_ok=True)
    file_path = str(tmp_path / args.filename)
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)
# beta 2 (Adam)
b2 = 0.999

# if this is set to, say, 100, the test fails; see T24563
testTilesPerIPU = 1216

sgd_optimizer = popart.SGD({
    "defaultLearningRate": (lr, False),
    "defaultWeightDecay": (wd, False)
})

adam_optimizer = popart.Adam({
    "defaultLearningRate": (lr, False),
    "defaultBeta1": (b1, False),
    "defaultBeta2": (b2, False),
    "defaultWeightDecay": (wd, False),
    "defaultEps": (1e-6, True),
})

grad_accl_prefix = popart.reservedAcclPrefix()


def get_micro_batch_size(accum_factor):
    """
    no data replication, so micro batch size = batch size / accumulation factor
    """
    if (batch_size % accum_factor) != 0:
        raise RuntimeError("accum_factor is not a factor of batch_size")

    micro_batch_size = batch_size // accum_factor
    return micro_batch_size
    def run_test(mode=None, verify=None):
        builder = popart.Builder()

        def norm(input_x):
            gamma = builder.addInitializedInputTensor(
                np.ones(hidden_size, np.float32), "Gamma")
            beta = builder.addInitializedInputTensor(
                np.zeros(hidden_size, np.float32), "Beta")
            return builder.aiGraphcore.groupnormalization(
                [input_x, gamma, beta], 1)[0]

        x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                      "x_in")

        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
        weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
        weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")

        with builder.virtualGraph(0), builder.pipelineStage(0):
            x_0 = builder.aiOnnx.matmul([x_in, weight_1])
            x_0 = norm(x_0)

            # If recomputeOutputs was used directly on `x_0` all 3 outputs
            # of groupnormalization would be stashed.
            # By using a checkpointOutput only 1 output will be stashed and the
            # rest will be recomputed.
            x_0 = builder.checkpointOutput([x_0])[0]

            x_1 = builder.aiOnnx.matmul([x_0, weight_2])
            x_1 = norm(x_1)
            x_1 = builder.aiOnnx.add([x_0, x_1])

            # This checkpoint should be redundant as x_1 will be stashed
            # at the start of stage1 on ipu1.
            x_1 = builder.checkpointOutput([x_1])[0]

        with builder.virtualGraph(1), builder.pipelineStage(1):
            o = builder.aiOnnx.matmul([x_1, weight_3])
            l1 = builder.aiGraphcore.l1loss([o], 0.1)

        proto = builder.getModelProto()

        dataFlow = popart.DataFlow(1, [
            o,
            popart.reservedGradientPrefix() + weight_1,
            popart.reservedGradientPrefix() + weight_2,
            popart.reservedGradientPrefix() + weight_3,
        ])

        opts = popart.SessionOptions()
        opts.enableOutlining = False
        opts.enablePipelining = True
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = gradient_accumulation
        opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
        if mode is not None:
            opts.autoRecomputation = mode
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.Adam({}),
                                         deviceInfo=tu.create_test_device(
                                             numIpus=2,
                                             opts={"compileIPUCode": False}))

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        inputs = {x_in: input_data}
        stepio = popart.PyStepIO(inputs, anchors)

        for _ in range(10):
            session.run(stepio)

        if verify is not None:
            verify(session, x_0)

        return anchors
    def create(self):
        self.iteration.learning_rate = self.optimizer_options[
            "defaultLearningRate"][0]

        if self.opt_type == "SGD":
            optimizer = popart.SGD(self.optimizer_options)
        elif self.opt_type == "ADAM":
            optimizer = popart.Adam(self.optimizer_options)
        elif self.opt_type == "ADAM_NO_BIAS":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.AdamNoBias)
        elif self.opt_type == "LAMB":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.Lamb)
        elif self.opt_type == "LAMB_NO_BIAS":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.LambNoBias)

        projection_scale_added = False
        weight_decay_tensor_list = []

        if self.execution_mode == "PIPELINE":
            for stage in self.tensors:

                specific_parameters = {}

                if self.lr_scaling:
                    default_lr, lr_is_const = self.optimizer_options[
                        "defaultLearningRate"]
                    specific_parameters["learningRate"] = (
                        default_lr * self.pipeline_stage_lr_scaling[stage],
                        lr_is_const)

                if self.momentum_scaling:
                    # Momentum values are scaled inverse to the pipeline_stage
                    if self.option_values["defaultMomentum"] != 0:
                        # This arithmetic will create FP rounding errors if momentum == 0.
                        momentum = 1 - (
                            (1 - self.option_values["defaultMomentum"]) *
                            self.pipeline_stage_momentum_scaling[stage])
                    else:
                        momentum = 0
                    specific_parameters["momentum"] = (momentum, True)

                    if self.option_values["defaultDampening"] != 0:
                        dampening = 1 - (
                            (1 - self.option_values["defaultDampening"]) *
                            self.pipeline_stage_dampening_scaling[stage])
                    else:
                        dampening = 0
                    specific_parameters["dampening"] = (dampening, True)

                for tensor_id in self.tensors[stage]:
                    if self.include_for_weight_decay(tensor_id):
                        specific_parameters["weightDecay"] = (
                            self.weight_decay, True)
                        weight_decay_tensor_list.append(tensor_id)

                    if self.squad_lr_scaling and "Squad" in tensor_id:
                        logger.debug(
                            f"Setting SQuAD LR scaling for tensor [{tensor_id}]: {self.squad_lr_scale}"
                        )
                        lr = specific_parameters.get(
                            "learningRate",
                            self.optimizer_options["defaultLearningRate"])
                        params = specific_parameters.copy()
                        params["learningRate"] = (lr[0] * self.squad_lr_scale,
                                                  lr[1])
                        optimizer.insertSpecific(tensor_id, params)
                    else:
                        optimizer.insertSpecific(tensor_id,
                                                 specific_parameters)

        else:
            for tensor_id in self.tensors[0]:
                if self.include_for_weight_decay(tensor_id):
                    specific_parameters = {
                        "weightDecay": (self.weight_decay, True)
                    }
                    weight_decay_tensor_list.append(tensor_id)
                    optimizer.insertSpecific(tensor_id, specific_parameters)

        if len(weight_decay_tensor_list) != 0:
            logger.debug(
                f" Weight decay of {self.weight_decay} applied to: {weight_decay_tensor_list}"
            )

        return optimizer
Example #19
def test_replicated_lamb_weight_update(tmpdir, isConst, reduction):
    # Test both const & non-const optimizer parameters
    optimizer_dict = {
        "defaultLearningRate": (0.005, isConst),
        "defaultBeta1": (0.7, isConst),
        "defaultBeta2": (0.8, isConst),
        "defaultWeightDecay": (0.1, isConst),
        "defaultEps": (1e-6, isConst),
        "lossScaling": (10.0, isConst),
    }

    # Off-chip, but no RTS (1x replica)
    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              batch_size=4,
              num_replicas=1,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)

    # Off-chip, but no RTS (2x replicas)
    run_model(tmpdir,
              'phased_replicated.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)

    # Weights and optimizer off-chip, RTS
    run_model(tmpdir,
              'phased_replicated_rws.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)

    # Weights and optimizer off-chip, accumulator off chip, RTS
    run_model(tmpdir,
              'phased_replicated_rws_acc.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=2,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)

    # Weights on-chip, non-RTS, optimizer state off-chip, RTS
    run_model(tmpdir,
              'phased_replicated_rws_acc_nw.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=2,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=onChipLocation,
              reduction=reduction)

    phased = onnx.load(str(tmpdir / 'phased.onnx'))
    phased_replicated = onnx.load(str(tmpdir / 'phased_replicated.onnx'))
    phased_replicated_rws = onnx.load(
        str(tmpdir / 'phased_replicated_rws.onnx'))
    phased_replicated_rws_acc = onnx.load(
        str(tmpdir / 'phased_replicated_rws_acc.onnx'))
    phased_replicated_rws_acc_nw = onnx.load(
        str(tmpdir / 'phased_replicated_rws_acc_nw.onnx'))

    check_model(phased, phased_replicated)
    check_model(phased, phased_replicated_rws)
    check_model(phased, phased_replicated_rws_acc)
    check_model(phased, phased_replicated_rws_acc_nw)
Example #20
    def run_test(compute_batch, batch_serialization_factor,
                 accumulation_factor, replication_factor):
        proto, data, xs, loss = model(compute_batch,
                                      batch_serialization_factor,
                                      accumulation_factor, replication_factor)

        options = popart.SessionOptions()
        patterns = popart.Patterns()

        if optim is "SGD":
            optimizer = popart.SGD({
                "defaultLearningRate": (0.1, False),
                "lossScaling": (20, False)
            })
        elif optim is "SGDM":
            optimizer = popart.SGD({
                "defaultLearningRate": (0.1, False),
                "defaultMomentum": (0.9, False),
                "defaultDampening": (0.1, False),  # to increase errors
                "lossScaling": (20, False),
            })
        elif optim is "ADAM":
            optimizer = popart.Adam(
                {
                    "defaultLearningRate": (0.1, False),
                    "defaultBeta1": (0.9, False),
                    "defaultBeta2": (0.999, False),
                    "lossScaling": (20, False),
                },
                mode=popart.AdamMode.AdamNoBias)  # to increase errors
        elif optim is "LAMB":
            optimizer = popart.Adam(
                {
                    "defaultLearningRate": (0.1, False),
                    "defaultBeta1": (0.9, False),
                    "defaultBeta2": (0.999, False),
                    "lossScaling": (20, False),
                },
                mode=popart.AdamMode.LambNoBias)  # to increase errors

        options.batchSerializationSettings.factor = batch_serialization_factor
        if accumulation_factor > 1:
            options.enableGradientAccumulation = True
            options.accumulationFactor = accumulation_factor
            options.accumulationReductionType = reduction
        if replication_factor > 1:
            options.enableReplicatedGraphs = True
            options.replicatedGraphCount = replication_factor

        device = tu.create_test_device(replication_factor,
                                       pattern=popart.SyncPattern.Full)

        dataFlow = popart.DataFlow(
            batches_per_step, {x: popart.AnchorReturnType("ALL")
                               for x in xs})

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        file_path = str(tmpdir / "model_test.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

        device.detach()

        return [anchors[x] for x in xs], post_proto
def test_pipelined_streaming_lamb(tmpdir):

    optimizer_dict = {
        "defaultLearningRate": (0.005, True),
        "defaultBeta1": (0.7, True),
        "defaultBeta2": (0.8, True),
        "defaultWeightDecay": (0.1, True),
        "defaultEps": (1e-6, True),
        "lossScaling": (10.0, True),
    }

    run_model(tmpdir,
              'normal.onnx',
              execution_mode="normal",
              num_layers=2,
              batch_size=12,
              num_replicas=1,
              num_iterations=5,
              enable_accum=False,
              accum_factor=1,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=onChipLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=onChipLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined_streaming.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined_streaming_rep.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined_streaming_rep_rts.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=onChipLocation)

    normal = onnx.load(str(tmpdir / 'normal.onnx'))
    pipelined = onnx.load(str(tmpdir / 'pipelined.onnx'))
    pipelined_streaming = onnx.load(str(tmpdir / 'pipelined_streaming.onnx'))
    pipelined_streaming_rep = onnx.load(
        str(tmpdir / 'pipelined_streaming_rep.onnx'))
    pipelined_streaming_rep_rts = onnx.load(
        str(tmpdir / 'pipelined_streaming_rep_rts.onnx'))

    check_model(normal, pipelined)
    check_model(normal, pipelined_streaming)
    check_model(normal, pipelined_streaming_rep)
    check_model(normal, pipelined_streaming_rep_rts)
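

# For context, a comparison helper like the check_model used above could be
# sketched as follows. This is a hypothetical illustration only (the real
# check_model is defined elsewhere in this file); it assumes the check simply
# compares the ONNX initializer tensors of the two protos within a tolerance.
def _check_model_sketch(lhs, rhs, margin=1e-7):
    import numpy as np
    from onnx import numpy_helper
    lhs_weights = {t.name: numpy_helper.to_array(t) for t in lhs.graph.initializer}
    rhs_weights = {t.name: numpy_helper.to_array(t) for t in rhs.graph.initializer}
    assert lhs_weights.keys() == rhs_weights.keys()
    for name, lhs_value in lhs_weights.items():
        assert np.allclose(lhs_value, rhs_weights[name], atol=margin), name
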
Exemple #22
0
    if conf.optimizer == 'SGD':
        optimizer_dict = {"defaultLearningRate": (conf.init_lr, False),
                          "defaultWeightDecay": (0, True)}
        logger.info("Creating SGD optimizer: {}".format(json.dumps(optimizer_dict)))
        optimizer = popart.SGD(optimizer_dict)
    elif conf.optimizer == 'Adam':
        optimizer_dict = {
            "defaultLearningRate": (conf.init_lr, True),
            "defaultBeta1": (conf.beta1, True),
            "defaultBeta2": (conf.beta2, True),
            "defaultWeightDecay": (0.0, True),
            "defaultEps": (conf.adam_eps, True),
            "lossScaling": (1.0, True),
        }
        logger.info("Creating Adam optimizer: {}".format(json.dumps(optimizer_dict)))
        optimizer = popart.Adam(optimizer_dict)
    else:
        logger.info("Not a valid optimizer option: {}".format(conf.optimizer))
        sys.exit(-1)

    # create training session
    logger.info("Creating the training session")
    training_session, anchors = \
        conf_utils.create_session_anchors(proto,
                                          ctc_neg_log_likelihood,
                                          device,
                                          dataflow,
                                          session_options,
                                          training=True,
                                          optimizer=optimizer)
    logger.info("Sending weights from Host")


def bwd_graph(popart_model,
              torch_model,
              popart_loss_fn,
              torch_loss_fn,
              mapping=None,
              transform=None,
              replication_factor=1,
              replicated_tensor_sharding=False,
              opt_type="SGD"):
    np.random.seed(1984)
    random.seed(1984)
    torch.manual_seed(1984)

    #  ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (replication_factor, config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.sequence_length,
                          (replication_factor, config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2, (replication_factor, config.micro_batch_size *
                                 config.sequence_length)).astype(np.uint32)
    }
    num_reps = 5
    output = popart_model.build_graph(indices, positions, segments)
    ipus = popart_model.total_ipus

    loss = popart_loss_fn(output)

    proto = builder.getModelProto()

    if opt_type == "SGD":
        optimizer = popart.ConstSGD(1e-3)
    elif opt_type == "LAMB":
        optMap = {
            "defaultLearningRate": (1e-3, True),
            "defaultBeta1": (0.9, True),
            "defaultBeta2": (0.999, True),
            "defaultWeightDecay": (0.0, True),
            "maxWeightNorm": (10.0, True),
            "defaultEps": (1e-8, True),
            "lossScaling": (1.0, True),
        }
        optimizer = popart.Adam(optMap, mode=popart.AdamMode.Lamb)
    elif opt_type == "LAMB_NO_BIAS":
        optMap = {
            "defaultLearningRate": (1, False),
            "defaultBeta1": (0, False),
            "defaultBeta2": (0, False),
            "defaultWeightDecay": (0.0, False),
            "defaultEps": (1e-8, False),
            "lossScaling": (1.0, False),
        }
        optimizer = popart.Adam(optMap, mode=popart.AdamMode.LambNoBias)
    else:
        raise ValueError(f"Unknown opt_type={opt_type}")

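    # run_py builds and runs the PopART training session num_reps times and
    # returns the anchored outputs together with the post-training model proto.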
    outputs, post_proto = run_py(
        proto,
        data,
        output,
        loss=loss,
        optimizer=optimizer,
        replication_factor=replication_factor,
        replicated_tensor_sharding=replicated_tensor_sharding,
        ipus=ipus,
        num_reps=num_reps)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids":
        data[indices].reshape(replication_factor * config.micro_batch_size,
                              config.sequence_length).astype(np.int32),
        "position_ids":
        data[positions].reshape(replication_factor * config.micro_batch_size,
                                config.sequence_length).astype(np.int32),
        "token_type_ids":
        data[segments].reshape(replication_factor * config.micro_batch_size,
                               config.sequence_length).astype(np.int32)
    }

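    # Build the PopART->PyTorch tensor-name mapping and weight transforms, then
    # use them below to copy the ONNX weights into the torch model.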
    torch_to_onnx = get_mapping(config, init=mapping)

    transform_weights = get_transform(config, init=transform)

    #  ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx, transform_weights)

    if opt_type == "SGD":
        optim = torch.optim.SGD(torch_model.parameters(),
                                1e-3,
                                weight_decay=0.0,
                                momentum=0.0)
    elif opt_type == "LAMB":
        optim = torch_lamb.Lamb(torch_model.parameters(),
                                lr=1e-3,
                                weight_decay=0.0,
                                biasCorrection=True)
    elif opt_type == "LAMB_NO_BIAS":
        # Mirror the PopART LambNoBias branch above (an assumed match: lr=1 with
        # bias correction disabled); without this branch `optim` would be unbound.
        optim = torch_lamb.Lamb(torch_model.parameters(),
                                lr=1,
                                weight_decay=0.0,
                                biasCorrection=False)
    else:
        raise ValueError(f"Unknown opt_type={opt_type}")

    for _ in range(num_reps):
        torch_outputs = torch_model(
            **{k: torch.from_numpy(t).long()
               for k, t in inputs.items()})
        torch_loss = torch_loss_fn(torch_outputs)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

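    # Compare the PyTorch outputs and final weights against the PopART results;
    # both frameworks were stepped num_reps times with matching optimizers.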
    check_tensors([output.detach().numpy() for output in torch_outputs],
                  outputs,
                  margin=1.5e-06)

    check_model(torch_model,
                post_proto,
                torch_to_onnx,
                transform_weights,
                margin=5e-5)
Exemple #24
0
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    proto, data, x, loss = model(splits=splits)
    # Patterns under test; the TiedGather patterns are only enabled when requested.
    extraPatterns = []
    if include_patterns:
        extraPatterns += ["TiedGatherPattern", "TiedGatherAccumulatePattern"]
    patterns = popart.Patterns()
    for extraPattern in extraPatterns:
        patterns.enablePattern(extraPattern, True)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
    }

    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, True),
                "lossScaling": (20, False),
            },
            mode=popart.AdamMode.LambNoBias
        )  # NoBias to increase the error of incorrect gradients
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorStorage.OffChip, 0)
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, False)
        })

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)
    else:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)
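

# A minimal usage sketch (illustrative only, assuming run_py returns the anchored
# outputs and the post-training proto, as in bwd_graph above): build the training
# session with the Lamb optimizer and the TiedGather patterns, without executing.
if __name__ == "__main__":
    result = session(train=True, skip_execution=True, optim="Lamb")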