def __init__(self, learning_rate=0.001, momentum=0.0005, weight_decay=0.0001,
             use_locking=False, name='Momentum', use_nesterov=False,
             clip_norm=None, lossScaling=1.0, specific_dic={}):
    assert use_locking is False
    assert use_nesterov is False
    self.learning_rate = learning_rate
    self.name = name
    self.clip_norm = clip_norm
    self.lossScaling = lossScaling
    self.opti_cfg = {
        "defaultLearningRate": (self.learning_rate, False),
        "defaultMomentum": (momentum, True),
        "defaultWeightDecay": (weight_decay, True),
    }
    if self.lossScaling != 1.0:
        self.opti_cfg['lossScaling'] = (self.lossScaling, True)
    if clip_norm is not None:
        print('clip norm gradients:', clip_norm)
        self.gc_optimizer = popart.SGD(
            self.opti_cfg,
            clip_norm_settings=[
                popart.ClipNormSettings.clipAllWeights(clip_norm)
            ])
    else:
        self.gc_optimizer = popart.SGD(self.opti_cfg)
    for name in specific_dic:
        self.gc_optimizer.insertSpecific(name, specific_dic[name])
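# A minimal usage sketch of the wrapper above (hedged: the enclosing class
# name is not shown in this file, so "Optimizer" and the tensor ID below are
# placeholders, not names taken from the surrounding code). Per-tensor
# overrides are passed via specific_dic, mapping tensor IDs to PopART
# hyper-parameter dicts of (value, isConst) tuples.
opt = Optimizer(learning_rate=0.001,
                momentum=0.9,
                weight_decay=0.0001,
                clip_norm=1.0,
                lossScaling=128.0,
                specific_dic={"conv1/weight": {"learningRate": (0.0005, False)}})
# opt.gc_optimizer is the popart.SGD instance handed to popart.TrainingSession.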
def test_sgd_param_check():
    """
    In this test we check that the learning rate tensor, returned as an
    anchor, matches the value supplied to the optimizer constructor
    """
    lrName = popart.reservedDefaultScaledLearningRate0Prefix() + "FLOAT"
    wdName = popart.reservedDefaultWeightDecayScaleFactor0Prefix() + "FLOAT"
    lsName = popart.reservedLossScalingPrefix() + "FLOAT"

    anchorNames = {
        lrName: popart.AnchorReturnType("All"),
        wdName: popart.AnchorReturnType("All"),
        lsName: popart.AnchorReturnType("All")
    }

    # Just a placeholder optimizer. We overwrite the hyper-parameters in this
    # test once the session is created
    userSGD = popart.SGD({
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.6, False),
        "lossScaling": (10.0, False)
    })
    stepSize = 2

    session, inputsUserSgd = trainSession(anchorNames, userSGD, stepSize)
    anchorsArrays = session.initAnchorArrays()

    # train
    numSteps = 3
    learningRate = np.random.rand(numSteps).astype('float32')
    weightDecay = np.random.rand(numSteps).astype('float32')
    lossScaling = np.random.rand(numSteps).astype('float32')

    for step in range(numSteps):
        # Update learning rate parameter between training steps
        stepLr = learningRate[step]
        stepWd = weightDecay[step]
        stepLs = lossScaling[step]
        session.updateOptimizerFromHost(
            popart.SGD({
                "defaultLearningRate": (stepLr, False),
                "defaultWeightDecay": (stepWd, False),
                "lossScaling": (stepLs, False)
            }))

        stepio = popart.PyStepIO(inputsUserSgd, anchorsArrays)
        session.run(stepio)

        assert (np.array_equal(anchorsArrays[lsName][0], stepLs))

        scaled = (stepLr / stepLs)
        assert (np.array_equal(anchorsArrays[lrName][0], scaled))

        # The weight decay tensor is scaled by lr on the host
        # before training
        scaled = 1 - (stepWd * stepLr)
        assert (np.allclose(anchorsArrays[wdName][0], scaled))
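# The assertions on the anchors above encode the host-side compound scalars:
# the scaled learning rate anchor holds lr / lossScaling and the weight decay
# scale factor anchor holds 1 - wd * lr. A standalone restatement of that
# arithmetic (plain NumPy, values illustrative; this mirrors the test's own
# assertions rather than a separate API):
import numpy as np

lr, wd, ls = 0.1, 0.01, 8.0
scaled_learning_rate = lr / ls            # value of the scaled-LR anchor
weight_decay_scale_factor = 1 - wd * lr   # value of the WD scale-factor anchor
assert np.isclose(scaled_learning_rate, 0.0125)
assert np.isclose(weight_decay_scale_factor, 0.999)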
def getOptimizers():
    optimizers = []

    # SGD
    sgd0 = popart.SGD({
        "lossScaling": (10.0, False),
        "defaultMomentum": (0.5, False),
        "defaultVelocityScaling": (0.5, False),
        "defaultDampening": (0.5, False),
        "defaultWeightDecay": (0.5, False)
    })
    sgd1 = popart.SGD({
        "lossScaling": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultVelocityScaling": (0.2, False),
        "defaultDampening": (0.2, False),
        "defaultWeightDecay": (0.2, False)
    })
    optimizers.append([sgd0, sgd1])

    # Adam
    adam0 = popart.Adam({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultBeta1": (0.5, False),
        "defaultBeta2": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adam1 = popart.Adam({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultBeta1": (0.2, False),
        "defaultBeta2": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adam0, adam1])

    # Adaptive
    adaptive0 = popart.Adaptive({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultAlpha": (0.5, False),
        "defaultMomentum": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adaptive1 = popart.Adaptive({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultAlpha": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adaptive0, adaptive1])

    return optimizers
def test_replicated_sgd1_weight_update(tmpdir):
    optimizer_dict = {
        "defaultLearningRate": (0.00001, False),
        "defaultMomentum": (0.9, False),
        "defaultDampening": (0.2, False),
        "defaultVelocityScaling": (0.1, False),
        "lossScaling": (1.0, True),
        "defaultWeightDecay": (0.2, True)
    }

    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              optimizer=popart.SGD(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.SGD(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated_rws.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.SGD(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipRtsLocation)

    phased = onnx.load(str(tmpdir / 'phased.onnx'))
    phased_replicated = onnx.load(str(tmpdir / 'phased_replicated.onnx'))
    phased_replicated_rws = onnx.load(
        str(tmpdir / 'phased_replicated_rws.onnx'))

    check_model(phased, phased_replicated)
    check_model(phased, phased_replicated_rws)
def test_sgd_mixed_mode_0(tmpdir):
    # optimizer parameters
    defaultLearningRate = 1e-4
    defaultMomentum = 0.7
    defaultVelocityScaling = 1.0
    defaultWeightDecay = 0.1
    defaultDampening = 0.05
    lossScaling = 10.0

    optMaps = [{
        0:
        popart.SGD({
            "defaultLearningRate": (defaultLearningRate, True),
            "defaultMomentum": (defaultMomentum, True),
            "defaultVelocityScaling": (defaultVelocityScaling, True),
            "defaultWeightDecay": (defaultWeightDecay, True),
            "defaultDampening": (defaultDampening, True),
            "lossScaling": (lossScaling, True),
        })
    }]
    outlining = [False]

    # Flip one hyper-parameter at a time to non-const (False), first with
    # outlining disabled, then repeated with outlining enabled.
    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultMomentum": (defaultMomentum, i != 1),
            "defaultVelocityScaling": (defaultVelocityScaling, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultDampening": (defaultDampening, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.SGD(optMap)}]
        outlining = outlining + [False]

    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultMomentum": (defaultMomentum, i != 1),
            "defaultVelocityScaling": (defaultVelocityScaling, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultDampening": (defaultDampening, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.SGD(optMap)}]
        outlining = outlining + [True]

    run_sgd_mixed_mode(10, optMaps, outlining, tmpdir, np.float32)
    run_sgd_mixed_mode(10, optMaps, outlining, tmpdir, np.float16)
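# Each hyper-parameter above is a (value, isConst) pair: True bakes the value
# into the compiled program as a constant, False keeps it as a tensor that
# can be updated from the host between runs. A minimal sketch of the
# convention (values illustrative):
opt = popart.SGD({
    "defaultLearningRate": (1e-4, False),  # updatable between steps
    "defaultMomentum": (0.7, True),        # fixed at compile time
})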
def test_sgd_with_zero_learning_rate():
    """
    In this test we check that we can run a training step with a zero
    learning rate, and that it behaves as expected (i.e. no weight update)
    """
    # Let's start with an optimizer with a variable, non-zero learning rate
    optSettings = {
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.6, False),
        "lossScaling": (10.0, False)
    }
    stepSize = 2
    session, inputsUserSgd = trainSession({}, popart.SGD(optSettings),
                                          stepSize)
    anchorsArrays = session.initAnchorArrays()

    # Get the initial weights:
    fn = "init.onnx"
    session.modelToHost(fn)
    wId = "init_input"
    weights = {wId: np.empty(shape=[2, 2, 3, 3], dtype=np.float32)}
    weightsio = popart.PyWeightsIO(weights)
    session.readWeights(weightsio)
    init_weights = np.copy(weights[wId])

    # Run for a step with non-zero lr, observe that the weights have changed
    stepio = popart.PyStepIO(inputsUserSgd, anchorsArrays)
    session.run(stepio)
    session.weightsToHost()
    session.readWeights(weightsio)
    updated_weights = np.copy(weights[wId])
    assert np.array_equal(init_weights, updated_weights) is False

    # Update optimizer with zero lr (only valid if variable)
    optSettings["defaultLearningRate"] = (0.0, True)
    with pytest.raises(popart.popart_exception) as e_info:
        session.updateOptimizerFromHost(popart.SGD(optSettings))
    assert e_info.value.args[0].startswith(
        "Constant, zero learning rate in SGD")

    # Run a training step, and confirm the weights haven't updated
    optSettings["defaultLearningRate"] = (0.0, False)
    session.updateOptimizerFromHost(popart.SGD(optSettings))
    session.run(stepio)
    session.weightsToHost()
    session.readWeights(weightsio)
    assert np.array_equal(weights[wId], updated_weights)
def create(self):
    self.iteration.learning_rate = self.optimizer_options[
        "defaultLearningRate"][0]

    optimizer = popart.SGD(self.optimizer_options)

    for stage in self.pipeline_stage_tensors:
        specific_parameters = {}
        if self.lr_scaling:
            default_lr, lr_is_const = self.optimizer_options[
                "defaultLearningRate"]
            specific_parameters["learningRate"] = (
                default_lr * self.pipeline_stage_lr_scaling[stage],
                lr_is_const)
        if self.momentum_scaling:
            # Momentum values are scaled inverse to the pipeline_stage
            momentum = 1 - ((1 - self.option_values["defaultMomentum"]) *
                            self.pipeline_stage_momentum_scaling[stage])
            specific_parameters["momentum"] = (momentum, True)

            dampening = 1 - ((1 - self.option_values["defaultDampening"]) *
                             self.pipeline_stage_dampening_scaling[stage])
            specific_parameters["dampening"] = (dampening, True)

        for tensor_id in self.pipeline_stage_tensors[stage]:
            optimizer.insertSpecific(tensor_id, specific_parameters)
    return optimizer
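# Worked example of the inverse momentum scaling used in create() above: the
# scaling acts on (1 - momentum), so a stage scale of 0.5 moves momentum half
# way towards 1 (values illustrative).
default_momentum = 0.9
stage_scale = 0.5
stage_momentum = 1 - (1 - default_momentum) * stage_scale
assert abs(stage_momentum - 0.95) < 1e-12  # 1 - 0.1 * 0.5 = 0.95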
def test_implicit_recompute_op_scheduled_pre_loss_no():
    """
    Regression test for T36828. Confirm that compilation completes without
    an exception being thrown.

    It is possible that the MulGrad op that produces Gradient___t3 is
    scheduled early (e.g. at index 0 in the schedule). If this happens, all
    ops after it in the schedule are classified as 'post loss'.

    The matmul operation is recomputed in the backwards pass. The implicit
    recomputation setting forbids an op that is marked for recomputation
    from being a 'post loss' op.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1 = builder.addInitializedInputTensor(
        np.random.rand(2, 2).astype(np.float32))
    t2 = builder.aiOnnx.matmul([t0, t1])
    t3 = builder.aiGraphcore.l1loss([t2], 0.1)

    const = np.array([4]).astype(np.float32)
    t5 = builder.aiOnnx.constant(const)
    t6 = builder.aiOnnx.mul([t3, t5])

    builder.recomputeOutputInBackwardPass(t2)

    session = popart.TrainingSession(deviceInfo=tu.create_test_device(),
                                     fnModel=builder.getModelProto(),
                                     dataFlow=popart.DataFlow(1, []),
                                     loss=t6,
                                     optimizer=popart.SGD(
                                         {"lossScaling": (2.0, False)}))
    session.prepareDevice()
def test_optimizer_state_tensor_location_settings():
    # Check optimizer state tensor location settings work.
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    ir = get_ir(optimizer_state_tensor_location_settings=None,
                optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1', 'Accl___W2', 'Accl___W0'],
             check_offchip=[])

    ir = get_ir(
        optimizer_state_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=[],
             check_offchip=['Accl___W1', 'Accl___W2', 'Accl___W0'])

    ir = get_ir(
        optimizer_state_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1', 'Accl___W2', 'Accl___W0'],
             check_offchip=[])
def step(session, anchors, data, update_optimizer_lr=None):
    if update_optimizer_lr is not None:
        optimizer = popart.SGD(update_optimizer_lr)
        session.updateOptimizer(optimizer)
    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)
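# A possible call site for the helper above (hedged: session, anchors and
# data are assumed to come from the surrounding training setup; the dict is
# a full optimizer settings dict, since it is passed straight to popart.SGD).
step(session, anchors, data,
     update_optimizer_lr={"defaultLearningRate": (0.01, False)})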
def test_auto_loss_scaling_with_mixed_precision_trackable_tensors():
    """
    Create a Session with automatic loss scaling and a model that contains
    both fp32 and fp16 initializers, and see that no incompatibility error
    is thrown.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1_data = np.random.rand(2, 2).astype(np.float32)
    t1 = builder.addInitializedInputTensor(t1_data)
    mm0 = builder.aiOnnx.matmul([t0, t1])
    t2 = builder.aiOnnx.cast([mm0], "FLOAT16")
    t3 = builder.addInputTensor("FLOAT16", [2, 2])
    mm1 = builder.aiOnnx.matmul([t2, t3])
    loss = builder.aiGraphcore.identityloss([mm1])

    optimizer = popart.SGD({"lossScaling": (2, False)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    session = popart.TrainingSession(builder.getModelProto(),
                                     deviceInfo=tu.create_test_device(),
                                     dataFlow=popart.DataFlow(1, [loss]),
                                     loss=loss,
                                     optimizer=optimizer,
                                     userOptions=opts)
    session.prepareDevice()
def adj_lr(self, lr, sess, specific_dic={}):
    self.opti_cfg['defaultLearningRate'] = (lr, False)
    new_optimizer = popart.SGD(self.opti_cfg)
    for name in specific_dic:
        new_optimizer.insertSpecific(name, specific_dic[name])
    sess.updateOptimizerFromHost(new_optimizer)
    self.gc_optimizer = new_optimizer
def test_auto_loss_scaling_and_continuous_update_pipelining():
    """
    Create a Session with automatic loss scaling and pipelining enabled, but
    gradient accumulation disabled, and see that an incompatibility error is
    thrown.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    mm0 = builder.aiOnnx.matmul([t0, t0])
    loss = builder.aiGraphcore.identityloss([mm0])
    optimizer = popart.SGD({"lossScaling": (2, False)})

    builder.virtualGraph(mm0, 0)
    builder.virtualGraph(loss, 0)

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2
    opts.enablePipelining = True
    opts.enableGradientAccumulation = False
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(2),
                                         dataFlow=popart.DataFlow(1, [loss]),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)
    assert e_info.value.args[0].endswith(
        "Automatic loss scaling is not currently supported when the "
        "'enablePipelining' SessionOption is set to 'true', but the "
        "'enableGradientAccumulation' SessionOption is set to 'false'")
def test_auto_loss_scaling_with_no_tracked_tensors():
    """
    Build a model with ops, the outputs of which the auto loss scale
    transform does not decide to 'track'. Observe an error from the auto
    loss scale transform.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    out = builder.aiOnnx.relu([t0])
    loss = builder.aiGraphcore.identityloss([out])
    optimizer = popart.SGD({"lossScaling": (2, False)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(),
                                         dataFlow=popart.DataFlow(1, [loss]),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)
    assert e_info.value.args[0].endswith("No tracked tensors were found")
def test_auto_loss_scaling_with_const_loss_scale_tensor():
    """
    Create a session with auto loss scaling enabled, and with an optimizer
    with a constant loss scale value. Observe an error from the auto loss
    scale transform.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1_data = np.random.rand(2, 2).astype(np.float32)
    t1 = builder.addInitializedInputTensor(t1_data)
    out = builder.aiOnnx.matmul([t0, t1])
    loss = builder.aiGraphcore.identityloss([out])

    makeLossScalingTensorConst = True
    optimizer = popart.SGD({"lossScaling": (2, makeLossScalingTensorConst)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(),
                                         dataFlow=popart.DataFlow(1, []),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)
    assert e_info.value.args[0].endswith(
        "The optimizer must have non-const loss scaling")
def test_accumulator_tensor_location_settings_plus_override():
    # Check optimizer state tensor location settings work
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OnChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1'],
             check_offchip=['Accl___W2', 'Accl___W0'])

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OffChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W2', 'Accl___W0'],
             check_offchip=['Accl___W1'])
def session(train=False, skip_execution=False, include_patterns=True,
            splits=1, outline=False, optim="Sgd"):
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy":
        popart.MeanReductionStrategy.Running
    }

    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultWeightDecay": (0.1, True),
                "defaultBeta1": (0.1, True),
                "defaultBeta2": (0.1, True),
                "lossScaling": (20, True),
            },
            mode=popart.AdamMode.LambNoBias)  # NoBias to increase the error of incorrect gradients
        user_options["optimizerStateTensorLocationSettings"] = \
            popart.TensorLocationSettings(
                popart.TensorLocation(popart.TensorStorage.OffChip,
                                      popart.ReplicatedTensorSharding.On),
                0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        ipus = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            "defaultDampening": (0, True),  # 0 dampening to increase the error of incorrect gradients
            "lossScaling": (20, True)
        })
        ipus = 1

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)
    else:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)
def run_model(builder_fn, steps, seed, training=True,
              options=popart.SessionOptions()):
    """
    Helper function that runs a model and returns the anchors.

    builder_fn - a function that takes a PopART builder and returns a tuple
        comprising a loss, a dictionary of inputs and a dictionary that maps
        python variable names to PopART tensor IDs for anchors.
    steps - number of batches per step
    seed - random seed to pass to the PopART session.

    Returns a named tuple with .anchors being the anchors and .seed being
    the seed used.
    """
    builder = popart.Builder()
    loss, inputs, random_outs = builder_fn(builder)
    dataFlow = popart.DataFlow(
        steps,
        {op[1]: popart.AnchorReturnType("ALL")
         for op in random_outs.items()})
    proto = builder.getModelProto()
    optimizer = popart.SGD({"defaultLearningRate": (0.1, True)})
    patterns = popart.Patterns()

    device = tu.create_test_device(1, pattern=popart.SyncPattern.Full)

    if training:
        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          dataFlow=dataFlow,
                                          userOptions=options,
                                          patterns=patterns,
                                          deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    session.setRandomSeed(seed)

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO(inputs, anchors)
    session.run(stepio)
    return Run(anchors=anchors, seed=seed, steps=steps,
               random_outs=random_outs)
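# A minimal builder_fn satisfying the contract documented in run_model above
# (hedged sketch: the shapes, the dropout op and all names here are
# illustrative, not taken from the original tests).
def example_builder_fn(builder):
    t0 = builder.addInputTensor(popart.TensorInfo("FLOAT", [2, 2]))
    # Dropout gives run-to-run random output, which is what run_model anchors.
    d0 = builder.aiOnnx.dropout([t0], 1, 0.5)[0]
    loss = builder.aiGraphcore.identityloss([d0])
    inputs = {t0: np.random.rand(2, 2).astype(np.float32)}
    random_outs = {"dropout_out": d0}
    return loss, inputs, random_outs

# e.g. run_model(example_builder_fn, steps=1, seed=42)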
def train_builder(opts):
    builder, data, outputs, loss, __ = eval_builder(opts)

    return [
        builder,
        data,
        outputs,
        loss,
        popart.SGD(0.01)
    ]
def create(self):
    self.iteration.learning_rate = self.option_values["defaultLearningRate"]

    if self.opt_type == "SGD":
        optimizer = popart.SGD(self.optimizer_options)
    elif self.opt_type == "ADAM":
        optimizer = popart.Adam(
            self.optimizer_options,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
    elif self.opt_type == "ADAM_NO_BIAS":
        optimizer = popart.Adam(
            self.optimizer_options,
            mode=popart.AdamMode.AdamNoBias,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
    elif self.opt_type == "LAMB":
        optimizer = popart.Adam(
            self.optimizer_options,
            mode=popart.AdamMode.Lamb,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
    elif self.opt_type == "LAMB_NO_BIAS":
        optimizer = popart.Adam(
            self.optimizer_options,
            mode=popart.AdamMode.LambNoBias,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)

    weight_decay_tensor_list = []

    for stage, tensors in self.tensors.items():
        for tensor_id in tensors:
            params = self.option_values.copy()

            if self.include_for_weight_decay(tensor_id):
                params["weightDecay"] = self.weight_decay
                weight_decay_tensor_list.append(tensor_id)
            else:
                params["weightDecay"] = 0

            if self.disable_lamb(tensor_id):
                params["maxWeightNorm"] = 0

            for transform in self.transforms:
                params = transform(tensor_id, params, stage)

            specific_params = {
                k: v
                for k, v in params.items() if k not in self.option_values
            }
            if specific_params:
                p = self._make_tuple_options(specific_params)
                optimizer.insertSpecific(tensor_id, p)

    if len(weight_decay_tensor_list) != 0:
        logger.debug(
            f" Weight decay of {self.weight_decay} applied to: {weight_decay_tensor_list}")

    return optimizer
def test_incomplete_grad():
    # Reproducer for T37001, included as regression test. This test doesn't
    # actually check any assertions, it just ensures that a code path that
    # previously failed does not result in any exceptions.
    #
    # The problem originally revealed by this test was that an exception was
    # thrown if, for some inputs of a fwd subgraph, the backwards pass
    # creator was not able to create gradients for those inputs (for example
    # for a seed input). This problem was fixed in the code base by allowing
    # subgraph inputs in the fwd subgraph to not have an associated gradient
    # output in the associated bwd subgraph.
    def get_subgraph_builder(builder, weights, labels):
        subgraph_builder = builder.createSubgraphBuilder()
        subgraph_builder.addInputTensorFromParentGraph(weights)
        input = subgraph_builder.addInputTensor(
            popart.TensorInfo("FLOAT16", [4, 32, 1, 64]))
        subgraph_builder.addInputTensorFromParentGraph(labels)

        matmul_out = subgraph_builder.aiOnnx.matmul([input, weights])
        log_probs = subgraph_builder.aiOnnx.logsoftmax([matmul_out], axis=3)
        log_probs_compact = subgraph_builder.aiOnnx.gather(
            [log_probs, labels], axis=3)
        subgraph_builder.addOutputTensor(log_probs_compact)
        return subgraph_builder

    builder = popart.Builder()
    float16_input = builder.addInputTensor(
        popart.TensorInfo("FLOAT16", [4, 32, 1, 64]), "float16_input")
    int32_input = builder.addInputTensor(popart.TensorInfo("INT32", [4, 2]),
                                         "int32_input")
    weights = builder.addInitializedInputTensor(
        np.zeros([64, 64], np.float16), "weights")
    fn = get_subgraph_builder(builder, weights, int32_input)
    log_probs_compact = builder.aiGraphcore.call(
        [weights, float16_input, int32_input], 1, fn)[0]
    l1_loss = builder.aiGraphcore.l1loss([log_probs_compact], 1.0)
    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, False),
        "defaultWeightDecay": (0, True)
    })

    training_session = popart.TrainingSession(
        builder.getModelProto(),
        loss=l1_loss,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}),
        optimizer=optimizer,
        dataFlow=popart.DataFlow(1, {}),
        userOptions=popart.SessionOptions())
def create(self):
    self.iteration.learning_rate = self.optimizer_options[
        "defaultLearningRate"][0]

    optimizer = popart.SGD(self.optimizer_options)
    projection_scale_added = False

    for stage in self.pipeline_stage_tensors:
        specific_parameters = {}
        if self.lr_scaling:
            default_lr, lr_is_const = self.optimizer_options[
                "defaultLearningRate"]
            specific_parameters["learningRate"] = (
                default_lr * self.pipeline_stage_lr_scaling[stage],
                lr_is_const)
        if self.momentum_scaling:
            # Momentum values are scaled inverse to the pipeline_stage
            if self.option_values["defaultMomentum"] != 0:
                # This arithmetic will create FP rounding errors if
                # momentum == 0.
                momentum = 1 - ((1 - self.option_values["defaultMomentum"]) *
                                self.pipeline_stage_momentum_scaling[stage])
            else:
                momentum = 0
            specific_parameters["momentum"] = (momentum, True)

            if self.option_values["defaultDampening"] != 0:
                dampening = 1 - (
                    (1 - self.option_values["defaultDampening"]) *
                    self.pipeline_stage_dampening_scaling[stage])
            else:
                dampening = 0
            specific_parameters["dampening"] = (dampening, True)

        for tensor_id in self.pipeline_stage_tensors[stage]:
            # Special case for embedding/projection variable.
            if self.projection_lr_scaling and "Embedding_Dict" in tensor_id:
                lr = specific_parameters.get(
                    "learningRate",
                    self.optimizer_options["defaultLearningRate"])
                params = specific_parameters.copy()
                params["learningRate"] = (lr[0] * self.projection_lr_scale,
                                          lr[1])
                optimizer.insertSpecific(tensor_id, params)
                projection_scale_added = True
            else:
                optimizer.insertSpecific(tensor_id, specific_parameters)

    if self.projection_lr_scaling and not projection_scale_added:
        lr = self.optimizer_options["defaultLearningRate"]
        optimizer.insertSpecific(
            "Embedding/Embedding_Dict",
            {"learningRate": (lr[0] * self.projection_lr_scale, lr[1])})

    return optimizer
def run_test(aliaszerocopy):
    proto, data, x, loss = model()

    options = popart.SessionOptions()
    patterns = popart.Patterns()

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.9, True),
        "defaultDampening": (0, True)
    })

    options.enableOutlining = True
    options.outlineThreshold = -np.inf
    options.enableOutliningCopyCostPruning = False
    options.autoRecomputation = popart.RecomputationType.Standard
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    options.explicitRecomputation = True
    options.aliasZeroCopy = aliaszerocopy
    options.executionPhaseSettings.phases = 5
    request_ipus = 2

    device = tu.create_test_device(2, pattern=popart.SyncPattern.Full)

    dataFlow = popart.DataFlow(1, {x: popart.AnchorReturnType("ALL")})

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=options,
                                     loss=loss,
                                     optimizer=optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    file_path = str(tmpdir / "aliaszerocopy_model_test.onnx")
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)

    device.detach()

    graph_report = json.loads(session.getGraphReport())
    max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
    total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
    return anchors[x], post_proto, total_memory
def run_test(outlining):
    proto, data, x, loss = model()

    options = popart.SessionOptions()
    patterns = popart.Patterns()

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, True),
    })

    options.enableOutlining = outlining
    options.outlineThreshold = 10.0
    options.enableGradientAccumulation = True
    options.accumulationFactor = 4
    options.enableReplicatedGraphs = True
    options.replicatedGraphCount = 2
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    if pipeline:
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline

    device = tu.create_test_device(4)

    dataFlow = popart.DataFlow(1, {x: popart.AnchorReturnType("ALL")})

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=options,
                                     loss=loss,
                                     optimizer=optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    file_path = str(tmpdir / "outlining_execution_context_model.onnx")
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)

    device.detach()

    graph_report = json.loads(session.getGraphReport())
    max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
    total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
    return session, anchors[x], post_proto, total_memory
def createOptimizer(self):
    if not isinstance(self.optimizer, torch.optim.SGD):
        raise RuntimeError("PopART currently only accepts SGD optimizers.")
    elif self.optimizer.defaults["nesterov"]:
        raise RuntimeError("Nesterov momentum is currently not supported.")
    return popart.SGD({
        "defaultLearningRate": (self.optimizer.defaults["lr"], False),
        "defaultMomentum": (self.optimizer.defaults["momentum"], False),
        "defaultWeightDecay":
        (self.optimizer.defaults["weight_decay"], False),
        "defaultDampening": (self.optimizer.defaults["dampening"], False)
    })
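# Illustrative torch optimizer whose defaults would be mapped by
# createOptimizer above (hedged: the Linear model is a placeholder; only the
# keys lr, momentum, weight_decay, dampening and nesterov are read).
import torch

model = torch.nn.Linear(4, 4)
torch_sgd = torch.optim.SGD(model.parameters(),
                            lr=0.1,
                            momentum=0.9,
                            weight_decay=1e-4,
                            dampening=0.0,
                            nesterov=False)
# torch_sgd.defaults["lr"] == 0.1 etc. are the values createOptimizer reads.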
def test_sgd_with_float16_model():
    popart.getLogger().setLevel("TRACE")

    input1 = np.zeros((2, 2, 4, 4), dtype=np.float16)
    input2 = np.zeros((2, 2, 3, 3), dtype=np.float16)
    input3 = np.zeros((2, 2, 3, 3), dtype=np.float16)

    builder = popart.Builder()
    inid1 = builder.addInputTensor(popart.TensorInfo(input1))
    inid2 = builder.addInitializedInputTensor(input2)
    inid3 = builder.addInitializedInputTensor(input2)

    c1 = builder.aiOnnx.conv([inid1, inid2],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1])
    c2 = builder.aiOnnx.conv([c1, inid3],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1])

    # Reduce to scalar
    out = builder.aiGraphcore.identityloss([c2])

    proto = builder.getModelProto()

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, False),
        "defaultWeightDecay": (0.1, False),
        "lossScaling": (1000, False)
    })

    anchorNames = {
        popart.reservedGradientPrefix() + inid1:
        popart.AnchorReturnType("All"),
    }

    opts = popart.SessionOptions()

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=popart.DataFlow(1, anchorNames),
        loss=out,
        optimizer=optimizer,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    session.prepareDevice()
    session.weightsFromHost()

    anchorArrays = session.initAnchorArrays()

    stepio = popart.PyStepIO({inid1: input1}, anchorArrays)
    session.run(stepio)
def test_postnrepl_overzealous_elimination():
    # Reproducer for T36270, included as regression test. This test doesn't
    # actually do any assertions, it just checks that a code path that
    # previously failed does not result in any exceptions.
    #
    # The bug was that the PostNRepl pattern removed the gradient sum op
    # that produces Gradient_<in0> (which has 1 input) in the backwards
    # subgraph, also rewriting the subgraph itself to use the input to the
    # gradient sum op instead, as it's identical. However, the tensor
    # produced by the op is a graph output that is used by a call op in the
    # main graph. The pattern did not adjust this CallOp or the subgraph's
    # output tensors, and so the CallOp in the main graph fails because it's
    # using a tensor that no longer exists.
    def get_subgraph_builder(b, w):
        builder = b.createSubgraphBuilder()
        builder.addInputTensorFromParentGraph(w)
        in0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT16", [4, 32, 16, 64]))

        x = builder.aiOnnx.matmul([in0, w])
        builder.addOutputTensor(x)
        return builder

    # building model and dataflow
    builder = popart.Builder()
    in0 = builder.addInputTensor(popart.TensorInfo('FLOAT16', [4, 32, 1, 64]),
                                 "in0")
    w = builder.addInitializedInputTensor(np.zeros([64, 64], np.float16),
                                          "weights")

    fn = get_subgraph_builder(builder, w)
    x = builder.aiGraphcore.call([w, in0], 1, fn)[0]
    l1_loss = builder.aiGraphcore.l1loss([x], 1.0)

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, False),
        "defaultWeightDecay": (0, True)
    })
    device = popart.DeviceManager().createIpuModelDevice({})

    # create training session
    popart.TrainingSession(fnModel=builder.getModelProto(),
                           loss=l1_loss,
                           deviceInfo=device,
                           optimizer=optimizer,
                           dataFlow=popart.DataFlow(1, {}),
                           userOptions=popart.SessionOptions())
def test_gradient_accumulation_model_proto(tmpdir, explicit_loops):
    np.random.seed(1234)
    label_array = np.random.randint(0, hidden_size, batch_size)
    accl_initial_proto, accl_proto_filename, accl_anchor_arrays = run_mm_graph(
        # Using Momentum to create accl tensors.
        popart.SGD({
            "defaultLearningRate": (0.1, False),
            "defaultMomentum": (0.9, True)
        }),
        label_array=label_array,
        accum_factor=4,
        enable_accum=True,
        batches_per_step=5,
        number_of_steps=3,
        final_proto_filename=os.path.join(tmpdir, "accl5batches3steps"),
        enable_multi_ipu=False,
        full_anchorage=False,
        explicit_loops=explicit_loops)

    model = onnx.load(accl_proto_filename)
    names = [t.name for t in model.graph.initializer]

    grad_accl_names = []
    weight_names = []
    for name in names:
        if grad_accl_prefix in name:
            grad_accl_names.append(name)
        elif "weight" in name:
            weight_names.append(name)

    # Model should have 6 weight tensors
    assert len(weight_names) == 6
    assert len(grad_accl_names) == len(weight_names)

    tensor_mapping = {}
    for tensor in model.graph.initializer:
        tensor_mapping[tensor.name] = tensor

    rev_map = {}
    for w_name in weight_names:
        assert grad_accl_prefix + w_name in grad_accl_names
        rev_map[grad_accl_prefix + w_name] = w_name

    for g_a_name in grad_accl_names:
        weight_tensor = tensor_mapping[rev_map[g_a_name]]
        g_a_tensor = tensor_mapping[g_a_name]
        for d_i, v in enumerate(weight_tensor.float_data):
            # initialisation as per equations. When velocity scaling != 1
            # this will need changing : T12001
            assert g_a_tensor.float_data[d_i] - v * wd < 1e-8
def test_inplacing_phased_constraints(tmpdir):
    # This used to fail, see T23985
    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              num_layers=5,
              optimizer=popart.SGD({
                  "defaultLearningRate": (0.1, True),
                  "defaultMomentum": (0.0, False),
                  "defaultWeightDecay": (0.0, False),
                  "defaultDampening": (0.0, True)
              }),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
def init_session(proto, loss, dataFlow, userOpts, device):
    # Create a session to compile and execute the graph
    optimizer = popart.SGD({"defaultLearningRate": (0.1, False)})
    session = popart.TrainingSession(fnModel=proto,
                                     loss=loss,
                                     deviceInfo=device,
                                     optimizer=optimizer,
                                     dataFlow=dataFlow,
                                     userOptions=userOpts)
    session.prepareDevice()
    session.setRandomSeed(42)

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    return Session(session, anchors), optimizer