def compare_weights(session0, session1, tmpdir):
    """Assert that two sessions hold identical values for every initializer.

    The model of ``session0`` is written to ``tmpdir / "ref_session.onnx"``
    and loaded back with ``onnx`` to obtain the name, dims and data type of
    each initializer. The weights of both sessions are then copied to the
    host, read into per-session host buffers and compared with
    ``np.array_equal``.

    Args:
        session0: Session whose saved model defines which initializers are
            compared.
        session1: Session compared against ``session0``; its weights are
            read using the initializer names found in ``session0``'s model.
        tmpdir: Directory object supporting the ``/`` operator; the
            reference model file is written inside it.

    Raises:
        AssertionError: If any initializer differs between the sessions.
    """
    ref_path = str(tmpdir / "ref_session.onnx")
    session0.modelToHost(ref_path)
    session0_proto = onnx.load(ref_path)
    initializers = session0_proto.graph.initializer

    session0_weights = {}
    session1_weights = {}
    for init in initializers:
        dtype = mapping.TENSOR_TYPE_TO_NP_TYPE[init.data_type]
        # Each session needs its OWN buffer. Storing one shared array in
        # both dicts would let the second read overwrite the first, making
        # the comparison below trivially true.
        session0_weights[init.name] = np.empty(shape=init.dims, dtype=dtype)
        session1_weights[init.name] = np.empty(shape=init.dims, dtype=dtype)

    session0.weightsToHost()
    session0.readWeights(popart.PyWeightsIO(session0_weights))
    session1.weightsToHost()
    session1.readWeights(popart.PyWeightsIO(session1_weights))

    for init in initializers:
        init_name = init.name
        print("Comparing ", init_name)
        print(session0_weights[init_name])
        print(session1_weights[init_name])
        assert np.array_equal(session0_weights[init_name],
                              session1_weights[init_name])
def save_weights_for_decoder(builder, training_session, weight_names_to_checkpt,
                             precision, decoder_weights_fp):
    """Save the weights used for greedy decoding to a numpy file.

    Note: ``weight_names_to_checkpt`` can refer either to the actual model
    weights or to their exponential-moving-averaged counterparts.

    Args:
        builder: Builder queried (``getTensorShape``) for the shape of each
            weight, using the weight name with the EMA prefix removed.
        training_session: Session the weight values are read from via
            ``readWeights``.
        weight_names_to_checkpt: Mapping whose ``"prediction_network"`` and
            ``"joint_network"`` entries are lists of ``(uname, wname)``
            pairs.
        precision: dtype used for the host buffers.
        decoder_weights_fp: Destination passed to ``np.save``.

    The saved dictionary is keyed by ``uname`` with the EMA prefix removed,
    so that the validation script works. It can be loaded back with:
        decoder_weights_dict = np.load(decoder_weights_fp, allow_pickle=True)[()]
    """
    decoder_weight_names = (weight_names_to_checkpt["prediction_network"] +
                            weight_names_to_checkpt["joint_network"])

    # Allocate one host buffer per weight, keyed by the session's tensor name.
    decoder_weights_dict = {}
    for _, wname in decoder_weight_names:
        original_wname = wname.replace(ema_utils.EMA_PREFIX, '')
        param_shape = builder.getTensorShape(original_wname)
        decoder_weights_dict[wname] = np.empty(param_shape, precision)

    logger.info("Saving decoder weights to {}".format(decoder_weights_fp))
    weightsIo = popart.PyWeightsIO(decoder_weights_dict)
    training_session.readWeights(weightsIo)

    for uname, wname in decoder_weight_names:
        # Re-key each entry by its un-prefixed uname. Popping BEFORE
        # assigning keeps the entry when both names are identical; assigning
        # first and popping afterwards would delete the weight in that case.
        original_uname = uname.replace(ema_utils.EMA_PREFIX, '')
        decoder_weights_dict[original_uname] = decoder_weights_dict.pop(wname)

    np.save(decoder_weights_fp, decoder_weights_dict)
def run_test(set_pipeline_stages):
    """Train the matmul -> sin -> matmul model and return weight and anchor.

    Uses names from the enclosing scope: ``dummy_data``, ``weight_data``,
    ``inputType``, ``data``, ``bps`` and ``accumulation_factor``.

    Args:
        set_pipeline_stages: When true, pipelining is enabled on two IPUs
            with manual virtual graphs, and every op is given an explicit
            pipeline stage and virtual graph.

    Returns:
        A ``(weight, anchor)`` tuple: the single weight read back from the
        session after the run and the single anchor returned by the run.
    """
    weights = {}

    def init_builder(builder):
        d0 = builder.addInputTensor(dummy_data, 'data0')
        w0 = builder.addInitializedInputTensor(weight_data)
        weights[w0] = np.empty(shape=weight_data.shape,
                               dtype=weight_data.dtype)

        # Non-float inputs are cast to FLOAT before the first matmul.
        needs_cast = inputType is not None
        lhs = builder.aiOnnx.cast([d0], "FLOAT") if needs_cast else d0

        t0 = builder.aiOnnx.matmul([lhs, w0])
        t1 = builder.aiOnnx.sin([t0])
        t2 = builder.aiOnnx.matmul([t1, w0])
        loss = builder.aiGraphcore.identityloss([t2])
        builder.addOutputTensor(loss)

        if set_pipeline_stages:
            stage_of = [(t0, 0), (t1, 1), (t2, 2), (loss, 2)]
            # Stages 0 and 2 both live on virtual graph 0.
            vgraph_of = [(t0, 0), (t1, 1), (t2, 0), (loss, 0)]
            if needs_cast:
                stage_of.insert(0, (lhs, 0))
                vgraph_of.insert(0, (lhs, 0))
            for tensor, stage in stage_of:
                builder.pipelineStage(tensor, stage)
            for tensor, vgraph in vgraph_of:
                builder.virtualGraph(tensor, vgraph)

        return [loss]

    session = PopartTestSession()
    session.mode = 'train'
    session.options.enablePipelining = set_pipeline_stages
    session.device = 'ipu_model'
    if set_pipeline_stages:
        session.numIPUs = 2
        session.options.virtualGraphMode = popart.VirtualGraphMode.Manual
    session.batchesPerStep = bps
    session.options.enableGradientAccumulation = True
    session.options.accumulationFactor = accumulation_factor

    # test a pipeline stage appearing on multiple virtual graphs
    session.prepare(init_builder)

    sessionAnchors = session.run({'data0': data})
    assert len(sessionAnchors) == 1
    _, only_anchor = next(iter(sessionAnchors.items()))

    # Copy the trained weight back into the host buffer registered above.
    session._session.weightsToHost()
    session._session.readWeights(popart.PyWeightsIO(weights))
    assert len(weights) == 1
    _, trained_weight = next(iter(weights.items()))

    return trained_weight, only_anchor
def run_test(enable_recomputation):
    """Train the mul -> sigmoid -> scale model and return the updated weight.

    Uses names from the enclosing scope: ``dummy_data``, ``weight_data``,
    ``inputType``, ``data`` and ``accumulationFactor``.

    Args:
        enable_recomputation: When true, standard auto-recomputation is
            switched on in the session options.

    Returns:
        The single weight read back from the session after the run.
    """
    weights = {}

    def init_builder(builder):
        d0 = builder.addInputTensor(dummy_data, 'data0')
        w0 = builder.addInitializedInputTensor(weight_data)
        weights[w0] = np.empty(shape=weight_data.shape,
                               dtype=weight_data.dtype)

        # Non-float inputs are cast to FLOAT before the multiplication.
        needs_cast = inputType is not None
        lhs = builder.aiOnnx.cast([d0], "FLOAT") if needs_cast else d0

        t0 = builder.aiOnnx.mul([lhs, w0])
        t1 = builder.aiOnnx.sigmoid([t0])
        t2 = builder.aiGraphcore.scale([t1], 2.0)
        loss = builder.aiGraphcore.identityloss([t2])

        # Everything except the loss is placed on virtual graph 0.
        on_first_graph = [t0, t1, t2]
        if needs_cast:
            on_first_graph.insert(0, lhs)
        for tensor in on_first_graph:
            builder.virtualGraph(tensor, 0)
        builder.virtualGraph(loss, 1)

        return [loss]

    session = PopartTestSession()
    session.device = 'ipu_model'
    session.numIPUs = 2
    session.mode = 'train'
    session.options.virtualGraphMode = popart.VirtualGraphMode.Manual
    session.options.enablePipelining = True
    if enable_recomputation:
        session.options.autoRecomputation = popart.RecomputationType.Standard
    session.options.accumulationFactor = accumulationFactor
    session.options.enableGradientAccumulation = True

    session.prepare(init_builder)
    session.run({'data0': data})

    # Copy the trained weight back into the host buffer registered above.
    session._session.weightsToHost()
    session._session.readWeights(popart.PyWeightsIO(weights))
    assert len(weights) == 1
    _, trained_weight = next(iter(weights.items()))

    return trained_weight
def test_sgd_with_zero_learning_rate():
    """
    In this test we check that we can run a training step with zero learning
    rate, and that it behaves as expected (i.e. no weight update).

    Steps:
      1. Train one step with a variable, non-zero learning rate and check
         that the weights change.
      2. Check that switching to a *constant* zero learning rate is rejected.
      3. Switch to a *variable* zero learning rate, train one more step and
         check that the weights are unchanged.
    """
    # Let's start with an optimizer with a variable, non-zero learning rate.
    # The second element of each tuple is the "is constant" flag.
    optSettings = {
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.6, False),
        "lossScaling": (10.0, False)
    }
    stepSize = 2
    session, inputsUserSgd = trainSession({}, popart.SGD(optSettings),
                                          stepSize)
    anchorsArrays = session.initAnchorArrays()

    # Get the initial weights:
    fn = "init.onnx"
    session.modelToHost(fn)
    wId = "init_input"
    weights = {wId: np.empty(shape=[2, 2, 3, 3], dtype=np.float32)}
    weightsio = popart.PyWeightsIO(weights)
    session.readWeights(weightsio)
    init_weights = np.copy(weights[wId])

    # Run for a step with non-zero lr, observe that the weights have changed
    stepio = popart.PyStepIO(inputsUserSgd, anchorsArrays)
    session.run(stepio)
    session.weightsToHost()
    session.readWeights(weightsio)
    updated_weights = np.copy(weights[wId])
    assert not np.array_equal(init_weights, updated_weights)

    # Update optimizer with zero lr, (only valid if variable)
    optSettings["defaultLearningRate"] = (0.0, True)
    with pytest.raises(popart.popart_exception) as e_info:
        session.updateOptimizerFromHost(popart.SGD(optSettings))
    assert e_info.value.args[0].startswith(
        "Constant, zero learning rate in SGD")

    # Run a training step, and confirm the weights haven't updated.
    optSettings["defaultLearningRate"] = (0.0, False)
    session.updateOptimizerFromHost(popart.SGD(optSettings))
    # Without actually running a step here, the check below would pass
    # regardless of what the optimizer does.
    session.run(stepio)
    session.weightsToHost()
    session.readWeights(weightsio)
    assert np.array_equal(weights[wId], updated_weights)
def runModel(pipeline, recompute):
    """Train the three-stage model for one ``session.run`` and return ``w0``.

    The model is ``matmul`` (virtual graph 0, stage 0) -> ``sqrt`` (virtual
    graph 1, stage 1) -> ``add`` + ``nllloss`` (virtual graph 0, stage 2).

    Uses names from the enclosing scope: ``dshape``, ``lshape``, ``w0_data``,
    ``bps``, ``in0_data`` and ``in1_data``.

    Args:
        pipeline: Enable pipelining. When enabled, gradient accumulation is
            used with an accumulation factor of ``bps`` and a single batch
            per step; otherwise ``bps`` batches per step are run.
        recompute: Enable pipeline recomputation.

    Returns:
        The value of the weight ``w0`` read back from the session.
    """
    builder = popart.Builder()
    in0 = builder.addInputTensor("FLOAT", dshape)
    in1 = builder.addInputTensor("INT32", lshape)
    w0 = builder.addInitializedInputTensor(w0_data)

    with builder.virtualGraph(0), builder.pipelineStage(0):
        x = builder.aiOnnx.matmul([in0, w0])
    with builder.virtualGraph(1), builder.pipelineStage(1):
        x = builder.aiOnnx.sqrt([x])
    with builder.virtualGraph(0), builder.pipelineStage(2):
        x = builder.aiOnnx.add([w0, x])
        loss = builder.aiGraphcore.nllloss([x, in1])

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    opts.enablePipelining = pipeline
    if pipeline:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = bps
        test_bps = 1
    else:
        test_bps = bps
    if recompute:
        opts.autoRecomputation = popart.RecomputationType.Pipeline

    session = popart.TrainingSession(
        deviceInfo=popart.DeviceManager().createIpuModelDevice(
            {"numIPUs": "2"}),
        dataFlow=popart.DataFlow(test_bps, [loss]),
        fnModel=builder.getModelProto(),
        loss=loss,
        optimizer=popart.ConstSGD(0.1),
        userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({in0: in0_data, in1: in1_data}, anchors)
    session.run(stepio)

    # Size the read-back buffer from the weight's own initial data rather
    # than from the (unrelated) input shape.
    weights = {w0: np.empty(shape=w0_data.shape, dtype=w0_data.dtype)}
    weightsIo = popart.PyWeightsIO(weights)
    session.weightsToHost()
    session.readWeights(weightsIo)
    return weights[w0]
def getWeights(withPipelining):
    """Run one training step and return the two updated weights.

    Uses names from the enclosing scope: ``nIPUs``, ``accumulationFactor``,
    ``builder``, ``dataFlow``, ``defaultLearningRate0``, ``defaultMomentum0``,
    ``defaultDampening0``, ``finalLoss``, ``input0``, ``inputVals``,
    ``sampleShape``, ``w0`` and ``w1``.

    Args:
        withPipelining: Value assigned to the ``enablePipelining`` option.

    Returns:
        A ``(w0, w1)`` tuple of float32 arrays read back from the session.
    """
    test_device = tu.create_test_device(numIpus=nIPUs, tilesPerIPU=1216)

    options = popart.SessionOptions()
    options.enableOutlining = False
    options.enablePipelining = withPipelining
    options.enableGradientAccumulation = True
    options.accumulationFactor = accumulationFactor
    options.virtualGraphMode = popart.VirtualGraphMode.Manual

    sgd_settings = {
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultMomentum": (defaultMomentum0, False),
        "defaultDampening": (defaultDampening0, False)
    }
    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=dataFlow,
                                     optimizer=popart.SGD(sgd_settings),
                                     loss=finalLoss,
                                     userOptions=options,
                                     deviceInfo=test_device)

    anchor_buffers = session.initAnchorArrays()
    session.prepareDevice()
    session.weightsFromHost()
    session.run(popart.PyStepIO({input0: inputVals}, anchor_buffers))
    session.weightsToHost()

    # Pre-fill the read buffers with a sentinel value (-777).
    read_w0 = np.full(sampleShape, -777.0, dtype=np.float32)
    read_w1 = np.full(sampleShape, -777.0, dtype=np.float32)
    session.readWeights(popart.PyWeightsIO({w0: read_w0, w1: read_w1}))

    return read_w0, read_w1
lb: trainingDataLables }, trainingAnchors) # Copy the weights to the device from the host trainingSession.weightsFromHost() # Run the training graph trainingSession.run(trainingStepio) # Copy the weights to the host from the device trainingSession.weightsToHost() # Prepare the map of weights to read the weights into weights = {} weights[w] = np.empty([2, 2], np.float16) weightsIo = popart.PyWeightsIO(weights) # Read the weights from the session trainingSession.readWeights(weightsIo) # Execute the inference graph #------------------------------------------------------------------------------ # Generate some random input data interenceData = np.random.rand(1, 2).astype(np.float16) interenceDataLables = np.random.rand(1).astype(np.int32) # Create buffers to receive results from the execution inferenceAnchors = inferenceSession.initAnchorArrays() inferenceStepio = popart.PyStepIO({ ip: interenceData,
def run_model(tmpdir, batches_per_step, accum_factor, replicas, tile_set,
              exchange_strategy):
    """Train the model from ``get_model`` and measure compute/IO overlap.

    Args:
        tmpdir: Directory handed to ``tu.set_autoreport_options`` for the
            generated report.
        batches_per_step: Batches per step; forwarded to ``get_model`` and
            used to size the generated input data.
        accum_factor: Gradient accumulation factor; accumulation is enabled
            only when it is greater than 1.
        replicas: Replica count; replication is enabled only when it is
            greater than 1. Also used as the number of IPUs requested.
        tile_set: Tile set forwarded to ``get_model``; 128 IO tiles are
            reserved when it is ``popart.TileSet.IO``, otherwise none.
        exchange_strategy: Forwarded to ``get_model``.

    Returns:
        A ``(overlapPercentage, weights_data)`` tuple, where ``weights_data``
        maps each weight id to the array read back after training.
    """
    size = 64
    proto, inputs, weights, labels, dataFlow, loss, _ = get_model(
        size, batches_per_step, 4, 1, tile_set, exchange_strategy)

    opts = popart.SessionOptions()
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    opts.instrumentWithHardwareCycleCounter = False
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Both true & false should work - testing with false to avoid
    # host-cycle-overhead
    opts.rearrangeAnchorsOnHost = False
    opts.rearrangeStreamsOnHost = False

    # Set session options to generate the report
    tu.set_autoreport_options(opts, tmpdir, output_execution_profile=True)

    if accum_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accum_factor

    opts.numIOTiles = 128 if tile_set == popart.TileSet.IO else 0

    if replicas > 1:
        opts.enableReplicatedGraphs = True
        opts.replicatedGraphCount = replicas

    pat = popart.Patterns(popart.PatternsLevel.Default)

    # Trying to use less than all the tiles throws an error like
    #   popart_core.poplar_exception: Trying to access tile 72 on IPU
    #   0 but the virtual graph only covers the following tiles on
    #   that IPU: 0-63
    # The error happens in a call to poplar made by gcl::perIPUTiles.
    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        loss=loss,
        optimizer=popart.ConstSGD(1e-6),
        patterns=pat,
        deviceInfo=tu.create_test_device(numIpus=replicas,
                                         tilesPerIPU=tu.USE_ALL_TILES))

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    np.random.seed(224488)
    session.weightsFromHost()

    warmup_iterations = 1
    calc_iterations = 1
    samples_per_step = replicas * batches_per_step * accum_factor

    for _ in range(warmup_iterations + calc_iterations):
        # Fresh random inputs and labels for every step.
        datainputs = {}
        for input_id in inputs:
            datainputs[input_id] = np.random.normal(
                0, 0.05,
                (samples_per_step, 1, size, size)).astype(np.float32)
        datainputs[labels] = np.random.randint(0, size,
                                               (samples_per_step, 1, size))
        session.run(popart.PyStepIO(datainputs, anchors))

    session.weightsToHost()
    weights_data = {}
    for weight_id in weights:
        weights_data[weight_id] = np.zeros((1, size, size), dtype=np.float32)
    session.readWeights(popart.PyWeightsIO(weights_data))

    # Training must not have produced any NaN weight.
    for weight_id in weights_data:
        assert np.count_nonzero(np.isnan(weights_data[weight_id])) == 0

    report = session.getReport()
    overlapPercentage = get_compute_io_overlap_percentage(
        report, warmup_iterations)
    return overlapPercentage, weights_data
def test_manual_serialization():
    """Check a manually serialised matmul against a PyTorch reference.

    Basic model:

        X: data input of shape (N, C0)
        W: weight input of shape (C0, C1)

        Y = matmul(X, W)
        Z = relu(Y)
        loss = l1Loss(Z)

    The matmul ((N, C0), (C0, C1)) is built by hand as ``f`` smaller matmuls
    ((N, C0/f), (C0/f, C1)) whose results are accumulated. After one SGD
    step the updated weight is compared with the one PyTorch produces for
    the unserialised model.
    """
    # Array dimensions
    N = 12
    C0 = 244
    C1 = 286

    # Serialisation factor; it must divide C0 exactly.
    f = 4
    assert (C0 % f == 0)

    # Constructing the model
    builder = popart.Builder()

    # NOTE: T22702 For some seeds this test fails.
    np.random.seed(0)
    wVals = np.array(npr.randn(C0, C1), dtype=np.float32)
    W = builder.addInitializedInputTensor(wVals)
    X = builder.addInputTensor(popart.TensorInfo("FLOAT", [N, C0]))
    axes = builder.addInitializedInputTensor(
        np.array([0, 1]).astype(np.int32))

    def int32_initializer(values):
        # Adds a constant int32 index tensor used as slice bounds.
        return builder.addInitializedInputTensor(
            np.array(values).astype(np.int32))

    Y = None
    chunk = C0 // f
    for part in range(f):
        # Bounds of this part's slice along the C0 dimension
        lwr = part * chunk
        upp = (part + 1) * chunk

        # Take a slice of size (N, C0/f) out of X
        x_start = int32_initializer([0, lwr])
        x_end = int32_initializer([N, upp])
        X_slice = builder.aiOnnx.slice([X, x_start, x_end, axes])

        # Take a slice of size (C0/f, C1) out of W
        w_start = int32_initializer([lwr, 0])
        w_end = int32_initializer([upp, C1])
        W_slice = builder.aiOnnx.slice([W, w_start, w_end, axes])

        # Multiply the slices together, and accumulate as necessary
        mm_part = builder.aiOnnx.matmul([X_slice, W_slice])
        Y = mm_part if part == 0 else builder.aiOnnx.add([mm_part, Y])

    # Finally, the non-linearity
    Z = builder.aiOnnx.relu([Y])

    # This boiler-plate is currently necessary with opset-10 slice
    graph_transformer = popart.GraphTransformer(builder.getModelProto())
    graph_transformer.convertAllFixedPointInitializersToConstants()
    builder = popart.Builder(graph_transformer.getModelProto())

    l1 = builder.aiGraphcore.l1loss([Z], 0.2)

    dataFlow = popart.DataFlow(1, {})
    device = tu.create_test_device()
    userOptions = popart.SessionOptions()

    # To obtain the final dot graph, uncomment this:
    # userOptions.dotChecks = {"Final"};

    patterns = popart.Patterns()
    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        optimizer=popart.SGD({"defaultLearningRate": (0.1, True)}),
        loss=l1,
        patterns=patterns,
        userOptions=userOptions,
        deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    inputVals = np.array(npr.randn(1 * N * C0), dtype=np.float32)
    session.run(popart.PyStepIO({X: inputVals}, {}))

    session.weightsToHost()
    # Flat read buffer, pre-filled with a sentinel value.
    w0R = np.full(C0 * C1, -777.0, dtype=np.float32)
    session.readWeights(popart.PyWeightsIO({W: w0R}))

    # A pytorch version to confirm numerical correctness:
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(wVals.copy()))

        def forward(self, x):
            return torch.relu(torch.matmul(x, self.w0))

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.1)

    out = net(torch.from_numpy(inputVals.reshape([N, C0])))
    loss = 0.2 * torch.mean(torch.abs(out))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    torch_weight = net.w0.detach().numpy().flatten()
    initial_weight = wVals.flatten()

    # Total distance each implementation moved the weight, and the distance
    # between the two results.
    baseline0 = np.sum(np.abs(torch_weight - initial_weight))
    baseline1 = np.sum(np.abs(w0R - initial_weight))
    error = np.sum(np.abs(torch_weight - w0R))
    assert (error / (baseline0 + baseline1) < 1e-6)
def test_save_tensors_optimizer_state_externally(optimizerInfo):
    """
    1. Create a training session with the given optimizer, saving the
       initializers externally.
    2. Check the file size before session.modelToHost, and see it grow
       afterwards due to the additional optimizer state tensors being saved.
    3. Read the tensors from the file and compare them with the values read
       from the session to verify that the additional optimizer state has
       been saved correctly.
    4. Create a new session from the saved onnx model, run both sessions
       and compare outputs to verify that the optimizer state tensors
       were loaded in correctly.

    Args:
        optimizerInfo: Pair of (optimizer, list of tensor-name prefixes of
            the extra optimizer state that optimizer creates).
    """
    optimizer = optimizerInfo[0]
    extraOptimizerStatePrefs = optimizerInfo[1]

    d1 = np.random.rand(3, 3).astype(np.float32)
    d2 = np.random.rand(3).astype(np.float32)

    builder = popart.Builder()
    i1 = builder.addInitializedInputTensor(d1)
    i2 = builder.addInitializedInputTensor(d2)
    o = builder.aiOnnx.matmul([i1, i2])
    loss = builder.aiGraphcore.identityloss([o])

    with TemporaryDirectory() as tmpdir:
        tmpfile = os.path.join(tmpdir, "model_tensors.onnx")
        builder.saveInitializersExternally([i1, i2], tmpfile)

        # Check file is of expected size: (3 * 3 * 4) + (3 * 4) = 48
        assert os.path.exists(tmpfile)
        assert os.path.getsize(tmpfile) == d1.size * 4 + d2.size * 4

        anchorIds = [o]
        anchorIds.append(popart.reservedGradientPrefix() + i1)
        anchorIds.append(popart.reservedGradientPrefix() + i2)

        session = popart.TrainingSession(
            deviceInfo=popart.DeviceManager().createCpuDevice(),
            fnModel=builder.getModelProto(),
            loss=loss,
            optimizer=optimizer,
            dataFlow=popart.DataFlow(1, anchorIds))

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()
        session.run(popart.PyStepIO({}, anchors))
        session.weightsToHost()

        # Flat host buffers for the weights and each optimizer state tensor.
        weightsMap = {}
        weightsMap[i1] = np.ones(d1.size).astype(np.float32)
        weightsMap[i2] = np.ones(d2.size).astype(np.float32)
        for pref in extraOptimizerStatePrefs:
            if pref == popart.reservedStepPrefix():
                # The step counter is a single element per weight.
                size1 = 1
                size2 = 1
            else:
                size1 = d1.size
                size2 = d2.size
            weightsMap[pref + i1] = np.ones(size1).astype(np.float32)
            weightsMap[pref + i2] = np.ones(size2).astype(np.float32)
        session.readWeights(popart.PyWeightsIO(weightsMap))

        tmpfile1 = os.path.join(tmpdir, "model.onnx")
        session.modelToHost(tmpfile1)

        # Extra state for each initializer
        expectedSize = (d1.size * 4) + (d2.size * 4)
        for pref in extraOptimizerStatePrefs:
            if pref == popart.reservedStepPrefix():
                expectedSize += (2 * 4)
            else:
                expectedSize += d1.size * 4
                expectedSize += d2.size * 4
        assert os.path.getsize(tmpfile) == expectedSize

        # Compare the values read from the session with the external data
        # written to file
        saved_weights = np.fromfile(tmpfile, dtype=np.float32)
        assert np.allclose(saved_weights[0:d1.size], weightsMap[i1].flatten())
        totalSize = d1.size + d2.size
        assert np.allclose(saved_weights[d1.size:totalSize],
                           weightsMap[i2].flatten())

        for pref in extraOptimizerStatePrefs:
            for tensorId in (i1, i2):
                expected = weightsMap[pref + tensorId].flatten()
                # Advance by the real size of each state tensor. The step
                # counter holds one element, so stepping by the weight's
                # size would compare mis-sized (or empty) slices.
                assert np.allclose(
                    saved_weights[totalSize:totalSize + expected.size],
                    expected)
                totalSize += expected.size

        # Create new session
        new_session = popart.TrainingSession(
            deviceInfo=popart.DeviceManager().createCpuDevice(),
            fnModel=tmpfile1,
            loss=loss,
            optimizer=optimizer,
            dataFlow=popart.DataFlow(1, anchorIds))
        new_anchors = new_session.initAnchorArrays()
        new_session.prepareDevice()
        new_session.weightsFromHost()

        new_session.run(popart.PyStepIO({}, new_anchors))
        session.run(popart.PyStepIO({}, anchors))

        # Compare output from both sessions to confirm that the optimizer
        # state tensors have been read back in correctly for the new session
        for anchorId in anchorIds:
            assert np.allclose(anchors[anchorId], new_anchors[anchorId])
def test(config, iteration, true_scaling, test_case):
    """Train three scalar weights and check them against expected values.

    The model adds three single-element weights, one after another, to the
    input and applies an L1 loss. After every step the weights are read back
    and compared with ``test_case``; the optimizer is replaced whenever the
    factory reports that an update is due.

    Args:
        config: Configuration handed to ``ScheduledOptimizerFactory``.
        iteration: Iteration state; provides ``total_steps`` and has its
            ``count`` incremented after every step.
        true_scaling: Expected scaling, forwarded to ``assert_scaled_lr``.
        test_case: Per-weight sequences; element 0 is the initial value and
            element ``step + 1`` the expected value after step ``step``.

    Raises:
        OSError: If no IPU could be acquired.
    """
    builder = popart.Builder()
    weight_names = ("weight_0", "weight_1", "weight_2")

    input0Shape = [1, 1, 1]
    input0 = builder.addInputTensor(popart.TensorInfo("FLOAT", input0Shape),
                                    "input0")

    # One initialised weight and one single-element read buffer per name.
    weight_ids = []
    read_buffers = []
    for index, name in enumerate(weight_names):
        initial_value = np.array([test_case[index][0]], dtype=np.float32)
        read_buffers.append(np.empty([1], dtype=np.float32))
        weight_ids.append(
            builder.addInitializedInputTensor(initial_value, name))

    # Chain: ((input0 + w0) + w1) + w2
    running_sum = input0
    for weight_id in weight_ids:
        running_sum = builder.aiOnnx.add([weight_id, running_sum])

    loss = builder.aiGraphcore.l1loss([running_sum],
                                      1.0,
                                      debugContext="l1LossVal")
    builder.addOutputTensor(running_sum)

    proto = builder.getModelProto()
    dataFlow = popart.DataFlow(1, {})
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    pat = popart.Patterns(popart.PatternsLevel.Default)

    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        1,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise OSError("Failed to acquire IPU.")

    # The stage->tensor map would come from the Bert model in reality
    # (see model.tensors)
    mock_tensor_map = {
        stage: [weight_id]
        for stage, weight_id in enumerate(weight_ids)
    }

    factory = ScheduledOptimizerFactory(config,
                                        iteration,
                                        tensors=mock_tensor_map)
    assert_scaled_lr(factory, true_scaling)
    optimizer_step0 = factory.create()

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer_step0,
                                     patterns=pat,
                                     deviceInfo=device)
    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    input_data = np.array([3.1415], dtype=np.float32)
    stepio = popart.PyStepIO({input0: input_data}, anchors)

    for step in range(iteration.total_steps):
        session.run(stepio)
        session.weightsToHost()
        weightsRead = popart.PyWeightsIO(dict(zip(weight_ids, read_buffers)))
        session.readWeights(weightsRead)

        for index, observed in enumerate(read_buffers):
            assert (np.isclose(test_case[index][step + 1], observed))

        iteration.count += 1
        if factory.should_update(iteration):
            optimizer_step1 = factory.update_and_create(iteration)
            assert_scaled_lr(factory, true_scaling)
            session.updateOptimizerFromHost(optimizer_step1)
def compare_against_pytorch(optType, optMaps, batchesPerStep=5, scaled=False):
    """Train the same tiny model with PopART and PyTorch and compare weights.

    The model is ``l1loss(input * w0 + w1)``. PopART runs ``len(optMaps)``
    steps of ``batchesPerStep`` batches each, switching to ``optMaps[k]`` for
    step ``k``. PyTorch then replays the same data with the equivalent
    optimizer, carrying the optimizer state from one step to the next. The
    final weights of both runs must agree to a relative L1 error below 1e-5.

    Args:
        optType: One of "adam", "adamw", "adamax", "lamb", "lambnobias",
            "adagrad", "rmsprop", "centeredrmsprop", "adadelta", "sgd0",
            "sgd1" or "sgd2".
        optMaps: One optimizer-settings dict per step; every value is a
            tuple whose first element is the hyper-parameter value.
        batchesPerStep: Number of batches processed by each step.
        scaled: Value passed as ``scaled_optimizer_state`` for the
            Adam-family optimizers.

    Raises:
        ValueError: If ``optType`` is not one of the supported names.
        AssertionError: If the PopART and PyTorch weights disagree.
    """
    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # Select the PopART optimizer class and its extra keyword arguments.
    optkwargs = {}
    if optType == "adam":
        popartOpt = popart.Adam
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "adamw":
        popartOpt = popart.Adam
        optkwargs["weight_decay_mode"] = popart.WeightDecayMode.Decay
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "adamax":
        popartOpt = popart.Adam
        optkwargs["mode"] = popart.AdamMode.AdaMax
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "lamb":
        popartOpt = popart.Adam
        optkwargs["mode"] = popart.AdamMode.Lamb
        optkwargs["weight_decay_mode"] = popart.WeightDecayMode.Decay
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "lambnobias":
        popartOpt = popart.Adam
        optkwargs["mode"] = popart.AdamMode.LambNoBias
        optkwargs["weight_decay_mode"] = popart.WeightDecayMode.Decay
        optkwargs["scaled_optimizer_state"] = scaled
    elif optType == "adagrad":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.AdaGrad
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "rmsprop":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.RMSProp
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "centeredrmsprop":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.CenteredRMSProp
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "adadelta":
        popartOpt = popart.Adaptive
        optkwargs["mode"] = popart.AdaptiveMode.AdaDelta
        optkwargs[
            "weight_decay_mode"] = popart.WeightDecayMode.L2Regularization
    elif optType == "sgd0":
        popartOpt = popart.SGD
    elif optType == "sgd1":
        popartOpt = popart.SGD
        optkwargs[
            "accumulatorAndMomentum"] = popart.SGDAccumulatorAndMomentum.Combined
    elif optType == "sgd2":
        popartOpt = popart.SGD
        optkwargs[
            "accumulatorAndMomentum"] = popart.SGDAccumulatorAndMomentum.Separate
    else:
        # Raising a plain string is itself a TypeError in Python 3, which
        # hides the real problem from the caller.
        raise ValueError("Unknown optType: " + optType)

    # L1 loss value
    lambda1 = 1.0

    # tensor dimensions and replications
    height = 2
    numberOfSteps = len(optMaps)
    sampleShape = [height, height]
    replicationFactor = 1
    accumulationFactor = 1
    samplesPerBatch = 4
    divvyFactor = replicationFactor * accumulationFactor
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = [
        np.array(npr.randn(*stepDataShape), dtype=np.float32)
        for _ in range(numberOfSteps)
    ]

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)

    # Model:
    #
    # input  w0     w1
    #     \  |      |
    #      mul  -  add - L1 loss
    mm0 = builder.aiOnnx.mul([input0, w0])
    mm1 = builder.aiOnnx.add([mm0, w1])
    l1 = builder.aiGraphcore.l1loss([mm1], lambda1)

    dataFlow = popart.DataFlow(batchesPerStep, {})

    userOptions = popart.SessionOptions()
    userOptions.enableGradientAccumulation = False
    userOptions.enablePrefetchDatastreams = False

    # Only this one device is created; a second device that was never
    # handed to the session is no longer acquired.
    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        userOptions=userOptions,
        loss=l1,
        optimizer=popartOpt(optMaps[0], **optkwargs),
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    anchorArrays = session.initAnchorArrays()
    session.prepareDevice()
    session.weightsFromHost()

    for step in range(numberOfSteps):
        stepio = popart.PyStepIO({input0: inputVals[step]}, anchorArrays)
        session.run(stepio)
        if step < numberOfSteps - 1:
            session.updateOptimizerFromHost(
                popartOpt(optMaps[step + 1], **optkwargs))

    session.weightsToHost()
    # Read buffers pre-filled with a sentinel value.
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))

        def forward(self, x, i):
            mm0 = torch.mul(x, self.w0)
            mm1 = torch.add(mm0, self.w1)
            return mm1

    net = Net()

    # Optimizer state entries seeded at step 0 and carried over afterwards.
    tensorStateKeys = ('momentum_buffer', 'exp_avg', 'exp_avg_sq', 'exp_inf',
                       'square_avg', 'grad_avg', 'acc_delta', 'sum')

    optimizer = None
    for step in range(numberOfSteps):
        oldOptimizer = optimizer
        optMap = optMaps[step]

        def setting(name):
            # Hyper-parameter values are the first element of each tuple.
            return optMap[name][0]

        if optType in ("adam", "adamw", "adamax", "lamb", "lambnobias"):
            adamArgs = dict(lr=setting("defaultLearningRate"),
                            betas=(setting("defaultBeta1"),
                                   setting("defaultBeta2")),
                            eps=setting("defaultEps"),
                            weight_decay=setting("defaultWeightDecay"))
            if optType == "adam":
                optimizer = optim.Adam(net.parameters(), **adamArgs)
            elif optType == "adamw":
                optimizer = optim.AdamW(net.parameters(), **adamArgs)
            elif optType == "adamax":
                optimizer = optim.Adamax(net.parameters(), **adamArgs)
            elif optType == "lamb":
                optimizer = torch_lamb.Lamb(net.parameters(), **adamArgs)
            else:
                optimizer = torch_lamb.Lamb(net.parameters(),
                                            biasCorrection=False,
                                            **adamArgs)
        elif optType == "adagrad":
            optimizer = optim.Adagrad(
                net.parameters(),
                lr=setting("defaultLearningRate"),
                weight_decay=setting("defaultWeightDecay"),
                eps=setting("defaultEps"))
        elif optType in ("rmsprop", "centeredrmsprop"):
            rmsArgs = dict(lr=setting("defaultLearningRate"),
                           alpha=setting("defaultAlpha"),
                           eps=setting("defaultEps"),
                           weight_decay=setting("defaultWeightDecay"),
                           momentum=setting("defaultMomentum")
                           if "defaultMomentum" in optMap else 0.0)
            if optType == "centeredrmsprop":
                rmsArgs["centered"] = True
            optimizer = optim.RMSprop(net.parameters(), **rmsArgs)
        elif optType == "adadelta":
            optimizer = optim.Adadelta(
                net.parameters(),
                lr=setting("defaultLearningRate"),
                rho=setting("defaultAlpha"),
                eps=setting("defaultEps"),
                weight_decay=setting("defaultWeightDecay"))
        else:
            # Same for SGD0, SGD1 and SGD2.
            optimizer = optim.SGD(
                net.parameters(),
                lr=setting("defaultLearningRate"),
                momentum=setting("defaultMomentum"),
                dampening=setting("defaultDampening"),
                weight_decay=setting("defaultWeightDecay"))

        # A new torch optimizer is created for every step, so its state is
        # zero-initialised on the first step and copied from the previous
        # optimizer on later steps.
        for group in optimizer.param_groups:
            for p in group['params']:
                if step == 0:
                    for key in tensorStateKeys:
                        optimizer.state[p][key] = p.data * 0
                    optimizer.state[p]['step'] = 0
                else:
                    for key in tensorStateKeys + ('step', ):
                        optimizer.state[p][key] = oldOptimizer.state[p][key]

        for i in range(batchesPerStep):
            out = net(torch.from_numpy(inputVals[step][i]), i)
            loss = lambda1 * torch.mean(torch.abs(out))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Distance each weight moved in the PyTorch run; used to normalise the
    # PopART-vs-PyTorch error.
    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)

    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
def test_rmsprop_tf_mode(use_tf_variant, const_lr, adaptive_mode, momentum,
                         weight_decay, weight_decay_mode, const_weight_decay):
    """
    Train a small mul/add model with popart's RMSProp optimizer and compare
    the final weights with a NumPy replay of the same training run.

    The learning rate and the weight decay follow per-step schedules unless
    `const_lr` / `const_weight_decay` pin them to the first schedule value.
    When `use_tf_variant` is set the popart weights must match the NumPy
    reference; otherwise they are required to differ from it.
    """
    np.random.seed(0)
    input_dim = 3
    num_steps = 15
    batches_per_step = 10
    samples_per_batch = 8

    # Optimizer parameters.
    learning_rates = np.linspace(0.02, 0.00001, num_steps)
    weight_decays = np.linspace(weight_decay, weight_decay + 0.01, num_steps)
    alpha = 0.95
    eps = 0.001

    def make_optimizer(lr, wd):
        # Only the learning rate and the weight decay vary between steps;
        # every other optimizer setting is fixed for the whole test.
        return get_rmsprop(
            lr,
            const_lr,
            alpha,
            momentum,
            wd,
            weight_decay_mode,
            const_weight_decay,
            eps,
            adaptive_mode,
            use_tf_variant,
        )

    # Initial weights and inputs. The draw order (w0, w1, inputs) fixes the
    # random stream and therefore the data used by both implementations.
    w0_data = np.random.randn(input_dim, input_dim).astype(np.float32)
    w1_data = np.random.randn(input_dim, input_dim).astype(np.float32)
    input_data = [
        np.random.randn(
            batches_per_step,
            samples_per_batch,
            input_dim,
            input_dim,
        ).astype(np.float32) for _ in range(num_steps)
    ]

    # Build the model.
    #
    # input  w0     w1
    #     \  |      |
    #      mul  -  add - L1 loss
    builder = popart.Builder()
    # Named input_id rather than `input` so the builtin is not shadowed.
    input_id = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [samples_per_batch, input_dim, input_dim]))
    w0 = builder.addInitializedInputTensor(w0_data)
    w1 = builder.addInitializedInputTensor(w1_data)
    mm0 = builder.aiOnnx.mul([input_id, w0])
    mm1 = builder.aiOnnx.add([mm0, w1])
    l1 = builder.aiGraphcore.l1loss([mm1], 1.0)

    dataflow = popart.DataFlow(batches_per_step, {})
    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataflow,
        loss=l1,
        optimizer=make_optimizer(learning_rates[0], weight_decay),
        deviceInfo=tu.create_test_device(),
    )
    anchor_arrays = session.initAnchorArrays()
    session.prepareDevice()
    session.weightsFromHost()

    # Run popart training and retrieve the weights.
    for step in range(num_steps):
        stepio = popart.PyStepIO({input_id: input_data[step]}, anchor_arrays)
        session.run(stepio)

        # Update the optimizer from host only when lr or wd are non-const,
        # and never after the final step.
        if step == num_steps - 1 or (const_lr and const_weight_decay):
            continue
        lr = learning_rates[0] if const_lr else learning_rates[step + 1]
        wd = weight_decays[0] if const_weight_decay else weight_decays[
            step + 1]
        session.updateOptimizerFromHost(make_optimizer(lr, wd))

    session.weightsToHost()
    w0_popart = np.zeros((input_dim, input_dim), dtype=np.float32)
    w1_popart = np.zeros((input_dim, input_dim), dtype=np.float32)
    weights_read = popart.PyWeightsIO({w0: w0_popart, w1: w1_popart})
    session.readWeights(weights_read)

    # Run numpy training.
    centered = adaptive_mode == popart.AdaptiveMode.CenteredRMSProp
    if weight_decay_mode == popart.WeightDecayMode.L2Regularization:
        wd_mode = 'L2'
    else:
        wd_mode = 'decay'

    w0_np = w0_data.copy()
    w1_np = w1_data.copy()
    mg0 = np.zeros(w0_np.shape, dtype=w0_np.dtype)
    mg1 = np.zeros(w1_np.shape, dtype=w1_np.dtype)
    rms0 = np.ones(w0_np.shape, dtype=w0_np.dtype)
    rms1 = np.ones(w1_np.shape, dtype=w1_np.dtype)
    mom0 = np.zeros(w0_np.shape, dtype=w0_np.dtype)
    mom1 = np.zeros(w1_np.shape, dtype=w1_np.dtype)

    for step in range(num_steps):
        lr = learning_rates[0] if const_lr else learning_rates[step]
        wd = weight_decays[0] if const_weight_decay else weight_decays[step]

        for batch in range(batches_per_step):
            # Average the per-sample gradients over the batch.
            w0_grad = np.zeros(w0_np.shape, dtype=w0_np.dtype)
            w1_grad = np.zeros(w1_np.shape, dtype=w1_np.dtype)
            for sample in range(samples_per_batch):
                x = input_data[step][batch][sample]
                w0_grad_sample, w1_grad_sample = model_grad(w0_np, w1_np, x)
                w0_grad += (1.0 / samples_per_batch) * w0_grad_sample
                w1_grad += (1.0 / samples_per_batch) * w1_grad_sample

            # One optimizer update per batch, for each weight in turn.
            w0_np, mg0, rms0, mom0 = rpnp.rmsprop_update_numpy(
                w0_np,
                w0_grad,
                mg0,
                rms0,
                mom0,
                lr,
                alpha,
                momentum,
                wd,
                wd_mode,
                eps,
                centered,
            )
            w1_np, mg1, rms1, mom1 = rpnp.rmsprop_update_numpy(
                w1_np,
                w1_grad,
                mg1,
                rms1,
                mom1,
                lr,
                alpha,
                momentum,
                wd,
                wd_mode,
                eps,
                centered,
            )

    # Compare the resulting parameters.
    if use_tf_variant:
        np.testing.assert_allclose(w0_popart, w0_np, rtol=1e-02, atol=1e-05)
        np.testing.assert_allclose(w1_popart, w1_np, rtol=1e-02, atol=1e-05)
    else:
        assert not np.allclose(w0_popart, w0_np, rtol=1e-02, atol=1e-05)
        assert not np.allclose(w1_popart, w1_np, rtol=1e-02, atol=1e-05)
def test(opt0, opt1, e0, e1, e2):
    """
    Run two training steps on a chain of three single-element weights,
    switching the optimizer from `opt0` to `opt1` between the steps, and
    compare each weight read back from the device with the value stored
    under the 'initalValue' key of `e0`, `e1` and `e2` respectively.

    The input shape and the weight names are read from the enclosing scope
    (stepSize, batchSize, sampleDim, w0name, w1name, w2name).
    """
    builder = popart.Builder()

    input0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [stepSize, batchSize, sampleDim]),
        "input0")

    # Register all three weights before any op is added to the graph.
    initialData = [
        np.array([value], dtype=np.float32) for value in (100.0, 200.0, 300.0)
    ]
    weightIds = [
        builder.addInitializedInputTensor(values, weightName)
        for values, weightName in zip(initialData, (w0name, w1name, w2name))
    ]

    # Each weight is added on top of the running sum, starting at the input.
    runningSum = input0
    for weightId in weightIds:
        runningSum = builder.aiOnnx.add([weightId, runningSum])
    l1 = builder.aiGraphcore.l1loss([runningSum], 1.0)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {})

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enableGroupedMatmuls = False

    pat = popart.Patterns(popart.PatternsLevel.Default)

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        loss=l1,
        optimizer=opt0,
        patterns=pat,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    input0Data = np.array([3.1415], dtype=np.float32)

    stepio = popart.PyStepIO({input0: input0Data}, anchors)

    # One step with each optimizer.
    session.run(stepio)
    session.updateOptimizerFromHost(opt1)
    session.run(stepio)

    session.weightsToHost()

    # Buffers pre-filled with a sentinel that the read-back overwrites.
    readBack = [np.array([-777.0], dtype=np.float32) for _ in weightIds]
    session.readWeights(popart.PyWeightsIO(dict(zip(weightIds, readBack))))

    for expected, actual in zip((e0, e1, e2), readBack):
        assert (np.isclose(expected['initalValue'], actual))
def runTest(forceAddOutOfPlace, pipelineRecomputation):
    """
    Test of pipelining with dropout, recomputation, graph replication,
    gradient accumulation.

    A three-weight model split over two virtual graphs is trained for one
    step in popart with SGD. The dropout masks are anchored, so the same
    training is replayed in PyTorch with identical masks, and the weights
    read back from popart must agree with the PyTorch weights to a relative
    L1 error below 1e-5.

    forceAddOutOfPlace: when true, the skip-connection add is given an
        "AddRhsInplace" preference of -1.0.
    pipelineRecomputation: when true, autoRecomputation is set to
        popart.RecomputationType.Pipeline.

    replicationFactor and nIPUs are read from the enclosing scope.
    """
    # Has dependencies on T12562. T12976, T13098 for full support

    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # L1 loss value
    lambda1 = 1.0

    # optimizer parameters
    learningRate = 0.001
    momentum = 0.01
    dampening = 0.5
    lossScaling = 10.0
    velocityScaling = 0.15
    weightDecay = 0.01

    # tensor dimensions and replications
    height = 6
    batchesPerStep = 5
    sampleShape = [height, height]
    accumulationFactor = 4
    samplesPerBatch = 48
    divvyFactor = replicationFactor * accumulationFactor
    if samplesPerBatch % divvyFactor != 0:
        raise RuntimeError("Invalid divvy factor")
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    stepDataInfo = popart.TensorInfo("FLOAT", stepDataShape)
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values, drawn in the order w0, w1, w2, input
    w0vals, w1vals, w2vals = (np.array(npr.randn(height, height),
                                       dtype=np.float32) for _ in range(3))
    inputVals = np.array(npr.randn(*stepDataShape), dtype=np.float32)

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0, w1, w2 = (builder.addInitializedInputTensor(vals)
                  for vals in (w0vals, w1vals, w2vals))

    scaleFactor = 1.0 / np.sqrt(float(height))

    def scaledDropout(tensorId, ratio):
        # scale by scaleFactor, then dropout; returns [output, mask]
        scaled = builder.aiGraphcore.scale([tensorId], scaleFactor)
        return builder.aiOnnx.dropout([scaled], num_outputs=2, ratio=ratio)

    # Model, first virtual graph:
    #   mm0      = input0 x w0
    #   dropout0 = dropout(scale(mm0))
    #   dropout1 = 2 * dropout(scale(dropout0 x w1))
    #   skip     = dropout(mm0 + dropout1)
    # Model, second virtual graph:
    #   dropout2 = dropout(scale(skip x w2))
    #   loss     = L1(dropout2 + dropout1)
    with builder.virtualGraph(0):
        mm0 = builder.aiOnnx.matmul([input0, w0])
        ratio0 = 0.35
        [dropout0, mask0] = scaledDropout(mm0, ratio0)
        mm1 = builder.aiOnnx.matmul([dropout0, w1])
        ratio1 = 0.5
        [dropout1, mask1] = scaledDropout(mm1, ratio1)
        dropout1 = builder.aiGraphcore.scale([dropout1], 2.0)
        skipOut = builder.aiOnnx.add([mm0, dropout1])
        # See resolved task T13137
        if forceAddOutOfPlace:
            builder.setInplacePreferences(skipOut, {"AddRhsInplace": -1.0})
        ratioSkip = 0.6
        [dropoutSkip, maskSkip] = builder.aiOnnx.dropout([skipOut],
                                                         num_outputs=2,
                                                         ratio=ratioSkip)

        # see T13142: we do this so that the recomputation does not modify
        # the anchors
        mask0 = builder.aiOnnx.identity([mask0])
        mask1 = builder.aiOnnx.identity([mask1])
        maskSkip = builder.aiOnnx.identity([maskSkip])

    with builder.virtualGraph(1):
        mm2 = builder.aiOnnx.matmul([dropoutSkip, w2])
        ratio2 = 0.7
        [dropout2, mask2] = scaledDropout(mm2, ratio2)
        out = builder.aiOnnx.add([dropout2, dropout1])
        l1 = builder.aiGraphcore.l1loss([out],
                                        lambda1,
                                        reduction=popart.ReductionType.Sum)

        # see T13142: we do this so that the recomputation does not modify
        # the anchors
        mask2 = builder.aiOnnx.identity([mask2])

    # Every dropout mask is returned for every batch.
    anchors = {
        maskId: popart.AnchorReturnType("All")
        for maskId in (mask0, mask1, mask2, maskSkip)
    }

    dataFlow = popart.DataFlow(batchesPerStep, anchors)

    device = tu.create_test_device(numIpus=nIPUs)
    assert device

    userOptions = popart.SessionOptions()
    # This requires T12562 to be solved before enabling (TODO)
    userOptions.enableOutlining = False
    userOptions.enablePipelining = True
    userOptions.enableGradientAccumulation = True
    userOptions.accumulationFactor = accumulationFactor

    if pipelineRecomputation:
        userOptions.autoRecomputation = popart.RecomputationType.Pipeline

    if replicationFactor > 1:
        userOptions.enableReplicatedGraphs = True
        userOptions.replicatedGraphCount = replicationFactor
    userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual

    # TODO https://phabricator.sourcevertex.net/T14035
    userOptions.enablePrefetchDatastreams = False
    #  passes:
    userOptions.engineOptions = {"exchange.streamBufferOverlap": "any"}
    #  fails:
    #  userOptions.engineOptions = {"exchange.streamBufferOverlap" : "hostRearrangeOnly"}

    patterns = popart.Patterns()
    patterns.InPlace = True

    sgdSettings = {
        "defaultLearningRate": (learningRate, False),
        "defaultMomentum": (momentum, False),
        "defaultDampening": (dampening, False),
        "defaultVelocityScaling": (velocityScaling, False),
        "lossScaling": (lossScaling, True),
        "defaultWeightDecay": (weightDecay, True)
    }
    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=dataFlow,
                                     optimizer=popart.SGD(sgdSettings),
                                     loss=l1,
                                     patterns=patterns,
                                     userOptions=userOptions,
                                     deviceInfo=device)

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.setRandomSeed(7)
    session.weightsFromHost()

    stepio = popart.PyStepIO({input0: inputVals}, anchorArrays)
    session.run(stepio)
    session.weightsToHost()

    # Read-back buffers, pre-filled with a sentinel value.
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w2R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    session.readWeights(popart.PyWeightsIO({w0: w0R, w1: w1R, w2: w2R}))

    def applyMask(values, mask, ratio):
        # replay a dropout with a known mask: zero out, then rescale
        return values * mask.type(torch.FloatTensor) / (1 - ratio)

    class Net(nn.Module):
        def __init__(self):
            super().__init__()
            # merge replication, accumulation
            flattenedShape = [
                anchorArrays[mask0].shape[0], -1, height, height
            ]

            def maskTensor(maskId):
                return torch.from_numpy(
                    anchorArrays[maskId].reshape(flattenedShape))

            # parameters are registered in the order w0, w1, w2
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))
            self.w2 = torch.nn.Parameter(torch.from_numpy(w2vals.copy()))

            self.mask0 = maskTensor(mask0)
            self.mask1 = maskTensor(mask1)
            self.maskSkip = maskTensor(maskSkip)
            self.mask2 = maskTensor(mask2)

        def forward(self, x, i):
            mm0 = torch.matmul(x, self.w0)
            dr0 = applyMask(mm0 * scaleFactor, self.mask0[i], ratio0)

            mm1 = torch.matmul(dr0, self.w1)
            dr1 = applyMask(mm1 * scaleFactor, self.mask1[i], ratio1)
            dr1 = 2 * dr1

            drSkip = applyMask(dr1 + mm0, self.maskSkip[i], ratioSkip)

            mm2 = torch.matmul(drSkip, self.w2)
            dr2 = applyMask(mm2 * scaleFactor, self.mask2[i], ratio2)

            out = dr1 + dr2
            return out

    net = Net()

    optimizer = optim.SGD(net.parameters(),
                          lr=learningRate,
                          momentum=momentum,
                          dampening=dampening,
                          weight_decay=weightDecay)

    # caveat : alternative work-around for TODO T13098
    for group in optimizer.param_groups:
        for p in group['params']:
            optimizer.state[p]['momentum_buffer'] = p.data * 0

    for i in range(batchesPerStep):
        out = net(torch.from_numpy(inputVals[i]), i)
        loss = lambda1 * torch.sum(torch.abs(out))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Weights after the PyTorch replay.
    w0T = net.w0.detach().numpy()
    w1T = net.w1.detach().numpy()
    w2T = net.w2.detach().numpy()

    delta0 = np.sum(np.abs(w0T - w0vals))
    delta1 = np.sum(np.abs(w1T - w1vals))
    delta2 = np.sum(np.abs(w2T - w2vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)
    print("Total moved by w2: ", delta2)

    # L1 distance between the two results, relative to the distance moved.
    error0 = np.sum(np.abs(w0R - w0T)) / delta0
    error1 = np.sum(np.abs(w1R - w1T)) / delta1
    error2 = np.sum(np.abs(w2R - w2T)) / delta2
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("Total moved by w2: ", np.sum(np.abs(w2R - w2vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    print("l1 error for w2: ", error2)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
    assert (error2 < 1e-5)
def test_against_pytorch():
    """
    Comparison of popart and PyTorch optimizers, and the changes needed to
    PyTorch to match popart. Note that these differences should have no
    effect on overall training measures, and this discussion is just for
    those interested in exact reproducibility between popart and pytorch.

    The main differences are:

    1)
    pytorch optimizer, in the very first iteration with a new optimizer:
       initializes velocity tensor with zeros and does not do damping,
    popart optimizer, in the first iteration with a new optimizer:
       retains velocity tensor from previous state with previous optimizer,
       or sets it to zero if the first round of training

    2)
    popart and pytorch updates the optimizer at a different "phase"
    a) v <- v * mm  + (1 - dp) * wd * w
    b) v <- v + (1 - dp) * g
    c) w <- w - lr * v

    pytorch goes (abc)(abc)(abc)(abc)
    popart goes  (abca)(bca)(bca)(bca)

    where changes to the optimizer can be done between periods. For this
    reason, updates to mm, dp, and wd have different effects.

    See also sgd_mixed_mode_test_cpp_1_3.cpp
    """

    # optimizer parameters: only the learning rate changes between steps
    learningRates = [0.005, 0.003, 0.001]
    defaultMomentum0 = 0.1
    defaultDampening0 = 0.3
    lossScaling0 = 10.0
    defaultVelocityScaling0 = 0.5
    defaultWeightDecay0 = 0.01

    def makeOptMap(learningRate):
        # popart SGD settings; every entry is passed as (value, False)
        return {
            "defaultLearningRate": (learningRate, False),
            "defaultMomentum": (defaultMomentum0, False),
            "defaultDampening": (defaultDampening0, False),
            "defaultVelocityScaling": (defaultVelocityScaling0, False),
            "lossScaling": (lossScaling0, False),
            "defaultWeightDecay": (defaultWeightDecay0, False)
        }

    optMaps = [makeOptMap(learningRate) for learningRate in learningRates]

    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # L1 loss value
    lambda1 = 1.0

    # tensor dimensions and replications
    height = 2
    numberOfSteps = len(optMaps)
    batchesPerStep = 5
    sampleShape = [height, height]
    replicationFactor = 1
    accumulationFactor = 1
    nVirtualGraphs = 1
    samplesPerBatch = 4
    divvyFactor = replicationFactor * accumulationFactor
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    nIPUs = replicationFactor * nVirtualGraphs
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = [
        np.array(npr.randn(*stepDataShape), dtype=np.float32)
        for i in range(numberOfSteps)
    ]

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)

    # Model:
    #
    # input  w0     w1
    #     \  |      |
    #      mul  -  add - L1 loss

    mm0 = builder.aiOnnx.mul([input0, w0])
    mm1 = builder.aiOnnx.add([mm0, w1])
    l1 = builder.aiGraphcore.l1loss([mm1], lambda1)

    dataFlow = popart.DataFlow(batchesPerStep, {})

    # NOTE(review): this device is never handed to the session below, which
    # creates its own. Kept because creating it may have side effects;
    # confirm whether it is still needed.
    device = tu.create_test_device(numIpus=nIPUs)

    userOptions = popart.SessionOptions()
    userOptions.enableGradientAccumulation = False
    userOptions.enablePrefetchDatastreams = False

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        userOptions=userOptions,
        loss=l1,
        optimizer=popart.SGD(optMaps[0]),
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.weightsFromHost()

    # One popart step per optimizer map; the optimizer is swapped on the
    # host before every step except the first.
    for step, optMap in enumerate(optMaps):
        if step > 0:
            session.updateOptimizerFromHost(popart.SGD(optMap))
        stepio = popart.PyStepIO({input0: inputVals[step]}, anchorArrays)
        session.run(stepio)

    session.weightsToHost()

    # Read-back buffers, pre-filled with a sentinel value.
    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))

        def forward(self, x, i):
            mm0 = torch.mul(x, self.w0)
            mm1 = torch.add(mm0, self.w1)
            return mm1

    net = Net()

    optimizer = None
    for step, optMap in enumerate(optMaps):
        oldOptimizer = optimizer

        optimizer = optim.SGD(net.parameters(),
                              lr=optMap["defaultLearningRate"][0],
                              momentum=optMap["defaultMomentum"][0],
                              dampening=optMap["defaultDampening"][0],
                              weight_decay=optMap["defaultWeightDecay"][0])

        if oldOptimizer is None:
            # First optimizer: start from a zero velocity tensor, so that
            # PyTorch does not take its special first-iteration path.
            for group in optimizer.param_groups:
                for p in group['params']:
                    optimizer.state[p]['momentum_buffer'] = p.data * 0
        else:
            # Later optimizers: carry the velocity tensor over from the
            # optimizer they replace, as popart does.
            for group, oldGroup in zip(optimizer.param_groups,
                                       oldOptimizer.param_groups):
                for p, oldp in zip(group['params'], oldGroup['params']):
                    optimizer.state[p]['momentum_buffer'] = (
                        oldOptimizer.state[oldp]['momentum_buffer'])

        for i in range(batchesPerStep):
            out = net(torch.from_numpy(inputVals[step][i]), i)
            loss = lambda1 * torch.mean(torch.abs(out))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)

    # L1 distance between the two results, relative to the distance moved.
    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    assert (error0 < 1e-5)
    assert (error1 < 1e-5)
def run(self,
        init_builder,
        reference,
        step_type='infer',
        opsets=None,
        optimizer=popart.ConstSGD(0.01),
        seed=None):
    """
    Build a model with `init_builder`, run one step of it in popart and
    verify the results against `reference`.

    init_builder: callable invoked with the Builder; returns the tensor ids
        to check. Ids that are not initialized inputs are anchored; in
        'train' mode the first id is also used as the loss.
    reference: callable invoked with a RefData built from the builder
        outputs and the anchor arrays; returns one expected value per id
        returned by `init_builder`. A None entry skips that id.
    step_type: 'infer' for an InferenceSession, 'train' for a
        TrainingSession.
    opsets: forwarded to the Builder.
    optimizer: optimizer for the training session. The default object is
        created once, when the function is defined.
    seed: if not None, set as the session's random seed.

    Returns the session that was run. Raises Exception if a builder input
    is not C-contiguous or a reference value has an unexpected type.
    """
    assert step_type in ('infer', 'train')

    bld = Builder(opsets=opsets)

    # Allows to pass additional arguments to init_builder, if required
    # by the specific init_builder function implementation.
    kwargs = {}
    kwargs = tu.filter_dict(kwargs, init_builder)
    anchorIds = init_builder(bld, **kwargs)

    # Initialized inputs (weights) are read back rather than anchored.
    anchors = {}
    for anchorId in anchorIds:
        if anchorId not in bld._init_input_map:
            anchors[anchorId] = popart.AnchorReturnType("All")

    dataFlow = popart.DataFlow(1, anchors)

    self.options.logDir = self.logging_dir

    device = tu.create_test_device(numIpus=self.numIPUs)
    print(f"Created device {device} with {self.numIPUs} IPUs")

    self.patterns.InPlace = self.inplacing
    if step_type == 'infer':
        session = popart.InferenceSession(fnModel=bld.getModelProto(),
                                          dataFlow=dataFlow,
                                          deviceInfo=device,
                                          patterns=self.patterns,
                                          userOptions=self.options)
    else:
        assert step_type == 'train'
        # Apply reduction to output (assumed to be the
        # first anchorId) to ensure it is scalar
        lossId = anchorIds[0]
        lossId = bld.aiGraphcore.identityloss([lossId],
                                              reduction=self.lossReduction)

        session = popart.TrainingSession(fnModel=bld.getModelProto(),
                                         dataFlow=dataFlow,
                                         loss=lossId,
                                         optimizer=optimizer,
                                         deviceInfo=device,
                                         patterns=self.patterns,
                                         userOptions=self.options)

    anchor_map = session.initAnchorArrays()

    session.prepareDevice()

    if seed is not None:
        session.setRandomSeed(seed)

    # Reject non-contiguous inputs before any input is prepared.
    for k, v in bld._input_map.items():
        if not v.flags['C_CONTIGUOUS']:
            # need to call np.ascontiguousarray
            # `x = np.ascontiguousarray(x)`
            raise Exception(
                'Input "{}" to popart.PyStepIO is not C_CONTIGUOS'.format(k))

    # Add the replication dimension to the inputs
    inputs = {}
    for k, v in bld._input_map.items():
        replicas = self.options.replicatedGraphCount
        if replicas > 1:
            # One leading axis of size `replicas`, all other axes size 1.
            um = (replicas, ) + (1, ) * np.ndim(v)
            # we add this offset to ensure that samples on devices are
            # distinct
            offset = np.arange(replicas).astype(v.dtype).reshape(um)
            inputs[k] = np.tile(v, um) + offset
        else:
            inputs[k] = v

    stepio = popart.PyStepIO(inputs, anchor_map)

    if step_type == 'train':
        session.weightsFromHost()

    session.run(stepio)

    if step_type == 'train':
        session.weightsToHost()

    ref_out = reference(RefData(bld._outputs, anchor_map))

    def fix_type(t):
        # Normalise reference values: torch tensors become numpy arrays,
        # numpy values and None pass through unchanged.
        if isinstance(t, torch.Tensor):
            return t.data.numpy()
        if isinstance(t, (np.ndarray, np.float32, np.float16)):
            return t
        if t is None:
            return None
        raise Exception('unexpected type', type(t))

    ref_out = [fix_type(i) for i in ref_out]
    for index, key in enumerate(anchorIds):
        expected = ref_out[index]
        if key in anchors:
            if expected is not None:
                print('Testing anchor "{}"...'.format(key))
                self.verifyTensor(anchor_map[key], expected)
            else:
                print('Not Testing anchor "{}" as it is None'.format(key))
        elif key in bld._init_input_map:
            if expected is not None:
                print('Testing weight "{}"...'.format(key))
                weightInfo = session.getInfo(key)
                # The placeholders were previously printed unformatted.
                print('Weight info shape:{} type:{}'.format(
                    weightInfo.shape(), weightInfo.data_type_lcase()))
                weights = {
                    key:
                    np.empty(shape=weightInfo.shape(),
                             dtype=weightInfo.data_type_lcase())
                }
                weightsIo = popart.PyWeightsIO(weights)
                session.readWeights(weightsIo)

                self.verifyTensor(weights[key], expected)
            else:
                print('Not Testing weight "{}" as it is None'.format(key))

    return session
def _run_popart_test_model(data,
                           weights,
                           clipInfo,
                           pipelineGroups=None,
                           accumulationFactor=None,
                           optimizerType=None,
                           enablePipelining=False):
    """
    Train a chain of convolutions with an L1 loss for one step in popart,
    with gradient-norm clipping applied to the given groups of weights.

    data: input array; also used to derive the input tensor info.
    weights: one array per convolution; copied, never modified.
    clipInfo: iterable of (weightIndices, maxNorm) pairs, one clip-norm
        group per pair.
    pipelineGroups: optional list of weight-index groups; group k is
        placed on pipeline stage / virtual graph k. When given, gradient
        accumulation with `accumulationFactor` is enabled as well.
    optimizerType: forwarded to _get_popart_optimizer.
    enablePipelining: value for the pipelining session option when
        `pipelineGroups` is given.

    Returns (result, resultWeights): the anchored output of the last
    convolution, and a dict from weight id to the trained weight values.
    """
    # work on copies so the caller's weights are never touched
    weights = [np.copy(original) for original in weights]

    builder = popart.Builder()
    inputId = builder.addInputTensor(popart.TensorInfo(data))

    # consistently name the weights so we can refer to them later
    weightIds = []
    for position, weight in enumerate(weights):
        weightIds.append(
            builder.addInitializedInputTensor(weight, f'weight{position}'))

    # Get a pipeline stage for each weight
    if pipelineGroups:
        maxPipelineStage = len(pipelineGroups) - 1
        pipelineStages = {
            index: stage
            for stage, indices in enumerate(pipelineGroups)
            for index in indices
        }

    # Stack one convolution per weight on top of the input.
    x = inputId
    for position, weightId in enumerate(weightIds):
        x = builder.aiOnnxOpset9.conv([x, weightId],
                                      dilations=[1, 1],
                                      pads=[1, 1, 1, 1],
                                      strides=[1, 1])
        if pipelineGroups:
            stage = pipelineStages[position]
            builder.pipelineStage(x, stage)
            builder.virtualGraph(x, stage)

    out = builder.aiGraphcore.l1loss([x], 1.0)
    if pipelineGroups:
        # the loss lives on the last stage
        builder.pipelineStage(out, maxPipelineStage)
        builder.virtualGraph(out, maxPipelineStage)
    builder.addOutputTensor(out)

    proto = builder.getModelProto()

    anchorTypes = {
        x: popart.AnchorReturnType("All"),
        out: popart.AnchorReturnType("All")
    }
    dataFlow = popart.DataFlow(1, anchorTypes)

    # one IPU per pipeline stage, or a single IPU without pipeline groups
    numIpus = maxPipelineStage + 1 if pipelineGroups else 1
    device = popart.DeviceManager().createIpuModelDevice({"numIPUs": numIpus})

    clipNormSettings = [
        popart.ClipNormSettings.clipWeights(
            [weightIds[i] for i in weightIndices], maxNorm)
        for weightIndices, maxNorm in clipInfo
    ]

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    if pipelineGroups:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        opts.accumulationFactor = accumulationFactor
        opts.enableGradientAccumulation = True
        opts.accumulateOuterFragmentSettings.schedule = popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized
        opts.enablePipelining = enablePipelining

    sess = popart.TrainingSession(proto,
                                  dataFlow=dataFlow,
                                  loss=out,
                                  optimizer=_get_popart_optimizer(
                                      optimizerType, clipNormSettings),
                                  deviceInfo=device,
                                  userOptions=opts)
    sess.prepareDevice()

    sess.weightsFromHost()

    anchors = sess.initAnchorArrays()
    if pipelineGroups:
        # repeat the input once per accumulated micro-batch
        data = np.array([data] * accumulationFactor)
    sess.run(popart.PyStepIO({inputId: data}, anchors))

    result = anchors[x]

    sess.weightsToHost()
    resultWeights = {
        weightId: np.empty(weight.shape, dtype=weight.dtype)
        for weightId, weight in zip(weightIds, weights)
    }
    sess.readWeights(popart.PyWeightsIO(resultWeights))
    return result, resultWeights