def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):
    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")
    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    classes = np.prod(shape_d0) / (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between
        # examples.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide up the batches-per-step batches into
        # gradAcclFactor * batchesPerStep samples.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    for i in range(6):
        session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors

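# Usage sketch (illustrative only, not a test in this file): the
# get_model_anchors_model* builders are intended for comparing anchor values
# between configurations, e.g. a sharded/pipelined run against a single-IPU
# reference. The batchesPerStep value and the blanket np.allclose comparison
# below are assumptions made for the example.
#
#   ref_anchors = get_model_anchors_model2(
#       doSharding=False, doPipelining=False, batchesPerStep=4,
#       doTraining=True)
#   pipe_anchors = get_model_anchors_model2(
#       doSharding=True, doPipelining=True, batchesPerStep=4, doTraining=True)
#   for key in ref_anchors:
#       assert np.allclose(ref_anchors[key], pipe_anchors[key])
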
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    with builder.virtualGraph(0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_0_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(1 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_1_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(2 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_2_{i}")
            if i == 1:
                w0 = w
            x = builder.aiOnnx.matmul([x, w])
        label = builder.addInputTensor("INT32", [micro_batch_size])
        x = builder.aiGraphcore.nllloss([x, label])

    output = x
    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + x] = art
            anchor_map[popart.reservedRestoredPrefix() + w0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    if doSharding is False:
        numIPUs = 1
    else:
        numIPUs = 3

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=output,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between
        # examples.
        outer_dim *= batchesPerStep
        labelArray = np.repeat(labelArray[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide up the batches-per-step batches into
        # gradAcclFactor * batchesPerStep samples.
        outer_dim *= gradAcclFactor
        labelArray = labelArray.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labelArray.astype(np.int32)
        }, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    return anchors

def test_gradient_accumulation_anchors(tmpdir):
    """
    Check that the accumulated gradients with gradient accumulation match
    the gradients without gradient accumulation enabled.
    """

    label_array = np.random.randint(0, hidden_size, batch_size)

    # TODO T11866: larger batches-per-step, first without weight decay,
    # then with weight decay.
    batches_per_step = 1

    accl_initial_proto, accl_proto_filename, accl_anchor_arrays = run_mm_graph(
        sgd_optimizer,
        label_array=label_array,
        accum_factor=4,
        enable_accum=True,
        batches_per_step=batches_per_step,
        number_of_steps=1,
        final_proto_filename=os.path.join(tmpdir,
                                          "accl5batches3stepsAnchorsTest"),
        enable_multi_ipu=False,
        full_anchorage=True,
        inference_mode=False)

    no_accl_initial_proto, no_accl_proto_filename, no_accl_anchor_arrays = run_mm_graph(
        sgd_optimizer,
        label_array=label_array,
        accum_factor=1,
        enable_accum=False,
        batches_per_step=batches_per_step,
        number_of_steps=1,
        final_proto_filename=os.path.join(tmpdir,
                                          "noAccl5batches3stepsAnchorsTest"),
        enable_multi_ipu=False,
        full_anchorage=True,
        inference_mode=False)

    w0_tensor = onnx.load_from_string(accl_initial_proto).graph.initializer[0]
    w0_name = w0_tensor.name

    full_batch_grad = no_accl_anchor_arrays[popart.reservedGradientPrefix() +
                                            w0_name]
    accl_grad = accl_anchor_arrays[popart.reservedAcclToUpdatePrefix() +
                                   w0_name]

    print("full batch grad shape is ")
    print(full_batch_grad.shape)

    print("accl grad shape is ")
    print(accl_grad.shape)

    if batches_per_step > 1:
        # TODO T11866
        raise RuntimeError("batches per step > 1 needs investigation")

        # Unreachable until T11866 is resolved.
        for i in range(batches_per_step):
            print("\nbatch %d" % (i, ))
            print("Absolute accl grad  %.3f" % (np.sum(np.abs(accl_grad[i]))))
            print("Absolute no accl g  %.3f" %
                  (np.sum(np.abs(full_batch_grad[i]))))
            print("Absolute difference %.3f" %
                  (np.sum(np.abs(full_batch_grad[i] - accl_grad[i]))))

    else:
        accl_grad_abs_sum = np.sum(np.abs(accl_grad))
        print("Absolute accl grad %.3f" % accl_grad_abs_sum)

        # Initialising as per the update equations. When velocity
        # scaling != 1 this may need changing, see T12001.
        adjusted_accl_grad = accl_grad[-1].flatten().copy()
        for i, v in enumerate(w0_tensor.float_data):
            adjusted_accl_grad[i] -= wd * v

        adjusted_accl_grad_abs_sum = np.sum(np.abs(adjusted_accl_grad))
        print("Absolute accl grad, adjusted for weight decay %.3f" %
              adjusted_accl_grad_abs_sum)

        full_batch_abs_sum = np.sum(np.abs(full_batch_grad))
        print("Absolute no accl g %.3f" % full_batch_abs_sum)

        abs_diff = np.sum(
            np.abs(full_batch_grad.flatten() - adjusted_accl_grad))
        print("Absolute difference %.3f" % abs_diff)

        assert abs_diff / (full_batch_abs_sum + accl_grad_abs_sum) < 1e-5

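# Note on the weight-decay adjustment in the test above (a reading of the
# test itself, not a statement of the PopART spec): with SGD weight decay
# `wd`, the anchored reservedAcclToUpdatePrefix() tensor appears to hold the
# accumulated gradient plus the weight-decay term,
#
#   accl_to_update = sum_i g_i + wd * w,
#
# so subtracting wd * w element-wise recovers the full-batch gradient
# sum_i g_i that the no-accumulation run anchors directly. T12001 tracks the
# velocity scaling != 1 case, where this adjustment would need revisiting.
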
def test_accumulators_names_dont_clash():
    np.random.seed(1984)
    builder = popart.Builder()

    input_data = np.random.rand(4, 4).astype(np.float32)
    weights = ['weight1', 'weight2', 'weight3']

    d0 = builder.addInputTensor(popart.TensorInfo('FLOAT', [4, 4]), 'data0')
    x = builder.aiOnnx.add([
        d0,
        builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weights[0])
    ])
    x = builder.aiOnnx.add([
        x,
        builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weights[1])
    ])
    x = builder.aiOnnx.add([
        x,
        builder.addInitializedInputTensor(
            np.random.rand(4, 4).astype(np.float32), weights[2])
    ])
    l1 = builder.aiGraphcore.l1loss([x], 1.0)

    proto = builder.getModelProto()
    dataFlow = popart.DataFlow(1, {})
    opt = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.9, True),
        "defaultDampening": (0, True)
    })

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        loss=l1,
        optimizer=opt,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))
    ops = ir["maingraph"]

    # Collect every tensor name appearing as an op input or output.
    tensors = set()
    for op in ops:
        for i in op["inputs"]:
            tensors.add(i["name"])
        for o in op["outputs"]:
            tensors.add(o["name"])

    prefixes = [
        popart.reservedAcclPrefix(),
        popart.reservedAcclToUpdatePrefix(),
        popart.reservedAcclFinalOutPrefix()
    ]
    for prefix, weight in itertools.product(prefixes, weights):
        assert prefix + weight in tensors

def run_graph(optimizer, input_shape, initial_onnx_model, input_tensor_name,
              output_tensor_name, label_tensor_name, label_array,
              accum_factor, enable_accum, batches_per_step, number_of_steps,
              final_proto_filename, enable_multi_ipu, full_anchorage,
              inference_mode):
    art = popart.AnchorReturnType("All")
    anchorNames = {output_tensor_name: art}

    if full_anchorage:
        w0 = onnx.load_from_string(
            initial_onnx_model).graph.initializer[0].name

        anchorNames[popart.reservedGradientPrefix() + w0] = art

        if enable_accum:
            anchorNames[popart.reservedAcclPrefix() + w0] = art
            anchorNames[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = enable_accum
    opts.accumulationFactor = accum_factor
    opts.enableOutlining = False

    if enable_multi_ipu:
        device = tu.create_test_device(numIpus=num_ipus,
                                       tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        device = tu.create_test_device(tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Off

    # Only for test purposes: inference with gradient accumulation should
    # never work.
    if inference_mode:
        popart.InferenceSession(fnModel=initial_onnx_model,
                                dataFlow=popart.DataFlow(
                                    batches_per_step, anchorNames),
                                userOptions=opts,
                                deviceInfo=device)

    session = popart.TrainingSession(fnModel=initial_onnx_model,
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, anchorNames),
                                     deviceInfo=device,
                                     loss=output_tensor_name,
                                     optimizer=optimizer,
                                     userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()

    anchor_arrays = session.initAnchorArrays()

    outer_dim = 1
    if batches_per_step > 1:
        # Add an outer dimension of batches_per_step and repeat the labels.
        outer_dim *= batches_per_step
        label_array = np.repeat(label_array[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        # Divide the step's samples into accum_factor * batches_per_step
        # micro batches.
        outer_dim *= accum_factor
        label_array = label_array.reshape(
            [accum_factor * batches_per_step, -1])
    if outer_dim > 1:
        # Add the accum_factor * batches_per_step dimension into the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_tensor_name:
            (1.0 - xi * npr.rand(*input_shape)).astype(np.float32),
            label_tensor_name:
            label_array.astype(np.int32)
        }, anchor_arrays)

    for i in range(number_of_steps):
        session.run(stepio)

    session.modelToHost(final_proto_filename)

    return final_proto_filename, anchor_arrays

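# A self-contained sketch (hypothetical helper, not part of the original
# tests) of the host-side reshaping performed in run_graph above: with
# gradient accumulation the labels are split into
# accum_factor * batches_per_step micro batches, and the same outer
# dimension is prepended to the input shape. The concrete sizes are
# assumptions chosen for the demonstration.
def _demo_accum_reshape(batch_size=8, accum_factor=4, batches_per_step=1):
    labels = np.arange(batch_size)
    outer_dim = 1
    if batches_per_step > 1:
        outer_dim *= batches_per_step
        labels = np.repeat(labels[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        outer_dim *= accum_factor
        labels = labels.reshape([accum_factor * batches_per_step, -1])
    # With the default sizes: 4 micro batches of
    # batch_size // accum_factor = 2 labels each, and an outer input
    # dimension of 4.
    return outer_dim, labels.shape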