def check_models(model_init, modelA_fn, modelB_fn):
    """
    For each weight tensor, check the relative error. That is,
    | model_accl - model_no_accl |_1 / | model_accl - model_initial|_1

    Args:
        model_init: Serialized ONNX model of the initial state; parsed with
            ``onnx.load_from_string``.
        modelA_fn: File name of model A, loaded with ``onnx.load``.
        modelB_fn: File name of model B, loaded with ``onnx.load``.

    Raises:
        AssertionError: If a weight tensor of A or B moved by no more than
            1e-3 (L1 norm) from the initial model, or if the relative
            error between A and B is not below 1e-5.
    """
    modelA = onnx.load(modelA_fn)
    modelB = onnx.load(modelB_fn)
    # the initial model
    modelC = onnx.load_from_string(model_init)

    optimizer_prefixes = (popart.reservedAcclPrefix(),
                          popart.reservedAccl1Prefix(),
                          popart.reservedAccl2Prefix(),
                          popart.reservedStepPrefix(),
                          popart.reservedAccumPrefix())

    for w_i, weightA in enumerate(modelA.graph.initializer):
        # We need to avoid the gradient accl initializers as these won't be
        # present in the non grad accl models.
        if any(prefix in weightA.name for prefix in optimizer_prefixes):
            continue

        # Tensors of B and C are matched to A by position, not by name.
        float_dataB = modelB.graph.initializer[w_i].float_data
        float_dataC = modelC.graph.initializer[w_i].float_data

        # where A, B, C are weight tensors,
        # |A - B|_1
        l1AB = 0
        # |B - C|_1
        l1BC = 0
        # |A - C|_1
        l1AC = 0
        for d_i, dataA in enumerate(weightA.float_data):
            dataB = float_dataB[d_i]
            dataC = float_dataC[d_i]

            # abs diff of 2 floats
            l1AB += np.abs(dataA - dataB)
            l1BC += np.abs(dataB - dataC)
            l1AC += np.abs(dataA - dataC)

        # Guard the division: with an empty float_data (or unchanged weights)
        # l1AC is zero, and the assertions below should report that rather
        # than a ZeroDivisionError / divide-by-zero warning here.
        relative_error = l1AB / l1AC if l1AC else float("inf")
        print(f"{weightA.name}: l1AB = {l1AB:.2e}, l1AC = {l1AC:.2e}, "
              f"l1BC = {l1BC:.2e}, relative error = {relative_error:.2e}")

        # check that the weights have moved enough for this to be a valid
        assert l1AC > 1e-3, "change since start of A = %.5f" % (l1AC, )
        assert l1BC > 1e-3, "change since start of B = %.5f" % (l1BC, )

        # relative error assertion
        assert 1e-5 > relative_error, "Relative error {}".format(
            relative_error)
def load_initializers_from_onnx(model_path, load_optimizer=False):
    """Read the initializers of an onnx checkpoint into numpy arrays.

    Args:
        model_path (str): Path to onnx file.
        load_optimizer (bool): When False, initializers whose name contains
            one of the reserved optimizer-state prefixes are skipped. When
            True they are kept and converted to float32.

    Returns:
        Dict: Mapping of popart weight names to numpy values, after being
        passed through handle_split_qkv and handle_split_word_embedding.
    """
    # onnx.load would, by default, search for external initializers next to
    # the onnx model. builder.saveIntializersExternally however records a real
    # path (or one relative to the run dir) in the onnxproto, so external data
    # is resolved explicitly below.
    model = onnx.load(model_path, load_external_data=False)
    if any(is_external_weight(tensor) for tensor in model.graph.initializer):
        load_external_data_for_model(model, '')

    state_prefixes = (popart.reservedAccl1Prefix(),
                      popart.reservedAccl2Prefix(),
                      popart.reservedAcclPrefix(),
                      popart.reservedAccumPrefix(),
                      popart.reservedStepPrefix())

    loaded = {}
    for tensor in model.graph.initializer:
        name = tensor.name
        holds_state = any(prefix in name for prefix in state_prefixes)
        if holds_state and not load_optimizer:
            continue

        inline_fp16 = (not is_external_weight(tensor)
                       and tensor.data_type == onnx.TensorProto.FLOAT16)
        if inline_fp16:
            # Reinterpret the int32_data buffer as float16 values.
            raw = np.asarray(tensor.int32_data, np.int32)
            value = raw.view(dtype=np.float16).reshape(tensor.dims)
        else:
            value = numpy_helper.to_array(tensor)

        loaded[name] = value.astype(np.float32) if holds_state else value

    loaded = handle_split_qkv(loaded)
    return handle_split_word_embedding(loaded)
def test_anchor_output():
    """
    Run one specific configuration and check the anchored weights and
    accumulated gradient. This should catch any slicing issues.
    """
    config = {
        "ReplicationFactor": 2,
        # Accl factor must divide batch size
        "AccumulationFactor": 4,
        "Pipelining": True,
        "ReturnType": "ALL"
    }
    repl_factor = config["ReplicationFactor"]
    accl_factor = config["AccumulationFactor"]

    label_array = np.ones([BATCH_SIZE]).astype(np.int32)
    micro_batch_size = BATCH_SIZE // (accl_factor * repl_factor)

    # Build the graph: matmul -> reshape -> relu -> softmax -> nll loss.
    builder = popart.Builder()
    input_shape = [micro_batch_size, CHANNELS, DATA_LEN, DATA_LEN]

    data_shape = popart.TensorInfo("FLOAT", input_shape)
    lbl_shape = popart.TensorInfo("INT32", [micro_batch_size])
    initial_weights = np.random.random_sample(input_shape).astype(np.float32)
    w = builder.addInitializedInputTensor(initial_weights)

    ip = builder.addInputTensor(data_shape)
    lb = builder.addInputTensor(lbl_shape)

    a = builder.aiOnnx.matmul([ip, w])
    flat_shape = [micro_batch_size, CHANNELS * DATA_LEN * DATA_LEN]
    o = builder.reshape_const(builder.aiOnnx, [a], flat_shape)
    o = builder.aiOnnx.relu([o])
    o = builder.aiOnnx.softmax([o])
    nll = builder.aiGraphcore.nllloss([o, lb])

    GRAD = popart.reservedGradientPrefix() + w
    ACCL = popart.reservedAccumPrefix() + w
    art = popart.AnchorReturnType("All")
    anchored = {tensor_id: art for tensor_id in (o, a, ip, w, GRAD, ACCL)}
    data_flow = popart.DataFlow(BATCHES_PER_STEP, anchored)

    opts, device = return_options(config)

    if device is None:
        pytest.skip("Test needs to run on IPU, but none are available")

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=nll,
                                     optimizer=popart.ConstSGD(LEARNING_RATE),
                                     userOptions=opts,
                                     deviceInfo=device)

    session.prepareDevice()

    # Prepend the replication, accumulation and batches-per-step dimensions
    # (in that order) to the host-side input and label arrays.
    if repl_factor > 1:
        input_shape = [repl_factor] + input_shape
        label_array = label_array.reshape([repl_factor, -1])
    if accl_factor > 1:
        input_shape = [accl_factor] + input_shape
        label_array = label_array.reshape([accl_factor, -1])
    if BATCHES_PER_STEP > 1:
        input_shape = [BATCHES_PER_STEP] + input_shape
        label_array = np.repeat(label_array[np.newaxis], BATCHES_PER_STEP, 0)

    anchors = session.initAnchorArrays()
    in_array = np.random.random_sample(input_shape).astype(np.float32)

    stepio = popart.PyStepIO({ip: in_array, lb: label_array}, anchors)
    session.weightsFromHost()

    session.run(stepio)

    # Returned anchors will be of shape
    # [bps, grad_accl_factor, repl_factor, micro_batch_size, channels, data_len, data_len]
    weight_anchor = anchors[w]
    num_batches = weight_anchor.shape[0]

    for batch in range(num_batches):
        reference = weight_anchor[batch, 0, :, :, :, :, :]
        for accl_step in range(weight_anchor.shape[1]):
            # Weights should not change over the gradient accumulation
            # dimension - only after gradAccl steps.
            assert np.allclose(reference,
                               weight_anchor[batch, accl_step, :, :, :, :, :])

    # Check that the accumulated gradient plus the weights for the current
    # batch equals the weights for the next batch.
    for batch in range(num_batches - 1):
        # Sum of the accumulated gradients across replicas, taken at the
        # last step of the accumulation loop.
        summed_accl = np.sum(anchors[ACCL][batch, -1, :, :, :, :, :], axis=0)
        for replica in range(weight_anchor.shape[2]):
            # The replica's last weight tensor in the accumulation loop
            # minus the summed accumulated gradients ...
            expected = weight_anchor[batch, -1, replica, :, :, :, :] - \
                summed_accl
            # ... should match the last weight tensor of the next batch for
            # the same replica.
            assert np.allclose(
                expected, weight_anchor[batch + 1, -1, replica, :, :, :, :])
def test_adam_loading_saved_gradient_accumulationt_tesors(tmpdir):
    """
    1. Build a model with matmuls, no grad accumulation
    2. Write out onnx model, verify initializers contain no accum tensors
    3. Create session with model, verify accl tensors initialised correctly
    4. Do session.run(), write out model, verify accl tensors have been updated
    5. Create new session with same model. This time before run, write out model
       and check tensors are still there, with the same value
    """

    # 1.
    accum_factor = 4
    [onnx_model, input_name, output_name,
     lb_name] = get_mm_model(accum_factor=accum_factor,
                             enable_multi_ipu=False)

    # 2.
    model = onnx.load_from_string(onnx_model)
    names = [t.name for t in model.graph.initializer]
    for name in names:
        assert popart.reservedAccumPrefix() not in name

    def getTrainingSession(fn):
        """Create a prepared training session for model `fn` with gradient
        accumulation enabled, and write the weights to the device."""
        opts = popart.SessionOptions()
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accum_factor
        opts.disableGradAccumulationTensorStreams = False
        sess = popart.TrainingSession(
            fnModel=fn,
            dataFlow=popart.DataFlow(1, {}),
            deviceInfo=tu.create_test_device(tilesPerIPU=testTilesPerIPU),
            loss=output_name,
            optimizer=adam_optimizer,
            userOptions=opts)
        sess.prepareDevice()

        sess.weightsFromHost()
        return sess

    # 3.
    sess = getTrainingSession(onnx_model)
    fn = os.path.join(tmpdir, "withInitZeroAccumTensors.onnx")
    sess.modelToHost(fn)
    model = onnx.load(fn)
    # Last observed value of every optimizer state tensor, keyed by name.
    optstates = {}
    for t in model.graph.initializer:
        if (popart.reservedAccumPrefix() in t.name
                or popart.reservedAccl1Prefix() in t.name
                or popart.reservedAccl2Prefix() in t.name
                or popart.reservedStepPrefix() in t.name):
            optstates[t.name] = np.asarray(t.float_data)
            assert np.allclose(np.asarray(t.float_data), 0.0)

    # 4.
    input_shape = [accum_factor] + sess.getInfo(input_name).shape()
    stepio = popart.PyStepIO(
        {
            input_name: npr.rand(*input_shape).astype(np.float32),
            lb_name: np.ones(batch_size).astype(np.int32),
        }, sess.initAnchorArrays())
    sess.run(stepio)
    fn = os.path.join(tmpdir, "withUpdatedAcclTensors.onnx")
    sess.modelToHost(fn)
    model = onnx.load(fn)
    for t in model.graph.initializer:
        if (popart.reservedAccl1Prefix() in t.name
                or popart.reservedAccl2Prefix() in t.name
                or popart.reservedStepPrefix() in t.name):
            # Nonzero, updated accl1, accl2 and step tensors.
            # `not` is used instead of `is False` so the check does not
            # depend on allclose returning the built-in False singleton.
            assert not np.allclose(np.asarray(t.float_data),
                                   optstates[t.name])
            optstates[t.name] = np.asarray(t.float_data)
        elif popart.reservedAccumPrefix() in t.name:
            # Because the accumulator is always set to zero after being applied
            # to accl1 and accl2
            assert np.allclose(np.asarray(t.float_data), 0.0)
            optstates[t.name] = np.asarray(t.float_data)

    # 5.
    sess = getTrainingSession(fn)
    fn = os.path.join(tmpdir, "withUpdatedAcclTensors_check.onnx")
    sess.modelToHost(fn)
    model = onnx.load(fn)
    for t in model.graph.initializer:
        if (popart.reservedAccumPrefix() in t.name
                or popart.reservedAccl1Prefix() in t.name
                or popart.reservedAccl2Prefix() in t.name
                or popart.reservedStepPrefix() in t.name):
            assert np.array_equal(optstates[t.name], np.asarray(t.float_data))
def test_adam_gradient_accumulation_model_proto(tmpdir):
    """
    Run the matmul graph with Adam and gradient accumulation for 0 and for 3
    steps, then inspect the initializers of the saved model: every weight
    must have matching accum, accl1, accl2 and step tensors, all of them zero
    when no step was run, and the step tensors must hold the number of
    batches run otherwise.
    """
    batches_per_step = 5
    for steps in [0, 3]:
        np.random.seed(1234)
        label_array = np.random.randint(0, hidden_size, batch_size)
        _, accl_proto_filename, _ = run_mm_graph(
            adam_optimizer,
            label_array=label_array,
            accum_factor=4,
            enable_accum=True,
            batches_per_step=batches_per_step,
            number_of_steps=steps,
            final_proto_filename=os.path.join(tmpdir, "accl5batches3steps"),
            enable_multi_ipu=False,
            full_anchorage=False)

        model = onnx.load(accl_proto_filename)
        initializers = model.graph.initializer

        weight_names = []
        accum_names = []
        accl1_names = []
        accl2_names = []
        step_names = []

        # Checked in this order; the first matching prefix wins.
        prefixed_groups = (
            (popart.reservedAccumPrefix(), accum_names),
            (popart.reservedAccl1Prefix(), accl1_names),
            (popart.reservedAccl2Prefix(), accl2_names),
            (popart.reservedStepPrefix(), step_names),
        )

        for name in [t.name for t in initializers]:
            for prefix, group in prefixed_groups:
                if prefix in name:
                    group.append(name)
                    break
            else:
                if "weight" in name:
                    weight_names.append(name)

        # Model should have 6 weight tensors
        num_weights = len(weight_names)
        assert num_weights == 6
        assert len(accum_names) == num_weights
        assert len(accl1_names) == num_weights
        assert len(accl2_names) == num_weights
        assert len(step_names) == num_weights

        tensor_mapping = {tensor.name: tensor for tensor in initializers}

        # Every weight needs one optimizer state tensor of each kind.
        for w_name in weight_names:
            assert popart.reservedAccumPrefix() + w_name in accum_names
            assert popart.reservedAccl1Prefix() + w_name in accl1_names
            assert popart.reservedAccl2Prefix() + w_name in accl2_names
            assert popart.reservedStepPrefix() + w_name in step_names

        if steps == 0:
            state_names = [
                *accum_names, *accl1_names, *accl2_names, *step_names
            ]
            for name in state_names:
                # All Adam states are initialized to zero
                assert np.allclose(tensor_mapping[name].float_data, 0.0)
        else:
            for name in step_names:
                # Steps counted correctly
                step_tensor = tensor_mapping[name]
                assert step_tensor.float_data[0] == steps * batches_per_step
def run_graph(optimizer, input_shape, initial_onnx_model, input_tensor_name,
              output_tensor_name, label_tensor_name, label_array, accum_factor,
              enable_accum, batches_per_step, number_of_steps,
              final_proto_filename, enable_multi_ipu, full_anchorage,
              inference_mode, explicit_loops):
    """Train `initial_onnx_model` for a number of steps and save the result.

    Args:
        optimizer: Optimizer passed to the training session.
        input_shape: Shape (list) of one input sample block; the outer
            batches-per-step / accumulation dimension is prepended here.
        initial_onnx_model: Model passed as `fnModel` to the session.
        input_tensor_name: Id of the data input tensor.
        output_tensor_name: Id of the output tensor; always anchored and
            also used as the loss.
        label_tensor_name: Id of the label input tensor.
        label_array: Labels for a single batch (numpy array).
        accum_factor: Gradient accumulation factor.
        enable_accum: Whether gradient accumulation is enabled.
        batches_per_step: Batches per step of the data flow.
        number_of_steps: Number of times `session.run` is called.
        final_proto_filename: File the trained model is written to. It is
            used exactly as given; no extension is appended.
        enable_multi_ipu: Use `num_ipus` IPUs with manual virtual graphs
            instead of a single device with virtual graphs off.
        full_anchorage: Additionally anchor the gradient (and, with
            `enable_accum`, the accumulator) of the first initializer.
        inference_mode: Also construct an InferenceSession with the same
            options before the training session.
        explicit_loops: Enable explicit main loops and the related options.

    Returns:
        Tuple of (final_proto_filename, anchor arrays of the session).
    """
    art = popart.AnchorReturnType("All")
    anchorNames = {output_tensor_name: art}

    if full_anchorage:
        w0 = onnx.load_from_string(
            initial_onnx_model).graph.initializer[0].name

        anchorNames[popart.reservedGradientPrefix() + w0] = art

        if enable_accum:
            anchorNames[popart.reservedAccumPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = enable_accum
    opts.accumulationFactor = accum_factor
    opts.enableOutlining = False

    if explicit_loops:
        opts.enableExplicitMainLoops = True
        opts.aliasZeroCopy = True
        opts.explicitRecomputation = True
        opts.useHostCopyOps = True

    # The device and the virtual graph mode are chosen together.
    if enable_multi_ipu:
        device = tu.create_test_device(numIpus=num_ipus,
                                       tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        device = tu.create_test_device(tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Off

    # only for test purposes, inference with gradient_accumulation should
    # never work
    if inference_mode:
        popart.InferenceSession(fnModel=initial_onnx_model,
                                dataFlow=popart.DataFlow(
                                    batches_per_step, anchorNames),
                                userOptions=opts,
                                deviceInfo=device)

    session = popart.TrainingSession(fnModel=initial_onnx_model,
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, anchorNames),
                                     deviceInfo=device,
                                     loss=output_tensor_name,
                                     optimizer=optimizer,
                                     userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()

    anchor_arrays = session.initAnchorArrays()

    # Fold batches-per-step and accumulation into one outer dimension of the
    # input; the labels are repeated / reshaped to match.
    outer_dim = 1
    if batches_per_step > 1:
        outer_dim *= batches_per_step
        label_array = np.repeat(label_array[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        outer_dim *= accum_factor
        label_array = label_array.reshape(
            [accum_factor * batches_per_step, -1])
    if outer_dim > 1:
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_tensor_name:
            (1.0 - xi * npr.rand(*input_shape)).astype(np.float32),
            label_tensor_name:
            label_array.astype(np.int32)
        }, anchor_arrays)

    # The same stepio (and therefore the same data) is used for every step.
    for _ in range(number_of_steps):
        session.run(stepio)

    session.modelToHost(final_proto_filename)

    return final_proto_filename, anchor_arrays
def test_tensor_replication():
    """
    This test will verify that a broadcasted input tensor is the same on
    all replicas.
    """
    anchorDict = {
        "ReplicationFactor": 2,
        # Accl factor must divide batch size
        "AccumulationFactor": 4,
        "Pipelining": False,  # True
        "ReturnType": "ALL"
    }
    label_array = np.ones([BATCH_SIZE]).astype(np.int32)

    micro_batch_size = BATCH_SIZE // (anchorDict["AccumulationFactor"] *
                                      anchorDict["ReplicationFactor"])

    builder = popart.Builder()
    input_shape = [micro_batch_size, CHANNELS, DATA_LEN, DATA_LEN]

    data_shape = popart.TensorInfo("FLOAT", input_shape)
    lbl_shape = popart.TensorInfo("INT32", [micro_batch_size])
    w = builder.addInitializedInputTensor(
        np.random.random_sample(input_shape).astype(np.float32))

    # The data input is created with the Broadcast replicated stream mode.
    settings = popart.InputSettings(popart.ReplicatedStreamMode.Broadcast)
    ip = builder.addInputTensor(data_shape, settings)
    lb = builder.addInputTensor(lbl_shape)

    a = builder.aiOnnx.matmul([ip, w])
    o = builder.reshape_const(
        builder.aiOnnx, [a],
        [micro_batch_size, CHANNELS * DATA_LEN * DATA_LEN])
    o = builder.aiOnnx.relu([o])
    o = builder.aiOnnx.softmax([o])
    nll = builder.aiGraphcore.nllloss([o, lb])

    GRAD = popart.reservedGradientPrefix() + w
    ACCL = popart.reservedAccumPrefix() + w
    art = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(BATCHES_PER_STEP, {
        o: art,
        a: art,
        ip: art,
        w: art,
        GRAD: art,
        ACCL: art
    })

    opts, device = return_options(anchorDict)

    if device is None:
        pytest.skip("Test needs to run on IPU, but none are available")

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=nll,
                                     optimizer=popart.ConstSGD(LEARNING_RATE),
                                     userOptions=opts,
                                     deviceInfo=device)

    session.prepareDevice()

    # The replication dimension is only added to the host arrays when the
    # input is not broadcast.
    if anchorDict["ReplicationFactor"] > 1 and settings.replicatedStreamMode(
    ) != popart.ReplicatedStreamMode.Broadcast:
        input_shape = [anchorDict["ReplicationFactor"]] + input_shape
        label_array = label_array.reshape(
            [anchorDict["ReplicationFactor"], -1])
    if anchorDict["AccumulationFactor"] > 1:
        input_shape = [anchorDict["AccumulationFactor"]] + input_shape
        label_array = label_array.reshape(
            [anchorDict["AccumulationFactor"], -1])
    if BATCHES_PER_STEP > 1:
        input_shape = [BATCHES_PER_STEP] + input_shape
        label_array = np.repeat(label_array[np.newaxis], BATCHES_PER_STEP, 0)

    anchors = session.initAnchorArrays()
    in_array = np.random.random_sample(input_shape).astype(np.float32)

    stepio = popart.PyStepIO({ip: in_array, lb: label_array}, anchors)
    session.weightsFromHost()

    session.run(stepio)

    # Compare the inputs anchored for replica 0 and replica 1 at the last
    # accumulation step of every batch.
    for batch in range(anchors[ip].shape[0]):
        in_0 = anchors[ip][batch, -1, 0, :, :, :, :]
        in_1 = anchors[ip][batch, -1, 1, :, :, :, :]
        assert np.allclose(in_0, in_1, equal_nan=False)
DATA_LEN = 5 ANCHOR_TYPES = { "ReplicationFactor": [1], # TODO: Enable replication once T12001 done. # Exception: Accl factor must divide batch size "AccumulationFactor": [4, 1], "Pipelining": [True, False], "ReturnType": ["FINAL", "ALL"] } # Learning rate 1 for easy comparison. LEARNING_RATE = 1.0 # Strings for the anchors. INPUT = "input" WEIGHTS = "init_input" ACTIVATION = "Reshape:0" GRADIENT = popart.reservedGradientPrefix() + WEIGHTS ACCUM = popart.reservedAccumPrefix() + WEIGHTS def dict_product(d): keys = d.keys() for element in itertools.product(*d.values()): yield dict(zip(keys, element)) def return_options(anchorDict): opts = popart.SessionOptions() if anchorDict["Pipelining"]: opts.enablePipelining = True if anchorDict["AccumulationFactor"] > 1: