def run_graph(optimizer, input_shape, initial_onnx_model, input_tensor_name,
              output_tensor_name, label_tensor_name, label_array, accum_factor,
              enable_accum, batches_per_step, number_of_steps,
              final_proto_filename, enable_multi_ipu, full_anchorage,
              inference_mode):
    """Train ``initial_onnx_model`` for ``number_of_steps`` steps and save it.

    Reads the module-level names ``num_ipus``, ``testTilesPerIPU``, ``xi``,
    ``npr`` and ``tu``.

    Args:
        optimizer: Optimizer handed to ``popart.TrainingSession``.
        input_shape: Shape (list) of one input; an outer dimension of
            ``batches_per_step * accum_factor`` is prepended when that
            product is greater than one.
        initial_onnx_model: Serialized ONNX model used as ``fnModel``.
        input_tensor_name: Id of the data input tensor.
        output_tensor_name: Id of the tensor that is anchored and used as
            the loss.
        label_tensor_name: Id of the label input tensor.
        label_array: Label array; repeated / reshaped to match the outer
            dimension before being fed.
        accum_factor: Value assigned to ``opts.accumulationFactor``.
        enable_accum: Value assigned to ``opts.enableGradientAccumulation``.
        batches_per_step: Batches per ``session.run`` call.
        number_of_steps: Number of ``session.run`` calls.
        final_proto_filename: Path passed to ``session.modelToHost``.
        enable_multi_ipu: Selects manual virtual graphs on ``num_ipus`` IPUs
            instead of a single device with virtual graphs off.
        full_anchorage: Also anchor the gradient of the first initializer
            and, when ``enable_accum`` is set, its accumulator tensors.
        inference_mode: Additionally construct an ``InferenceSession`` with
            the same options before the training session is built.

    Returns:
        Tuple ``(final_proto_filename, anchor_arrays)``.
    """
    art = popart.AnchorReturnType("All")
    anchorNames = {output_tensor_name: art}

    if full_anchorage:
        w0 = onnx.load_from_string(
            initial_onnx_model).graph.initializer[0].name
        anchorNames[popart.reservedGradientPrefix() + w0] = art
        if enable_accum:
            anchorNames[popart.reservedAcclPrefix() + w0] = art
            anchorNames[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = enable_accum
    opts.accumulationFactor = accum_factor
    opts.enableOutlining = False
    # The virtual graph mode is decided once here; the device branches below
    # only differ in how many IPUs they request.
    opts.virtualGraphMode = (popart.VirtualGraphMode.Manual
                             if enable_multi_ipu else
                             popart.VirtualGraphMode.Off)

    if enable_multi_ipu:
        device = tu.create_test_device(numIpus=num_ipus,
                                       tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
    else:
        device = tu.create_test_device(tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})

    # only for test purposes, inference with gradient_accumulation should
    # never work
    if inference_mode:
        popart.InferenceSession(fnModel=initial_onnx_model,
                                dataFlow=popart.DataFlow(
                                    batches_per_step, anchorNames),
                                userOptions=opts,
                                deviceInfo=device)

    session = popart.TrainingSession(fnModel=initial_onnx_model,
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, anchorNames),
                                     deviceInfo=device,
                                     loss=output_tensor_name,
                                     optimizer=optimizer,
                                     userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()
    anchor_arrays = session.initAnchorArrays()

    # Build the leading (batches_per_step * accum_factor) dimension for the
    # host-side buffers.
    outer_dim = 1
    if batches_per_step > 1:
        outer_dim *= batches_per_step
        label_array = np.repeat(label_array[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        outer_dim *= accum_factor
        label_array = label_array.reshape(
            [accum_factor * batches_per_step, -1])
    if outer_dim > 1:
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_tensor_name:
            (1.0 - xi * npr.rand(*input_shape)).astype(np.float32),
            label_tensor_name:
            label_array.astype(np.int32)
        }, anchor_arrays)

    for _ in range(number_of_steps):
        session.run(stepio)

    # The model is written to exactly the path the caller passed in, and the
    # same path is returned.
    session.modelToHost(final_proto_filename)
    return final_proto_filename, anchor_arrays
# Build an inference session for the model referenced by `export_name`.
# `export_name`, `n`, `nTracks`, `nFeatures` and `nHidden` are defined outside
# this snippet.
graph_transformer = popart.GraphTransformer(export_name)

# Register explicit shapes for the two inputs that are fed below.
inputShapeInfo = popart.InputShapeInfo()
inputShapeInfo.add("data",
                   popart.TensorInfo("FLOAT", [n, nTracks, nFeatures]))
inputShapeInfo.add("init_hc", popart.TensorInfo("FLOAT", [1, n, nHidden]))

# Anchor the tensor called "tag" and run one batch per step.
anchors = {"tag": popart.AnchorReturnType("ALL")}
dataFeed = popart.DataFlow(1, anchors)

# Alternative device kept for reference (IPU Model instead of an acquired
# device):
# device = popart.DeviceManager().createIpuModelDevice({})
device = popart.DeviceManager().acquireAvailableDevice(1)

session = popart.InferenceSession(
    graph_transformer.getModelProto(),
    dataFeed,
    device,
    inputShapeInfo=inputShapeInfo,
)
session.prepareDevice()
inferenceAnchors = session.initAnchorArrays()

# Random float32 data for "data" and zeros for "init_hc", using the same
# shapes that were registered above.
data_input = np.random.rand(n, nTracks, nFeatures).astype(np.float32)
init_hc = np.zeros([1, n, nHidden]).astype(np.float32)
stepio = popart.PyStepIO({
    "data": data_input,
    "init_hc": init_hc
}, inferenceAnchors)
def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):
    """Build the sin/exp/conv/softmax/nll model, run it and return anchors.

    Reads the module-level names ``batch_size`` and ``tu``.

    Args:
        doSharding: When not ``False`` the ops are placed on three virtual
            graphs and three IPUs are requested; otherwise one IPU is used.
        doPipelining: Value assigned to ``opts.enablePipelining``.
        batchesPerStep: Batches per ``session.run`` call.
        doTraining: ``True`` builds a ``TrainingSession`` (ConstSGD 0.01),
            anything else an ``InferenceSession``.
        doGradAccl: Value assigned to ``opts.enableGradientAccumulation``.
        gradAcclFactor: Value assigned to ``opts.accumulationFactor``; also
            divides ``batch_size`` to obtain the micro batch size.
        doProfiling: When ``True`` a report is saved through ``gcprofile``.
        doDevicex: When ``False`` the function returns ``None`` right after
            the session has been constructed.
        anchorRestoredTensors: With training and pipelining both ``True``,
            also anchor the restored versions of ``e0`` and the input.
        returnRawInput: When ``True`` the fed input data is added to the
            result under the key ``"input_raw"``.
        labelArray: Not used by this function.

    Returns:
        The anchor dictionary from ``session.initAnchorArrays()`` (filled by
        six ``session.run`` calls), or ``None`` when ``doDevicex`` is
        ``False``.
    """
    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")
    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Integer division keeps `high` an integer for randint, matching how the
    # sibling get_model_anchors computes its class count.
    classes = np.prod(shape_d0) // (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between
        # examples.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide up the batches per step batches into
        # gradAcclFactor * batchesPerStep samples.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()
    for _ in range(6):
        session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
def build_and_run_graph(data_size):
    """Run an add/mul graph once through callback I/O and time each output.

    Args:
        data_size: Length of the two 1-D float input vectors.

    Returns:
        Dict mapping each output tensor id to the interval reported by the
        timer when that output's completion callback fired.
    """
    # Graph definition: two inputs, their sum and their product.
    builder = popart.Builder()
    vector_info = popart.TensorInfo("FLOAT", [data_size])
    id_a = builder.addInputTensor(vector_info)
    id_b = builder.addInputTensor(vector_info)
    sum_id = builder.aiOnnx.add([id_a, id_b])
    prod_id = builder.aiOnnx.mul([id_a, id_b])

    # Both results are outputs and are anchored for every batch.
    for out_id in (sum_id, prod_id):
        builder.addOutputTensor(out_id)
    data_flow = popart.DataFlow(
        1, {out_id: popart.AnchorReturnType("ALL")
            for out_id in (sum_id, prod_id)})

    # Inference session on an IPU Model device.
    session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=data_flow,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}),
    )
    session.prepareDevice()

    # Host-side input and output buffers.
    inputs = {
        id_a: np.random.rand(data_size).astype(np.float32),
        id_b: np.random.rand(data_size).astype(np.float32),
    }
    anchors = session.initAnchorArrays()

    timer = PerfIntervalTimer()
    rtts = {}

    def on_input(tensor_id, is_prefetch: bool):
        # Prefetch requests get no data; the first real request resets the
        # timer if it has not been set yet.
        if is_prefetch:
            return None
        if timer.not_set():
            timer.reset()
        return inputs[tensor_id]

    def on_input_complete(tensor_id):
        # Nothing to do once an input buffer has been consumed.
        return None

    def on_output(tensor_id):
        # Hand out the anchor buffer the result should be written into.
        return anchors[tensor_id]

    def on_output_complete(tensor_id):
        # The output buffer has been filled: record the elapsed interval.
        rtts[tensor_id] = timer.interval()

    stepio = popart.PyStepIOCallback(
        on_input,
        on_input_complete,
        on_output,
        on_output_complete,
    )

    session.run(stepio, 'AddAndMulCallback')
    print(rtts)
    return rtts
def test_train_then_infer_via_file():
    """Train a conv+relu model, save it, and run inference with those weights.

    The trained model is written to a temporary directory (removed when the
    test finishes) and loaded back into an already-compiled inference session
    through ``resetHostWeights``.
    """
    import os
    import tempfile

    builder = popart.Builder()

    input_shape = popart.TensorInfo("FLOAT", [1, 2, 4, 4])
    weight_data = np.ones([3, 2, 3, 3], np.float32)
    # Named input_id to avoid shadowing the builtin `input`.
    input_id = builder.addInputTensor(input_shape)
    weights = builder.addInitializedInputTensor(weight_data)
    act = builder.aiOnnx.conv([input_id, weights],
                              dilations=[1, 1],
                              pads=[1, 1, 1, 1],
                              strides=[1, 1])
    o = builder.aiOnnx.relu([act])
    l1 = builder.aiGraphcore.l1loss([o], 0.1)

    anchor_names = [
        o,
        popart.reservedGradientPrefix() + input_id,
        popart.reservedGradientPrefix() + weights
    ]
    training_dataFlow = popart.DataFlow(
        1, {name: popart.AnchorReturnType("All")
            for name in anchor_names})

    opts = popart.SessionOptions()
    opts.constantWeights = False  # Allow the weights to be updated

    # ----------------------------------------------
    # Create the device
    device = tu.create_test_device(1, opts={"compileIPUCode": True})
    device.attach()

    # ----------------------------------------------
    # Prepare the input data
    input_data = np.ones(input_shape.shape(), dtype=np.float32)

    # ----------------------------------------------
    # Prepare the Inference session
    inference_dataFlow = popart.DataFlow(1,
                                         {o: popart.AnchorReturnType("All")})
    inference_session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=inference_dataFlow,
        userOptions=opts,
        deviceInfo=device)

    # Compile the inference graph
    inference_session.prepareDevice()

    # ----------------------------------------------
    # Prepare the Training session
    training_session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                              dataFlow=training_dataFlow,
                                              loss=l1,
                                              optimizer=popart.ConstSGD(0.01),
                                              userOptions=opts,
                                              deviceInfo=device)

    # Compile the training graph
    training_session.prepareDevice()

    # ----------------------------------------------
    # Run the training session
    training_session.weightsFromHost()
    training_anchors = training_session.initAnchorArrays()
    training_inputs = {input_id: input_data}

    for _ in range(4):
        training_session.run(
            popart.PyStepIO(training_inputs, training_anchors))

    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path = os.path.join(tmp_dir, "test.onnx")

        # Save the trained weights
        training_session.modelToHost(model_path)

        # ----------------------------------------------
        # Run the inference session
        ## Load the updated weights from the training session
        inference_session.resetHostWeights(model_path)

    inference_session.weightsFromHost()
    inference_anchors = inference_session.initAnchorArrays()
    inference_inputs = {input_id: input_data}
    inference_session.run(
        popart.PyStepIO(inference_inputs, inference_anchors))
def sparse_mm_infer(sparse_mm_type, lhs_dims, vanilla_rhs_dims, block_size,
                    sparsity_level, transpose_rhs, memory_cycle_ratio,
                    inner_group_size):
    """Run one ``BSMatMul`` custom op on an IPU and build a host reference.

    Reads the module-level names ``g_sparseMatMulTypeLookup``,
    ``g_input_data_type``, ``g_output_data_type``, ``g_pp_data_type``,
    ``create_dense_matrix``, ``create_sparse_matrix`` and ``mm``.

    Args:
        sparse_mm_type: One of the ``g_sparseMatMulTypeLookup`` values for
            ``'DENSE_LHS_SPARSE_RHS_DENSE_OUT'`` or
            ``'DENSE_LHS_DENSE_RHS_SPARSE_OUT'``.
        lhs_dims: Dimensions (list) of the dense left-hand operand.
        vanilla_rhs_dims: Dimensions of the dense form of the right-hand
            operand.
        block_size: Three block sizes; ``block_size[1:]`` is used for a
            sparse RHS, ``[block_size[0], block_size[2]]`` for a sparse
            output.
        sparsity_level: Forwarded to ``create_sparse_matrix``.
        transpose_rhs: Forwarded to the op; also swaps the last two entries
            of ``matrix_dims`` and transposes the reference RHS.
        memory_cycle_ratio: Forwarded to the op as an attribute.
        inner_group_size: Forwarded to the op as an attribute.

    Returns:
        Tuple ``(ipuOutput, goldOutput)``: the anchored op output and the
        reference computed on the host with ``mm``.

    Raises:
        ValueError: If ``sparse_mm_type`` is neither of the two supported
            types.
    """
    dense_out_type = g_sparseMatMulTypeLookup['DENSE_LHS_SPARSE_RHS_DENSE_OUT']
    sparse_out_type = g_sparseMatMulTypeLookup[
        'DENSE_LHS_DENSE_RHS_SPARSE_OUT']
    if sparse_mm_type not in (dense_out_type, sparse_out_type):
        # Previously this fell through and failed later on an unbound `rhs`.
        raise ValueError(
            "Unsupported sparse_mm_type: {}".format(sparse_mm_type))

    if transpose_rhs:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-1], vanilla_rhs_dims[-2]
        ]
    else:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-2], vanilla_rhs_dims[-1]
        ]

    lhs = create_dense_matrix(lhs_dims)
    if sparse_mm_type == dense_out_type:
        bsr_rhs, lengths_per_2d_plane, vanilla_rhs, sparsity_mask = \
            create_sparse_matrix(vanilla_rhs_dims, block_size[1:],
                                 sparsity_level)
        rhs = bsr_rhs
        rhs_dims = bsr_rhs.shape
    else:
        # The slice copies, so appending does not modify the caller's list.
        output_dims = lhs_dims[:-1]
        output_dims.append(vanilla_rhs_dims[-1])
        output_block_size = [block_size[0], block_size[2]]
        bsr_output, lengths_per_2d_plane, _, sparsity_mask = \
            create_sparse_matrix(output_dims, output_block_size,
                                 sparsity_level)
        rhs_dims = vanilla_rhs_dims
        rhs = create_dense_matrix(rhs_dims)

    # Create a builder and construct a graph
    builder = popart.Builder()
    lhs_tensorInfo = popart.TensorInfo("FLOAT", lhs_dims)
    rhs_tensorInfo = popart.TensorInfo("FLOAT", rhs_dims)
    lhsTensor = builder.addInputTensor(lhs_tensorInfo)
    rhsTensor = builder.addInputTensor(rhs_tensorInfo)
    outTensor = builder.customOp(
        opName="BSMatMul",
        opVersion=1,
        domain="ai.graphcore",
        inputs=[lhsTensor, rhsTensor],
        attributes={
            "bsr_rhs_lengths_per_2d_plane": lengths_per_2d_plane.tolist(),
            "matrix_dims": matmul_dims,
            "block_size": block_size,
            "sparsity_mask": sparsity_mask.tolist(),
            "bsmatmul_type": sparse_mm_type,
            "transpose_rhs": transpose_rhs,
            "memory_cycle_ratio": memory_cycle_ratio,
            "inner_group_size": inner_group_size,
            "in_type": g_input_data_type,
            "out_type": g_output_data_type,
            "pp_type": g_pp_data_type
        })[0]
    builder.addOutputTensor(outTensor)
    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1, {outTensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    rhs = np.array(rhs, dtype=g_input_data_type)
    stepio = popart.PyStepIO({lhsTensor: lhs, rhsTensor: rhs}, anchors)
    session.run(stepio)
    ipuOutput = anchors[outTensor]

    if sparse_mm_type == dense_out_type:
        if transpose_rhs:
            # Swap the last two axes of the dense RHS before the reference
            # matmul.
            transpose_indices = list(range(len(vanilla_rhs_dims)))
            transpose_indices[-2], transpose_indices[-1] = \
                transpose_indices[-1], transpose_indices[-2]
            vanilla_rhs = vanilla_rhs.transpose(tuple(transpose_indices))
        goldOutput = mm(lhs, vanilla_rhs)
    else:
        assert len(lhs.shape) == len(rhs.shape)
        if len(lhs.shape) == 2:
            lhs = np.expand_dims(lhs, 0)
            rhs = np.expand_dims(rhs, 0)

        mmOutput = mm(lhs, rhs)

        totalGroupDims = int(np.prod(lhs_dims[:-2]))
        num_rows_sparsity_mask_2d = output_dims[-2] // block_size[0]
        num_cols_sparsity_mask_2d = output_dims[-1] // block_size[2]

        assert sparsity_mask.shape == (totalGroupDims *
                                       num_rows_sparsity_mask_2d *
                                       num_cols_sparsity_mask_2d, )

        mmOutput = mmOutput.reshape(
            (totalGroupDims, lhs_dims[-2], rhs_dims[-1]))

        # Number of mask entries belonging to one 2-D plane.
        offset = num_rows_sparsity_mask_2d * num_cols_sparsity_mask_2d
        block_elems = block_size[0] * block_size[2]

        goldOutput = []
        for dim in range(totalGroupDims):
            mmOutput_2d = mmOutput[dim]
            sliced_sparsity_mask = sparsity_mask[dim * offset:dim * offset +
                                                 offset]

            # Keep only the blocks whose mask entry is set, flattened in
            # mask order.
            for sparsity_mask_idx, keep in enumerate(sliced_sparsity_mask):
                if not keep:
                    continue
                row_start = (sparsity_mask_idx //
                             num_cols_sparsity_mask_2d) * block_size[0]
                row_end = row_start + block_size[0]
                col_start = (sparsity_mask_idx %
                             num_cols_sparsity_mask_2d) * block_size[2]
                col_end = col_start + block_size[2]
                mmOutput_2d_sliced = mmOutput_2d[row_start:row_end,
                                                 col_start:col_end]
                goldOutput.append(mmOutput_2d_sliced.reshape(block_elems))

        goldOutput = np.array(goldOutput)

    return ipuOutput, goldOutput
def run(transposed):
    """Build and run a three-matmul graph with batch serialization factor 4.

    Args:
        transposed: When truthy the input is transposed, the weights are the
            left matmul operand, and the accumulator is initialised with the
            batch dimension last (axis 2) instead of first (axis 0).
    """
    batch, width = 8, 10
    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch, width, width]))

    # Explicitly specify the batch dimension for init
    if transposed:
        acc_shape, batch_axis = [width, width, batch], 2
    else:
        acc_shape, batch_axis = [batch, width, width], 0
    acc = builder.aiGraphcore.init(acc_shape, popart.DataType.FLOAT,
                                   popart.InitType.Zero, batch_axis)

    def add_layer(in_id):
        # One matmul against a fresh all-ones weight; operand order depends
        # on `transposed`.
        weight = builder.addInitializedInputTensor(
            np.ones([width, width], np.float32))
        operands = [weight, in_id] if transposed else [in_id, weight]
        return builder.aiOnnx.matmul(operands)

    layer_out = builder.aiOnnx.transpose([ip]) if transposed else ip

    # Three chained layers; each layer's output is added to the accumulator.
    for _ in range(3):
        layer_out = add_layer(layer_out)
        acc = builder.aiOnnx.add([acc, layer_out])

    out = builder.aiGraphcore.l1loss([acc], 0.1)
    builder.addOutputTensor(out)

    device = tu.create_test_device(1)

    opts = popart.SessionOptions()
    opts.enableOutlining = True
    opts.batchSerializationSettings.factor = 4

    session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(1, {out: popart.AnchorReturnType("All")}),
        patterns=popart.Patterns(popart.PatternsLevel.All),
        userOptions=opts,
        deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    feed = {ip: np.ones((batch, width, width), dtype=np.float32)}
    session.run(popart.PyStepIO(feed, anchors))
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False,
           execution_mode: str = 'DEFAULT',
           replication_factor: int = 1,
           replicated_weight_sharding: bool = False,
           num_reps: int = 1):
    """Compile ``proto`` with PopART, run it and return the anchored outputs.

    Args:
        proto: Model passed as ``fnModel`` to the session.
        data: Input arrays keyed by tensor id.
        outputs: Tensor id(s) to anchor; normalised with ``make_tuple``.
        loss: Loss tensor id (training sessions only).
        optimizer: When given a ``TrainingSession`` is built, otherwise an
            ``InferenceSession``.
        patterns: Patterns forwarded to the session.
        return_stats: Enable instrumentation engine options and return
            memory / cycle statistics as well.
        log_dir: Directory for ``gcprofile`` reports when ``return_stats``
            is set.
        ipus: Number of IPUs; forced to 2 in ``'PHASED'`` mode and to 1 when
            ``None`` or not greater than 1 otherwise.
        batches_per_step: Batches per ``session.run`` call.
        user_options: Extra ``SessionOptions`` attributes. The keys
            ``"batchSerializationFactor"`` and ``"executionPhases"`` are
            only read in ``'PHASED'`` mode and are not set as attributes.
        skip_execution: Detach the device and return the uncompiled session.
        execution_mode: ``'PHASED'`` selects the execution-phases options;
            any other value selects the default option set.
        replication_factor: Replicated graph count; also multiplies the
            number of IPUs requested.
        replicated_weight_sharding: Turn replicated tensor sharding on for
            weights and optimizer state.
        num_reps: Number of ``session.run`` calls.

    Returns:
        ``session`` when ``skip_execution`` is set. Otherwise
        ``(outputs_gen, post_proto)`` or, with ``return_stats``,
        ``(outputs_gen, post_proto, total_memory, max_tile_memory, cycles)``
        where ``outputs_gen`` is a generator over the anchored arrays.

    Raises:
        Exception: If no device could be acquired.
        popart.session.OutOfMemoryException: Re-raised from
            ``prepareDevice`` after the device has been detached.
    """
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    if replicated_weight_sharding:
        # These used to be bare attribute accesses (`....On`) with no
        # effect; assign the enum value so the setting is actually applied.
        weight_location = options.weightTensorLocationSettings.location
        weight_location.replicatedTensorSharding = \
            popart.ReplicatedTensorSharding.On
        state_location = options.optimizerStateTensorLocationSettings.location
        state_location.replicatedTensorSharding = \
            popart.ReplicatedTensorSharding.On
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor
    if execution_mode == 'PHASED':
        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.batchSerializationSettings.factor = user_options[
            "batchSerializationFactor"]
        options.executionPhaseSettings.phases = user_options["executionPhases"]
        ipus = 2
    else:
        options.enableGroupedMatmuls = False
        options.enableStochasticRounding = False
        options.constantWeights = True
        options.outlineThreshold = 10.0
        if ipus is not None and ipus > 1:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual
        else:
            ipus = 1

    # Everything else in user_options maps directly onto SessionOptions.
    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true",
            "opt.internalExchangeOptimisationTarget": "balanced",
        }

    # Round the IPU count up to a power of two, then scale by replication.
    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    request_ipus *= replication_factor

    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        request_ipus,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    # Compile the Poplar Graph. If it fails, return the memory stats
    try:
        session.prepareDevice()
    except popart.session.OutOfMemoryException as e:
        if return_stats and log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            gcprofile.save_popart_report(session,
                                         log_dir=log_dir,
                                         exception=e)
        device.detach()
        raise
    print("Compilation complete")

    session.weightsFromHost()
    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a gradient accumulation factor dimension if needed
    af = user_options.get("accumulationFactor")
    if af is not None and af > 1:
        data = {k: np.repeat(v[np.newaxis], af, 0) for k, v in data.items()}

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], batches_per_step, 0)
            for k, v in data.items()
        }

    for _ in range(num_reps):
        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

    # Round-trip the model through a temporary file to get the post-run
    # proto.
    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
input_tensor = builder.addInputTensor(popart.TensorInfo("FLOAT", [input_len])) print("Shape of {}: {}".format(input_tensor, builder.getTensorShape(input_tensor))) output_tensor = builder.customOp(opName = "Rsqrt", opVersion = 1, domain = "ai.graphcore", inputs =[input_tensor], attributes = {})[0] print("Inputs: {}".format(builder.getInputTensorIds())) print("Outputs: {}".format(builder.getOutputTensorIds())) print("Values: {}".format(builder.getValueTensorIds())) print("Shape of {}: {}".format(output_tensor, builder.getTensorShape(output_tensor))) builder.addOutputTensor(output_tensor) proto = builder.getModelProto() anchors = {output_tensor : popart.AnchorReturnType("FINAL") } dataFlow = popart.DataFlow(1, anchors) if run_on_ipu : device = popart.DeviceManager().acquireAvailableDevice(1) print("IPU hardware device acquired") else : device = popart.DeviceManager().createIpuModelDevice({}) print("Running on IPU Model") session = popart.InferenceSession(proto, dataFlow, device) session.prepareDevice() result = session.initAnchorArrays() X =(np.array(input_data)).astype(np.float32) print("X={}".format(X)) stepio = popart.PyStepIO({input_tensor : X }, result) session.run(stepio) return result def load_custom_ops_lib() : so_path = os.path.join(os.path.dirname(__file__), "build/custom_ops.so") if not os.path.isfile(so_path) : print("Build the custom ops library with `make` before running this script") exit(1) ctypes.cdll.LoadLibrary(so_path)
def run(benchmark, opts):
    """Compile and execute a benchmark graph, reporting timings.

    Args:
        benchmark: Object providing ``graph_builder(opts)`` (returning
            ``proto, data, outputs, losses, optimizer``) and
            ``iteration_report(opts, duration)``.
        opts: Parsed options; the attributes read here are ``save_graph``,
            ``batches_per_step``, ``use_data``, ``report_hw_cycle_count``,
            ``report``, ``convolution_options``, ``shards``,
            ``auto_sharding``, ``pipeline``, ``simulation``, ``mode``,
            ``tensor_tile_mapping`` and ``steps``.

    Returns:
        ``(compilation_duration, average_batches_per_sec)``, or the result of
        ``save_reports(opts, session)`` after the first step when
        ``opts.report`` is set.

    Raises:
        OSError: If no IPU device could be acquired.
    """
    proto, data, outputs, losses, optimizer = benchmark.graph_builder(opts)

    if opts.save_graph:
        with open('model.onnx', "wb") as f:
            f.write(proto)
        print("Written to file: model.onnx")

    dataFlow = popart.DataFlow(opts.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if not opts.use_data:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = opts.report_hw_cycle_count
    options.engineOptions = {
        "debug.instrumentCompute": "true" if opts.report else "false"
    }
    if opts.convolution_options:
        options.convolutionOptions = json.loads(opts.convolution_options)

    if opts.shards > 1:
        if opts.auto_sharding:
            options.virtualGraphMode = popart.VirtualGraphMode.Auto
        else:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual

    options.enablePipelining = opts.pipeline

    # Select a device
    deviceManager = popart.DeviceManager()
    if opts.simulation:
        deviceOptions = {"compileIPUCode": True,
                         'numIPUs': opts.shards,
                         "tilesPerIPU": 1216}
        device = deviceManager.createIpuModelDevice(deviceOptions)
    else:
        device = deviceManager.acquireAvailableDevice(opts.shards)
        if device is None:
            raise OSError("Failed to acquire IPU.")

    if opts.mode == 'train':
        session = popart.TrainingSession(fnModel=proto,
                                         loss=losses,
                                         deviceInfo=device,
                                         optimizer=optimizer,
                                         dataFlow=dataFlow,
                                         userOptions=options)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=dataFlow,
                                          userOptions=options)

    print("Compiling...")
    # perf_counter is monotonic and high resolution, unlike the wall clock,
    # so intervals cannot go backwards on a clock adjustment.
    start = time.perf_counter()
    session.prepareDevice()
    compilation_duration = time.perf_counter() - start
    print("Duration: {:.3f} seconds\n".format(compilation_duration))

    if opts.tensor_tile_mapping:
        with open("tile_mapping.json", 'w') as f:
            json.dump(session.getTensorTileMap(), f)
        print("Written to file: tile_mapping.json")

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    # Copy weights and optimization parameters onto the device
    session.weightsFromHost()

    # Add a batches_per_step dimension if needed
    if opts.batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], opts.batches_per_step, 0)
            for k, v in data.items()
        }

    stepio = popart.PyStepIO(data, anchors)

    print("Executing...")
    average_batches_per_sec = 0
    # Steps
    for _ in range(opts.steps):
        # Run
        start = time.perf_counter()
        session.run(stepio)
        duration = time.perf_counter() - start

        # In report mode only the first step is executed.
        if opts.report:
            return save_reports(opts, session)

        average_batches_per_sec += (opts.batches_per_step /
                                    duration) / opts.steps
        report_string = "{:<8.3} sec/itr.".format(duration)
        report_string += " " + benchmark.iteration_report(opts, duration)
        print(report_string)

    if opts.report_hw_cycle_count:
        print("Hardware cycle count per 'run':", session.getCycleCount())

    return compilation_duration, average_batches_per_sec
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      replicated_graph_count=1,
                      doProfiling=False,
                      doDropout=False,
                      doGradientAccl=False,
                      acclSteps=1,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build the sin/exp/conv/softmax/nll model, run one step, return anchors.

    Args:
        doSharding: When not ``False`` the ops are split over two virtual
            graphs and two IPUs per replica are requested.
        doPipelining: Value assigned to ``opts.enablePipelining``.
        batchesPerStep: Batches per ``session.run`` call.
        doTraining: ``True`` builds a ``TrainingSession`` (ConstSGD 0.01),
            anything else an ``InferenceSession``.
        replicated_graph_count: Number of replicas; enables replicated
            graphs when greater than one.
        doProfiling: When ``True`` a report is saved through ``gcprofile``.
        doDropout: Insert a dropout (ratio 0.2) before the softmax.
        doGradientAccl: Value assigned to ``opts.enableGradientAccumulation``.
        acclSteps: Value assigned to ``opts.accumulationFactor``; also
            divides the batch size of 16 to obtain the micro batch size.
        doDevicex: When ``False`` the function returns ``None`` right after
            the session has been constructed.
        anchorRestoredTensors: With training and pipelining both ``True``,
            also anchor the restored versions of ``e0`` and the input.
        returnRawInput: When ``True`` the fed input data is added to the
            result under the key ``"input_raw"``.

    Returns:
        The anchor dictionary from ``session.initAnchorArrays()`` after one
        ``session.run``, or ``None`` when ``doDevicex`` is ``False``.
    """
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 16
    microBatchSize = batchSize // acclSteps

    shape_d0 = [microBatchSize, 2, 4, 4]
    shape_l0 = [microBatchSize]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    w0 = builder.addInitializedInputTensor(
        np.ones(shape=[2, 2, 3, 3]).astype(np.float32))
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [microBatchSize, 32])

    # Optional dropout sits between the reshape and the softmax.
    softmax_input = r0
    if doDropout:
        do0 = builder.aiOnnx.dropout([r0], num_outputs=1, ratio=0.2)[0]
        softmax_input = do0
    out = builder.aiOnnx.softmax([softmax_input], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0],
                                      reduction=popart.ReductionType.Sum)

    art = popart.AnchorReturnType("All")
    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradientAccl
    opts.accumulationFactor = acclSteps
    opts.enableStochasticRounding = False

    if doSharding is False:
        numIpus = 1 * replicated_graph_count
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIpus = 2 * replicated_graph_count
        for tensor_id in (s0, e0, c0):
            builder.virtualGraph(tensor_id, 0)
        builder.virtualGraph(r0, 1)
        if doDropout:
            builder.virtualGraph(do0, 1)
        builder.virtualGraph(out, 1)
        builder.virtualGraph(nll, 1)

    if replicated_graph_count > 1:
        opts.replicatedGraphCount = replicated_graph_count
        opts.enableReplicatedGraphs = True

    device = tu.create_test_device(numIpus=numIpus)
    if doTraining is True:
        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=popart.DataFlow(
                                             batchesPerStep, anchor_map),
                                         loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         userOptions=opts,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                          dataFlow=popart.DataFlow(
                                              batchesPerStep, anchor_map),
                                          userOptions=opts,
                                          deviceInfo=device)

    if doDevicex is False:
        return None

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    # With all options enabled return anchors are of the shape:
    # [batches_per_step, accl_factor, repl_factor, micro_batch, *data_shape]
    data_shape = list(shape_d0)
    if acclSteps > 1:
        data_shape = [acclSteps] + data_shape
        label = label.reshape([acclSteps, -1])
    if batchesPerStep > 1:
        data_shape = [batchesPerStep] + data_shape
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)

    data = np.random.random_sample(data_shape).astype(np.float32)

    # This is a slightly odd case - we want the same data to be input for
    # both replicated graphs, but the dimension we need to repeat on is
    # either the first or second (the replication dimension) depending on
    # whether we have gradient accumulation enabled.
    # If we are not testing, this is a lot simpler as we can split samples
    # however we want.
    if replicated_graph_count > 1:
        repeat_axis = 2 if acclSteps > 1 else 1
        data = np.repeat(data[np.newaxis], replicated_graph_count,
                         repeat_axis)
        label = label.reshape([replicated_graph_count, -1])

    stepio = popart.PyStepIO({d0: data, l0: label}, anchors)
    stepio.enableRuntimeAsserts(False)

    session.weightsFromHost()
    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
opts.executionPhaseSettings.stages = 2 opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases opts.numIOTiles = 128 varLocation = popart.TensorLocation() varLocation.storage = popart.TensorStorage.OffChip varLocation.loadTileSet = popart.TileSet.IO varLocation.storageTileSet = popart.TileSet.IO opts.weightTensorLocationSettings.location = varLocation else: opts.virtualGraphMode = popart.VirtualGraphMode.Manual print("Compiling.") session = popart.InferenceSession(fnModel=proto, dataFlow=popart.DataFlow( args.batches_per_step, anchor_map), userOptions=opts, deviceInfo=device) session.prepareDevice() session.weightsFromHost() anchors = session.initAnchorArrays() print("Running.") for i in range(args.iters): input_data = np.random.rand(args.batches_per_step, args.batch_size, args.dsize, args.dsize).astype(dtype) stepio = popart.PyStepIO({input_id: input_data}, anchors) start = time.time() session.run(stepio) duration = time.time() - start
timePerEvent = elapsed / n print("{:.12f}".format(elapsed)) else: # POPART IMPORT graph_transformer = popart.GraphTransformer(onnx_model) anchors = {"tag": popart.AnchorReturnType("ALL")} dataFeed = popart.DataFlow(1, anchors) device = popart.DeviceManager().acquireAvailableDevice(1) session = popart.InferenceSession( graph_transformer.getModelProto(), dataFeed, device ) session.prepareDevice() inferenceAnchors = session.initAnchorArrays() inputs = np.random.rand(n, nFeatures, nTracks, 1).astype(np.float32) stepio = popart.PyStepIO({"data": inputs}, inferenceAnchors) # for i in range(10): session.run(stepio) start = time.perf_counter() session.run(stepio)
def run(self,
        init_builder,
        reference,
        step_type='infer',
        opsets=None,
        optimizer=popart.ConstSGD(0.01),
        seed=None):
    """Build a model, run it for one step and compare against a reference.

    Args:
        init_builder: Callable invoked as ``init_builder(bld, **kwargs)``.
            It populates the builder and returns the list of tensor ids
            that should be verified.
        reference: Callable that receives a ``RefData`` built from the
            builder outputs and the anchor arrays, and returns one
            expected value per id returned by ``init_builder`` (``None``
            means "do not check this one").
        step_type: Either ``'infer'`` or ``'train'``.
        opsets: Forwarded to ``Builder``.
        optimizer: Optimizer handed to the training session.
        seed: When not ``None``, passed to ``session.setRandomSeed``.

    Returns:
        The session that was created and run.

    Raises:
        Exception: If an input array is not C-contiguous, or if the
            reference returns a value of an unsupported type.
    """
    assert step_type in ('infer', 'train')

    bld = Builder(opsets=opsets, check_model=self.check_model)

    anchors = {}

    # Allows to pass additional arguments to init_builder, if required
    # by the specific init_builder function implementation.
    kwargs = {}
    kwargs = tu.filter_dict(kwargs, init_builder)
    anchorIds = init_builder(bld, **kwargs)

    # Initialised inputs (weights) are not anchored; they are read back
    # explicitly with readWeights further down.
    for anchorId in anchorIds:
        if anchorId not in bld._init_input_map:
            anchors[anchorId] = popart.AnchorReturnType("All")

    dataFlow = popart.DataFlow(1, anchors)

    self.options.logDir = self.logging_dir

    if self.tilesPerIPU is not None:
        device = tu.create_test_device(numIpus=self.numIPUs,
                                       tilesPerIPU=self.tilesPerIPU)
        print(f"Created device {device} with {self.numIPUs}"
              f" IPUs and {self.tilesPerIPU} tiles per IPU")
    else:
        device = tu.create_test_device(numIpus=self.numIPUs)
        print(f"Created device {device} with {self.numIPUs} IPUs")

    self.patterns.InPlace = self.inplacing
    if step_type == 'infer':
        session = popart.InferenceSession(fnModel=bld.getModelProto(),
                                          dataFlow=dataFlow,
                                          deviceInfo=device,
                                          patterns=self.patterns,
                                          userOptions=self.options)
    else:
        assert step_type == 'train'
        # Apply reduction to output (assumed to be the
        # first anchorId) to ensure it is scalar
        lossId = anchorIds[0]
        lossId = bld.aiGraphcore.identityloss(
            [lossId], reduction=self.lossReduction)

        session = popart.TrainingSession(fnModel=bld.getModelProto(),
                                         dataFlow=dataFlow,
                                         loss=lossId,
                                         optimizer=optimizer,
                                         deviceInfo=device,
                                         patterns=self.patterns,
                                         userOptions=self.options)

    anchor_map = session.initAnchorArrays()

    session.prepareDevice()

    if seed is not None:
        session.setRandomSeed(seed)

    for k, v in bld._input_map.items():
        if not v.flags['C_CONTIGUOUS']:
            # need to call np.ascontiguousarray
            # `x = np.ascontiguousarray(x)`
            raise Exception(
                'Input "{}" to popart.PyStepIO is not C_CONTIGUOS'.
                format(k))

    # Add the replication dimension to the inputs
    inputs = {}
    for k, v in bld._input_map.items():
        if self.options.replicatedGraphCount > 1:
            um = (self.options.replicatedGraphCount, )
            um = um + tuple([1] * np.ndim(v))

            # we add this offset to ensure that samples on devices are distinct
            offset = 1 * np.arange(
                self.options.replicatedGraphCount).astype(
                    v.dtype).reshape(um)

            inputs[k] = np.tile(v, um) + offset
        else:
            inputs[k] = v

    stepio = popart.PyStepIO(inputs, anchor_map)

    if step_type == 'train':
        session.weightsFromHost()

    session.run(stepio)

    if step_type == 'train':
        session.weightsToHost()

    ref_out = reference(RefData(bld._outputs, anchor_map))

    # Reference values of these types are compared as they are.
    passthrough_types = (np.ndarray, np.float32, np.float16, np.int32)

    def fix_type(t):
        """Convert a reference value to something verifyTensor accepts."""
        if isinstance(t, torch.Tensor):
            return t.data.numpy()
        if t is None or isinstance(t, passthrough_types):
            return t
        raise Exception('unexpected type', type(t))

    ref_out = [fix_type(i) for i in ref_out]
    for index, key in enumerate(anchorIds):
        if key in anchors:
            if ref_out[index] is not None:
                print('Testing anchor "{}"...'.format(key))
                self.verifyTensor(anchor_map[key], ref_out[index])
            else:
                print('Not Testing anchor "{}" as it is None'.format(key))
        elif key in bld._init_input_map:
            if ref_out[index] is not None:
                print('Testing weight "{}"...'.format(key))
                weightInfo = session.getInfo(key)
                # The placeholders were previously printed verbatim
                # because the arguments were passed to print() instead
                # of being formatted into the message.
                print('Weight info shape:{} type:{}'.format(
                    weightInfo.shape(), weightInfo.data_type_lcase()))
                weights = {}
                weights[key] = np.empty(
                    shape=weightInfo.shape(),
                    dtype=weightInfo.data_type_lcase())
                weightsIo = popart.PyWeightsIO(weights)
                session.readWeights(weightsIo)

                self.verifyTensor(weights[key], ref_out[index])
            else:
                print('Not Testing weight "{}" as it is None'.format(key))

    return session
def test_auto_virtual_graph_subgraphs_2():
    """Check automatic virtual-graph placement for a three-subgraph model.

    Two independent matmul chains are added together and fed through a
    final matmul. With two IPUs, ops that consume weights A and B are
    expected on IPU 0, and ops that consume weights C and D on IPU 1.
    """
    ipus = 2

    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    input_shape = [1, 64]
    input1 = builder.addInputTensor(popart.TensorInfo("FLOAT16", input_shape))
    input2 = builder.addInputTensor(popart.TensorInfo("FLOAT16", input_shape))

    def weighted_matmul(lhs, weight_name):
        # Add a zero-initialised 64x64 weight with the given debug name,
        # then multiply `lhs` by it.
        weight = builder.addInitializedInputTensor(
            np.zeros([64, 64], np.float16), weight_name)
        return builder.aiOnnx.matmul([lhs, weight])

    # Subgraph 0
    x0 = weighted_matmul(input1, "TESTID-A")
    x0 = weighted_matmul(x0, "TESTID-B")

    # Subgraph 1
    x1 = weighted_matmul(input2, "TESTID-C")

    # Subgraph 2
    x2 = builder.aiOnnx.add([x0, x1])
    x2 = weighted_matmul(x2, "TESTID-D")

    output = x2
    builder.addOutputTensor(output)

    # Desired split is:
    # ipu1: 0. ipu2: 1,2
    expected_ipu = {
        "TESTID-A": 0,
        "TESTID-B": 0,
        "TESTID-C": 1,
        "TESTID-D": 1,
    }

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {output: popart.AnchorReturnType("Final")})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    device = tu.create_test_device(numIpus=ipus)

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      userOptions=opts,
                                      deviceInfo=device)

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))
    for op in ir["maingraph"]:
        ipu = op["attributes"]["__ipu_number"]
        for op_input in op["inputs"]:
            for tag, wanted in expected_ipu.items():
                if tag in op_input["name"]:
                    assert int(ipu) == wanted
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False):
    """Compile and run an ONNX model once on an IPU and return its anchors.

    A training session is created when ``optimizer`` is given, otherwise
    an inference session. The acquired device is always detached before
    returning, including when compilation or execution raises.

    Args:
        proto: Model to run.
        data: Input arrays keyed by tensor id. When ``batches_per_step``
            is greater than 1 every array is repeated along a new
            leading axis.
        outputs: Tensor id(s) to anchor; normalised with ``make_tuple``.
        loss: Loss tensor id, used by the training session.
        optimizer: Optimizer; its presence selects training.
        patterns: Patterns forwarded to the session.
        return_stats: Enable instrumentation and also return memory and
            cycle statistics.
        log_dir: Where reports are saved through ``gcprofile`` when
            ``return_stats`` is set.
        ipus: Number of IPUs wanted; the request is rounded up to a
            power of two. ``None`` or values below 2 mean a single IPU.
        batches_per_step: Batches per ``session.run`` call.
        user_options: Attribute name/value pairs set on the session
            options after the defaults.
        skip_execution: Return the un-compiled session immediately.

    Returns:
        The session when ``skip_execution`` is set. Otherwise
        ``(anchor_values, post_proto)``, extended with
        ``total_memory, max_tile_memory, cycles`` when ``return_stats``
        is set. ``anchor_values`` is a generator over the anchors in
        ``outputs`` order.

    Raises:
        Exception: If no device could be acquired.
        popart.session.OutOfMemoryException: Re-raised after the
            optional report has been saved.
    """
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.enableGroupedMatmuls = False
    options.enableStochasticRounding = False
    options.constantWeights = True
    options.outlineThreshold = 10.0
    options.reportOptions = {"showVarStorage": "true"}

    if ipus is not None and ipus > 1:
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        ipus = 1

    for key, value in user_options.items():
        setattr(options, key, value)

    # Applied after the user options on purpose: the instrumentation
    # settings must win whenever statistics are requested.
    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true"
        }

    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    device = popart.DeviceManager().acquireAvailableDevice(request_ipus)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    try:
        # Compile the Poplar Graph. If it fails, return the memory stats
        try:
            session.prepareDevice()
        except popart.session.OutOfMemoryException as e:
            if return_stats and log_dir:
                import gcprofile
                os.makedirs(log_dir, exist_ok=True)
                gcprofile.save_popart_report(session,
                                             log_dir=log_dir,
                                             exception=e)
            # Bare raise keeps the original traceback.
            raise
        print("Compilation complete")

        session.weightsFromHost()
        session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a batches_per_step dimension if needed
        if batches_per_step > 1:
            data = {
                k: np.repeat(v[np.newaxis], batches_per_step, 0)
                for k, v in data.items()
            }

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release device, also when compilation or execution failed.
        device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
def test_rng_set_and_get():
    """
    Check that the RNG state of a session can be read and restored.

    1. Create a training and validation session with the option to enable rng set/get.
    2. Get the initial RNG state values
    3. Step 1 : do 5 runs of the training session, twice
    4. Step 2 :
        - reset the RNG to the initial state
        - do 5 runs of the training session
        - capture the rng state
        - do 1 run of the validation session
        - restore the rng
        - do 5 runs of the training session again
    5. Step 3 :
        - Reset the RNG to the initial state
        - do 5 runs of the training session,
        - do 1 run of the validation session
        - do 5 runs of the training session again
    6. Results comparison:
        Steps 1 and 2 must have the same outputs after the series of 5 runs.
        Step 3 must have a different output after the second series of 5 runs,
        due to the validation session overwriting the RNG state.
    """
    # Fixed numpy seed so the weights and input data are reproducible.
    np.random.seed(0)

    # Model definition
    builder = popart.Builder()
    dShape = [100, 100]
    i0 = builder.addInputTensor(popart.TensorInfo("FLOAT16", dShape))
    wData = np.random.rand(*dShape).astype(np.float16)
    w0 = builder.addInitializedInputTensor(wData)
    out = builder.aiOnnx.matmul([i0, w0])
    loss = builder.aiGraphcore.l1loss([out], 0.1)

    # Both sessions below are created on this same device.
    device = tu.create_test_device(1)

    # Enable the options
    options = popart.SessionOptions()
    options.enableLoadAndOffloadRNGState = True
    options.enableStochasticRounding = True
    options.constantWeights = False
    options._enableRngStateManagement = True

    # Training session
    # bps is the batches-per-step value given to DataFlow: one
    # session.run() performs this many batches.
    bps = 5
    tr_opt = popart.SGD({"defaultMomentum": (0.01, True)})
    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=popart.DataFlow(bps, [out]),
                                     loss=loss,
                                     optimizer=tr_opt,
                                     deviceInfo=device,
                                     userOptions=options)
    session.prepareDevice()
    anchors = session.initAnchorArrays()

    # Get the initial RNG state before any other operation.
    init_rng = session.getRNGState()

    # Interfering inference session
    interfering_session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(bps, [out]),
        deviceInfo=device,
        userOptions=options)
    interfering_session.prepareDevice()
    inf_anchors = interfering_session.initAnchorArrays()

    # Input data
    data_a = np.random.rand(5, 100, 100).astype(np.float16)

    def run_session(session):
        # Run one step; return the RNG state afterwards together with
        # the matmul anchor as a plain list so results compare with ==.
        stepio = popart.PyStepIO({i0: data_a}, anchors)
        session.run(stepio)
        return session.getRNGState(), anchors['MatMul:0'].tolist()

    def run_interference(interfering_session):
        # Run one step of the inference session on the shared device.
        interfering_session.weightsFromHost()
        inf_stepio = popart.PyStepIO({i0: data_a}, inf_anchors)
        interfering_session.run(inf_stepio)

    # Step 1 -> training, training
    session.weightsFromHost()
    session.setRNGState(init_rng)
    rng, pre1 = run_session(session)
    session.weightsFromHost()
    rng2, output1 = run_session(session)
    # The RNG state must have changed between the two runs.
    assert rng != rng2

    # Step 2 -> interleaved training, validation, training
    session.weightsFromHost()
    session.setRNGState(init_rng)
    rng, pre2 = run_session(session)
    run_interference(interfering_session)
    session.weightsFromHost()
    # Restore the state captured before the interfering run.
    session.setRNGState(rng)
    rng2, output2 = run_session(session)
    assert output1 == output2

    # Step 3 -> interleaved training, validation, RNG not restored
    session.weightsFromHost()
    session.setRNGState(init_rng)
    rng, pre3 = run_session(session)
    run_interference(interfering_session)
    session.weightsFromHost()
    rng2, output3 = run_session(session)

    # The first run of every step starts from init_rng and the same
    # weights, so all three must agree.
    assert pre1 == pre2 == pre3
    assert (output3 != output1)

    # Small tests about the seed
    init_rng = session.getRNGState()
    # not all states are valid, but we don't check that as long as the size is correct
    new_rng = [k for k in range(len(init_rng))]
    session.setRNGState(new_rng)
    rng1 = session.getRNGState()
    assert (rng1 == new_rng)
    session.setRNGState(init_rng)
    rng2 = session.getRNGState()
    assert (rng2 == init_rng)

    # check that an RNGState of the wrong size raises an exception
    init_rng.append(0)
    with pytest.raises(popart.popart_exception) as e_info:
        session.setRNGState(init_rng)
    assert e_info.value.args[0].startswith(
        "Devicex::setRngStateValue received rngState of size")
def test_outlining_bca2():
    """
    Verify that, with default options, matmuls are cached: the same
    matmul/relu/sum pattern is built twice, yet the summary report must
    contain a single matching matmul compute set.
    """
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    matmul_lhs_shape = popart.TensorInfo("FLOAT", [2, 3])
    matmul_rhs_shape = popart.TensorInfo("FLOAT", [3, 4])

    # Two (lhs, rhs) input pairs, added in the order i1, i2, i3, i4.
    i1, i2, i3, i4 = (builder.addInputTensor(info)
                      for info in (matmul_lhs_shape, matmul_rhs_shape,
                                   matmul_lhs_shape, matmul_rhs_shape))

    def matmul_relu_sum():
        # Multiply both input pairs, apply relu to each product and sum
        # the activations together with the raw products.
        prod_a = builder.aiOnnx.matmul([i1, i2])
        prod_b = builder.aiOnnx.matmul([i3, i4])
        act_a = builder.aiOnnx.relu([prod_a])
        act_b = builder.aiOnnx.relu([prod_b])
        return builder.aiOnnx.sum([act_a, act_b, prod_a, prod_b])

    a1 = matmul_relu_sum()
    a2 = matmul_relu_sum()

    o = builder.aiOnnx.add([a1, a2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    anchor_names = [o]
    dataFlow = popart.DataFlow(
        1, {name: popart.AnchorReturnType("All")
            for name in anchor_names})

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}

    test_device = tu.create_test_device(opts={"compileIPUCode": False})
    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      userOptions=opts,
                                      deviceInfo=test_device)

    anchors = session.initAnchorArrays()

    session.prepareDevice()

    # Every input is an all-ones float32 array of the matching shape.
    inputs = {
        i1: np.ones(matmul_lhs_shape.shape(), dtype=np.float32),
        i2: np.ones(matmul_rhs_shape.shape(), dtype=np.float32),
        i3: np.ones(matmul_lhs_shape.shape(), dtype=np.float32),
        i4: np.ones(matmul_rhs_shape.shape(), dtype=np.float32),
    }
    session.run(popart.PyStepIO(inputs, anchors))

    # Check that there is only one convolution computation set.
    computeSets = tu.get_compute_sets_from_report(session.getSummaryReport())

    num_matmuls = tu.get_compute_set_regex_count(
        r'^.+/matmulGrouped/Conv_1/Convolve$', computeSets)
    # There should be only one matmul
    assert (num_matmuls == 1)
def sparse_softmax(dims, block_size, sparsity_level, inner_group_size):
    """Run the ``BsSoftmax`` custom op on an IPU and build the CPU reference.

    A block-sparse matrix is generated with ``create_sparse_matrix`` and
    passed through the custom op. The reference is ``softmax`` applied to
    the dense matrix, cut into blocks and reduced to the blocks selected
    by the sparsity mask.

    Args:
        dims: Full matrix dimensions; the last two are the 2D matrix
            size, the leading ones are group dimensions.
        block_size: Two-element block size (rows, columns).
        sparsity_level: Forwarded to ``create_sparse_matrix``.
        inner_group_size: Forwarded to the op as ``innerGroupSize``.

    Returns:
        Tuple ``(ipu_output, gold_output)`` of equally shaped arrays.
    """
    (sparse_input, lengths_per_2d_plane, dense_input,
     sparsity_mask) = create_sparse_matrix(dims, block_size, sparsity_level,
                                           -1000)

    # Create a builder and construct a graph
    builder = popart.Builder()

    input_tensor = builder.addInputTensor(
        popart.TensorInfo("FLOAT", sparse_input.shape))

    op_attributes = {
        "matrixDims": dims,
        "blockSize": block_size,
        "sparsity": sparsity_mask.tolist(),
        "groupSizes": lengths_per_2d_plane.tolist(),
        "innerGroupSize": inner_group_size,
        "subBlockMaskPerGroup": "None" * len(lengths_per_2d_plane)
    }
    output_tensor = builder.customOp(opName="BsSoftmax",
                                     opVersion=1,
                                     domain="ai.graphcore",
                                     inputs=[input_tensor],
                                     attributes=op_attributes)[0]
    builder.addOutputTensor(output_tensor)

    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1,
                               {output_tensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    sparse_input = np.array(sparse_input, dtype=g_input_data_type)
    session.run(popart.PyStepIO({input_tensor: sparse_input}, anchors))
    ipu_output = anchors[output_tensor]

    # Split dims into the group part and the trailing 2D matrix part.
    group_dims, mat_dims = dims[:-2], dims[-2:]
    blocks_2d = [mat_dims[0] // block_size[0], mat_dims[1] // block_size[1]]
    num_blocks_2d = blocks_2d[0] * blocks_2d[1]
    block_area = block_size[0] * block_size[1]

    total_group_dims = int(np.prod(group_dims))
    assert sparsity_mask.shape == (total_group_dims * num_blocks_2d, )

    cpu_output = softmax(dense_input)

    np.set_printoptions(precision=2, suppress=True)

    # Rearrange the dense result so that each row holds one block:
    # (group, block row, row in block, block col, col in block)
    #   -> (group, block row, block col, row in block, col in block)
    #   -> (group, block index, flattened block).
    cpu_output = cpu_output.reshape([
        total_group_dims, blocks_2d[0], block_size[0], blocks_2d[1],
        block_size[1]
    ])
    cpu_output = np.transpose(cpu_output, [0, 1, 3, 2, 4])
    cpu_output = cpu_output.reshape(total_group_dims, num_blocks_2d,
                                    block_area)

    # Keep only the blocks whose mask entry is set, group by group.
    gold_rows = []
    for g in range(total_group_dims):
        plane = cpu_output[g]
        first = g * num_blocks_2d
        plane_mask = sparsity_mask[first:first + num_blocks_2d]
        gold_rows.extend(plane[block_idx]
                         for block_idx in range(num_blocks_2d)
                         if plane_mask[block_idx])

    gold_output = np.array(gold_rows)
    assert ipu_output.shape == gold_output.shape

    return ipu_output, gold_output
# Feed every model input with an all-ones float32 vector holding as many
# elements as the input's shape describes.
for it in sess.get_inputs():
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    space_input[it.name] = np.ones(int(np.prod(it.shape)), dtype=np.float32)

# Anchor every model output for every batch.
for it in sess.get_outputs():
    space_output[it.name] = popart.AnchorReturnType("ALL")

if 'PROF' in os.environ:
    popart.getLogger().setLevel("DEBUG")

anchors = space_output
dataFeed = popart.DataFlow(1, anchors)

# Prefer real hardware and fall back to the IPU Model when the session
# cannot be created on it. Only ordinary errors trigger the fallback, so
# KeyboardInterrupt and SystemExit still propagate.
try:
    session = popart.InferenceSession(
        model_path, dataFeed,
        popart.DeviceManager().acquireAvailableDevice())
    print('Using IPU Hardware ..')
except Exception:
    session = popart.InferenceSession(
        model_path, dataFeed,
        popart.DeviceManager().createIpuModelDevice({}))
    print('Using IPU Model ..')

session.prepareDevice()

anchors = session.initAnchorArrays()
stepio = popart.PyStepIO(space_input, anchors)
session.run(stepio)

import time
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import popart
import torch.onnx
import torchvision

# Export a pretrained AlexNet to ONNX, using a random batch of four
# 3x224x224 images as the example input.
input_ = torch.FloatTensor(torch.randn(4, 3, 224, 224))
model = torchvision.models.alexnet(pretrained=True)

output_name = "output"

torch.onnx.export(
    model,
    input_,
    "alexnet.onnx",
    output_names=[output_name],
)

# Create a runtime environment
anchors = {output_name: popart.AnchorReturnType("All")}
dataFlow = popart.DataFlow(100, anchors)
device = popart.DeviceManager().createCpuDevice()

session = popart.InferenceSession("alexnet.onnx", dataFlow, device)
if args.input_tensor: input_ = args.input_tensor else: input_ = builder.getInputTensorIds()[0] if args.output_tensor: output = args.output_tensor else: output = builder.getOutputTensorIds()[0] print("Input:", input_, "Output:", output) graph_transformer = popart.GraphTransformer(onnx_model) graph_transformer.convertAllFixedPointInitializersToConstants() # Create forward pass session session = popart.InferenceSession( fnModel=graph_transformer.getModelProto(), dataFlow=popart.DataFlow(1, {output: popart.AnchorReturnType("All")}), deviceInfo=popart.DeviceManager().createIpuModelDevice({})) # Compile graph print("Compiling...") session.prepareDevice() # Create buffers to receive results from the execution inferenceAnchors = session.initAnchorArrays() stepio = popart.PyStepIO({input_: inputs[0]}, inferenceAnchors) # Run the inference graph session.run(stepio) # Check the output from the test data is approximately equal to our inference try:
def test_stepio_callbackinput(tmpdir):
    """Run an add model whose inputs and outputs are served by callbacks.

    ``PyStepIOCallback`` is given callbacks that hand out one batch slice
    per call. After the run, the anchored inputs must equal the data
    that was fed and the anchored output must equal their sum.
    """
    builder = popart.Builder()
    shape = popart.TensorInfo("FLOAT", [2])

    i1 = builder.addInputTensor(shape)
    i2 = builder.addInputTensor(shape)
    o = builder.aiOnnx.add([i1, i2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    batches_per_step = 2

    all_batches = popart.AnchorReturnType("All")
    dataFlow = popart.DataFlow(batches_per_step, {
        i1: all_batches,
        i2: all_batches,
        o: all_batches
    })

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      deviceInfo=tu.create_test_device())

    session.prepareDevice()

    anchors = session.initAnchorArrays()

    i1_data = np.random.rand(batches_per_step, 2).astype(np.float32)
    i2_data = np.random.rand(batches_per_step, 2).astype(np.float32)

    inputs = {i1: i1_data, i2: i2_data}

    # Index of the next slice to hand out, per input tensor.
    next_input = {i1: 0, i2: 0}

    def input_callback(tensor_id, prefetch):
        time.sleep(2)
        print("input_callback ", tensor_id)

        t = inputs[tensor_id]

        print(t)

        if tensor_id in next_input:
            print("input_callback ", tensor_id, len(t))
            position = next_input[tensor_id]
            if position < len(t):
                result = t[position]
                next_input[tensor_id] = position + 1

        print(result)

        return result

    def input_complete_callback(tensor_id):
        print("input_complete_callback ", tensor_id)

    # Index of the next anchor slice to hand out, per anchored tensor.
    next_output = {i1: 0, i2: 0, o: 0}

    def output_callback(tensor_id):
        time.sleep(2)
        print("output_callback ", tensor_id)

        t = anchors[tensor_id]

        if tensor_id in next_output:
            position = next_output[tensor_id]
            result = t[position]
            next_output[tensor_id] = position + 1

        return result

    def output_complete_callback(tensor_id):
        print("output_complete_callback ", tensor_id)

    stepio = popart.PyStepIOCallback(input_callback, input_complete_callback,
                                     output_callback,
                                     output_complete_callback)

    session.run(stepio)

    # confirm that writing device-to-host of a Stream Tensor returns correctly (unchanged)
    assert (np.allclose(anchors[i1], i1_data))
    assert (np.allclose(anchors[i2], i2_data))

    expected_result = i1_data + i2_data
    assert (np.allclose(anchors[o], expected_result))
def main():
    """Train a PyTorch model through PopART and evaluate it after each epoch.

    The model, loss and optimizer are handed to
    ``popart.torch.TrainingSession``. In every epoch the training set is
    run once, the model is written to ``torchModel.onnx``, and that file
    is loaded into a fresh ``popart.InferenceSession`` to measure
    accuracy on the test set.
    """
    net = Net()

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # One example batch for the session. The built-in next() is used
    # because Python 3 iterators have no .next() method.
    inputs, labels = next(iter(trainloader))

    opts = popart.SessionOptions()

    start = time.process_time()
    # Pass all the pytorch stuff to the session
    torchSession = popart.torch.TrainingSession(
        torchModel=net,
        inputs=inputs,
        targets=labels,
        optimizer=optimizer,
        losses=criterion,
        batch_size=batch_size,
        batches_per_step=batches_per_step,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1),
        userOptions=opts)
    print("Converting pytorch model took {:.2f}s".format(time.process_time() -
                                                         start))

    # Prepare for training.
    anchors = torchSession.initAnchorArrays()

    print("Compiling model...")
    torchSession.prepareDevice()
    torchSession.weightsFromHost()

    for epoch in range(10):  # loop over the dataset multiple times
        start_time = time.time()
        running_loss = 0.0
        running_accuracy = 0
        print("#" * 20, "Train phase:", "#" * 20)
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            torchSession.run(inputs, labels)
            running_loss += np.mean(anchors["loss_0"])

            # 20-character progress bar.
            progress = 20 * (
                i + 1) * batch_size * batches_per_step // len(trainset)
            print('\repoch {} [{}{}]  '.format(epoch + 1, progress * '.',
                                               (20 - progress) * ' '),
                  end='')

            results = np.argmax(
                anchors['output_0'].reshape(
                    [batches_per_step * batch_size, 10]), 1)
            num_correct = np.sum(results == anchors['target_0'].reshape(
                [batches_per_step * batch_size]))
            running_accuracy += num_correct
        print("Accuracy: {}%".format(running_accuracy * 100 / len(trainset)))

        end_time = time.time()
        print('loss: {:.2f}'.format(running_loss / (i + 1)))
        print("Images per second: {:.0f}".format(
            len(trainset) / (end_time - start_time)))

        # NOTE(review): the test phase is assumed to sit inside the epoch
        # loop, since its progress line prints `epoch + 1`. Confirm
        # against the original layout.

        # Save the model with weights
        torchSession.modelToHost("torchModel.onnx")

        # Pytorch currently doesn't support importing from onnx:
        # https://github.com/pytorch/pytorch/issues/21683
        # And pytorch->onnx->caffe2 is broken:
        # https://github.com/onnx/onnx/issues/2463
        # So we import into popart session and infer.
        # Alternatively, use any other ONNX compatible runtime.
        builder = popart.Builder("torchModel.onnx")
        inferenceSession = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(
                batches_per_step,
                {"output_0": popart.AnchorReturnType("All")}),
            deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

        print("Compiling test model...")
        inferenceSession.prepareDevice()
        inferenceAnchors = inferenceSession.initAnchorArrays()
        print("#" * 20, "Test phase:", "#" * 20)
        test_accuracy = 0
        for j, data in enumerate(testloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            stepio = popart.PyStepIO({"input_0": inputs.data.numpy()},
                                     inferenceAnchors)

            inferenceSession.run(stepio)

            progress = 20 * (
                j + 1) * batch_size * batches_per_step // len(testset)
            print('\rtest epoch {} [{}{}]  '.format(epoch + 1,
                                                    progress * '.',
                                                    (20 - progress) * ' '),
                  end='')

            results = np.argmax(
                inferenceAnchors['output_0'].reshape(
                    [batches_per_step * batch_size, 10]), 1)
            num_correct = np.sum(results == labels.data.numpy().reshape(
                [batches_per_step * batch_size]))
            test_accuracy += num_correct
        print("Accuracy: {}%".format(test_accuracy * 100 / len(testset)))

    print('Finished Training')
def main(argv):
    """Compile the model from ``graph_builder`` and time inference runs.

    Configuration comes from ``flags.FLAGS``. With synthetic data the
    same feed is run ``FLAGS.iterations`` times; otherwise every item
    yielded by the data object is run once. A timing line is printed
    after each run.
    """
    FLAGS = flags.FLAGS
    print(f"micro batch size is {FLAGS.micro_batch_size}")
    print(f"batch size is {FLAGS.batch_size}")
    print(f"batches_per_step is {FLAGS.batches_per_step}")
    proto, data, outputs, output_id = graph_builder()
    print(f"Model: {FLAGS.model_name}")
    if FLAGS.synthetic:
        print(f"Using synthetic data")
    else:
        print(f"Data_dir: {FLAGS.data_dir}")
    # NOTE(review): this print is assumed to be unconditional, i.e. not
    # part of the synthetic branch above. Confirm against the original
    # layout.
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")
    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = FLAGS.report_hw_cycle_count

    # Configure precision of convolutions and MatMuls
    if FLAGS.half_partials:
        options.convolutionOptions = {'partialsType': 'half'}
        options.partialsTypeMatMuls = "half"

    # Select a device
    device = popart.DeviceManager().acquireAvailableDevice(1)
    print(f"{device}\n")
    if device is None:
        raise Exception("Not enough IPUs available.")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFlow=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    compile_started = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - compile_started
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        # Print one timing line; the optional parts are left out when
        # their duration is missing or zero.
        parts = ["Total {:<8.3} sec.".format(duration)]
        if data_duration:
            parts.append(" Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration)))
        if compute_duration:
            parts.append(" Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration)))
        parts.append(" {:5f} images/sec.".format(
            int(FLAGS.micro_batch_size * FLAGS.batches_per_step / duration)))
        print("".join(parts))
        if FLAGS.report_hw_cycle_count:
            print("Hardware cycle count per 'run':", session.getCycleCount())

    print("Executing...")
    average_batches_per_sec = 0

    # Synthetic mode reuses the same feed every iteration; otherwise the
    # data object is iterated and each item is one feed.
    if FLAGS.synthetic:
        feeds = (data for _ in range(FLAGS.iterations))
    else:
        feeds = data

    # Run
    start = time.time()
    durations = []
    for feed in feeds:
        stepio = popart.PyStepIO(feed, anchors)
        # Calc data duration
        data_time = time.time()
        data_d = data_time - start
        # Run compute
        session.run(stepio)
        # Calc compute duration
        results = anchors[output_id]
        comp_d = time.time() - data_time
        # Calc total duration
        t = time.time() - start
        report_time(t, data_d, comp_d)
        durations.append(t)
        start = time.time()
    duration = np.mean(durations)
d3 = np.random.rand(1, 3 * hidden_size, hidden_size).astype(np.float32)

# One FLOAT input tensor per data array, added in the order d1, d2, d3.
i1, i2, i3 = (builder.addInputTensor(popart.TensorInfo("FLOAT", arr.shape))
              for arr in (d1, d2, d3))

Y, Y_h = builder.aiOnnx.gru([i1, i2, i3],
                            2,
                            clip=None,
                            direction="bidirectional")

builder.addOutputTensor(Y)

dataFlow = popart.DataFlow(1, {Y: popart.AnchorReturnType("All")})

# Create a session to compile and execute the graph for inference
#------------------------------------------------------------------------------
inferenceOptions = popart.SessionOptions()

# Need to compile the inference graph with variable weights so they can be
# updated before execution
inferenceSession = popart.InferenceSession(
    fnModel=builder.getModelProto(),
    dataFlow=dataFlow,
    userOptions=inferenceOptions,
    deviceInfo=popart.DeviceManager().createIpuModelDevice({}))

# Compile graph
inferenceSession.prepareDevice()

# Create buffers to receive results from the execution
inferenceAnchors = inferenceSession.initAnchorArrays()
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      doProfiling=False,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a sin/exp/conv/softmax/nll model, run one step, return its anchors.

    Args:
        doSharding: Unless ``False``, spread the ops over three IPUs with
            manual virtual graphs.
        doPipelining: Stored in ``opts.enablePipelining``.
        batchesPerStep: Batches per ``session.run``; values above 1 add a
            leading dimension to the generated data and labels.
        doTraining: ``True`` for a training session with ConstSGD,
            otherwise an inference session.
        doProfiling: When ``True``, save a report through ``gcprofile``.
        doDevicex: When ``False``, stop after creating the session and
            return ``None``.
        anchorRestoredTensors: With training and pipelining, also anchor
            the restored copies of ``e0`` and ``d0``.
        returnRawInput: When ``True``, add the generated input data to
            the result under the key ``"input_raw"``.

    Returns:
        The anchor dictionary, or ``None`` when ``doDevicex`` is ``False``.
    """
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 2
    shape_d0 = [batchSize, 2, 4, 4]
    shape_l0 = [batchSize]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [batchSize, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art}
    # NOTE(review): the restored-tensor anchors are assumed to be nested
    # under the training branch. Confirm against the original layout.
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining

    numIPUs = 1
    if doSharding is not False:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        # (tensor, virtual graph) placement, applied in this order.
        placement = ((s0, 0), (e0, 1), (c0, 1), (r0, 2), (out, 2), (nll, 2))
        for tensor, virtual_graph in placement:
            builder.virtualGraph(tensor, virtual_graph)

    # Arguments common to both session kinds.
    session_kwargs = dict(fnModel=builder.getModelProto(),
                          dataFlow=popart.DataFlow(batchesPerStep,
                                                   anchor_map),
                          userOptions=opts,
                          deviceInfo=tu.create_test_device(numIpus=numIPUs,
                                                           tilesPerIPU=20))
    if doTraining is True:
        session = popart.TrainingSession(loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         **session_kwargs)
    else:
        session = popart.InferenceSession(**session_kwargs)

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    if batchesPerStep > 1:
        shape_d0 = [batchesPerStep] + shape_d0
        shape_l0 = [batchesPerStep] + shape_l0
    data = np.random.uniform(low=-10.0, high=10.0,
                             size=shape_d0).astype(np.float32)

    # Exclusive upper bound for the labels: elements per sample.
    classes = np.prod(shape_d0) / (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    stepio = popart.PyStepIO({d0: data, l0: label}, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    """Build a chain of six matmuls followed by nllloss, run one step on a
    test device and return the anchored tensors.

    The model consists of three stages of two matmuls each; every weight is
    a ``hidden_size x hidden_size`` matrix of ones named
    ``weight_<stage>_<i>``. ``batch_size`` and ``hidden_size`` are read from
    module scope. The micro batch size is ``batch_size // gradAcclFactor``.

    The boolean flags ``doTraining``, ``doPipelining``, ``doGradAccl``,
    ``doProfiling``, ``doDevicex`` and ``anchorRestoredTensors`` are compared
    by identity against ``True`` / ``False``.

    Args:
        doSharding: when truthy, stage ``n`` is placed on virtual graph
            ``n``; otherwise every stage goes on virtual graph 0. One IPU is
            requested when this is exactly ``False``, three otherwise.
        doPipelining: value assigned to ``SessionOptions.enablePipelining``.
        batchesPerStep: passed to ``popart.DataFlow``; when greater than 1
            the labels are repeated that many times along a new outer axis.
        doTraining: ``True`` creates a ``TrainingSession`` (ConstSGD, lr 0.01)
            and anchors the gradient of the loss; anything else creates an
            ``InferenceSession``.
        doGradAccl: value assigned to
            ``SessionOptions.enableGradientAccumulation``; when ``True`` in
            training the accumulated update tensor of the last weight is
            anchored as well.
        gradAcclFactor: value assigned to
            ``SessionOptions.accumulationFactor``; when greater than 1 the
            labels are reshaped to
            ``[gradAcclFactor * batchesPerStep, -1]``.
        doProfiling: ``True`` saves a report via
            ``gcprofile.save_popart_report`` after the run.
        doDevicex: ``False`` stops after session construction and returns
            ``None``.
        anchorRestoredTensors: together with ``doTraining`` and
            ``doPipelining``, additionally anchors the restored versions of
            the loss and of the last weight.
        labelArray: array-like of labels for one step before any
            ``batchesPerStep`` repetition. Required unless ``doDevicex`` is
            ``False``.

    Returns:
        The dict produced by ``session.initAnchorArrays()`` after one call to
        ``session.run``, or ``None`` when ``doDevicex`` is ``False``.

    Raises:
        TypeError: if ``labelArray`` is ``None`` while the model is going to
            be run (``doDevicex`` is not ``False``).
    """
    # Fail fast with a clear message: the None default is only usable when
    # the function returns before the labels are needed. Without this check
    # the failure would surface after device preparation as an opaque
    # TypeError/AttributeError on None.
    if doDevicex is not False and labelArray is None:
        raise TypeError(
            "labelArray must be provided unless doDevicex is False")

    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    # --- Model definition: three stages of two matmuls each ---------------
    x = input_
    last_stage = 2
    for stage in range(last_stage + 1):
        with builder.virtualGraph(stage if doSharding else 0):
            for i in range(2):
                w = builder.addInitializedInputTensor(
                    np.ones([hidden_size, hidden_size]).astype(np.float32),
                    f"weight_{stage}_{i}")
                x = builder.aiOnnx.matmul([x, w])
            if stage == last_stage:
                # The final weight of the final stage is the one anchored.
                w0 = w
                # NOTE(review): the loss is placed inside the last stage's
                # virtual-graph scope - confirm this matches the intended
                # placement.
                label = builder.addInputTensor("INT32", [micro_batch_size])
                x = builder.aiGraphcore.nllloss([x, label])
    output = x
    builder.addOutputTensor(output)

    # --- Anchors ----------------------------------------------------------
    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        # NOTE(review): the restored-tensor and accumulation anchors are
        # treated as nested under the training branch - confirm.
        if doPipelining is True and anchorRestoredTensors is True:
            restored = popart.reservedRestoredPrefix()
            anchor_map[restored + x] = art
            anchor_map[restored + w0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() + w0] = art

    # --- Session options ----------------------------------------------------
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    numIPUs = 1 if doSharding is False else 3

    # --- Session ------------------------------------------------------------
    session_args = dict(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
        userOptions=opts,
        deviceInfo=tu.create_test_device(numIpus=numIPUs))
    if doTraining is True:
        session = popart.TrainingSession(loss=output,
                                         optimizer=popart.ConstSGD(0.01),
                                         **session_args)
    else:
        session = popart.InferenceSession(**session_args)

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # --- Input data ---------------------------------------------------------
    # Accept any array-like; an ndarray is passed through unchanged.
    labels = np.asarray(labelArray)
    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. The labels are repeated
        # so that results stay consistent between differently shaped runs.
        outer_dim *= batchesPerStep
        labels = np.repeat(labels[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split each batch into gradAcclFactor micro batches, giving
        # gradAcclFactor * batchesPerStep rows of labels.
        outer_dim *= gradAcclFactor
        labels = labels.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Prepend the gradAcclFactor * batchesPerStep dimension to the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labels.astype(np.int32)
        }, anchors)

    session.weightsFromHost()
    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    return anchors