def bert_process_data(args,
                      session,
                      labels,
                      data,
                      anchors,
                      losses,
                      predictions,
                      iteration: Iteration,
                      optimizer_factory: ScheduledOptimizerFactory):
    """Run a single training step and record its statistics.

    The step is skipped (returning ``None`` without touching ``iteration``)
    when none of the label arrays in ``data`` contains a non-zero value.
    Otherwise the step is executed and timed, the result is handed to
    ``iteration.add_stats``, stats are reported every
    ``iteration.steps_per_log`` steps, the optimizer is refreshed when
    ``optimizer_factory`` requests it, and ``iteration.count`` is incremented.

    Args:
        args: Parsed options; only ``args.gc_profile`` is read here.
        session: Session exposing ``run``, ``updateOptimizer`` and
            ``optimizerFromHost``.
        labels: Keys into ``data`` identifying the label arrays.
        data: Mapping of input id to array, passed to ``popart.PyStepIO``.
        anchors: Anchor arrays passed to ``popart.PyStepIO`` and forwarded to
            ``iteration.add_stats``.
        losses: Forwarded untouched to ``iteration.add_stats``.
        predictions: Forwarded untouched to ``iteration.add_stats``.
        iteration: Running step counter / statistics accumulator.
        optimizer_factory: Decides when a new optimizer is needed and builds it.

    Note:
        When ``args.gc_profile`` is set, the profiling report is saved and the
        process exits with status 0 right after the first executed step.
    """
    labels_data = [data[label] for label in labels]
    if not np.any([np.any(label) for label in labels_data]):
        # Label may be all padding due to args.vocab_length being smaller than
        # when the data was generated
        return

    stepio = popart.PyStepIO(data, anchors)

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments while the step is running.
    start = time.perf_counter()
    session.run(stepio)
    duration = time.perf_counter() - start

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session)
        sys.exit(0)

    iteration.add_stats(duration, labels_data, anchors, losses, predictions)

    if (iteration.count % iteration.steps_per_log) == 0:
        iteration.report_stats()

    # The following will only be true if:
    #   Learning rate mode is STEP and the current total step counter is in the schedule
    #   Learning rate mode is EPOCH and the current epoch has just changed to one in the schedule
    if optimizer_factory.should_update(iteration):
        optimizer = optimizer_factory.update_and_create(iteration)
        session.updateOptimizer(optimizer)
        session.optimizerFromHost()

    iteration.count += 1
def bert_process_infer_data(args, session, data, anchors, logits, iteration: Iteration):
    """Run one inference step, log throughput/latency and return the logits.

    Args:
        args: Parsed options; ``low_latency_inference``, ``task`` and
            ``gc_profile`` are read here, and ``args`` is forwarded to
            ``compute_latency``.
        session: Session whose ``run`` method executes the step.
        data: Input arrays for the step.
        anchors: Anchor arrays that receive the step outputs.
        logits: Keys into ``anchors`` selecting the arrays to return.
        iteration: Accumulator providing ``durations``, ``throughput``,
            ``count`` and ``steps_per_log``.

    Returns:
        A list with ``anchors[logit]`` for every entry of ``logits``.

    Note:
        When ``args.gc_profile`` is set, the profiling report is saved and the
        process exits with status 0 immediately after the step.
    """
    start_times = defaultdict(list)
    end_times = defaultdict(list)

    # The callback-based StepIO is handed the two timing dicts; presumably it
    # records per-sample timestamps into them (see create_callback_stepio).
    use_callback_io = args.low_latency_inference and args.task == "SQUAD"
    if use_callback_io:
        stepio = create_callback_stepio(data, anchors, start_times, end_times)
    else:
        stepio = popart.PyStepIO(data, anchors)

    tick = time.perf_counter()
    session.run(stepio)
    elapsed = time.perf_counter() - tick

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session)
        sys.exit(0)

    iteration.durations.append(elapsed)

    mean_latency, min_latency, max_latency = compute_latency(
        args, start_times, end_times)

    log_this_step = (iteration.count % iteration.steps_per_log) == 0
    if log_this_step:
        status_string = (
            f"Iteration: {iteration.count:6} "
            f"Duration: {np.average(iteration.durations):6.4f} s "
            f"Throughput: {np.average(iteration.throughput):6.1f} samples/s"
        )
        if mean_latency is not None:
            status_string += f" Per-sample Latency: {mean_latency} {min_latency} {max_latency} seconds (mean min max)"
        logger.info(status_string)

    iteration.count += 1

    results = []
    for logit in logits:
        results.append(anchors[logit])
    return results
def bert_process_infer_data(args,
                            session,
                            data,
                            anchors,
                            logits,
                            iteration: Iteration,
                            start_times,
                            end_times,
                            stepio):
    """Run one inference step, log throughput/latency and return the logits.

    Args:
        args: Parsed options; ``report_hw_cycle_count`` and ``gc_profile`` are
            read here, and ``args`` is forwarded to ``compute_latency``.
        session: Session whose ``run`` method executes the step.
        data: Input arrays, used only when ``stepio`` is ``None``.
        anchors: Anchor arrays that receive the step outputs.
        logits: Keys into ``anchors`` selecting the arrays to return.
        iteration: Accumulator providing ``durations``, ``throughput``,
            ``count`` and ``steps_per_log``.
        start_times: Forwarded to ``compute_latency``.
        end_times: Forwarded to ``compute_latency``.
        stepio: Pre-built StepIO; when ``None`` a ``popart.PyStepIO`` is
            created from ``data`` and ``anchors``.

    Returns:
        A list with ``anchors[logit]`` for every entry of ``logits``.

    Note:
        When ``args.gc_profile`` is set, the profiling report is saved and the
        process exits with status 0 immediately after the step.
    """
    if stepio is None:
        stepio = popart.PyStepIO(data, anchors)

    tick = time.perf_counter()
    session.run(stepio)
    elapsed = time.perf_counter() - tick

    # The cycle count is only queried when explicitly requested.
    if args.report_hw_cycle_count:
        hw_cycles = session.getCycleCount()
    else:
        hw_cycles = None

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session)
        sys.exit(0)

    iteration.durations.append(elapsed)

    mean_latency, min_latency, max_latency = compute_latency(
        args, start_times, end_times)

    log_this_step = (iteration.count % iteration.steps_per_log) == 0
    if log_this_step:
        status_string = (
            f"Iteration: {iteration.count:6} "
            f"Duration: {np.average(iteration.durations):6.4f} s "
            f"Throughput: {np.average(iteration.throughput):6.1f} samples/s"
        )
        if mean_latency is not None:
            status_string += f" Per-sample Latency: {mean_latency} {min_latency} {max_latency} seconds (mean min max)"
        if hw_cycles is not None:
            status_string += f" Cycles: {hw_cycles}"
        logger.info(status_string)

    iteration.count += 1

    results = []
    for logit in logits:
        results.append(anchors[logit])
    return results
def fetch_reports(args, session=None, exception=None, execution=False):
    """Save whichever profiling reports ``args`` asks for.

    Args:
        args: Options object; ``gc_profile``, ``graph_report`` and
            ``execution_report`` are read.
        session: Session to pull reports from (optional if ``exception`` is
            given).
        exception: Exception object exposing ``getGraphReport``; when given it
            is preferred over ``session`` as the graph-report source.
        execution: Whether the call happens after execution; enables the
            execution report and the exit-after-report behaviour.

    Raises:
        Exception: If neither ``session`` nor ``exception`` is provided.

    Note:
        Calls ``sys.exit(0)`` when an execution report was written, or when
        ``args.gc_profile`` is set and ``execution`` is truthy.
    """
    if session is None and exception is None:
        raise Exception("Must provide session or exception to 'fetch_reports'")

    exit_requested = False

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session, exception=exception)
        exit_requested = execution

    if args.graph_report:
        # The file is opened before the report is fetched, as in the original
        # ordering of side effects.
        with open(args.graph_report, "wb") as report_file:
            report_source = session if exception is None else exception
            report_file.write(report_source.getGraphReport())

    wants_execution_report = (args.execution_report and execution
                              and session is not None)
    if wants_execution_report:
        with open(args.execution_report, "wb") as report_file:
            report_file.write(session.getExecutionReport())
        exit_requested = True

    if exit_requested:
        sys.exit(0)
def compile_graph_checked(args, session):
    """Compile the graph via ``session.prepareDevice`` and log how long it took.

    Args:
        args: Options object; only ``args.gc_profile`` is read.
        session: Session whose ``prepareDevice`` method is invoked.

    Raises:
        popart.PrepareDeviceException: Re-raised after optionally saving a
            profiling report (when ``args.gc_profile`` is set).
    """
    try:
        began = time.time()
        session.prepareDevice()
        elapsed = time.time() - began
        logger.info(f"Compiled. Duration {elapsed} seconds")
    except popart.PrepareDeviceException as compile_error:
        # Save the report before propagating the failure so it is not lost.
        if args.gc_profile:
            import gcprofile
            gcprofile.save_popart_report(session, exception=compile_error)
        raise compile_error
def init_session(proto, losses, device, dataFlow, options, training, optimizer=None, gcpLogDir=None):
    """Create a training or inference session, compile it and allocate anchors.

    Args:
        proto: Model passed as ``fnModel``.
        losses: Losses passed to the session constructor.
        device: Device passed as ``deviceInfo``.
        dataFlow: Data-flow description passed as ``dataFeed``.
        options: Session options passed as ``userOptions``.
        training: Truthy for a ``popart.TrainingSession``, otherwise a
            ``popart.InferenceSession`` is built.
        optimizer: Optimizer for the training session (unused for inference).
        gcpLogDir: When not ``None``, a profiling report is written there if
            device preparation fails.

    Returns:
        Tuple ``(session, anchors)`` where ``anchors`` comes from
        ``session.initAnchorArrays()``.

    Raises:
        popart.PrepareDeviceException: Re-raised if device preparation fails.
    """
    # Arguments shared by both session flavours.
    shared_kwargs = dict(fnModel=proto,
                         losses=losses,
                         deviceInfo=device,
                         dataFeed=dataFlow,
                         userOptions=options)

    if training:
        session_type = "training"
        session = popart.TrainingSession(optimizer=optimizer, **shared_kwargs)
    else:
        session_type = "validation"
        session = popart.InferenceSession(**shared_kwargs)

    try:
        print("Preparing the {} graph".format(session_type))
        with Timer() as prepareTimer:
            session.prepareDevice()
    except popart.PrepareDeviceException as prepare_error:
        print("Caught PrepareDeviceException")
        if gcpLogDir is not None:
            from gcprofile import save_popart_report
            save_popart_report(session,
                               log_dir=gcpLogDir,
                               exception=prepare_error)
        raise

    pretty_type = session_type.capitalize()
    print("{0} graph preparation complete. Duration: {1:.3f} seconds".format(
        pretty_type, prepareTimer.interval()))

    # Buffers that receive results from the execution
    return session, session.initAnchorArrays()
def create_session_anchors(proto, loss, device, dataFlow, options, training, optimizer=None, profile=False):
    """Create the desired session and compile the graph.

    Args:
        proto: Model passed as ``fnModel``.
        loss: Loss passed to the training session (not used for inference).
        device: Device passed as ``deviceInfo``.
        dataFlow: Data-flow description for the session.
        options: Session options passed as ``userOptions``.
        training: Truthy for a ``popart.TrainingSession``, otherwise a
            ``popart.InferenceSession`` is built.
        optimizer: Optimizer for the training session.
        profile: When truthy, a profiling report is saved if compilation runs
            out of memory.

    Returns:
        Tuple ``(session, anchors)`` where ``anchors`` comes from
        ``session.initAnchorArrays()``.

    Raises:
        popart.OutOfMemoryException: Re-raised if device preparation fails
            with an out-of-memory error.
    """
    if training:
        session_type = "training"
        session = popart.TrainingSession(fnModel=proto,
                                         loss=loss,
                                         deviceInfo=device,
                                         optimizer=optimizer,
                                         dataFlow=dataFlow,
                                         userOptions=options)
    else:
        session_type = "validation"
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=dataFlow,
                                          userOptions=options)

    try:
        logger.info("Preparing the {} graph".format(session_type))
        session.prepareDevice()
        logger.info("{0} graph preparation complete.".format(
            session_type.capitalize()))
    except popart.OutOfMemoryException as e:
        # Logger.warn is a deprecated alias; Logger.warning is the supported
        # spelling.
        logger.warning("Caught Exception while Preparing Device")
        # Dump the profiled result before raising exception and exit
        if profile:
            from gcprofile import save_popart_report
            save_popart_report(session, exception=e)
        raise

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    return session, anchors
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False,
           execution_mode: str = 'DEFAULT',
           replication_factor: int = 1,
           replicated_weight_sharding: bool = False,
           num_reps: int = 1):
    """Build a PopART session for ``proto``, run it and return the anchors.

    A training session is created when ``optimizer`` is given, otherwise an
    inference session.

    Args:
        proto: ONNX model to execute.
        data: Input arrays keyed by tensor id.
        outputs: Tensor id(s) to anchor and return.
        loss: Loss tensor id (training only).
        optimizer: Optimizer; its presence selects a training session.
        patterns: Patterns passed to the session.
        return_stats: Also return memory / cycle statistics and enable the
            instrumentation engine options.
        log_dir: Directory for profiling reports (used with ``return_stats``).
        ipus: Number of IPUs; forced to 2 in ``'PHASED'`` mode.
        batches_per_step: Batches per step; inputs are repeated accordingly.
        user_options: Extra session options applied with ``setattr``.
            ``"batchSerializationFactor"`` and ``"executionPhases"`` are read
            only in ``'PHASED'`` mode; ``"accumulationFactor"`` additionally
            controls input repetition.
        skip_execution: Return the (uncompiled) session right after creation.
        execution_mode: ``'PHASED'`` selects the execution-phase options.
        replication_factor: Number of graph replicas.
        replicated_weight_sharding: Enable replicated tensor sharding for
            weights and optimizer state.
        num_reps: Number of times the step is run.

    Returns:
        * ``session`` when ``skip_execution`` is set;
        * ``(anchor_generator, post_proto, total_memory, max_tile_memory,
          cycles)`` when ``return_stats`` is set;
        * ``(anchor_generator, post_proto)`` otherwise.

    Raises:
        Exception: If no device could be acquired.
        popart.session.OutOfMemoryException: Re-raised if compilation runs out
            of memory (after optionally saving a report and detaching).
    """
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    if replicated_weight_sharding:
        # These must be assignments: a bare attribute access such as
        # `...replicatedTensorSharding.On` evaluates to an enum member and is
        # discarded, leaving sharding disabled.
        options.weightTensorLocationSettings.location.replicatedTensorSharding = \
            popart.ReplicatedTensorSharding.On
        options.optimizerStateTensorLocationSettings.location.replicatedTensorSharding = \
            popart.ReplicatedTensorSharding.On
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor
    if execution_mode == 'PHASED':
        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.batchSerializationSettings.factor = user_options[
            "batchSerializationFactor"]
        options.executionPhaseSettings.phases = user_options["executionPhases"]
        ipus = 2
    else:
        options.enableGroupedMatmuls = False
        options.enableStochasticRounding = False
        options.constantWeights = True
        options.outlineThreshold = 10.0
        if ipus is not None and ipus > 1:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual
        else:
            ipus = 1

    # The two phased-execution keys are consumed above and are not plain
    # attributes of SessionOptions, so they are skipped here.
    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true",
            "opt.internalExchangeOptimisationTarget": "balanced",
        }

    # Round the IPU count up to a power of two, then scale by the replicas.
    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    request_ipus *= replication_factor
    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        request_ipus,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    # Compile the Poplar Graph. If it fails, save the memory report (when
    # requested), release the device and propagate the error.
    try:
        session.prepareDevice()
    except popart.session.OutOfMemoryException as e:
        if return_stats and log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            gcprofile.save_popart_report(session,
                                         log_dir=log_dir,
                                         exception=e)
        device.detach()
        raise e
    print("Compilation complete")

    session.weightsFromHost()
    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a gradient accumulation factor dimension if needed
    af = user_options.get("accumulationFactor")
    if af is not None and af > 1:
        data = {k: np.repeat(v[np.newaxis], af, 0) for k, v in data.items()}

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], batches_per_step, 0)
            for k, v in data.items()
        }

    for _ in range(num_reps):
        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

    # Round-trip the model through a temporary file to obtain the post-run
    # proto.
    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
def train_process(opts):
    """Train the model described by ``opts``, optionally validating each epoch.

    For every epoch the training set is iterated once; the learning rate
    follows the schedule returned by ``calulate_learning_rate`` and running
    statistics are printed every ``opts.steps_per_log`` steps. Unless
    ``opts.no_validation`` is set, the current weights are written to
    ``ckpt.onnx`` after each epoch, evaluated with a separate inference
    session on the same device, and then reloaded into the training session.

    Args:
        opts: Options object. Fields read here: ``num_ipus``, ``simulation``,
            ``batches_per_step``, ``batch_size``, ``weight_decay``,
            ``gc_profile_log_dir``, ``no_validation``, ``steps_per_log`` and
            ``epochs``. ``opts`` is also forwarded to the dataset / model
            helper functions.
    """

    def _make_sgd(learning_rate):
        # Single place that builds the optimizer so the initial optimizer and
        # every learning-rate update agree on the (constant) weight decay.
        return popart.SGD({
            "defaultLearningRate": (learning_rate, False),
            "defaultWeightDecay": (opts.weight_decay, True)
        })

    builder = popart.Builder()

    # Create the data sets. The validation set is only needed (and only
    # loaded) when validation is enabled.
    # NOTE(review): validation now requests training=False; confirm against
    # load_dataset that this selects the held-out split.
    training_dataset = load_dataset(opts, training=True)
    validation_dataset = None
    if not opts.no_validation:
        validation_dataset = load_dataset(opts, training=False)

    # Calculate the learning rate for training
    steps_per_epoch = len(training_dataset)
    lrs, lr_drops = calulate_learning_rate(opts, steps_per_epoch)
    current_lr = lrs.pop(0)
    next_drop = lr_drops.pop(0)

    # Create the resnet model
    image, label = create_inputs(builder, opts)

    # Get the popart session options
    options = get_options(opts)

    # Get the device to run on
    device = get_device(opts.num_ipus, opts.simulation)

    # Create the training session
    proto, loss, argmax, outputs = create_model(builder, opts, image, label)

    (training_session, training_anchors) = init_session(
        proto, [loss],
        device,
        dataFlow=popart.DataFlow(opts.batches_per_step, outputs),
        options=options,
        training=True,
        optimizer=_make_sgd(current_lr),
        gcpLogDir=opts.gc_profile_log_dir)

    if not opts.no_validation:
        # Create the validation session
        (validation_session, validation_anchors) = init_session(
            proto, [loss],
            device,
            dataFlow=popart.DataFlow(opts.batches_per_step, outputs),
            options=options,
            training=False,
            gcpLogDir=opts.gc_profile_log_dir)

    # Copy weights and optimization parameters onto the device
    training_session.weightsFromHost()
    training_session.optimizerFromHost()

    batch_losses = deque(maxlen=opts.steps_per_log)
    batch_accs = deque(maxlen=opts.steps_per_log)
    batch_run_duration = deque(maxlen=opts.steps_per_log)
    total_samples = 0

    validation_losses = deque(maxlen=opts.steps_per_log)
    validation_accs = deque(maxlen=opts.steps_per_log)

    # Iterations
    for e in range(opts.epochs):
        # Set the timing start point for training
        training_start_point = time.time()

        print("Executing epoch ", e)
        for step, data in enumerate(training_dataset):
            total_steps = (e * steps_per_epoch) + step
            epoch = e + (step / steps_per_epoch)

            # Follow Learning Rate Schedule
            if total_steps > next_drop:
                current_lr = lrs.pop(0)
                if len(lr_drops) > 0:
                    next_drop = lr_drops.pop(0)
                else:
                    next_drop = np.inf
                training_session.updateOptimizer(_make_sgd(current_lr))
                training_session.optimizerFromHost()
                print("Learning_rate change to {}".format(current_lr))

            images = data[0]
            labels = data[1]
            stepio = popart.PyStepIO({
                image: images,
                label: labels
            }, training_anchors)

            # Train
            with Timer() as t1:
                training_session.run(stepio)
            batch_run_duration.append(t1.interval())

            # Get the loss and 'learnt' labels
            nll_loss_anch = training_anchors["loss"]
            arg_max_anch = training_anchors[argmax]

            # Store a scalar rather than the anchor array itself: the anchor
            # buffers are reused by every run, so keeping references would
            # make all stored entries alias the most recent step.
            batch_losses.append(np.mean(nll_loss_anch))
            batch_accs.append(100 * np.mean(arg_max_anch == labels))
            total_samples += (opts.batches_per_step * opts.batch_size)

            if total_steps % opts.steps_per_log == 0:
                training_duration = time.time() - training_start_point
                print_format = ("step: {step:6d}, epoch: {epoch:6.2f}, "
                                "lr: {lr:6.2g}, loss: {loss:6.3f}, "
                                "accuracy: {train_acc:6.3f}%, "
                                "img/sec: {img_per_sec:6.2f} "
                                "step_time: {duration:6.2f} sec "
                                "ipu_execution_time: {run_duration:6.2f}")

                stats = {
                    'step': total_steps,
                    'epoch': epoch,
                    'lr': current_lr,
                    'loss': np.mean(batch_losses),
                    'train_acc': np.mean(batch_accs),
                    'img_per_sec': total_samples / training_duration,
                    'duration': training_duration,
                    'run_duration': np.mean(batch_run_duration),
                }
                print(print_format.format(**stats))

                # Reset the metrics
                batch_accs.clear()
                batch_losses.clear()
                batch_run_duration.clear()
                total_samples = 0

                # Reset the training start point
                training_start_point = time.time()

        # Evaluation
        if not opts.no_validation:
            # The name of the onnx file we will create with the current state
            # of the training and use to validate with the validation session.
            onnx_file_name = "ckpt.onnx"
            training_session.modelToHost(onnx_file_name)

            # Copy weights and optimization parameters onto the device
            validation_session.resetHostWeights(onnx_file_name)
            validation_session.weightsFromHost()

            # Start every validation pass from empty accumulators so results
            # from earlier epochs cannot leak into this epoch's figures.
            validation_losses.clear()
            validation_accs.clear()

            validation_start_point = time.time()
            for validation_data in validation_dataset:
                validation_images = validation_data[0]
                validation_labels = validation_data[1]
                validation_stepio = popart.PyStepIO(
                    {
                        image: validation_images,
                        label: validation_labels
                    }, validation_anchors)

                validation_session.run(validation_stepio)

                # Get the loss and 'predicted' labels
                validation_nll_loss_anch = validation_anchors["loss"]
                validation_arg_max_anch = validation_anchors[argmax]

                # Scalars again, for the same aliasing reason as in training.
                validation_losses.append(np.mean(validation_nll_loss_anch))
                validation_accs.append(
                    100 *
                    np.mean(validation_arg_max_anch == validation_labels))

            print("Validation accuracy epoch {:6.2f}, img/sec:{:6.2f} "
                  "accuracy: {:6.3f}% loss: {:6.3f}".format(
                      epoch,
                      (len(validation_dataset) * opts.batch_size *
                       opts.batches_per_step /
                       (time.time() - validation_start_point)),
                      np.mean(validation_accs), np.mean(validation_losses)))

            # Validation ran on the same device, so restore the training
            # state from the checkpoint before the next epoch.
            training_session.resetHostWeights(onnx_file_name)

            # Write the training weights to the device
            training_session.weightsFromHost()
            training_session.optimizerFromHost()

    # Save the popart training report
    if opts.gc_profile_log_dir is not None:
        from gcprofile import save_popart_report
        save_popart_report(training_session)
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      doProfiling=False,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a small sin/exp/conv/softmax/nll model, run one step, return anchors.

    Args:
        doSharding: When not ``False`` the ops are placed manually on three
            virtual graphs.
        doPipelining: Value assigned to ``SessionOptions.enablePipelining``.
        batchesPerStep: Batches per step; inputs gain a leading dimension of
            this size when it is greater than 1.
        doTraining: ``True`` builds a training session (ConstSGD, lr 0.01),
            otherwise an inference session.
        doProfiling: ``True`` saves a profiling report after the run.
        doDevicex: ``False`` returns ``None`` right after session creation.
        anchorRestoredTensors: With training and pipelining, also anchor the
            restored versions of the input and ``e0``.
        returnRawInput: ``True`` adds the generated input under the key
            ``"input_raw"``.

    Returns:
        The anchor dict, or ``None`` when ``doDevicex`` is ``False``.
    """
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 2
    shape_d0 = [batchSize, 2, 4, 4]
    shape_l0 = [batchSize]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [batchSize, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        shape_l0.insert(0, batchesPerStep)
    data = np.random.uniform(low=-10.0, high=10.0,
                             size=shape_d0).astype(np.float32)

    # Integer division keeps the class count an int, so randint receives an
    # integral upper bound instead of a float.
    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[Union[popart.Loss, Iterable[popart.Loss]]] = None,
           optimizer: Optional[popart.Optimizer] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None):
    """Build a PopART session for ``proto``, run one step and return anchors.

    A training session is created when ``optimizer`` is given, otherwise an
    inference session.

    Args:
        proto: ONNX model to execute.
        data: Input arrays keyed by tensor id.
        outputs: Tensor id(s) to anchor and return.
        loss: Loss object(s) for the training session.
        optimizer: Optimizer; its presence selects a training session.
        return_stats: Also return memory / cycle statistics and enable the
            instrumentation engine options.
        log_dir: Directory for profiling reports (used with ``return_stats``).
        ipus: Number of IPUs; values of ``None`` or <= 1 become 1.
        batches_per_step: Batches per step; inputs are repeated accordingly.
        user_options: Extra session options applied with ``setattr``.

    Returns:
        ``(anchor_generator, post_proto, total_memory, max_tile_memory,
        cycles)`` when ``return_stats`` is set, otherwise
        ``(anchor_generator, post_proto)``.

    Raises:
        Exception: If no device could be acquired.
        popart.session.PrepareDeviceException: Re-raised if compilation fails
            (after optionally saving a report and detaching the device).
    """
    outputs = make_tuple(outputs)
    if loss is not None:
        loss = make_tuple(loss)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.enableGroupedMatmuls = False
    options.enableStochasticRounding = False
    options.constantWeights = True
    options.outlineThreshold = 10.0
    options.reportOptions = {"showVarStorage": "true"}

    if ipus is not None and ipus > 1:
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        ipus = 1

    for key, value in user_options.items():
        setattr(options, key, value)

    # `ipus` is always an int by this point, so this assignment is
    # unconditional (and deliberately placed after the user options, which it
    # overrides).
    # NOTE(review): this coexists with virtualGraphMode = Manual above when
    # ipus > 1 -- confirm that both settings are intended together.
    options.enableVirtualGraphs = False

    # Applied after the user options so instrumentation always wins when
    # statistics are requested.
    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true"
        }

    # Round the IPU count up to a power of two.
    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    device = popart.DeviceManager().acquireAvailableDevice(request_ipus)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFeed=data_flow,
                                         userOptions=options,
                                         losses=loss,
                                         optimizer=optimizer)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFeed=data_flow,
                                          userOptions=options)

    # Compile the Poplar Graph. If it fails, save the memory report (when
    # requested), release the device and propagate the error.
    try:
        session.prepareDevice()
    except popart.session.PrepareDeviceException as e:
        if return_stats and log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            gcprofile.save_popart_report(session,
                                         log_dir=log_dir,
                                         exception=e)
        # Do not keep the device attached after a failed compilation.
        device.detach()
        raise e
    print("Compilation complete")

    session.weightsFromHost()
    if optimizer is not None:
        session.optimizerFromHost()

    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], batches_per_step, 0)
            for k, v in data.items()
        }

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    # Round-trip the model through a temporary file to obtain the post-run
    # proto.
    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):
    """Build a small sin/exp/conv/softmax/nll model, run six steps, return anchors.

    The overall batch size comes from the module-level ``batch_size``; the
    micro batch is ``batch_size // gradAcclFactor``.

    Args:
        doSharding: When not ``False`` the ops are placed manually on three
            virtual graphs.
        doPipelining: Value assigned to ``SessionOptions.enablePipelining``.
        batchesPerStep: Batches per step.
        doTraining: ``True`` builds a training session (ConstSGD, lr 0.01),
            otherwise an inference session.
        doGradAccl: Value assigned to
            ``SessionOptions.enableGradientAccumulation``; with training it
            also anchors the accumulated weight gradient.
        gradAcclFactor: Gradient accumulation factor.
        doProfiling: ``True`` saves a profiling report after the runs.
        doDevicex: ``False`` returns ``None`` right after session creation.
        anchorRestoredTensors: With training and pipelining, also anchor the
            restored versions of the input and ``e0``.
        returnRawInput: ``True`` adds the generated input under the key
            ``"input_raw"``.
        labelArray: Not used by this function; labels are generated randomly.

    Returns:
        The anchor dict, or ``None`` when ``doDevicex`` is ``False``.
    """
    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Integer division keeps the class count an int, so randint receives an
    # integral upper bound instead of a float.
    # NOTE(review): shape_d0 has no outer step dimension yet at this point, so
    # the class count shrinks as batchesPerStep grows -- confirm intended.
    classes = np.prod(shape_d0) // (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between examples.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide up the batches per step batches into gradAcclFactor * batchesPerStep
        # samples.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    for _ in range(6):
        session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    """Build a six-matmul model with an nll loss, run one step, return anchors.

    The module-level ``batch_size`` and ``hidden_size`` define the shapes; the
    micro batch is ``batch_size // gradAcclFactor``. The matmuls are created
    in three groups of two, each group inside its own virtual-graph context.

    Args:
        doSharding: Truthy places the three groups on virtual graphs 0, 1, 2;
            otherwise everything goes on virtual graph 0. ``False`` selects a
            single IPU, anything else three.
        doPipelining: Value assigned to ``SessionOptions.enablePipelining``.
        batchesPerStep: Batches per step.
        doTraining: ``True`` builds a training session (ConstSGD, lr 0.01),
            otherwise an inference session.
        doGradAccl: Value assigned to
            ``SessionOptions.enableGradientAccumulation``; with training it
            also anchors the accumulated gradient of the last weight.
        gradAcclFactor: Gradient accumulation factor.
        doProfiling: ``True`` saves a profiling report after the run.
        doDevicex: ``False`` returns ``None`` right after session creation.
        anchorRestoredTensors: With training and pipelining, also anchor the
            restored loss and last weight.
        labelArray: Label array fed to the model; it is dereferenced when the
            step is run, so callers that execute must supply it.

    Returns:
        The anchor dict, or ``None`` when ``doDevicex`` is ``False``.
    """
    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    final_stage = 2
    for stage in range(final_stage + 1):
        with builder.virtualGraph(stage if doSharding else 0):
            for i in range(2):
                w = builder.addInitializedInputTensor(
                    np.ones([hidden_size, hidden_size]).astype(np.float32),
                    f"weight_{stage}_{i}")
                # The very last weight is the one that gets anchored.
                if stage == final_stage and i == 1:
                    w0 = w
                x = builder.aiOnnx.matmul([x, w])
            if stage == final_stage:
                # The label input and the loss live in the last stage's
                # virtual-graph context.
                label = builder.addInputTensor("INT32", [micro_batch_size])
                x = builder.aiGraphcore.nllloss([x, label])

    output = x
    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        grad_prefix = popart.reservedGradientPrefix()
        anchor_map[grad_prefix + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            for tensor_id in (x, w0):
                anchor_map[popart.reservedRestoredPrefix() + tensor_id] = art
        if doGradAccl is True:
            accl_key = (popart.reservedAcclToUpdatePrefix() +
                        popart.reservedGradientPrefix() + w0)
            anchor_map[accl_key] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    numIPUs = 1 if doSharding is False else 3

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=output,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Repeat the labels once per batch so every batch of the step sees
        # the same labels.
        outer_dim = outer_dim * batchesPerStep
        labelArray = np.repeat(labelArray[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split the per-step labels into gradAcclFactor * batchesPerStep
        # micro batches.
        outer_dim = outer_dim * gradAcclFactor
        labelArray = labelArray.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Prepend the gradAcclFactor * batchesPerStep dimension to the input.
        input_shape = [outer_dim] + input_shape

    feed = {
        input_: np.ones(input_shape, np.float32),
        label: labelArray.astype(np.int32)
    }
    stepio = popart.PyStepIO(feed, anchors)

    session.weightsFromHost()
    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    return anchors
def main(argv):
    """Compile the inference graph, run it over the data and report timings.

    Configuration is read from ``flags.FLAGS``; the model, data and anchor
    description come from ``graph_builder()``. With ``FLAGS.synthetic`` the
    same ``data`` object is fed ``FLAGS.iterations`` times, otherwise ``data``
    is iterated and each item is fed as one step.

    Args:
        argv: Not used by this function.

    Returns:
        The result of ``gcprofile.save_popart_report(session)`` when
        ``FLAGS.profile`` is set, otherwise ``None``.

    Raises:
        Exception: If no device could be acquired.

    Note:
        If device preparation fails, a profiling report is saved and the
        process exits with status 1.
    """
    FLAGS = flags.FLAGS
    FLAGS.samples_per_device = int(FLAGS.batch_size / FLAGS.replication_factor)

    proto, data, outputs, output_id = graph_builder()

    print(f"Model: {FLAGS.model_name}")
    if FLAGS.synthetic:
        print("Using synthetic data")
    else:
        print(f"Data_dir: {FLAGS.data_dir}")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")

    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Session options
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.ignoreData = True
    instrument = "true" if FLAGS.profile else "false"
    options.engineOptions = {
        "debug.instrument": instrument,
        "target.syncMethod": "polling"
    }

    # Select a device
    deviceManager = popart.DeviceManager()
    device = deviceManager.acquireAvailableDevice(1)
    print(f"{device}\n")
    if device is None:
        raise Exception("Not enough IPUs available.")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFeed=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    start = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as prepare_error:
        import gcprofile
        gcprofile.save_popart_report(session, exception=prepare_error)
        sys.exit(1)
    compilation_duration = time.time() - start
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Buffers that receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        # One line per step: total time, optional breakdown, then throughput.
        pieces = ["Total {:<8.3} sec.".format(duration)]
        if data_duration:
            pieces.append(" Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration)))
        if compute_duration:
            pieces.append(" Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration)))
        pieces.append(" {:5f} images/sec.".format(
            int(FLAGS.batch_size * FLAGS.batches_per_step / duration)))
        print("".join(pieces))

    print("Executing...")

    # Both modes run the same per-step body; they differ only in where each
    # step's inputs come from.
    if FLAGS.synthetic:
        step_inputs = (data for _ in range(FLAGS.iterations))
    else:
        step_inputs = data

    # Run
    durations = []
    start = time.time()
    for step_input in step_inputs:
        stepio = popart.PyStepIO(step_input, anchors)

        # Everything since `start` (including fetching the input) counts as
        # data time.
        data_time = time.time()
        data_d = data_time - start

        # Run compute
        session.run(stepio)

        # Calc compute duration
        results = anchors[output_id]
        comp_d = time.time() - data_time

        # Calc total duration
        t = time.time() - start
        report_time(t, data_d, comp_d)
        durations.append(t)
        start = time.time()

    # Mean step time; computed but not used further in this function.
    duration = np.mean(durations)

    if FLAGS.profile:
        import gcprofile
        return gcprofile.save_popart_report(session)
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      replicated_graph_count=1,
                      doProfiling=False,
                      doDropout=False,
                      doGradientAccl=False,
                      acclSteps=1,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a small model, run it for one step and return its anchors.

    The model is sin -> exp -> conv -> reshape -> (optional dropout) ->
    softmax -> nllloss, with a fixed batch size of 16.

    Args:
        doSharding: Unless this is ``False``, ops are placed manually on two
            virtual graphs and twice as many IPUs are requested.
        doPipelining: Assigned to ``opts.enablePipelining``.
        batchesPerStep: Batches per step passed to ``popart.DataFlow``.
        doTraining: If ``True`` a ``TrainingSession`` with ``ConstSGD(0.01)``
            is created and the gradient of the input is anchored; otherwise
            an ``InferenceSession`` is created.
        replicated_graph_count: Multiplies the number of IPUs requested; when
            greater than 1, graph replication is enabled.
        doProfiling: If ``True``, ``gcprofile.save_popart_report`` is called
            after the run.
        doDropout: If truthy, a dropout op (ratio 0.2) feeds the softmax.
        doGradientAccl: Assigned to ``opts.enableGradientAccumulation``.
        acclSteps: Assigned to ``opts.accumulationFactor``; also divides the
            batch size to give the micro-batch size.
        doDevicex: If ``False``, return ``None`` right after the session is
            constructed, without preparing the device.
        anchorRestoredTensors: With training and pipelining both ``True``,
            additionally anchors the input and the restored ``e0``/``d0``.
        returnRawInput: If ``True``, the generated input data is stored in
            the returned dict under ``"input_raw"``.

    Returns:
        The anchors dict filled by the run, or ``None`` when ``doDevicex`` is
        ``False``.
    """
    # Fixed seed so the generated labels and data are reproducible
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 16
    microBatchSize = batchSize // acclSteps

    shape_d0 = [microBatchSize, 2, 4, 4]
    shape_l0 = [microBatchSize]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugContext="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [microBatchSize, 32])
    if doDropout:
        do0 = builder.aiOnnx.dropout([r0], num_outputs=1, ratio=0.2)[0]
        out = builder.aiOnnx.softmax([do0], axis=1, debugContext="sfm")
    else:
        out = builder.aiOnnx.softmax([r0], axis=1, debugContext="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0],
                                      reduction=popart.ReductionType.Sum)

    art = popart.AnchorReturnType("All")

    # Loss, weights and the exp output are always anchored
    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradientAccl
    opts.accumulationFactor = acclSteps
    opts.enableStochasticRounding = False

    if doSharding is False:
        numIpus = 1 * replicated_graph_count
    else:
        # Manual sharding: sin/exp/conv on virtual graph 0, the rest on 1
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIpus = 2 * replicated_graph_count
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 0)
        builder.virtualGraph(c0, 0)
        builder.virtualGraph(r0, 1)
        if doDropout:
            builder.virtualGraph(do0, 1)
        builder.virtualGraph(out, 1)
        builder.virtualGraph(nll, 1)

    if replicated_graph_count > 1:
        opts.replicatedGraphCount = replicated_graph_count
        opts.enableReplicatedGraphs = True

    device = tu.create_test_device(numIpus=numIpus)
    if doTraining is True:
        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=popart.DataFlow(
                                             batchesPerStep, anchor_map),
                                         loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         userOptions=opts,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                          dataFlow=popart.DataFlow(
                                              batchesPerStep, anchor_map),
                                          userOptions=opts,
                                          deviceInfo=device)

    # Callers that only need session construction stop here
    if doDevicex is False:
        return None

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)

    # NOTE: labels are drawn before the data; the order of the two RNG calls
    # determines the generated values.
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    # With all options enabled return anchors are of the shape:
    # [batches_per_step, accl_factor, repl_factor, micro_batch, *data_shape]
    if acclSteps > 1:
        shape_d0.insert(0, acclSteps)
        label = label.reshape([acclSteps, -1])
    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)

    data = np.random.random_sample(shape_d0).astype(np.float32)

    # This is a slightly odd case - we want the same data to be input for both
    # replicated graphs, but the dimension we need to repeat on is either the
    # first or second (the replication dimension) depending on whether we
    # have gradient accumulation enabled.
    # If we are not testing, this is a lot simpler as we can split samples however
    # we want.
    if replicated_graph_count > 1:
        if acclSteps > 1:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 2)
            label = label.reshape([replicated_graph_count, -1])
        else:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 1)
            label = label.reshape([replicated_graph_count, -1])

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)
    # Runtime asserts on the StepIO are switched off for this run
    stepio.enableRuntimeAsserts(False)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
def main(argv):
    """Compile the inference graph for one IPU and time every execution step.

    The model proto, input data, anchor map and output id come from
    ``graph_builder()``; run-time configuration is read from ``flags.FLAGS``.
    One timing line is printed per executed step, followed by the hardware
    cycle count when ``FLAGS.report_hw_cycle_count`` is set.

    Args:
        argv: Not used by this function.

    Raises:
        Exception: If no IPU could be acquired.
        SystemExit: With status 1 if device preparation fails (a report is
            saved first).
    """
    FLAGS = flags.FLAGS
    print(f"micro batch size is {FLAGS.micro_batch_size}")
    print(f"batch size is {FLAGS.batch_size}")
    print(f"batches_per_step is {FLAGS.batches_per_step}")
    proto, data, outputs, output_id = graph_builder()

    print(f"Model: {FLAGS.model_name}")
    if not FLAGS.synthetic:
        print(f"Data_dir: {FLAGS.data_dir}")
    else:
        print("Using synthetic data")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")
    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = FLAGS.report_hw_cycle_count

    # Configure precision of convolutions and MatMuls
    if FLAGS.half_partials:
        options.convolutionOptions = {'partialsType': 'half'}
        options.partialsTypeMatMuls = "half"

    # Select a device; fail before printing so the error is the first output
    deviceManager = popart.DeviceManager()
    device = deviceManager.acquireAvailableDevice(1)
    if device is None:
        raise Exception("Not enough IPUs available.")
    print(f"{device}\n")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFlow=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    start = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        # Save whatever report is available for the failed compilation
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - start
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        """Print total, preprocessing and compute time plus images/sec."""
        report_string = "Total {:<8.3} sec.".format(duration)
        if data_duration:
            report_string += " Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration))
        if compute_duration:
            report_string += " Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration))
        report_string += " {:5f} images/sec.".format(
            int(FLAGS.micro_batch_size * FLAGS.batches_per_step / duration))
        print(report_string)
        if FLAGS.report_hw_cycle_count:
            print("Hardware cycle count per 'run':", session.getCycleCount())

    print("Executing...")
    # Synthetic runs feed the same `data` object FLAGS.iterations times;
    # otherwise `data` is iterated and every item feeds one step.
    if FLAGS.synthetic:
        feeds = (data for _ in range(FLAGS.iterations))
    else:
        feeds = data

    start = time.time()
    for feed in feeds:
        stepio = popart.PyStepIO(feed, anchors)
        # Time spent since `start` (fetching the feed, building the StepIO)
        # is reported as preprocessing.
        data_time = time.time()
        data_d = data_time - start
        # Run compute
        session.run(stepio)
        comp_d = time.time() - data_time
        # Total duration of this step
        t = time.time() - start
        report_time(t, data_d, comp_d)
        start = time.time()
def train_process(opts):
    """Train a torchvision-style model through the PopART PyTorch session.

    The model named by ``opts.model_name`` is wrapped with a softmax layer,
    converted into a ``popart.torch.TrainingSession``, compiled, and trained
    for ``opts.epochs`` epochs with optional periodic validation.

    Args:
        opts: Parsed options. This function reads ``model_name``,
            ``dataset``, ``learning_rate``, ``momentum``, ``weight_decay``,
            ``no_inplacing``, ``batch_size``, ``batches_per_step``,
            ``num_ipus``, ``simulation``, ``epochs``, ``no_validation``,
            ``valid_per_epoch`` and ``gc_profile_log_dir``; it is also passed
            on to the dataset, option, training and validation helpers.
    """
    net = getattr(model, opts.model_name)(
        pretrained=False,
        progress=True,
        num_classes=10 if opts.dataset == "CIFAR-10" else 1000)

    # Models are missing a softmax layer to work with our NllLoss,
    # so we just add one on.
    net = nn.Sequential(net, nn.Softmax(dim=1))

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=opts.learning_rate,
                          momentum=opts.momentum,
                          weight_decay=opts.weight_decay)

    trainset, testset, trainloader, testloader = get_dataset(opts)

    # One sample batch is handed to the session below. Use the built-in
    # next(): Python 3 iterators are not required to expose a .next() method.
    inputs, labels = next(iter(trainloader))

    sessionOpts = get_options(opts)

    patterns = popart.Patterns()
    # NOTE(review): InPlace is assigned opts.no_inplacing as-is; confirm the
    # flag's polarity against the argument parser.
    patterns.InPlace = opts.no_inplacing

    start = time.time()
    # Pass all the pytorch stuff to the session
    torchSession = popart.torch.TrainingSession(
        torchModel=net,
        inputs=inputs,
        targets=labels,
        optimizer=optimizer,
        losses=criterion,
        batch_size=opts.batch_size,
        batches_per_step=opts.batches_per_step,
        deviceInfo=get_device(opts.num_ipus, opts.simulation),
        userOptions=sessionOpts,
        passes=patterns)
    print("Converting pytorch model took {:.2f}s".format(time.time() - start))

    # Prepare for training.
    start = time.time()
    print("Compiling model...")
    anchors = torchSession.initAnchorArrays()

    torchSession.prepareDevice()
    torchSession.optimizerFromHost()
    torchSession.weightsFromHost()
    torchSession.setRandomSeed(0)
    print("Compiling popart model took {:.2f}s".format(time.time() - start))

    for epoch in range(opts.epochs):  # loop over the dataset multiple times
        run_training(opts, epoch, torchSession, trainloader, trainset, anchors)
        if (not opts.no_validation) and ((epoch + 1) % opts.valid_per_epoch == 0):
            run_validation(opts, epoch, torchSession, testloader, testset)

    print('Finished Training')

    # Save the popart training report
    if opts.gc_profile_log_dir is not None:
        from gcprofile import save_popart_report
        save_popart_report(torchSession)
def main(args):
    """Run one step of the pipelined model on two IPUs with synthetic data.

    Args:
        args: Parsed options. Reads ``export`` (path to write the model proto
            to, if set), ``no_pipelining``, ``report`` (save a gcprofile
            report after the run) and ``test`` (return the session).

    Returns:
        The training session when ``args.test`` is truthy, otherwise
        ``None``.

    Raises:
        RuntimeError: If the requested IPUs could not be acquired.
    """
    # Model parameters
    np.random.seed(1971)
    input_rows = 28
    input_columns = 28
    num_classes = 10
    batch_size = 8
    input_shape = [batch_size, input_rows * input_columns]
    labels_shape = [batch_size]

    # Create model
    x0, labels, model_proto, anchor_map, loss = create_pipelined_model(
        num_features=input_columns * input_rows,
        num_classes=num_classes,
        batch_size=batch_size)

    # Save model (optional)
    if args.export:
        with open(args.export, 'wb') as model_path:
            model_path.write(model_proto)

    # Session options
    opts = popart.SessionOptions()
    opts.enablePipelining = not args.no_pipelining
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.engineOptions = {"debug.instrument": "true"}
    pipeline_depth = 64
    num_ipus = 2

    # Fail with a clear message instead of handing None to the session
    device = popart.DeviceManager().acquireAvailableDevice(num_ipus)
    if device is None:
        raise RuntimeError("Not enough IPUs available.")

    # Create session
    session = popart.TrainingSession(
        fnModel=model_proto,
        dataFlow=popart.DataFlow(pipeline_depth, anchor_map),
        loss=loss,
        optimizer=popart.ConstSGD(0.01),
        userOptions=opts,
        deviceInfo=device)

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Extra data feed for pipeline
    if pipeline_depth > 1:
        labels_shape.insert(0, pipeline_depth)
        input_shape.insert(0, pipeline_depth)

    # Synthetic data input (data is drawn before labels; the order fixes the
    # generated values under the seed above)
    data_in = np.random.uniform(
        low=0.0, high=1.0, size=input_shape).astype(np.float32)
    labels_in = np.random.randint(
        low=0, high=num_classes, size=labels_shape).astype(np.int32)

    # Run session
    inputs = {x0: data_in, labels: labels_in}
    stepio = popart.PyStepIO(inputs, anchors)
    session.weightsFromHost()
    session.run(stepio)

    # Save report and return session object (optional)
    if args.report:
        from gcprofile import save_popart_report
        save_popart_report(session)
    if args.test:
        return session
    return None
def main(args):
    """Run one training step on one IPU with optional recomputation.

    Args:
        args: Parsed options. Reads ``recomputing`` (``'ON'`` forces
            recomputation in the model builder, ``'AUTO'`` enables standard
            auto-recomputation in the session options), ``export`` (path to
            write the model proto to, if set), ``report`` (save a gcprofile
            report after the run) and ``test`` (return the session).

    Returns:
        The training session when ``args.test`` is truthy, otherwise
        ``None``.

    Raises:
        RuntimeError: If the requested IPU could not be acquired.
    """
    # Model parameters
    np.random.seed(1971)
    input_rows = 28
    input_columns = 28
    num_classes = 10
    batch_size = 2048
    input_shape = [batch_size, input_rows * input_columns]
    labels_shape = [batch_size]

    # Create model
    x0, labels, model_proto, anchor_map, loss = create_model(
        num_features=input_columns * input_rows,
        num_classes=num_classes,
        batch_size=batch_size,
        force_recompute=args.recomputing == 'ON')

    # Save model (optional)
    if args.export:
        with open(args.export, 'wb') as model_path:
            model_path.write(model_proto)

    # Session options
    num_ipus = 1
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.engineOptions = {"debug.instrument": "true"}
    if args.recomputing == 'AUTO':
        opts.autoRecomputation = popart.RecomputationType.Standard

    # Fail with a clear message instead of handing None to the session
    device = popart.DeviceManager().acquireAvailableDevice(num_ipus)
    if device is None:
        raise RuntimeError("Not enough IPUs available.")

    # Create session
    session = popart.TrainingSession(
        fnModel=model_proto,
        dataFeed=popart.DataFlow(1, anchor_map),
        losses=[loss],
        optimizer=popart.ConstSGD(0.01),
        userOptions=opts,
        deviceInfo=device)

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Synthetic data input (data is drawn before labels; the order fixes the
    # generated values under the seed above)
    data_in = np.random.uniform(low=0.0, high=1.0,
                                size=input_shape).astype(np.float32)
    labels_in = np.random.randint(low=0, high=num_classes,
                                  size=labels_shape).astype(np.int32)

    # Run session
    inputs = {x0: data_in, labels: labels_in}
    stepio = popart.PyStepIO(inputs, anchors)
    session.weightsFromHost()
    session.optimizerFromHost()
    session.run(stepio)

    # Save report and return session object (optional)
    if args.report:
        from gcprofile import save_popart_report
        save_popart_report(session)
    if args.test:
        return session
    return None