def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue.
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution.
    """
    # Set compile and device options
    os.environ["TF_POPLAR_FLAGS"] = os.environ.get("TF_POPLAR_FLAGS", "") + " --use_ipu_model"
    # report_mode is expected to be set at module level (e.g. from command-line args)
    use_poplar_text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(opts, matmul_options={
        "availableMemoryProportion": str(available_memory_proportion)
    })
    opts = ipu_utils.set_convolution_options(opts, convolution_options={
        "availableMemoryProportion": str(available_memory_proportion)
    })
    ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    session = tf.Session()
    session.run(infeed_queue_initializer)
    session.run(loop_op, options=run_options)
    session.run(outfeed_op, options=run_options)
    out = session.run(report)
    if report_mode == 'text':
        # Extract the text report and write it to disk
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s" % report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
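# Usage sketch for get_report() above. The names are illustrative only:
# `inference_loop`, `infeed_queue` and `outfeed_queue` are assumed to come from
# an existing ipu_compiler.compile / IPUInfeedQueue / IPUOutfeedQueue pipeline.
get_report(loop_op=inference_loop,
           infeed_queue_initializer=infeed_queue.initializer,
           outfeed_op=outfeed_queue.dequeue(),
           report_dest="inference_report.txt",
           available_memory_proportion=0.4)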
def generate_report(graph):
    print(f'Generating training report... {graph.report}')
    report = graph.session.run(graph.report)

    compilation_report = ipu_utils.extract_compile_reports(report)
    execution_report = ipu_utils.extract_execute_reports(report)

    with open("report.txt", "w") as f:
        f.write(ipu_utils.extract_all_strings_from_event_trace(report))
    with open("compilation_report.json", "w") as f:
        json.dump(compilation_report, f)
    with open("execution_report.json", "w") as f:
        json.dump(execution_report, f)
    print('Reports saved to the current directory.')
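# Usage sketch: `graph` is assumed to be a small container exposing the
# tf.Session and the ipu_event_trace() op created for profiling. The GraphOps
# name and its fields are illustrative; any object with .session and .report
# attributes works. Assumes the same imports and IPU configuration as the
# surrounding snippets.
from collections import namedtuple

GraphOps = namedtuple('GraphOps', ['session', 'report'])

with tf.device('cpu'):
    report_op = gen_ipu_ops.ipu_event_trace()

sess = tf.Session()
generate_report(GraphOps(session=sess, report=report_op))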
def generate_report(batch_size: int, report_dest: str = "./densenet_report.txt") -> None:
    """Generate report from running model on IPU.

    Args:
        batch_size: Batch size for inference.
        report_dest: Location to save generated text report.
    """
    # Set compile and device options
    os.environ['TF_POPLAR_FORCE_IPU_MODEL'] = "1"
    opts = utils.create_ipu_config(profiling=True, use_poplar_text_report=True)
    utils.auto_select_ipus(opts, [1])
    utils.configure_ipu_system(opts)

    output_probs = construct_graph(batch_size)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    with tf.Session() as session:
        session.run(output_probs,
                    feed_dict={
                        "optimized/image_input:0": np.zeros(
                            (batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS),
                            dtype=np.float16)
                    },
                    options=run_options)
        out = session.run(report)

    # Extract the report
    rep = utils.extract_all_strings_from_event_trace(out)
    logging.info("Writing densenet profiling report to %s" % report_dest)
    with open(report_dest, "w") as f:
        f.write(rep)
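# Usage sketch: calling generate_report() above when the module is run as a
# script. The flag name is illustrative only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description="Generate a densenet profiling report")
    parser.add_argument('--batch-size', type=int, default=1)
    args = parser.parse_args()
    generate_report(args.batch_size)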
# Run the main graph
session.run(logits, feed_dict={x: training_data})

# Execute the event trace op: the result is a list of serialized trace event protobufs.
raw_report = session.run(trace)

# These objects can be converted to strings with utility functions, as shown below.
ext = ".json" if args.json_report else ".txt"

if args.split_reports:
    compile_reports = utils.extract_compile_reports(raw_report)
    execution_reports = utils.extract_execute_reports(raw_report)
    # Both are lists with one entry per graph profiled, except that the
    # execution_reports list will be empty if execution profiling is not enabled.
    # You could save only the last entry (the one for the main graph); in this case we save everything.

    with open("compile" + ext, "w", encoding="utf-8") as f:
        for report in compile_reports:
            # Each element of the list is a tuple of 2 elements:
            # the first is a string with an auto-generated name for the XLA graph,
            # the second is a string containing the actual report for that graph.
            xla_name, report_string = report
            f.write(xla_name + "\n")
            f.write(report_string + "\n")

    if len(execution_reports) > 0:
        with open("execution" + ext, "w", encoding="utf-8") as f:
            for report in execution_reports:
                xla_name, report_string = report
                f.write(xla_name + "\n")
                f.write(report_string + "\n")
else:
    report = utils.extract_all_strings_from_event_trace(raw_report)
    with open("report" + ext, "w", encoding="utf-8") as f:
        f.write(report)
start = time.time()
sess.run(inference_output)
convolution_predictions = sess.run(outfeed)
# convolution_predictions = sess.run(inference_output, feed_dict={input_image: np_image})
raw_output = sess.run(
    decoder, feed_dict={input_detection: convolution_predictions[0]})
filtered_output = process_detections(raw_output)
draw_detections(original_image, original_image_dims[0],
                original_image_dims[1], filtered_output)
print("Done running inference.")
duration = time.time() - start
print("Duration: {:.3f} seconds\n".format(duration))

if REPORT:
    rep_out = sess.run(report)
    save_tf_report(rep_out)
    rep = utils.extract_all_strings_from_event_trace(rep_out)
    with open(
            str(WIDTH) + "x" + str(HEIGHT) + "_ipus" + str(NUM_IPUS) +
            "_ssd_report.txt", "w") as f:
        f.write(rep)

# Performance runs
print("Executing...")
for iter_count in range(N_ITERATIONS):
    print("Running iteration: ", iter_count)
    # Run
    start = time.time()
    sess.run(inference_output)
    convolution_predictions = sess.run(outfeed)
    raw_output = sess.run(
        decoder, feed_dict={input_detection: convolution_predictions[0]})
    filtered_output = process_detections(raw_output)
def train(replication_factor, batch_size, batch_per_step, profile, num_iter, time_steps):
    """Launch training."""

    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        items = next(data_generator)
        output_types = tuple((tf.dtypes.as_dtype(i.dtype) for i in items))
        output_shapes = tuple((tf.TensorShape(i.shape) for i in items))
        total_bytes = 0
        for i in items:
            total_bytes += i.nbytes
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))

    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(profiling=profile,
                                   use_poplar_text_report=use_poplar_text_report,
                                   profile_execution=profile,
                                   merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time
    total_time = 0.0
    total_samples = 0
    # Initially the infeed may buffer extra input data, and the first IPU run
    # includes the XLA compile, so these iterations are skipped when
    # calculating items/sec.
    skip_iterations = 5
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # Extract the text report
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step * replication_factor)

    print("Average %.1f items/sec" % (total_samples / total_time))
def extract_runtimes_from_report(report, display=True):
    """Extract and display timing information from an IpuTraceEvent report.

    report -- Array of text encoded IpuTraceEvent
    """
    if len(report) == 0:
        return

    # Timings from tf xla event timestamps
    from tensorflow.compiler.plugin.poplar.driver.trace_pb2 import IpuTraceEvent

    # Retrieve IpuEvents, poplar report and cycles
    events = list(map(IpuTraceEvent.FromString, report))
    report = utils.extract_all_strings_from_event_trace(report)
    m = list(map(int, re.findall(r"Program cycles\s*:\s*([\d\.]+)", report)))

    global start_time
    first = start_time == 0
    if first:
        start_time = events[0].timestamp
        events = events[1:]

    evt_str = "\nIPU Timings\n"
    exec_num = 0
    for evt in events:
        extra_str = ""
        if evt.type == IpuTraceEvent.COMPILE_BEGIN:
            continue
        elif evt.type == IpuTraceEvent.COMPILE_END:
            evt_name = "Compile"
        elif evt.type == IpuTraceEvent.HOST_TO_DEVICE_TRANSFER:
            evt_name = "Host->Device"
            extra_str = "\n Tensors:"
            transferred_tensors = json.loads(
                evt.data_transfer.data_transfer.decode("utf-8"))
            for t in transferred_tensors["tensors"]:
                extra_str += "\n   handle: {:>6}, size: {}".format(
                    t["name"], t["size"])
            extra_str += "\n Total_size: {}".format(
                transferred_tensors["total_size"])
        elif evt.type == IpuTraceEvent.DEVICE_TO_HOST_TRANSFER:
            evt_name = "Device->Host"
            extra_str = "\n Tensors:"
            transferred_tensors = json.loads(
                evt.data_transfer.data_transfer.decode("utf-8"))
            for t in transferred_tensors["tensors"]:
                extra_str += "\n   handle: {:>6}, size: {}".format(
                    t["name"], t["size"])
            extra_str += "\n Total_size: {}".format(
                transferred_tensors["total_size"])
        elif evt.type == IpuTraceEvent.LOAD_ENGINE:
            evt_name = "Load engine"
        elif evt.type == IpuTraceEvent.EXECUTE:
            evt_name = "Execute"
            if m and m[exec_num]:
                execution_time = float(m[exec_num]) / (
                    1 * 1000 * 1000 * 1000)  # Implied 1GHz clock speed
                extra_str = "\n Execution Time: {:.3g}s".format(execution_time)
                extra_str += "\n Streaming Time: {:.3g}s".format(
                    (evt.timestamp - start_time) - execution_time)
            exec_num += 1
        else:
            evt_name = "Unknown event"

        evt_str += "{:<15s}: {:<8.3g} s {}\n".format(
            evt_name, (evt.timestamp - start_time), extra_str)
        start_time = evt.timestamp

    # Print Cycle count from poplar report
    evt_str += "\nCycle counts on IPU\n"
    for execution_num, execution_cycles in enumerate(m):
        evt_str += "Execution {} cycles : {}\n".format(execution_num,
                                                       execution_cycles)

    if display:
        print(evt_str)

    # Write Report to file
    if first:
        with open("report.txt", "w") as f:
            f.write(report)
        print("\nWritten to file: report.txt")
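# Usage sketch: configure the IPU with profiling enabled, run a trivial graph,
# run the event trace op, then pass the raw events to
# extract_runtimes_from_report() above. Assumes the same imports as the other
# snippets plus ipu_scope; `start_time` is the module-level global the
# function relies on.
start_time = 0

cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
cfg = utils.auto_select_ipus(cfg, 1)
utils.configure_ipu_system(cfg)

with ipu_scope("/device:IPU:0"):
    out = tf.constant([1.0, 2.0]) * 2.0

with tf.device('cpu'):
    trace_op = gen_ipu_ops.ipu_event_trace()

with tf.Session() as sess:
    sess.run(out)
    raw_events = sess.run(trace_op)  # list of serialized IpuTraceEvent protos
    extract_runtimes_from_report(raw_events)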
def basic_graph(pa, pb, pc):
    # Do basic addition with tensors
    o1 = pa + pb
    o2 = pa + pc
    simple_graph_output = o1 + o2
    return simple_graph_output


with ipu_scope("/device:IPU:0"):
    result = basic_graph(pa, pb, pc)

with tf.Session() as sess:
    # Run the graph through the session, feeding it an arbitrary dictionary
    result = sess.run(result,
                      feed_dict={
                          pa: [1., 1.],
                          pb: [0., 1.],
                          pc: [1., 5.]
                      })

    # Generate report based on the event run in session
    trace_out = sess.run(report)
    trace_report = utils.extract_all_strings_from_event_trace(trace_out)

    # Write trace report to file
    with open('Trace_Event_Report.rep', "w") as f:
        f.write(trace_report)

    # Print the result
    print(result)
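# Setup assumed by the snippet above (a minimal sketch): the placeholders, the
# trace op and the IPU configuration have to exist before basic_graph() is
# placed on the IPU. Import paths follow the TF1-based Poplar SDK and may
# differ between SDK versions.
import numpy as np
import tensorflow as tf
from tensorflow.python.ipu import utils
from tensorflow.python.ipu.scopes import ipu_scope
from tensorflow.compiler.plugin.poplar.ops import gen_ipu_ops

pa = tf.placeholder(np.float32, [2], name="a")
pb = tf.placeholder(np.float32, [2], name="b")
pc = tf.placeholder(np.float32, [2], name="c")

with tf.device('cpu'):
    report = gen_ipu_ops.ipu_event_trace()

cfg = utils.create_ipu_config(profiling=True)
cfg = utils.auto_select_ipus(cfg, 1)
utils.configure_ipu_system(cfg)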