def _format_measure_tuple(measure): format_memory = getattr(torch_profiler, "format_memory", lambda _: "N/A") self_cpu_total = ( torch_profiler.format_time(measure.self_cpu_total) if measure else "" ) cpu_total = torch_profiler.format_time(measure.cpu_total) if measure else "" self_cuda_total = ( torch_profiler.format_time(measure.self_cuda_total) if measure and measure.self_cuda_total is not None else "" ) cuda_total = torch_profiler.format_time(measure.cuda_total) if measure else "" self_cpu_memory = ( format_memory(measure.self_cpu_memory) if measure and measure.self_cpu_memory is not None else "" ) cpu_memory = ( format_memory(measure.cpu_memory) if measure and measure.cpu_memory is not None else "" ) self_cuda_memory = ( format_memory(measure.self_cuda_memory) if measure and measure.self_cuda_memory is not None else "" ) cuda_memory = ( format_memory(measure.cuda_memory) if measure and measure.cuda_memory is not None else "" ) occurrences = str(measure.occurrences) if measure else "" return Measure( self_cpu_total=self_cpu_total, cpu_total=cpu_total, self_cuda_total=self_cuda_total, cuda_total=cuda_total, self_cpu_memory=self_cpu_memory, cpu_memory=cpu_memory, self_cuda_memory=self_cuda_memory, cuda_memory=cuda_memory, occurrences=occurrences, )
def traces_to_display(traces, trace_events, want_op_file, show_events=False, paths=None): """Construct human readable output of the profiler traces and events. """ tree = OrderedDict() for trace in traces: [path, leaf, module] = trace current_tree = tree # unwrap all of the events, in case model is called multiple times events = [te for tevents in trace_events[path] for te in tevents] for depth, name in enumerate(path, 1): if name not in current_tree: current_tree[name] = OrderedDict() if depth == len(path) and ((paths is None and leaf) or (paths is not None and path in paths)): # tree measurements have key None, avoiding name conflict if show_events: for event in events: current_tree[name][event.name] = { None: Measure( event.self_cpu_time_total, event.cpu_time_total, event.cuda_time_total, ) } else: current_tree[name][None] = Measure( sum([e.self_cpu_time_total for e in events]), sum([e.cpu_time_total for e in events]), sum([e.cuda_time_total for e in events]), ) current_tree = current_tree[name] tree_lines = flatten_tree(tree) # dt = ('|', '|-- ', '+-- ', ' ') # ascii dt = ("\u2502", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 ", " " ) # ascii-ex format_lines = [] for idx, tree_line in enumerate(tree_lines): depth, name, measures = tree_line self_cpu_time = "" cpu_time = "" cuda_time = "" if measures: self_cpu_time = tprofiler.format_time(measures.self_cpu_total) cpu_time = tprofiler.format_time(measures.cpu_total) cuda_time = tprofiler.format_time(measures.cuda_total) pre = "" next_depths = [pl[0] for pl in tree_lines[idx + 1:]] current = True while depth: if current: if depth in next_depths and next_depths[0] >= depth: pre = dt[1] else: pre = dt[2] else: if depth in next_depths: pre = dt[0] + pre else: pre = dt[3] + pre depth -= 1 current = False if self_cpu_time == "": continue format_lines.append([name, self_cpu_time, cpu_time, cuda_time]) # construct the table mynn = { "Layer Name": [], "Self CPU total": [], "CPU total": [], "GPU total": [] } #heading = ("Module", "Self CPU totacl", "CPU total", "CUDA total") #max_lens = [max(map(len, col)) for col in zip(*([heading] + format_lines))] # create the heading # disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | " # disp += "{:>{}s}".format(heading[1], max_lens[1]) + " | " # disp += "{:>{}s}".format(heading[2], max_lens[2]) + " | " # disp += "{:>{}s}".format(heading[3], max_lens[3]) + "\n" # disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n" for line in format_lines: label, self_cpu_time, cpu_time, cuda_time = line mynn["Layer Name"].append(str(label)) mynn["Self CPU total"].append(str(self_cpu_time)) mynn["CPU total"].append(str(cpu_time)) mynn["GPU total"].append(str(cuda_time)) df = DataFrame( mynn, columns=["Layer Name", "Self CPU total", "CPU total", "GPU total"]) if want_op_file == True: export_csv = df.to_csv(r'output_file.csv', index=None, header=True) else: print(df)
def save_events_table( events, path, times_path=None, sort_by=None, header=None, row_limit=100, max_src_column_width=75, with_flops=True, profile_memory=False, top_level_events_only=False, only_save_root_call=False): """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg).""" if len(events) == 0: return "" has_cuda_time = any([event.self_cuda_time_total > 0 for event in events]) has_cuda_mem = any([event.self_cuda_memory_usage > 0 for event in events]) has_input_shapes = any( [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events]) if sort_by is not None: events = EventList(sorted( events, key=lambda evt: getattr(evt, sort_by), reverse=True ), use_cuda=has_cuda_time, profile_memory=profile_memory, with_flops=with_flops) stacks = [] for evt in events: if evt.stack is not None and len(evt.stack) > 0: stacks.append(evt.stack) has_stack = len(stacks) > 0 print(f"Has stack: {has_stack}") #time columns in microseconds, memory columns in bytes headers = [ 'Name', 'Self CPU %', 'Self CPU', 'CPU total %', 'CPU total', 'CPU time avg', ] if has_cuda_time: headers.extend([ 'Self CUDA', 'Self CUDA %', 'CUDA total', 'CUDA time avg', ]) if profile_memory: headers.extend([ 'CPU Mem', 'Self CPU Mem', ]) if has_cuda_mem: headers.extend([ 'CUDA Mem', 'Self CUDA Mem', ]) headers.append( '# of Calls' ) # Only append Node ID if any event has a valid (>= 0) Node ID append_node_id = any([evt.node_id != -1 for evt in events]) if append_node_id: headers.append('Node ID') # Have to use a list because nonlocal is Py3 only... MAX_STACK_ENTRY = 5 def auto_scale_flops(flops): flop_headers = [ 'FLOPS', 'KFLOPS', 'MFLOPS', 'GFLOPS', 'TFLOPS', 'PFLOPS', ] assert flops > 0 log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1))) assert log_flops >= 0 and log_flops < len(flop_headers) return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)]) with open(path, 'w') as profiler_csv: writer = csv.writer(profiler_csv) ''' if with_flops: # Auto-scaling of flops header US_IN_SECOND = 1000.0 * 1000.0 # cpu_time_total is in us raw_flops = [] for evt in events: if evt.flops > 0: if evt.cuda_time_total != 0: evt.flops = float(evt.flops) / evt.cuda_time_total * US_IN_SECOND else: evt.flops = float(evt.flops) / evt.cpu_time_total * US_IN_SECOND raw_flops.append(evt.flops) if len(raw_flops) != 0: (flops_scale, flops_header) = auto_scale_flops(min(raw_flops)) header = headers + flops_header else: with_flops = False # can't find any valid flops''' if with_flops: flop_header = ["FLOP"] headers = headers + flop_header if has_stack: headers = headers + ['Source Location'] headers = headers + ['DeviceType'] writer.writerow(headers) # Have to use a list because nonlocal is Py3 only... result = [] sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events]) sum_self_cuda_time_total = 0 for evt in events: if evt.device_type == DeviceType.CPU: # in legacy profiler, kernel info is stored in cpu events if evt.is_legacy: sum_self_cuda_time_total += evt.self_cuda_time_total elif evt.device_type == DeviceType.CUDA: # in kineto profiler, there're events with the correct device type (e.g. CUDA) sum_self_cuda_time_total += evt.self_cuda_time_total # Actual printing event_limit = 0 for evt in events: if event_limit == row_limit: break if top_level_events_only and evt.cpu_parent is not None: continue else: event_limit += 1 name = evt.key ''' row_values = [ name, # Self CPU total %, 0 for async events. format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total), evt.self_cpu_time_total, # Self CPU total # CPU total %, 0 for async events. format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0, evt.cpu_time_total, # CPU total evt.cpu_time, # CPU time avg ]''' row_values = [ name, # Self CPU total %, 0 for async events. (evt.self_cpu_time_total/sum_self_cpu_time_total), evt.self_cpu_time_total, # Self CPU total # CPU total %, 0 for async events. (evt.cpu_time_total / sum_self_cpu_time_total) if not evt.is_async else 0, evt.cpu_time_total, # CPU total evt.cpu_time, # CPU time avg ] if has_cuda_time: row_values.extend([ evt.self_cuda_time_total, # CUDA time total % (evt.self_cuda_time_total/sum_self_cuda_time_total), evt.cuda_time_total, evt.cuda_time, # Cuda time avg ]) if profile_memory: row_values.extend([ # CPU Mem Total evt.cpu_memory_usage, # Self CPU Mem Total evt.self_cpu_memory_usage, ]) if has_cuda_mem: row_values.extend([ # CUDA Mem Total evt.cuda_memory_usage, # Self CUDA Mem Total evt.self_cuda_memory_usage, ]) row_values.append( evt.count, # Number of calls ) if append_node_id: row_values.append(evt.node_id) if has_input_shapes: row_values.append(str(evt.input_shapes)) if with_flops: row_values.append(evt.flops) '''if with_flops: if evt.flops <= 0.0: row_values.append("--") else: row_values.append('{0:8.3f}'.format(evt.flops * flops_scale))''' if has_stack: if (len(evt.stack) > 0) and only_save_root_call: src_field = evt.stack[0] elif not only_save_root_call: src_field = ",".join(evt.stack) row_values.append(src_field) row_values.append(evt.device_type) writer.writerow(row_values) if times_path is not None: with open(times_path, 'w') as profiler_log: profiler_log.write("Self CPU time total: {}".format(format_time(sum_self_cpu_time_total))) profiler_log.write("Self CUDA time total: {}".format(format_time(sum_self_cuda_time_total)))
def traces_to_display(traces, trace_events, show_events=False, paths=None): """Construct human readable output of the profiler traces and events. """ tree = OrderedDict() for trace in traces: [path, leaf, module] = trace current_tree = tree # unwrap all of the events, in case model is called multiple times events = [te for tevents in trace_events[path] for te in tevents] for depth, name in enumerate(path, 1): if name not in current_tree: current_tree[name] = OrderedDict() if depth == len(path) and ((paths is None and leaf) or (paths is not None and path in paths)): # tree measurements have key None, avoiding name conflict if show_events: for event_name, event_group in groupby( events, lambda e: e.name): event_group = list(event_group) current_tree[name][event_name] = { None: Measure( sum([ e.self_cpu_time_total for e in event_group ]), sum([e.cpu_time_total for e in event_group]), sum([e.cuda_time_total for e in event_group]), len(event_group)) } else: current_tree[name][None] = Measure( sum([e.self_cpu_time_total for e in events]), sum([e.cpu_time_total for e in events]), sum([e.cuda_time_total for e in events]), len(trace_events[path])) current_tree = current_tree[name] tree_lines = flatten_tree(tree) # dt = ('|', '|-- ', '+-- ', ' ') # ascii dt = ("\u2502", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 ", " " ) # ascii-ex format_lines = [] for idx, tree_line in enumerate(tree_lines): depth, name, measures = tree_line self_cpu_time = "" cpu_time = "" cuda_time = "" occurrences = "" if measures: self_cpu_time = tprofiler.format_time(measures.self_cpu_total) cpu_time = tprofiler.format_time(measures.cpu_total) cuda_time = tprofiler.format_time(measures.cuda_total) occurrences = str(measures.occurrences) pre = "" next_depths = [pl[0] for pl in tree_lines[idx + 1:]] current = True while depth: if current: if depth in next_depths and next_depths[0] >= depth: pre = dt[1] else: pre = dt[2] else: if depth in next_depths: pre = dt[0] + pre else: pre = dt[3] + pre depth -= 1 current = False format_lines.append( [pre + name, self_cpu_time, cpu_time, cuda_time, occurrences]) # construct the table heading = ("Module", "Self CPU total", "CPU total", "CUDA total", "Occurrences") max_lens = [max(map(len, col)) for col in zip(*([heading] + format_lines))] # create the heading disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | " disp += "{:>{}s}".format(heading[1], max_lens[1]) + " | " disp += "{:>{}s}".format(heading[2], max_lens[2]) + " | " disp += "{:>{}s}".format(heading[3], max_lens[3]) + " | " disp += "{:>{}s}".format(heading[4], max_lens[4]) + "\n" disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n" for line in format_lines: label, self_cpu_time, cpu_time, cuda_time, occurrences = line disp += "{:<{}s}".format(label, max_lens[0]) + " | " disp += "{:>{}s}".format(self_cpu_time, max_lens[1]) + " | " disp += "{:>{}s}".format(cpu_time, max_lens[2]) + " | " disp += "{:>{}s}".format(cuda_time, max_lens[3]) + " | " disp += "{:>{}s}".format(occurrences, max_lens[4]) + "\n" return disp
def display(self): if not self.exited: return "<unfinished torchprof.profile>" tree = OrderedDict() for trace in self.traces: [path, leaf, module] = trace current_tree = tree for depth, name in enumerate(path, 1): if name not in current_tree: current_tree[name] = OrderedDict() if depth == len(path) and leaf: # tree measurements have key None, avoiding name conflict if hasattr(module, 'tic'): current_tree[name][None] = module.tic current_tree = current_tree[name] tree_lines = flatten_tree(tree) tic = None for idx in reversed(range(len(tree_lines))): depth, name, measures = tree_lines[idx] if isinstance(measures, float): if tic == None: tic = measures tree_lines[idx][2] = "" else: tree_lines[idx][2] = tprofiler.format_time( ((tree_lines[idx][2] - tic) * 1e6)) tic = measures else: tree_lines[idx][2] = "" # dt = ('|', '|-- ', '+-- ', ' ') # ascii dt = ("\u2502", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 ", " " ) # ascii-ex format_lines = [] for idx, tree_line in enumerate(tree_lines): depth, name, measures = tree_line pre = "" next_depths = [pl[0] for pl in tree_lines[idx + 1:]] current = True while depth: if current: if depth in next_depths and next_depths[0] >= depth: pre = dt[1] else: pre = dt[2] else: if depth in next_depths: pre = dt[0] + pre else: pre = dt[3] + pre depth -= 1 current = False format_lines.append([pre + name, measures]) # construct the table heading = ("Module", "Time Total") max_lens = [ max(map(len, col)) for col in zip(*([heading] + format_lines)) ] # create the heading disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | " disp += "{:>{}s}".format(heading[1], max_lens[1]) + "\n" disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n" for line in format_lines: label, time_total = line disp += "{:<{}s}".format(label, max_lens[0]) + " | " disp += "{:>{}s}".format(time_total, max_lens[1]) + "\n" return disp