Exemple #1
0
def _format_measure_tuple(measure):
    format_memory = getattr(torch_profiler, "format_memory", lambda _: "N/A")

    self_cpu_total = (
        torch_profiler.format_time(measure.self_cpu_total) if measure else ""
    )
    cpu_total = torch_profiler.format_time(measure.cpu_total) if measure else ""
    self_cuda_total = (
        torch_profiler.format_time(measure.self_cuda_total)
        if measure and measure.self_cuda_total is not None
        else ""
    )
    cuda_total = torch_profiler.format_time(measure.cuda_total) if measure else ""
    self_cpu_memory = (
        format_memory(measure.self_cpu_memory)
        if measure and measure.self_cpu_memory is not None
        else ""
    )
    cpu_memory = (
        format_memory(measure.cpu_memory)
        if measure and measure.cpu_memory is not None
        else ""
    )
    self_cuda_memory = (
        format_memory(measure.self_cuda_memory)
        if measure and measure.self_cuda_memory is not None
        else ""
    )
    cuda_memory = (
        format_memory(measure.cuda_memory)
        if measure and measure.cuda_memory is not None
        else ""
    )
    occurrences = str(measure.occurrences) if measure else ""

    return Measure(
        self_cpu_total=self_cpu_total,
        cpu_total=cpu_total,
        self_cuda_total=self_cuda_total,
        cuda_total=cuda_total,
        self_cpu_memory=self_cpu_memory,
        cpu_memory=cpu_memory,
        self_cuda_memory=self_cuda_memory,
        cuda_memory=cuda_memory,
        occurrences=occurrences,
    )
Exemple #2
0
def traces_to_display(traces,
                      trace_events,
                      want_op_file,
                      show_events=False,
                      paths=None):
    """Construct human readable output of the profiler traces and events.
    """
    tree = OrderedDict()

    for trace in traces:
        [path, leaf, module] = trace
        current_tree = tree
        # unwrap all of the events, in case model is called multiple times
        events = [te for tevents in trace_events[path] for te in tevents]
        for depth, name in enumerate(path, 1):
            if name not in current_tree:
                current_tree[name] = OrderedDict()
            if depth == len(path) and ((paths is None and leaf) or
                                       (paths is not None and path in paths)):
                # tree measurements have key None, avoiding name conflict
                if show_events:
                    for event in events:
                        current_tree[name][event.name] = {
                            None:
                            Measure(
                                event.self_cpu_time_total,
                                event.cpu_time_total,
                                event.cuda_time_total,
                            )
                        }
                else:
                    current_tree[name][None] = Measure(
                        sum([e.self_cpu_time_total for e in events]),
                        sum([e.cpu_time_total for e in events]),
                        sum([e.cuda_time_total for e in events]),
                    )
            current_tree = current_tree[name]
    tree_lines = flatten_tree(tree)

    # dt = ('|', '|-- ', '+-- ', ' ') # ascii
    dt = ("\u2502", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 ", " "
          )  # ascii-ex
    format_lines = []
    for idx, tree_line in enumerate(tree_lines):
        depth, name, measures = tree_line
        self_cpu_time = ""
        cpu_time = ""
        cuda_time = ""
        if measures:
            self_cpu_time = tprofiler.format_time(measures.self_cpu_total)
            cpu_time = tprofiler.format_time(measures.cpu_total)
            cuda_time = tprofiler.format_time(measures.cuda_total)
        pre = ""
        next_depths = [pl[0] for pl in tree_lines[idx + 1:]]
        current = True
        while depth:
            if current:
                if depth in next_depths and next_depths[0] >= depth:
                    pre = dt[1]
                else:
                    pre = dt[2]
            else:
                if depth in next_depths:
                    pre = dt[0] + pre
                else:
                    pre = dt[3] + pre
            depth -= 1
            current = False
        if self_cpu_time == "":
            continue
        format_lines.append([name, self_cpu_time, cpu_time, cuda_time])

    # construct the table
    mynn = {
        "Layer Name": [],
        "Self CPU total": [],
        "CPU total": [],
        "GPU total": []
    }

    #heading = ("Module", "Self CPU totacl", "CPU total", "CUDA total")
    #max_lens = [max(map(len, col)) for col in zip(*([heading] + format_lines))]
    # create the heading
    #     disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | "
    #     disp += "{:>{}s}".format(heading[1], max_lens[1]) + " | "
    #     disp += "{:>{}s}".format(heading[2], max_lens[2]) + " | "
    #     disp += "{:>{}s}".format(heading[3], max_lens[3]) + "\n"
    #     disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n"
    for line in format_lines:
        label, self_cpu_time, cpu_time, cuda_time = line
        mynn["Layer Name"].append(str(label))
        mynn["Self CPU total"].append(str(self_cpu_time))
        mynn["CPU total"].append(str(cpu_time))
        mynn["GPU total"].append(str(cuda_time))

    df = DataFrame(
        mynn,
        columns=["Layer Name", "Self CPU total", "CPU total", "GPU total"])

    if want_op_file == True:
        export_csv = df.to_csv(r'output_file.csv', index=None, header=True)
    else:
        print(df)
Exemple #3
0
def save_events_table(
        events,
        path,
        times_path=None,
        sort_by=None,
        header=None,
        row_limit=100,
        max_src_column_width=75,
        with_flops=True,
        profile_memory=False,
        top_level_events_only=False,
        only_save_root_call=False):
    """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
    if len(events) == 0:
        return ""

    has_cuda_time = any([event.self_cuda_time_total > 0 for event in events])
    has_cuda_mem = any([event.self_cuda_memory_usage > 0 for event in events])
    has_input_shapes = any(
        [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events])

    if sort_by is not None:
        events = EventList(sorted(
            events, key=lambda evt: getattr(evt, sort_by), reverse=True
        ), use_cuda=has_cuda_time, profile_memory=profile_memory, with_flops=with_flops)

    stacks = []
    for evt in events:
        if evt.stack is not None and len(evt.stack) > 0:
            stacks.append(evt.stack)
    has_stack = len(stacks) > 0
    print(f"Has stack: {has_stack}")

    #time columns in microseconds, memory columns in bytes

    headers = [
        'Name',
        'Self CPU %',
        'Self CPU',
        'CPU total %',
        'CPU total',
        'CPU time avg',
    ]
    if has_cuda_time:
        headers.extend([
            'Self CUDA',
            'Self CUDA %',
            'CUDA total',
            'CUDA time avg',
        ])
    if profile_memory:
        headers.extend([
            'CPU Mem',
            'Self CPU Mem',
        ])
        if has_cuda_mem:
            headers.extend([
                'CUDA Mem',
                'Self CUDA Mem',
            ])
    headers.append(
        '# of Calls'
    )
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any([evt.node_id != -1 for evt in events])
    if append_node_id:
        headers.append('Node ID')

    # Have to use a list because nonlocal is Py3 only...
    MAX_STACK_ENTRY = 5

    def auto_scale_flops(flops):
        flop_headers = [
            'FLOPS',
            'KFLOPS',
            'MFLOPS',
            'GFLOPS',
            'TFLOPS',
            'PFLOPS',
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])


    with open(path, 'w') as profiler_csv:
        writer = csv.writer(profiler_csv)

        '''
        if with_flops:
            # Auto-scaling of flops header
            US_IN_SECOND = 1000.0 * 1000.0  # cpu_time_total is in us
            raw_flops = []
            for evt in events:
                if evt.flops > 0:
                    if evt.cuda_time_total != 0:
                        evt.flops = float(evt.flops) / evt.cuda_time_total * US_IN_SECOND
                    else:
                        evt.flops = float(evt.flops) / evt.cpu_time_total * US_IN_SECOND
                    raw_flops.append(evt.flops)
            if len(raw_flops) != 0:
                (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
                header = headers + flops_header
            else:
                with_flops = False  # can't find any valid flops'''
        if with_flops:
            flop_header = ["FLOP"]
            headers = headers + flop_header
        
        if has_stack:
            headers = headers + ['Source Location']
        headers = headers + ['DeviceType']
        writer.writerow(headers)


        # Have to use a list because nonlocal is Py3 only...
        result = []

        sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events])
        sum_self_cuda_time_total = 0
        for evt in events:
            if evt.device_type == DeviceType.CPU:
                # in legacy profiler, kernel info is stored in cpu events
                if evt.is_legacy:
                    sum_self_cuda_time_total += evt.self_cuda_time_total
            elif evt.device_type == DeviceType.CUDA:
                # in kineto profiler, there're events with the correct device type (e.g. CUDA)
                sum_self_cuda_time_total += evt.self_cuda_time_total

        # Actual printing
        event_limit = 0
        for evt in events:
            if event_limit == row_limit:
                break
            if top_level_events_only and evt.cpu_parent is not None:
                continue
            else:
                event_limit += 1
            name = evt.key

            '''
            row_values = [
                name,
                # Self CPU total %, 0 for async events.
                format_time_share(evt.self_cpu_time_total,
                                sum_self_cpu_time_total),
                evt.self_cpu_time_total,  # Self CPU total
                # CPU total %, 0 for async events.
                format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0,
                evt.cpu_time_total,  # CPU total
                evt.cpu_time,  # CPU time avg
            ]'''
            row_values = [
                name,
                # Self CPU total %, 0 for async events.
                (evt.self_cpu_time_total/sum_self_cpu_time_total),
                evt.self_cpu_time_total,  # Self CPU total
                # CPU total %, 0 for async events.
                (evt.cpu_time_total / sum_self_cpu_time_total) if not evt.is_async else 0,
                evt.cpu_time_total,  # CPU total
                evt.cpu_time,  # CPU time avg
            ]
            if has_cuda_time:
                row_values.extend([
                    evt.self_cuda_time_total,
                    # CUDA time total %
                    (evt.self_cuda_time_total/sum_self_cuda_time_total),
                    evt.cuda_time_total,
                    evt.cuda_time,  # Cuda time avg
                ])
            if profile_memory:
                row_values.extend([
                    # CPU Mem Total
                    evt.cpu_memory_usage,
                    # Self CPU Mem Total
                    evt.self_cpu_memory_usage,
                ])
                if has_cuda_mem:
                    row_values.extend([
                        # CUDA Mem Total
                        evt.cuda_memory_usage,
                        # Self CUDA Mem Total
                        evt.self_cuda_memory_usage,
                    ])
            row_values.append(
                evt.count,  # Number of calls
            )

            if append_node_id:
                row_values.append(evt.node_id)
            if has_input_shapes:
                row_values.append(str(evt.input_shapes))
            
            if with_flops:
                row_values.append(evt.flops)
            '''if with_flops:
                if evt.flops <= 0.0:
                    row_values.append("--")
                else:
                    row_values.append('{0:8.3f}'.format(evt.flops * flops_scale))'''
            if has_stack:
                if (len(evt.stack) > 0) and only_save_root_call:
                    src_field = evt.stack[0]
                elif not only_save_root_call:
                    src_field = ",".join(evt.stack)
                row_values.append(src_field)

            row_values.append(evt.device_type)

            writer.writerow(row_values)

    if times_path is not None:
        with open(times_path, 'w') as profiler_log:
            profiler_log.write("Self CPU time total: {}".format(format_time(sum_self_cpu_time_total)))
            profiler_log.write("Self CUDA time total: {}".format(format_time(sum_self_cuda_time_total)))
Exemple #4
0
def traces_to_display(traces, trace_events, show_events=False, paths=None):
    """Construct human readable output of the profiler traces and events.
    """
    tree = OrderedDict()

    for trace in traces:
        [path, leaf, module] = trace
        current_tree = tree
        # unwrap all of the events, in case model is called multiple times
        events = [te for tevents in trace_events[path] for te in tevents]
        for depth, name in enumerate(path, 1):
            if name not in current_tree:
                current_tree[name] = OrderedDict()
            if depth == len(path) and ((paths is None and leaf) or
                                       (paths is not None and path in paths)):
                # tree measurements have key None, avoiding name conflict
                if show_events:
                    for event_name, event_group in groupby(
                            events, lambda e: e.name):
                        event_group = list(event_group)
                        current_tree[name][event_name] = {
                            None:
                            Measure(
                                sum([
                                    e.self_cpu_time_total for e in event_group
                                ]),
                                sum([e.cpu_time_total for e in event_group]),
                                sum([e.cuda_time_total for e in event_group]),
                                len(event_group))
                        }
                else:
                    current_tree[name][None] = Measure(
                        sum([e.self_cpu_time_total for e in events]),
                        sum([e.cpu_time_total for e in events]),
                        sum([e.cuda_time_total for e in events]),
                        len(trace_events[path]))
            current_tree = current_tree[name]
    tree_lines = flatten_tree(tree)

    # dt = ('|', '|-- ', '+-- ', ' ') # ascii
    dt = ("\u2502", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 ", " "
          )  # ascii-ex
    format_lines = []
    for idx, tree_line in enumerate(tree_lines):
        depth, name, measures = tree_line
        self_cpu_time = ""
        cpu_time = ""
        cuda_time = ""
        occurrences = ""
        if measures:
            self_cpu_time = tprofiler.format_time(measures.self_cpu_total)
            cpu_time = tprofiler.format_time(measures.cpu_total)
            cuda_time = tprofiler.format_time(measures.cuda_total)
            occurrences = str(measures.occurrences)
        pre = ""
        next_depths = [pl[0] for pl in tree_lines[idx + 1:]]
        current = True
        while depth:
            if current:
                if depth in next_depths and next_depths[0] >= depth:
                    pre = dt[1]
                else:
                    pre = dt[2]
            else:
                if depth in next_depths:
                    pre = dt[0] + pre
                else:
                    pre = dt[3] + pre
            depth -= 1
            current = False
        format_lines.append(
            [pre + name, self_cpu_time, cpu_time, cuda_time, occurrences])

    # construct the table
    heading = ("Module", "Self CPU total", "CPU total", "CUDA total",
               "Occurrences")
    max_lens = [max(map(len, col)) for col in zip(*([heading] + format_lines))]
    # create the heading
    disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | "
    disp += "{:>{}s}".format(heading[1], max_lens[1]) + " | "
    disp += "{:>{}s}".format(heading[2], max_lens[2]) + " | "
    disp += "{:>{}s}".format(heading[3], max_lens[3]) + " | "
    disp += "{:>{}s}".format(heading[4], max_lens[4]) + "\n"
    disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n"
    for line in format_lines:
        label, self_cpu_time, cpu_time, cuda_time, occurrences = line
        disp += "{:<{}s}".format(label, max_lens[0]) + " | "
        disp += "{:>{}s}".format(self_cpu_time, max_lens[1]) + " | "
        disp += "{:>{}s}".format(cpu_time, max_lens[2]) + " | "
        disp += "{:>{}s}".format(cuda_time, max_lens[3]) + " | "
        disp += "{:>{}s}".format(occurrences, max_lens[4]) + "\n"

    return disp
Exemple #5
0
    def display(self):
        if not self.exited:
            return "<unfinished torchprof.profile>"

        tree = OrderedDict()

        for trace in self.traces:
            [path, leaf, module] = trace
            current_tree = tree
            for depth, name in enumerate(path, 1):
                if name not in current_tree:
                    current_tree[name] = OrderedDict()
                if depth == len(path) and leaf:
                    # tree measurements have key None, avoiding name conflict
                    if hasattr(module, 'tic'):
                        current_tree[name][None] = module.tic
                current_tree = current_tree[name]
        tree_lines = flatten_tree(tree)

        tic = None
        for idx in reversed(range(len(tree_lines))):
            depth, name, measures = tree_lines[idx]
            if isinstance(measures, float):
                if tic == None:
                    tic = measures
                    tree_lines[idx][2] = ""
                else:
                    tree_lines[idx][2] = tprofiler.format_time(
                        ((tree_lines[idx][2] - tic) * 1e6))
                    tic = measures
            else:
                tree_lines[idx][2] = ""

        # dt = ('|', '|-- ', '+-- ', ' ') # ascii
        dt = ("\u2502", "\u251c\u2500\u2500 ", "\u2514\u2500\u2500 ", " "
              )  # ascii-ex
        format_lines = []
        for idx, tree_line in enumerate(tree_lines):
            depth, name, measures = tree_line
            pre = ""
            next_depths = [pl[0] for pl in tree_lines[idx + 1:]]
            current = True
            while depth:
                if current:
                    if depth in next_depths and next_depths[0] >= depth:
                        pre = dt[1]
                    else:
                        pre = dt[2]
                else:
                    if depth in next_depths:
                        pre = dt[0] + pre
                    else:
                        pre = dt[3] + pre
                depth -= 1
                current = False
            format_lines.append([pre + name, measures])

        # construct the table
        heading = ("Module", "Time Total")
        max_lens = [
            max(map(len, col)) for col in zip(*([heading] + format_lines))
        ]
        # create the heading
        disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | "
        disp += "{:>{}s}".format(heading[1], max_lens[1]) + "\n"
        disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n"
        for line in format_lines:
            label, time_total = line
            disp += "{:<{}s}".format(label, max_lens[0]) + " | "
            disp += "{:>{}s}".format(time_total, max_lens[1]) + "\n"

        return disp