Example #1
    def _parse_kineto_results(self, result):
        # result.events() has most of the events - PyTorch op-level and device-level events

        trace_start_us = result.trace_start_us()
        mem_records = [[evt, False] for evt in result.events()
                       if evt.name() == MEMORY_EVENT_NAME]
        mem_records_acc = MemRecordsAcc(mem_records)

        def _cpu_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \
                else 0

        def _cuda_memory_usage(mem_record):
            return mem_record.nbytes() if \
                mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \
                else 0

        # Create and return FunctionEvent list
        function_events = []
        cuda_corr_map: Dict[int, List[FunctionEvent]] = {}
        max_evt_id = 0
        for kineto_event in result.events():
            if _filter_name(kineto_event.name()):
                continue
            rel_start_us = kineto_event.start_us() - trace_start_us
            rel_end_us = rel_start_us + kineto_event.duration_us()
            abs_end_us = kineto_event.start_us() + kineto_event.duration_us()

            cpu_memory_usage = 0
            cuda_memory_usage = 0
            if kineto_event.device_type() == DeviceType.CPU:
                # find the corresponding memory allocation events
                for mem_record in mem_records_acc.in_interval(
                        kineto_event.start_us(), abs_end_us):
                    cpu_memory_usage += _cpu_memory_usage(mem_record[0])
                    cuda_memory_usage += _cuda_memory_usage(mem_record[0])
                    mem_record[1] = True

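            # treat an op as asynchronous if it was flagged async or if its start
            # and end were recorded on different threads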
            is_async = kineto_event.is_async() or (
                kineto_event.start_thread_id() != kineto_event.end_thread_id())

            fe = FunctionEvent(
                id=kineto_event.correlation_id(),
                name=_rewrite_name(name=kineto_event.name(),
                                   with_wildcard=True),
                trace_name=_rewrite_name(name=kineto_event.name(),
                                         with_wildcard=False),
                thread=kineto_event.start_thread_id(),
                start_us=rel_start_us,
                end_us=rel_end_us,
                fwd_thread=kineto_event.fwd_thread_id(),
                input_shapes=kineto_event.shapes(),
                stack=[
                    entry for entry in kineto_event.stack()
                    if _filter_stack_entry(entry)
                ],
                scope=kineto_event.scope(),
                cpu_memory_usage=cpu_memory_usage,
                cuda_memory_usage=cuda_memory_usage,
                is_async=is_async,
                sequence_nr=kineto_event.sequence_nr(),
                device_type=kineto_event.device_type(),
                device_index=kineto_event.device_index(),
                flops=kineto_event.flops(),
            )
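            # keep track of the largest event id seen so the synthetic memory events
            # created below can be assigned fresh, non-conflicting ids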
            max_evt_id = max(max_evt_id, fe.id)
            if fe.device_type == DeviceType.CPU and not fe.is_async:
                # Check if we have CUDA time as a fallback
                cuda_time = kineto_event.cuda_elapsed_us()
                if cuda_time > 0:
                    fe.append_kernel(fe.name, fe.device_index, cuda_time)
                    fe.is_legacy = True
            function_events.append(fe)
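            # group events by their linked correlation id so that CUDA kernels and
            # CUDA runtime calls can later be attached to the CPU op that launched them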
            corr_id = kineto_event.linked_correlation_id()
            if corr_id > 0:
                cuda_corr_map.setdefault(corr_id, []).append(fe)

        # associate CUDA kernels and CUDA runtime (CPU) with CPU events
        for fe in function_events:
            if (fe.device_type == DeviceType.CPU and not fe.is_async
                    and fe.id in cuda_corr_map):
                for f_evt in cuda_corr_map[fe.id]:
                    if f_evt.device_type == DeviceType.CUDA:
                        fe.append_kernel(
                            f_evt.name, f_evt.device_index,
                            f_evt.time_range.end - f_evt.time_range.start)
                    elif f_evt.device_type == DeviceType.CPU:
                        # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated
                        # with the 'thread' of the corresponding linked PyTorch event to properly track
                        # parents and children
                        f_evt.thread = fe.thread

        # output top-level memory events
        for mem_record in mem_records:
            if not mem_record[1]:
                rel_start_us = mem_record[0].start_us() - trace_start_us
                max_evt_id += 1
                fe = FunctionEvent(
                    id=max_evt_id,
                    name=MEMORY_EVENT_NAME,
                    trace_name=None,  # not outputting in the trace
                    thread=mem_record[0].start_thread_id(),
                    start_us=rel_start_us,
                    end_us=rel_start_us,  # no duration
                    fwd_thread=mem_record[0].start_thread_id(),
                    input_shapes=[],
                    stack=[],
                    scope=0,  # RecordScope::FUNCTION
                    cpu_memory_usage=_cpu_memory_usage(mem_record[0]),
                    cuda_memory_usage=_cuda_memory_usage(mem_record[0]),
                    is_async=False,
                    sequence_nr=-1,
                    device_type=DeviceType.CPU,
                    device_index=0,
                )
                function_events.append(fe)

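        # sort by start time, then by end time descending, so that nested events
        # sharing a start timestamp list the outermost call first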
        function_events.sort(
            key=lambda evt: [evt.time_range.start, -evt.time_range.end])
        return function_events
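
The lookup above goes through MemRecordsAcc.in_interval to find the memory records that fall inside each CPU op's time window. As a rough illustration only (a hypothetical re-implementation, not necessarily the class shipped with PyTorch), such a helper could index the [event, seen-flag] pairs by start timestamp and answer interval queries with bisect:

import bisect

class MemRecordsAccSketch:
    """Hypothetical interval index over [event, seen_flag] pairs (sketch only)."""

    def __init__(self, mem_records):
        self._mem_records = mem_records
        self._start_uses = []
        self._indices = []
        if mem_records:
            # order record indices by each record's start timestamp
            ordered = sorted((r[0].start_us(), i) for i, r in enumerate(mem_records))
            self._start_uses, self._indices = zip(*ordered)

    def in_interval(self, start_us, end_us):
        # yield every record whose start timestamp lies within [start_us, end_us]
        lo = bisect.bisect_left(self._start_uses, start_us)
        hi = bisect.bisect_right(self._start_uses, end_us)
        for i in range(lo, hi):
            yield self._mem_records[self._indices[i]]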
Example #2
def _parse_legacy_records(thread_records):
    def _get_record_key(record):
        """
        Returns a tuple to be used by _parse_legacy_records for correlating start and
        end records.
        """
        return (record.handle(), record.node_id())

    start_record = None
    functions = []

    # '__start_profile' is not guaranteed to be first, so we must find it here
    for record in itertools.chain(*thread_records):
        name = record.name()
        if start_record is None and name == '__start_profile':
            start_record = record

    assert start_record is not None and not start_record.is_remote()

    for thread_record_list in thread_records:
        # accumulated memory allocations per handle
        cpu_memory_allocs = {}
        cuda_memory_allocs = {}
        # ranges per handle
        range_starts = {}

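        # handles skipped by the name filter or by duplicate-push suppression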
        filtered_handles = set()
        prev_record = None
        for record in thread_record_list:
            record_key = _get_record_key(record)
            if (_filter_name(record.name()) or record_key in filtered_handles):
                filtered_handles.add(record_key)
                continue

            if record.kind() == 'push':
                # workaround to reduce double logging from operator
                # wrappers and redispatch
                if prev_record is not None:
                    duplicate = (prev_record.name() == record.name()
                                 and prev_record.kind() == record.kind()
                                 and prev_record.node_id() == record.node_id())
                    if duplicate:
                        filtered_handles.add(record_key)
                        continue

                range_starts[record_key] = record
                cpu_memory_allocs[record_key] = 0
                cuda_memory_allocs[record_key] = 0
            elif record.kind() == 'pop':
                assert (
                    record_key in range_starts
                ), """Expected record with key {} to exist in range_starts.
                    This means that the pop event did not have a corresponding push.""".format(
                    record_key)

                start = range_starts[record_key]

                cpu_memory_usage = cpu_memory_allocs[record_key]
                cuda_memory_usage = cuda_memory_allocs[record_key]
                is_async = start.is_async() or (start.thread_id() !=
                                                record.thread_id())
                is_remote_event = record.is_remote()
                start_flops = start.flops()

                fe = FunctionEvent(
                    id=record.handle(),
                    node_id=record.node_id(),
                    name=_rewrite_name(name=start.name(), with_wildcard=True),
                    trace_name=_rewrite_name(name=start.name(),
                                             with_wildcard=False),
                    thread=start.thread_id(),
                    start_us=start_record.cpu_elapsed_us(start),
                    end_us=start_record.cpu_elapsed_us(record),
                    fwd_thread=start.fwd_thread_id(),
                    input_shapes=start.shapes(),
                    stack=[
                        entry for entry in start.stack()
                        if _filter_stack_entry(entry)
                    ],
                    scope=start.scope(),
                    cpu_memory_usage=cpu_memory_usage,
                    cuda_memory_usage=cuda_memory_usage,
                    is_async=is_async,
                    is_remote=is_remote_event,
                    sequence_nr=start.sequence_nr(),
                    device_type=DeviceType.CPU,
                    is_legacy=True,
                    flops=start_flops,
                )
                # note: async events have only cpu total time
                if not is_async and start.has_cuda():
                    duration = start.cuda_elapsed_us(record)
                    if duration > 0:
                        fe.append_kernel(start.name(), start.device(),
                                         duration)
                functions.append(fe)
                del range_starts[record_key]
                del cpu_memory_allocs[record_key]
                del cuda_memory_allocs[record_key]
            elif record.kind() == 'memory_alloc':
                num_open_handles_cpu = len(cpu_memory_allocs)
                num_open_handles_cuda = len(cuda_memory_allocs)
                assert num_open_handles_cpu == num_open_handles_cuda
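                # attribute this allocation to every op range currently open on this thread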
                for handle in cpu_memory_allocs.keys():
                    cpu_memory_allocs[handle] += record.cpu_memory_usage()
                for handle in cuda_memory_allocs.keys():
                    cuda_memory_allocs[handle] += record.cuda_memory_usage()
                if num_open_handles_cpu == 0:
                    # output event as a top-level memory event
                    fe = FunctionEvent(
                        id=0,
                        name=MEMORY_EVENT_NAME,
                        trace_name=None,
                        thread=0,
                        start_us=0,
                        end_us=0,
                        stack=[],
                        cpu_memory_usage=record.cpu_memory_usage(),
                        cuda_memory_usage=record.cuda_memory_usage(),
                        is_legacy=True,
                    )
                    functions.append(fe)
            prev_record = record

    # Sort functions by start time, then by end time descending.
    # This ensures that--in the case of nested events which
    # have the same start time (which may happen due to the
    # granularity of the given clock tick)--we always show
    # the outermost nested call first. This adds stability
    # in how FunctionEvents appear
    functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
    return functions
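
Both parsers finish with the same ordering rule: sort by start time, then by negated end time. A tiny self-contained check (using a namedtuple as a stand-in for FunctionEvent.time_range; the names here are illustrative only) shows why this places the outermost of two same-start nested events first:

from collections import namedtuple

Range = namedtuple("Range", ["start", "end"])  # stand-in for FunctionEvent.time_range

events = [
    Range(start=10, end=12),   # inner call
    Range(start=10, end=20),   # outer call: same start, later end
    Range(start=5, end=30),
]
events.sort(key=lambda r: [r.start, -r.end])
print(events)
# [Range(start=5, end=30), Range(start=10, end=20), Range(start=10, end=12)]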