Code example #1
 def get_first_timestamp(self):
     while self.first_timestamp == 0:
         if self.path.startswith("s3"):
             self.system_metrics_reader = S3SystemMetricsReader(self.path)
             self.framework_metrics_reader = S3AlgorithmMetricsReader(
                 self.path)
         else:
             self.system_metrics_reader = LocalSystemMetricsReader(
                 self.path)
             self.framework_metrics_reader = LocalAlgorithmMetricsReader(
                 self.path)
         if self.system_metrics_reader.get_timestamp_of_first_available_file() != 0:
             self.first_timestamp = (
                 self.system_metrics_reader.get_timestamp_of_first_available_file())
         if self.framework_metrics_reader.get_timestamp_of_first_available_file() != 0:
             if (self.framework_metrics_reader.get_timestamp_of_first_available_file()
                     < self.first_timestamp):
                 self.first_timestamp = (
                     self.framework_metrics_reader.get_timestamp_of_first_available_file())
         # poll for profiler data every 10 seconds
         self.logger.info("Waiting for profiler data.")
         time.sleep(10)
Code example #2
    def __init__(self,
                 path,
                 use_in_memory_cache=False,
                 scan_interval=5000000000):

        self.path = path
        self.step_time_mapping = dict()

        # Reader for system and framework metrics
        if path.startswith("s3"):
            self.system_metrics_reader = S3SystemMetricsReader(self.path)
            self.framework_metrics_reader = S3AlgorithmMetricsReader(
                self.path, use_in_memory_cache=use_in_memory_cache)
        else:
            self.system_metrics_reader = LocalSystemMetricsReader(self.path)
            self.framework_metrics_reader = LocalAlgorithmMetricsReader(
                self.path, use_in_memory_cache=use_in_memory_cache)

        # lists to store metrics
        self.system_metrics = []
        self.framework_metrics = []

        # we read data in chunks of scan_interval microseconds
        self.interval = scan_interval
        self.start_time = (
            self.system_metrics_reader.get_timestamp_of_first_available_file())
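
For reference, the time intervals in this class are handled in microseconds, so the default scan_interval=5000000000 corresponds to chunks of 5,000 seconds (roughly 83 minutes). A quick sanity check in plain Python:

scan_interval_us = 5_000_000_000               # default scan_interval from the constructor above
chunk_seconds = scan_interval_us / 1_000_000   # microseconds -> seconds
print(chunk_seconds, chunk_seconds / 60)       # 5000.0 seconds, ~83.3 minutes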
Code example #3
def test_LocalMetricsReader(
        use_in_memory_cache,
        tracefolder="./tests/profiler/resources/test_traces"):
    lt = LocalAlgorithmMetricsReader(tracefolder,
                                     use_in_memory_cache=use_in_memory_cache)
    events = lt.get_events(1589930980, 1589930995, unit=TimeUnits.SECONDS)
    print(f"Number of events {len(events)}")
    assert len(events) == 4
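
Tests like this one receive use_in_memory_cache as a pytest parameter. A minimal sketch of how such a parameter could be supplied (an assumption; the suite's actual conftest is not shown here):

import pytest

# Hypothetical fixture: run each test once with the in-memory cache on and once off.
@pytest.fixture(params=[True, False])
def use_in_memory_cache(request):
    return request.param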
Code example #4
def test_LocalMetricsReader_Horovod_timeline(
        use_in_memory_cache,
        tracefolder="./tests/profiler/resources/horovod_timeline_traces"):
    lt = LocalAlgorithmMetricsReader(tracefolder,
                                     use_in_memory_cache=use_in_memory_cache)
    events = lt.get_events(1593673051472800, 1593673051473100)

    print(f"Number of events {len(events)}")
    assert len(events) == 15
Code example #5
def test_LocalMetricsReader_Model_timeline(
        use_in_memory_cache,
        tracefolder="./tests/profiler/resources/model_timeline_traces"):
    lt = LocalAlgorithmMetricsReader(tracefolder,
                                     use_in_memory_cache=use_in_memory_cache)
    events = lt.get_events(1590461127873222, 1590461139949971)

    print(f"Number of events {len(events)}")
    assert len(events) == 54
    assert events[0].node_id == "0001"
Code example #6
def test_timeline_merge(out_dir, trace_location):
    if trace_location == "local":
        tracefolder = "./tests/profiler/resources/merge_traces"
        combined_timeline = MergedTimeline(tracefolder,
                                           output_directory=out_dir)
        start_time_us = 1594662618874598
        end_time_us = 1594662623418701
        combined_timeline.merge_timeline(start_time_us, end_time_us)
        algo_reader = LocalAlgorithmMetricsReader(tracefolder)
    elif trace_location == "s3":
        tracefolder = "s3://smdebug-testing/resources/tf2_detailed_profile/profiler-output"
        combined_timeline = MergedTimeline(tracefolder,
                                           output_directory=out_dir)
        start_time_us = 1596668400000000
        end_time_us = 1596668441010095
        combined_timeline.merge_timeline(start_time_us, end_time_us)
        algo_reader = S3AlgorithmMetricsReader(tracefolder)
    else:
        return

    files = []
    for path in Path(out_dir).rglob(f"*{MERGEDTIMELINE_SUFFIX}"):
        files.append(path)

    assert len(files) == 1

    with open(files[0], "r") as merged_file:
        event_list = json.load(merged_file)

    # check if the events are sorted by start time
    for i in range(len(event_list) - 1):
        if "ts" in event_list[i] and "ts" in event_list[i + 1]:
            assert event_list[i]["ts"] <= event_list[i + 1]["ts"]

    total_events = 0
    # check that the number of events matches the sum over the individual trace files
    for suffix in [
            PYTHONTIMELINE_SUFFIX,
            MODELTIMELINE_SUFFIX,
            TENSORBOARDTIMELINE_SUFFIX,
            HOROVODTIMELINE_SUFFIX,
    ]:
        events = algo_reader.get_events(start_time_us,
                                        end_time_us,
                                        file_suffix_filter=[suffix])
        total_events += len(events)

    t_events = SMProfilerEvents()
    t_events.read_events_from_file(str(files[0]))
    all_trace_events = t_events.get_all_events()
    num_trace_events = len(all_trace_events)

    assert total_events == num_trace_events
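
The out_dir and trace_location arguments are likewise pytest fixtures/parameters. A hedged sketch of what they might look like (hypothetical; the real fixtures live elsewhere in the test suite):

import pytest

@pytest.fixture
def out_dir(tmp_path):
    # write merged timelines into a per-test temporary directory
    return str(tmp_path)

@pytest.fixture(params=["local", "s3"])
def trace_location(request):
    return request.param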
Code example #7
def test_timeline_merge_by_step(out_dir, trace_location):
    start_step, end_step = 2, 4
    if trace_location == "local":
        tracefolder = "./tests/profiler/resources/merge_traces"
        combined_timeline = MergedTimeline(tracefolder,
                                           output_directory=out_dir)
        combined_timeline.merge_timeline(start_step,
                                         end_step,
                                         unit=MergeUnit.STEP)
        algo_reader = LocalAlgorithmMetricsReader(tracefolder)
    elif trace_location == "s3":
        tracefolder = "s3://smdebug-testing/resources/tf2_detailed_profile/profiler-output"
        combined_timeline = MergedTimeline(tracefolder,
                                           output_directory=out_dir)
        combined_timeline.merge_timeline(start_step,
                                         end_step,
                                         unit=MergeUnit.STEP)
        algo_reader = S3AlgorithmMetricsReader(tracefolder)
    else:
        return

    files = []
    for path in Path(out_dir).rglob(f"*{MERGEDTIMELINE_SUFFIX}"):
        files.append(path)

    assert len(files) == 1

    total_events = 0
    # check that the number of events matches the sum over the individual trace files
    for suffix in [
            PYTHONTIMELINE_SUFFIX,
            MODELTIMELINE_SUFFIX,
            TENSORBOARDTIMELINE_SUFFIX,
            HOROVODTIMELINE_SUFFIX,
    ]:
        events = algo_reader.get_events_by_step(start_step,
                                                end_step,
                                                file_suffix_filter=[suffix])
        total_events += len(events)

    t_events = SMProfilerEvents()
    t_events.read_events_from_file(str(files[0]))
    all_trace_events = t_events.get_all_events()
    num_trace_events = len(all_trace_events)

    assert total_events == num_trace_events
Code example #8
    def __init__(self, path, file_suffix_filter=None, output_directory=None):
        """
        :param path: trace root folder that contains framework and system folders
        :param file_suffix_filter: list of file suffix
            PYTHONTIMELINE_SUFFIX = "pythontimeline.json"
            MODELTIMELINE_SUFFIX = "model_timeline.json"
            TENSORBOARDTIMELINE_SUFFIX = "trace.json.gz"
            HOROVODTIMELINE_SUFFIX = "horovod_timeline.json"
            SMDATAPARALLELTIMELINE_SUFFIX = "smdataparallel_timeline.json".
            Default: None (all files will be merged)
        :param output_directory: Path where merged file should be saved
            Default: None (writes to the same location as the 'path' argument.
        """
        self.path = path
        self.out_dir = output_directory if output_directory is not None else self.path
        self.file_suffix_filter = file_suffix_filter

        self.bytes_written = 0

        # Reader for system and framework metrics
        if path.startswith("s3"):
            self.framework_metrics_reader = S3AlgorithmMetricsReader(self.path)
        else:
            self.framework_metrics_reader = LocalAlgorithmMetricsReader(
                self.path)

        self._writer = None
        self.tensor_table = collections.defaultdict(int)
        # eventphase_starting_ids contains the starting index for each of these
        # phases. The value is used as the pid in the timeline, which ensures the
        # produced timeline is ordered: Step, Forward, Backward, DataIterator,
        # Dataset, /device, /host, gpu/cpu functions, Horovod, others.
        # Note that there is room for 10000 event phases of each type.
        self.eventphase_starting_ids = {
            "Step:": 1,
            "Forward": 10000,
            "Backward": 20000,
            "DataIterator": 30000,
            "Dataset": 40000,
            "/device": 50000,
            "/host": 60000,
            "gpu_functions": 70000,
            "cpu_functions": 80000,
            "Horovod": 90000,
            "other": 1000000,
        }
Code example #9
def test_pytorch_profiler(pytorch_profiler_config_parser, out_dir):
    device = torch.device("cpu")
    model = Net().to(device)
    hook = Hook(out_dir=out_dir)
    hook.register_hook(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, device, optimizer, hook)
    hook.close()
    lt = LocalAlgorithmMetricsReader(out_dir)
    lt.refresh_event_file_list()
    events = lt.get_events(0, time.time() * 1000000)
    print(f"Number of events {len(events)}")
    if is_pt_1_5():
        assert len(events) == 386
    elif is_pt_1_6():
        assert len(events) == 672
    elif is_pt_1_7():
        assert 220 <= len(events)
Code example #10
def test_MetricsReader_TFProfiler_timeline(use_in_memory_cache,
                                           trace_location):
    if trace_location == "local":
        tracefolder = "./tests/profiler/resources/tfprofiler_timeline_traces"
        lt = LocalAlgorithmMetricsReader(
            tracefolder, use_in_memory_cache=use_in_memory_cache)
    elif trace_location == "s3":
        bucket_name = "s3://smdebug-testing/resources/tf2_detailed_profile/profiler-output"
        lt = S3AlgorithmMetricsReader(bucket_name,
                                      use_in_memory_cache=use_in_memory_cache)
    else:
        return
    events = lt.get_events(0, time.time() * CONVERT_TO_MICROSECS)

    print(f"Number of events {len(events)}")
    if trace_location == "local":
        assert len(events) == 798
    elif trace_location == "s3":
        assert len(events) >= 73000
Code example #11
class PandasFrame:
    def __init__(self,
                 path,
                 use_in_memory_cache=False,
                 scan_interval=5000000000):

        self.path = path
        self.step_time_mapping = dict()

        # Reader for system and framework metrics
        if path.startswith("s3"):
            self.system_metrics_reader = S3SystemMetricsReader(self.path)
            self.framework_metrics_reader = S3AlgorithmMetricsReader(
                self.path, use_in_memory_cache=use_in_memory_cache)
        else:
            self.system_metrics_reader = LocalSystemMetricsReader(self.path)
            self.framework_metrics_reader = LocalAlgorithmMetricsReader(
                self.path, use_in_memory_cache=use_in_memory_cache)

        # lists to store metrics
        self.system_metrics = []
        self.framework_metrics = []

        # we read data in chunks of scan_interval microseconds
        self.interval = scan_interval
        self.start_time = (
            self.system_metrics_reader.get_timestamp_of_first_available_file())

    def get_all_system_metrics(self, selected_system_metrics=[]):
        """
        Get system metrics
        :param systemk_metrics_list: list of system metrics.If not empty, function will only return framework events that are part of this list.
        :return: System metrics DataFrame
        """
        # get all system metrics from last to current timestamp

        start_timestamp = (
            self.system_metrics_reader.get_timestamp_of_first_available_file())
        end_timestamp = (
            self.system_metrics_reader.get_timestamp_of_latest_available_file()
            + self.interval)
        sys_events_df, _ = self.get_profiler_data_by_time(
            start_timestamp,
            end_timestamp,
            cache_metrics=False,
            selected_system_metrics=selected_system_metrics,
            get_framework_metrics=False,
        )

        return sys_events_df

    def get_all_framework_metrics(self, selected_framework_metrics=[]):
        """
        Get framework metrics
        :param selected_framework_metrics: list of framework metrics.If not empty, function will only return framework events that are part of this list.
        :return: Framework metrics DataFrame
        """
        # get all framework metrics from last to current timestamp
        self.framework_metrics_reader.refresh_event_file_list()

        start_timestamp = (
            self.system_metrics_reader.get_timestamp_of_first_available_file()
        )  # bug: get_events does not return the very first event
        end_timestamp = (
            self.framework_metrics_reader.get_timestamp_of_latest_available_file())

        _, fw_events_df = self.get_profiler_data_by_time(
            start_timestamp,
            end_timestamp,
            cache_metrics=False,
            selected_framework_metrics=selected_framework_metrics,
            get_system_metrics=False,
        )

        return fw_events_df

    def convert_datetime_to_timestamp(self, timestamp):
        """
        A helper function to convert datetime into timestamp
        :param timestep: timestamp in datetime
        :return: timestamp in microseconds
        """
        timestamp = pd.to_datetime(timestamp,
                                   format="%Y-%m-%dT%H:%M:%S:%f",
                                   utc=True)
        return convert_utc_datetime_to_microseconds(timestamp)
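    # Example (hypothetical input matching the "%Y-%m-%dT%H:%M:%S:%f" format above):
    #   self.convert_datetime_to_timestamp("2020-07-13T20:30:18:874598")
    #   -> the UTC epoch time in microseconds, as an integer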

    def get_framework_metrics_by_timesteps(self,
                                           timestep_list=[],
                                           selected_framework_metrics=[]):
        """
        Get framework metrics for a list of timeranges. This function is useful when we want to correlate framework metrics with system metrics. Framework metrics have a begin and end timestamp. System metrics have only a single timestamp.
        :param timestep_list: list of timestamps
        :param selected_framework_metrics: list of framework metrics which will be stored in the dataframe
        :return: Framework metrics DataFrame
        """
        # get min and max search range
        timestep_list = sorted(timestep_list)
        start_time_us = self.convert_datetime_to_timestamp(timestep_list[0])
        end_time_us = self.convert_datetime_to_timestamp(timestep_list[-1])

        # to avoid out of memory issues, we read data in chunks
        current_time_us = start_time_us
        if end_time_us - start_time_us > self.interval:
            current_time_us = start_time_us + self.interval
        else:
            current_time_us = end_time_us
        results = {}
        results_detailed = {}
        counter = 0

        while start_time_us < end_time_us:
            # get all framework metrics from last to current timestamp
            self.framework_metrics_reader.refresh_event_file_list()
            events = self.framework_metrics_reader.get_events(
                start_time_us, current_time_us)

            # iterate over system metrics timestamps and find overlap
            for index, timestamp in enumerate(timestep_list[counter:]):
                timestamp = self.convert_datetime_to_timestamp(timestamp)
                if timestamp >= current_time_us:
                    counter = index
                    break
                for event in events:
                    if len(selected_framework_metrics) > 0 and (
                            event.event_name not in selected_framework_metrics
                            and event.event_phase
                            not in selected_framework_metrics):
                        continue
                    if event.start_time < timestamp and event.end_time > timestamp:
                        if event.event_phase not in results:
                            results[event.event_phase] = 0
                        results[event.event_phase] += (
                            event.end_time - event.start_time)
                        if "Step" not in event.event_name:
                            if event.event_name not in results_detailed:
                                results_detailed[event.event_name] = 0
                            results_detailed[event.event_name] += (
                                event.end_time - event.start_time)
            # read the next chunk of framework metrics
            start_time_us = current_time_us
            if current_time_us + self.interval < end_time_us:
                current_time_us = current_time_us + self.interval
            else:
                current_time_us = end_time_us

        framework_metrics = {}
        training_phase = {}

        for key in results:
            if "Step" in key:
                training_phase[key] = results[key]
            else:
                framework_metrics[key] = results[key]

        if len(framework_metrics.values()) > 0:
            max_value = float(max(list(framework_metrics.values())))
            for key in framework_metrics:
                framework_metrics[key] = framework_metrics[key] / max_value

        return framework_metrics, results_detailed, training_phase

    def get_framework_metrics_by_begin_and_end_timesteps(
            self,
            begin_timestep_list,
            end_timestep_list,
            selected_framework_metrics=[]):
        """
        Get framework metrics for a set of given timeranges. This function is useful when we want to correlate framework metrics such as steps with other framework metrics such as dataloading, preprocessing etc.
        :param begin_timestep_list: list of start of intervals in datetime
        :param end_timestep_list: list of end intervals in datetime
        :param selected_framework_metrics: list of framework metrics which will be stored in the dataframe
        :return: Framework metrics DataFrame
        """
        # Get min and max timestamps from the list of timeranges
        start_time_us = self.convert_datetime_to_timestamp(
            min(begin_timestep_list))
        end_time_us = self.convert_datetime_to_timestamp(
            max(end_timestep_list))

        # in order to avoid out of memory issues we will read only chunks of data
        current_time_us = start_time_us
        if end_time_us - start_time_us > self.interval:
            current_time_us = start_time_us + self.interval
        else:
            current_time_us = end_time_us
        framework_metrics = {}
        framework_metrics_detailed = {}
        counter = 0
        while start_time_us < end_time_us:
            # get all framework metrics from last to current timestamp
            self.framework_metrics_reader.refresh_event_file_list()
            events = self.framework_metrics_reader.get_events(
                start_time_us, current_time_us)

            # iterate over start and end time intervals and find overlaps in the current timerange
            for index, (begin_timestamp, end_timestamp) in enumerate(
                    zip(begin_timestep_list[counter:],
                        end_timestep_list[counter:])):
                begin_timestamp = self.convert_datetime_to_timestamp(
                    begin_timestamp)
                end_timestamp = self.convert_datetime_to_timestamp(
                    end_timestamp)

                # if we are out of range, stop searching for overlaps
                if begin_timestamp >= current_time_us:
                    counter = index
                    break
                # iterate over events in the current timerange
                for event in events:
                    if len(selected_framework_metrics) > 0 and (
                            event.event_name not in selected_framework_metrics
                            and event.event_phase
                            not in selected_framework_metrics):
                        continue
                    if event.end_time >= begin_timestamp and event.start_time <= end_timestamp:
                        if "Step" not in event.event_name:
                            if event.event_phase not in framework_metrics:
                                framework_metrics[event.event_phase] = 0
                            framework_metrics[event.event_phase] += (
                                event.end_time - event.start_time)
                            if event.event_name not in framework_metrics_detailed:
                                framework_metrics_detailed[event.event_name] = 0
                            framework_metrics_detailed[event.event_name] += (
                                event.end_time - event.start_time)
            # read the next chunk of data
            start_time_us = current_time_us
            if current_time_us + self.interval < end_time_us:
                current_time_us = current_time_us + self.interval
            else:
                current_time_us = end_time_us

        # normalize cumulative time to 0-1
        if len(list(framework_metrics.values())) > 0:
            max_value = float(max(list(framework_metrics.values())))
            for key in framework_metrics:
                framework_metrics[key] = framework_metrics[key] / max_value
            max_value = float(max(list(framework_metrics_detailed.values())))
            for key in framework_metrics_detailed:
                framework_metrics_detailed[key] = (
                    framework_metrics_detailed[key] / max_value)

        return framework_metrics, framework_metrics_detailed

    def get_profiler_data_by_time(
        self,
        start_time_us,
        end_time_us,
        cache_metrics=False,
        selected_framework_metrics=[],
        selected_system_metrics=[],
        get_framework_metrics=True,
        get_system_metrics=True,
    ):
        """
        Get metrics data within a time interval.
        :param start_time_us: Start of the interval in microseconds
        :param end_time_us: End of the interval in microseconds
        :param cache_metrics: If True, collect and return all metrics requested so far, else,
        :param framework_metrics_list: list of framework metrics. If not empty, function will only return framework events that are part of this list.
        :param selected_system_metrics: list of system metrics. If not empty, function will only return system events that are part of this list.
        :param selected_framework_metrics: if True, get framework metrics
        :param get_system_metrics: if True: get system metrics
       return current request
        :return: System metrics DataFrame, Framework metrics DataFrame
        """
        # read system metrics
        system_metrics = []
        if get_system_metrics:
            events = self.system_metrics_reader.get_events(
                start_time_us, end_time_us)

            # append new events to existing list
            for event in events:
                if (len(selected_system_metrics) > 0
                        and event.name not in selected_system_metrics):
                    continue
                system_metrics.append([
                    # GPU and CPU metrics are recorded at slightly different timesteps, so we round the numbers
                    us_since_epoch_to_human_readable_time(
                        int(event.timestamp * CONVERT_TO_MICROSECS)),
                    int(event.timestamp * CONVERT_TO_MICROSECS),
                    event.value,
                    event.name,
                    event.dimension,
                    event.node_id,
                    event.type,
                ])

            if cache_metrics is True:
                self.system_metrics.extend(system_metrics)
                system_metrics = self.system_metrics

        # create data frame for system metrics
        system_metrics_df = pd.DataFrame(
            system_metrics,
            columns=[
                "timestamp",
                "timestamp_us",
                "value",
                "system_metric",
                "dimension",
                "nodeID",
                "type",
            ],
        )

        system_metrics_df["timestamp_us"] = system_metrics_df[
            "timestamp_us"] - self.start_time

        # get framework metrics
        framework_metrics = []
        if get_framework_metrics:
            # only fetch a subset of data to avoid out of memory issues
            if end_time_us - start_time_us > self.interval:
                current_time_us = start_time_us + self.interval
            else:
                current_time_us = end_time_us

            while start_time_us < end_time_us:
                # get all framework metrics from last to current timestamp
                self.framework_metrics_reader.refresh_event_file_list()
                events = self.framework_metrics_reader.get_events(
                    start_time_us, current_time_us)

                # append new events to existing list
                for event in events:
                    if len(selected_framework_metrics) > 0 and (
                            event.event_name not in selected_framework_metrics
                            and event.event_phase
                            not in selected_framework_metrics):
                        continue
                    if event.event_args is not None and "step_num" in event.event_args:
                        step = int(event.event_args["step_num"])
                    else:
                        step = -1
                    if event.event_args is not None and "layer_name" in event.event_args:
                        name = event.event_args["layer_name"]
                    elif event.event_args is not None and "name" in event.event_args:
                        name = event.event_args["name"]
                    else:
                        name = event.event_name
                    if event.event_args is not None and "bytes_fetched" in event.event_args:
                        bytes_fetched = event.event_args["bytes_fetched"]
                    else:
                        bytes_fetched = -1

                    framework_metrics.append([
                        us_since_epoch_to_human_readable_time(
                            event.start_time),
                        us_since_epoch_to_human_readable_time(event.end_time),
                        event.start_time,
                        event.end_time,
                        event.tid,
                        event.pid,
                        name,
                        step,
                        bytes_fetched,
                        event.event_phase,
                        event.node_id,
                    ])
                # read the next chunk of data
                start_time_us = current_time_us
                if current_time_us + self.interval < end_time_us:
                    current_time_us = current_time_us + self.interval
                else:
                    current_time_us = end_time_us

                if cache_metrics is True:
                    self.framework_metrics.extend(framework_metrics)
                    framework_metrics = self.framework_metrics

        # create data frame for framework metrics
        framework_metrics_df = pd.DataFrame(
            framework_metrics,
            columns=[
                "start_time",
                "end_time",
                "start_time_us",
                "end_time_us",
                "tid",
                "pid",
                "framework_metric",
                "step",
                "bytes",
                "process",
                "nodeID",
            ],
        )
        framework_metrics_df["start_time_us"] = (
            framework_metrics_df["start_time_us"] - self.start_time)
        framework_metrics_df["end_time_us"] = framework_metrics_df[
            "end_time_us"] - self.start_time

        # drop duplicate rows before returning
        return (
            system_metrics_df[~system_metrics_df.duplicated()],
            framework_metrics_df[~framework_metrics_df.duplicated()],
        )

    def get_profiler_data_by_step(self,
                                  start_step,
                                  end_step,
                                  cache_metrics=False):
        """
        Get metrics data within a step interval. We find the mapping between step number and time interval for
        the step as some events may not be associated with a step number yet.
        :param start_step: Start of step interval
        :param end_step: End of step interval
        :param cache_metrics: If True, collect and return all metrics requested so
            far; otherwise return only the current request
        :return: System metrics DataFrame, Framework metrics DataFrame
        """
        sys_metrics_df, fw_metrics_df = (
            self.get_all_system_metrics(),
            self.get_all_framework_metrics(),
        )

        fw_metrics_df = fw_metrics_df[(fw_metrics_df["step"].between(
            start_step, end_step, inclusive=True))]
        start_time, end_time = (
            fw_metrics_df["start_time_us"].min(),
            fw_metrics_df["end_time_us"].max(),
        )

        sys_metrics_df = sys_metrics_df[(
            sys_metrics_df["timestamp_us"].between(start_time,
                                                   end_time,
                                                   inclusive=True))]

        return sys_metrics_df, fw_metrics_df

    def get_all_dataloader_metrics(self, selected_framework_metrics=[]):
        """
        Get framework metrics
        :param selected_framework_metrics: list of framework metrics.If not empty, function will only return framework events that are part of this list.
        :return: Framework metrics DataFrame
        """
        # get all framework metrics from last to current timestamp
        self.framework_metrics_reader.refresh_event_file_list()

        start_timestamp = (
            self.system_metrics_reader.get_timestamp_of_first_available_file()
        )  # bug: get_events does not return the very first event
        end_timestamp = (
            self.framework_metrics_reader.get_timestamp_of_latest_available_file())

        fw_events_df = self._get_dataloader_profiler_data_by_time(
            start_timestamp,
            end_timestamp,
            cache_metrics=False,
            selected_framework_metrics=selected_framework_metrics,
        )

        return fw_events_df

    def _get_dataloader_profiler_data_by_time(self,
                                              start_time_us,
                                              end_time_us,
                                              cache_metrics=False,
                                              selected_framework_metrics=[]):
        """
        Get metrics data within a time interval.
        :param start_time_us: Start of the interval in microseconds
        :param end_time_us: End of the interval in microseconds
        :param cache_metrics: If True, collect and return all metrics requested so far, else,
        :param framework_metrics_list: list of framework metrics. If not empty, function will only return framework events that are part of this list.
        :return: Framework metrics DataFrame
        """
        # get framework metrics
        framework_metrics = []
        # only fetch a subset of data to avoid out of memory issues
        if end_time_us - start_time_us > self.interval:
            current_time_us = start_time_us + self.interval
        else:
            current_time_us = end_time_us

        while start_time_us < end_time_us:
            # get all framework metrics from last to current timestamp
            self.framework_metrics_reader.refresh_event_file_list()
            events = self.framework_metrics_reader.get_events(
                start_time_us, current_time_us)

            # append new events to existing list
            for event in events:
                if len(selected_framework_metrics) > 0 and not (
                        match_inc(event.event_name, selected_framework_metrics)
                        or match_inc(event.event_phase,
                                     selected_framework_metrics)):
                    continue
                if event.event_args is not None and "step_num" in event.event_args:
                    step = int(event.event_args["step_num"])
                else:
                    step = -1
                if event.event_args is not None and "layer_name" in event.event_args:
                    name = event.event_args["layer_name"]
                elif event.event_args is not None and "name" in event.event_args:
                    name = event.event_args["name"]
                else:
                    name = event.event_name
                if event.event_args is not None and "worker_id" in event.event_args:
                    worker_id = event.event_args["worker_id"]
                else:
                    worker_id = -1

                if event.event_args is not None and "num_workers" in event.event_args:
                    num_workers = event.event_args["num_workers"]
                else:
                    num_workers = -1

                if event.event_args is not None and "pin_memory" in event.event_args:
                    pin_memory = "True" if event.event_args[
                        "pin_memory"] is True else "False"
                else:
                    pin_memory = "NA"

                framework_metrics.append([
                    us_since_epoch_to_human_readable_time(event.start_time),
                    us_since_epoch_to_human_readable_time(event.end_time),
                    event.start_time,
                    event.end_time,
                    event.duration,
                    event.pid,
                    name,
                    step,
                    worker_id,
                    num_workers,
                    pin_memory,
                    event.event_phase,
                    event.node_id,
                ])
            # read the next chunk of data
            start_time_us = current_time_us
            if current_time_us + self.interval < end_time_us:
                current_time_us = current_time_us + self.interval
            else:
                current_time_us = end_time_us

            if cache_metrics is True:
                self.framework_metrics.extend(framework_metrics)
                framework_metrics = self.framework_metrics

        # create data frame for framework metrics
        framework_metrics_df = pd.DataFrame(
            framework_metrics,
            columns=[
                "start_time",
                "end_time",
                "start_time_us",
                "end_time_us",
                "duration_us",
                "pid",
                "framework_metric",
                "step",
                "worker_id",
                "num_workers",
                "pin_memory",
                "process",
                "node_id",
            ],
        )
        framework_metrics_df["start_time_us"] = (
            framework_metrics_df["start_time_us"] - self.start_time)
        framework_metrics_df["end_time_us"] = framework_metrics_df[
            "end_time_us"] - self.start_time
        return framework_metrics_df[framework_metrics_df.duplicated() == False]
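
A minimal usage sketch for PandasFrame (assumptions: smdebug is installed, the import path below matches the version these snippets come from, and profiler traces exist under the given directory):

from smdebug.profiler.analysis.utils.profiler_data_to_pandas import PandasFrame

pf = PandasFrame("./profiler-output")   # hypothetical trace directory
sys_df = pf.get_all_system_metrics()    # system metrics as a DataFrame
fw_df = pf.get_all_framework_metrics()  # framework metrics as a DataFrame
print(sys_df.head())
print(fw_df.head())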
Code example #12
class ProfilerTrial:
    def __init__(self, name, trial_dir, output_dir):
        self.name = name
        # Trial dir is the s3/local directory that contains the profiling data captured during runtime.
        self.path = trial_dir

        self.logger = get_logger()
        self.first_timestamp = 0
        self.get_first_timestamp()

        # The output directory will contain data emitted by rules, which is further published to S3.
        self.output_dir = output_dir
        if output_dir and not os.path.exists(output_dir):
            pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

        # .sagemaker-ignore will not be picked up by the service code for uploading; it is used to save temp files.
        self.temp_dir = os.path.join(output_dir, ".sagemaker-ignore")
        if not os.path.exists(self.temp_dir):
            pathlib.Path(self.temp_dir).mkdir(parents=True, exist_ok=True)
        self.logger.info(
            "Output files of ProfilerTrial will be saved to {}".format(
                self.output_dir))

    def job_finished(self):
        training_ended = (has_training_ended(self.path + "/system")
                          or has_training_ended(self.path + "/framework"))
        # the rule job should finish if the training job has ended or the rule job
        # has been signalled
        return training_ended or is_rule_signalled_gracetime_passed(self.path)

    def get_first_timestamp(self):
        while self.first_timestamp == 0:
            if self.path.startswith("s3"):
                self.system_metrics_reader = S3SystemMetricsReader(self.path)
                self.framework_metrics_reader = S3AlgorithmMetricsReader(
                    self.path)
            else:
                self.system_metrics_reader = LocalSystemMetricsReader(
                    self.path)
                self.framework_metrics_reader = LocalAlgorithmMetricsReader(
                    self.path)
            if self.system_metrics_reader.get_timestamp_of_first_available_file() != 0:
                self.first_timestamp = (
                    self.system_metrics_reader.get_timestamp_of_first_available_file())
            if self.framework_metrics_reader.get_timestamp_of_first_available_file() != 0:
                if (self.framework_metrics_reader.get_timestamp_of_first_available_file()
                        < self.first_timestamp):
                    self.first_timestamp = (
                        self.framework_metrics_reader.get_timestamp_of_first_available_file())
            # poll for profiler data every 10 seconds
            self.logger.info("Waiting for profiler data.")
            time.sleep(10)

    def get_latest_timestamp(self):
        latest_timestamp = 0
        self.system_metrics_reader.refresh_event_file_list()
        self.framework_metrics_reader.refresh_event_file_list()
        if self.system_metrics_reader.get_timestamp_of_latest_available_file() != 0:
            latest_timestamp = (
                self.system_metrics_reader.get_timestamp_of_latest_available_file())
        if self.framework_metrics_reader.get_timestamp_of_latest_available_file() != 0:
            if (self.framework_metrics_reader.get_timestamp_of_latest_available_file()
                    < latest_timestamp):
                latest_timestamp = (
                    self.framework_metrics_reader.get_timestamp_of_latest_available_file())
        return latest_timestamp

    def wait_for_data(self, end_time, start_time):
        if end_time < self.get_latest_timestamp():
            return

        while end_time > self.get_latest_timestamp() and not self.job_finished():
            self.logger.info(
                f"Current timestamp {end_time} latest timestamp {self.get_latest_timestamp()}: waiting for new profiler data."
            )

            time.sleep(10)

        if self.job_finished():
            if start_time >= self.get_latest_timestamp():
                raise NoMoreProfilerData(end_time)
        return

    def get_system_metrics(self, start_time, end_time):
        events = self.system_metrics_reader.get_events(start_time, end_time)
        return events

    def get_framework_metrics(self, start_time, end_time):
        events = self.framework_metrics_reader.get_events(start_time, end_time)
        return events
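
A hedged sketch of how ProfilerTrial might be driven (class and method names as defined above; the paths are hypothetical):

# Note: the constructor blocks in get_first_timestamp() until profiler data appears.
trial = ProfilerTrial("my-trial", "./profiler-output", "./rule-output")
start = trial.first_timestamp
end = trial.get_latest_timestamp()
trial.wait_for_data(end, start)
sys_events = trial.get_system_metrics(start, end)
fw_events = trial.get_framework_metrics(start, end)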
Code example #13
class MergedTimeline:
    def __init__(self, path, file_suffix_filter=None, output_directory=None):
        """
        :param path: trace root folder that contains framework and system folders
        :param file_suffix_filter: list of file suffix
            PYTHONTIMELINE_SUFFIX = "pythontimeline.json"
            MODELTIMELINE_SUFFIX = "model_timeline.json"
            TENSORBOARDTIMELINE_SUFFIX = "trace.json.gz"
            HOROVODTIMELINE_SUFFIX = "horovod_timeline.json"
            SMDATAPARALLELTIMELINE_SUFFIX = "smdataparallel_timeline.json".
            Default: None (all files will be merged)
        :param output_directory: Path where merged file should be saved
            Default: None (writes to the same location as the 'path' argument.
        """
        self.path = path
        self.out_dir = output_directory if output_directory is not None else self.path
        self.file_suffix_filter = file_suffix_filter

        self.bytes_written = 0

        # Reader for system and framework metrics
        if path.startswith("s3"):
            self.framework_metrics_reader = S3AlgorithmMetricsReader(self.path)
        else:
            self.framework_metrics_reader = LocalAlgorithmMetricsReader(
                self.path)

        self._writer = None
        self.tensor_table = collections.defaultdict(int)
        # eventphase_starting_ids contains the starting index for each of these
        # phases. The value is used as the pid in the timeline, which ensures the
        # produced timeline is ordered: Step, Forward, Backward, DataIterator,
        # Dataset, /device, /host, gpu/cpu functions, Horovod, others.
        # Note that there is room for 10000 event phases of each type.
        self.eventphase_starting_ids = {
            "Step:": 1,
            "Forward": 10000,
            "Backward": 20000,
            "DataIterator": 30000,
            "Dataset": 40000,
            "/device": 50000,
            "/host": 60000,
            "gpu_functions": 70000,
            "cpu_functions": 80000,
            "Horovod": 90000,
            "other": 1000000,
        }

    def open(self, file_path):
        """
        Open the trace event file
        """
        s3, bucket_name, key_name = is_s3(file_path)
        try:
            if s3:
                self._writer = TSAccessS3(bucket_name, key_name, binary=False)
            else:
                self._writer = TSAccessFile(file_path, "w")
        except (OSError, IOError) as err:
            logger.debug(
                f"Sagemaker-Debugger: failed to open {file_path}: {str(err)}")
        start, length = self._writer.write("[\n")
        self.bytes_written += length

    def file_name(self, end_timestamp_in_us):
        """
        Since this util will be used from a notebook or local directory, we directly write
        to the merged file
        """
        return TraceFileLocation().get_merged_trace_file_location(
            base_dir=self.out_dir, timestamp_in_us=end_timestamp_in_us)

    def merge_timeline(self,
                       start,
                       end,
                       unit=MergeUnit.TIME,
                       sys_metrics_filter={"lowgpu": ()}):
        """
        Get all trace files captured and merge them for viewing in the browser
        """
        if unit == MergeUnit.STEP or unit == "step":
            start_timestamp_in_us, end_timestamp_in_us = (
                self.framework_metrics_reader._get_time_interval_for_step(start, end))
        else:
            start_timestamp_in_us, end_timestamp_in_us = start, end

        filename = self.file_name(end_timestamp_in_us)
        if self._writer is None:
            self.open(filename)

        # get framework metrics
        self.framework_metrics_reader.refresh_event_file_list()
        framework_events = self.framework_metrics_reader.get_events(
            start_timestamp_in_us,
            end_timestamp_in_us,
            file_suffix_filter=self.file_suffix_filter)
        print("Got framework events")
        if sys_metrics_filter is not None and len(sys_metrics_filter) > 0:
            print(f"Appending system metrics with filter:{sys_metrics_filter}")
            system_metrics_df = PandasFrame(self.path)
            filtered_sys_metrics = system_metrics_df.get_profiler_data_by_time(
                start_timestamp_in_us,
                end_timestamp_in_us,
                get_framework_metrics=False)
            # 1st element of tuple is sys metrics df
            filtered_sys_metrics = filtered_sys_metrics[0]
            filtered_sys_metrics["timestamp_us"] = (
                filtered_sys_metrics["timestamp_us"] +
                system_metrics_df.start_time)
            filtered_sys_metrics = filtered_sys_metrics.sort_values(
                by="timestamp_us")

            gpu_rows = filtered_sys_metrics[filtered_sys_metrics["dimension"]
                                            == "GPUUtilization"]
            selected_sys = gpu_rows[gpu_rows["value"] < 85]

            # convert timestamp_us to actual epoch time
            #'timestamp', 'timestamp_us', 'value', 'system_metric', 'dimension',
            #'nodeID', 'type'
            selected_sys = selected_sys.sort_values(by="timestamp_us")
            print(len(selected_sys))
            prev_timestamp = None
            prev_args = {}
            added_sys_bottleneck_events_count = 0
            added_counter_events_count = 0
            current_iloc_selected_sys = 0
            # iterate over each row, if prev_timestamp is Not None and this timestamp != prev_timestamp: create trace event with args)
            for index, row in filtered_sys_metrics.iterrows():
                if current_iloc_selected_sys >= len(selected_sys):
                    break
                timestamp = selected_sys.iloc[current_iloc_selected_sys][
                    "timestamp_us"]
                print(
                    f"selected sys timestamp:{timestamp} row timestamp:{row['timestamp_us']}"
                )

                if (timestamp - 1000 * 20) <= (row["timestamp_us"]) <= (
                        1000 * 20 + timestamp):
                    prev_timestamp = timestamp
                    prev_args[row["system_metric"] + "_" + row["dimension"] +
                              "_" + row["nodeID"]] = row["value"]
                else:
                    if prev_timestamp is not None:
                        # Make this instant marker global
                        # prev_args["s"] = "g"
                        t_event = TraceEvent(
                            ts=int(prev_timestamp),
                            name="sys_metrics_bottlenecks",
                            dur=0,
                            phase_pid=0,
                            phase_tid=0,
                            event_args=prev_args,
                            node_id=0,
                            phase="i",
                            file_type="system_metrics",
                            event_phase="sys_metrics_bottlenecks",
                            pid=0,
                            tid=0,
                            process_info=None,
                        )
                        framework_events.append(t_event)
                        added_sys_bottleneck_events_count += 1
                    prev_timestamp = None
                    prev_args = {}
                    if row["timestamp_us"] > timestamp:
                        current_iloc_selected_sys += 1
                        if current_iloc_selected_sys >= len(selected_sys):
                            break
                        timestamp = selected_sys.iloc[
                            current_iloc_selected_sys]["timestamp_us"]
                    if (timestamp - 1000 * 20) <= row["timestamp_us"] <= (
                            1000 * 20 + timestamp):
                        prev_timestamp = timestamp
                        prev_args[row["system_metric"] + "_" +
                                  row["dimension"] +
                                  row["nodeID"]] = row["value"]
                # {"name": "process_name", "ph": "M", "pid": 1000001, "args": {"name": "counter"}},
                # {"name": "process_sort_index", "ph": "M", "pid": 1000001, "args": {"sort_index": 1000001}},

                # {"name": "ctr", "pid": 1000001, "ph": "C", "ts": 1602564436444665, "args": {"cats": 0}},
                # create a counter event
                t_event = TraceEvent(
                    ts=int(row["timestamp_us"]),
                    name=row["system_metric"] + "_" + row["dimension"] + "_" +
                    row["nodeID"],
                    dur=0,
                    phase_pid=0,
                    phase_tid=0,
                    event_args={"value": row["value"]},
                    node_id=0,
                    phase="C",
                    file_type="system_metrics",
                    event_phase="sys_metrics",
                    pid=0,
                    tid=0,
                    process_info=None,
                )
                added_counter_events_count += 1
                framework_events.append(t_event)
                # prev_timestamp = row['timestamp_us']
                # prev_args[row['system_metric'] + "_" + row['dimension'] + row['nodeID']] = row['value']

        if len(prev_args) > 0 and prev_timestamp is not None:
            t_event = TraceEvent(
                ts=int(prev_timestamp),
                name="sys_metrics_bottlenecks",
                dur=0,
                phase_pid=0,
                phase_tid=0,
                event_args=prev_args,
                node_id=0,
                phase="i",
                file_type="system_metrics",
                event_phase="sys_metrics_bottlenecks",
                pid=0,
                tid=0,
                process_info=None,
            )
            framework_events.append(t_event)
            added_sys_bottleneck_events_count += 1
        print(
            f"Added {added_sys_bottleneck_events_count} sys events and count_counter_events:{added_counter_events_count}"
        )

        framework_events.sort(key=lambda x: x.start_time)

        seen_phasepid_tids = {}
        print("Rewriting events")
        for event in framework_events:
            # print(str(event.tid) + "\n")
            if self.tensor_table[event.event_phase] == 0:
                # We will create tensor_idx based on what event_phase is there
                # tensor idx would be generated to show timeline in order of Step(0), Forward(1)/Backward(2), DataIterator(3), Dataset(4),
                # TF(/device, /host), PT detailed(cpu_functions/gpu_functions), horovod/SMDataParallel
                # tensor_idx = len(self.tensor_table)

                found = False
                for key in self.eventphase_starting_ids.keys():
                    if key in event.event_phase:
                        tensor_idx = self.eventphase_starting_ids[key]
                        self.eventphase_starting_ids[key] += 1
                        found = True
                        break
                if not found:
                    tensor_idx = self.eventphase_starting_ids["other"]
                    self.eventphase_starting_ids["other"] += 1

                self.tensor_table[event.event_phase] = tensor_idx

                # Instant events don't have a training phase
                # TODO check with cycle marker
                if event.phase != "i":
                    args = {"name": event.event_phase}
                    json_dict = {
                        "name": "process_name",
                        "ph": "M",
                        "pid": tensor_idx,
                        "args": args
                    }
                    _, length = self._writer.write(
                        json.dumps(json_dict) + ",\n")
                    self.bytes_written += length

                    args = {"sort_index": tensor_idx}
                    json_dict = {
                        "name": "process_sort_index",
                        "ph": "M",
                        "pid": tensor_idx,
                        "args": args,
                    }
                    _, length = self._writer.write(
                        json.dumps(json_dict) + ",\n")
                    self.bytes_written += length
                # below we are modeling system metrics as instant events
                elif event.file_type == "system_metrics":
                    # {"name": “LowGpu”, "ph": "i", "ts": 1234523.3, "pid": 2343, "tid": 2347, "s": "g"}
                    args = {"name": event.event_name}
                    json_dict = {
                        "name": "process_name",
                        "ph": "M",
                        "pid": tensor_idx,
                        "args": args
                    }
                    event.phase_pid = event.phase_tid = tensor_idx
                    _, length = self._writer.write(
                        json.dumps(json_dict) + ",\n")
                    self.bytes_written += length

                    args = {"sort_index": tensor_idx}
                    json_dict = {
                        "name": "process_sort_index",
                        "ph": "M",
                        "pid": tensor_idx,
                        "args": args,
                    }
                    _, length = self._writer.write(
                        json.dumps(json_dict) + ",\n")
                    self.bytes_written += length

            tensor_idx = self.tensor_table[event.event_phase]
            # write thread metadata only once per (phase_pid, tid) pair
            is_event_seen = False
            if (event.phase_pid in seen_phasepid_tids
                    and event.tid in seen_phasepid_tids[event.phase_pid]):
                is_event_seen = True
            # if thread id for this event pid is not yet seen, write the metadata for this thread
            if event.process_info is not None and not is_event_seen:
                phase_tid = event.phase_tid
                thread_info = event.process_info._threads[phase_tid]
                args = {"name": thread_info.thread_name}
                json_dict = {
                    "name": "thread_name",
                    "ph": "M",
                    "pid": tensor_idx,
                    "tid": thread_info.tid,
                    "args": args,
                }
                _, length = self._writer.write(json.dumps(json_dict) + ",\n")
                self.bytes_written += length
                if event.phase_pid not in seen_phasepid_tids:
                    seen_phasepid_tids[event.phase_pid] = {}
                seen_phasepid_tids[event.phase_pid][event.tid] = 1
            # change the event pid back to the tensor idx before writing it
            event.pid = tensor_idx
            # {"name": "LowGpu", "ph": "i", "ts": 1234523.3, "pid": 2343, "tid": 2347, "s": "g"}

            _, length = self._writer.write(event.to_json() + ",\n")
            self.bytes_written += length

        self.close()

        get_logger().info(f"Merged timeline saved at: {filename}")
        return filename

    def close(self):
        # seek back over the trailing ",\n" so the JSON array can be closed cleanly
        file_seek_pos = self.bytes_written - 2
        if isinstance(self._writer, TSAccessFile):
            self._writer._accessor.seek(file_seek_pos)
            self._writer._accessor.truncate()
        else:
            self._writer.data = self._writer.data[:file_seek_pos]

        if file_seek_pos > 2:
            self._writer.write("\n]")

        self._writer.flush()
        self._writer.close()
        self._writer = None
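
Finally, a minimal usage sketch for MergedTimeline as exercised by the tests above (hypothetical local paths; the import path is an assumption based on the smdebug source layout):

from smdebug.profiler.analysis.utils.merge_timelines import MergedTimeline, MergeUnit

tl = MergedTimeline("./profiler-output", output_directory="./merged")
# merge steps 2 through 4, as in test_timeline_merge_by_step; pass
# sys_metrics_filter=None to skip the system-metrics bottleneck annotations
merged_file = tl.merge_timeline(2, 4, unit=MergeUnit.STEP, sys_metrics_filter=None)
print(merged_file)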