Example #1
    def parse_event_files(self, event_files):
        file_read_requests = []
        event_files_to_read = []

        for event_file in event_files:
            if event_file not in self._parsed_files:
                self.logger.debug(f"Will request s3 object {event_file}")
                event_files_to_read.append(event_file)
                file_read_requests.append(ReadObjectRequest(path=event_file))

        event_data_list = S3Handler.get_objects(file_read_requests)
        self.logger.debug(f"Got results back from s3 for {event_files}")
        for event_data, event_file in zip(event_data_list,
                                          event_files_to_read):
            self.logger.debug(f"Will parse events in event file:{event_file}")
            if event_file.endswith(
                    "json.gz") and is_valid_tfprof_tracefilename(event_file):
                self._get_event_parser(event_file).read_events_from_file(
                    event_file)
                self._parsed_files.add(event_file)
            else:
                if is_valid_tracefilename(event_file):
                    event_string = event_data.decode("utf-8")
                    json_data = json.loads(event_string)
                    node_id = get_node_id_from_tracefilename(event_file)
                    self._get_event_parser(
                        event_file).read_events_from_json_data(
                            json_data, node_id)
                    self._parsed_files.add(event_file)
                else:
                    self.logger.info(
                        f"Invalid tracefilename:{event_file} . Skipping.")
Example #2
    def read_index_files(
        self, start_after_key: str, range_steps=None
    ) -> Tuple[List[bytes], list, str, List[str]]:
        """
            Read files like `trial_{datetime}/index/000/{step}_{worker}.json.
        :param start_after_key: str
        :param range_steps:
        :return: Tuple( responses, steps, start_after_key, workers)
        """
        object_requests = []
        steps = []
        workers = []
        index_files, start_after_key = self.list_index_files(start_after_key)
        self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
        for index_file in index_files:
            if self.index_file_cache.has_not_read(index_file):
                step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
                if (
                    range_steps is not None and step_in_range(range_steps, step)
                ) or range_steps is None:
                    steps.append(step)
                    workers.append(parse_worker_name_from_file(index_file))
                    object_requests.append(
                        ReadObjectRequest(f"s3://{self.bucket_name}/{index_file}")
                    )
                self.index_file_cache.add(index_file, start_after_key)

        responses = self.s3_handler.get_objects(object_requests)
        return responses, steps, start_after_key, workers
Example #3
    def _get_trace_events_json(self, tracefile):
        try:
            s3, bucket_name, key_name = is_s3(tracefile)
            if s3:
                object_requests = ReadObjectRequest(os.path.join("s3://", bucket_name, key_name))
                objects = S3Handler.get_objects([object_requests])
                unzipped = zlib.decompress(objects[0], zlib.MAX_WBITS | 16)
                trace_json_data = json.loads(unzipped.decode("utf-8"))
            else:
                with gzip.GzipFile(tracefile, "r") as fin:
                    trace_json_data = json.loads(fin.read().decode("utf-8"))
        except Exception as e:
            self.logger.error(f"Can't open TF trace file {tracefile}: Exception {str(e)} ")
            return None
        if "traceEvents" not in trace_json_data:
            self.logger.error(f"The TF trace file {tracefile} does not contain traceEvents")
            return None
        trace_events_json = trace_json_data["traceEvents"]
        _, start_time_in_micros, _ = read_tf_profiler_metadata_file(tracefile)
        # the first time profiler.start() is called is considered the start time
        # for TF profiler
        metadata = []
        args = {"start_time_since_epoch_in_micros": int(start_time_in_micros)}
        json_dict = {"name": "process_name", "ph": "M", "pid": 0, "args": args}
        metadata.append(json_dict)
        args = {"sort_index": 0}
        json_dict = {"name": "process_sort_index", "ph": "M", "pid": 0, "args": args}
        metadata.append(json_dict)

        # insert metadata at the beginning of trace events json
        trace_events_json = metadata + trace_events_json
        return trace_events_json
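In the S3 branch above, the object bytes are decompressed in memory with zlib.decompress(data, zlib.MAX_WBITS | 16); the | 16 tells zlib to expect a gzip header, so no temporary file is needed. A tiny self-contained illustration of that round trip (the sample payload is made up):

import gzip
import json
import zlib

# Compress a fake trace document the same way a .json.gz trace file would be stored.
payload = gzip.compress(json.dumps({"traceEvents": []}).encode("utf-8"))

# MAX_WBITS | 16 selects the gzip container format, matching the example above.
decoded = json.loads(zlib.decompress(payload, zlib.MAX_WBITS | 16).decode("utf-8"))
assert decoded == {"traceEvents": []}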
Example #4
    def _read_collections(self, collection_files):
        first_collection_file = collection_files[0]  # First Collection File
        key = os.path.join(first_collection_file)
        collections_req = ReadObjectRequest(self._get_s3_location(key))
        obj_data = self.s3_handler.get_objects([collections_req])[0]
        obj_data = obj_data.decode("utf-8")
        self.collection_manager = CollectionManager.load_from_string(obj_data)
        self.num_workers = self.collection_manager.get_num_workers()
Example #5
def test_download_objects():
    s = uuid.uuid4()
    prefix = "test_get_objects/" + str(s)
    f = TSAccessS3("smdebugcodebuildtest", prefix, binary=False)
    f.write("a" * 100)
    f.write("b" * 200)
    f.write("c" * 300)
    f.close()
    r1 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix)
    r2 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix, start=100)
    r3 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix,
                           start=100,
                           length=200)
    objects = S3Handler.get_objects([r1, r2, r3])
    assert objects[0].decode("ascii") == "a" * 100 + "b" * 200 + "c" * 300
    assert objects[1].decode("ascii") == "b" * 200 + "c" * 300, len(
        objects[1].decode("ascii"))
    assert objects[2].decode("ascii") == "b" * 200

    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)
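The three requests in the test above also document the ranged-read parameters: with no extra arguments the whole object comes back, start skips that many bytes, and start together with length selects an exact byte window. A hedged sketch against a hypothetical object:

# Assumed import path (same assumption as the earlier sketch); the object key is hypothetical.
from smdebug.core.access_layer.s3handler import ReadObjectRequest, S3Handler

whole = ReadObjectRequest("s3://my-bucket/logs/run.bin")                          # entire object
tail = ReadObjectRequest("s3://my-bucket/logs/run.bin", start=512)                # skip first 512 bytes
window = ReadObjectRequest("s3://my-bucket/logs/run.bin", start=512, length=128)  # 128-byte slice
blobs = S3Handler.get_objects([whole, tail, window])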
Example #6
    def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray:
        event_file_name = tensor_location.event_file_name

        if not self._is_event_file_present(event_file_name):
            self.event_file_present_loop(tensor_location)

        start = tensor_location.start_idx
        length = tensor_location.length
        request = [ReadObjectRequest(event_file_name, int(start), int(length))]
        res = S3Handler.get_objects(request)
        tr = TensorReader(res[0])  # Access the only element in res
        tensor_tuple = list(tr.read_tensors())[0]  # Access the only element in the list
        tensor_name, step, tensor_data, mode, mode_step = tensor_tuple
        return tensor_data
Example #7
    def load_python_profile_stats(self):
        """Load the stats in by creating the profile directory, downloading each stats directory from s3 to the
        profile directory, parsing the metadata from each stats directory name and creating a StepPythonProfileStats
        entry corresponding to the stats file in the stats directory.

        For cProfile, the stats file name is `python_stats`.
        For pyinstrument, the stats file name is `python_stats.json`.
        """
        python_profile_stats = []

        self._set_up_profile_dir()

        list_request = ListRequest(Bucket=self.bucket_name, Prefix=self.prefix)
        s3_filepaths = S3Handler.list_prefix(list_request)
        object_requests = [
            ReadObjectRequest(
                os.path.join("s3://", self.bucket_name, s3_filepath))
            for s3_filepath in s3_filepaths
        ]
        objects = S3Handler.get_objects(object_requests)

        for full_s3_filepath, object_data in zip(s3_filepaths, objects):
            if os.path.basename(full_s3_filepath) not in (
                    CPROFILE_STATS_FILENAME,
                    PYINSTRUMENT_JSON_FILENAME,
                    PYINSTRUMENT_HTML_FILENAME,
            ):
                get_logger().info(
                    f"Unknown file {full_s3_filepath} found, skipping...")
                continue

            path_components = full_s3_filepath.split("/")
            framework, profiler_name, node_id, stats_dir, stats_file = path_components[
                -5:]

            stats_dir_path = os.path.join(self.profile_dir, node_id, stats_dir)
            os.makedirs(stats_dir_path, exist_ok=True)
            stats_file_path = os.path.join(stats_dir_path, stats_file)

            with open(stats_file_path, "wb") as f:
                f.write(object_data)

            python_profile_stats.append(
                StepPythonProfileStats(framework, profiler_name, node_id,
                                       stats_dir, stats_file_path))
        python_profile_stats.sort(
            key=lambda x: (x.start_time_since_epoch_in_micros, x.node_id)
        )  # sort stats by start time since epoch, then node ID.
        return python_profile_stats
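The loader above combines listing and reading: S3Handler.list_prefix enumerates keys under a prefix via a ListRequest, and each key becomes a ReadObjectRequest for one batched download. A minimal sketch of that list-then-fetch pattern, with a hypothetical bucket and prefix:

import os

# Assumed import path; bucket and prefix are hypothetical.
from smdebug.core.access_layer.s3handler import ListRequest, ReadObjectRequest, S3Handler

list_request = ListRequest(Bucket="my-bucket", Prefix="profiler-output/")
keys = S3Handler.list_prefix(list_request)  # keys relative to the bucket
requests = [ReadObjectRequest(os.path.join("s3://", "my-bucket", key)) for key in keys]
objects = S3Handler.get_objects(requests)   # raw bytes, in the same order as `keys`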
Example #8
    def parse_event_files(self, event_files):
        file_read_requests = []
        event_files_to_read = []

        for event_file in event_files:
            if event_file not in self._parsed_files:
                event_files_to_read.append(event_file)
                file_read_requests.append(ReadObjectRequest(path=event_file))

        event_data_list = S3Handler.get_objects(file_read_requests)
        for event_data, event_file in zip(event_data_list, event_files_to_read):
            event_string = event_data.decode("utf-8")
            event_items = event_string.split("\n")
            event_items.remove("")
            for item in event_items:
                event = json.loads(item)
                self._SystemProfilerEventParser.read_event_from_dict(event)
            self._parsed_files.add(event_file)
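The system-profiler variant above treats each downloaded object as newline-delimited JSON: the decoded string is split on newlines, empty entries are dropped, and every remaining line is parsed as its own event dictionary. A small standalone illustration of that parsing step (the sample events are made up):

import json

event_string = '{"name": "cpu", "value": 12}\n{"name": "gpu", "value": 48}\n'
events = [json.loads(line) for line in event_string.split("\n") if line]
assert events == [{"name": "cpu", "value": 12}, {"name": "gpu", "value": 48}]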
Example #9
def check_performance():
    import time
    import multiprocessing

    kb = 1024
    mb = 1024 * 1024
    sizes = [10 * kb, 100 * kb, 500 * kb]  # , mb, 5 * mb, 10 * mb]
    num_files = [100, 1000, 10000]  # , 10000]  # , 100000]  # , 1000000]
    files_path = "smdebug-testing/resources/test_performance"
    times = []
    print("Size\tNumFiles\tPool size\tSync with multiprocessing")
    pool_sizes = [
        2 * multiprocessing.cpu_count(),
        4 * multiprocessing.cpu_count(),
        8 * multiprocessing.cpu_count(),
    ]
    for size in sizes:
        timesrow = []
        for nf in num_files:
            timesrow_for_pools = []
            for pool_size in pool_sizes:
                j = 0
                S3Handler.MULTIPROCESSING_POOL_SIZE = pool_size
                times_to_be_averaged = []
                reqs = [
                    ReadObjectRequest(f"s3://{files_path}/{size}/{i}.dummy")
                    for i in range(nf)
                ]
                while j < 10:
                    sync_start = time.time()
                    S3Handler.get_objects(reqs, use_multiprocessing=True)
                    sync_end = time.time()
                    times_to_be_averaged.append(sync_end - sync_start)
                    j += 1
                timesrow_for_pools.append(
                    round(
                        sum(times_to_be_averaged) / len(times_to_be_averaged),
                        2))
            timesrow.append(timesrow_for_pools)
            print(f"{size} {nf} {pool_sizes} {timesrow_for_pools}")
        times.append(timesrow)
        print(f"Finished testing for {size}", times[-1])