Example 1
def test_list_objects():
    s = uuid.uuid4()
    prefix = "test_list_objects/" + str(s)
    for i in [0, 3, 7, 11]:
        f = TSAccessS3("smdebugcodebuildtest", prefix + "/" + format(i, "02"))
        f.write(b"a")
        f.close()
    req1 = ListRequest(Bucket="smdebugcodebuildtest", Prefix=prefix)
    req2 = ListRequest(Bucket="smdebugcodebuildtest",
                       Prefix="test_list_objects/",
                       Delimiter="/")
    req3 = ListRequest(Bucket="smdebugcodebuildtest",
                       Prefix=prefix,
                       StartAfter=prefix + "/0")
    req4 = ListRequest(Bucket="smdebugcodebuildtest",
                       Prefix=prefix,
                       StartAfter=prefix + "/03")
    req5 = ListRequest(Bucket="smdebugcodebuildtest", Prefix=prefix + "/0")
    files = S3Handler.list_prefixes([req1, req2, req3, req4, req5])
    # test StartAfter and delimiters
    # req1: listing the full prefix returns all four objects
    assert len(files[0]) == 4
    # req2: with Delimiter="/", the listing returns common prefixes, including this test's prefix
    assert prefix + "/" in files[1]
    # req3: StartAfter=prefix + "/0" sorts before "/00", so no key is skipped
    assert len(files[2]) == 4
    # req4: StartAfter=prefix + "/03" skips "00" and "03", leaving "07" and "11"
    assert len(files[3]) == 2
    # req5: Prefix=prefix + "/0" matches "00", "03" and "07" but not "11"
    assert len(files[4]) == 3
    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)
Example 2
def delete_s3_prefixes(bucket, keys):
    if not isinstance(keys, list):
        keys = [keys]
    delreqs = []
    for key in keys:
        delreqs.append(DeleteRequest(bucket, key))
    S3Handler.delete_prefixes(delreqs)
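The helper above accepts either a single key or a list of keys and wraps each one in a DeleteRequest before handing them to S3Handler.delete_prefixes. A minimal usage sketch; the bucket and prefixes are placeholders, not names from the original code:

# Hypothetical bucket and prefixes; a single key is normalized to a one-element list.
delete_s3_prefixes("my-bucket", "tests/run-1")
delete_s3_prefixes("my-bucket", ["tests/run-2", "tests/run-3"])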
Example 3
def test_delete_prefix():
    s = uuid.uuid4()
    prefix = "test_delete_prefix/" + str(s)
    for i in range(3):
        f = TSAccessS3("smdebugcodebuildtest", prefix + "/" + str(i))
        f.write(b"a")
        f.close()
    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)
    entries = S3Handler.list_prefix(ListRequest("smdebugcodebuildtest",
                                                prefix))
    assert len(entries) == 0
Example 4
    def load_python_profile_stats(self):
        """Load the stats in by creating the profile directory, downloading each stats directory from s3 to the
        profile directory, parsing the metadata from each stats directory name and creating a StepPythonProfileStats
        entry corresponding to the stats file in the stats directory.

        For cProfile, the stats file name is `python_stats`.
        For pyinstrument, the stats file name `python_stats.json`.
        """
        python_profile_stats = []

        self._set_up_profile_dir()

        list_request = ListRequest(Bucket=self.bucket_name, Prefix=self.prefix)
        s3_filepaths = S3Handler.list_prefix(list_request)
        object_requests = [
            ReadObjectRequest(
                os.path.join("s3://", self.bucket_name, s3_filepath))
            for s3_filepath in s3_filepaths
        ]
        objects = S3Handler.get_objects(object_requests)

        for full_s3_filepath, object_data in zip(s3_filepaths, objects):
            if os.path.basename(full_s3_filepath) not in (
                    CPROFILE_STATS_FILENAME,
                    PYINSTRUMENT_JSON_FILENAME,
                    PYINSTRUMENT_HTML_FILENAME,
            ):
                get_logger().info(
                    f"Unknown file {full_s3_filepath} found, skipping...")
                continue

            path_components = full_s3_filepath.split("/")
            framework, profiler_name, node_id, stats_dir, stats_file = path_components[
                -5:]

            stats_dir_path = os.path.join(self.profile_dir, node_id, stats_dir)
            os.makedirs(stats_dir_path, exist_ok=True)
            stats_file_path = os.path.join(stats_dir_path, stats_file)

            with open(stats_file_path, "wb") as f:
                f.write(object_data)

            python_profile_stats.append(
                StepPythonProfileStats(framework, profiler_name, node_id,
                                       stats_dir, stats_file_path))
        python_profile_stats.sort(
            key=lambda x: (x.start_time_since_epoch_in_micros, x.node_id)
        )  # sort each step's stats by the step number, then node ID.
        return python_profile_stats
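The loop above assumes that the last five components of each S3 key encode the framework, profiler name, node ID, stats directory, and stats file. A minimal sketch of that split, using a made-up key that only illustrates the layout:

# Hypothetical key; only the trailing five components matter to the parsing above.
s3_filepath = "a/prefix/tensorflow/cprofile/node-1/some_stats_dir/python_stats"
framework, profiler_name, node_id, stats_dir, stats_file = s3_filepath.split("/")[-5:]
# -> ("tensorflow", "cprofile", "node-1", "some_stats_dir", "python_stats")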
Example 5
    def parse_event_files(self, event_files):
        file_read_requests = []
        event_files_to_read = []

        for event_file in event_files:
            if event_file not in self._parsed_files:
                self.logger.debug(f"Will request s3 object {event_file}")
                event_files_to_read.append(event_file)
                file_read_requests.append(ReadObjectRequest(path=event_file))

        event_data_list = S3Handler.get_objects(file_read_requests)
        self.logger.debug(f"Got results back from s3 for {event_files}")
        for event_data, event_file in zip(event_data_list,
                                          event_files_to_read):
            self.logger.debug(f"Will parse events in event file:{event_file}")
            if event_file.endswith(
                    "json.gz") and is_valid_tfprof_tracefilename(event_file):
                self._get_event_parser(event_file).read_events_from_file(
                    event_file)
                self._parsed_files.add(event_file)
            else:
                if is_valid_tracefilename(event_file):
                    event_string = event_data.decode("utf-8")
                    json_data = json.loads(event_string)
                    node_id = get_node_id_from_tracefilename(event_file)
                    self._get_event_parser(
                        event_file).read_events_from_json_data(
                            json_data, node_id)
                    self._parsed_files.add(event_file)
                else:
                    self.logger.info(
                        f"Invalid tracefilename:{event_file} . Skipping.")
Example 6
    def _get_trace_events_json(self, tracefile):
        try:
            s3, bucket_name, key_name = is_s3(tracefile)
            if s3:
                object_requests = ReadObjectRequest(os.path.join("s3://", bucket_name, key_name))
                objects = S3Handler.get_objects([object_requests])
                unzipped = zlib.decompress(objects[0], zlib.MAX_WBITS | 16)
                trace_json_data = json.loads(unzipped.decode("utf-8"))
            else:
                with gzip.GzipFile(tracefile, "r") as fin:
                    trace_json_data = json.loads(fin.read().decode("utf-8"))
        except Exception as e:
            self.logger.error(f"Can't open TF trace file {tracefile}: Exception {str(e)} ")
            return None
        if "traceEvents" not in trace_json_data:
            self.logger.error(f"The TF trace file {tracefile} does not contain traceEvents")
            return None
        trace_events_json = trace_json_data["traceEvents"]
        _, start_time_in_micros, _ = read_tf_profiler_metadata_file(tracefile)
        # the first time profiler.start() is called is considered the start time
        # for TF profiler
        metadata = []
        args = {"start_time_since_epoch_in_micros": int(start_time_in_micros)}
        json_dict = {"name": "process_name", "ph": "M", "pid": 0, "args": args}
        metadata.append(json_dict)
        args = {"sort_index": 0}
        json_dict = {"name": "process_sort_index", "ph": "M", "pid": 0, "args": args}
        metadata.append(json_dict)

        # insert metadata at the beginning of trace events json
        trace_events_json = metadata + trace_events_json
        return trace_events_json
Example 7
    def read_index_files(
            self,
            start_after_key: str,
            range_steps=None) -> Tuple[List[bytes], list, str, List[str]]:
        """
            Read files like `trial_{datetime}/index/000/{step}_{worker}.json.
        :param start_after_key: str
        :param range_steps:
        :return: Tuple( responses, steps, start_after_key, workers)
        """
        object_requests = []
        steps = []
        workers = []
        index_files, start_after_key = self.list_index_files(start_after_key)
        self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
        for index_file in index_files:
            if self.index_file_cache.has_not_read(index_file):
                step = IndexFileLocationUtils.parse_step_from_index_file_name(
                    index_file)
                if (range_steps is not None and step_in_range(
                        range_steps, step)) or range_steps is None:
                    steps.append(step)
                    workers.append(parse_worker_name_from_file(index_file))
                    object_requests.append(
                        ReadObjectRequest(
                            format(f"s3://{self.bucket_name}/") + index_file))
                self.index_file_cache.add(index_file, start_after_key)

        responses = S3Handler.get_objects(object_requests)
        return responses, steps, start_after_key, workers
Example 8
    def __init__(
        self,
        name,
        bucket_name,
        prefix_name,
        range_steps=None,
        check=False,
        index_mode=True,
        cache=False,
    ):
        """
        :param name: for a sagemaker job, this should be the sagemaker training job name
        :param bucket_name: name of the bucket where data is saved
        :param prefix_name: name of the prefix such that s3://bucket/prefix is where data is saved
        :param range_steps: a tuple representing (start_step, end_step).
                            Only data from steps within this range will be loaded
        :param check: whether to check the checksum of saved data
        """
        super().__init__(
            name,
            range_steps=range_steps,
            parallel=False,
            check=check,
            index_mode=index_mode,
            cache=cache,
        )
        self.logger.info(
            f"Loading trial {name} at path s3://{bucket_name}/{prefix_name}")
        self.bucket_name = bucket_name
        self.prefix_name = os.path.join(prefix_name, "")
        self.path = "s3://" + os.path.join(self.bucket_name, self.prefix_name)
        self.index_reader = S3IndexReader(self.path)
        self.s3_handler = S3Handler()
        self._load_collections()
        self._load_tensors()
Example 9
    def __init__(self, path):
        super().__init__(path)
        self.path = path
        _, self.bucket_name, self.prefix_name = is_s3(path)
        self.s3_handler = S3Handler()

        self.index_file_cache = ReadIndexFilesCache()
Example 10
    def _read_collections(self, collection_files):
        first_collection_file = collection_files[0]  # First Collection File
        key = os.path.join(first_collection_file)
        collections_req = ReadObjectRequest(self._get_s3_location(key))
        obj_data = S3Handler.get_objects([collections_req])[0]
        obj_data = obj_data.decode("utf-8")
        self.collection_manager = CollectionManager.load_from_string(obj_data)
        self.num_workers = self.collection_manager.get_num_workers()
Example 11
def delete_s3_prefix(bucket, prefix):
    s3_handler = S3Handler()
    list_req = [ListRequest(Bucket=bucket, Prefix=prefix)]
    keys = s3_handler.list_prefixes(list_req)[0]

    loop = asyncio.get_event_loop()
    task = loop.create_task(del_prefix_helper(bucket, keys))
    loop.run_until_complete(task)
Example 12
def check_performance():
    import time
    import multiprocessing

    kb = 1024
    mb = 1024 * 1024
    sizes = [10 * kb, 100 * kb, 500 * kb]  # , mb, 5 * mb, 10 * mb]
    num_files = [100, 1000, 10000]  # , 10000]  # , 100000]  # , 1000000]
    files_path = "smdebug-testing/resources/test_performance"
    times = []
    print("Size\tNumFiles\tPool size\tSync with multiprocessing")
    pool_sizes = [
        2 * multiprocessing.cpu_count(),
        4 * multiprocessing.cpu_count(),
        8 * multiprocessing.cpu_count(),
    ]
    for size in sizes:
        timesrow = []
        for nf in num_files:
            timesrow_for_pools = []
            for pool_size in pool_sizes:
                j = 0
                S3Handler.MULTIPROCESSING_POOL_SIZE = pool_size
                times_to_be_averaged = []
                reqs = [
                    ReadObjectRequest(f"s3://{files_path}/{size}/{i}.dummy")
                    for i in range(nf)
                ]
                while j < 10:
                    sync_start = time.time()
                    S3Handler.get_objects(reqs, use_multiprocessing=True)
                    sync_end = time.time()
                    times_to_be_averaged.append(sync_end - sync_start)
                    j += 1
                timesrow_for_pools.append(
                    round(
                        sum(times_to_be_averaged) / len(times_to_be_averaged),
                        2))
            timesrow.append(timesrow_for_pools)
            print(f"{size} {nf} {pool_sizes} {timesrow_for_pools}")
        times.append(timesrow)
        print(f"Finished testing for {size}", times[-1])
Example 13
def test_download_objects():
    s = uuid.uuid4()
    prefix = "test_get_objects/" + str(s)
    f = TSAccessS3("smdebugcodebuildtest", prefix, binary=False)
    f.write("a" * 100)
    f.write("b" * 200)
    f.write("c" * 300)
    f.close()
    r1 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix)
    r2 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix, start=100)
    r3 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix,
                           start=100,
                           length=200)
    objects = S3Handler.get_objects([r1, r2, r3])
    # r1 reads the whole object, r2 reads from byte offset 100 to the end,
    # and r3 reads 200 bytes starting at offset 100
    assert objects[0].decode("ascii") == "a" * 100 + "b" * 200 + "c" * 300
    assert objects[1].decode("ascii") == "b" * 200 + "c" * 300, len(
        objects[1].decode("ascii"))
    assert objects[2].decode("ascii") == "b" * 200

    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)
Example 14
    def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray:
        event_file_name = tensor_location.event_file_name

        if not self._is_event_file_present(event_file_name):
            self.event_file_present_loop(tensor_location)

        start = tensor_location.start_idx
        length = tensor_location.length
        request = [ReadObjectRequest(event_file_name, int(start), int(length))]
        res = S3Handler.get_objects(request)
        tr = TensorReader(res[0])  # Access the only element in res
        tensor_tuple = list(tr.read_tensors())[0]  # Access the only element in the list
        tensor_name, step, tensor_data, mode, mode_step = tensor_tuple
        return tensor_data
Example 15
def delete_s3_prefixes(bucket, keys):
    s3_handler = S3Handler()
    if not isinstance(keys, list):
        keys = [keys]
    list_prefixes = s3_handler.list_prefixes(
        [ListRequest(Bucket=bucket, Prefix=key) for key in keys])
    prefixes = [item for sublist in list_prefixes for item in sublist]
    loop = asyncio.get_event_loop()

    async def del_folder(bucket, keys):
        loop = asyncio.get_event_loop()
        client = aioboto3.client("s3", loop=loop, region_name=get_region())
        await asyncio.gather(
            *[client.delete_object(Bucket=bucket, Key=key) for key in keys])
        await client.close()

    task = loop.create_task(del_folder(bucket, prefixes))
    loop.run_until_complete(task)
Example 16
    def parse_event_files(self, event_files):
        file_read_requests = []
        event_files_to_read = []

        for event_file in event_files:
            if event_file not in self._parsed_files:
                event_files_to_read.append(event_file)
                file_read_requests.append(ReadObjectRequest(path=event_file))

        event_data_list = S3Handler.get_objects(file_read_requests)
        for event_data, event_file in zip(event_data_list, event_files_to_read):
            event_string = event_data.decode("utf-8")
            event_items = event_string.split("\n")
            event_items.remove("")
            for item in event_items:
                event = json.loads(item)
                self._SystemProfilerEventParser.read_event_from_dict(event)
            self._parsed_files.add(event_file)
Example 17
def file_exists(file_path):
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            request = ListRequest(bucket_name, key_name)
            file_available = S3Handler.list_prefixes([request])[0]
            if len(file_available) > 0:
                return True
            else:
                return False
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            if str(status_code).startswith("4"):
                raise ex
            else:
                return False
    else:
        return os.path.exists(file_path)
Example 18
def test_merge_timeline_s3_write():
    bucket_name = "smdebug-testing"
    key_name = f"outputs/smprofiler-timeline-merge-test-{uuid.uuid4()}"
    location = "s3://{}/{}".format(bucket_name, key_name)

    tracefolder = "./tests/profiler/resources/merge_traces"
    combined_timeline = MergedTimeline(tracefolder, output_directory=location)
    combined_timeline.merge_timeline(0,
                                     time.time() * CONVERT_TO_MICROSECS,
                                     unit=MergeUnit.TIME)

    start_step, end_step = 2, 4
    tracefolder = "s3://smdebug-testing/resources/tf2_detailed_profile/profiler-output"
    combined_timeline = MergedTimeline(tracefolder, output_directory=location)
    combined_timeline.merge_timeline(start_step, end_step, unit=MergeUnit.STEP)

    request = ListRequest(bucket_name, key_name)
    files = S3Handler.list_prefixes([request])
    assert len(files) == 1
    assert len(files[0]) == 2
Example 19
def has_training_ended(trial_prefix):
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            s3_handler = S3Handler()
            request = ListRequest(bucket_name, key_name)
            file_available = s3_handler.list_prefixes([request])[0]
            if len(file_available) > 0:
                return True
            else:
                return False
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            if str(status_code).startswith("4"):
                raise ex
            else:
                return False
    else:
        return os.path.exists(file_path)
Example 20
    def _refresh_event_file_list_s3_mode(self, list_dir):
        event_files = [
            x for x in S3Handler.list_prefix(list_dir)
            if re.search(self._get_event_file_regex(), x)
        ]
        for event_file in event_files:
            event_file_full_path = f"s3://{list_dir.bucket}/{event_file}"
            timestamp = self._get_timestamp_from_filename(event_file_full_path)
            if timestamp is None:
                self.logger.debug(
                    f"Unable to find timestamp from event file name {event_file}."
                )
                continue
            if timestamp in self._timestamp_to_filename:
                if event_file_full_path not in self._timestamp_to_filename[
                        timestamp]:
                    self._timestamp_to_filename[timestamp].append(
                        event_file_full_path)
            else:
                self._timestamp_to_filename[timestamp] = [event_file_full_path]
        for timestamp in self._timestamp_to_filename:
            self._timestamp_to_filename[timestamp].sort()
        self._update_start_after_prefix()
Example 21
def check_dir_exists(path):
    from smdebug.core.access_layer.s3handler import S3Handler, ListRequest

    s3, bucket_name, key_name = is_s3(path)
    if s3:
        try:
            request = ListRequest(bucket_name, key_name)
            folder = S3Handler.list_prefixes([request])[0]
            if len(folder) > 0 and has_training_ended(folder[-1]):
                raise RuntimeError("The path:{} already exists on s3. "
                                   "Please provide a directory path that does "
                                   "not already exist.".format(path))
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchBucket":
                # then we do not need to raise any error
                pass
            else:
                # do not know the error
                raise ex
    elif os.path.exists(path) and has_training_ended(path):
        raise RuntimeError("The path:{} already exists on local disk. "
                           "Please provide a directory path that does "
                           "not already exist".format(path))
Example 22
def read_tf_profiler_metadata_file(file_path):
    if not is_valid_tfprof_tracefilename(file_path):
        return "", "0", "0"
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            folder_name = "/".join(key_name.split("/")[:-4])
            request = ListRequest(bucket_name, folder_name)
            file_available = S3Handler.list_prefixes([request])
            if len(file_available) > 0:
                metadata_filename = list(
                    filter(lambda x: ".metadata" in x, file_available[0]))
                if len(metadata_filename) > 0:
                    metadata_filename = metadata_filename[0]
                    metadata_filename = metadata_filename.split("/")[-1]
                    node_id, start, end = str(metadata_filename).split("_")
                    return node_id, start, end.split(".")[0]
                else:
                    return "", "0", "0"
            else:
                return "", "0", "0"
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            if str(status_code).startswith("4"):
                raise ex
            else:
                return "", "0", "0"
    else:
        folder_name = "/".join(file_path.split("/")[:-4])
        metadata_filename = list(Path(folder_name).rglob("*.metadata"))
        if len(metadata_filename) > 0:
            metadata_filename = metadata_filename[0].name
            node_id, start, end = str(metadata_filename).split("_")
            return node_id, start, end.split(".")[0]
        else:
            return "", "0", "0"
Example 23
def _list_s3_prefixes(list_info):
    files = S3Handler.list_prefixes(list_info)
    if len(files) == 1:
        files = files[0]
    return files
Example 24
def delete_s3_prefix(bucket, prefix):
    S3Handler.delete_prefix(
        delete_request=DeleteRequest(Bucket=bucket, Prefix=prefix))
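The examples in this collection delete S3 prefixes in two ways: Examples 1 and 3 pass a full s3:// path via delete_prefix(path=...), while this last example builds a DeleteRequest with separate Bucket and Prefix fields. A side-by-side sketch with placeholder names; the import path for DeleteRequest is assumed to match the S3Handler import shown in Example 21:

from smdebug.core.access_layer.s3handler import DeleteRequest, S3Handler

# Placeholder bucket and prefix.
S3Handler.delete_prefix(path="s3://my-bucket/some/prefix")
S3Handler.delete_prefix(delete_request=DeleteRequest(Bucket="my-bucket", Prefix="some/prefix"))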