def test_list_objects():
    s = uuid.uuid4()
    prefix = "test_list_objects/" + str(s)
    for i in [0, 3, 7, 11]:
        f = TSAccessS3("smdebugcodebuildtest", prefix + "/" + format(i, "02"))
        f.write(b"a")
        f.close()

    req1 = ListRequest(Bucket="smdebugcodebuildtest", Prefix=prefix)
    req2 = ListRequest(Bucket="smdebugcodebuildtest", Prefix="test_list_objects/", Delimiter="/")
    req3 = ListRequest(Bucket="smdebugcodebuildtest", Prefix=prefix, StartAfter=prefix + "/0")
    req4 = ListRequest(Bucket="smdebugcodebuildtest", Prefix=prefix, StartAfter=prefix + "/03")
    req5 = ListRequest(Bucket="smdebugcodebuildtest", Prefix=prefix + "/0")
    files = S3Handler.list_prefixes([req1, req2, req3, req4, req5])

    # test StartAfter and delimiters
    assert len(files[0]) == 4
    assert prefix + "/" in files[1]
    assert len(files[2]) == 4
    assert len(files[3]) == 2
    assert len(files[4]) == 3

    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)

def delete_s3_prefixes(bucket, keys):
    if not isinstance(keys, list):
        keys = [keys]
    delreqs = []
    for key in keys:
        delreqs.append(DeleteRequest(bucket, key))
    S3Handler.delete_prefixes(delreqs)

def test_delete_prefix():
    s = uuid.uuid4()
    prefix = "test_delete_prefix/" + str(s)
    for i in range(3):
        f = TSAccessS3("smdebugcodebuildtest", prefix + "/" + str(i))
        f.write(b"a")
        f.close()

    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)
    entries = S3Handler.list_prefix(ListRequest("smdebugcodebuildtest", prefix))
    assert len(entries) == 0

def load_python_profile_stats(self):
    """Load the stats by creating the profile directory, downloading each stats directory
    from S3 into the profile directory, parsing the metadata from each stats directory name,
    and creating a StepPythonProfileStats entry for the stats file in each stats directory.

    For cProfile, the stats file name is `python_stats`.
    For pyinstrument, the stats file name is `python_stats.json`.
    """
    python_profile_stats = []

    self._set_up_profile_dir()

    list_request = ListRequest(Bucket=self.bucket_name, Prefix=self.prefix)
    s3_filepaths = S3Handler.list_prefix(list_request)
    object_requests = [
        ReadObjectRequest(os.path.join("s3://", self.bucket_name, s3_filepath))
        for s3_filepath in s3_filepaths
    ]
    objects = S3Handler.get_objects(object_requests)

    for full_s3_filepath, object_data in zip(s3_filepaths, objects):
        if os.path.basename(full_s3_filepath) not in (
            CPROFILE_STATS_FILENAME,
            PYINSTRUMENT_JSON_FILENAME,
            PYINSTRUMENT_HTML_FILENAME,
        ):
            get_logger().info(f"Unknown file {full_s3_filepath} found, skipping...")
            continue

        path_components = full_s3_filepath.split("/")
        framework, profiler_name, node_id, stats_dir, stats_file = path_components[-5:]

        stats_dir_path = os.path.join(self.profile_dir, node_id, stats_dir)
        os.makedirs(stats_dir_path, exist_ok=True)
        stats_file_path = os.path.join(stats_dir_path, stats_file)

        with open(stats_file_path, "wb") as f:
            f.write(object_data)

        python_profile_stats.append(
            StepPythonProfileStats(framework, profiler_name, node_id, stats_dir, stats_file_path)
        )

    # Sort the stats by start time since epoch, then node ID.
    python_profile_stats.sort(key=lambda x: (x.start_time_since_epoch_in_micros, x.node_id))
    return python_profile_stats

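# Illustrative sketch (not part of the original module): the list-then-download pattern
# the loader above relies on, i.e. listing a prefix with ListRequest and fetching every
# object in one batched S3Handler.get_objects call. The bucket, prefix, and output
# directory are placeholder values, and the imports assume the request classes live
# alongside S3Handler in smdebug.core.access_layer.s3handler.
def download_prefix_to_dir(bucket="my-bucket", prefix="profiler-output/", out_dir="/tmp/stats"):
    import os

    from smdebug.core.access_layer.s3handler import ListRequest, ReadObjectRequest, S3Handler

    keys = S3Handler.list_prefix(ListRequest(Bucket=bucket, Prefix=prefix))
    requests = [ReadObjectRequest(f"s3://{bucket}/{key}") for key in keys]
    os.makedirs(out_dir, exist_ok=True)
    for key, data in zip(keys, S3Handler.get_objects(requests)):
        # Write each downloaded object next to the others, keyed by its basename.
        with open(os.path.join(out_dir, os.path.basename(key)), "wb") as f:
            f.write(data)
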
def parse_event_files(self, event_files):
    file_read_requests = []
    event_files_to_read = []

    for event_file in event_files:
        if event_file not in self._parsed_files:
            self.logger.debug(f"Will request s3 object {event_file}")
            event_files_to_read.append(event_file)
            file_read_requests.append(ReadObjectRequest(path=event_file))

    event_data_list = S3Handler.get_objects(file_read_requests)
    self.logger.debug(f"Got results back from s3 for {event_files}")

    for event_data, event_file in zip(event_data_list, event_files_to_read):
        self.logger.debug(f"Will parse events in event file: {event_file}")
        if event_file.endswith("json.gz") and is_valid_tfprof_tracefilename(event_file):
            self._get_event_parser(event_file).read_events_from_file(event_file)
            self._parsed_files.add(event_file)
        else:
            if is_valid_tracefilename(event_file):
                event_string = event_data.decode("utf-8")
                json_data = json.loads(event_string)
                node_id = get_node_id_from_tracefilename(event_file)
                self._get_event_parser(event_file).read_events_from_json_data(json_data, node_id)
                self._parsed_files.add(event_file)
            else:
                self.logger.info(f"Invalid tracefilename: {event_file}. Skipping.")

def _get_trace_events_json(self, tracefile):
    try:
        s3, bucket_name, key_name = is_s3(tracefile)
        if s3:
            object_requests = ReadObjectRequest(os.path.join("s3://", bucket_name, key_name))
            objects = S3Handler.get_objects([object_requests])
            unzipped = zlib.decompress(objects[0], zlib.MAX_WBITS | 16)
            trace_json_data = json.loads(unzipped.decode("utf-8"))
        else:
            with gzip.GzipFile(tracefile, "r") as fin:
                trace_json_data = json.loads(fin.read().decode("utf-8"))
    except Exception as e:
        self.logger.error(f"Can't open TF trace file {tracefile}: Exception {str(e)}")
        return None

    if "traceEvents" not in trace_json_data:
        self.logger.error(f"The TF trace file {tracefile} does not contain traceEvents")
        return None

    trace_events_json = trace_json_data["traceEvents"]
    # The first time profiler.start() is called is considered the start time for the TF profiler.
    _, start_time_in_micros, _ = read_tf_profiler_metadata_file(tracefile)

    metadata = []
    args = {"start_time_since_epoch_in_micros": int(start_time_in_micros)}
    json_dict = {"name": "process_name", "ph": "M", "pid": 0, "args": args}
    metadata.append(json_dict)
    args = {"sort_index": 0}
    json_dict = {"name": "process_sort_index", "ph": "M", "pid": 0, "args": args}
    metadata.append(json_dict)

    # Insert metadata at the beginning of the trace events JSON.
    trace_events_json = metadata + trace_events_json
    return trace_events_json

def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.

    :param start_after_key: str
    :param range_steps:
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    object_requests = []
    steps = []
    workers = []
    index_files, start_after_key = self.list_index_files(start_after_key)
    self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')

    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if (range_steps is not None and step_in_range(range_steps, step)) or range_steps is None:
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                object_requests.append(ReadObjectRequest(f"s3://{self.bucket_name}/" + index_file))
                self.index_file_cache.add(index_file, start_after_key)

    responses = S3Handler.get_objects(object_requests)
    return responses, steps, start_after_key, workers

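# Illustrative sketch (not the actual IndexFileLocationUtils / parse_worker_name_from_file
# implementations): pulling the step number and worker name out of an index file path of the
# form `.../index/000/{step}_{worker}.json` noted in the docstring above. The sample path in
# the comment is hypothetical.
def _parse_step_and_worker(index_file: str):
    import os

    basename = os.path.splitext(os.path.basename(index_file))[0]  # "{step}_{worker}"
    step_str, _, worker = basename.partition("_")
    return int(step_str), worker


# Example: _parse_step_and_worker("trial_20200101/index/000/000000000012_worker_0.json")
# would return (12, "worker_0").
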
def __init__(
    self,
    name,
    bucket_name,
    prefix_name,
    range_steps=None,
    check=False,
    index_mode=True,
    cache=False,
):
    """
    :param name: for a SageMaker job, this should be the SageMaker training job name
    :param bucket_name: name of the bucket where data is saved
    :param prefix_name: name of the prefix such that s3://bucket/prefix is where data is saved
    :param range_steps: a tuple representing (start_step, end_step).
        Only data from steps within this range will be loaded
    :param check: whether to verify the checksum of saved data
    """
    super().__init__(
        name,
        range_steps=range_steps,
        parallel=False,
        check=check,
        index_mode=index_mode,
        cache=cache,
    )
    self.logger.info(f"Loading trial {name} at path s3://{bucket_name}/{prefix_name}")
    self.bucket_name = bucket_name
    self.prefix_name = os.path.join(prefix_name, "")
    self.path = "s3://" + os.path.join(self.bucket_name, self.prefix_name)
    self.index_reader = S3IndexReader(self.path)
    self.s3_handler = S3Handler()
    self._load_collections()
    self._load_tensors()

def __init__(self, path):
    super().__init__(path)
    self.path = path
    _, self.bucket_name, self.prefix_name = is_s3(path)
    self.s3_handler = S3Handler()
    self.index_file_cache = ReadIndexFilesCache()

def _read_collections(self, collection_files):
    first_collection_file = collection_files[0]  # first collection file
    key = os.path.join(first_collection_file)
    collections_req = ReadObjectRequest(self._get_s3_location(key))
    obj_data = S3Handler.get_objects([collections_req])[0]
    obj_data = obj_data.decode("utf-8")
    self.collection_manager = CollectionManager.load_from_string(obj_data)
    self.num_workers = self.collection_manager.get_num_workers()

def delete_s3_prefix(bucket, prefix):
    s3_handler = S3Handler()
    list_req = [ListRequest(Bucket=bucket, Prefix=prefix)]
    keys = s3_handler.list_prefixes(list_req)[0]
    loop = asyncio.get_event_loop()
    task = loop.create_task(del_prefix_helper(bucket, keys))
    loop.run_until_complete(task)

def check_performance():
    import time
    import multiprocessing

    kb = 1024
    mb = 1024 * 1024
    sizes = [10 * kb, 100 * kb, 500 * kb]  # , mb, 5 * mb, 10 * mb]
    num_files = [100, 1000, 10000]  # , 10000]  # , 100000]  # , 1000000]
    files_path = "smdebug-testing/resources/test_performance"
    times = []
    print("Size\tNumFiles\tPool size\tSync with multiprocessing")
    pool_sizes = [
        2 * multiprocessing.cpu_count(),
        4 * multiprocessing.cpu_count(),
        8 * multiprocessing.cpu_count(),
    ]
    for size in sizes:
        timesrow = []
        for nf in num_files:
            timesrow_for_pools = []
            for pool_size in pool_sizes:
                j = 0
                S3Handler.MULTIPROCESSING_POOL_SIZE = pool_size
                times_to_be_averaged = []
                reqs = [
                    ReadObjectRequest(f"s3://{files_path}/{size}/{i}.dummy") for i in range(nf)
                ]
                while j < 10:
                    sync_start = time.time()
                    S3Handler.get_objects(reqs, use_multiprocessing=True)
                    sync_end = time.time()
                    times_to_be_averaged.append(sync_end - sync_start)
                    j += 1
                timesrow_for_pools.append(
                    round(sum(times_to_be_averaged) / len(times_to_be_averaged), 2)
                )
            timesrow.append(timesrow_for_pools)
            print(f"{size} {nf} {pool_sizes} {timesrow_for_pools}")
        times.append(timesrow)
        print(f"Finished testing for {size}", times[-1])

def test_download_objects():
    s = uuid.uuid4()
    prefix = "test_get_objects/" + str(s)
    f = TSAccessS3("smdebugcodebuildtest", prefix, binary=False)
    f.write("a" * 100)
    f.write("b" * 200)
    f.write("c" * 300)
    f.close()

    r1 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix)
    r2 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix, start=100)
    r3 = ReadObjectRequest("s3://smdebugcodebuildtest/" + prefix, start=100, length=200)
    objects = S3Handler.get_objects([r1, r2, r3])

    assert objects[0].decode("ascii") == "a" * 100 + "b" * 200 + "c" * 300
    assert objects[1].decode("ascii") == "b" * 200 + "c" * 300, len(objects[1].decode("ascii"))
    assert objects[2].decode("ascii") == "b" * 200

    S3Handler.delete_prefix(path="s3://smdebugcodebuildtest/" + prefix)

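# Sketch for orientation (an assumption about ReadObjectRequest's semantics, not part of the
# test suite): start/length appear to behave like an HTTP Range GET on the object, so the
# ranged read r3 above roughly corresponds to the plain boto3 call below. Bucket and key are
# placeholders.
import boto3


def ranged_read(bucket, key, start, length):
    s3 = boto3.client("s3")
    # The Range header is inclusive on both ends, hence start + length - 1.
    resp = s3.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{start + length - 1}")
    return resp["Body"].read()
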
def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray:
    event_file_name = tensor_location.event_file_name

    if not self._is_event_file_present(event_file_name):
        self.event_file_present_loop(tensor_location)

    start = tensor_location.start_idx
    length = tensor_location.length
    request = [ReadObjectRequest(event_file_name, int(start), int(length))]
    res = S3Handler.get_objects(request)

    tr = TensorReader(res[0])  # access the only element in res
    tensor_tuple = list(tr.read_tensors())[0]  # access the only element in the list
    tensor_name, step, tensor_data, mode, mode_step = tensor_tuple
    return tensor_data

def delete_s3_prefixes(bucket, keys):
    s3_handler = S3Handler()
    if not isinstance(keys, list):
        keys = [keys]
    list_prefixes = s3_handler.list_prefixes(
        [ListRequest(Bucket=bucket, Prefix=key) for key in keys]
    )
    prefixes = [item for sublist in list_prefixes for item in sublist]
    loop = asyncio.get_event_loop()

    async def del_folder(bucket, keys):
        loop = asyncio.get_event_loop()
        client = aioboto3.client("s3", loop=loop, region_name=get_region())
        await asyncio.gather(*[client.delete_object(Bucket=bucket, Key=key) for key in keys])
        await client.close()

    task = loop.create_task(del_folder(bucket, prefixes))
    loop.run_until_complete(task)

def parse_event_files(self, event_files):
    file_read_requests = []
    event_files_to_read = []

    for event_file in event_files:
        if event_file not in self._parsed_files:
            event_files_to_read.append(event_file)
            file_read_requests.append(ReadObjectRequest(path=event_file))

    event_data_list = S3Handler.get_objects(file_read_requests)
    for event_data, event_file in zip(event_data_list, event_files_to_read):
        event_string = event_data.decode("utf-8")
        event_items = event_string.split("\n")
        event_items.remove("")
        for item in event_items:
            event = json.loads(item)
            self._SystemProfilerEventParser.read_event_from_dict(event)
        self._parsed_files.add(event_file)

def file_exists(file_path):
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            request = ListRequest(bucket_name, key_name)
            file_available = S3Handler.list_prefixes([request])[0]
            if len(file_available) > 0:
                return True
            else:
                return False
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            # HTTPStatusCode is an int, so convert it before the prefix check.
            if str(status_code).startswith("4"):
                raise ex
            else:
                return False
    else:
        return os.path.exists(file_path)

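# A minimal sketch, assuming boto3's ClientError layout: ResponseMetadata carries the HTTP
# status as an integer (e.g. 404), which is why the handlers above convert it to a string
# before the "4xx" prefix check. The bucket name is a placeholder.
import boto3
from botocore.exceptions import ClientError


def head_bucket_status(bucket="my-nonexistent-bucket"):
    try:
        boto3.client("s3").head_bucket(Bucket=bucket)
        return 200
    except ClientError as ex:
        return ex.response["ResponseMetadata"]["HTTPStatusCode"]  # int, e.g. 403 or 404
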
def test_merge_timeline_s3_write():
    bucket_name = "smdebug-testing"
    key_name = f"outputs/smprofiler-timeline-merge-test-{uuid.uuid4()}"
    location = "s3://{}/{}".format(bucket_name, key_name)

    tracefolder = "./tests/profiler/resources/merge_traces"
    combined_timeline = MergedTimeline(tracefolder, output_directory=location)
    combined_timeline.merge_timeline(0, time.time() * CONVERT_TO_MICROSECS, unit=MergeUnit.TIME)

    start_step, end_step = 2, 4
    tracefolder = "s3://smdebug-testing/resources/tf2_detailed_profile/profiler-output"
    combined_timeline = MergedTimeline(tracefolder, output_directory=location)
    combined_timeline.merge_timeline(start_step, end_step, unit=MergeUnit.STEP)

    request = ListRequest(bucket_name, key_name)
    files = S3Handler.list_prefixes([request])
    assert len(files) == 1
    assert len(files[0]) == 2

def has_training_ended(trial_prefix):
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            s3_handler = S3Handler()
            request = ListRequest(bucket_name, key_name)
            file_available = s3_handler.list_prefixes([request])[0]
            if len(file_available) > 0:
                return True
            else:
                return False
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            # HTTPStatusCode is an int, so convert it before the prefix check.
            if str(status_code).startswith("4"):
                raise ex
            else:
                return False
    else:
        return os.path.exists(file_path)

def _refresh_event_file_list_s3_mode(self, list_dir):
    event_files = [
        x for x in S3Handler.list_prefix(list_dir) if re.search(self._get_event_file_regex(), x)
    ]
    for event_file in event_files:
        event_file_full_path = f"s3://{list_dir.bucket}/{event_file}"
        timestamp = self._get_timestamp_from_filename(event_file_full_path)
        if timestamp is None:
            self.logger.debug(f"Unable to find timestamp from event file name {event_file}.")
            continue
        if timestamp in self._timestamp_to_filename:
            if event_file_full_path not in self._timestamp_to_filename[timestamp]:
                self._timestamp_to_filename[timestamp].append(event_file_full_path)
        else:
            self._timestamp_to_filename[timestamp] = [event_file_full_path]

    for timestamp in self._timestamp_to_filename:
        self._timestamp_to_filename[timestamp].sort()
    self._update_start_after_prefix()

def check_dir_exists(path):
    from smdebug.core.access_layer.s3handler import S3Handler, ListRequest

    s3, bucket_name, key_name = is_s3(path)
    if s3:
        try:
            request = ListRequest(bucket_name, key_name)
            folder = S3Handler.list_prefixes([request])[0]
            if len(folder) > 0 and has_training_ended(folder[-1]):
                raise RuntimeError(
                    "The path:{} already exists on s3. "
                    "Please provide a directory path that does "
                    "not already exist.".format(path)
                )
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchBucket":
                # then we do not need to raise any error
                pass
            else:
                # do not know the error
                raise ex
    elif os.path.exists(path) and has_training_ended(path):
        raise RuntimeError(
            "The path:{} already exists on local disk. "
            "Please provide a directory path that does "
            "not already exist".format(path)
        )

def read_tf_profiler_metadata_file(file_path):
    if not is_valid_tfprof_tracefilename(file_path):
        return "", "0", "0"

    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            folder_name = "/".join(key_name.split("/")[:-4])
            request = ListRequest(bucket_name, folder_name)
            file_available = S3Handler.list_prefixes([request])
            if len(file_available) > 0:
                metadata_filename = list(filter(lambda x: ".metadata" in x, file_available[0]))
                if len(metadata_filename) > 0:
                    metadata_filename = metadata_filename[0]
                    metadata_filename = metadata_filename.split("/")[-1]
                    node_id, start, end = str(metadata_filename).split("_")
                    return node_id, start, end.split(".")[0]
                else:
                    return "", "0", "0"
            else:
                return "", "0", "0"
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            # HTTPStatusCode is an int, so convert it before the prefix check.
            if str(status_code).startswith("4"):
                raise ex
            else:
                return "", "0", "0"
    else:
        folder_name = "/".join(file_path.split("/")[:-4])
        metadata_filename = list(Path(folder_name).rglob("*.metadata"))
        if len(metadata_filename) > 0:
            metadata_filename = metadata_filename[0].name
            node_id, start, end = str(metadata_filename).split("_")
            return node_id, start, end.split(".")[0]
        else:
            return "", "0", "0"

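# Illustrative sketch (hypothetical filename): the metadata files parsed above are expected to
# be named `{node_id}_{start}_{end}.metadata`, which is why a plain split on "_" yields the
# node ID plus the start and end timestamps.
def parse_metadata_filename(filename="node-1_1600000000000000_1600000001000000.metadata"):
    node_id, start, end = filename.split("_")
    return node_id, start, end.split(".")[0]


# parse_metadata_filename() -> ("node-1", "1600000000000000", "1600000001000000")
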
def _list_s3_prefixes(list_info):
    files = S3Handler.list_prefixes(list_info)
    if len(files) == 1:
        files = files[0]
    return files

def delete_s3_prefix(bucket, prefix):
    S3Handler.delete_prefix(delete_request=DeleteRequest(Bucket=bucket, Prefix=prefix))