def __init__(self, path):
    super().__init__(path)
    self.path = path
    _, self.bucket_name, self.prefix_name = is_s3(path)
    self.s3_handler = S3Handler()
    self.index_file_cache = ReadIndexFilesCache()
def help_test_multiple_trials(num_steps=20, num_tensors=10):
    trial_name = str(uuid.uuid4())
    bucket = "smdebug-testing"
    path = "s3://" + os.path.join(bucket, "outputs/")
    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["foo_" + str(i) for i in range(num_tensors)]
    c.export(path + trial_name, DEFAULT_COLLECTIONS_FILE_NAME)
    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            rank=0,
        )
    _, bucket, prefix = is_s3(os.path.join(path, trial_name))
    trial_obj = S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix)
    return trial_obj, trial_name
def use_s3_datasets():
    s3 = boto3.resource("s3")
    _, bucket, _ = is_s3(TEST_DATASET_S3_PATH)
    try:
        s3.meta.client.head_bucket(Bucket=bucket)
        return True
    except Exception:
        return False
def test_s3_training_end():
    s3dir = "s3://smdebugcodebuildtest/training_end_test_dir"
    _, bucket, key = is_s3(s3dir)
    f = TSAccessS3(bucket_name=bucket, key_name=key)
    f.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
def __init__(self, path, start=0, length=None):
    self.is_s3, self.bucket, self.path = is_s3(path)
    if not self.is_s3:
        self.path = path
        self.bucket = None
    assert start >= 0 and (start == 0 or length is not None)
    self.start = start
    self.length = length
    self.download_entire_file = self.start == 0 and self.length is None
def test_s3_training_end():
    s3key = str(uuid.uuid4())
    s3dir = f"s3://smdebugcodebuildtest/ok_to_delete_{s3key}"
    _, bucket, key = is_s3(s3dir)
    f = TSAccessS3(bucket_name=bucket, key_name=key)
    f.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
def __init__(self, path, write_checksum):
    self.write_checksum = write_checksum
    s3, bucket_name, key_name = is_s3(path)
    try:
        if s3:
            self._writer = TSAccessS3(bucket_name, key_name)
        else:
            self._writer = TSAccessFile(path, "wb")
    except (OSError, IOError) as err:
        raise ValueError("failed to open {}: {}".format(path, str(err)))
def helper_test_mnist_trial(trial_dir):
    tr = create_trial(trial_dir)
    assert len(tr.steps()) == 3
    assert len(tr.steps(mode=smd.modes.TRAIN)) == 2
    assert len(tr.steps(mode=smd.modes.EVAL)) == 1
    assert len(tr.tensor_names()) == 13
    on_s3, bucket, prefix = is_s3(trial_dir)
    if not on_s3:
        shutil.rmtree(trial_dir, ignore_errors=True)
    else:
        delete_s3_prefix(bucket, prefix)
def create_trial(path, name=None, **kwargs):
    if name is None:
        name = os.path.basename(path)
    s3, bucket_name, prefix_name = is_s3(path)
    if s3:
        return S3Trial(name=name, bucket_name=bucket_name, prefix_name=prefix_name, **kwargs)
    else:
        return LocalTrial(name=name, dirname=path, **kwargs)
def __init__(self, path, start=None, length=None):
    self.is_s3, self.bucket, self.key = is_s3(path)
    if not self.is_s3:
        self.key = path
        self.bucket = None
    if start is None:
        self.start = 0
    else:
        self.start = start
    self.length = length
    self.download_entire_file = start is None
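# Usage sketch (assumption: the two __init__ variants above belong to a
# ReadObjectRequest-style class consumed by S3Handler; the path is a placeholder).
# Omitting start/length downloads the whole object; supplying them requests a byte range.
whole = ReadObjectRequest("s3://my-bucket/prefix/data.tfevents")
ranged = ReadObjectRequest("s3://my-bucket/prefix/data.tfevents", start=128, length=64)
assert whole.download_entire_file and not ranged.download_entire_file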
def delete_prefix(path=None, delete_request=None):
    if path is not None and delete_request is not None:
        raise ValueError("Only one of path or delete_request can be passed")
    elif path is not None:
        on_s3, bucket, prefix = is_s3(path)
        if on_s3 is False:
            raise ValueError("Given path is not an S3 location")
        delete_requests = [DeleteRequest(bucket, prefix)]
        S3Handler.delete_prefixes(delete_requests)
    elif delete_request is not None:
        S3Handler.delete_prefixes([delete_request])
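# Usage sketch for delete_prefix (bucket and prefix names are placeholders): pass
# exactly one of `path` or `delete_request`; passing both, or a non-S3 path,
# raises ValueError.
delete_prefix(path="s3://my-bucket/old-trial/")
delete_prefix(delete_request=DeleteRequest("my-bucket", "old-trial/"))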
def __init__(self, path):
    s3, bucket_name, key_name = is_s3(path)
    try:
        if s3:
            self._reader = TSAccessS3(bucket_name, key_name)
        else:
            self._reader = TSAccessFile(path, "rb")
    except (OSError, IOError) as err:
        raise ValueError("failed to open {}: {}".format(path, str(err)))
    self._reader.ingest_all()
def create_trial(path, name=None, **kwargs):
    path = path.strip()  # Remove any accidental leading/trailing whitespace input by the user
    if name is None:
        name = os.path.basename(path)
    s3, bucket_name, prefix_name = is_s3(path)
    if s3:
        return S3Trial(name=name, bucket_name=bucket_name, prefix_name=prefix_name, **kwargs)
    else:
        return LocalTrial(name=name, dirname=path, **kwargs)
def helper_test_multi_save_configs_trial(trial_dir):
    tr = create_trial(trial_dir)
    print(tr.steps(), tr.steps(mode=smd.modes.TRAIN), tr.steps(mode=smd.modes.EVAL))
    assert len(tr.steps()) == 4
    assert len(tr.steps(mode=smd.modes.TRAIN)) == 3
    assert len(tr.steps(mode=smd.modes.EVAL)) == 1
    assert len(tr.tensor_names()) == 1
    on_s3, bucket, prefix = is_s3(trial_dir)
    if not on_s3:
        shutil.rmtree(trial_dir)
    else:
        delete_s3_prefix(bucket, prefix)
def open(self, file_path):
    """Open the trace event file."""
    s3, bucket_name, key_name = is_s3(file_path)
    try:
        if s3:
            self._writer = TSAccessS3(bucket_name, key_name, binary=False)
        else:
            self._writer = TSAccessFile(file_path, "w")
    except (OSError, IOError) as err:
        logger.debug(f"Sagemaker-Debugger: failed to open {file_path}: {str(err)}")
    start, length = self._writer.write("[\n")
    self.bytes_written += length
def test_parse_worker_name_from_collection_file():
    path = "s3://smdebug-testing/resources/one-index-file"
    _, bucket_name, key_name = is_s3(path)
    collection_files, _ = list_s3_objects(bucket_name, get_path_to_collections(key_name))
    assert len(collection_files) == 1
    collection_file = collection_files[0]
    worker_name = get_worker_name_from_collection_file(collection_file)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:0"

    file_name = "/tmp/collections/000000000/job-worker_1_collections.json"
    worker_name = get_worker_name_from_collection_file(file_name)
    assert worker_name == "job-worker_1"
def test_parse_worker_name_from_index_file():
    filename = "/tmp/ts-logs/index/000000001/000000001230_worker_2.json"
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "worker_2"

    filename = "/tmp/ts-logs/index/000000000499__job-worker_replica-0_task-1_device-GPU-6.json"
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:6"

    path = "s3://smdebug-testing/resources/one-index-file"
    _, bucket, prefix = is_s3(path)
    index_reader = S3IndexReader(path)
    index_files, _ = index_reader.list_index_files()
    filename = index_files[0]
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:4"
def file_exists(file_path):
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            request = ListRequest(bucket_name, key_name)
            file_available = S3Handler.list_prefixes([request])[0]
            return len(file_available) > 0
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            # HTTPStatusCode is an int; the original called str.startswith("4") on it,
            # which would raise AttributeError. Re-raise only on 4xx client errors.
            if 400 <= status_code < 500:
                raise ex
            else:
                return False
    else:
        return os.path.exists(file_path)
def has_training_ended(trial_prefix):
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        try:
            s3_handler = S3Handler()
            request = ListRequest(bucket_name, key_name)
            file_available = s3_handler.list_prefixes([request])[0]
            return len(file_available) > 0
        except ClientError as ex:
            status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
            logger.info(f"Client error occurred : {ex}")
            # HTTPStatusCode is an int, so compare numerically rather than with
            # str.startswith("4") as the original did.
            if 400 <= status_code < 500:
                raise ex
            else:
                return False
    else:
        return os.path.exists(file_path)
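# Sketch of the prefix-existence idiom shared by file_exists and has_training_ended
# (bucket, key, and the end-of-job filename are placeholder assumptions): an S3
# existence check here is simply a prefix listing that returns a non-empty result.
request = ListRequest("my-bucket", "my-trial/training_job_end.ts")
exists = len(S3Handler.list_prefixes([request])[0]) > 0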
def create_trial(path, name=None, profiler=False, output_dir="/opt/ml/processing/outputs/", **kwargs):
    path = path.strip()  # Remove any accidental leading/trailing whitespace input by the user
    if name is None:
        name = os.path.basename(path)
    s3, bucket_name, prefix_name = is_s3(path)
    if profiler:
        return ProfilerTrial(name=name, trial_dir=path, output_dir=output_dir, **kwargs)
    if s3:
        return S3Trial(name=name, bucket_name=bucket_name, prefix_name=prefix_name, **kwargs)
    else:
        return LocalTrial(name=name, dirname=path, **kwargs)
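# Usage sketch for create_trial (paths are placeholders): an s3:// path dispatches
# to S3Trial, any other path to LocalTrial, and profiler=True short-circuits to
# ProfilerTrial regardless of where the trial data lives.
s3_trial = create_trial("s3://my-bucket/my-trial")  # -> S3Trial
local_trial = create_trial("/tmp/my-trial")         # -> LocalTrial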
def check_dir_exists(path):
    from smdebug.core.access_layer.s3handler import S3Handler, ListRequest

    s3, bucket_name, key_name = is_s3(path)
    if s3:
        try:
            request = ListRequest(bucket_name, key_name)
            folder = S3Handler.list_prefixes([request])[0]
            if len(folder) > 0 and has_training_ended(folder[-1]):
                raise RuntimeError(
                    "The path:{} already exists on s3. "
                    "Please provide a directory path that does "
                    "not already exist.".format(path)
                )
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchBucket":
                # the bucket does not exist, so there is nothing to collide with
                pass
            else:
                # unknown error, so propagate it
                raise ex
    elif os.path.exists(path) and has_training_ended(path):
        raise RuntimeError(
            "The path:{} already exists on local disk. "
            "Please provide a directory path that does "
            "not already exist".format(path)
        )
def training_has_ended(trial_prefix):
    # Emit the end-of-training file only if the job is not running under SageMaker.
    if is_sagemaker_job():
        logger.info(
            "The end of training job file will not be written for jobs running under SageMaker."
        )
        return
    try:
        check_dir_exists(trial_prefix)
        # if the path does not exist, then we don't need to write a file
    except RuntimeError:
        # dir exists
        pass
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        writer = TSAccessS3(bucket_name, key_name, binary=False)
    else:
        writer = TSAccessFile(file_path, "a+")
    writer.flush()
    try:
        writer.close()
    except OSError:
        # The original docstring is truncated here; in local-mode distributed
        # training another worker may already have moved the end-of-job file,
        # so a failed close is safe to ignore.
        pass
def test_s3():
    rval = is_s3("s3://a/b")
    assert rval[0]
    assert rval[1] == "a"
    assert rval[2] == "b"
def test_normal():
    rval = is_s3("a/b/c")
    assert not rval[0]
def _init_writer(self):
    s3, bucket_name, key_name = is_s3(self.file_path)
    if s3:
        self.writer = TSAccessS3(bucket_name, key_name, binary=False)
    else:
        self.writer = TSAccessFile(self.file_path, "a+")
def test_s3_noprefix2():
    rval = is_s3("s3://a/")
    assert rval[0]
    assert rval[1] == "a"
    assert rval[2] == ""
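# A minimal sketch of the is_s3 helper that the tests above pin down (a hypothetical
# reimplementation; the real smdebug version may differ). It returns a tuple of
# (on_s3, bucket, prefix): "s3://a/b" -> (True, "a", "b"), "s3://a/" -> (True, "a", ""),
# and a plain local path -> (False, None, None).
def is_s3_sketch(path):
    if path.startswith("s3://"):
        bucket, _, prefix = path[len("s3://"):].partition("/")
        return True, bucket, prefix
    return False, None, None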
def _is_event_file_present(self, file):
    event_files = self.list_event_files()
    _, _, prefix = is_s3(file)
    return prefix in set(event_files)
def check_s3_trial(path, num_steps=20, num_tensors=10):
    _, bucket, prefix = is_s3(path)
    trial_obj = S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix)
    check_trial(trial_obj, num_steps=num_steps, num_tensors=num_tensors)