def __init__(self, path):
        """Initialize an S3-backed index reader for *path*.

        Splits the ``s3://`` URL into bucket and prefix, creates the S3
        handler, and sets up a cache of index files already read.
        """
        super().__init__(path)
        self.path = path
        # is_s3 returns (flag, bucket, prefix); the flag is discarded —
        # presumably this class is only constructed for S3 paths (TODO confirm).
        _, self.bucket_name, self.prefix_name = is_s3(path)
        self.s3_handler = S3Handler()

        # Remembers index files already fetched so they are not re-downloaded.
        self.index_file_cache = ReadIndexFilesCache()
Example #2
0
def help_test_multiple_trials(num_steps=20, num_tensors=10):
    """Create an S3 trial with synthetic data and return (trial_obj, trial_name).

    Writes a default collection file plus *num_steps* steps of dummy
    tensors named ``foo_0`` .. ``foo_{num_tensors-1}`` under a fresh
    UUID-named prefix in the ``smdebug-testing`` bucket.
    """
    trial_name = str(uuid.uuid4())
    bucket = "smdebug-testing"
    path = "s3://" + os.path.join(bucket, "outputs/")

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = [
        "foo_" + str(i) for i in range(num_tensors)
    ]
    # Export once: the original exported the identical file twice in a row,
    # which only overwrote the first upload with the same content.
    c.export(path + trial_name, DEFAULT_COLLECTIONS_FILE_NAME)
    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            rank=0,
        )
    _, bucket, prefix = is_s3(os.path.join(path, trial_name))
    trial_obj = S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix)
    return trial_obj, trial_name
Example #3
0
def use_s3_datasets():
    """Return True when the test-dataset S3 bucket is reachable, else False."""
    client = boto3.resource("s3").meta.client
    _, bucket, _ = is_s3(TEST_DATASET_S3_PATH)
    try:
        client.head_bucket(Bucket=bucket)
    except Exception:
        return False
    return True
def test_s3_training_end():
    """End-of-job marker round-trip against a fixed S3 test prefix."""
    s3dir = "s3://smdebugcodebuildtest/training_end_test_dir"
    _, bucket, key = is_s3(s3dir)
    TSAccessS3(bucket_name=bucket, key_name=key).close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
 def __init__(self, path, start=0, length=None):
     """Describe a (possibly partial) file read request.

     :param path: local path or ``s3://`` URL; S3 URLs are split into
         bucket + key, local paths keep ``bucket=None``.
     :param start: byte offset to start reading from (must be >= 0).
     :param length: number of bytes to read; required whenever start > 0.
     """
     self.is_s3, self.bucket, self.path = is_s3(path)
     if not self.is_s3:
         self.path = path
         self.bucket = None
     # A ranged read (start > 0) must also specify its length.
     assert start >= 0 and (start == 0 or length is not None)
     self.start = start
     self.length = length
     # start == 0 with no length means "fetch the whole object".
     self.download_entire_file = self.start == 0 and self.length is None
Example #6
0
def test_s3_training_end():
    """End-of-job marker round-trip using a unique, disposable S3 prefix."""
    unique = str(uuid.uuid4())
    s3dir = f"s3://smdebugcodebuildtest/ok_to_delete_{unique}"
    _, bucket, key = is_s3(s3dir)
    handle = TSAccessS3(bucket_name=bucket, key_name=key)
    handle.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
Example #7
0
 def __init__(self, path, write_checksum):
     """Open an event-file writer for *path* (S3 or local binary file).

     :param path: destination; ``s3://`` URLs go through TSAccessS3,
         anything else is opened as a local file in ``"wb"`` mode.
     :param write_checksum: flag stored for checksummed writes.
     :raises ValueError: when the destination cannot be opened.
     """
     self.write_checksum = write_checksum
     s3, bucket_name, key_name = is_s3(path)
     try:
         if s3:
             self._writer = TSAccessS3(bucket_name, key_name)
         else:
             self._writer = TSAccessFile(path, "wb")
     except (OSError, IOError) as err:
         # Chain the original error so the underlying cause is preserved.
         raise ValueError("failed to open {}: {}".format(path, str(err))) from err
def helper_test_mnist_trial(trial_dir):
    """Validate the expected MNIST trial layout, then clean up *trial_dir*."""
    trial = create_trial(trial_dir)
    assert len(trial.steps()) == 3
    assert len(trial.steps(mode=smd.modes.TRAIN)) == 2
    assert len(trial.steps(mode=smd.modes.EVAL)) == 1
    assert len(trial.tensor_names()) == 13
    on_s3, bucket, prefix = is_s3(trial_dir)
    if on_s3:
        delete_s3_prefix(bucket, prefix)
    else:
        shutil.rmtree(trial_dir, ignore_errors=True)
Example #9
0
def create_trial(path, name=None, **kwargs):
    """Build an S3Trial or LocalTrial depending on where *path* lives."""
    if name is None:
        name = os.path.basename(path)
    on_s3, bucket_name, prefix_name = is_s3(path)
    if not on_s3:
        return LocalTrial(name=name, dirname=path, **kwargs)
    return S3Trial(
        name=name, bucket_name=bucket_name, prefix_name=prefix_name, **kwargs
    )
Example #10
0
 def __init__(self, path, start=None, length=None):
     """Describe a read request; ``start=None`` means fetch the whole file."""
     self.is_s3, self.bucket, self.key = is_s3(path)
     if not self.is_s3:
         # Local file: the full path acts as the key; there is no bucket.
         self.key = path
         self.bucket = None
     self.start = 0 if start is None else start
     self.length = length
     # Only an explicit start offset switches to a ranged download.
     self.download_entire_file = start is None
Example #11
0
 def delete_prefix(path=None, delete_request=None):
     """Delete an S3 prefix given either a path or a ready DeleteRequest.

     Exactly one of *path* / *delete_request* may be supplied.
     """
     if path is not None and delete_request is not None:
         raise ValueError(
             "Only one of path or delete_request can be passed")
     if path is not None:
         on_s3, bucket, prefix = is_s3(path)
         if on_s3 is False:
             raise ValueError("Given path is not an S3 location")
         S3Handler.delete_prefixes([DeleteRequest(bucket, prefix)])
     elif delete_request is not None:
         S3Handler.delete_prefixes([delete_request])
 def __init__(self, path):
     """Open *path* for reading (S3 or local) and eagerly ingest its contents.

     :raises ValueError: when the source cannot be opened.
     """
     s3, bucket_name, key_name = is_s3(path)
     try:
         if s3:
             self._reader = TSAccessS3(bucket_name, key_name)
         else:
             self._reader = TSAccessFile(path, "rb")
     except (OSError, IOError) as err:
         # Chain the cause; the original's trailing bare `except: raise`
         # clause was a no-op and has been dropped.
         raise ValueError("failed to open {}: {}".format(path, str(err))) from err
     self._reader.ingest_all()
Example #13
0
def create_trial(path, name=None, **kwargs):
    """Build an S3Trial or LocalTrial for *path*, tolerating stray whitespace."""
    # Remove any accidental leading/trailing whitespace input by the user.
    path = path.strip()
    if name is None:
        name = os.path.basename(path)
    on_s3, bucket_name, prefix_name = is_s3(path)
    if not on_s3:
        return LocalTrial(name=name, dirname=path, **kwargs)
    return S3Trial(
        name=name, bucket_name=bucket_name, prefix_name=prefix_name, **kwargs
    )
def helper_test_multi_save_configs_trial(trial_dir):
    """Validate step counts for the multi-save-config trial, then clean up."""
    trial = create_trial(trial_dir)
    print(trial.steps(), trial.steps(mode=smd.modes.TRAIN),
          trial.steps(mode=smd.modes.EVAL))
    assert len(trial.steps()) == 4
    assert len(trial.steps(mode=smd.modes.TRAIN)) == 3
    assert len(trial.steps(mode=smd.modes.EVAL)) == 1
    assert len(trial.tensor_names()) == 1
    on_s3, bucket, prefix = is_s3(trial_dir)
    if on_s3:
        delete_s3_prefix(bucket, prefix)
    else:
        shutil.rmtree(trial_dir)
Example #15
0
 def open(self, file_path):
     """
     Open the trace event file
     """
     s3, bucket_name, key_name = is_s3(file_path)
     try:
         if s3:
             self._writer = TSAccessS3(bucket_name, key_name, binary=False)
         else:
             self._writer = TSAccessFile(file_path, "w")
     except (OSError, IOError) as err:
         # NOTE(review): failure is only logged at debug level, yet the
         # write below still dereferences self._writer — if the open failed
         # and no writer existed beforehand this raises AttributeError.
         # Confirm whether callers rely on this best-effort behavior.
         logger.debug(
             f"Sagemaker-Debugger: failed to open {file_path}: {str(err)}")
     # Trace events are emitted as one JSON array; write the opening bracket.
     start, length = self._writer.write("[\n")
     self.bytes_written += length
def test_parse_worker_name_from_collection_file():
    """Worker names parse correctly from S3 and local collection files."""
    path = "s3://smdebug-testing/resources/one-index-file"
    _, bucket_name, key_name = is_s3(path)

    collection_files, _ = list_s3_objects(bucket_name, get_path_to_collections(key_name))
    assert len(collection_files) == 1

    assert (
        get_worker_name_from_collection_file(collection_files[0])
        == "/job:worker/replica:0/task:1/device:GPU:0"
    )

    local_file = "/tmp/collections/000000000/job-worker_1_collections.json"
    assert get_worker_name_from_collection_file(local_file) == "job-worker_1"
def test_parse_worker_name_from_index_file():
    """Worker names parse from plain, device-style, and S3 index file names."""
    assert (
        parse_worker_name_from_file(
            "/tmp/ts-logs/index/000000001/000000001230_worker_2.json")
        == "worker_2"
    )
    assert (
        parse_worker_name_from_file(
            "/tmp/ts-logs/index/000000000499__job-worker_replica-0_task-1_device-GPU-6.json")
        == "/job:worker/replica:0/task:1/device:GPU:6"
    )

    path = "s3://smdebug-testing/resources/one-index-file"

    _, bucket, prefix = is_s3(path)

    index_files, _ = S3IndexReader(path).list_index_files()
    assert (
        parse_worker_name_from_file(index_files[0])
        == "/job:worker/replica:0/task:1/device:GPU:4"
    )
Example #18
0
def file_exists(file_path):
    """Return True when *file_path* exists (S3 prefix or local file).

    :param file_path: local path or ``s3://`` URL.
    :raises ClientError: re-raised for 4xx S3 client errors; other S3
        failures are treated as "does not exist".
    """
    s3, bucket_name, key_name = is_s3(file_path)
    if not s3:
        return os.path.exists(file_path)
    try:
        request = ListRequest(bucket_name, key_name)
        file_available = S3Handler.list_prefixes([request])[0]
        return len(file_available) > 0
    except ClientError as ex:
        status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
        logger.info(f"Client error occurred : {ex}")
        # HTTPStatusCode is an int; the original called .startswith("4") on
        # it, which would raise AttributeError. Compare numerically instead.
        if 400 <= status_code < 500:
            raise ex
        return False
def has_training_ended(trial_prefix):
    """Return True when the end-of-job marker exists under *trial_prefix*.

    :param trial_prefix: local directory or ``s3://`` prefix of the trial.
    :raises ClientError: re-raised for 4xx S3 client errors; other S3
        failures are treated as "not ended".
    """
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if not s3:
        return os.path.exists(file_path)
    try:
        s3_handler = S3Handler()
        request = ListRequest(bucket_name, key_name)
        file_available = s3_handler.list_prefixes([request])[0]
        return len(file_available) > 0
    except ClientError as ex:
        status_code = ex.response["ResponseMetadata"]["HTTPStatusCode"]
        logger.info(f"Client error occurred : {ex}")
        # HTTPStatusCode is an int; the original called .startswith("4") on
        # it, which would raise AttributeError. Compare numerically instead.
        if 400 <= status_code < 500:
            raise ex
        return False
Example #20
0
def create_trial(path,
                 name=None,
                 profiler=False,
                 output_dir="/opt/ml/processing/outputs/",
                 **kwargs):
    """Build a Profiler/S3/Local trial for *path*, tolerating stray whitespace."""
    # Remove any accidental leading/trailing whitespace input by the user.
    path = path.strip()
    if name is None:
        name = os.path.basename(path)
    on_s3, bucket_name, prefix_name = is_s3(path)
    # Profiler trials take precedence regardless of where the path lives.
    if profiler:
        return ProfilerTrial(name=name,
                             trial_dir=path,
                             output_dir=output_dir,
                             **kwargs)
    if on_s3:
        return S3Trial(name=name,
                       bucket_name=bucket_name,
                       prefix_name=prefix_name,
                       **kwargs)
    return LocalTrial(name=name, dirname=path, **kwargs)
Example #21
0
def check_dir_exists(path):
    """Raise RuntimeError when *path* already holds a finished trial."""
    from smdebug.core.access_layer.s3handler import S3Handler, ListRequest

    on_s3, bucket_name, key_name = is_s3(path)
    if not on_s3:
        if os.path.exists(path) and has_training_ended(path):
            raise RuntimeError("The path:{} already exists on local disk. "
                               "Please provide a directory path that does "
                               "not already exist".format(path))
        return
    try:
        listing = S3Handler.list_prefixes([ListRequest(bucket_name, key_name)])[0]
        if len(listing) > 0 and has_training_ended(listing[-1]):
            raise RuntimeError("The path:{} already exists on s3. "
                               "Please provide a directory path that does "
                               "not already exist.".format(path))
    except ClientError as ex:
        if ex.response["Error"]["Code"] == "NoSuchBucket":
            # Bucket absent -> the path cannot exist; nothing to raise.
            pass
        else:
            # Unknown failure — surface it to the caller.
            raise ex
Example #22
0
def training_has_ended(trial_prefix):
    """Write the end-of-job marker file under *trial_prefix*.

    Skipped entirely under SageMaker, where the platform manages the job
    lifecycle itself.
    """
    # Emit the end of training file only if the job is not running under SageMaker.
    if is_sagemaker_job():
        logger.info(
            f"The end of training job file will not be written for jobs running under SageMaker."
        )
        return
    try:
        check_dir_exists(trial_prefix)
        # if path does not exist, then we don't need to write a file
    except RuntimeError:
        # dir exists
        pass
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        writer = TSAccessS3(bucket_name, key_name, binary=False)
    else:
        writer = TSAccessFile(file_path, "a+")
    writer.flush()
    try:
        writer.close()
    except OSError:
        # Best-effort close: the marker was already flushed, so a close
        # failure is safe to ignore. NOTE(review): the original source was
        # truncated mid-docstring here (unterminated triple quote); this
        # handler restores valid syntax — confirm against upstream.
        pass
def test_s3():
    """is_s3 splits an s3:// URL into (flag, bucket, key)."""
    on_s3, bucket, key = is_s3("s3://a/b")
    assert on_s3
    assert bucket == "a"
    assert key == "b"
def test_normal():
    """A plain relative path is not reported as S3."""
    on_s3 = is_s3("a/b/c")[0]
    assert not on_s3
 def _init_writer(self):
     """Create the underlying writer for self.file_path (S3 or local append)."""
     on_s3, bucket_name, key_name = is_s3(self.file_path)
     if not on_s3:
         self.writer = TSAccessFile(self.file_path, "a+")
     else:
         self.writer = TSAccessS3(bucket_name, key_name, binary=False)
def test_s3_noprefix2():
    """A bucket-only URL yields an empty key."""
    on_s3, bucket, key = is_s3("s3://a/")
    assert on_s3
    assert bucket == "a"
    assert key == ""
 def _is_event_file_present(self, file):
     """Return True when *file*'s S3 prefix is among the listed event files."""
     event_files = self.list_event_files()
     _, _, prefix = is_s3(file)
     # Membership test on the list directly; building a throwaway set for a
     # single lookup cost an extra O(n) pass with no benefit.
     return prefix in event_files
Example #28
0
def check_s3_trial(path, num_steps=20, num_tensors=10):
    """Construct an S3Trial from *path* and run the standard trial checks."""
    _, bucket, prefix = is_s3(path)
    check_trial(
        S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix),
        num_steps=num_steps,
        num_tensors=num_tensors,
    )