def _stage_configuration(self, configuration):
        """Serializes and uploads the user-provided EMR application configuration to S3.

        This method prepares an input channel.

        Args:
            configuration (Dict): the configuration dict for the EMR application configuration.
        """

        serialized_configuration = BytesIO(
            json.dumps(configuration).encode("utf-8"))
        s3_uri = (
            f"s3://{self.sagemaker_session.default_bucket()}/{self._current_job_name}/"
            f"input/{self._conf_container_input_name}/{self._conf_file_name}")

        S3Uploader.upload_string_as_file_body(
            body=serialized_configuration,
            desired_s3_uri=s3_uri,
            sagemaker_session=self.sagemaker_session,
        )

        conf_input = ProcessingInput(
            source=s3_uri,
            destination=f"{self._conf_container_base_path}{self._conf_container_input_name}",
            input_name=_SparkProcessorBase._conf_container_input_name,
        )
        return conf_input
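
For context, the staged configuration follows the EMR application-configuration format accepted by the Spark processors' run() call; a minimal usage sketch (role ARN, framework version, and S3 paths are placeholders, not from the original):

from sagemaker.spark.processing import PySparkProcessor

# EMR-style application configuration; _stage_configuration() serializes a structure
# like this to JSON and mounts it as a ProcessingInput inside the container.
configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {"spark.executor.memory": "2g"},
    }
]

processor = PySparkProcessor(
    base_job_name="sm-spark",
    framework_version="3.1",                              # assumed framework version
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role ARN
    instance_count=1,
    instance_type="ml.c5.xlarge",
)
processor.run(
    submit_app="s3://my-bucket/code/app.py",              # placeholder application path
    configuration=configuration,
)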
def test_integ_history_server(spark_py_processor, sagemaker_session):
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    spark_event_logs_key_prefix = "spark/spark-history-fs"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files",
                           "sample_spark_event_logs")) as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=spark_event_logs_s3_uri +
            "/sample_spark_event_logs",
            sagemaker_session=sagemaker_session,
        )

    spark_py_processor.start_history_server(
        spark_event_logs_s3_uri=spark_event_logs_s3_uri)

    try:
        response = _request_with_retry(HISTORY_SERVER_ENDPOINT)
        assert response.status == 200

        # Spark has redirect behavior; this request verifies that page navigation works with the redirect.
        response = _request_with_retry(
            f"{HISTORY_SERVER_ENDPOINT}{SPARK_APPLICATION_URL_SUFFIX}")
        assert response.status == 200

        html_content = response.data.decode("utf-8")
        assert "Completed Jobs (4)" in html_content
        assert "collect at /opt/ml/processing/input/code/test_long_duration.py:32" in html_content
    finally:
        spark_py_processor.terminate_history_server()
def test_integ_history_server(spark_py_processor, sagemaker_session):
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    spark_event_logs_key_prefix = "spark/spark-history-fs"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files",
                           "sample_spark_event_logs")) as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=spark_event_logs_s3_uri +
            "/sample_spark_event_logs",
            sagemaker_session=sagemaker_session,
        )

    # Sleep 3 seconds to avoid S3 eventual-consistency issues.
    time.sleep(3)
    spark_py_processor.start_history_server(
        spark_event_logs_s3_uri=spark_event_logs_s3_uri)

    try:
        response = _request_with_retry(HISTORY_SERVER_ENDPOINT)
        assert response.status == 200
    finally:
        spark_py_processor.terminate_history_server()
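
The helpers and constants used by these history-server tests (_request_with_retry, HISTORY_SERVER_ENDPOINT, SPARK_APPLICATION_URL_SUFFIX) are defined elsewhere in the test module; a minimal sketch of what they might look like, assuming urllib3 since the tests read response.status and response.data:

import time

import urllib3

# Assumed values; the real test module defines its own endpoint and suffix.
HISTORY_SERVER_ENDPOINT = "http://0.0.0.0:15050"
SPARK_APPLICATION_URL_SUFFIX = "/history/application_1594922484246_0001/1/jobs/"


def _request_with_retry(url, max_retries=10):
    """GET the URL, retrying while the local history-server container starts up."""
    http = urllib3.PoolManager()
    for _ in range(max_retries):
        try:
            return http.request("GET", url)
        except Exception:
            time.sleep(10)
    return None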
def test_history_server(tag, role, image_uri, sagemaker_session, region):
    spark = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = sagemaker_session.default_bucket()
    spark_event_logs_key_prefix = "spark/spark-history-fs"
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket, spark_event_logs_key_prefix)
    spark_event_log_local_path = "test/resources/data/files/sample_spark_event_logs"
    file_name = "sample_spark_event_logs"
    file_size = os.path.getsize(spark_event_log_local_path)

    with open("test/resources/data/files/sample_spark_event_logs") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=f"{spark_event_logs_s3_uri}/{file_name}", sagemaker_session=sagemaker_session,
        )

    _wait_for_file_to_be_uploaded(region, bucket, spark_event_logs_key_prefix, file_name, file_size)
    spark.start_history_server(spark_event_logs_s3_uri=spark_event_logs_s3_uri)

    try:
        response = _request_with_retry(HISTORY_SERVER_ENDPOINT)
        assert response.status == 200

        response = _request_with_retry(f"{HISTORY_SERVER_ENDPOINT}{SPARK_APPLICATION_URL_SUFFIX}", max_retries=15)
        print(f"Subpage response status code: {response.status}")
    finally:
        spark.terminate_history_server()
Example #5
def test_upload(sagemaker_session):
    desired_s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME,
                                  SOURCE_NAME)
    S3Uploader.upload(local_path="/path/to/app.jar",
                      desired_s3_uri=desired_s3_uri,
                      session=sagemaker_session)
    sagemaker_session.upload_data.assert_called_with(
        path="/path/to/app.jar",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args=None,
    )
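
The constants and fixture backing this unit test are not shown; a plausible setup (names and values are assumptions) uses a MagicMock session so that upload_data.assert_called_with can be verified:

from unittest.mock import MagicMock

import pytest

# Assumed test constants.
BUCKET_NAME = "mybucket"
CURRENT_JOB_NAME = "currentjobname"
SOURCE_NAME = "source"


@pytest.fixture()
def sagemaker_session():
    session_mock = MagicMock()
    session_mock.default_bucket.return_value = BUCKET_NAME
    return session_mock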
def test_sagemaker_scala_jar_multinode(tag, role, image_uri, configuration,
                                       sagemaker_session, sagemaker_client):
    """Test SparkJarProcessor using Scala application jar with external runtime dependency jars staged by SDK"""
    spark = SparkJarProcessor(
        base_job_name="sm-spark-scala",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    bucket = spark.sagemaker_session.default_bucket()
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)
    output_data_uri = "s3://{}/spark/output/sales/{}".format(
        bucket,
        datetime.now().isoformat())

    scala_project_dir = "test/resources/code/scala/hello-scala-spark"
    spark.run(
        submit_app="{}/target/scala-2.11/hello-scala-spark_2.11-1.0.jar".
        format(scala_project_dir),
        submit_class="com.amazonaws.sagemaker.spark.test.HelloScalaSparkApp",
        submit_jars=[
            "{}/lib_managed/jars/org.json4s/json4s-native_2.11/json4s-native_2.11-3.6.9.jar"
            .format(scala_project_dir)
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )
    processing_job = spark.latest_job

    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={
            "Delay": 15,
            "MaxAttempts": 60
        },
    )

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def test_sagemaker_spark_processor_default_tag(spark_version, role,
                                               sagemaker_session,
                                               sagemaker_client):
    """Test that spark processor works with default tag"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=spark_version,
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket,
                                                  spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=[
            "test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=True,
    )

    processing_job = spark.latest_job
    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={
            "Delay": 15,
            "MaxAttempts": 60
        },
    )

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
Example #8
def upload_to_s3(local_path, s3_data_location, *, search=None):
    import os
    from tqdm import tqdm
    from sagemaker.s3 import S3Uploader as s3up

    for root, dirs, files in os.walk(local_path):
        if len(files) > 0:
            idx = len(local_path)
            for name in tqdm(files, desc=f"Uploading folder '{root}'"):
                file_path = os.path.join(root, name)
                # Strip any leading separator so os.path.join doesn't discard the S3 prefix.
                s3_path = os.path.join(s3_data_location, root[idx:].lstrip(os.sep))
                s3up.upload(file_path, s3_path)
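
A hypothetical invocation of the helper above (folder and bucket names are placeholders); note that the keyword-only search argument is accepted but unused in the snippet as shown:

# Mirror a local dataset folder into S3, preserving its subfolder layout.
upload_to_s3(
    local_path="./data/train",                          # placeholder local folder
    s3_data_location="s3://my-bucket/datasets/train",   # placeholder S3 prefix
)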
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )

    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()

    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
Example #10
def test_history_server(tag, role, image_uri, sagemaker_session, region):
    spark = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = sagemaker_session.default_bucket()
    spark_event_logs_key_prefix = "spark/spark-history-fs"
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket,
                                                  spark_event_logs_key_prefix)
    spark_event_log_local_path = "test/resources/data/files/sample_spark_event_logs"
    file_name = "sample_spark_event_logs"
    file_size = os.path.getsize(spark_event_log_local_path)

    with open("test/resources/data/files/sample_spark_event_logs") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{spark_event_logs_s3_uri}/{file_name}",
            sagemaker_session=sagemaker_session,
        )

    _wait_for_file_to_be_uploaded(region, bucket, spark_event_logs_key_prefix,
                                  file_name, file_size)
    spark.start_history_server(spark_event_logs_s3_uri=spark_event_logs_s3_uri)

    try:
        response = _request_with_retry(HISTORY_SERVER_ENDPOINT)
        assert response.status == 200

        # Spark has redirect behavior; this request verifies that page navigation works with the redirect.
        response = _request_with_retry(
            f"{HISTORY_SERVER_ENDPOINT}{SPARK_APPLICATION_URL_SUFFIX}")
        if response.status != 200:
            print(subprocess.run(["docker", "logs", "history_server"]))

        assert response.status == 200

        html_content = response.data.decode("utf-8")
        assert "Completed Jobs (4)" in html_content
        assert "collect at /opt/ml/processing/input/code/test_long_duration.py:32" in html_content
    finally:
        spark.terminate_history_server()
def test_constraints_object_creation_from_s3_uri_with_customizations(
        sagemaker_session, monitoring_files_kms_key):
    with open(os.path.join(tests.integ.DATA_DIR, "monitor/constraints.json"),
              "r") as f:
        file_body = f.read()

    file_name = "constraints.json"
    desired_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-monitoring-files",
        str(uuid.uuid4()),
        file_name,
    )

    s3_uri = S3Uploader.upload_string_as_file_body(
        body=file_body,
        desired_s3_uri=desired_s3_uri,
        kms_key=monitoring_files_kms_key,
        session=sagemaker_session,
    )

    constraints = Constraints.from_s3_uri(
        constraints_file_s3_uri=s3_uri,
        kms_key=monitoring_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    assert constraints.file_s3_uri.startswith("s3://")
    assert constraints.file_s3_uri.endswith("constraints.json")

    assert constraints.body_dict["monitoring_config"][
        "evaluate_constraints"] == "Enabled"
Example #12
def test_upload(sagemaker_session, caplog):
    desired_s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME,
                                  SOURCE_NAME)
    S3Uploader.upload(local_path="/path/to/app.jar",
                      desired_s3_uri=desired_s3_uri,
                      session=sagemaker_session)
    sagemaker_session.upload_data.assert_called_with(
        path="/path/to/app.jar",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args=None,
    )
    warning_message = (
        "Parameter 'session' will be renamed to 'sagemaker_session' "
        "in SageMaker Python SDK v2.")
    assert warning_message in caplog.text
    @classmethod
    def from_string(
        cls, constraints_file_string, kms_key=None, file_name=None, sagemaker_session=None
    ):
        """Generates a Constraints object from a string.

        Args:
            constraints_file_string (str): The contents of the constraints JSON file,
                provided as a string.
            kms_key (str): The kms key to be used to encrypt the file in S3.
            file_name (str): The file name to use when uploading to S3.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.

        Returns:
            sagemaker.model_monitor.Constraints: The instance of Constraints generated from
                the s3 uri.

        """
        sagemaker_session = sagemaker_session or Session()
        file_name = file_name or "constraints.json"
        desired_s3_uri = os.path.join(
            "s3://", sagemaker_session.default_bucket(), "monitoring", str(uuid.uuid4()), file_name
        )
        s3_uri = S3Uploader.upload_string_as_file_body(
            body=constraints_file_string,
            desired_s3_uri=desired_s3_uri,
            kms_key=kms_key,
            session=sagemaker_session,
        )

        return Constraints.from_s3_uri(
            constraints_file_s3_uri=s3_uri, kms_key=kms_key, sagemaker_session=sagemaker_session
        )
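
A short usage sketch for the class method above; the constraints body is a hypothetical minimal document, and the call assumes default AWS credentials for the implicit Session:

import json

from sagemaker.model_monitor import Constraints

# Hypothetical minimal constraints document.
constraints_body = {
    "version": 0.0,
    "features": [{"name": "store_and_fwd_flag", "inferred_type": "String"}],
    "monitoring_config": {"evaluate_constraints": "Enabled"},
}

constraints = Constraints.from_string(
    constraints_file_string=json.dumps(constraints_body),
    file_name="constraints.json",
)
print(constraints.file_s3_uri)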
def test_constraint_violations_object_creation_from_s3_uri_without_customizations(
        sagemaker_session):
    with open(
            os.path.join(tests.integ.DATA_DIR,
                         "monitor/constraint_violations.json"), "r") as f:
        file_body = f.read()

    file_name = "constraint_violations.json"
    desired_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-monitoring-files",
        str(uuid.uuid4()),
        file_name,
    )

    s3_uri = S3Uploader.upload_string_as_file_body(
        body=file_body,
        desired_s3_uri=desired_s3_uri,
        session=sagemaker_session)

    constraint_violations = ConstraintViolations.from_s3_uri(
        constraint_violations_file_s3_uri=s3_uri,
        sagemaker_session=sagemaker_session)

    assert constraint_violations.file_s3_uri.startswith("s3://")
    assert constraint_violations.file_s3_uri.endswith(
        "constraint_violations.json")

    assert constraint_violations.body_dict["violations"][0][
        "feature_name"] == "store_and_fwd_flag"
def test_statistics_object_creation_from_s3_uri_with_customizations(
        sagemaker_session, monitoring_files_kms_key):
    with open(os.path.join(tests.integ.DATA_DIR, "monitor/statistics.json"),
              "r") as f:
        file_body = f.read()

    file_name = "statistics.json"
    desired_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-monitoring-files",
        str(uuid.uuid4()),
        file_name,
    )

    s3_uri = S3Uploader.upload_string_as_file_body(
        body=file_body,
        desired_s3_uri=desired_s3_uri,
        kms_key=monitoring_files_kms_key,
        session=sagemaker_session,
    )

    statistics = Statistics.from_s3_uri(
        statistics_file_s3_uri=s3_uri,
        kms_key=monitoring_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    assert statistics.file_s3_uri.startswith("s3://")
    assert statistics.file_s3_uri.endswith("statistics.json")

    assert statistics.body_dict["dataset"]["item_count"] == 418
Example #16
def sagemaker_processing_handle(args, config, main):
    if args.sagemaker_run:
        # Remote processing
        sagemaker_processing_run(args=args, config=config)
    else:
        # Local processing
        args, tmps, uris = sagemaker_processing_local_args(args=args,
                                                           config=config)
        try:
            main(args)
            if tmps:
                session = sagemaker_session(
                    profile_name=args.sagemaker_profile)
                for k in tmps.keys():
                    S3Uploader.upload(local_path=getattr(args, k),
                                      desired_s3_uri=uris[k],
                                      sagemaker_session=session)
        finally:
            for tmp in tmps.values():
                tmp.__exit__()
Example #17
def upload(src, dst, gz, session: sagemaker.Session, root='.'):
    dst = cli_argument(dst, session=session)
    if not os.path.exists(src):
        raise click.UsageError("Source must exist")
    if not dst.startswith('s3://'):
        if dst.startswith('/'):
            dst = dst[1:]
        bucket = session.default_bucket()
        dst = 's3://{}/{}'.format(bucket, dst)
    url = urlparse(dst)
    assert url.scheme == 's3'
    bucket = url.netloc
    key = url.path
    if key.startswith('/'):
        key = key[1:]
    if os.path.isfile(src):
        if gz:
            raise click.UsageError(
                "Option gz is only valid for source directories")
        s3 = session.boto_session.client('s3')
        s3.upload_file(src, bucket, key)
    elif os.path.isdir(src):
        if gz:
            if not re.match(".*\\.(tar\\.gz||tgz)$", dst, re.IGNORECASE):
                raise click.UsageError(
                    "Destination should end in .tar.gz or tgz")
            s3_dst = os.path.dirname(dst)
            file_name = os.path.basename(dst)
            with _tmpdir() as tmp:
                p = os.path.join(tmp, file_name)
                with tarfile.open(p, 'w:gz') as arc:
                    arc.add(name=src, arcname=root, recursive=True)
                s3 = session.boto_session.client('s3')
                s3.upload_file(p, bucket, key)
        else:
            S3Uploader.upload(local_path=src,
                              desired_s3_uri=dst,
                              sagemaker_session=session)
    else:
        raise click.UsageError("Source must be file or directory")
def dataset(sagemaker_session):
    dataset_local_path = os.path.join(
        DATA_DIR, "pipeline/clarify_check_step/dataset.csv")
    dataset_s3_uri = "s3://{}/{}/{}/{}/{}".format(
        sagemaker_session.default_bucket(),
        "clarify_check_step",
        "input",
        "dataset",
        utils.unique_name_from_base("dataset"),
    )
    return S3Uploader.upload(dataset_local_path,
                             dataset_s3_uri,
                             sagemaker_session=sagemaker_session)
Example #19
def upload_local_channel(channel, session, s3_uri):
    url = urlparse(channel)
    if url.scheme == 's3':
        return channel
    elif url.scheme == 'file':
        path = url2pathname(url.path)
        S3Uploader.upload(
            local_path=path,
            desired_s3_uri=s3_uri,
            sagemaker_session=session
        )
        if os.path.isfile(path):
            #todo: urljoin
            s3_uri = "{}/{}".format(s3_uri, os.path.basename(path))
        print("Uploaded [{}] ([{}]) to [{}]".format(
            channel, path, s3_uri
        ))
        return s3_uri
    else:
        print("Type {}".format(type(s3_uri)))
        raise ValueError(
            "Unknown scheme: [{}] (uri: {})".format(url.scheme, channel))
    def _inject_repack_script(self):
        """Injects the _repack_model.py script where it belongs.

        If the source_dir is an S3 path:
            1) downloads the source_dir tar.gz
            2) copies the _repack_model.py script where it belongs
            3) uploads the mutated source_dir

        If the source_dir is a local path:
            1) copies the _repack_model.py script into the source dir
        """
        fname = os.path.join(os.path.dirname(__file__), REPACK_SCRIPT)
        if self._source_dir.lower().startswith("s3://"):
            with tempfile.TemporaryDirectory() as tmp:
                local_path = os.path.join(tmp, "local.tar.gz")

                S3Downloader.download(
                    s3_uri=self._source_dir,
                    local_path=local_path,
                    sagemaker_session=self._estimator.sagemaker_session,
                )

                src_dir = os.path.join(tmp, "src")
                with tarfile.open(name=local_path, mode="r:gz") as tf:
                    tf.extractall(path=src_dir)

                shutil.copy2(fname, os.path.join(src_dir, REPACK_SCRIPT))
                with tarfile.open(name=local_path, mode="w:gz") as tf:
                    tf.add(src_dir, arcname=".")

                S3Uploader.upload(
                    local_path=local_path,
                    desired_s3_uri=self._source_dir,
                    sagemaker_session=self._estimator.sagemaker_session,
                )
        else:
            shutil.copy2(fname, os.path.join(self._source_dir, REPACK_SCRIPT))
def test_sagemaker_java_jar_multinode(spark_jar_processor, sagemaker_session,
                                      configuration, build_jar):
    """Test SparkJarProcessor using Java application jar"""
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    with open(os.path.join(SPARK_PATH, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(SPARK_PATH, "code", "java",
                                    "hello-java-spark")
    spark_jar_processor.run(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )
    processing_job = spark_jar_processor.latest_job

    waiter = sagemaker_session.sagemaker_client.get_waiter(
        "processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={
            "Delay": 15,
            "MaxAttempts": 60
        },
    )

    describe_response = sagemaker_session.sagemaker_client.describe_processing_job(
        ProcessingJobName=processing_job.job_name)
    assert describe_response["ProcessingJobStatus"] == "Completed"
Example #22
    def _normalize_inputs(self, inputs=None):
        """Ensures that all the ``ProcessingInput`` objects have names and S3 URIs.

        Args:
            inputs (list[sagemaker.processing.ProcessingInput]): A list of ``ProcessingInput``
                objects to be normalized (default: None). If not specified,
                an empty list is returned.

        Returns:
            list[sagemaker.processing.ProcessingInput]: The list of normalized
                ``ProcessingInput`` objects.

        Raises:
            TypeError: if the inputs are not ``ProcessingInput`` objects.
        """
        # Initialize a list of normalized ProcessingInput objects.
        normalized_inputs = []
        if inputs is not None:
            # Iterate through the provided list of inputs.
            for count, file_input in enumerate(inputs, 1):
                if not isinstance(file_input, ProcessingInput):
                    raise TypeError(
                        "Your inputs must be provided as ProcessingInput objects."
                    )
                # Generate a name for the ProcessingInput if it doesn't have one.
                if file_input.input_name is None:
                    file_input.input_name = "input-{}".format(count)
                # If the source is a local path, upload it to S3
                # and save the S3 uri in the ProcessingInput source.
                parse_result = urlparse(file_input.source)
                if parse_result.scheme != "s3":
                    desired_s3_uri = os.path.join(
                        "s3://",
                        self.sagemaker_session.default_bucket(),
                        self._current_job_name,
                        "input",
                        file_input.input_name,
                    )
                    s3_uri = S3Uploader.upload(
                        local_path=file_input.source,
                        desired_s3_uri=desired_s3_uri,
                        session=self.sagemaker_session,
                    )
                    file_input.source = s3_uri
                normalized_inputs.append(file_input)
        return normalized_inputs
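
To illustrate the normalization above, a hedged sketch (the processor instance and paths are hypothetical): an unnamed local-path input gains a generated name and an uploaded S3 source under s3://<default-bucket>/<job-name>/input/<input-name>:

from sagemaker.processing import ProcessingInput

# Hypothetical input with a local source and no name; after _normalize_inputs()
# it would carry input_name="input-1" and an s3:// source.
raw_input = ProcessingInput(
    source="/opt/data/raw.csv",                    # placeholder local path
    destination="/opt/ml/processing/input/raw",
)
# normalized = processor._normalize_inputs([raw_input])  # processor instance assumed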
    def _upload(s3_uri_base, input_file_name, target_time, file_name):
        time_folder = target_time.strftime("%Y/%m/%d/%H")
        time_str = str(target_time.strftime("%Y-%m-%dT%H:%M:%S.%f"))
        s3_uri = os.path.join(s3_uri_base, time_folder, file_name)

        up_to_date_lines = []
        with open(input_file_name, "r") as input_file:
            for line in input_file:
                json_l = json.loads(line)
                json_l["eventMetadata"]["inferenceTime"] = time_str
                up_to_date_lines.append(json.dumps(json_l))

        file_target = "\n".join(up_to_date_lines)

        return S3Uploader.upload_string_as_file_body(
            file_target,
            desired_s3_uri=s3_uri,
            sagemaker_session=sagemaker_session,
        )
Example #24
0
    def _upload_code(self, code):
        """Uploads a code file or directory specified as a string
        and returns the S3 URI.

        Args:
            code (str): A file or directory to be uploaded to S3.

        Returns:
            str: The S3 URI of the uploaded file or directory.

        """
        desired_s3_uri = "s3://{}/{}/input/{}".format(
            self.sagemaker_session.default_bucket(),
            self._current_job_name,
            self._CODE_CONTAINER_INPUT_NAME,
        )
        return S3Uploader.upload(local_path=code,
                                 desired_s3_uri=desired_s3_uri,
                                 session=self.sagemaker_session)
    def save(self, new_save_location_s3_uri=None):
        """Save the current instance's body to s3 using the instance's s3 path.
        The S3 path can be overridden by providing one. This also overrides the
        default save location for this object.

        Args:
            new_save_location_s3_uri (str): Optional. The S3 path to save the file to. If not
                provided, the file is saved in place in S3. If provided, the file's S3 path is
                permanently updated.

        Returns:
            str: The s3 location to which the file was saved.

        """
        if new_save_location_s3_uri is not None:
            self.file_s3_uri = new_save_location_s3_uri

        return S3Uploader.upload_string_as_file_body(
            body=json.dumps(self.body_dict), desired_s3_uri=self.file_s3_uri, kms_key=self.kms_key
        )
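
A hedged usage sketch for save() (URIs are placeholders): load an existing statistics file, adjust its body, and persist it to a new location:

from sagemaker.model_monitor import Statistics

statistics = Statistics.from_s3_uri(
    statistics_file_s3_uri="s3://my-bucket/monitoring/statistics.json",  # placeholder
)
statistics.body_dict["dataset"]["item_count"] = 500
new_uri = statistics.save(
    new_save_location_s3_uri="s3://my-bucket/monitoring/statistics-edited.json"  # placeholder
)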
def test_one_step_sparkjar_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    configuration,
    build_jar,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
    spark_path = os.path.join(DATA_DIR, "spark")

    spark_jar_processor = SparkJarProcessor(
        role=role,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version="2.4",
    )
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session,
        )
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(spark_path, "code", "java",
                                    "hello-java-spark")
    spark_run_args = spark_jar_processor.get_run_args(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    step_pyspark = ProcessingStep(
        name="sparkjar-process",
        processor=spark_jar_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when role used in the pipeline execution is
        # different than that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sparkjar-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Example #27
        def run(
            self,
            entry_point: str,
            source_dir: Optional[str],
            dependencies: Optional[List[str]] = None,
            git_config: Optional[Dict[str, str]] = None,
            inputs: Optional[List[ProcessingInput]] = None,
            outputs: Optional[List[ProcessingOutput]] = None,
            arguments: Optional[List[str]] = None,
            wait: bool = True,
            logs: bool = True,
            job_name: Optional[str] = None,
            experiment_config: Optional[Dict[str, str]] = None,
            kms_key: Optional[str] = None,
        ):
            """Runs a processing job.

            Args:
                entry_point (str): Path (absolute or relative) to the local Python source
                    file which should be executed as the entry point of the processing
                    job. If ``source_dir`` is specified, then ``entry_point`` must point
                    to a file located at the root of ``source_dir``.
                source_dir (str): Path (absolute, relative or an S3 URI) to a directory
                    with any other training source code dependencies aside from the entry
                    point file (default: None). If ``source_dir`` is an S3 URI, it must
                    point to a tar.gz file. The structure within this directory is
                    preserved when running on Amazon SageMaker.
                dependencies (list[str]): A list of paths to directories (absolute
                    or relative) with any additional libraries that will be exported
                    to the container (default: []). The library folders will be
                    copied to SageMaker in the same folder where the entrypoint is
                    copied. If 'git_config' is provided, 'dependencies' should be a
                    list of relative locations to directories with any additional
                    libraries needed in the Git repo.
                git_config (dict[str, str]): Git configurations used for cloning
                    files, including ``repo``, ``branch``, ``commit``,
                    ``2FA_enabled``, ``username``, ``password`` and ``token``. The
                    ``repo`` field is required. All other fields are optional.
                    ``repo`` specifies the Git repository where your training script
                    is stored. If you don't provide ``branch``, the default value
                    'master' is used. If you don't provide ``commit``, the latest
                    commit in the specified branch is used. .. admonition:: Example

                        The following config:

                        >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git',
                        >>>               'branch': 'test-branch-git-config',
                        >>>               'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'}

                        results in cloning the repo specified in 'repo', checking
                        out the 'test-branch-git-config' branch, and checking out
                        the specified commit.

                    ``2FA_enabled``, ``username``, ``password`` and ``token`` are
                    used for authentication. For GitHub (or other Git) accounts, set
                    ``2FA_enabled`` to 'True' if two-factor authentication is
                    enabled for the account, otherwise set it to 'False'. If you do
                    not provide a value for ``2FA_enabled``, a default value of
                    'False' is used. CodeCommit does not support two-factor
                    authentication, so do not provide "2FA_enabled" with CodeCommit
                    repositories.

                    For GitHub and other Git repos, when SSH URLs are provided, it
                    doesn't matter whether 2FA is enabled or disabled; you should
                    either have no passphrase for the SSH key pairs, or have the
                    ssh-agent configured so that you will not be prompted for SSH
                    passphrase when you do 'git clone' command with SSH URLs. When
                    HTTPS URLs are provided: if 2FA is disabled, then either token
                    or username+password will be used for authentication if provided
                    (token prioritized); if 2FA is enabled, only token will be used
                    for authentication if provided. If required authentication info
                    is not provided, python SDK will try to use local credentials
                    storage to authenticate. If that fails either, an error message
                    will be thrown.

                    For CodeCommit repos, 2FA is not supported, so '2FA_enabled'
                    should not be provided. There is no token in CodeCommit, so
                    'token' should not be provided too. When 'repo' is an SSH URL,
                    the requirements are the same as GitHub-like repos. When 'repo'
                    is an HTTPS URL, username+password will be used for
                    authentication if they are provided; otherwise, python SDK will
                    try to use either CodeCommit credential helper or local
                    credential storage for authentication.
                inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
                    the processing job. These must be provided as
                    :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
                outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
                    the processing job. These can be specified as either path strings or
                    :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
                arguments (list[str]): A list of string arguments to be passed to a
                    processing job (default: None).
                wait (bool): Whether the call should wait until the job completes (default: True).
                logs (bool): Whether to show the logs produced by the job.
                    Only meaningful when wait is True (default: True).
                job_name (str): Processing job name. If not specified, the processor generates
                    a default job name, based on the base job name and current timestamp.
                experiment_config (dict[str, str]): Experiment management configuration.
                    Dictionary contains three optional keys:
                    'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
                kms_key (str): The ARN of the KMS key that is used to encrypt the
                    user code file (default: None).
            """
            if job_name is None:
                job_name = self._generate_current_job_name()

            estimator = self._upload_payload(entry_point, source_dir,
                                             dependencies, git_config,
                                             job_name)
            inputs = self._patch_inputs_with_payload(
                inputs,
                estimator._hyperparameters["sagemaker_submit_directory"])

            # Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
            s3_runproc_sh = S3Uploader.upload_string_as_file_body(
                self.runproc_sh.format(entry_point=entry_point),
                desired_s3_uri=f"{self.s3_prefix}/{job_name}/source/runproc.sh",
                sagemaker_session=self.sagemaker_session,
            )
            self.logger.info("runproc.sh uploaded to", s3_runproc_sh)

            # Submit a processing job.
            super().run(
                code=s3_runproc_sh,
                inputs=inputs,
                outputs=outputs,
                arguments=arguments,
                wait=wait,
                logs=logs,
                job_name=job_name,
                experiment_config=experiment_config,
                kms_key=kms_key,
            )
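
The class owning this run() is not shown above; assuming it wraps a script processor with estimator-style code staging, an invocation might look like the following (all names and paths are hypothetical):

# Hypothetical call on an instance of the (unshown) processor class.
processor.run(
    entry_point="preprocess.py",        # script at the root of source_dir
    source_dir="src/",                  # local directory containing the code
    arguments=["--split", "0.2"],
    wait=True,
    logs=True,
)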
def test_sagemaker_pyspark_multinode(tag, role, image_uri, configuration,
                                     sagemaker_session, region,
                                     sagemaker_client):
    """Test that basic multinode case works on 32KB of data"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket,
                                                  spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=[
            "test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark.latest_job

    s3_client = boto3.client("s3", region_name=region)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900

    while not processing_job_not_fail_or_complete(sagemaker_client,
                                                  processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket,
                                          Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # Somehow the first file size returned by list_objects is always 0, so this
            # loop skips zero-size entries when recording the latest file size.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    print("\n##### Latest file size is " +
                          str(event_log_file["Size"]))
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            print("\n##### S3 file updated.")
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    print("\n##### file_size {} updated_times_count {}".format(
        file_size, updated_times_count))
    assert file_size != 0

    # Commenting this assert because it's flaky.
    # assert updated_times_count > 1

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
Example #29
    def _stage_submit_deps(self, submit_deps, input_channel_name):
        """Prepares a list of paths to jars, py-files, or files dependencies.

        This prepared list of paths is provided as `spark-submit` options.
        The submit_deps list may include a combination of S3 URIs and local paths.
        Any S3 URIs are appended to the `spark-submit` option value without modification.
        Any local file paths are copied to a temp directory, uploaded to a default S3 URI,
        and included as a ProcessingInput channel to provide as local files to the SageMaker
        Spark container.

        :param submit_deps (list[str]): List of one or more dependency paths to include.
        :param input_channel_name (str): The `spark-submit` option name associated with
                    the input channel.
        :return (Optional[ProcessingInput], str): Tuple of (left) optional ProcessingInput
                    for the input channel, and (right) comma-delimited value for
                    `spark-submit` option.
        """
        if not submit_deps:
            raise ValueError(
                f"submit_deps value may not be empty. {self._submit_deps_error_message}"
            )
        if not input_channel_name:
            raise ValueError("input_channel_name value may not be empty.")

        input_channel_s3_uri = (
            f"s3://{self.sagemaker_session.default_bucket()}"
            f"/{self._current_job_name}/input/{input_channel_name}")

        use_input_channel = False
        spark_opt_s3_uris = []

        with tempfile.TemporaryDirectory() as tmpdir:
            for dep_path in submit_deps:
                dep_url = urlparse(dep_path)
                # S3 URIs are included as-is in the spark-submit argument
                if dep_url.scheme in ["s3", "s3a"]:
                    spark_opt_s3_uris.append(dep_path)
                # Local files are copied to temp directory to be uploaded to S3
                elif not dep_url.scheme or dep_url.scheme == "file":
                    if not os.path.isfile(dep_path):
                        raise ValueError(
                            f"submit_deps path {dep_path} is not a valid local file. "
                            f"{self._submit_deps_error_message}")
                    logger.info(
                        "Copying dependency from local path %s to tmpdir %s",
                        dep_path, tmpdir)
                    shutil.copy(dep_path, tmpdir)
                else:
                    raise ValueError(
                        f"submit_deps path {dep_path} references unsupported filesystem "
                        f"scheme: {dep_url.scheme} {self._submit_deps_error_message}"
                    )

            # If any local files were found and copied, upload the temp directory to S3
            if os.listdir(tmpdir):
                logger.info("Uploading dependencies from tmpdir %s to S3 %s",
                            tmpdir, input_channel_s3_uri)
                S3Uploader.upload(
                    local_path=tmpdir,
                    desired_s3_uri=input_channel_s3_uri,
                    sagemaker_session=self.sagemaker_session,
                )
                use_input_channel = True

        # If any local files were uploaded, construct a ProcessingInput to provide
        # them to the Spark container and form the spark-submit option from a
        # combination of S3 URIs and the container's local input path.
        if use_input_channel:
            input_channel = ProcessingInput(
                source=input_channel_s3_uri,
                destination=f"{self._conf_container_base_path}{input_channel_name}",
                input_name=input_channel_name,
            )
            spark_opt = ",".join(spark_opt_s3_uris +
                                 [input_channel.destination])
        # If no local files were uploaded, form the spark-submit option from a list of S3 URIs
        else:
            input_channel = None
            spark_opt = ",".join(spark_opt_s3_uris)

        return input_channel, spark_opt
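
To illustrate the return value described in the docstring above, a hedged example (paths are placeholders) of mixing an S3 URI with a local jar:

# Hypothetical inputs:
#   submit_deps = ["s3://my-bucket/deps/json4s-native_2.11-3.6.9.jar", "lib/my-udfs.jar"]
#   input_channel_name = "jars"
# Roughly expected result, given the SDK's container input base path:
#   input_channel.source      -> "s3://<default-bucket>/<job-name>/input/jars"
#   input_channel.destination -> "/opt/ml/processing/input/jars"
#   spark_opt                 -> "s3://my-bucket/deps/json4s-native_2.11-3.6.9.jar,/opt/ml/processing/input/jars"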
Example #30
    def _write_to_remote_storage(cls, local, remote):
        # Currently, supports Amazon S3 exclusively.
        S3Uploader.upload(local, remote)