def get_node_to_idx_mapping(training_job_name: str = None, dataprocessing_job_name: str = None,
                            model_artifacts_location: str = './model-artifacts', vertex_label: str = None):
    assert training_job_name is not None or dataprocessing_job_name is not None, \
        "You must provide either a modeltraining job id or a dataprocessing job id to obtain node to index mappings"

    job_name = training_job_name if training_job_name is not None else dataprocessing_job_name
    job_type = "modeltraining" if training_job_name == job_name else "dataprocessing"
    filename = "mapping.info" if training_job_name == job_name else "info.pkl"
    mapping_key = "node2id" if training_job_name == job_name else "node_id_map"

    # get mappings
    model_artifacts_location = os.path.join(model_artifacts_location, job_name)
    if not os.path.exists(os.path.join(model_artifacts_location, filename)):
        job_s3_output = get_neptune_ml_job_output_location(job_name, job_type)
        print(job_s3_output)
        if not job_s3_output:
            return
        S3Downloader.download(os.path.join(job_s3_output, filename), model_artifacts_location)

    with open(os.path.join(model_artifacts_location, filename), "rb") as f:
        mapping = pickle.load(f)[mapping_key]
        if vertex_label is not None:
            if vertex_label in mapping:
                mapping = mapping[vertex_label]
            else:
                print("Mapping for vertex label: {} not found.".format(vertex_label))
                print("valid vertex labels which have vertices mapped to embeddings: {} ".format(list(mapping.keys())))
                print("Returning mapping for all valid vertex labels")

    return mapping
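A minimal usage sketch for the helper above; the training job name and vertex label are hypothetical placeholders, not values from this listing:

# Hypothetical usage: look up the embedding index for each vertex of one label.
movie_mapping = get_node_to_idx_mapping(training_job_name="my-neptune-ml-training-job",
                                        vertex_label="Movie")
if movie_mapping is not None:
    print(f"{len(movie_mapping)} 'Movie' vertices are mapped to embedding indices")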
Example 2
def copy_sample_flow_to_local(workspace, local_dir):

    config = BPConfig.get_config(workspace, local_dir)

    fname = f"{local_dir}/{FLOW_NAME}"
    flow_uri = f"s3://{workspace}/{config.ws_prefix()}/meta/{FLOW_NAME}"
    S3Downloader.download(flow_uri, local_dir)

    # Change the flow definition so that it references the dataset copied over by the user
    def _update_sample_flow_def(fname, s3_uri):

        with open(fname, 'r+') as f:
            flow_def = json.loads(f.read())

            nodes = flow_def["nodes"]

            for n in nodes:
                if n["type"] == "SOURCE":
                    data_def = n["parameters"]["dataset_definition"]
                    dstype = data_def["datasetSourceType"]
                    if dstype == "S3":
                        data_def["s3ExecutionContext"]["s3Uri"] = s3_uri
            f.seek(0)
            f.write(json.dumps(flow_def))
            f.truncate()

    _update_sample_flow_def(fname, config.sample_data_uri())

    return fname
Example 3
def download_folder(Filename, Bucket, Key, session):
    if isinstance(session, boto3.Session):
        session = sagemaker.Session(boto_session=session)
    ensure_path(Filename)
    S3Downloader.download(s3_uri=f"s3://{Bucket}/{Key}",
                          local_path=Filename,
                          sagemaker_session=session)
Example 4
def download_model(model_data, local_path=".", unzip=False, sagemaker_session=None, model_dir="model"):
    """Downloads model file from sagemaker training to local directory and unzips its to directory if wanted."""
    S3Downloader.download(
        s3_uri=model_data, local_path=os.path.join(local_path, model_dir), sagemaker_session=sagemaker_session
    )
    if unzip:
        with tarfile.open(os.path.join(local_path, model_dir, "model.tar.gz"), "r:gz") as model_zip:
            model_zip.extractall(path=os.path.join(local_path, model_dir))
        os.remove(os.path.join(local_path, model_dir, "model.tar.gz"))
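A usage sketch for download_model; the model artifact URI below is a placeholder and assumes a completed training job:

# Hypothetical usage: fetch model.tar.gz from a finished training job and unpack it.
model_data = "s3://my-bucket/my-training-job/output/model.tar.gz"  # placeholder URI
download_model(model_data, local_path="./artifacts", unzip=True)
# ./artifacts/model/ should now contain the extracted model files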
Example 5
    def _download_bp_config(cls, config_uri=None):

        if not config_uri:
            config_uri = cls.default_config_uri(cls.workspace)

        S3Downloader.download(config_uri, cls.local_dir)

        fname = f"{cls.local_dir}/blueprint-config.json"
        return fname
Example 6
def load_dataset(
    s3_uri: str, filename: str, sagemaker_session=None
) -> pd.DataFrame:
    """Load a data set from a S3 uri"""
    S3Downloader.download(
        s3_uri, tempfile.gettempdir(),
        sagemaker_session=sagemaker_session)
    dataset_filename = os.path.join(
        tempfile.gettempdir(), filename)
    return pd.read_csv(dataset_filename, header=None)
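A usage sketch for load_dataset; the S3 prefix and file name are placeholders and assume a headerless CSV was written there:

# Hypothetical usage: read a headerless test split written by a processing job.
test_df = load_dataset("s3://my-bucket/processing/output/test", "test.csv")
print(test_df.shape)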
Example 7
def test_download(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri,
                          local_path="/path/for/download/",
                          session=sagemaker_session)
    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args=None,
    )
Example 8
def test_download_with_kms_key(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri,
                          local_path="/path/for/download/",
                          kms_key=KMS_KEY,
                          session=sagemaker_session)
    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args={"SSECustomerKey": KMS_KEY},
    )
Example 9
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )

    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()

    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
Example 10
    def _download_clarify_xai_summary(self):

        try:

            summary_uri = f"s3://{self.bucket}/{self.xai_prefix}/analysis.json"
            S3Downloader.download(summary_uri, os.getcwd())

            with open('analysis.json', 'r') as f:
                summary = json.loads(f.read())

            return summary

        except Exception as e:
            print(f"{e}: Failed to download {xai_summary}")
def get_embeddings(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the embeddings

    S3Downloader.download(os.path.join(training_job_s3_output, "embeddings/"),
                          os.path.join(download_location, "embeddings/"))

    entity_emb = np.load(os.path.join(download_location, "embeddings", "entity.npy"))

    return entity_emb
    def from_s3_uri(cls,
                    constraints_file_s3_uri,
                    kms_key=None,
                    sagemaker_session=None):
        """Generates a Constraints object from an s3 uri.

        Args:
            constraints_file_s3_uri (str): The uri of the constraints JSON file.
            kms_key (str): The kms key to be used to decrypt the file in S3.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.

        Returns:
            sagemaker.model_monitor.Constraints: The instance of Constraints generated from
                the s3 uri.

        """
        try:
            body_dict = json.loads(
                S3Downloader.read_file(s3_uri=constraints_file_s3_uri,
                                       session=sagemaker_session))
        except ClientError as error:
            print("\nCould not retrieve constraints file at location '{}'. "
                  "To manually retrieve Constraints object from a given uri, "
                  "use 'my_model_monitor.constraints(my_s3_uri)' or "
                  "'Constraints.from_s3_uri(my_s3_uri)'".format(
                      constraints_file_s3_uri))
            raise error

        return cls(body_dict=body_dict,
                   constraints_file_s3_uri=constraints_file_s3_uri,
                   kms_key=kms_key)
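A usage sketch for from_s3_uri; the constraints URI is a placeholder and assumes a baselining job has already produced constraints.json:

# Hypothetical usage: load baseline constraints for model monitoring.
constraints = Constraints.from_s3_uri(
    constraints_file_s3_uri="s3://my-bucket/baselining/constraints.json"  # placeholder
)
print(list(constraints.body_dict.keys()))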
def get_performance_metrics(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the evaluation metrics

    S3Downloader.download(os.path.join(training_job_s3_output, "eval_metrics_info.json"),
                          download_location)

    with open(os.path.join(download_location, "eval_metrics_info.json")) as f:
        metrics = json.load(f)

    return metrics
def get_predictions(training_job_name: str, download_location: str = './model-artifacts', class_preds: bool = False):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the predictions

    S3Downloader.download(os.path.join(training_job_s3_output, "predictions/"),
                          os.path.join(download_location, "predictions/"))

    preds = np.load(os.path.join(download_location, "predictions", "result.npz"))['infer_scores']

    if class_preds:
        return preds.argmax(axis=1)

    return preds
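A usage sketch for get_predictions; the training job name is a placeholder. With class_preds=True the raw scores are collapsed to hard labels via argmax:

# Hypothetical usage: fetch raw scores and hard class predictions for the same job.
scores = get_predictions("my-neptune-ml-training-job")                    # shape (num_nodes, num_classes)
labels = get_predictions("my-neptune-ml-training-job", class_preds=True)  # argmax over the class axis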
def test_sagemaker_pyspark_sse_s3(tag, role, image_uri, sagemaker_session,
                                  region, sagemaker_client):
    """Test that Spark container can read and write S3 data encrypted with SSE-S3 (default AES256 encryption)"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    input_data_key = f"spark/input/sales/{timestamp}/data.jsonl"
    input_data_uri = f"s3://{bucket}/{input_data_key}"
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    s3_client = sagemaker_session.boto_session.client("s3", region_name=region)
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        s3_client.put_object(Body=body,
                             Bucket=bucket,
                             Key=input_data_key,
                             ServerSideEncryption="AES256")

    spark.run(
        submit_app=
        "test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=[
            "test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration={
            "Classification": "core-site",
            "Properties": {
                "fs.s3a.server-side-encryption-algorithm": "AES256"
            },
        },
    )
    processing_job = spark.latest_job

    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={
            "Delay": 15,
            "MaxAttempts": 60
        },
    )

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
Example 16
def get_last_object_by_name(s3_location):

    import os
    from sagemaker.s3 import S3Downloader as s3down

    object_list = s3down.list(s3_location)

    object_list.sort()
    obj = object_list.pop()

    return obj
Example 17
def get_object_path_by_filename(s3_location, filename):

    import os
    from sagemaker.s3 import S3Downloader as s3down

    object_list = s3down.list(s3_location)

    for url in object_list:
        if os.path.basename(url) == filename:
            return url

    return None
def test_sagemaker_scala_jar_multinode(tag, role, image_uri, configuration,
                                       sagemaker_session, sagemaker_client):
    """Test SparkJarProcessor using Scala application jar with external runtime dependency jars staged by SDK"""
    spark = SparkJarProcessor(
        base_job_name="sm-spark-scala",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    bucket = spark.sagemaker_session.default_bucket()
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)
    output_data_uri = "s3://{}/spark/output/sales/{}".format(
        bucket,
        datetime.now().isoformat())

    scala_project_dir = "test/resources/code/scala/hello-scala-spark"
    spark.run(
        submit_app="{}/target/scala-2.11/hello-scala-spark_2.11-1.0.jar".
        format(scala_project_dir),
        submit_class="com.amazonaws.sagemaker.spark.test.HelloScalaSparkApp",
        submit_jars=[
            "{}/lib_managed/jars/org.json4s/json4s-native_2.11/json4s-native_2.11-3.6.9.jar"
            .format(scala_project_dir)
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )
    processing_job = spark.latest_job

    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={
            "Delay": 15,
            "MaxAttempts": 60
        },
    )

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def test_sagemaker_spark_processor_default_tag(spark_version, role,
                                               sagemaker_session,
                                               sagemaker_client):
    """Test that spark processor works with default tag"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=spark_version,
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket,
                                                  spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark.run(
        submit_app=
        "test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=[
            "test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=True,
    )

    processing_job = spark.latest_job
    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={
            "Delay": 15,
            "MaxAttempts": 60
        },
    )

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
Example 20
    def _inject_repack_script(self):
        """Injects the _repack_model.py script where it belongs.

        If the source_dir is an S3 path:
            1) downloads the source_dir tar.gz
            2) copies the _repack_model.py script where it belongs
            3) uploads the mutated source_dir

        If the source_dir is a local path:
            1) copies the _repack_model.py script into the source dir
        """
        fname = os.path.join(os.path.dirname(__file__), REPACK_SCRIPT)
        if self._source_dir.lower().startswith("s3://"):
            with tempfile.TemporaryDirectory() as tmp:
                local_path = os.path.join(tmp, "local.tar.gz")

                S3Downloader.download(
                    s3_uri=self._source_dir,
                    local_path=local_path,
                    sagemaker_session=self._estimator.sagemaker_session,
                )

                src_dir = os.path.join(tmp, "src")
                with tarfile.open(name=local_path, mode="r:gz") as tf:
                    tf.extractall(path=src_dir)

                shutil.copy2(fname, os.path.join(src_dir, REPACK_SCRIPT))
                with tarfile.open(name=local_path, mode="w:gz") as tf:
                    tf.add(src_dir, arcname=".")

                S3Uploader.upload(
                    local_path=local_path,
                    desired_s3_uri=self._source_dir,
                    sagemaker_session=self._estimator.sagemaker_session,
                )
        else:
            shutil.copy2(fname, os.path.join(self._source_dir, REPACK_SCRIPT))
def test_one_step_data_bias_pipeline_constraint_violation(
    sagemaker_session,
    role,
    pipeline_name,
    check_job_config,
    data_bias_check_config,
    supplied_baseline_constraints_uri_param,
):
    data_bias_supplied_baseline_constraints = Constraints.from_file_path(
        constraints_file_path=os.path.join(
            DATA_DIR,
            "pipeline/clarify_check_step/data_bias/bad_cases/analysis.json"),
        sagemaker_session=sagemaker_session,
    ).file_s3_uri
    data_bias_check_step = ClarifyCheckStep(
        name="DataBiasCheckStep",
        clarify_check_config=data_bias_check_config,
        check_job_config=check_job_config,
        skip_check=False,
        register_new_baseline=False,
        supplied_baseline_constraints=supplied_baseline_constraints_uri_param,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[data_bias_check_step],
        parameters=[supplied_baseline_constraints_uri_param],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        monitoring_analysis_cfg_json = S3Downloader.read_file(
            data_bias_check_config.monitoring_analysis_config_uri,
            sagemaker_session,
        )
        monitoring_analysis_cfg = json.loads(monitoring_analysis_cfg_json)

        assert monitoring_analysis_cfg is not None and len(
            monitoring_analysis_cfg) > 0

        for _ in retries(
                max_retry_count=5,
                exception_message_prefix=
                "Waiting for a successful execution of pipeline",
                seconds_to_sleep=10,
        ):
            execution = pipeline.start(parameters={
                "SuppliedBaselineConstraintsUri":
                data_bias_supplied_baseline_constraints
            }, )
            response = execution.describe()

            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if _CHECK_FAIL_ERROR_MSG not in failure_reason:
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}. Retrying.."
                )
                continue
            assert execution_steps[0]["StepName"] == "DataBiasCheckStep"
            assert execution_steps[0]["StepStatus"] == "Failed"
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_sagemaker_pyspark_multinode(tag, role, image_uri, configuration,
                                     sagemaker_session, region,
                                     sagemaker_client):
    """Test that basic multinode case works on 32KB of data"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket,
                                                  spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark.run(
        submit_app=
        "test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=[
            "test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark.latest_job

    s3_client = boto3.client("s3", region_name=region)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900

    while not processing_job_not_fail_or_complete(sagemaker_client,
                                                  processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket,
                                          Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # for some reason the first file size returned by list_objects is always 0;
            # this loop skips it and records the latest non-zero size.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    print("\n##### Latest file size is " +
                          str(event_log_file["Size"]))
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            print("\n##### S3 file updated.")
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    print("\n##### file_size {} updated_times_count {}".format(
        file_size, updated_times_count))
    assert file_size != 0

    # Commenting this assert because it's flaky.
    # assert updated_times_count > 1

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
Example 23
def example_run_training_pipeline(workflow_arn, region):
    """
    Execute the workflow, which consists of five steps:

    1. Define job names for pre-processing, training, and evaluation
    2. Upload source code for pre-processing, training, and evaluation
    3. Define URLs for the input, output, and intermediary data
    4. Execute the workflow with populated parameters, and monitor the progress
    5. Inspect the evaluation result when the execution is completed
    """

    training_pipeline = get_existing_training_pipeline(workflow_arn)

    # Step 1 - Generate unique names for the pre-processing, training, and evaluation jobs
    unique_id = uuid.uuid1().hex
    # pipeline_job_name = f"pipeline-job-{unique_id}"
    training_job_name = f"scikit-learn-training-{unique_id}"
    preprocessing_job_name = f"scikit-learn-sm-preprocessing-{unique_id}"
    evaluation_job_name = f"scikit-learn-sm-evaluation-{unique_id}"

    # Step 2 - Upload source code (pre-processing, evaluation, and training) to SageMaker
    PREPROCESSING_SCRIPT_LOCATION = "../../src/mlmax/preprocessing.py"
    EVALUATION_SCRIPT_LOCATION = "../../src/mlmax/evaluation.py"
    TRAINING_SCRIPT_LOCATION = "../../src/mlmax/train.py"

    sagemaker_session = sagemaker.Session()
    input_preprocessing_code = sagemaker_session.upload_data(
        PREPROCESSING_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{preprocessing_job_name}/source",
    )
    input_evaluation_code = sagemaker_session.upload_data(
        EVALUATION_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{evaluation_job_name}/source",
    )
    s3_bucket_base_uri = f"s3://{sagemaker_session.default_bucket()}"
    sm_submit_dir_url = (
        f"{s3_bucket_base_uri}/{training_job_name}/source/sourcedir.tar.gz")
    tar = tarfile.open("/tmp/sourcedir.tar.gz", "w:gz")
    # TODO need to add directory if source_dir is specified.
    tar.add(TRAINING_SCRIPT_LOCATION, arcname="train.py")
    tar.close()
    sagemaker_session.upload_data(
        "/tmp/sourcedir.tar.gz",
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{training_job_name}/source",
    )

    # Step 3 - Define data URLs; the preprocessed data URLs can be made
    # specific to this training job
    input_data = (
        f"s3://sagemaker-sample-data-{region}/processing/census/census-income.csv"
    )
    output_data = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"
    preprocessed_training_data = f"{output_data}/train_data"
    preprocessed_test_data = f"{output_data}/test_data"
    preprocessed_model_url = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"
    # Step 4 - Execute workflow
    print(f"Training Job Name is {training_job_name}")
    execution = training_pipeline.execute(
        inputs={
            "InputDataURL":
            input_data,
            # Each pre processing job (SageMaker processing job) requires a unique name,
            "PreprocessingJobName":
            preprocessing_job_name,
            "PreprocessingCodeURL":
            input_preprocessing_code,
            # Each Sagemaker Training job requires a unique name,
            "TrainingJobName":
            training_job_name,
            "SMSubmitDirURL":
            sm_submit_dir_url,
            "SMRegion":
            region,
            # Each SageMaker processing job requires a unique name,
            "EvaluationProcessingJobName":
            evaluation_job_name,
            "EvaluationCodeURL":
            input_evaluation_code,
            "EvaluationResultURL": (
                f"{s3_bucket_base_uri}/{training_job_name}/evaluation"),
            "PreprocessedTrainDataURL":
            preprocessed_training_data,
            "PreprocessedTestDataURL":
            preprocessed_test_data,
            "PreprocessedModelURL":
            preprocessed_model_url,
            "SMOutputDataURL":
            f"{s3_bucket_base_uri}/",
            "SMDebugOutputURL":
            f"{s3_bucket_base_uri}/",
        })
    execution.get_output(wait=True)
    execution.render_progress()

    # Step 5 - Inspect the output of the Workflow execution
    workflow_execution_output_json = execution.get_output(wait=True)
    import json

    from sagemaker.s3 import S3Downloader

    evaluation_output_config = workflow_execution_output_json[
        "ProcessingOutputConfig"]
    for output in evaluation_output_config["Outputs"]:
        if output["OutputName"] == "evaluation":
            evaluation_s3_uri = "{}/{}".format(output["S3Output"]["S3Uri"],
                                               "evaluation.json")
            break

    evaluation_output = S3Downloader.read_file(evaluation_s3_uri)
    evaluation_output_dict = json.loads(evaluation_output)
    print(json.dumps(evaluation_output_dict, sort_keys=True, indent=4))
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same used in testendpoint.py

    # Load config from environment and set required defaults
    # AWS specific
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(train_data['train']['test'],
                             'test.csv',
                             sagemaker_session=sm_session)
    print(f"Loadding {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # list the capture files; this is just an example and is not used
    # right now, but it could be.
    capture_files = sorted(
        S3Downloader.list("{}/{}".format(
            deploy_data['monitor']['s3_capture_upload_path'],
            deploy_data['endpoint']['name']),
                          sagemaker_session=sm_session))
    # just the files with the prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(l) for l in content.split("\n")[:-1]]

        capture_records.extend(records)

    print(f"No. of records {len(capture_records)} captured")
    captured_predictions = {}

    for obj in capture_records:
        # Extract inference ID
        inference_id = obj["eventMetadata"]["inferenceId"]
        # the current version of the script starts at 1 when id=0;
        # strip the prefix to recover the numeric request id
        req_id = int(inference_id[len(inference_id_prefix):])

        # Extract result given by the model
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            # the content type is hard-coded here because
            # obj["captureData"]["endpointOutput"]["observedContentType"]
            # sometimes includes the encoding (e.g. "text/csv; utf-8"),
            # which makes encoders.decode() raise an error.
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'], capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(data_to_upload,
                                          target_s3_uri,
                                          sagemaker_session=sm_session)
    print("Done !")
def test_sagemaker_pyspark_multinode(spark_py_processor, sagemaker_session,
                                     configuration):
    """Test that basic multinode case works on 32KB of data"""
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    spark_event_logs_key_prefix = f"spark/spark-events/{timestamp}"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark_py_processor.run(
        submit_app=os.path.join(SPARK_PATH, "code", "python", "hello_py_spark",
                                "hello_py_spark_app.py"),
        submit_py_files=[
            os.path.join(SPARK_PATH, "code", "python", "hello_py_spark",
                         "hello_py_spark_udfs.py")
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark_py_processor.latest_job

    s3_client = boto3.client(
        "s3",
        region_name=spark_py_processor.sagemaker_session.boto_region_name)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(
            sagemaker_session.sagemaker_client, processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket,
                                          Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # for some reason the first file size returned by list_objects is always 0;
            # this loop skips it and records the latest non-zero size.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    assert file_size != 0

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0