def from_string(cls,
                    constraint_violations_file_string,
                    kms_key=None,
                    file_name=None,
                    sagemaker_session=None):
        """Generates a ConstraintViolations object from an s3 uri.

        Args:
            constraint_violations_file_string (str): The body of the constraint violations JSON file.
            kms_key (str): The kms key to be used to encrypt the file in S3.
            file_name (str): The file name to use when uploading to S3.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.

        Returns:
            sagemaker.model_monitor.ConstraintViolations: The instance of ConstraintViolations
                generated from the string.

        """
        sagemaker_session = sagemaker_session or Session()
        file_name = file_name or "constraint_violations.json"
        desired_s3_uri = os.path.join("s3://",
                                      sagemaker_session.default_bucket(),
                                      "monitoring", str(uuid.uuid4()),
                                      file_name)
        s3_uri = S3Uploader.upload_string_as_file_body(
            body=constraint_violations_file_string,
            desired_s3_uri=desired_s3_uri,
            kms_key=kms_key,
            session=sagemaker_session,
        )

        return ConstraintViolations.from_s3_uri(
            constraint_violations_file_s3_uri=s3_uri,
            kms_key=kms_key,
            sagemaker_session=sagemaker_session,
        )
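A minimal usage sketch for from_string above (hedged: the JSON body is illustrative, and an AWS session with a default bucket is assumed):

from sagemaker.model_monitor import ConstraintViolations

# Illustrative body only; a real report is produced by a monitoring job.
violations_json = '{"violations": []}'

violations = ConstraintViolations.from_string(
    constraint_violations_file_string=violations_json,
    file_name="constraint_violations.json",
)
print(violations.file_s3_uri)  # s3://<default-bucket>/monitoring/<uuid>/constraint_violations.json
print(violations.body_dict)    # the parsed JSON body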
Example 2
    def save(self, new_save_location_s3_uri=None):
        """Save the current instance's body to s3 using the instance's s3 path.
        The S3 path can be overridden by providing one. This also overrides the
        default save location for this object.

        Args:
            new_save_location_s3_uri (str): Optional. The S3 path to save the file to. If not
                provided, the file is saved in place in S3. If provided, the file's S3 path is
                permanently updated.

        Returns:
            str: The s3 location to which the file was saved.

        """
        if new_save_location_s3_uri is not None:
            self.file_s3_uri = new_save_location_s3_uri

        return S3Uploader.upload_string_as_file_body(
            body=json.dumps(self.body_dict),
            desired_s3_uri=self.file_s3_uri,
            kms_key=self.kms_key,
            sagemaker_session=self.session,
        )
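A short sketch of how save() can be used on one of these monitoring file objects (the S3 URIs below are placeholders; an existing statistics file is assumed):

from sagemaker.model_monitor import Statistics

statistics = Statistics.from_s3_uri(
    statistics_file_s3_uri="s3://my-bucket/monitoring/statistics.json",  # placeholder URI
)
statistics.body_dict["dataset"]["item_count"] = 500  # edit the parsed body in memory
statistics.save()  # re-upload to the same S3 path
statistics.save(  # or save to a new path, which becomes the default location
    new_save_location_s3_uri="s3://my-bucket/monitoring/statistics-copy.json"
)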
def test_statistics_object_creation_from_s3_uri_without_customizations(
        sagemaker_session):
    with open(os.path.join(tests.integ.DATA_DIR, "monitor/statistics.json"),
              "r") as f:
        file_body = f.read()

    file_name = "statistics.json"
    desired_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-monitoring-files",
        str(uuid.uuid4()),
        file_name,
    )

    s3_uri = S3Uploader.upload_string_as_file_body(
        body=file_body, desired_s3_uri=desired_s3_uri)

    statistics = Statistics.from_s3_uri(statistics_file_s3_uri=s3_uri)

    assert statistics.file_s3_uri.startswith("s3://")
    assert statistics.file_s3_uri.endswith("statistics.json")

    assert statistics.body_dict["dataset"]["item_count"] == 418
def test_one_step_sparkjar_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    configuration,
    build_jar,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
    spark_path = os.path.join(DATA_DIR, "spark")

    spark_jar_processor = SparkJarProcessor(
        role=role,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version="2.4",
    )
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session,
        )
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(spark_path, "code", "java",
                                    "hello-java-spark")
    spark_run_args = spark_jar_processor.get_run_args(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    step_pyspark = ProcessingStep(
        name="sparkjar-process",
        processor=spark_jar_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case where the role used for the pipeline
        # execution is different from the one required by the steps in the pipeline
        # itself. The role in the pipeline definition needs to create training and
        # processing jobs and other SageMaker entities. However, the jobs created in
        # the steps themselves execute under a potentially different role, often
        # requiring access to S3 and other artifacts not required during creation of
        # the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(
            pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sparkjar-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Example 5
        def run(
            self,
            entry_point: str,
            source_dir: Optional[str] = None,
            dependencies: Optional[List[str]] = None,
            git_config: Optional[Dict[str, str]] = None,
            inputs: Optional[List[ProcessingInput]] = None,
            outputs: Optional[List[ProcessingOutput]] = None,
            arguments: Optional[List[str]] = None,
            wait: bool = True,
            logs: bool = True,
            job_name: Optional[str] = None,
            experiment_config: Optional[Dict[str, str]] = None,
            kms_key: Optional[str] = None,
        ):
            """Runs a processing job.

            Args:
                entry_point (str): Path (absolute or relative) to the local Python source
                    file which should be executed as the entry point to the processing
                    job. If ``source_dir`` is specified, then ``entry_point`` must point
                    to a file located at the root of ``source_dir``.
                source_dir (str): Path (absolute, relative or an S3 URI) to a directory
                    with any other source code dependencies aside from the entry
                    point file (default: None). If ``source_dir`` is an S3 URI, it must
                    point to a tar.gz file. The structure within this directory is
                    preserved when the job runs on Amazon SageMaker.
                dependencies (list[str]): A list of paths to directories (absolute
                    or relative) with any additional libraries that will be exported
                    to the container (default: []). The library folders will be
                    copied to SageMaker in the same folder where the entry point is
                    copied. If 'git_config' is provided, 'dependencies' should be a
                    list of relative locations to directories with any additional
                    libraries needed in the Git repo.
                git_config (dict[str, str]): Git configurations used for cloning
                    files, including ``repo``, ``branch``, ``commit``,
                    ``2FA_enabled``, ``username``, ``password`` and ``token``. The
                    ``repo`` field is required. All other fields are optional.
                    ``repo`` specifies the Git repository where your training script
                    is stored. If you don't provide ``branch``, the default value
                    'master' is used. If you don't provide ``commit``, the latest
                    commit in the specified branch is used. .. admonition:: Example

                        The following config:

                        >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git',
                        >>>               'branch': 'test-branch-git-config',
                        >>>               'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'}

                        results in cloning the repo specified in 'repo', then
                        checking out the 'test-branch-git-config' branch, and
                        checking out the specified commit.

                    ``2FA_enabled``, ``username``, ``password`` and ``token`` are
                    used for authentication. For GitHub (or other Git) accounts, set
                    ``2FA_enabled`` to 'True' if two-factor authentication is
                    enabled for the account, otherwise set it to 'False'. If you do
                    not provide a value for ``2FA_enabled``, a default value of
                    'False' is used. CodeCommit does not support two-factor
                    authentication, so do not provide "2FA_enabled" with CodeCommit
                    repositories.

                    For GitHub and other Git repos, when SSH URLs are provided, it
                    doesn't matter whether 2FA is enabled or disabled; you should
                    either have no passphrase for the SSH key pairs, or have the
                    ssh-agent configured so that you will not be prompted for the SSH
                    passphrase when you run 'git clone' with SSH URLs. When
                    HTTPS URLs are provided: if 2FA is disabled, then either token
                    or username+password will be used for authentication if provided
                    (token prioritized); if 2FA is enabled, only token will be used
                    for authentication if provided. If the required authentication
                    info is not provided, the Python SDK will try to use local
                    credential storage to authenticate. If that also fails, an
                    error is raised.

                    For CodeCommit repos, 2FA is not supported, so '2FA_enabled'
                    should not be provided. There is no token in CodeCommit, so
                    'token' should not be provided either. When 'repo' is an SSH URL,
                    the requirements are the same as for GitHub-like repos. When
                    'repo' is an HTTPS URL, username+password will be used for
                    authentication if they are provided; otherwise, the Python SDK
                    will try to use either the CodeCommit credential helper or local
                    credential storage for authentication.
                inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
                    the processing job. These must be provided as
                    :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
                outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
                    the processing job. These can be specified as either path strings or
                    :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
                arguments (list[str]): A list of string arguments to be passed to a
                    processing job (default: None).
                wait (bool): Whether the call should wait until the job completes (default: True).
                logs (bool): Whether to show the logs produced by the job.
                    Only meaningful when wait is True (default: True).
                job_name (str): Processing job name. If not specified, the processor generates
                    a default job name, based on the base job name and current timestamp.
                experiment_config (dict[str, str]): Experiment management configuration.
                    Dictionary contains three optional keys:
                    'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
                kms_key (str): The ARN of the KMS key that is used to encrypt the
                    user code file (default: None).
            """
            if job_name is None:
                job_name = self._generate_current_job_name()

            estimator = self._upload_payload(entry_point, source_dir,
                                             dependencies, git_config,
                                             job_name)
            inputs = self._patch_inputs_with_payload(
                inputs,
                estimator._hyperparameters["sagemaker_submit_directory"])

            # Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
            s3_runproc_sh = S3Uploader.upload_string_as_file_body(
                self.runproc_sh.format(entry_point=entry_point),
                desired_s3_uri=f"{self.s3_prefix}/{job_name}/source/runproc.sh",
                sagemaker_session=self.sagemaker_session,
            )
            self.logger.info("runproc.sh uploaded to %s", s3_runproc_sh)

            # Submit a processing job.
            super().run(
                code=s3_runproc_sh,
                inputs=inputs,
                outputs=outputs,
                arguments=arguments,
                wait=wait,
                logs=logs,
                job_name=job_name,
                experiment_config=experiment_config,
                kms_key=kms_key,
            )
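A hypothetical call to the run() wrapper above (the processor instance name and local paths are assumptions, not taken from the snippet):

# `processor` is assumed to be an instance of the wrapper class defining run().
processor.run(
    entry_point="preprocess.py",              # script at the root of source_dir
    source_dir="src",                         # local directory uploaded as the payload
    arguments=["--train-test-split", "0.2"],  # forwarded to the processing job
    wait=True,
    logs=True,
)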
def test_sagemaker_pyspark_multinode(tag, role, image_uri, configuration,
                                     sagemaker_session, region,
                                     sagemaker_client):
    """Test that basic multinode case works on 32KB of data"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket,
                                                  spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=[
            "test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark.latest_job

    s3_client = boto3.client("s3", region_name=region)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900

    while not processing_job_not_fail_or_complete(sagemaker_client,
                                                  processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket,
                                          Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # for some reason the first file size returned by list_objects is
            # always 0; this loop skips it.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    print("\n##### Latest file size is " +
                          str(event_log_file["Size"]))
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            print("\n##### S3 file updated.")
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    print("\n##### file_size {} updated_times_count {}".format(
        file_size, updated_times_count))
    assert file_size != 0

    # Commenting this assert because it's flaky.
    # assert updated_times_count > 1

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_directory(
        sagemaker_session, s3_files_kms_key):
    my_uuid = str(uuid.uuid4())
    my_inner_directory_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-s3-list",
        my_uuid,
        my_inner_directory_uuid,
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        session=sagemaker_session,
    )

    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(s3_uri=s3_uris[0],
                                                 session=sagemaker_session)
    assert file_2_body == S3Downloader.read_file(s3_uri=s3_uris[1],
                                                 session=sagemaker_session)

    s3_directory_with_directory_underneath = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list",
        my_uuid)

    S3Downloader.download(
        s3_uri=s3_directory_with_directory_underneath,
        local_path=TMP_BASE_PATH,
        session=sagemaker_session,
    )

    with open(
            os.path.join(TMP_BASE_PATH, my_inner_directory_uuid, file_1_name),
            "r") as f:
        assert file_1_body == f.read()

    with open(
            os.path.join(TMP_BASE_PATH, my_inner_directory_uuid, file_2_name),
            "r") as f:
        assert file_2_body == f.read()
def test_sagemaker_pyspark_multinode(spark_py_processor, sagemaker_session,
                                     configuration):
    """Test that basic multinode case works on 32KB of data"""
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    spark_event_logs_key_prefix = f"spark/spark-events/{timestamp}"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    spark_py_processor.run(
        submit_app=os.path.join(SPARK_PATH, "code", "python", "hello_py_spark",
                                "hello_py_spark_app.py"),
        submit_py_files=[
            os.path.join(SPARK_PATH, "code", "python", "hello_py_spark",
                         "hello_py_spark_udfs.py")
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark_py_processor.latest_job

    s3_client = boto3.client(
        "s3",
        region_name=spark_py_processor.sagemaker_session.boto_region_name)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(
            sagemaker_session.sagemaker_client, processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket,
                                          Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # for some reason the first file size returned by list_objects is
            # always 0; this loop skips it.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    assert file_size != 0

    output_contents = S3Downloader.list(output_data_uri,
                                        sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same prefix used in testendpoint.py

    # Load config from environment and set required defaults
    # AWS-specific
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(train_data['train']['test'],
                             'test.csv',
                             sagemaker_session=sm_session)
    print(f"Loadding {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # list the capture files (mostly as an example); only the ones matching
    # capture_prefix below are actually processed
    capture_files = sorted(
        S3Downloader.list("{}/{}".format(
            deploy_data['monitor']['s3_capture_upload_path'],
            deploy_data['endpoint']['name']),
                          sagemaker_session=sm_session))
    # just the files with the prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(l) for l in content.split("\n")[:-1]]

        capture_records.extend(records)

    print(f"No. of records {len(capture_records)} captured")
    captured_predictions = {}

    for obj in capture_records:
        # Extract inference ID
        inference_id = obj["eventMetadata"]["inferenceId"]
        # the current version of the script starts at 1 when id=0
        # remove the prefix and get the numeric id
        req_id = int(inference_id[len(inference_id_prefix):])

        # Extract result given by the model
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            # the content type is hard-coded here because
            # obj["captureData"]["endpointOutput"]["observedContentType"]
            # sometimes includes the encoding, e.g. "text/csv; utf-8",
            # which makes encoders.decode() raise an error
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'], capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(data_to_upload,
                                          target_s3_uri,
                                          sagemaker_session=sm_session)
    print("Done !")
Example 10
def test_one_step_ingestion_pipeline(sagemaker_session, feature_store_session,
                                     feature_definitions, role, pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join("s3://", sagemaker_session.default_bucket(),
                                  "py-sdk-ingestion-test-input/features.csv")
    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(
                feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        json.dump(ingestion_only_flow, open(temp_flow_path, "w"))

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(name="ingestion-step",
                                            processor=data_wrangler_processor,
                                            inputs=inputs,
                                            outputs=outputs)

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(),
                feature_group_name)
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
def test_model_registration_with_drift_check_baselines(
    sagemaker_session,
    role,
    pipeline_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")

    # upload model data to s3
    model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz")
    model_base_uri = "s3://{}/{}/input/model/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("model"),
    )
    model_uri = S3Uploader.upload(model_local_path,
                                  model_base_uri,
                                  sagemaker_session=sagemaker_session)
    model_uri_param = ParameterString(name="model_uri",
                                      default_value=model_uri)

    # upload metrics to s3
    metrics_data = (
        '{"regression_metrics": {"mse": {"value": 4.925353410353891, '
        '"standard_deviation": 2.219186917819692}}}')
    metrics_base_uri = "s3://{}/{}/input/metrics/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("metrics"),
    )
    metrics_uri = S3Uploader.upload_string_as_file_body(
        body=metrics_data,
        desired_s3_uri=metrics_base_uri,
        sagemaker_session=sagemaker_session,
    )
    metrics_uri_param = ParameterString(name="metrics_uri",
                                        default_value=metrics_uri)

    model_metrics = ModelMetrics(
        bias=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    drift_check_baselines = DriftCheckBaselines(
        model_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    customer_metadata_properties = {"key1": "value1"}
    estimator = XGBoost(
        entry_point="training.py",
        source_dir=os.path.join(DATA_DIR, "sip"),
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )
    step_register = RegisterModel(
        name="MyRegisterModelStep",
        estimator=estimator,
        model_data=model_uri_param,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="testModelPackageGroup",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        customer_metadata_properties=customer_metadata_properties,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            model_uri_param,
            metrics_uri_param,
            instance_type,
            instance_count,
        ],
        steps=[step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        for _ in retries(
                max_retry_count=5,
                exception_message_prefix="Waiting for a successful execution of pipeline",
                seconds_to_sleep=10,
        ):
            execution = pipeline.start(parameters={
                "model_uri": model_uri,
                "metrics_uri": metrics_uri
            })
            response = execution.describe()

            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if failure_reason != "":
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}."
                    " Retrying..")
                continue
            assert execution_steps[0]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["StepName"] == "MyRegisterModelStep"

            response = sagemaker_session.sagemaker_client.describe_model_package(
                ModelPackageName=execution_steps[0]["Metadata"]["RegisterModel"]["Arn"]
            )

            assert (response["ModelMetrics"]["Explainability"]["Report"]
                    ["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["Bias"][
                "PreTrainingConstraints"]["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["Explainability"]
                    ["Constraints"]["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["ModelQuality"]
                    ["Statistics"]["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["ModelDataQuality"]
                    ["Statistics"]["ContentType"] == "application/json")
            assert response[
                "CustomerMetadataProperties"] == customer_metadata_properties
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )

    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    if wait:
        execution.wait()

    return execution_arn