def test_one_step_ingestion_pipeline(
    sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "py-sdk-ingestion-test-input/features.csv",
    )

    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session,
        )

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
        )
    ]

    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        with open(temp_flow_path, "w") as f:
            json.dump(ingestion_only_flow, f)

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(
            name="ingestion-step",
            processor=data_wrangler_processor,
            inputs=inputs,
            outputs=outputs,
            job_arguments=job_argument,
        )

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(), feature_group_name
            )
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get(
                    "Status"
                ).get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
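
# The test above relies on two helpers defined elsewhere in the test module:
# `cleanup_feature_group` and `_wait_for_feature_group_create`. A minimal sketch of
# what the wait helper might look like (an assumption for illustration, not the
# module's actual code), polling FeatureGroup.describe() until creation finishes:
def _wait_for_feature_group_create(feature_group, poll_seconds=5):
    # FeatureGroup.describe() returns a dict that includes "FeatureGroupStatus"
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        time.sleep(poll_seconds)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}: {status}")
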
def test_three_step_definition(
    sagemaker_session,
    region_name,
    role,
    script_dir,
    pipeline_name,
    athena_dataset_definition,
):
    framework_version = "0.20.0"
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    output_prefix = ParameterString(name="OutputPrefix", default_value="output")

    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="my-process",
        display_name="ProcessingStep",
        description="description for Processing step",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
            ProcessingInput(dataset_definition=athena_dataset_definition),
        ],
        outputs=[
            ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
            ProcessingOutput(
                output_name="test_data",
                source="/opt/ml/processing/test",
                destination=Join(
                    on="/",
                    values=[
                        "s3:/",
                        sagemaker_session.default_bucket(),
                        "test-sklearn",
                        output_prefix,
                        ExecutionVariables.PIPELINE_EXECUTION_ID,
                    ],
                ),
            ),
        ],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    sklearn_train = SKLearn(
        framework_version=framework_version,
        entry_point=os.path.join(script_dir, "train.py"),
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_train = TrainingStep(
        name="my-train",
        display_name="TrainingStep",
        description="description for Training step",
        estimator=sklearn_train,
        inputs=TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "train_data"
            ].S3Output.S3Uri
        ),
    )

    model = Model(
        image_uri=sklearn_train.image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="my-model",
        display_name="ModelStep",
        description="description for Model step",
        model=model,
        inputs=model_inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_type, instance_count, output_prefix],
        steps=[step_process, step_train, step_model],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    assert definition["Version"] == "2020-12-01"

    assert set(tuple(param.items()) for param in definition["Parameters"]) == set(
        [
            tuple(
                {
                    "Name": "InstanceType",
                    "Type": "String",
                    "DefaultValue": "ml.m5.xlarge",
                }.items()
            ),
            tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()),
            tuple(
                {
                    "Name": "OutputPrefix",
                    "Type": "String",
                    "DefaultValue": "output",
                }.items()
            ),
        ]
    )

    steps = definition["Steps"]
    assert len(steps) == 3

    names_and_types = []
    display_names_and_desc = []
    processing_args = {}
    training_args = {}
    for step in steps:
        names_and_types.append((step["Name"], step["Type"]))
        display_names_and_desc.append((step["DisplayName"], step["Description"]))
        if step["Type"] == "Processing":
            processing_args = step["Arguments"]
        if step["Type"] == "Training":
            training_args = step["Arguments"]
        if step["Type"] == "Model":
            model_args = step["Arguments"]

    assert set(names_and_types) == set(
        [
            ("my-process", "Processing"),
            ("my-train", "Training"),
            ("my-model", "Model"),
        ]
    )

    assert set(display_names_and_desc) == set(
        [
            ("ProcessingStep", "description for Processing step"),
            ("TrainingStep", "description for Training step"),
            ("ModelStep", "description for Model step"),
        ]
    )
    assert processing_args["ProcessingResources"]["ClusterConfig"] == {
        "InstanceType": {"Get": "Parameters.InstanceType"},
        "InstanceCount": {"Get": "Parameters.InstanceCount"},
        "VolumeSizeInGB": 30,
    }

    assert training_args["ResourceConfig"] == {
        "InstanceCount": 1,
        "InstanceType": {"Get": "Parameters.InstanceType"},
        "VolumeSizeInGB": 30,
    }
    assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == {
        "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri"
    }
    assert model_args["PrimaryContainer"]["ModelDataUrl"] == {
        "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts"
    }
    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_steps_with_map_params_pipeline(
    sagemaker_session,
    role,
    script_dir,
    pipeline_name,
    region_name,
    athena_dataset_definition,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    framework_version = "0.20.0"
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    output_prefix = ParameterString(name="OutputPrefix", default_value="output")
    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="my-process",
        display_name="ProcessingStep",
        description="description for Processing step",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
            ProcessingInput(dataset_definition=athena_dataset_definition),
        ],
        outputs=[
            ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
            ProcessingOutput(
                output_name="test_data",
                source="/opt/ml/processing/test",
                destination=Join(
                    on="/",
                    values=[
                        "s3:/",
                        sagemaker_session.default_bucket(),
                        "test-sklearn",
                        output_prefix,
                        ExecutionVariables.PIPELINE_EXECUTION_ID,
                    ],
                ),
            ),
        ],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    sklearn_train = SKLearn(
        framework_version=framework_version,
        entry_point=os.path.join(script_dir, "train.py"),
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        role=role,
        hyperparameters={
            "batch-size": 500,
            "epochs": 5,
        },
    )
    step_train = TrainingStep(
        name="my-train",
        display_name="TrainingStep",
        description="description for Training step",
        estimator=sklearn_train,
        inputs=TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "train_data"
            ].S3Output.S3Uri
        ),
    )

    model = Model(
        image_uri=sklearn_train.image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="my-model",
        display_name="ModelStep",
        description="description for Model step",
        model=model,
        inputs=model_inputs,
    )

    # Condition step that compares the training step's "batch-size" hyperparameter
    # to a threshold and branches the execution accordingly
    cond_gte = ConditionGreaterThanOrEqualTo(
        left=step_train.properties.HyperParameters["batch-size"],
        right=6.0,
    )

    step_cond = ConditionStep(
        name="CustomerChurnAccuracyCond",
        conditions=[cond_gte],
        if_steps=[],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_type, instance_count, output_prefix],
        steps=[step_process, step_train, step_cond],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    assert definition["Version"] == "2020-12-01"

    steps = definition["Steps"]
    assert len(steps) == 3
    training_args = {}
    condition_args = {}
    for step in steps:
        if step["Type"] == "Training":
            training_args = step["Arguments"]
        if step["Type"] == "Condition":
            condition_args = step["Arguments"]

    assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == {
        "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri"
    }
    assert condition_args["Conditions"][0]["LeftValue"] == {
        "Get": "Steps.my-train.HyperParameters['batch-size']"
    }

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Example 4
    def runProcessing(
            self,
            entrypoint=None,
            command=None,
            env=None,
            code=None,
            arguments=None,
            inputs=None,
            outputs=None,
            instance_type=constants.DEFAULT_INSTANCE_TYPE_TRAINING,
            instance_count=constants.DEFAULT_INSTANCE_COUNT,
            role_name=constants.DEFAULT_IAM_ROLE,
            volume_size=constants.DEFAULT_VOLUME_SIZE,
            max_run_mins=constants.DEFAULT_MAX_RUN,
            tags=None,
            input_distribution="FullyReplicated",
            dependencies=None,
    ):
        logger.info(
            f"===== Running a processing job {self.task_name} entrypoint={entrypoint} "
            f"command={command} code={code} arguments={arguments}... =====")
        job_name = self._getJobName()

        # Copy the mutable arguments so shared defaults are never modified in place
        env = dict(env) if env else {}
        inputs = list(inputs) if inputs else []
        outputs = list(outputs) if outputs else []
        tags = dict(tags) if tags else {}
        dependencies = list(dependencies) if dependencies else []

        # ## Outputs

        # state - continuously updated
        state_path = "/opt/ml/processing/state"
        outputs.append(
            ProcessingOutput(state_path, self.stateS3Uri, "state",
                             "Continuous"))
        env["SSM_STATE"] = state_path

        # output - copied by end of job
        output_path = "/opt/ml/processing/output"
        output_s3_uri = sagemaker.s3.s3_path_join(self.baseTaskS3Uri, job_name,
                                                  "output")
        outputs.append(
            ProcessingOutput(output_path, output_s3_uri, "output", "EndOfJob"))
        env["SSM_OUTPUT"] = output_path

        # ## Inputs

        # prev state
        bucket, prefix = sagemaker.s3.parse_s3_url(self.stateS3Uri)
        if self.smSession.list_s3_files(bucket, prefix):
            prev_state_path = "/opt/ml/processing/state_prev"
            inputs.append(
                ProcessingInput(
                    self.stateS3Uri,
                    prev_state_path,
                    "state_prev",
                    s3_data_distribution_type="FullyReplicated",
                ))

        # dependencies

        # append the internal dependencies
        dependencies.extend(self.internalDependencies)
        for dep in dependencies:
            dep = os.path.abspath(dep)
            basename = os.path.basename(dep)
            local_path = f"/opt/ml/processing/input/code/{basename}"
            inputs.append(
                ProcessingInput(
                    dep,
                    local_path,
                    "DEP_" + basename,
                    s3_data_distribution_type="FullyReplicated",
                ))

        # input data
        if self.inputS3Uri:
            data_path = "/opt/ml/processing/data"
            inputs.append(
                ProcessingInput(
                    self.inputS3Uri,
                    data_path,
                    "data",
                    s3_data_distribution_type=input_distribution,
                ))
            env["SM_CHANNEL_DATA"] = data_path

        tags["SimpleSagemakerTask"] = self.task_name
        tags["SimpleSagemakerVersion"] = VERSION
        tags = [{"Key": k, "Value": v} for k, v in tags.items()]

        additional_args = dict()
        if code:
            processor_class = ScriptProcessor
            additional_args["command"] = command
        else:
            assert (
                not command
            ), "`command` can only be used together with `code`; the plain `Processor` class takes an `entrypoint` instead"
            processor_class = Processor
            additional_args["entrypoint"] = entrypoint

        processor = processor_class(
            role=role_name,
            image_uri=self.image_uri,
            instance_count=instance_count,
            instance_type=instance_type,
            volume_size_in_gb=volume_size,
            max_runtime_in_seconds=max_run_mins * 60,
            sagemaker_session=self.smSession,
            tags=tags,
            env=env,
            **additional_args,
        )
        if code:
            processor.run(
                code=code,
                inputs=inputs,
                outputs=outputs,
                arguments=arguments,
                job_name=job_name,
            )
        else:
            processor.run(
                inputs=inputs,
                outputs=outputs,
                arguments=arguments,
                job_name=job_name,
            )

        processing_job_description = self.smSession.describe_processing_job(
            job_name)

        self.estimators.append(processor)
        self.jobNames.append(job_name)
        self.descriptions.append(processing_job_description)
        # print(processing_job_description)
        # if "Completed" != processing_job_description["ProcessingJobStatus"]:
        #    logger.error(
        #        f"Task failed with status: {processing_job_description['ProcessingJobStatus']}"
        #    )
        return job_name
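
# A hypothetical usage sketch for runProcessing (the `task` instance and all values
# below are assumptions for illustration, not part of the original code): run a
# script-based processing job with one extra local dependency folder.
#
# job_name = task.runProcessing(
#     code="process.py",                # local script, triggers the ScriptProcessor path
#     command=["python3"],
#     env={"MY_FLAG": "1"},
#     arguments=["--epochs", "3"],
#     instance_type="ml.m5.xlarge",
#     instance_count=1,
#     dependencies=["./my_lib"],        # mounted under /opt/ml/processing/input/code/my_lib
# )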
Example 5
def define_inference_pipeline(
    sm_role,
    workflow_execution_role,
    inference_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_inference_pipeline.yaml",
):
    """
    Return YAML definition of the training pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                     it returns an instance of `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:  If not None, a YAML file will be generated at this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "InputDataURL": str,
            "PreprocessingJobName": str,
            "InferenceJobName": str,
            "ProcModelS3": str,
            "PreprocessingCodeURL": str,
            "InferenceCodeURL": str,
            "ModelS3": str,
            "PreprocessedTrainDataURL": str,
            "PreprocessedTestDataURL": str,
            "OutputPathURL": str,
        })
    """
    Create Preprocessing Model from model artifact.
    """
    # sagemaker_session = sagemaker.Session()

    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
        ProcessingInput(
            source=execution_input["ProcModelS3"],
            destination="/opt/ml/processing/model",
            input_name="proc_model",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["PreprocessedTestDataURL"],
            output_name="test_data",
        ),
    ]

    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=sklearn_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=["--mode", "infer"],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
    )
    """
    Create inference with sklearn processing step.

    Inputs are the preprocessed data S3 URL, the inference code S3 URL, and
    the model S3 URL. Output is the inferred data.
    """
    sklearn_processor2 = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )
    inputs = [
        ProcessingInput(
            source=execution_input["PreprocessedTestDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["InferenceCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
        ProcessingInput(
            source=execution_input["ModelS3"],
            destination="/opt/ml/processing/model",
            input_name="model",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["OutputPathURL"],
            output_name="test_data",
        ),
    ]

    inference_step = ProcessingStep(
        "SageMaker inference step",
        processor=sklearn_processor2,
        job_name=execution_input["InferenceJobName"],
        inputs=inputs,
        outputs=outputs,
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/inference.py",
        ],
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )

    processing_step.add_catch(catch_state_processing)
    inference_step.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain([processing_step, inference_step])
    inference_pipeline = Workflow(
        name=inference_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return inference_pipeline
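
# A minimal usage sketch, assuming placeholder role ARNs and execution-input values
# (none of the names below come from the original code):
#
# pipeline = define_inference_pipeline(
#     sm_role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",
#     workflow_execution_role="arn:aws:iam::111122223333:role/StepFunctionsRole",
#     inference_pipeline_name="sagemaker-inference-pipeline",
# )
# pipeline.create()           # registers the Step Functions state machine
# pipeline.execute(inputs={   # keys must match the ExecutionInput schema above
#     "InputDataURL": "s3://my-bucket/input.csv",
#     "PreprocessingJobName": "preprocess-job-001",
#     "InferenceJobName": "inference-job-001",
#     "ProcModelS3": "s3://my-bucket/proc-model.tar.gz",
#     "PreprocessingCodeURL": "s3://my-bucket/code/preprocessing.py",
#     "InferenceCodeURL": "s3://my-bucket/code/inference.py",
#     "ModelS3": "s3://my-bucket/model.tar.gz",
#     "PreprocessedTrainDataURL": "s3://my-bucket/preprocessed/train",
#     "PreprocessedTestDataURL": "s3://my-bucket/preprocessed/test",
#     "OutputPathURL": "s3://my-bucket/output",
# })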
Example 6
def test_local_processing_script_processor(sagemaker_local_session,
                                           sklearn_image_uri):
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    script_processor = ScriptProcessor(
        role="SageMakerRole",
        image_uri=sklearn_image_uri,
        command=["python3"],
        instance_count=1,
        instance_type="local",
        volume_size_in_gb=30,
        volume_kms_key=None,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_local_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[
            ProcessingInput(
                source=input_file_path,
                destination="/opt/ml/processing/input/container/path/",
                input_name="dummy_input",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"

    assert job_description["ProcessingInputs"][1]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith(
        "test-script-processor")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingOutputConfig"]["Outputs"][0][
        "OutputName"] == "dummy_output"

    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceType"] == "local"
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "VolumeSizeInGB"] == 30

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == sklearn_image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }
    def getter(self, attr: str) -> Dict[str, Any]:
        data = {
            'tfrecord_processing': {
                'endpoint': ['python3', 'criteo_ads_data/run_processing.py'],
                'inputs': [
                    ProcessingInput(
                        source='s3://criteo-ads-data/prod/train_csv',
                        destination='/opt/ml/processing/input',
                        s3_data_distribution_type='ShardedByS3Key',
                    )
                ],
                'outputs': [
                    ProcessingOutput(
                        source='/opt/ml/processing/output',
                        destination=
                        's3://criteo-ads-data/prod/train_tfrecord_gz',
                    )
                ],
                'arguments': [
                    '--input_path=/opt/ml/processing/input',
                    '--output_path=/opt/ml/processing/output',
                ],
                'sm_config':
                SagemakerProcessingConfig(
                    project_name=self.project_name,
                    env=self.env,
                    region_name=self.region_name,
                    current_time=self.current_time,
                    sm_instance_type='ml.c5.2xlarge',
                    sm_instance_count=20,
                    sm_volumesize=100,
                    max_run=1 * 60 * 60,
                )
            },
            'layer_processing': {
                'endpoint':
                ['python3', 'criteo_ads_data/run_processing_layer.py'],
                'inputs': [
                    ProcessingInput(
                        source=
                        's3://criteo-ads-data/prod/train_tfrecord_gz/train',
                        destination='/opt/ml/processing/input',
                        s3_data_distribution_type='FullyReplicated',
                    )
                ],
                'outputs': [
                    ProcessingOutput(
                        source='/opt/ml/processing/output',
                        destination='s3://criteo-ads-data/prod/proc_layer',
                    )
                ],
                'arguments': [
                    '--input_path=/opt/ml/processing/input',
                    '--output_path=/opt/ml/processing/output',
                ],
                'sm_config':
                SagemakerProcessingConfig(
                    project_name=self.project_name,
                    env=self.env,
                    region_name=self.region_name,
                    current_time=self.current_time,
                    sm_instance_type='ml.c5.9xlarge',
                    sm_instance_count=1,
                    sm_volumesize=100,
                    max_run=24 * 60 * 60,
                )
            },
            'estimator': {
                'sm_input': {
                    'train':
                    TrainingInput(
                        s3_data=
                        's3://criteo-ads-data/prod/train_tfrecord_100000_gz/train',
                        distribution='FullyReplicated',
                    ),
                    'test':
                    TrainingInput(
                        s3_data=
                        's3://criteo-ads-data/prod/train_tfrecord_100000_gz/test',
                        distribution='FullyReplicated',
                    ),
                    'layer':
                    TrainingInput(
                        s3_data='s3://criteo-ads-data/prod/proc_layer_100000',
                        distribution='FullyReplicated',
                    ),
                },
                'shared_hyperparameters': {
                    'tf_logs_path': self.tf_logs_path,
                    'batch_size': 512,
                },
                'sm_config':
                SagemakerTrainingConfig(
                    project_name=self.project_name,
                    env=self.env,
                    region_name=self.region_name,
                    current_time=self.current_time,
                    sm_instance_type='ml.c5.2xlarge',
                    sm_instance_count=1,
                    sm_volumesize=300,
                    max_run=1 * 24 * 60 * 60,
                )
            },
            'hparam_tuning': {
                'objective_metric_name':
                'validation:loss',
                'metric_definitions': [
                    {
                        'Name': 'train:loss',
                        'Regex': '.*loss: ([0-9\\.]+) - auc: [0-9\\.]+.*'
                    },
                    {
                        'Name': 'train:auc',
                        'Regex': '.*loss: [0-9\\.]+ - auc: ([0-9\\.]+).*'
                    },
                    {
                        'Name':
                        'validation:loss',
                        'Regex':
                        '.*step - loss: [0-9\\.]+ - auc: [0-9\\.]+ - val_loss: ([0-9\\.]+) - val_auc: [0-9\\.]+.*'
                    },
                    {
                        'Name':
                        'validation:auc',
                        'Regex':
                        '.*step - loss: [0-9\\.]+ - auc: [0-9\\.]+ - val_loss: [0-9\\.]+ - val_auc: ([0-9\\.]+).*'
                    },
                ],
                'hyperparameter_ranges': {
                    'epochs': IntegerParameter(1, 50),
                    'batch_size': CategoricalParameter([64, 128, 256, 512])
                },
                'objective_type':
                'Minimize',
                'max_jobs':
                5,
                'max_parallel_jobs':
                5,
            },
        }

        return data.get(attr)
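
# A hypothetical consumption sketch for the config getter above (the surrounding
# class and variable names are assumptions for illustration):
#
# cfg = config.getter('tfrecord_processing')
# sm_cfg = cfg['sm_config']
# processor_inputs, processor_outputs = cfg['inputs'], cfg['outputs']
# entrypoint, arguments = cfg['endpoint'], cfg['arguments']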
def run_model_monitor_job_processor(
    region,
    instance_type,
    role,
    data_capture_path,
    statistics_path,
    constraints_path,
    reports_path,
    instance_count=1,
    preprocessor_path=None,
    postprocessor_path=None,
    publish_cloudwatch_metrics="Disabled",
):

    data_capture_sub_path = data_capture_path[data_capture_path.rfind("datacapture/"):]
    data_capture_sub_path = data_capture_sub_path[data_capture_sub_path.find("/") + 1:]
    processing_output_paths = reports_path + "/" + data_capture_sub_path

    input_1 = ProcessingInput(
        input_name="input_1",
        source=data_capture_path,
        destination="/opt/ml/processing/input/endpoint/" +
        data_capture_sub_path,
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    baseline = ProcessingInput(
        input_name="baseline",
        source=statistics_path,
        destination="/opt/ml/processing/baseline/stats",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    constraints = ProcessingInput(
        input_name="constraints",
        source=constraints_path,
        destination="/opt/ml/processing/baseline/constraints",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    outputs = ProcessingOutput(
        output_name="result",
        source="/opt/ml/processing/output",
        destination=processing_output_paths,
        s3_upload_mode="Continuous",
    )

    env = {
        "baseline_constraints":
        "/opt/ml/processing/baseline/constraints/" +
        get_file_name(constraints_path),
        "baseline_statistics":
        "/opt/ml/processing/baseline/stats/" + get_file_name(statistics_path),
        "dataset_format":
        '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
        "dataset_source":
        "/opt/ml/processing/input/endpoint",
        "output_path":
        "/opt/ml/processing/output",
        "publish_cloudwatch_metrics":
        publish_cloudwatch_metrics,
    }

    inputs = [input_1, baseline, constraints]

    if postprocessor_path:
        env["post_analytics_processor_script"] = "/opt/ml/processing/code/postprocessing/" + get_file_name(
            postprocessor_path)

        post_processor_script = ProcessingInput(
            input_name="post_processor_script",
            source=postprocessor_path,
            destination="/opt/ml/processing/code/postprocessing",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        )
        inputs.append(post_processor_script)

    if preprocessor_path:
        env["record_preprocessor_script"] = "/opt/ml/processing/code/preprocessing/" + get_file_name(
            preprocessor_path)

        pre_processor_script = ProcessingInput(
            input_name="pre_processor_script",
            source=preprocessor_path,
            destination="/opt/ml/processing/code/preprocessing",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        )

        inputs.append(pre_processor_script)

    processor = Processor(
        image_uri=get_model_monitor_container_uri(region),
        instance_count=instance_count,
        instance_type=instance_type,
        role=role,
        env=env,
    )

    return processor.run(inputs=inputs, outputs=[outputs])
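
# A hypothetical invocation sketch (every S3 path and the role ARN below are
# placeholders, not values from the original code):
#
# run_model_monitor_job_processor(
#     region="us-east-1",
#     instance_type="ml.m5.xlarge",
#     role="arn:aws:iam::111122223333:role/SageMakerRole",
#     data_capture_path="s3://my-bucket/endpoint/datacapture/2021/01/01/00",
#     statistics_path="s3://my-bucket/baseline/statistics.json",
#     constraints_path="s3://my-bucket/baseline/constraints.json",
#     reports_path="s3://my-bucket/monitoring-reports",
#     publish_cloudwatch_metrics="Disabled",
# )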
Example 9
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="sagemaker-group-insurance",
    pipeline_name="sagemaker-pipeline-insurance",
    base_job_prefix="sagemaker-featurestore-insurance",
):
    """Gets a SageMaker ML Pipeline instance working with on WIP data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="Approved"
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-insurance-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessInsuranceData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input_dataset_1", "41214", 
                       "--input_dataset_2", "41215",],
    )
    
    '''
    # feature store step
    feature_path = 's3://' + default_bucket+'/'+base_job_prefix + '/features'
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    feature_processor = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-insurance-feature-store",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_feature = ProcessingStep(
        name="FeatureStoreInsuranceData",
        processor=feature_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/training_input"),
        ],
        code=os.path.join(BASE_DIR, "feature_store.py"),
        job_arguments=["feature_s3_url", feature_path, "--feature_group_name", "sagemaker-featurestore-insurance"],
    )
    '''    

    # training step for generating model artifacts
    model_path = 's3://' + default_bucket+'/'+base_job_prefix + '/features'
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/insurance-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(objective="reg:tweedie", num_round=50)
    step_train = TrainingStep(
        name="TrainAbaloneModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-wip-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="WipEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateWipModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json"
        )
    )
    step_register = RegisterModel(
        name="register-insurance-model",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value"
        ),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSEWipEvaluation",
        conditions=[cond_lte],
        if_steps=[],
        else_steps=[step_register],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
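
# A minimal usage sketch (region and role ARN below are placeholders):
#
# pipeline = get_pipeline(region="us-east-1",
#                         role="arn:aws:iam::111122223333:role/SageMakerRole")
# pipeline.upsert(role_arn="arn:aws:iam::111122223333:role/SageMakerRole")  # create or update
# execution = pipeline.start()
# execution.wait()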
Example 10
    def _run(
        self,
        data_config,
        analysis_config,
        wait,
        logs,
        job_name,
        kms_key,
        experiment_config,
    ):
        """Runs a ProcessingJob with the Sagemaker Clarify container and an analysis config.

        Args:
            data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
            analysis_config (dict): Config following the analysis_config.json format.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when ``wait`` is True (default: True).
            job_name (str): Processing job name.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).
            experiment_config (dict[str, str]): Experiment management configuration.
                Dictionary contains three optional keys:
                'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
        """
        analysis_config["methods"]["report"] = {
            "name": "report",
            "title": "Analysis Report"
        }
        with tempfile.TemporaryDirectory() as tmpdirname:
            analysis_config_file = os.path.join(tmpdirname,
                                                "analysis_config.json")
            with open(analysis_config_file, "w") as f:
                json.dump(analysis_config, f)
            s3_analysis_config_file = _upload_analysis_config(
                analysis_config_file,
                data_config.s3_output_path,
                self.sagemaker_session,
                kms_key,
            )
            config_input = ProcessingInput(
                input_name="analysis_config",
                source=s3_analysis_config_file,
                destination=self._CLARIFY_CONFIG_INPUT,
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_compression_type="None",
            )
            data_input = ProcessingInput(
                input_name="dataset",
                source=data_config.s3_data_input_path,
                destination=self._CLARIFY_DATA_INPUT,
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type=data_config.
                s3_data_distribution_type,
                s3_compression_type=data_config.s3_compression_type,
            )
            result_output = ProcessingOutput(
                source=self._CLARIFY_OUTPUT,
                destination=data_config.s3_output_path,
                output_name="analysis_result",
                s3_upload_mode="EndOfJob",
            )
            super().run(
                inputs=[data_input, config_input],
                outputs=[result_output],
                wait=wait,
                logs=logs,
                job_name=job_name,
                kms_key=kms_key,
                experiment_config=experiment_config,
            )
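
# The internal `_run` above is driven by the public Clarify entry points such as
# `SageMakerClarifyProcessor.run_pre_training_bias`. A hedged usage sketch (bucket
# names, label, and facet below are placeholders):
#
# from sagemaker.clarify import SageMakerClarifyProcessor, DataConfig, BiasConfig
#
# clarify_processor = SageMakerClarifyProcessor(
#     role=role, instance_count=1, instance_type="ml.m5.xlarge",
#     sagemaker_session=sagemaker_session,
# )
# data_config = DataConfig(
#     s3_data_input_path="s3://my-bucket/train.csv",
#     s3_output_path="s3://my-bucket/clarify-output",
#     label="target",
#     dataset_type="text/csv",
# )
# bias_config = BiasConfig(label_values_or_threshold=[1], facet_name="gender")
# clarify_processor.run_pre_training_bias(data_config, bias_config, wait=True)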
Example 11
def test_tuning_multi_algos(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    script_dir,
    athena_dataset_definition,
):
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")

    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    property_file = PropertyFile(name="DataAttributes",
                                 output_name="attributes",
                                 path="attributes.json")

    step_process = ProcessingStep(
        name="my-process",
        display_name="ProcessingStep",
        description="description for Processing step",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data,
                            destination="/opt/ml/processing/input"),
            ProcessingInput(dataset_definition=athena_dataset_definition),
        ],
        outputs=[
            ProcessingOutput(output_name="train_data",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="attributes",
                             source="/opt/ml/processing/attributes.json"),
        ],
        property_files=[property_file],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    static_hp_1 = ParameterString(name="InstanceType",
                                  default_value="ml.m5.xlarge")
    json_get_hp = JsonGet(step_name=step_process.name,
                          property_file=property_file,
                          json_path="train_size")
    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
        hyperparameters={
            "static-hp": static_hp_1,
            "train_size": json_get_hp
        },
    )

    min_batch_size = ParameterString(name="MinBatchSize", default_value="64")
    max_batch_size = json_get_hp

    tuner = HyperparameterTuner.create(
        estimator_dict={
            "estimator-1": pytorch_estimator,
            "estimator-2": pytorch_estimator,
        },
        objective_metric_name_dict={
            "estimator-1": "test:acc",
            "estimator-2": "test:acc",
        },
        hyperparameter_ranges_dict={
            "estimator-1": {
                "batch-size": IntegerParameter(min_batch_size, max_batch_size)
            },
            "estimator-2": {
                "batch-size": IntegerParameter(min_batch_size, max_batch_size)
            },
        },
        metric_definitions_dict={
            "estimator-1": [{
                "Name": "test:acc",
                "Regex": "Overall test accuracy: (.*?);"
            }],
            "estimator-2": [{
                "Name": "test:acc",
                "Regex": "Overall test accuracy: (.*?);"
            }],
        },
    )

    inputs = {
        "estimator-1": TrainingInput(s3_data=input_path),
        "estimator-2": TrainingInput(s3_data=input_path),
    }

    step_tune = TuningStep(
        name="my-tuning-step",
        tuner=tuner,
        inputs=inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            instance_count, instance_type, min_batch_size, max_batch_size
        ],
        steps=[step_process, step_tune],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass

# The beginning of this snippet was truncated in the source; a plausible reconstruction
# of the processor definition (the class and framework version are assumptions) is:
processor = SKLearnProcessor(framework_version='0.20.0',
                             instance_count=1,
                             instance_type='local',
                             role=role)

print('Starting processing job.')
print(
    'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
)
processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(
                      source='./dependencies/',
                      destination='/opt/ml/processing/dependencies/'),
                  ProcessingInput(source='./input_data/',
                                  destination='/opt/ml/processing/input_data/')
              ],
              outputs=[
                  ProcessingOutput(output_name='tokenized_words_data',
                                   source='/opt/ml/processing/processed_data/')
              ],
              arguments=['job-type', 'word-tokenize'])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'tokenized_words_data':
        tokenized_words_data_file = output['S3Output']['S3Uri']

print('Output file is located on: {}'.format(tokenized_words_data_file))
Example 13
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=[
            "python3", "/opt/ml/processing/input/code/processing_code.py"
        ],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = [expected_args["inputs"][0]]

    sagemaker_session.process.assert_called_with(**expected_args)
Example 14
def test_sklearn_with_all_parameters(exists_mock, isfile_mock,
                                     botocore_resolver, sklearn_version,
                                     sagemaker_session):
    botocore_resolver.return_value.construct_endpoint.return_value = {
        "hostname": ECR_HOSTNAME
    }

    processor = SKLearnProcessor(
        role=ROLE,
        framework_version=sklearn_version,
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    ).format(sklearn_version)
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )

    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    if wait:
        execution.wait()

    return execution_arn
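# A follow-up sketch (an assumption, not part of the test above): given the
# execution ARN returned by the helper, the low-level SageMaker client can be
# used to poll the pipeline execution status directly.
import boto3

sm_client = boto3.client("sagemaker")
execution_arn = "arn:aws:sagemaker:..."  # placeholder for the returned ARN
response = sm_client.describe_pipeline_execution(PipelineExecutionArn=execution_arn)
print(response["PipelineExecutionStatus"])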
Example 16
    inputs = [
        ProcessingInput(
            source=input_data,
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=input_code,
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/train",
            destination="{}/{}".format(output_data, "train_data"),
            output_name="train_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination="{}/{}".format(output_data, "test_data"),
            output_name="test_data",
        ),
    ]

    processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=role.role_arn,
        instance_type="ml.m5.xlarge",
        instance_count=1,
    )
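    # The snippet above stops after constructing the processor; this run call is
    # a hedged sketch of how it would typically be invoked. The script name and
    # arguments are hypothetical placeholders, not taken from the original example.
    processor.run(
        code="preprocessing.py",  # hypothetical local script
        inputs=inputs,
        outputs=outputs,
        arguments=["--train-test-split-ratio", "0.2"],  # illustrative only
    )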
def test_workflow_with_clarify(
    data_config,
    data_bias_config,
    model_config,
    model_predicted_label_config,
    pipeline_name,
    role,
    sagemaker_session,
):

    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)

    analysis_config = data_config.get_config()
    analysis_config.update(data_bias_config.get_config())
    (
        probability_threshold,
        predictor_config,
    ) = model_predicted_label_config.get_predictor_config()
    predictor_config.update(model_config.get_predictor_config())
    analysis_config["methods"] = {"post_training_bias": {"methods": "all"}}
    analysis_config["predictor"] = predictor_config
    analysis_config["probability_threshold"] = probability_threshold
    analysis_config["methods"]["report"] = {
        "name": "report",
        "title": "Analysis Report"
    }

    with tempfile.TemporaryDirectory() as tmpdirname:
        analysis_config_file = os.path.join(tmpdirname, "analysis_config.json")
        with open(analysis_config_file, "w") as f:
            json.dump(analysis_config, f)
        config_input = ProcessingInput(
            input_name="analysis_config",
            source=analysis_config_file,
            destination="/opt/ml/processing/input/config",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
            s3_compression_type="None",
        )

        data_input = ProcessingInput(
            input_name="dataset",
            source=data_config.s3_data_input_path,
            destination="/opt/ml/processing/input/data",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
            s3_data_distribution_type=data_config.s3_data_distribution_type,
            s3_compression_type=data_config.s3_compression_type,
        )

        result_output = ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=data_config.s3_output_path,
            output_name="analysis_result",
            s3_upload_mode="EndOfJob",
        )

        processor = SageMakerClarifyProcessor(
            role="SageMakerRole",
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
        )

        property_file = PropertyFile(
            name="BiasOutput",
            output_name="analysis_result",
            path="analysis.json",
        )

        step_process = ProcessingStep(
            name="my-process",
            processor=processor,
            inputs=[data_input, config_input],
            outputs=[result_output],
            property_files=[property_file],
        )

        cond_left = JsonGet(
            step=step_process,
            property_file="BiasOutput",
            json_path="post_training_bias_metrics.facets.F1[0].metrics[0].value",
        )

        step_condition = ConditionStep(
            name="bias-condition",
            conditions=[ConditionLessThanOrEqualTo(left=cond_left, right=1)],
            if_steps=[],
            else_steps=[],
        )

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_type, instance_count],
            steps=[step_process, step_condition],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            execution = pipeline.start(parameters={})

            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 2
            assert execution_steps[1]["StepName"] == "my-process"
            assert execution_steps[1]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["StepName"] == "bias-condition"
            assert execution_steps[0]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["Metadata"]["Condition"]["Outcome"] == "True"

        finally:
            try:
                pipeline.delete()
            except Exception:
                pass
def run_model_monitor_job_processor(region, instance_type, role,
                                    data_capture_path, preprocessor_path,
                                    postprocessor_path, statistics_path,
                                    constraints_path, reports_path):

    data_capture_sub_path = data_capture_path[data_capture_path.rfind('datacapture/'):]
    data_capture_sub_path = data_capture_sub_path[data_capture_sub_path.find('/') + 1:]
    processing_output_paths = reports_path + '/' + data_capture_sub_path

    input_1 = ProcessingInput(
        input_name='input_1',
        source=data_capture_path,
        destination='/opt/ml/processing/input/endpoint/' +
        data_capture_sub_path,
        s3_data_type='S3Prefix',
        s3_input_mode='File')

    baseline = ProcessingInput(input_name='baseline',
                               source=statistics_path,
                               destination='/opt/ml/processing/baseline/stats',
                               s3_data_type='S3Prefix',
                               s3_input_mode='File')

    constraints = ProcessingInput(
        input_name='constraints',
        source=constraints_path,
        destination='/opt/ml/processing/baseline/constraints',
        s3_data_type='S3Prefix',
        s3_input_mode='File')

    post_processor_script = ProcessingInput(
        input_name='post_processor_script',
        source=postprocessor_path,
        destination='/opt/ml/processing/code/postprocessing',
        s3_data_type='S3Prefix',
        s3_input_mode='File')

    pre_processor_script = ProcessingInput(
        input_name='pre_processor_script',
        source=preprocessor_path,
        destination='/opt/ml/processing/code/preprocessing',
        s3_data_type='S3Prefix',
        s3_input_mode='File')

    outputs = ProcessingOutput(output_name='result',
                               source='/opt/ml/processing/output',
                               destination=processing_output_paths,
                               s3_upload_mode='Continuous')

    processor = Processor(
        image_uri=get_model_monitor_container_uri(region),
        instance_count=1,
        instance_type=instance_type,
        role=role,
        env={
            'baseline_constraints':
            '/opt/ml/processing/baseline/constraints/constraints.json',
            'baseline_statistics':
            '/opt/ml/processing/baseline/stats/statistics.json',
            'dataset_format':
            '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
            'dataset_source':
            '/opt/ml/processing/input/endpoint',
            'output_path':
            '/opt/ml/processing/output',
            'post_analytics_processor_script':
            '/opt/ml/processing/code/postprocessing/postprocessor.py',
            'publish_cloudwatch_metrics':
            'Disabled',
            'record_preprocessor_script':
            '/opt/ml/processing/code/preprocessing/preprocessor.py'
        })

    return processor.run(inputs=[
        input_1, baseline, constraints, post_processor_script,
        pre_processor_script
    ],
                         outputs=[outputs])
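# A hedged usage sketch for the helper above; every S3 path, the role ARN, and
# the instance type below are hypothetical placeholders, not values taken from
# the original code.
run_model_monitor_job_processor(
    region='us-west-2',
    instance_type='ml.m5.xlarge',
    role='arn:aws:iam::111111111111:role/my-sagemaker-role',
    data_capture_path='s3://my-bucket/datacapture/my-endpoint/AllTraffic',
    preprocessor_path='s3://my-bucket/code/preprocessor.py',
    postprocessor_path='s3://my-bucket/code/postprocessor.py',
    statistics_path='s3://my-bucket/baseline/statistics.json',
    constraints_path='s3://my-bucket/baseline/constraints.json',
    reports_path='s3://my-bucket/reports',
)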
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-delta-sharing-processing-local',
    role=role,
    instance_count=1,
    instance_type='local')

processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(source='./profile/',
                                  destination='/opt/ml/processing/profile/')
              ],
              outputs=[
                  ProcessingOutput(output_name='delta_lake_processed_data',
                                   source='/opt/ml/processing/processed_data/')
              ])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'delta_lake_processed_data':
        delta_lake_processed_data_file = output['S3Output']['S3Uri']
        bucket = delta_lake_processed_data_file.split("/")[2]
        output_file_name = '/'.join(
            delta_lake_processed_data_file.split("/")[3:]
        ) + "/total_cases_per_location.csv"
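
# A minimal follow-up sketch (an assumption, not in the original snippet): with
# the bucket and key parsed above, boto3 can download the processed CSV locally.
import boto3

s3_client = boto3.client('s3')
s3_client.download_file(bucket, output_file_name, 'total_cases_per_location.csv')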
def test_processing_step_with_placeholders(sklearn_processor_fixture,
                                           sagemaker_session, sfn_client,
                                           sfn_role_arn, sagemaker_role_arn):
    region = boto3.session.Session().region_name
    input_data = f"s3://sagemaker-sample-data-{region}/processing/census/census-income.csv"

    input_s3 = sagemaker_session.upload_data(
        path=os.path.join(DATA_DIR, 'sklearn_processing'),
        bucket=sagemaker_session.default_bucket(),
        key_prefix='integ-test-data/sklearn_processing/code')

    output_s3 = f"s3://{sagemaker_session.default_bucket()}/integ-test-data/sklearn_processing"

    inputs = [
        ProcessingInput(source=input_data,
                        destination='/opt/ml/processing/input',
                        input_name='input-1'),
        ProcessingInput(source=input_s3 + '/preprocessor.py',
                        destination='/opt/ml/processing/input/code',
                        input_name='code'),
    ]

    outputs = [
        ProcessingOutput(source='/opt/ml/processing/train',
                         destination=output_s3 + '/train_data',
                         output_name='train_data'),
        ProcessingOutput(source='/opt/ml/processing/test',
                         destination=output_s3 + '/test_data',
                         output_name='test_data'),
    ]

    # Build workflow definition
    execution_input = ExecutionInput(
        schema={
            'image_uri': str,
            'instance_count': int,
            'entrypoint': str,
            'role': str,
            'volume_size_in_gb': int,
            'max_runtime_in_seconds': int,
            'container_arguments': [str],
        })

    parameters = {
        'AppSpecification': {
            'ContainerEntrypoint': execution_input['entrypoint'],
            'ImageUri': execution_input['image_uri']
        },
        'ProcessingResources': {
            'ClusterConfig': {
                'InstanceCount': execution_input['instance_count'],
                'VolumeSizeInGB': execution_input['volume_size_in_gb']
            }
        },
        'RoleArn': execution_input['role'],
        'StoppingCondition': {
            'MaxRuntimeInSeconds': execution_input['max_runtime_in_seconds']
        }
    }

    job_name = generate_job_name()
    processing_step = ProcessingStep(
        'create_processing_job_step',
        processor=sklearn_processor_fixture,
        job_name=job_name,
        inputs=inputs,
        outputs=outputs,
        container_arguments=execution_input['container_arguments'],
        container_entrypoint=execution_input['entrypoint'],
        parameters=parameters)
    processing_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
    workflow_graph = Chain([processing_step])

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        workflow = create_workflow_and_check_definition(
            workflow_graph=workflow_graph,
            workflow_name=unique_name_from_base(
                "integ-test-processing-step-workflow"),
            sfn_client=sfn_client,
            sfn_role_arn=sfn_role_arn)

        execution_input = {
            'image_uri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3',
            'instance_count': 1,
            'entrypoint': ['python3', '/opt/ml/processing/input/code/preprocessor.py'],
            'role': sagemaker_role_arn,
            'volume_size_in_gb': 30,
            'max_runtime_in_seconds': 500,
            'container_arguments': ['--train-test-split-ratio', '0.2']
        }

        # Execute workflow
        execution = workflow.execute(inputs=execution_input)
        execution_output = execution.get_output(wait=True)

        # Check workflow output
        assert execution_output.get("ProcessingJobStatus") == "Completed"

        # Cleanup
        state_machine_delete_wait(sfn_client, workflow.state_machine_arn)