Example #1
def parseIOAndAllowAccess(args, env, sm_project):
    input_data_path = None
    distribution = "FullyReplicated"
    if args.input_path:
        input_data_path, distribution, subdir = args.input_path[0]
        if input_data_path.lower().startswith("s3://"):
            input_data_path = sagemaker.s3.s3_path_join(
                input_data_path, subdir)
        else:
            input_data_path = os.path.join(input_data_path, subdir)

    inputs = list()
    if args.input_task:
        for (input_name, task_name, ttype, distribution,
             subdir) in args.input_task:
            s3_uri = sm_project.getInputConfig(
                task_name,
                ttype,
                distribution=distribution,
                subdir=subdir,
                return_s3uri=True,
            )
            inputs.append(
                ProcessingInput(
                    s3_uri,
                    f"/opt/ml/processing/input/data/{input_name}",
                    input_name,
                    s3_data_distribution_type=distribution,
                ))
            env[f"SM_CHANNEL_{input_name.upper()}"] = f"/opt/ml/processing/input/data/{input_name}"
    if args.input_s3:
        for (input_name, s3_uri, distribution, subdir) in args.input_s3:
            s3_uri = sagemaker.s3.s3_path_join(s3_uri, subdir)
            bucket, _ = sagemaker.s3.parse_s3_url(s3_uri)
            sm_project.allowAccessToS3Bucket(bucket)
            inputs.append(
                ProcessingInput(
                    s3_uri,
                    f"/opt/ml/processing/processing/input/data/{input_name}",
                    input_name,
                    s3_data_distribution_type=distribution,
                ))
            env[f"SM_CHANNEL_{input_name.upper()}"] = f"/opt/ml/processing/input/data/{input_name}"

    outputs = list()
    # TBD: support outputs

    return input_data_path, distribution, inputs, outputs
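A minimal sketch of how the values returned above might be wired into a processing job. The role, image URI and entry-point script are placeholders, and `args`, `env` and `sm_project` are assumed to exist as in the function:

from sagemaker.processing import ScriptProcessor

input_data_path, distribution, inputs, outputs = parseIOAndAllowAccess(args, env, sm_project)

processor = ScriptProcessor(
    role="arn:aws:iam::111122223333:role/ExampleRole",  # placeholder role
    image_uri="111122223333.dkr.ecr.us-west-2.amazonaws.com/example:latest",  # placeholder image
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    env=env,  # carries the SM_CHANNEL_* variables set above
)
processor.run(
    code="process.py",  # placeholder entry point
    inputs=inputs,
    outputs=outputs,
)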
Example #2
def test_processing_step_normalizes_args_with_no_code(mock_normalize_args,
                                                      script_processor):
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    outputs = [
        ProcessingOutput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=script_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=["arg1", "arg2"],
        cache_config=cache_config,
    )
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()
    mock_normalize_args.assert_called_with(
        job_name=None,
        arguments=step.job_arguments,
        inputs=step.inputs,
        outputs=step.outputs,
        code=None,
    )
Example #3
def processing_input(bucket):
    return [
        ProcessingInput(
            source=f"s3://{bucket}/processing_manifest",
            destination="processing_manifest",
        )
    ]
Example #4
def test_sklearn_with_network_config(sagemaker_session, sklearn_full_version,
                                     cpu_instance_type):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn-with-network-config",
        network_config=NetworkConfig(enable_network_isolation=True,
                                     encrypt_inter_container_traffic=True),
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path,
                            destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()
    network_config = job_description["NetworkConfig"]
    assert network_config["EnableInterContainerTrafficEncryption"]
    assert network_config["EnableNetworkIsolation"]
Example #5
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=[
            "python3", "/opt/ml/processing/input/code/processing_code.py"
        ],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = [expected_args["inputs"][0]]

    sagemaker_session.process.assert_called_with(**expected_args)
Example #6
def make_processing_input(mount, name, source, s3, mode=None):
    destination = "{}/{}".format(mount, name)
    if mode:
        assert mode in ['File', 'Pipe']
    processing_input = ProcessingInput(
        source=source,
        destination=destination,
        input_name=name,
        # s3_data_type='S3Prefix',
        s3_input_mode=mode or 'File',
        # s3_data_distribution_type='FullyReplicated',
        # s3_compression_type='None'
    )
    path = get_local_path(source)
    if path:
        if not os.path.exists(path):
            raise ValueError(
                "Local path [{}]: [{}] does not exist".format(name, source))
        if os.path.isfile(path):
            basename = os.path.basename(path)
            path_argument = "{}/{}".format(destination, basename)
        elif os.path.isdir(path):
            path_argument = destination
        else:
            raise ValueError(
                "Local path [{}] is neither file nor directory".format(source))
    else:
        if is_s3_file(source, s3=s3):
            basename = os.path.basename(source)
            path_argument = "{}/{}".format(destination, basename)
        else:
            path_argument = destination
    return processing_input, path_argument
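A hedged usage sketch of the helper above; the S3 prefix is a placeholder, `get_local_path`/`is_s3_file` are assumed to be defined elsewhere in the same module, and `s3` is taken to be a boto3 S3 client:

import boto3

s3 = boto3.client("s3")  # assumed client consumed by is_s3_file

train_input, train_path = make_processing_input(
    "/opt/ml/processing/input", "train", "s3://example-bucket/train/", s3)

inputs = [train_input]                               # goes to processor.run(inputs=...)
container_arguments = ["--train-path", train_path]   # path the script reads inside the container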
Example #7
    def _generate_baseline_job_inputs(self):
        """Generates a dict with ProcessingInput objects

        Generates a dict with three ProcessingInput objects: baseline_dataset_input,
            post_processor_script_input and record_preprocessor_script_input

        Returns:
            dict: with three ProcessingInput objects as baseline job inputs
        """
        baseline_dataset = self.quality_check_config.baseline_dataset
        baseline_dataset_des = str(
            pathlib.PurePosixPath(
                _CONTAINER_BASE_PATH, _CONTAINER_INPUT_PATH, _BASELINE_DATASET_INPUT_NAME
            )
        )
        if isinstance(baseline_dataset, (ExecutionVariable, Expression, Parameter, Properties)):
            baseline_dataset_input = ProcessingInput(
                source=self.quality_check_config.baseline_dataset,
                destination=baseline_dataset_des,
                input_name=_BASELINE_DATASET_INPUT_NAME,
            )
        else:
            baseline_dataset_input = self._model_monitor._upload_and_convert_to_processing_input(
                source=self.quality_check_config.baseline_dataset,
                destination=baseline_dataset_des,
                name=_BASELINE_DATASET_INPUT_NAME,
            )

        post_processor_script_input = self._model_monitor._upload_and_convert_to_processing_input(
            source=self.quality_check_config.post_analytics_processor_script,
            destination=str(
                pathlib.PurePosixPath(
                    _CONTAINER_BASE_PATH,
                    _CONTAINER_INPUT_PATH,
                    _POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
                )
            ),
            name=_POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
        )

        record_preprocessor_script_input = None
        if isinstance(self.quality_check_config, DataQualityCheckConfig):
            record_preprocessor_script_input = (
                self._model_monitor._upload_and_convert_to_processing_input(
                    source=self.quality_check_config.record_preprocessor_script,
                    destination=str(
                        pathlib.PurePosixPath(
                            _CONTAINER_BASE_PATH,
                            _CONTAINER_INPUT_PATH,
                            _RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
                        )
                    ),
                    name=_RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
                )
            )
        return dict(
            baseline_dataset_input=baseline_dataset_input,
            post_processor_script_input=post_processor_script_input,
            record_preprocessor_script_input=record_preprocessor_script_input,
        )
Example #8
def test_processing_step(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
Example #9
    def _stage_configuration(self, configuration):
        """Serializes and uploads the user-provided EMR application configuration to S3.

        This method prepares an input channel.

        Args:
            configuration (Dict): the configuration dict for the EMR application configuration.
        """

        serialized_configuration = BytesIO(
            json.dumps(configuration).encode("utf-8"))
        s3_uri = (
            f"s3://{self.sagemaker_session.default_bucket()}/{self._current_job_name}/"
            f"input/{self._conf_container_input_name}/{self._conf_file_name}")

        S3Uploader.upload_string_as_file_body(
            body=serialized_configuration,
            desired_s3_uri=s3_uri,
            sagemaker_session=self.sagemaker_session,
        )

        conf_input = ProcessingInput(
            source=s3_uri,
            destination=
            f"{self._conf_container_base_path}{self._conf_container_input_name}",
            input_name=_SparkProcessorBase._conf_container_input_name,
        )
        return conf_input
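The `configuration` argument is the EMR-style application configuration that gets serialized and staged to S3; a minimal sketch of a plausible value (the classification and property shown are illustrative only):

configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {"spark.executor.memory": "2g"},
    }
]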
Example #10
def create_prepro_step(params, pre_processor, execution_input):
    prepro_input_data = params['prep-input-path']
    prepro_output_data = params['prep-output-path']
    input_dir = '/opt/ml/processing/input'
    output_dir = '/opt/ml/processing/output'

    prepro_inputs = [
        ProcessingInput(
            source=prepro_input_data,
            destination=input_dir,
            input_name="input-data"
        )
    ]

    prepro_outputs = [
        ProcessingOutput(
            source=output_dir,
            destination=prepro_output_data,
            output_name="processed-data",
        )
    ]

    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=pre_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=prepro_inputs,
        outputs=prepro_outputs,
        container_arguments=["--input-dir", input_dir,
                             "--output-dir", output_dir]
    )
    return processing_step
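A minimal sketch of the `params` and `execution_input` arguments this helper expects, assuming the AWS Step Functions Data Science SDK; the S3 paths are placeholders and `pre_processor` is a SageMaker processor built elsewhere:

from stepfunctions.inputs import ExecutionInput

execution_input = ExecutionInput(schema={"PreprocessingJobName": str})
params = {
    "prep-input-path": "s3://example-bucket/raw/",          # placeholder
    "prep-output-path": "s3://example-bucket/processed/",   # placeholder
}

processing_step = create_prepro_step(params, pre_processor, execution_input)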
Example #11
def create_evaluation_step(params, model_evaluation_processor,
                           execution_input, job_name, train_job_name):
    evaluation_output_destination = os.path.join(
        params['eval-result-path'], job_name)
    prepro_input_data = params['prep-input-path']
    trained_model_data = os.path.join(params['train-output-path'],
                                      train_job_name, 'output/model.tar.gz')
    model_dir = '/opt/ml/processing/model'
    data_dir = '/opt/ml/processing/test'
    output_dir = '/opt/ml/processing/evaluation'

    inputs_evaluation = [
        # data path for model evaluation
        ProcessingInput(
            source=prepro_input_data,
            destination=data_dir,
            input_name="data-dir",
        ),
        # model path
        ProcessingInput(
            source=trained_model_data,
            destination=model_dir,
            input_name="model-dir",
        ),
    ]

    outputs_evaluation = [
        ProcessingOutput(
            source=output_dir,
            destination=evaluation_output_destination,
            output_name="output-dir",
        ),
    ]

    evaluation_step = ProcessingStep(
        "SageMaker Evaluation step",
        processor=model_evaluation_processor,
        job_name=execution_input["EvaluationJobName"],
        inputs=inputs_evaluation,
        outputs=outputs_evaluation,
        container_arguments=["--data-dir", data_dir, "--model-dir", model_dir,
                             "--output-dir", output_dir, 
                             "--experiment-name", params['experiment-name'],
                             "--mlflow-server", params['mlflow-server-uri']]
    )

    return evaluation_step
Example #12
def test_byo_container_with_baked_in_script(sagemaker_session):
    custom_processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    custom_processor.run(
        inputs=[
            ProcessingInput(source="/local/path/to/my/sklearn_transformer",
                            destination="/code/")
        ],
        arguments=["CensusTract", "County"],
    )

    expected_args = {
        "inputs": [{
            "InputName": "input-1",
            "S3Input": {
                "S3Uri": "mocked_s3_uri_from_upload_data",
                "LocalPath": "/code/",
                "S3DataType": "S3Prefix",
                "S3InputMode": "File",
                "S3DataDistributionType": "FullyReplicated",
                "S3CompressionType": "None",
            },
        }],
        "output_config": {
            "Outputs": []
        },
        "job_name":
        custom_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition":
        None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerArguments": ["CensusTract", "County"],
        },
        "environment":
        None,
        "network_config":
        None,
        "role_arn":
        ROLE,
        "tags":
        None,
        "experiment_config":
        None,
    }
    sagemaker_session.process.assert_called_with(**expected_args)
Example #13
def test_script_processor_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                app_managed=True,
                feature_store_output=FeatureStoreOutput(feature_group_name="Foo"),
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)

    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
Example #14
def test_processing_step_normalizes_args(mock_normalize_args,
                                         sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=
        "012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri",
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    outputs = [
        ProcessingOutput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        code="foo.py",
        inputs=inputs,
        outputs=outputs,
        job_arguments=["arg1", "arg2"],
        cache_config=cache_config,
    )
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()
    mock_normalize_args.assert_called_with(
        arguments=step.job_arguments,
        inputs=step.inputs,
        outputs=step.outputs,
        code=step.code,
    )
Example #15
def test_sklearn_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = SKLearnProcessor(
        role=ROLE,
        framework_version="0.20.0",
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3"
    )
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
Example #16
    def _get_recipe_input(self):
        """Creates a ProcessingInput with Data Wrangler recipe uri and appends it to inputs"""
        return ProcessingInput(
            source=self.data_wrangler_flow_source,
            destination="/opt/ml/processing/flow",
            input_name="flow",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
            s3_data_distribution_type="FullyReplicated",
        )
Example #17
def test_add_depends_on(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri",
        default_value=f"s3://{BUCKET}/processing_manifest")
    instance_type_parameter = ParameterString(name="InstanceType",
                                              default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount",
                                                default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")

    step_1 = ProcessingStep(
        name="MyProcessingStep-1",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_2 = ProcessingStep(
        name="MyProcessingStep-2",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )

    step_3 = ProcessingStep(
        name="MyProcessingStep-3",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3.add_depends_on([step_2.name])

    assert "DependsOn" not in step_1.to_request()
    assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert step_3.to_request()["DependsOn"] == [
        "MyProcessingStep-1", "MyProcessingStep-2"
    ]
Example #18
def test_processing_step(sagemaker_session):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.m4.4xlarge",
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {
                "ImageUri": "fakeimage"
            },
            "ProcessingInputs": [{
                "InputName": "input-1",
                "AppManaged": False,
                "S3Input": {
                    "LocalPath": "processing_manifest",
                    "S3CompressionType": "None",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3Uri": "s3://my-bucket/processing_manifest",
                },
            }],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m4.4xlarge",
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn":
            "DummyRole",
        },
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
Example #19
def test_script_processor_with_one_input(exists_mock, isfile_mock, sagemaker_session):
    processor = _get_script_processor(sagemaker_session)
    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(source="/local/path/to/my/dataset/census.csv", destination="/data/")
        ],
    )

    expected_args = _get_expected_args(processor._current_job_name)
    expected_args["inputs"].insert(0, _get_data_input())

    sagemaker_session.process.assert_called_with(**expected_args)
Example #20
def create_s3_processing_input(s3_dataset_definition, name, base_dir):
    """Create an S3 processing input for a DW job

    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        source=s3_dataset_definition['s3ExecutionContext']['s3Uri'],
        destination=f"{base_dir}/{name}",
        input_name=name,
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
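A sketch of the `s3_dataset_definition` shape this helper reads from a Data Wrangler flow file; the URI is a placeholder and only the key actually accessed above is shown:

s3_dataset_definition = {
    "s3ExecutionContext": {"s3Uri": "s3://example-bucket/data/input.csv"}
}

data_input = create_s3_processing_input(
    s3_dataset_definition, "input.csv", "/opt/ml/processing")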
Example #21
def create_flow_notebook_processing_input(base_dir, flow_s3_uri):
    """Create the flow file processing input for a DW job

    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        source=flow_s3_uri,
        destination=f"{base_dir}/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
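A hedged sketch of how the two helpers above might be combined into a Data Wrangler processing job; the role, region and S3 URIs are placeholders, while the `image_uris.retrieve("data-wrangler", ...)` lookup mirrors the pipeline example later on this page:

from sagemaker import image_uris
from sagemaker.processing import Processor

base_dir = "/opt/ml/processing"
flow_input = create_flow_notebook_processing_input(
    base_dir, "s3://example-bucket/flows/example.flow")
data_input = create_s3_processing_input(
    {"s3ExecutionContext": {"s3Uri": "s3://example-bucket/data/input.csv"}},
    "input.csv",
    base_dir,
)

processor = Processor(
    role="arn:aws:iam::111122223333:role/ExampleRole",  # placeholder role
    image_uri=image_uris.retrieve("data-wrangler", region="us-west-2"),
    instance_count=1,
    instance_type="ml.m5.4xlarge",
)
processor.run(inputs=[flow_input, data_input], wait=False)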
Example #22
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    logging.getLogger().setLevel(logging.DEBUG)  # TODO-reinvent-2019: REMOVE

    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=3600,  # TODO-reinvent-2019: REMOVE
        base_job_name="test-sklearn",
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path,
                            destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.m4.xlarge",
            "VolumeSizeInGB": 30
        }
    }
    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["RoleArn"] == ROLE
Example #23
def create_athena_processing_input(athena_dataset_definition, name, base_dir):
    """Create an Athena processing input for a DW job

    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        input_name=name,
        dataset_definition=DatasetDefinition(
            local_path=f"{base_dir}/{name}",
            data_distribution_type="FullyReplicated",
            athena_dataset_definition=AthenaDatasetDefinition(
                catalog=athena_dataset_definition["catalogName"],
                database=athena_dataset_definition["databaseName"],
                query_string=athena_dataset_definition["queryString"],
                output_s3_uri=athena_dataset_definition["s3OutputLocation"] + f"{name}/",
                output_format=athena_dataset_definition["outputFormat"].upper(),
            ),
        ),
    )
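For reference, a sketch of the `athena_dataset_definition` dict the helper expects; the values are placeholders, the keys match those read above:

athena_dataset_definition = {
    "catalogName": "AwsDataCatalog",
    "databaseName": "example_db",
    "queryString": "SELECT * FROM example_table",
    "s3OutputLocation": "s3://example-bucket/athena/output/",
    "outputFormat": "PARQUET",
}

athena_input = create_athena_processing_input(
    athena_dataset_definition, "example_table", "/opt/ml/processing")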
Example #24
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path,
                            destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceCount"] == 1
    assert (job_description["ProcessingResources"]["ClusterConfig"]
            ["InstanceType"] == cpu_instance_type)
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "VolumeSizeInGB"] == 30
    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 86400
    }
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert ROLE in job_description["RoleArn"]
Example #25
    def _patch_inputs_with_payload(self, inputs, s3_payload) -> List[ProcessingInput]:
        # ScriptProcessor job will download only s3://..../code/runproc.sh, hence we need to also
        # inject our s3://.../sourcedir.tar.gz.
        #
        # We'll follow the exact same mechanism that ScriptProcessor does, which is to inject the
        # S3 code artifact as a processing input with destination /opt/ml/processing/input/code/payload/.
        #
        # Unfortunately, as much as I'd like to put sourcedir.tar.gz to /opt/ml/processing/input/code/,
        # this cannot be done as this destination is already used by the ScriptProcessor for runproc.sh,
        # and the SDK does not allow another input with the same destination.
        # - Note that the parameterized form of this path is available as ScriptProcessor._CODE_CONTAINER_BASE_PATH
        #   and ScriptProcessor._CODE_CONTAINER_INPUT_NAME.
        # - See: https://github.com/aws/sagemaker-python-sdk/blob/a7399455f5386d83ddc5cb15c0db00c04bd518ec/src/sagemaker/processing.py#L425-L426
        if inputs is None:
            inputs = []
        inputs.append(
            ProcessingInput(
                source=s3_payload,
                destination="/opt/ml/processing/input/code/payload/"))
        return inputs
Example #26
def create_redshift_processing_input(redshift_dataset_definition, name, base_dir):
    """Create a Redshift processing input for a DW job

    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        input_name=name,
        dataset_definition=DatasetDefinition(
            local_path=f"{base_dir}/{name}",
            data_distribution_type="FullyReplicated",
            redshift_dataset_definition=RedshiftDatasetDefinition(
                cluster_id=redshift_dataset_definition["clusterIdentifier"],
                database=redshift_dataset_definition["database"],
                db_user=redshift_dataset_definition["dbUser"],
                query_string=redshift_dataset_definition["queryString"],
                cluster_role_arn=redshift_dataset_definition["unloadIamRole"],
                output_s3_uri=redshift_dataset_definition["s3OutputLocation"] + f"{name}/",
                output_format=redshift_dataset_definition["outputFormat"].upper(),
            ),
        ),
    )
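Likewise, a sketch of the `redshift_dataset_definition` dict this helper expects; the values are placeholders, the keys match those read above:

redshift_dataset_definition = {
    "clusterIdentifier": "example-cluster",
    "database": "dev",
    "dbUser": "awsuser",
    "queryString": "SELECT * FROM example_table",
    "unloadIamRole": "arn:aws:iam::111122223333:role/RedshiftUnloadRole",  # placeholder
    "s3OutputLocation": "s3://example-bucket/redshift/output/",
    "outputFormat": "CSV",
}

redshift_input = create_redshift_processing_input(
    redshift_dataset_definition, "example_table", "/opt/ml/processing")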
Example #27
def test_data_wrangler_processor_with_mock_input(sagemaker_session):
    processor = DataWranglerProcessor(
        role=ROLE,
        data_wrangler_flow_source=DATA_WRANGLER_RECIPE_SOURCE,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    mock_input = ProcessingInput(
        source=MOCK_S3_URI,
        destination="/opt/ml/processing/mock_input",
        input_name="mock_input",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
    processor.run(inputs=[mock_input])
    expected_args = _get_expected_args(processor._current_job_name,
                                       add_mock_input=True)
    sagemaker_session.process.assert_called_with(**expected_args)
Example #28
def test_local_processing_sklearn(sagemaker_local_session_no_local_code,
                                  sklearn_latest_version):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role="SageMakerRole",
        instance_type="local",
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_local_session_no_local_code,
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path,
                            destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceType"] == "local"
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["RoleArn"] == "<no_role>"
Example #29
def test_preprocessing_script_in_local_container(processor):
    code_path = "../../src/mlmax/preprocessing.py"
    execution_mode = "tr ain"  # Configure to either 'train', or 'infer'
    input_data_path = "input/census-income-sample.csv"
    local_data_path = "opt/ml/processing/input"

    processor.run(
        code=code_path,
        inputs=[
            ProcessingInput(source=local_data_path,
                            destination="/opt/ml/processing/input")
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/train",
                output_name="train_data",
            ),
            ProcessingOutput(
                source="/opt/ml/processing/test",
                output_name="test_data",
            ),
            ProcessingOutput(
                source="/opt/ml/processing/model",
                output_name="proc_model",
            ),
        ],
        arguments=[
            "--train-test-split-ratio",
            "0.2",
            "--mode",
            execution_mode,
            "--data-input",
            input_data_path,
        ],
        wait=False,
    )
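The `processor` fixture is not shown in this example; a hedged sketch of a local-mode processor that would fit the call above (the framework version and role are placeholders):

import pytest
from sagemaker.local import LocalSession
from sagemaker.sklearn.processing import SKLearnProcessor


@pytest.fixture
def processor():
    # Local mode runs the job in Docker on the development machine; the role is a placeholder.
    session = LocalSession()
    session.config = {"local": {"local_code": True}}
    return SKLearnProcessor(
        framework_version="0.23-1",
        role="arn:aws:iam::111122223333:role/ExampleRole",
        instance_type="local",
        instance_count=1,
        command=["python3"],
        sagemaker_session=session,
    )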
Example #30
def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role,
                                                    pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")

    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    inputs = [
        ProcessingInput(
            input_name="dummy_data.csv",
            source=input_file_path,
            destination="/opt/ml/processing/dummy_data.csv",
        )
    ]

    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
    outputs = [
        ProcessingOutput(
            output_name=output_name,
            source="/opt/ml/processing/output",
            destination=output_s3_uri,
            s3_upload_mode="EndOfJob",
        )
    ]

    data_wrangler_processor = DataWranglerProcessor(
        role=role,
        data_wrangler_flow_source=recipe_file_path,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=86400,
    )

    data_wrangler_step = ProcessingStep(
        name="data-wrangler-step",
        processor=data_wrangler_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=job_argument,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[data_wrangler_step],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    expected_image_uri = image_uris.retrieve(
        "data-wrangler", region=sagemaker_session.boto_region_name)
    assert len(definition["Steps"]) == 1
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] is not None
    assert definition["Steps"][0]["Arguments"]["AppSpecification"][
        "ImageUri"] == expected_image_uri

    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
    assert len(processing_inputs) == 2
    for processing_input in processing_inputs:
        if processing_input["InputName"] == "flow":
            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/flow"
        elif processing_input["InputName"] == "dummy_data.csv":
            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
            assert processing_input["S3Input"][
                "LocalPath"] == "/opt/ml/processing/dummy_data.csv"
        else:
            raise AssertionError("Unknown input name")
    assert definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"] is not None
    processing_outputs = definition["Steps"][0]["Arguments"][
        "ProcessingOutputConfig"]["Outputs"]
    assert len(processing_outputs) == 1
    assert processing_outputs[0]["OutputName"] == output_name
    assert processing_outputs[0]["S3Output"] is not None
    assert processing_outputs[0]["S3Output"][
        "LocalPath"] == "/opt/ml/processing/output"
    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60, max_attempts=10)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "data-wrangler-step"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass