Example #1
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=[
            "python3", "/opt/ml/processing/input/code/processing_code.py"
        ],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = [expected_args["inputs"][0]]

    sagemaker_session.process.assert_called_with(**expected_args)
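As a reference point, the NetworkConfig above serializes into the AWS CreateProcessingJob request shape. A minimal sketch of the resulting dictionary (field names follow the AWS API; this test does not assert it directly, but Example #5 asserts the isolation-only variant):

# Sketch of the request form of the NetworkConfig above (an assumption,
# not asserted by this test; compare Example #5's isolation-only case).
expected_network_config = {
    "EnableNetworkIsolation": True,
    "VpcConfig": {
        "SecurityGroupIds": ["my_security_group_id"],
        "Subnets": ["my_subnet_id"],
    },
}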
Example #2
def test_byo_container_with_baked_in_script(sagemaker_session):
    custom_processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    custom_processor.run(
        inputs=[
            ProcessingInput(source="/local/path/to/my/sklearn_transformer",
                            destination="/code/")
        ],
        arguments=["CensusTract", "County"],
    )

    expected_args = {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/code/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            }
        ],
        "output_config": {"Outputs": []},
        "job_name": custom_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerArguments": ["CensusTract", "County"],
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": None,
    }
    sagemaker_session.process.assert_called_with(**expected_args)
Example #3
def test_processor_with_required_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    processor.run()

    expected_args = _get_expected_args(processor._current_job_name)
    del expected_args["app_specification"]["ContainerEntrypoint"]
    expected_args["inputs"] = []

    sagemaker_session.process.assert_called_with(**expected_args)
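The helper _get_expected_args is defined elsewhere in the test module and is not shown on this page. A plausible sketch, reconstructed from the literal dictionary in Example #2 (the LocalPath and ContainerEntrypoint values are assumptions; the test above deletes the entrypoint before asserting):

def _get_expected_args(job_name):
    # Hypothetical reconstruction of the test helper; field values mirror
    # the dictionary asserted in Example #2.
    return {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            }
        ],
        "output_config": {"Outputs": []},
        "job_name": job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerEntrypoint": ["python3"],  # assumed value
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": None,
    }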
Example #4
def test_processing_step_with_processor_and_step_args(pipeline_session, processing_input):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )

    step_args = processor.run(inputs=processing_input)

    try:
        ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
            processor=processor,
        )
        assert False
    except Exception as e:
        assert isinstance(e, ValueError)

    try:
        ProcessingStep(
            name="MyProcessingStep",
        )
        assert False
    except Exception as e:
        assert isinstance(e, ValueError)
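The try/except/assert-False pattern above can be expressed more directly with pytest.raises; a sketch of the equivalent assertions:

import pytest

# ProcessingStep rejects step_args combined with a processor, and also
# rejects being given neither; both cases raise ValueError.
with pytest.raises(ValueError):
    ProcessingStep(
        name="MyProcessingStep",
        step_args=step_args,
        processor=processor,
    )

with pytest.raises(ValueError):
    ProcessingStep(name="MyProcessingStep")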
Example #5
def test_processor_with_missing_network_config_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        network_config=NetworkConfig(enable_network_isolation=True),
    )

    processor.run()

    expected_args = _get_expected_args(processor._current_job_name)
    del expected_args["app_specification"]["ContainerEntrypoint"]
    expected_args["inputs"] = []
    expected_args["network_config"] = {"EnableNetworkIsolation": True}

    sagemaker_session.process.assert_called_with(**expected_args)
Example #6
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=[
            "python3", "/opt/ml/processing/input/code/processing_code.py"
        ],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
    )

    processor.run(
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = expected_args["inputs"][:-1]

    sagemaker_session.process.assert_called_with(**expected_args)
Example #7
def test_processing_step_with_processor(pipeline_session, processing_input):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )

    with warnings.catch_warnings(record=True) as w:
        step_args = processor.run(inputs=processing_input)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )

    with warnings.catch_warnings(record=True) as w:
        step = ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
            description="ProcessingStep description",
            display_name="MyProcessingStep",
            depends_on=["TestStep", "SecondTestStep"],
            cache_config=cache_config,
            property_files=[evaluation_report],
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyProcessingStep",
        "Description": "ProcessingStep description",
        "DisplayName": "MyProcessingStep",
        "Type": "Processing",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": step_args,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
        "PropertyFiles": [
            {
                "FilePath": "evaluation.json",
                "OutputName": "evaluation",
                "PropertyFileName": "EvaluationReport",
            }
        ],
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
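Downstream steps would typically consume this step's outputs through its properties or the registered PropertyFile; a brief sketch (the JSON path below is a hypothetical example):

from sagemaker.workflow.functions import JsonGet

# Reference the processing output by the output name from the run above.
evaluation_s3_uri = step.properties.ProcessingOutputConfig.Outputs[
    "evaluation"
].S3Output.S3Uri

# Pull a value out of the registered PropertyFile, e.g. for a ConditionStep.
accuracy = JsonGet(
    step_name=step.name,
    property_file=evaluation_report,
    json_path="metrics.accuracy.value",
)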
Example #8
def test_processor_with_custom_bucket(
    sagemaker_session_with_custom_bucket,
    custom_bucket_name,
    image_uri,
    cpu_instance_type,
    output_kms_key,
):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    processor = Processor(
        role=ROLE,
        image_uri=image_uri,
        instance_count=1,
        instance_type=cpu_instance_type,
        entrypoint=[
            "python3", "/opt/ml/processing/input/code/dummy_script.py"
        ],
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session_with_custom_bucket,
    )

    processor.run(
        inputs=[
            ProcessingInput(source=script_path,
                            destination="/opt/ml/processing/input/code/",
                            input_name="code")
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"
    assert custom_bucket_name in job_description["ProcessingInputs"][0]["S3Input"]["S3Uri"]

    assert job_description["ProcessingJobName"].startswith("test-processor")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingOutputConfig"]["KmsKeyId"] == output_kms_key
    assert job_description["ProcessingOutputConfig"]["Outputs"][0]["OutputName"] == "dummy_output"

    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
    assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 100

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
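The sagemaker_session_with_custom_bucket fixture is not shown here. A minimal sketch, assuming it simply pins the session's default bucket (the same mechanism Example #10 uses via Session(default_bucket=...)):

import boto3
import sagemaker

def sagemaker_session_with_custom_bucket(custom_bucket_name):
    # Hypothetical fixture body: override the default bucket so uploaded
    # code lands in custom_bucket_name instead of the account-default
    # sagemaker-<region>-<account-id> bucket.
    return sagemaker.Session(
        boto_session=boto3.Session(),
        default_bucket=custom_bucket_name,
    )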
Example #9
def run_model_monitor_job_processor(region, instance_type, role, data_capture_path,
                                    statistics_path, constraints_path, reports_path,
                                    instance_count=1, preprocessor_path=None,
                                    postprocessor_path=None, publish_cloudwatch_metrics='Disabled'):

    # Keep everything after 'datacapture/' in the capture path and mirror it
    # under the reports prefix.
    data_capture_sub_path = data_capture_path[data_capture_path.rfind('datacapture/'):]
    data_capture_sub_path = data_capture_sub_path[data_capture_sub_path.find('/') + 1:]
    processing_output_paths = reports_path + '/' + data_capture_sub_path

    input_1 = ProcessingInput(input_name='input_1',
                          source=data_capture_path,
                          destination='/opt/ml/processing/input/endpoint/' + data_capture_sub_path,
                          s3_data_type='S3Prefix',
                          s3_input_mode='File')

    baseline = ProcessingInput(input_name='baseline',
                               source=statistics_path,
                               destination='/opt/ml/processing/baseline/stats',
                               s3_data_type='S3Prefix',
                               s3_input_mode='File')

    constraints = ProcessingInput(input_name='constraints',
                                  source=constraints_path,
                                  destination='/opt/ml/processing/baseline/constraints',
                                  s3_data_type='S3Prefix',
                                  s3_input_mode='File')

    outputs = ProcessingOutput(output_name='result',
                               source='/opt/ml/processing/output',
                               destination=processing_output_paths,
                               s3_upload_mode='Continuous')

    env = {'baseline_constraints': '/opt/ml/processing/baseline/constraints/' + get_file_name(constraints_path),
           'baseline_statistics': '/opt/ml/processing/baseline/stats/' + get_file_name(statistics_path),
           'dataset_format': '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
           'dataset_source': '/opt/ml/processing/input/endpoint',
           'output_path': '/opt/ml/processing/output',
           'publish_cloudwatch_metrics': publish_cloudwatch_metrics }
    
    inputs=[input_1, baseline, constraints]
    
    if postprocessor_path:
        env['post_analytics_processor_script'] = '/opt/ml/processing/code/postprocessing/' + get_file_name(postprocessor_path)
        
        post_processor_script = ProcessingInput(input_name='post_processor_script',
                                                source=postprocessor_path,
                                                destination='/opt/ml/processing/code/postprocessing',
                                                s3_data_type='S3Prefix',
                                                s3_input_mode='File')
        inputs.append(post_processor_script)

    if preprocessor_path:
        env['record_preprocessor_script'] = '/opt/ml/processing/code/preprocessing/' + get_file_name(preprocessor_path)
         
        pre_processor_script = ProcessingInput(input_name='pre_processor_script',
                                               source=preprocessor_path,
                                               destination='/opt/ml/processing/code/preprocessing',
                                               s3_data_type='S3Prefix',
                                               s3_input_mode='File')
        
        inputs.append(pre_processor_script) 
    
    processor = Processor(image_uri = get_model_monitor_container_uri(region),
                          instance_count = instance_count,
                          instance_type = instance_type,
                          role=role,
                          env = env)

    return processor.run(inputs=inputs, outputs=[outputs])
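A hypothetical invocation of this helper (the bucket name, prefixes, and role ARN below are placeholders):

run_model_monitor_job_processor(
    region='us-west-2',
    instance_type='ml.m5.xlarge',
    role='arn:aws:iam::012345678901:role/SageMakerRole',
    data_capture_path='s3://my-bucket/datacapture/my-endpoint/AllTraffic',
    statistics_path='s3://my-bucket/baseline/statistics.json',
    constraints_path='s3://my-bucket/baseline/constraints.json',
    reports_path='s3://my-bucket/reports',
)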
Example #10
        project_name=project_name,
        env=env,
        region_name=region,
        current_time=current_time,
    )
    proc_config = metadata.getter(processing_task)
    sm_config = proc_config.get('sm_config')

    # create sagemaker session
    sess = sm.Session(default_bucket=sm_config.getter('sm_bucket'))

    processor = Processor(
        role=sm_config.getter('sm_role'),
        image_uri=image_uri,
        instance_count=sm_config.getter('sm_instance_count'),
        instance_type=sm_config.getter('sm_instance_type'),
        entrypoint=proc_config.get('endpoint'),
        volume_size_in_gb=sm_config.getter('sm_volumesize'),
        sagemaker_session=sess,
        tags=sm_config.getter('project_tag'),
    )

    processor.run(
        inputs=proc_config.get('inputs'),
        outputs=proc_config.get('outputs'),
        arguments=proc_config.get('arguments'),
        wait=False,
        logs=False,
        job_name=sm_config.getter('processing_job_name'),
    )
Example #11
def run_model_monitor_job_processor(
    region,
    instance_type,
    role,
    data_capture_path,
    statistics_path,
    constraints_path,
    reports_path,
    instance_count=1,
    preprocessor_path=None,
    postprocessor_path=None,
    publish_cloudwatch_metrics="Disabled",
):

    data_capture_sub_path = data_capture_path[data_capture_path.rfind("datacapture/"):]
    data_capture_sub_path = data_capture_sub_path[data_capture_sub_path.find("/") + 1:]
    processing_output_paths = reports_path + "/" + data_capture_sub_path

    input_1 = ProcessingInput(
        input_name="input_1",
        source=data_capture_path,
        destination="/opt/ml/processing/input/endpoint/" + data_capture_sub_path,
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    baseline = ProcessingInput(
        input_name="baseline",
        source=statistics_path,
        destination="/opt/ml/processing/baseline/stats",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    constraints = ProcessingInput(
        input_name="constraints",
        source=constraints_path,
        destination="/opt/ml/processing/baseline/constraints",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    outputs = ProcessingOutput(
        output_name="result",
        source="/opt/ml/processing/output",
        destination=processing_output_paths,
        s3_upload_mode="Continuous",
    )

    env = {
        "baseline_constraints": "/opt/ml/processing/baseline/constraints/" + get_file_name(constraints_path),
        "baseline_statistics": "/opt/ml/processing/baseline/stats/" + get_file_name(statistics_path),
        "dataset_format": '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
        "dataset_source": "/opt/ml/processing/input/endpoint",
        "output_path": "/opt/ml/processing/output",
        "publish_cloudwatch_metrics": publish_cloudwatch_metrics,
    }

    inputs = [input_1, baseline, constraints]

    if postprocessor_path:
        env["post_analytics_processor_script"] = "/opt/ml/processing/code/postprocessing/" + get_file_name(
            postprocessor_path)

        post_processor_script = ProcessingInput(
            input_name="post_processor_script",
            source=postprocessor_path,
            destination="/opt/ml/processing/code/postprocessing",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        )
        inputs.append(post_processor_script)

    if preprocessor_path:
        env["record_preprocessor_script"] = "/opt/ml/processing/code/preprocessing/" + get_file_name(
            preprocessor_path)

        pre_processor_script = ProcessingInput(
            input_name="pre_processor_script",
            source=preprocessor_path,
            destination="/opt/ml/processing/code/preprocessing",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        )

        inputs.append(pre_processor_script)

    processor = Processor(
        image_uri=get_model_monitor_container_uri(region),
        instance_count=instance_count,
        instance_type=instance_type,
        role=role,
        env=env,
    )

    return processor.run(inputs=inputs, outputs=[outputs])