def parseIOAndAllowAccess(args, env, sm_project):
    input_data_path = None
    distribution = "FullyReplicated"
    if args.input_path:
        input_data_path, distribution, subdir = args.input_path[0]
        if input_data_path.lower().startswith("s3://"):
            input_data_path = sagemaker.s3.s3_path_join(input_data_path, subdir)
        else:
            input_data_path = os.path.join(input_data_path, subdir)
    inputs = list()
    if args.input_task:
        for (input_name, task_name, ttype, distribution, subdir) in args.input_task:
            s3_uri = sm_project.getInputConfig(
                task_name,
                ttype,
                distribution=distribution,
                subdir=subdir,
                return_s3uri=True,
            )
            inputs.append(
                ProcessingInput(
                    s3_uri,
                    f"/opt/ml/processing/input/data/{input_name}",
                    input_name,
                    s3_data_distribution_type=distribution,
                )
            )
            env[f"SM_CHANNEL_{input_name.upper()}"] = f"/opt/ml/processing/input/data/{input_name}"
    if args.input_s3:
        for (input_name, s3_uri, distribution, subdir) in args.input_s3:
            s3_uri = sagemaker.s3.s3_path_join(s3_uri, subdir)
            bucket, _ = sagemaker.s3.parse_s3_url(s3_uri)
            sm_project.allowAccessToS3Bucket(bucket)
            inputs.append(
                ProcessingInput(
                    s3_uri,
                    f"/opt/ml/processing/input/data/{input_name}",
                    input_name,
                    s3_data_distribution_type=distribution,
                )
            )
            env[f"SM_CHANNEL_{input_name.upper()}"] = f"/opt/ml/processing/input/data/{input_name}"
    outputs = list()  # TBD: support outputs
    return input_data_path, distribution, inputs, outputs
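# A hedged illustration of the argument shapes parseIOAndAllowAccess() expects. Only the
# tuple layouts are inferred from the unpacking above; every value below is a hypothetical
# placeholder, not something prescribed by the original code.
from types import SimpleNamespace

example_args = SimpleNamespace(
    # single (path, distribution, subdir) triple
    input_path=[("s3://example-bucket/raw", "FullyReplicated", "2021-06")],
    # (input_name, task_name, task_type, distribution, subdir) tuples
    input_task=[("features", "preprocess", "processing", "FullyReplicated", "output")],
    # (input_name, s3_uri, distribution, subdir) tuples
    input_s3=[("labels", "s3://example-bucket/labels", "ShardedByS3Key", "v1")],
)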
def test_processing_step_normalizes_args_with_no_code(mock_normalize_args, script_processor):
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    outputs = [
        ProcessingOutput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=script_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=["arg1", "arg2"],
        cache_config=cache_config,
    )
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()
    mock_normalize_args.assert_called_with(
        job_name=None,
        arguments=step.job_arguments,
        inputs=step.inputs,
        outputs=step.outputs,
        code=None,
    )
def processing_input(bucket):
    return [
        ProcessingInput(
            source=f"s3://{bucket}/processing_manifest",
            destination="processing_manifest",
        )
    ]
def test_sklearn_with_network_config(sagemaker_session, sklearn_full_version, cpu_instance_type):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn-with-network-config",
        network_config=NetworkConfig(
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
    )
    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )
    job_description = sklearn_processor.latest_job.describe()
    network_config = job_description["NetworkConfig"]
    assert network_config["EnableInterContainerTrafficEncryption"]
    assert network_config["EnableNetworkIsolation"]
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=["python3", "/opt/ml/processing/input/code/processing_code.py"],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
    )
    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )
    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = [expected_args["inputs"][0]]
    sagemaker_session.process.assert_called_with(**expected_args)
def make_processing_input(mount, name, source, s3, mode=None):
    destination = "{}/{}".format(mount, name)
    if mode:
        assert mode in ['File', 'Pipe']
    processing_input = ProcessingInput(
        source=source,
        destination=destination,
        input_name=name,
        # s3_data_type='S3Prefix',
        s3_input_mode=mode or 'File',
        # s3_data_distribution_type='FullyReplicated',
        # s3_compression_type='None'
    )
    path = get_local_path(source)
    if path:
        if not os.path.exists(path):
            raise ValueError(
                "Local path [{}]: [{}] does not exist".format(name, source))
        if os.path.isfile(path):
            basename = os.path.basename(path)
            path_argument = "{}/{}".format(destination, basename)
        elif os.path.isdir(path):
            path_argument = destination
        else:
            raise ValueError(
                "Local path [{}] is neither file nor directory".format(source))
    else:
        if is_s3_file(source, s3=s3):
            basename = os.path.basename(source)
            path_argument = "{}/{}".format(destination, basename)
        else:
            path_argument = destination
    return processing_input, path_argument
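# A minimal usage sketch for make_processing_input(). The bucket, object key, and boto3
# client below are hypothetical; get_local_path() and is_s3_file() are assumed to be the
# helpers this function already relies on.
import boto3

census_input, census_path = make_processing_input(
    mount="/opt/ml/processing/input",              # container mount prefix
    name="census",                                 # channel name
    source="s3://example-bucket/data/census.csv",  # a single S3 file
    s3=boto3.client("s3"),
    mode="File",
)
# census_input is passed to processor.run(inputs=[census_input], ...); census_path
# ("/opt/ml/processing/input/census/census.csv") is the in-container path handed to the
# script, pointing at the file itself because the source is a single file.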
def _generate_baseline_job_inputs(self):
    """Generates a dict with ProcessingInput objects

    Generates a dict with three ProcessingInput objects: baseline_dataset_input,
    post_processor_script_input and record_preprocessor_script_input

    Returns:
        dict: with three ProcessingInput objects as baseline job inputs
    """
    baseline_dataset = self.quality_check_config.baseline_dataset
    baseline_dataset_des = str(
        pathlib.PurePosixPath(
            _CONTAINER_BASE_PATH, _CONTAINER_INPUT_PATH, _BASELINE_DATASET_INPUT_NAME
        )
    )
    if isinstance(baseline_dataset, (ExecutionVariable, Expression, Parameter, Properties)):
        baseline_dataset_input = ProcessingInput(
            source=self.quality_check_config.baseline_dataset,
            destination=baseline_dataset_des,
            input_name=_BASELINE_DATASET_INPUT_NAME,
        )
    else:
        baseline_dataset_input = self._model_monitor._upload_and_convert_to_processing_input(
            source=self.quality_check_config.baseline_dataset,
            destination=baseline_dataset_des,
            name=_BASELINE_DATASET_INPUT_NAME,
        )

    post_processor_script_input = self._model_monitor._upload_and_convert_to_processing_input(
        source=self.quality_check_config.post_analytics_processor_script,
        destination=str(
            pathlib.PurePosixPath(
                _CONTAINER_BASE_PATH,
                _CONTAINER_INPUT_PATH,
                _POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
            )
        ),
        name=_POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
    )

    record_preprocessor_script_input = None
    if isinstance(self.quality_check_config, DataQualityCheckConfig):
        record_preprocessor_script_input = (
            self._model_monitor._upload_and_convert_to_processing_input(
                source=self.quality_check_config.record_preprocessor_script,
                destination=str(
                    pathlib.PurePosixPath(
                        _CONTAINER_BASE_PATH,
                        _CONTAINER_INPUT_PATH,
                        _RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
                    )
                ),
                name=_RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
            )
        )
    return dict(
        baseline_dataset_input=baseline_dataset_input,
        post_processor_script_input=post_processor_script_input,
        record_preprocessor_script_input=record_preprocessor_script_input,
    )
def test_processing_step(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
def _stage_configuration(self, configuration):
    """Serializes and uploads the user-provided EMR application configuration to S3.

    This method prepares an input channel.

    Args:
        configuration (Dict): the configuration dict for the EMR application configuration.
    """
    serialized_configuration = BytesIO(json.dumps(configuration).encode("utf-8"))
    s3_uri = (
        f"s3://{self.sagemaker_session.default_bucket()}/{self._current_job_name}/"
        f"input/{self._conf_container_input_name}/{self._conf_file_name}"
    )
    S3Uploader.upload_string_as_file_body(
        body=serialized_configuration,
        desired_s3_uri=s3_uri,
        sagemaker_session=self.sagemaker_session,
    )
    conf_input = ProcessingInput(
        source=s3_uri,
        destination=f"{self._conf_container_base_path}{self._conf_container_input_name}",
        input_name=_SparkProcessorBase._conf_container_input_name,
    )
    return conf_input
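# A hedged example of the kind of EMR-style application configuration this method stages;
# the classification and property values below are illustrative only, not prescribed here.
example_configuration = {
    "Classification": "spark-defaults",
    "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "2"},
}
# A Spark processor run that passes configuration=example_configuration would call
# _stage_configuration() internally, upload the serialized JSON, and mount it in the
# container at the destination built from _conf_container_base_path and
# _conf_container_input_name.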
def create_prepro_step(params, pre_processor, execution_input):
    prepro_input_data = params['prep-input-path']
    prepro_output_data = params['prep-output-path']
    input_dir = '/opt/ml/processing/input'
    output_dir = '/opt/ml/processing/output'
    prepro_inputs = [
        ProcessingInput(
            source=prepro_input_data,
            destination=input_dir,
            input_name="input-data",
        )
    ]
    prepro_outputs = [
        ProcessingOutput(
            source=output_dir,
            destination=prepro_output_data,
            output_name="processed-data",
        )
    ]
    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=pre_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=prepro_inputs,
        outputs=prepro_outputs,
        container_arguments=["--input-dir", input_dir, "--output-dir", output_dir],
    )
    return processing_step
def create_evaluation_step(params, model_evaluation_processor, execution_input, job_name, train_job_name):
    evaluation_output_destination = os.path.join(params['eval-result-path'], job_name)
    prepro_input_data = params['prep-input-path']
    trained_model_data = os.path.join(
        params['train-output-path'], train_job_name, 'output/model.tar.gz')
    model_dir = '/opt/ml/processing/model'
    data_dir = '/opt/ml/processing/test'
    output_dir = '/opt/ml/processing/evaluation'
    inputs_evaluation = [
        # data path for model evaluation
        ProcessingInput(
            source=prepro_input_data,
            destination=data_dir,
            input_name="data-dir",
        ),
        # model path
        ProcessingInput(
            source=trained_model_data,
            destination=model_dir,
            input_name="model-dir",
        ),
    ]
    outputs_evaluation = [
        ProcessingOutput(
            source=output_dir,
            destination=evaluation_output_destination,
            output_name="output-dir",
        ),
    ]
    evaluation_step = ProcessingStep(
        "SageMaker Evaluation step",
        processor=model_evaluation_processor,
        job_name=execution_input["EvaluationJobName"],
        inputs=inputs_evaluation,
        outputs=outputs_evaluation,
        container_arguments=[
            "--data-dir", data_dir,
            "--model-dir", model_dir,
            "--output-dir", output_dir,
            "--experiment-name", params['experiment-name'],
            "--mlflow-server", params['mlflow-server-uri'],
        ],
    )
    return evaluation_step
def test_byo_container_with_baked_in_script(sagemaker_session):
    custom_processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )
    custom_processor.run(
        inputs=[
            ProcessingInput(source="/local/path/to/my/sklearn_transformer", destination="/code/")
        ],
        arguments=["CensusTract", "County"],
    )
    expected_args = {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/code/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            }
        ],
        "output_config": {"Outputs": []},
        "job_name": custom_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerArguments": ["CensusTract", "County"],
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": None,
    }
    sagemaker_session.process.assert_called_with(**expected_args)
def test_script_processor_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )
    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                app_managed=True,
                feature_store_output=FeatureStoreOutput(feature_group_name="Foo"),
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )
    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
def test_processing_step_normalizes_args(mock_normalize_args, sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri="012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri",
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    outputs = [
        ProcessingOutput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        code="foo.py",
        inputs=inputs,
        outputs=outputs,
        job_arguments=["arg1", "arg2"],
        cache_config=cache_config,
    )
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()
    mock_normalize_args.assert_called_with(
        arguments=step.job_arguments,
        inputs=step.inputs,
        outputs=step.outputs,
        code=step.code,
    )
def test_sklearn_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = SKLearnProcessor(
        role=ROLE,
        framework_version="0.20.0",
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
        sagemaker_session=sagemaker_session,
    )
    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )
    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3"
    )
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri
    sagemaker_session.process.assert_called_with(**expected_args)
def _get_recipe_input(self):
    """Creates a ProcessingInput with Data Wrangler recipe uri and appends it to inputs"""
    return ProcessingInput(
        source=self.data_wrangler_flow_source,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
def test_add_depends_on(sagemaker_session):
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step_1 = ProcessingStep(
        name="MyProcessingStep-1",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_2 = ProcessingStep(
        name="MyProcessingStep-2",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3 = ProcessingStep(
        name="MyProcessingStep-3",
        depends_on=[step_1],
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    step_3.add_depends_on([step_2.name])
    assert "DependsOn" not in step_1.to_request()
    assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert step_3.to_request()["DependsOn"] == ["MyProcessingStep-1", "MyProcessingStep-2"]
def test_processing_step(sagemaker_session):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.m4.4xlarge",
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": "s3://my-bucket/processing_manifest",
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m4.4xlarge",
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
def test_script_processor_with_one_input(exists_mock, isfile_mock, sagemaker_session):
    processor = _get_script_processor(sagemaker_session)
    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(source="/local/path/to/my/dataset/census.csv", destination="/data/")
        ],
    )
    expected_args = _get_expected_args(processor._current_job_name)
    expected_args["inputs"].insert(0, _get_data_input())
    sagemaker_session.process.assert_called_with(**expected_args)
def create_s3_processing_input(s3_dataset_definition, name, base_dir):
    """Create an S3 processing input for a DW job
    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        source=s3_dataset_definition['s3ExecutionContext']['s3Uri'],
        destination=f"{base_dir}/{name}",
        input_name=name,
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
def create_flow_notebook_processing_input(base_dir, flow_s3_uri):
    """Create the flow file processing input for a DW job
    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        source=flow_s3_uri,
        destination=f"{base_dir}/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    logging.getLogger().setLevel(logging.DEBUG)  # TODO-reinvent-2019: REMOVE
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=3600,  # TODO-reinvent-2019: REMOVE
        base_job_name="test-sklearn",
    )
    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )
    job_description = sklearn_processor.latest_job.describe()
    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.m4.xlarge",
            "VolumeSizeInGB": 30,
        }
    }
    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["RoleArn"] == ROLE
def create_athena_processing_input(athena_dataset_definition, name, base_dir):
    """Create an Athena processing input for a DW job
    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        input_name=name,
        dataset_definition=DatasetDefinition(
            local_path=f"{base_dir}/{name}",
            data_distribution_type="FullyReplicated",
            athena_dataset_definition=AthenaDatasetDefinition(
                catalog=athena_dataset_definition["catalogName"],
                database=athena_dataset_definition["databaseName"],
                query_string=athena_dataset_definition["queryString"],
                output_s3_uri=athena_dataset_definition["s3OutputLocation"] + f"{name}/",
                output_format=athena_dataset_definition["outputFormat"].upper(),
            ),
        ),
    )
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )
    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )
    job_description = sklearn_processor.latest_job.describe()
    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
    assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 30
    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 86400}
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert ROLE in job_description["RoleArn"]
def _patch_inputs_with_payload(self, inputs, s3_payload) -> List[ProcessingInput]:
    # ScriptProcessor job will download only s3://..../code/runproc.sh, hence we need to also
    # inject our s3://.../sourcedir.tar.gz.
    #
    # We'll follow the exact same mechanism that ScriptProcessor does, which is to inject the
    # S3 code artifact as a processing input with destination
    # /opt/ml/processing/input/code/payload/.
    #
    # Unfortunately, as much as I'd like to put sourcedir.tar.gz to /opt/ml/processing/input/code/,
    # this cannot be done as this destination is already used by the ScriptProcessor for
    # runproc.sh, and the SDK does not allow another input with the same destination.
    # - Note that the parameterized form of this path is available as
    #   ScriptProcessor._CODE_CONTAINER_BASE_PATH and ScriptProcessor._CODE_CONTAINER_INPUT_NAME.
    # - See: https://github.com/aws/sagemaker-python-sdk/blob/a7399455f5386d83ddc5cb15c0db00c04bd518ec/src/sagemaker/processing.py#L425-L426
    if inputs is None:
        inputs = []
    inputs.append(
        ProcessingInput(
            source=s3_payload,
            destination="/opt/ml/processing/input/code/payload/",
        )
    )
    return inputs
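# A minimal sketch of how _patch_inputs_with_payload() might be invoked; the wrapper object
# and S3 URIs below are hypothetical placeholders, not part of the SageMaker SDK.
user_inputs = [
    ProcessingInput(
        source="s3://example-bucket/data/",
        destination="/opt/ml/processing/input/data/",
    )
]
patched_inputs = framework_runner._patch_inputs_with_payload(
    user_inputs,
    "s3://example-bucket/code/sourcedir.tar.gz",
)
# patched_inputs now carries an extra channel destined for
# /opt/ml/processing/input/code/payload/, alongside the runproc.sh input that
# ScriptProcessor itself injects under /opt/ml/processing/input/code/.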
def create_redshift_processing_input(redshift_dataset_definition, name, base_dir):
    """Create a Redshift processing input for a DW job
    (From Data Wrangler Job notebook template 2021-03-10)
    """
    return ProcessingInput(
        input_name=name,
        dataset_definition=DatasetDefinition(
            local_path=f"{base_dir}/{name}",
            data_distribution_type="FullyReplicated",
            redshift_dataset_definition=RedshiftDatasetDefinition(
                cluster_id=redshift_dataset_definition["clusterIdentifier"],
                database=redshift_dataset_definition["database"],
                db_user=redshift_dataset_definition["dbUser"],
                query_string=redshift_dataset_definition["queryString"],
                cluster_role_arn=redshift_dataset_definition["unloadIamRole"],
                output_s3_uri=redshift_dataset_definition["s3OutputLocation"] + f"{name}/",
                output_format=redshift_dataset_definition["outputFormat"].upper(),
            ),
        ),
    )
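# A hedged sketch of how the notebook-template helpers above might be combined into the full
# input list for a Data Wrangler job. The flow-file layout ("nodes", "parameters",
# "dataset_definition", "datasetSourceType") follows the same 2021-03-10 template the helpers
# come from, but treat it as an assumption rather than a guaranteed schema.
def create_processing_inputs(processing_dir, flow, flow_s3_uri):
    """Map the flow file plus every dataset definition in it to ProcessingInput objects."""
    processing_inputs = [create_flow_notebook_processing_input(processing_dir, flow_s3_uri)]
    for node in flow["nodes"]:
        if "dataset_definition" in node["parameters"]:
            data_def = node["parameters"]["dataset_definition"]
            name = data_def["name"]
            source_type = data_def["datasetSourceType"]
            if source_type == "S3":
                processing_inputs.append(
                    create_s3_processing_input(data_def, name, processing_dir))
            elif source_type == "Athena":
                processing_inputs.append(
                    create_athena_processing_input(data_def, name, processing_dir))
            elif source_type == "Redshift":
                processing_inputs.append(
                    create_redshift_processing_input(data_def, name, processing_dir))
            else:
                raise ValueError(f"Unsupported dataset source type: {source_type}")
    return processing_inputs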
def test_data_wrangler_processor_with_mock_input(sagemaker_session):
    processor = DataWranglerProcessor(
        role=ROLE,
        data_wrangler_flow_source=DATA_WRANGLER_RECIPE_SOURCE,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )
    mock_input = ProcessingInput(
        source=MOCK_S3_URI,
        destination="/opt/ml/processing/mock_input",
        input_name="mock_input",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )
    processor.run(inputs=[mock_input])
    expected_args = _get_expected_args(processor._current_job_name, add_mock_input=True)
    sagemaker_session.process.assert_called_with(**expected_args)
def test_local_processing_sklearn(sagemaker_local_session_no_local_code, sklearn_latest_version):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role="SageMakerRole",
        instance_type="local",
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_local_session_no_local_code,
    )
    sklearn_processor.run(
        code=script_path,
        inputs=[
            ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")
        ],
        wait=False,
        logs=False,
    )
    job_description = sklearn_processor.latest_job.describe()
    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == "local"
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["RoleArn"] == "<no_role>"
def test_preprocessing_script_in_local_container(processor):
    code_path = "../../src/mlmax/preprocessing.py"
    execution_mode = "train"  # Configure to either 'train' or 'infer'
    input_data_path = "input/census-income-sample.csv"
    local_data_path = "opt/ml/processing/input"
    processor.run(
        code=code_path,
        inputs=[
            ProcessingInput(source=local_data_path, destination="/opt/ml/processing/input")
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/train",
                output_name="train_data",
            ),
            ProcessingOutput(
                source="/opt/ml/processing/test",
                output_name="test_data",
            ),
            ProcessingOutput(
                source="/opt/ml/processing/model",
                output_name="proc_model",
            ),
        ],
        arguments=[
            "--train-test-split-ratio", "0.2",
            "--mode", execution_mode,
            "--data-input", input_data_path,
        ],
        wait=False,
    )
def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role, pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")
    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")
    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
    output_content_type = "CSV"
    output_config = {output_name: {"content_type": output_content_type}}
    job_argument = [f"--output-config '{json.dumps(output_config)}'"]

    inputs = [
        ProcessingInput(
            input_name="dummy_data.csv",
            source=input_file_path,
            destination="/opt/ml/processing/dummy_data.csv",
        )
    ]
    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
    outputs = [
        ProcessingOutput(
            output_name=output_name,
            source="/opt/ml/processing/output",
            destination=output_s3_uri,
            s3_upload_mode="EndOfJob",
        )
    ]
    data_wrangler_processor = DataWranglerProcessor(
        role=role,
        data_wrangler_flow_source=recipe_file_path,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=86400,
    )
    data_wrangler_step = ProcessingStep(
        name="data-wrangler-step",
        processor=data_wrangler_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=job_argument,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[data_wrangler_step],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    expected_image_uri = image_uris.retrieve(
        "data-wrangler", region=sagemaker_session.boto_region_name)
    assert len(definition["Steps"]) == 1
    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] is not None
    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] == expected_image_uri

    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
    assert len(processing_inputs) == 2
    for processing_input in processing_inputs:
        if processing_input["InputName"] == "flow":
            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/flow"
        elif processing_input["InputName"] == "dummy_data.csv":
            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/dummy_data.csv"
        else:
            raise AssertionError("Unknown input name")
    assert definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"] is not None
    processing_outputs = definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"]["Outputs"]
    assert len(processing_outputs) == 1
    assert processing_outputs[0]["OutputName"] == output_name
    assert processing_outputs[0]["S3Output"] is not None
    assert processing_outputs[0]["S3Output"]["LocalPath"] == "/opt/ml/processing/output"
    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn
        try:
            execution.wait(delay=60, max_attempts=10)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "data-wrangler-step"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass