def test_sklearn_processor_with_required_parameters(
    exists_mock, isfile_mock, botocore_resolver, sagemaker_session, sklearn_version
):
    botocore_resolver.return_value.construct_endpoint.return_value = {"hostname": ECR_HOSTNAME}

    processor = SKLearnProcessor(
        role=ROLE,
        instance_type="ml.m4.xlarge",
        framework_version=sklearn_version,
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )

    processor.run(code="/local/path/to/processing_code.py")

    expected_args = _get_expected_args(processor._current_job_name)

    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    ).format(sklearn_version)
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
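# NOTE (added for context): the unit tests in this section assume a module-level
# preamble that is not shown -- imports, constants such as ROLE and ECR_HOSTNAME,
# helpers like _get_expected_args/_get_expected_args_all_parameters, and @patch
# decorators that inject exists_mock / isfile_mock / botocore_resolver. A minimal
# sketch of that assumed preamble follows; constant values and patch targets are
# illustrative assumptions, not the SDK's actual test fixtures.
from unittest.mock import patch

from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

ROLE = "arn:aws:iam::012345678901:role/SageMakerRole"  # assumed placeholder value
ECR_HOSTNAME = "ecr.us-west-2.amazonaws.com"  # assumed placeholder value

# @patch decorators map bottom-up onto the leading test arguments, roughly:
# @patch("<module path to _botocore_resolver>")   # -> botocore_resolver (target depends on SDK version)
# @patch("os.path.isfile", return_value=True)     # -> isfile_mock
# @patch("os.path.exists", return_value=True)     # -> exists_mock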
def test_sklearn_with_network_config(sagemaker_session, sklearn_full_version, cpu_instance_type):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn-with-network-config",
        network_config=NetworkConfig(
            enable_network_isolation=True, encrypt_inter_container_traffic=True
        ),
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()
    network_config = job_description["NetworkConfig"]
    assert network_config["EnableInterContainerTrafficEncryption"]
    assert network_config["EnableNetworkIsolation"]
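# NOTE (added for context): the integration-style tests in this section assume
# pytest fixtures defined elsewhere (sagemaker_session, sklearn_full_version,
# sklearn_latest_version, cpu_instance_type, image_uri, output_kms_key,
# custom_bucket_name, ...) plus a small preamble in addition to the imports
# sketched above. The values below are illustrative assumptions only.
import os

DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")  # assumed test-data layout
ROLE = "SageMakerRole"  # assumed role name used by the integration tests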
def test_sklearn_with_no_inputs_or_outputs(
    sagemaker_session, image_uri, sklearn_full_version, cpu_instance_type
):
    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        command=["python3"],
        instance_type=cpu_instance_type,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key=None,
        max_runtime_in_seconds=3600,
        base_job_name="test-sklearn-with-no-inputs-or-outputs",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session,
    )

    sklearn_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"), arguments=["-v"], wait=True, logs=True
    )

    job_description = sklearn_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith("test-sklearn-with-no-inputs")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 100}
    }

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
def test_sklearn_with_no_inputs(sagemaker_session):
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=ROLE,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )

    with patch("os.path.isfile", return_value=True):
        sklearn_processor.run(code="/local/path/to/sklearn_transformer.py")

    expected_args = {
        "inputs": [
            {
                "InputName": "code",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            }
        ],
        "output_config": {"Outputs": []},
        "job_name": sklearn_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3",
            "ContainerEntrypoint": [
                "python3",
                "/opt/ml/processing/input/code/sklearn_transformer.py",
            ],
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": None,
    }

    sagemaker_session.process.assert_called_with(**expected_args)
def test_sklearn_with_all_parameters_via_run_args_called_twice(
    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session
):
    botocore_resolver.return_value.construct_endpoint.return_value = {"hostname": ECR_HOSTNAME}

    processor = SKLearnProcessor(
        role=ROLE,
        framework_version=sklearn_version,
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    # get_run_args is deliberately called twice (per the test name); only the
    # second result is used, so repeated calls must produce the same arguments.
    run_args = processor.get_run_args(
        code="/local/path/to/processing_code.py",
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
    )
    run_args = processor.get_run_args(
        code="/local/path/to/processing_code.py",
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
    )

    processor.run(
        code=run_args.code,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        arguments=run_args.arguments,
        wait=True,
        logs=False,
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    ).format(sklearn_version)
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
def test_sklearn_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = SKLearnProcessor(
        role=ROLE,
        framework_version="0.20.0",
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3"
    )
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    logging.getLogger().setLevel(logging.DEBUG)  # TODO-reinvent-2019: REMOVE

    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=3600,  # TODO-reinvent-2019: REMOVE
        base_job_name="test-sklearn",
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 30}
    }
    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["RoleArn"] == ROLE
def test_sklearn_processor_with_required_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = SKLearnProcessor(
        role=ROLE,
        instance_type="ml.m4.xlarge",
        framework_version="0.20.0",
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )

    processor.run(code="/local/path/to/processing_code.py")

    expected_args = _get_expected_args(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3"
    )
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
def test_sklearn(sagemaker_session, sklearn_full_version, cpu_instance_type):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        instance_type=cpu_instance_type,
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
    assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 30
    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 86400}
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert ROLE in job_description["RoleArn"]
def test_local_processing_sklearn(sagemaker_local_session_no_local_code, sklearn_latest_version):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role="SageMakerRole",
        instance_type="local",
        instance_count=1,
        command=["python3"],
        sagemaker_session=sagemaker_local_session_no_local_code,
    )

    sklearn_processor.run(
        code=script_path,
        inputs=[ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/")],
        wait=False,
        logs=False,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert len(job_description["ProcessingInputs"]) == 2
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == "local"
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["RoleArn"] == "<no_role>"
def test_sklearn_with_custom_default_bucket(
    sagemaker_session_with_custom_bucket,
    custom_bucket_name,
    image_uri,
    sklearn_full_version,
    cpu_instance_type,
    output_kms_key,
):
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_full_version,
        role=ROLE,
        command=["python3"],
        instance_type=cpu_instance_type,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-sklearn-with-customizations",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session_with_custom_bucket,
    )

    sklearn_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[
            ProcessingInput(
                source=input_file_path,
                destination="/opt/ml/processing/input/container/path/",
                input_name="dummy_input",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = sklearn_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"
    assert custom_bucket_name in job_description["ProcessingInputs"][0]["S3Input"]["S3Uri"]

    assert job_description["ProcessingInputs"][1]["InputName"] == "code"
    assert custom_bucket_name in job_description["ProcessingInputs"][1]["S3Input"]["S3Uri"]

    assert job_description["ProcessingJobName"].startswith("test-sklearn-with-customizations")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingOutputConfig"]["KmsKeyId"] == output_kms_key
    assert job_description["ProcessingOutputConfig"]["Outputs"][0]["OutputName"] == "dummy_output"

    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
    assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 100

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
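# NOTE (added for context): the local-mode example scripts that follow assume a
# minimal preamble like the sketch below; it is an assumption about the scripts'
# surrounding notebook/module, not part of the original snippets.
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor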
# For local processing a dummy role ARN is sufficient.
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = SKLearnProcessor(framework_version='0.20.0',
                             instance_count=1,
                             instance_type='local',
                             role=role)

print('Starting processing job.')
print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
processor.run(code='processing_script.py',
              inputs=[ProcessingInput(
                  source='./input_data/',
                  destination='/opt/ml/processing/input_data/')],
              outputs=[ProcessingOutput(
                  output_name='word_count_data',
                  source='/opt/ml/processing/processed_data/')],
              arguments=['job-type', 'word-count'])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']
print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'word_count_data':
        word_count_data_file = output['S3Output']['S3Uri']

print('Output file is located at: {}'.format(word_count_data_file))
def test_sklearn_with_all_customizations(sagemaker_session):
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=ROLE,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    with patch("os.path.isdir", return_value=True):
        sklearn_processor.run(
            code="/local/path/to/sklearn_transformer.py",
            inputs=[
                ProcessingInput(
                    source="s3://path/to/my/dataset/census.csv",
                    destination="/container/path/",
                    input_name="my_dataset",
                    s3_data_type="S3Prefix",
                    s3_input_mode="File",
                    s3_data_distribution_type="FullyReplicated",
                    s3_compression_type="None",
                )
            ],
            outputs=[
                ProcessingOutput(
                    source="/container/path/",
                    destination="s3://uri/",
                    output_name="my_output",
                    s3_upload_mode="EndOfJob",
                )
            ],
            arguments=["--drop-columns", "'SelfEmployed'"],
            wait=True,
            logs=False,
            job_name="my_job_name",
            experiment_config={"ExperimentName": "AnExperiment"},
        )

    expected_args = {
        "inputs": [
            {
                "InputName": "my_dataset",
                "S3Input": {
                    "S3Uri": "s3://path/to/my/dataset/census.csv",
                    "LocalPath": "/container/path/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
            {
                "InputName": "code",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
        ],
        "output_config": {
            "Outputs": [
                {
                    "OutputName": "my_output",
                    "S3Output": {
                        "S3Uri": "s3://uri/",
                        "LocalPath": "/container/path/",
                        "S3UploadMode": "EndOfJob",
                    },
                }
            ],
            "KmsKeyId": "arn:aws:kms:us-west-2:012345678901:key/kms-key",
        },
        "job_name": sklearn_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 100,
            }
        },
        "stopping_condition": {"MaxRuntimeInSeconds": 3600},
        "app_specification": {
            "ImageUri": "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3",
            "ContainerArguments": ["--drop-columns", "'SelfEmployed'"],
            "ContainerEntrypoint": [
                "python3",
                "/opt/ml/processing/input/code/sklearn_transformer.py",
            ],
        },
        "environment": {"my_env_variable": "my_env_variable_value"},
        "network_config": {
            "EnableNetworkIsolation": True,
            "VpcConfig": {
                "SecurityGroupIds": ["my_security_group_id"],
                "Subnets": ["my_subnet_id"],
            },
        },
        "role_arn": ROLE,
        "tags": [{"Key": "my-tag", "Value": "my-tag-value"}],
        "experiment_config": {"ExperimentName": "AnExperiment"},
    }

    sagemaker_session.process.assert_called_with(**expected_args)
processor = SKLearnProcessor(framework_version='0.20.0',
                             instance_count=1,
                             instance_type='local',
                             role=role)

print('Starting processing job.')
print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(
                      source='./dependencies/',
                      destination='/opt/ml/processing/dependencies/'),
                  ProcessingInput(
                      source='./input_data/',
                      destination='/opt/ml/processing/input_data/')
              ],
              outputs=[
                  ProcessingOutput(
                      output_name='tokenized_words_data',
                      source='/opt/ml/processing/processed_data/')
              ],
              arguments=['job-type', 'word-tokenize'])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']
print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'tokenized_words_data':
        tokenized_words_data_file = output['S3Output']['S3Uri']
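# Illustrative follow-up (not part of the original script): print the output
# location and read the results back. The S3 URI above is a prefix, so list the
# objects under it first. Assumes sagemaker.s3.S3Downloader is available in the
# installed SDK version; this is a sketch, not the example's own code.
from sagemaker.s3 import S3Downloader

print('Output data is located at: {}'.format(tokenized_words_data_file))
for uri in S3Downloader.list(tokenized_words_data_file):
    print(uri)
    print(S3Downloader.read_file(uri))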