def test_script_processor_with_all_parameters(exists_mock, isfile_mock, sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
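# Likely shape of the mock fixtures consumed by the test above (an
# assumption; the real fixtures are defined elsewhere in the test module):
# they patch the filesystem checks so the fake local code path passes the
# ScriptProcessor's validation.
import pytest
from unittest.mock import patch

@pytest.fixture
def exists_mock():
    with patch("os.path.exists", return_value=True) as mock:
        yield mock

@pytest.fixture
def isfile_mock():
    with patch("os.path.isfile", return_value=True) as mock:
        yield mock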
def test_script_processor_with_no_inputs_or_outputs(sagemaker_session, image_uri, cpu_instance_type):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor-with-no-inputs-or-outputs",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"
    assert job_description["ProcessingJobName"].startswith("test-script-processor-with-no-inputs")
    assert job_description["ProcessingJobStatus"] == "Completed"
    # Assert against the fixture-provided instance type rather than a
    # hard-coded value, so the test holds for any cpu_instance_type.
    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": cpu_instance_type,
            "VolumeSizeInGB": 100,
        }
    }
    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri
    assert job_description["Environment"] == {"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}
    assert ROLE in job_description["RoleArn"]
    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
def test_script_processor_with_all_parameters_via_run_args(
    exists_mock, isfile_mock, sagemaker_session
):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    run_args = processor.get_run_args(
        code="/local/path/to/processing_code.py",
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
    )

    processor.run(
        code=run_args.code,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        arguments=run_args.arguments,
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
def lambda_handler(event, context):
    timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

    # Download the raw dataset locally
    local_filename, headers = urllib.request.urlretrieve(
        'https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv')
    os.rename(local_filename, '/tmp/abalone.csv')

    # Environment variables
    bucket = os.environ['bucket']
    role = os.environ['role']
    sagemaker_session = sagemaker.Session()
    spark_repository_uri = os.environ['spark_repository_uri']

    # Prefix constants
    prefix = 'sagemaker/spark-preprocess-demo/' + timestamp_prefix
    input_prefix = prefix + '/input/raw/abalone'
    input_preprocessed_prefix = prefix + '/input/preprocessed/abalone'
    mleap_model_prefix = prefix + '/mleap-model'

    # Store the value of the execution timestamp
    client = boto3.client('s3')
    client.put_object(Body=timestamp_prefix.encode('ascii'), Bucket=bucket, Key='execution.txt')

    # Upload data so it's present for training and inference
    print(sagemaker_session.upload_data(path='/tmp/abalone.csv', bucket=bucket, key_prefix=input_prefix))

    spark_processor = ScriptProcessor(
        base_job_name='spark-preprocessor',
        image_uri=spark_repository_uri,
        command=['/opt/program/submit'],
        role=role,
        instance_count=2,
        instance_type='ml.r5.xlarge',
        max_runtime_in_seconds=1200,
        env={'mode': 'python'},
    )

    spark_processor.run(
        code=f's3://{bucket}/sparkdemo/preprocess.py',
        arguments=['s3_input_bucket', bucket,
                   's3_input_key_prefix', input_prefix,
                   's3_output_bucket', bucket,
                   's3_output_key_prefix', input_preprocessed_prefix,
                   's3_model_bucket', bucket,
                   's3_mleap_model_prefix', mleap_model_prefix],
        logs=True,
    )

    event['s3_output_path'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/xgboost_model'
    event['train_data'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/input/preprocessed/abalone/train/part'
    event['validation_data'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/input/preprocessed/abalone/validation/part'
    event['training_job'] = f'{timestamp_prefix}-job'
    return event
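# Note that spark_processor.run above passes arguments as bare key/value
# pairs rather than --flag pairs. A minimal sketch (an assumption; the
# demo's real preprocess.py may parse its arguments differently) of how
# the script could consume them:
import sys

def parse_pair_args(argv):
    # ['s3_input_bucket', 'b', 's3_input_key_prefix', 'p'] ->
    # {'s3_input_bucket': 'b', 's3_input_key_prefix': 'p'}
    return dict(zip(argv[::2], argv[1::2]))

args = parse_pair_args(sys.argv[1:])
input_path = 's3://{}/{}'.format(args['s3_input_bucket'], args['s3_input_key_prefix'])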
def test_processing_job_inputs_and_output_config(
    sagemaker_session, image_uri, cpu_instance_type, output_kms_key
):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=_get_processing_inputs_with_all_parameters(sagemaker_session.default_bucket()),
        outputs=_get_processing_outputs_with_all_parameters(),
        arguments=["-v"],
        wait=False,
    )

    job_description = script_processor.latest_job.describe()
    expected_inputs_and_outputs = _get_processing_job_inputs_and_outputs(
        sagemaker_session.default_bucket(), output_kms_key
    )
    assert (
        job_description["ProcessingInputs"][:-1]
        == expected_inputs_and_outputs["ProcessingInputs"]
    )
    assert (
        job_description["ProcessingOutputConfig"]
        == expected_inputs_and_outputs["ProcessingOutputConfig"]
    )
def test_processing_step_with_script_processor(pipeline_session, processing_input, network_config):
    processor = ScriptProcessor(
        role=sagemaker.get_execution_role(),
        image_uri=IMAGE_URI,
        command=["python3"],
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="volume-kms-key",
        output_kms_key="output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=network_config,
        sagemaker_session=pipeline_session,
    )

    step_args = processor.run(
        inputs=processing_input,
        code=DUMMY_S3_SCRIPT_PATH,
        job_name="my-processing-job",
    )

    step = ProcessingStep(
        name="MyProcessingStep",
        step_args=step_args,
    )
    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": step_args,
    }
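# Outside the unit test, the assembled pipeline would be registered and
# executed with the SDK's standard calls; a minimal sketch (the role
# value here is an assumption):
pipeline.upsert(role_arn=sagemaker.get_execution_role())
execution = pipeline.start()
execution.wait()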
def test_script_processor(sagemaker_session, image_uri, cpu_instance_type, output_kms_key):
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[
            ProcessingInput(
                source=input_file_path,
                destination="/opt/ml/processing/input/container/path/",
                input_name="dummy_input",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"
    assert job_description["ProcessingInputs"][1]["InputName"] == "code"
    assert job_description["ProcessingJobName"].startswith("test-script-processor")
    assert job_description["ProcessingJobStatus"] == "Completed"
    assert job_description["ProcessingOutputConfig"]["KmsKeyId"] == output_kms_key
    assert job_description["ProcessingOutputConfig"]["Outputs"][0]["OutputName"] == "dummy_output"
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
    assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 100
    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri
    assert job_description["Environment"] == {"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}
    assert ROLE in job_description["RoleArn"]
    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
# The opening of this snippet was truncated; judging from the later call
# to script_processor.run(...), the constructor presumably began:
script_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

script_processor.run(
    code="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/code/processor.py",
    inputs=[
        ProcessingInput(
            # Input data files; source can be an S3 URI or a local path
            source="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/input",
            destination="/opt/ml/processing/input/data",
            input_name="parquet",
            s3_data_type="S3Prefix",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/output",
            output_name="output",
        ),
    ],
    # Must be a list of str
    arguments=["--option", "1"],
    wait=True,
)
def process(
    session: SagemakerSession,
    role,
    script,
    # script_source,
    inputs=None,
    outputs=None,
    dependencies=None,
    requirements=None,
    configuration_script=None,
    configuration_command=None,
    base_job_name=PROCESSING_JOB_NAME,
    job_name=None,
    image=Images.PROCESSING.tag,
    image_path=Images.PROCESSING.path,
    image_accounts=",".join(Images.PROCESSING.accounts),
    instance=PROCESSING_INSTANCE,
    volume_size=30,
    runtime_seconds=PROCESSING_RUNTIME_SECONDS,
    output_mount=OUTPUT_MOUNT,
    input_mount=INPUT_MOUNT,
    module_mount=MODULE_MOUNT,
    python='python3',
    wait=True,
    logs=True,
    arguments=None,
    tags=None,
    output_json=None,
    env=None
):
    iam = session.boto_session.client('iam')
    image_uri = ecr_ensure_image(
        image=Image(
            path=image_path,
            tag=image,
            accounts=image_accounts.split(",")
        ),
        session=session.boto_session
    )
    role = ensure_processing_role(iam=iam, role_name=role)

    if inputs is None:
        inputs = {}
    if outputs is None:
        outputs = {}
    if dependencies is None:
        dependencies = {}
    if tags is None:
        tags = {}
    else:
        tags = tags.copy()
    if arguments is None:
        arguments = {}
    else:
        arguments = arguments.copy()

    # if module_mount is not None and len(module_mount) > 0:
    #     command = ["PYTHONPATH={module_mount};${{PYTHONPATH}}".format(module_mount=module_mount), "python3"]
    # else:
    #     command = ['python3']
    command = ['sh']

    path_arguments = {}
    processing_inputs = []
    s3 = session.boto_session.client('s3')
    for name, source in inputs.items():
        processing_input, path_argument = make_processing_input(
            mount=input_mount,
            name=name,
            source=source.local,
            mode=source.mode,
            s3=s3
        )
        processing_inputs.append(processing_input)
        path_arguments[name] = path_argument

    for name, source in dependencies.items():
        processing_input, path_argument = make_processing_input(
            mount=module_mount,
            name=name,
            source=source,
            s3=s3
        )
        processing_inputs.append(processing_input)
        path_arguments[name] = path_argument

    script_remote = "{}/{}".format(module_mount, os.path.basename(script))
    processing_inputs.append(
        ProcessingInput(
            source=script,
            destination=module_mount,
            input_name="aws_sagemaker_remote_script",
            # s3_data_type='S3Prefix',
            s3_input_mode='File',
            # s3_data_distribution_type='FullyReplicated',
            # s3_compression_type='None'
        )
    )

    if env:
        env = env.copy()
    else:
        env = {}
    env.update({
        "AWS_SAGEMAKER_REMOTE_MODULE_MOUNT": module_mount,
        "AWS_SAGEMAKER_REMOTE_PYTHON": python,
        "AWS_SAGEMAKER_REMOTE_SCRIPT": script_remote,
        "IS_SAGEMAKER": "1"
    })

    if requirements:
        requirements_remote = "{}/requirements_txt/{}".format(module_mount, 'requirements.txt')
        env['AWS_SAGEMAKER_REMOTE_REQUIREMENTS'] = requirements_remote
        processing_inputs.append(
            ProcessingInput(
                source=requirements,
                destination="{}/requirements_txt".format(module_mount),
                input_name="aws_sagemaker_remote_requirements",
                s3_input_mode='File',
            )
        )

    if configuration_script:
        configuration_script_remote = "{}/{}".format(module_mount, os.path.basename(configuration_script))
        env['AWS_SAGEMAKER_REMOTE_CONFIGURATION_SCRIPT'] = configuration_script_remote
        processing_inputs.append(
            ProcessingInput(
                source=configuration_script,
                destination=module_mount,
                input_name="aws_sagemaker_remote_configuration_script",
                s3_input_mode='File'
            )
        )

    if configuration_command and len(configuration_command) > 0:
        env['AWS_SAGEMAKER_REMOTE_CONFIGURATION_COMMAND'] = configuration_command

    tags["Source"] = 'aws-sagemaker-remote'
    tags["BaseJobName"] = base_job_name
    tags = make_tags(tags)
    print("Tags: {}".format(tags))

    processor = ScriptProcessor(
        role,
        image_uri=image_uri,
        instance_count=1,
        instance_type=instance,
        command=command,
        volume_size_in_gb=volume_size,
        # volume_kms_key=None,
        # output_kms_key=None,
        max_runtime_in_seconds=runtime_seconds,
        base_job_name=base_job_name,
        sagemaker_session=session,
        env=env,
        tags=tags
        # network_config=None
    )

    processing_outputs = []
    for name, dest in outputs.items():
        # todo: move into PathArgument class
        if not ((not dest.remote) or dest.remote == 'default' or dest.remote.startswith('s3://')):
            raise ValueError(
                "Argument [{}] must be either `default` or an S3 url (`s3://...`). Value given was [{}].".format(
                    variable_to_argparse("{}_s3".format(name)), dest.remote))
        source = "{}/{}".format(output_mount, name)
        if dest.mode:
            assert dest.mode in ['EndOfJob', 'Continuous']
        processing_outputs.append(
            ProcessingOutput(
                source=source,
                destination=dest.remote if dest.remote and dest.remote != 'default' else None,
                output_name=name,
                s3_upload_mode=dest.mode or 'EndOfJob'
            ))
        path_arguments[name] = source

    ensure_eol(PROCESSING_SCRIPT)
    code = Path(PROCESSING_SCRIPT).as_uri()
    if job_name is None or len(str(job_name).strip()) == 0:
        job_name = None
    else:
        job_name = str(job_name).strip()
    arguments.update(path_arguments)

    processor.run(
        code=code,
        inputs=processing_inputs,
        outputs=processing_outputs,
        wait=False,
        logs=logs,
        job_name=job_name,
        arguments=sagemaker_arguments(vargs=arguments)
    )
    job = processor.latest_job
    if output_json:
        obj = job.describe()
        # print("Describe: {}".format(obj))
        os.makedirs(os.path.dirname(os.path.abspath(output_json)), exist_ok=True)
        with open(output_json, 'w') as f:
            json.dump(obj, f, default=json_converter, indent=4)
    if wait:
        job.wait(logs=logs)
    return processor
# The opening of this snippet was truncated; from the inputs=INPUTS usage
# below, it presumably began with the input list:
INPUTS = [
    ProcessingInput(
        source='s3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_raw',
        destination="/opt/ml/processing/input/data",
    ),
]
OUTPUTS = [
    ProcessingOutput(
        source="/opt/ml/processing/output/train",
        destination='s3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_features/train',
    ),
    ProcessingOutput(
        source="/opt/ml/processing/output/test",
        destination='s3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_features/test',
    ),
]

# PROCESSOR BUILD AND RUN
processor = ScriptProcessor(
    base_job_name=JOB_NAME,
    tags=TAGS,
    role=ROLE_SAGEMAKER,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    image_uri=IMAGE_URI,
    command=['python3'],
)

processor.run(
    code=SCRIPT,
    arguments=ARGUMENTS,
    inputs=INPUTS,
    outputs=OUTPUTS,
)
def test_byo_container_with_script_processor(sagemaker_session):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    with patch("os.path.isfile", return_value=True):
        script_processor.run(
            code="/local/path/to/sklearn_transformer.py",
            inputs=[
                ProcessingInput(
                    source="/local/path/to/my/dataset/census.csv",
                    destination="/data/",
                )
            ],
            experiment_config={"ExperimentName": "AnExperiment"},
        )

    expected_args = {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/data/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
            {
                "InputName": "code",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
        ],
        "output_config": {"Outputs": []},
        "job_name": script_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerEntrypoint": [
                "python3",
                "/opt/ml/processing/input/code/sklearn_transformer.py",
            ],
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": {"ExperimentName": "AnExperiment"},
    }
    sagemaker_session.process.assert_called_with(**expected_args)
ROLE_ARN = sagemaker.get_execution_role()

## Build the ECR image URI
ACCOUNT_ID = boto3.client('sts').get_caller_identity().get('Account')
REGION = boto3.Session().region_name
ECR_REPOSITORY = 'sagemaker-processing-container'
TAG = ':latest'
IMAGE_URI = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(ACCOUNT_ID, REGION, ECR_REPOSITORY + TAG)

## Call the processing job
script_processor = ScriptProcessor(
    command=['python3'],
    image_uri=IMAGE_URI,
    role=ROLE_ARN,
    instance_count=1,
    instance_type='ml.m5.xlarge',
)

script_processor.run(
    code='train_val_test_split.py',
    inputs=[
        ProcessingInput(
            source=f's3://{BUCKET}/{INPUT_FOLDER}/',
            destination='/opt/ml/processing/input',
        )
    ],
    outputs=[
        ProcessingOutput(
            source='/opt/ml/processing/output/train',
            destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/train',
        ),
        ProcessingOutput(
            source='/opt/ml/processing/output/validation',
            destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/validation',
        ),
        ProcessingOutput(
            source='/opt/ml/processing/output/test',
            destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/test',
        ),
    ],
)
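# Optional sanity check (not in the original snippet): confirm the image
# tag actually exists in ECR before launching the processing job.
ecr = boto3.client('ecr', region_name=REGION)
ecr.describe_images(
    repositoryName=ECR_REPOSITORY,
    imageIds=[{'imageTag': 'latest'}],
)  # raises an ImageNotFoundException if the tag is missing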
# For local processing jobs, a dummy role is sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-scikit-learn-processing-local',
    role=role,
    instance_count=1,
    instance_type='local',
)

processor.run(
    code='processing_script.py',
    inputs=[
        ProcessingInput(
            source='./input_data/',
            destination='/opt/ml/processing/input_data/',
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name='word_count_data',
            source='/opt/ml/processing/processed_data/',
        )
    ],
    arguments=['job-type', 'word-count'],
)

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']
print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'word_count_data':
        word_count_data_file = output['S3Output']['S3Uri']
        print('Output file is located on: {}'.format(word_count_data_file))
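# processing_script.py is not shown with this snippet; the following is a
# hypothetical minimal version consistent with the container paths and
# output name used above (the sample's real script may differ).
import os
import sys

input_dir = '/opt/ml/processing/input_data/'
output_dir = '/opt/ml/processing/processed_data/'
os.makedirs(output_dir, exist_ok=True)

# sys.argv[1:] would be ['job-type', 'word-count'] here; this sketch
# simply counts words regardless of the job type.
counts = {}
for fname in os.listdir(input_dir):
    with open(os.path.join(input_dir, fname)) as f:
        for word in f.read().split():
            counts[word] = counts.get(word, 0) + 1

with open(os.path.join(output_dir, 'word_counts.txt'), 'w') as f:
    for word, count in sorted(counts.items()):
        f.write('{} {}\n'.format(word, count))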
# For local processing jobs, a dummy role is sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-delta-sharing-processing-local',
    role=role,
    instance_count=1,
    instance_type='local',
)

processor.run(
    code='processing_script.py',
    inputs=[
        ProcessingInput(
            source='./profile/',
            destination='/opt/ml/processing/profile/',
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name='delta_lake_processed_data',
            source='/opt/ml/processing/processed_data/',
        )
    ],
)

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']
print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'delta_lake_processed_data':
        delta_lake_processed_data_file = output['S3Output']['S3Uri']
        # Split the S3 URI: element 2 is the bucket name
        bucket = delta_lake_processed_data_file.split("/")[2]
        # The original snippet was truncated here; joining the remaining
        # path segments to recover the key prefix is the likely intent.
        output_file_name = '/'.join(delta_lake_processed_data_file.split("/")[3:])
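# Possible continuation (an assumption, since the snippet above is
# truncated): the S3Uri is a prefix rather than a single object, so list
# the objects the job produced under it.
import boto3

s3 = boto3.client('s3')
listing = s3.list_objects_v2(Bucket=bucket, Prefix=output_file_name)
for obj in listing.get('Contents', []):
    print('Processed object: s3://{}/{}'.format(bucket, obj['Key']))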