def test_one_step_pyspark_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark = ProcessingStep(
        name="pyspark-process",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case where the role used for the pipeline execution
        # is different from the role required by the steps in the pipeline itself. The role
        # in the pipeline definition needs to create training and processing jobs and other
        # SageMaker entities. However, the jobs created by the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not needed during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check that the CacheConfig made it into the pipeline definition.
        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0][
            "CacheConfig"
        ]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "pyspark-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_two_processing_job_depends_on(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
    cpu_instance_type,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark_1 = ProcessingStep(
        name="pyspark-process-1",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
    )

    step_pyspark_2 = ProcessingStep(
        name="pyspark-process-2",
        depends_on=[step_pyspark_1],
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark_1, step_pyspark_2],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=60)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 2

        # The second step depends on the first, so it must not start before the first ends.
        time_stamp = {}
        for execution_step in execution_steps:
            name = execution_step["StepName"]
            if name == "pyspark-process-1":
                time_stamp[name] = execution_step["EndTime"]
            else:
                time_stamp[name] = execution_step["StartTime"]
        assert time_stamp["pyspark-process-1"] < time_stamp["pyspark-process-2"]
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {
            "spark.executor.memory": "200g",
            "spark.driver.memory": "200g",
            "spark.executor.cores": "20",
            "spark.cores.max": "20",
        },
    }
]

node_1_run_args = node_1_proc.get_run_args(
    submit_app='node_1.py',
    configuration=configuration,
    arguments=["--aws_bucket", BUCKET, "--aws_prefix", PREFIX],
    outputs=[
        ProcessingOutput(
            output_name="node-1-output",
            source="/opt/ml/processing/output/node-1.parquet",
            destination=f"s3://{BUCKET}/{PREFIX}",
        )
    ],
)

node_1_step = ProcessingStep(
    name="node-1-step",
    processor=node_1_proc,
    outputs=node_1_run_args.outputs,
    code=node_1_run_args.code,
    job_arguments=node_1_run_args.arguments,
)

# ###### #
# Node 2 #
node_2_proc = PySparkProcessor(
    base_job_name='spark-proc-name',