def test_pipeline_override_experiment_config():
    pipeline = Pipeline(
        name="MyPipeline",
        pipeline_experiment_config=PipelineExperimentConfig("MyExperiment", "MyTrial"),
        steps=[CustomStep(name="MyStep", input_data="input")],
        sagemaker_session=sagemaker_session_mock,
    )
    assert ordered(json.loads(pipeline.definition())) == ordered(
        {
            "Version": "2020-12-01",
            "Metadata": {},
            "Parameters": [],
            "PipelineExperimentConfig": {
                "ExperimentName": "MyExperiment",
                "TrialName": "MyTrial",
            },
            "Steps": [
                {
                    "Name": "MyStep",
                    "Type": "Training",
                    "Arguments": {"input_data": "input"},
                }
            ],
        }
    )
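# The test above relies on an `ordered` helper to compare the generated definition with
# the expected dict regardless of key or element ordering. A minimal sketch of such a
# helper follows; it is an illustrative assumption, not necessarily the exact
# implementation used by the test suite.
def ordered(obj):
    """Recursively sort dicts and lists so that equality checks ignore ordering."""
    if isinstance(obj, dict):
        return sorted((key, ordered(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return sorted(ordered(element) for element in obj)
    return obj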
class Pipeline(Entity):
    """Pipeline for workflow.

    Attributes:
        name (str): The name of the pipeline.
        parameters (Sequence[Parameter]): The list of the parameters.
        pipeline_experiment_config (Optional[PipelineExperimentConfig]): If set,
            the workflow will attempt to create an experiment and trial before
            executing the steps. Creation will be skipped if an experiment or a trial
            with the same name already exists. By default, pipeline name is used as
            experiment name and execution id is used as the trial name.
            If set to None, no experiment or trial will be created automatically.
        steps (Sequence[Union[Step, StepCollection]]): The list of the non-conditional
            steps associated with the pipeline. Any steps that are within the
            `if_steps` or `else_steps` of a `ConditionStep` cannot be listed in the
            steps of a pipeline. Of particular note, the workflow service rejects any
            pipeline definitions that specify a step in the list of steps of a
            pipeline and that step in the `if_steps` or `else_steps` of any
            `ConditionStep`.
        sagemaker_session (sagemaker.session.Session): Session object that manages
            interactions with Amazon SageMaker APIs and any other AWS services needed.
            If not specified, the pipeline creates one using the default AWS
            configuration chain.
    """

    name: str = attr.ib(factory=str)
    parameters: Sequence[Parameter] = attr.ib(factory=list)
    pipeline_experiment_config: Optional[PipelineExperimentConfig] = attr.ib(
        default=PipelineExperimentConfig(
            ExecutionVariables.PIPELINE_NAME, ExecutionVariables.PIPELINE_EXECUTION_ID
        )
    )
    steps: Sequence[Union[Step, StepCollection]] = attr.ib(factory=list)
    sagemaker_session: Session = attr.ib(factory=Session)

    _version: str = "2020-12-01"
    _metadata: Dict[str, Any] = dict()

    def to_request(self) -> RequestType:
        """Gets the request structure for workflow service calls."""
        return {
            "Version": self._version,
            "Metadata": self._metadata,
            "Parameters": list_to_request(self.parameters),
            "PipelineExperimentConfig": self.pipeline_experiment_config.to_request()
            if self.pipeline_experiment_config is not None
            else None,
            "Steps": list_to_request(self.steps),
        }

    def create(
        self,
        role_arn: str,
        description: str = None,
        tags: List[Dict[str, str]] = None,
    ) -> Dict[str, Any]:
        """Creates a Pipeline in the Pipelines service.

        Args:
            role_arn (str): The role arn that is assumed by the pipeline to create
                step artifacts.
            description (str): A description of the pipeline.
            tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"}
                dicts as tags.

        Returns:
            A response dict from the service.
        """
        tags = _append_project_tags(tags)
        kwargs = self._create_args(role_arn, description)
        update_args(
            kwargs,
            Tags=tags,
        )
        return self.sagemaker_session.sagemaker_client.create_pipeline(**kwargs)

    def _create_args(self, role_arn: str, description: str):
        """Constructs the keyword argument dict for a create_pipeline call.

        Args:
            role_arn (str): The role arn that is assumed by pipelines to create
                step artifacts.
            description (str): A description of the pipeline.

        Returns:
            A keyword argument dict for calling create_pipeline.
        """
        kwargs = dict(
            PipelineName=self.name,
            PipelineDefinition=self.definition(),
            RoleArn=role_arn,
        )
        update_args(
            kwargs,
            PipelineDescription=description,
        )
        return kwargs

    def describe(self) -> Dict[str, Any]:
        """Describes a Pipeline in the Workflow service.

        Returns:
            Response dict from the service.
        """
        return self.sagemaker_session.sagemaker_client.describe_pipeline(
            PipelineName=self.name
        )

    def update(self, role_arn: str, description: str = None) -> Dict[str, Any]:
        """Updates a Pipeline in the Workflow service.

        Args:
            role_arn (str): The role arn that is assumed by pipelines to create
                step artifacts.
            description (str): A description of the pipeline.

        Returns:
            A response dict from the service.
        """
        kwargs = self._create_args(role_arn, description)
        return self.sagemaker_session.sagemaker_client.update_pipeline(**kwargs)

    def upsert(
        self,
        role_arn: str,
        description: str = None,
        tags: List[Dict[str, str]] = None,
    ) -> Dict[str, Any]:
        """Creates a pipeline or updates it, if it already exists.

        Args:
            role_arn (str): The role arn that is assumed by workflow to create
                step artifacts.
            description (str): A description of the pipeline.
            tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"}
                dicts as tags.

        Returns:
            A response dict from the service.
        """
        try:
            response = self.create(role_arn, description, tags)
        except ClientError as e:
            error = e.response["Error"]
            if (
                error["Code"] == "ValidationException"
                and "Pipeline names must be unique within" in error["Message"]
            ):
                response = self.update(role_arn, description)
            else:
                raise
        return response

    def delete(self) -> Dict[str, Any]:
        """Deletes a Pipeline in the Workflow service.

        Returns:
            A response dict from the service.
        """
        return self.sagemaker_session.sagemaker_client.delete_pipeline(
            PipelineName=self.name
        )

    def start(
        self,
        parameters: Dict[str, Any] = None,
        execution_display_name: str = None,
        execution_description: str = None,
    ):
        """Starts a Pipeline execution in the Workflow service.

        Args:
            parameters (Dict[str, Any]): Values to override pipeline parameters,
                keyed by parameter name.
            execution_display_name (str): The display name of the pipeline execution.
            execution_description (str): A description of the execution.

        Returns:
            A `_PipelineExecution` instance, if successful.
        """
        exists = True
        try:
            self.describe()
        except ClientError:
            exists = False

        if not exists:
            raise ValueError(
                "This pipeline is not associated with a Pipeline in SageMaker. "
                "Please invoke create() first before attempting to invoke start()."
            )

        kwargs = dict(PipelineName=self.name)
        update_args(
            kwargs,
            PipelineParameters=format_start_parameters(parameters),
            PipelineExecutionDescription=execution_description,
            PipelineExecutionDisplayName=execution_display_name,
        )
        response = self.sagemaker_session.sagemaker_client.start_pipeline_execution(**kwargs)
        return _PipelineExecution(
            arn=response["PipelineExecutionArn"],
            sagemaker_session=self.sagemaker_session,
        )

    def definition(self) -> str:
        """Converts a request structure to string representation for workflow service calls."""
        request_dict = self.to_request()
        request_dict["PipelineExperimentConfig"] = interpolate(
            request_dict["PipelineExperimentConfig"]
        )
        request_dict["Steps"] = interpolate(request_dict["Steps"])
        return json.dumps(request_dict)
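# Sketches of the two module-level helpers the class above leans on. `update_args`
# copies only the non-None keyword arguments into the request kwargs, so optional
# request fields are omitted rather than sent as null, and `format_start_parameters`
# maps a {name: value} dict to the list-of-dicts shape start_pipeline_execution
# expects. These are illustrative reconstructions, not guaranteed to match the SDK
# internals exactly.
from typing import Any, Dict, List


def update_args(args: Dict[str, Any], **kwargs):
    """Update the request args in place with every keyword argument that is not None."""
    for key, value in kwargs.items():
        if value is not None:
            args.update({key: value})


def format_start_parameters(parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Format a {name: value} dict as [{"Name": ..., "Value": ...}, ...] for the service."""
    if parameters is None:
        return None
    return [{"Name": name, "Value": str(value)} for name, value in parameters.items()]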
def test_pipeline_experiment_config():
    config = PipelineExperimentConfig("experiment-name", "trial-name")
    assert config.to_request() == {
        "ExperimentName": "experiment-name",
        "TrialName": "trial-name",
    }
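# For context, a minimal sketch of the behavior the test above checks: the config is a
# simple two-field container that serializes to the request shape shown. The real class
# also accepts pipeline execution variables (e.g. ExecutionVariables.PIPELINE_NAME) and
# Join expressions, which are resolved by interpolate() when the definition is built.
class _PipelineExperimentConfigSketch:
    def __init__(self, experiment_name, trial_name):
        self.experiment_name = experiment_name
        self.trial_name = trial_name

    def to_request(self):
        return {"ExperimentName": self.experiment_name, "TrialName": self.trial_name}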
class Pipeline(Entity):
    """Pipeline for workflow.

    Attributes:
        name (str): The name of the pipeline.
        parameters (Sequence[Parameter]): The list of the parameters.
        pipeline_experiment_config (Optional[PipelineExperimentConfig]): If set,
            the workflow will attempt to create an experiment and trial before
            executing the steps. Creation will be skipped if an experiment or a trial
            with the same name already exists. By default, pipeline name is used as
            experiment name and execution id is used as the trial name.
            If set to None, no experiment or trial will be created automatically.
        steps (Sequence[Union[Step, StepCollection]]): The list of the non-conditional
            steps associated with the pipeline. Any steps that are within the
            `if_steps` or `else_steps` of a `ConditionStep` cannot be listed in the
            steps of a pipeline. Of particular note, the workflow service rejects any
            pipeline definitions that specify a step in the list of steps of a
            pipeline and that step in the `if_steps` or `else_steps` of any
            `ConditionStep`.
        sagemaker_session (sagemaker.session.Session): Session object that manages
            interactions with Amazon SageMaker APIs and any other AWS services needed.
            If not specified, the pipeline creates one using the default AWS
            configuration chain.
    """

    name: str = attr.ib(factory=str)
    parameters: Sequence[Parameter] = attr.ib(factory=list)
    pipeline_experiment_config: Optional[PipelineExperimentConfig] = attr.ib(
        default=PipelineExperimentConfig(
            ExecutionVariables.PIPELINE_NAME, ExecutionVariables.PIPELINE_EXECUTION_ID
        )
    )
    steps: Sequence[Union[Step, StepCollection]] = attr.ib(factory=list)
    sagemaker_session: Session = attr.ib(factory=Session)

    _version: str = "2020-12-01"
    _metadata: Dict[str, Any] = dict()

    def to_request(self) -> RequestType:
        """Gets the request structure for workflow service calls."""
        return {
            "Version": self._version,
            "Metadata": self._metadata,
            "Parameters": list_to_request(self.parameters),
            "PipelineExperimentConfig": self.pipeline_experiment_config.to_request()
            if self.pipeline_experiment_config is not None
            else None,
            "Steps": list_to_request(self.steps),
        }

    def create(
        self,
        role_arn: str,
        description: str = None,
        tags: List[Dict[str, str]] = None,
        parallelism_config: ParallelismConfiguration = None,
    ) -> Dict[str, Any]:
        """Creates a Pipeline in the Pipelines service.

        Args:
            role_arn (str): The role arn that is assumed by the pipeline to create
                step artifacts.
            description (str): A description of the pipeline.
            tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"}
                dicts as tags.
            parallelism_config (Optional[ParallelismConfiguration]): Parallelism
                configuration that is applied to each of the executions of the
                pipeline. It takes precedence over the parallelism configuration
                of the parent pipeline.

        Returns:
            A response dict from the service.
        """
        tags = _append_project_tags(tags)
        kwargs = self._create_args(role_arn, description, parallelism_config)
        update_args(
            kwargs,
            Tags=tags,
        )
        return self.sagemaker_session.sagemaker_client.create_pipeline(**kwargs)

    def _create_args(
        self, role_arn: str, description: str, parallelism_config: ParallelismConfiguration
    ):
        """Constructs the keyword argument dict for a create_pipeline call.

        Args:
            role_arn (str): The role arn that is assumed by pipelines to create
                step artifacts.
            description (str): A description of the pipeline.
            parallelism_config (Optional[ParallelismConfiguration]): Parallelism
                configuration that is applied to each of the executions of the
                pipeline. It takes precedence over the parallelism configuration
                of the parent pipeline.

        Returns:
            A keyword argument dict for calling create_pipeline.
        """
        pipeline_definition = self.definition()
        kwargs = dict(
            PipelineName=self.name,
            RoleArn=role_arn,
        )

        # If the pipeline definition is large, upload it to S3 and provide
        # PipelineDefinitionS3Location in the request instead.
        if len(pipeline_definition.encode("utf-8")) < 1024 * 100:
            kwargs["PipelineDefinition"] = pipeline_definition
        else:
            desired_s3_uri = s3.s3_path_join(
                "s3://", self.sagemaker_session.default_bucket(), self.name
            )
            s3.S3Uploader.upload_string_as_file_body(
                body=pipeline_definition,
                desired_s3_uri=desired_s3_uri,
                sagemaker_session=self.sagemaker_session,
            )
            kwargs["PipelineDefinitionS3Location"] = {
                "Bucket": self.sagemaker_session.default_bucket(),
                "ObjectKey": self.name,
            }

        update_args(
            kwargs,
            PipelineDescription=description,
            ParallelismConfiguration=parallelism_config,
        )
        return kwargs

    def describe(self) -> Dict[str, Any]:
        """Describes a Pipeline in the Workflow service.

        Returns:
            Response dict from the service. See `boto3 client documentation
            <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/\
sagemaker.html#SageMaker.Client.describe_pipeline>`_
        """
        return self.sagemaker_session.sagemaker_client.describe_pipeline(
            PipelineName=self.name
        )

    def update(
        self,
        role_arn: str,
        description: str = None,
        parallelism_config: ParallelismConfiguration = None,
    ) -> Dict[str, Any]:
        """Updates a Pipeline in the Workflow service.

        Args:
            role_arn (str): The role arn that is assumed by pipelines to create
                step artifacts.
            description (str): A description of the pipeline.
            parallelism_config (Optional[ParallelismConfiguration]): Parallelism
                configuration that is applied to each of the executions of the
                pipeline. It takes precedence over the parallelism configuration
                of the parent pipeline.

        Returns:
            A response dict from the service.
        """
        kwargs = self._create_args(role_arn, description, parallelism_config)
        return self.sagemaker_session.sagemaker_client.update_pipeline(**kwargs)

    def upsert(
        self,
        role_arn: str,
        description: str = None,
        tags: List[Dict[str, str]] = None,
        parallelism_config: ParallelismConfiguration = None,
    ) -> Dict[str, Any]:
        """Creates a pipeline or updates it, if it already exists.

        Args:
            role_arn (str): The role arn that is assumed by workflow to create
                step artifacts.
            description (str): A description of the pipeline.
            tags (List[Dict[str, str]]): A list of {"Key": "string", "Value": "string"}
                dicts as tags.
            parallelism_config (Optional[ParallelismConfiguration]): Parallelism
                configuration that is applied to each of the executions of the
                pipeline. It takes precedence over the parallelism configuration
                of the parent pipeline.

        Returns:
            A response dict from the service.
        """
        try:
            response = self.create(role_arn, description, tags, parallelism_config)
        except ClientError as e:
            error = e.response["Error"]
            if (
                error["Code"] == "ValidationException"
                and "Pipeline names must be unique within" in error["Message"]
            ):
                response = self.update(role_arn, description)
                if tags is not None:
                    old_tags = self.sagemaker_session.sagemaker_client.list_tags(
                        ResourceArn=response["PipelineArn"]
                    )["Tags"]

                    tag_keys = [tag["Key"] for tag in tags]
                    for old_tag in old_tags:
                        if old_tag["Key"] not in tag_keys:
                            tags.append(old_tag)

                    self.sagemaker_session.sagemaker_client.add_tags(
                        ResourceArn=response["PipelineArn"], Tags=tags
                    )
            else:
                raise
        return response

    def delete(self) -> Dict[str, Any]:
        """Deletes a Pipeline in the Workflow service.

        Returns:
            A response dict from the service.
        """
        return self.sagemaker_session.sagemaker_client.delete_pipeline(
            PipelineName=self.name
        )

    def start(
        self,
        parameters: Dict[str, Union[str, bool, int, float]] = None,
        execution_display_name: str = None,
        execution_description: str = None,
        parallelism_config: ParallelismConfiguration = None,
    ):
        """Starts a Pipeline execution in the Workflow service.

        Args:
            parameters (Dict[str, Union[str, bool, int, float]]): Values to override
                pipeline parameters.
            execution_display_name (str): The display name of the pipeline execution.
            execution_description (str): A description of the execution.
            parallelism_config (Optional[ParallelismConfiguration]): Parallelism
                configuration that is applied to each of the executions of the
                pipeline. It takes precedence over the parallelism configuration
                of the parent pipeline.

        Returns:
            A `_PipelineExecution` instance, if successful.
        """
        exists = True
        try:
            self.describe()
        except ClientError:
            exists = False

        if not exists:
            raise ValueError(
                "This pipeline is not associated with a Pipeline in SageMaker. "
                "Please invoke create() first before attempting to invoke start()."
            )

        kwargs = dict(PipelineName=self.name)
        update_args(
            kwargs,
            PipelineParameters=format_start_parameters(parameters),
            PipelineExecutionDescription=execution_description,
            PipelineExecutionDisplayName=execution_display_name,
            ParallelismConfiguration=parallelism_config,
        )
        response = self.sagemaker_session.sagemaker_client.start_pipeline_execution(**kwargs)
        return _PipelineExecution(
            arn=response["PipelineExecutionArn"],
            sagemaker_session=self.sagemaker_session,
        )

    def definition(self) -> str:
        """Converts a request structure to string representation for workflow service calls."""
        request_dict = self.to_request()
        request_dict["PipelineExperimentConfig"] = interpolate(
            request_dict["PipelineExperimentConfig"], {}, {}
        )
        callback_output_to_step_map = _map_callback_outputs(self.steps)
        lambda_output_to_step_name = _map_lambda_outputs(self.steps)
        request_dict["Steps"] = interpolate(
            request_dict["Steps"],
            callback_output_to_step_map=callback_output_to_step_map,
            lambda_output_to_step_map=lambda_output_to_step_name,
        )
        return json.dumps(request_dict)
def get_pipeline(
    region,
    sagemaker_project_arn=None,
    role=None,
    default_bucket=None,
    model_package_group_name="restatePackageGroup",  # Choose any name
    pipeline_name="restate-p-XXXXXXXXX",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
    base_job_prefix="restate",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working on RE data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.2xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval",  # Can be set to "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value="",  # Change this to point to the s3 location of your raw input data.
    )

    data_sources = []

    # Sagemaker session
    sess = sagemaker_session

    # You can configure this with your own bucket name, e.g.
    # bucket = "my-bucket"
    bucket = sess.default_bucket()

    data_sources.append(
        ProcessingInput(
            input_name="restate-california",
            dataset_definition=DatasetDefinition(
                local_path="/opt/ml/processing/restate-california",
                data_distribution_type="FullyReplicated",
                # You can override below to point to another database or use different queries
                athena_dataset_definition=AthenaDatasetDefinition(
                    catalog="AwsDataCatalog",
                    database="restate",
                    query_string="SELECT * FROM restate.california_10",
                    output_s3_uri=f"s3://{bucket}/athena/",
                    output_format="PARQUET",
                ),
            ),
        )
    )

    print(f"Data Wrangler export storage bucket: {bucket}")

    # unique flow export ID
    flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}"
    flow_export_name = f"flow-{flow_export_id}"

    # Output name is auto-generated from the select node's ID + output name from the flow file.
    output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default"

    s3_output_prefix = f"export-{flow_export_name}/output"
    s3_output_path = f"s3://{bucket}/{s3_output_prefix}"
    print(f"Flow S3 export result path: {s3_output_path}")

    processing_job_output = ProcessingOutput(
        output_name=output_name,
        source="/opt/ml/processing/output",
        destination=s3_output_path,
        s3_upload_mode="EndOfJob",
    )

    # name of the flow file which should exist in the current notebook working directory
    flow_file_name = "sagemaker-pipeline/restate-athena-california.flow"

    # Load .flow file from current notebook working directory
    with open(flow_file_name) as f:
        flow = json.load(f)

    # Upload flow to S3
    s3_client = boto3.client("s3")
    s3_client.upload_file(
        flow_file_name,
        bucket,
        f"data_wrangler_flows/{flow_export_name}.flow",
        ExtraArgs={"ServerSideEncryption": "aws:kms"},
    )
    flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow"
    print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}")

    ## Input - Flow: restate-athena-russia.flow
    flow_input = ProcessingInput(
        source=flow_s3_uri,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    # IAM role for executing the processing job.
    iam_role = role

    # Unique processing job name. Give a unique name every time you re-execute processing jobs
    processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}"

    # Data Wrangler Container URL.
    container_uri = sagemaker.image_uris.retrieve(
        framework="data-wrangler",
        region=region,
    )

    # Processing Job Instance count and instance type.
    instance_count = 2
    instance_type = "ml.m5.4xlarge"

    # Size in GB of the EBS volume to use for storing data during processing
    volume_size_in_gb = 30

    # Content type for each output. Data Wrangler supports CSV as default and Parquet.
    output_content_type = "CSV"

    # Network Isolation mode; default is off
    enable_network_isolation = False

    # List of tags to be passed to the processing job
    user_tags = []

    # Output configuration used as processing job container arguments
    output_config = {output_name: {"content_type": output_content_type}}

    # KMS key for per object encryption; default is None
    kms_key = None

    processor = Processor(
        role=iam_role,
        image_uri=container_uri,
        instance_count=instance_count,
        instance_type=instance_type,
        volume_size_in_gb=volume_size_in_gb,
        network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),
        sagemaker_session=sess,
        output_kms_key=kms_key,
        tags=user_tags,
    )

    data_wrangler_step = ProcessingStep(
        name="DataWranglerProcess",
        processor=processor,
        inputs=[flow_input] + data_sources,
        outputs=[processing_job_output],
        job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
    )

    # Processing step for feature engineering
    # this processor does not have awswrangler installed
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_process = ProcessingStep(
        name="Preprocess",  # choose any name
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(
                source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                    output_name
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/data/raw-data-dir",
            )
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=[
            "--input-data",
            data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                output_name
            ].S3Output.S3Uri,
        ],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    cache_config = CacheConfig(enable_caching=True, expire_after="30d")

    xgb_image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=xgb_image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/restate-xgb-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        # objective="binary:logistic",
        # objective="reg:linear",
        num_round=50,
        # max_depth=5,
        # eta=0.2,
        # gamma=4,
        # min_child_weight=6,
        # subsample=0.7,
        # silent=0,
    )
    xgb_train.set_hyperparameters(grow_policy="lossguide")

    xgb_objective_metric_name = "validation:mse"
    xgb_hyperparameter_ranges = {
        "max_depth": IntegerParameter(2, 10, scaling_type="Linear"),
    }
    xgb_tuner_log = HyperparameterTuner(
        xgb_train,
        xgb_objective_metric_name,
        xgb_hyperparameter_ranges,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    xgb_step_tuning = TuningStep(
        name="XGBHPTune",
        tuner=xgb_tuner_log,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest'
    dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version(
        ImageName="restate-dtree"
    )["ContainerImage"]

    dtree_train = Estimator(
        image_uri=dtree_image_uri,
        role=role,
        instance_count=1,
        instance_type=training_instance_type,
        base_job_name=f"{base_job_prefix}/restate-dtree-train",
        output_path=model_path,
        sagemaker_session=sagemaker_session,
    )

    dtree_objective_metric_name = "validation:mse"
    dtree_metric_definitions = [{"Name": "validation:mse", "Regex": r"mse:(\S+)"}]

    dtree_hyperparameter_ranges = {
        "max_depth": IntegerParameter(10, 50, scaling_type="Linear"),
        "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"),
    }

    dtree_tuner_log = HyperparameterTuner(
        dtree_train,
        dtree_objective_metric_name,
        dtree_hyperparameter_ranges,
        dtree_metric_definitions,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    dtree_step_tuning = TuningStep(
        name="DTreeHPTune",
        tuner=dtree_tuner_log,
        inputs={
            "training": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    dtree_script_eval = ScriptProcessor(
        image_uri=dtree_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-dtree-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    dtree_evaluation_report = PropertyFile(
        name="EvaluationReportDTree",
        output_name="dtree_evaluation",
        path="dtree_evaluation.json",
    )

    dtree_step_eval = ProcessingStep(
        name="DTreeEval",
        processor=dtree_script_eval,
        inputs=[
            ProcessingInput(
                # source=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts,
                source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="dtree_evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "dtree_evaluate.py"),
        property_files=[dtree_evaluation_report],
    )

    xgb_script_eval = ScriptProcessor(
        image_uri=xgb_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-xgb-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    xgb_evaluation_report = PropertyFile(
        name="EvaluationReportXGBoost",
        output_name="xgb_evaluation",
        path="xgb_evaluation.json",
    )

    xgb_step_eval = ProcessingStep(
        name="XGBEval",
        processor=xgb_script_eval,
        inputs=[
            ProcessingInput(
                source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "xgb_evaluate.py"),
        property_files=[xgb_evaluation_report],
    )

    xgb_model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/xgb_evaluation.json".format(
                xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    dtree_model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/dtree_evaluation.json".format(
                dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    xgb_eval_metrics = JsonGet(
        step=xgb_step_eval,
        property_file=xgb_evaluation_report,
        json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
    )

    dtree_eval_metrics = JsonGet(
        step=dtree_step_eval,
        property_file=dtree_evaluation_report,
        json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
    )

    # Register model step that will be conditionally executed
    dtree_step_register = RegisterModel(
        name="DTreeReg",
        estimator=dtree_train,
        model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=dtree_model_metrics,
    )

    # Register model step that will be conditionally executed
    xgb_step_register = RegisterModel(
        name="XGBReg",
        estimator=xgb_train,
        model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=xgb_model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_gte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=dtree_step_eval,
            property_file=dtree_evaluation_report,
            json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=JsonGet(
            step=xgb_step_eval,
            property_file=xgb_evaluation_report,
            json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),  # You can change the threshold here
    )
    step_cond = ConditionStep(
        name="AccuracyCond",
        conditions=[cond_gte],
        if_steps=[dtree_step_register],
        else_steps=[xgb_step_register],
    )

    create_date = time.strftime("%Y-%m-%d-%H-%M-%S")

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        pipeline_experiment_config=PipelineExperimentConfig(
            pipeline_name + "-" + create_date, "restate-{}".format(create_date)
        ),
        steps=[
            data_wrangler_step,
            step_process,
            dtree_step_tuning,
            xgb_step_tuning,
            dtree_step_eval,
            xgb_step_eval,
            step_cond,
        ],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
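# A minimal sketch of how the definition above is typically driven from a deployment
# script or notebook; the region, role ARN, and pipeline name are placeholders.
if __name__ == "__main__":
    pipeline = get_pipeline(
        region="us-east-1",
        role=None,  # falls back to the execution role resolved inside get_pipeline
        default_bucket=None,
        pipeline_name="restate-pipeline-dev",
    )
    pipeline.upsert(role_arn="arn:aws:iam::123456789012:role/SageMakerPipelineExecutionRole")
    execution = pipeline.start()
    print(execution.describe())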
def test_pipeline_execution_with_custom_experiment_config(
    sagemaker_session,
    smclient,
    role,
    sklearn_latest_version,
    cpu_instance_type,
    pipeline_name,
    athena_dataset_definition,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    inputs = [
        ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role=role,
        instance_type=cpu_instance_type,
        instance_count=instance_count,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    step_sklearn = ProcessingStep(
        name="sklearn-process",
        processor=sklearn_processor,
        inputs=inputs,
        code=script_path,
    )

    experiment_name = f"my-experiment-{int(time.time() * 10**7)}"
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        pipeline_experiment_config=PipelineExperimentConfig(
            experiment_name=experiment_name,
            trial_name=Join(on="-", values=["my-trial", ExecutionVariables.PIPELINE_EXECUTION_ID]),
        ),
        steps=[step_sklearn],
        sagemaker_session=sagemaker_session,
    )

    try:
        pipeline.create(role)
        execution = pipeline.start(parameters={})

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass

        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sklearn-process"

        execution_id = execution.arn.split("/")[-1]

        # trial components
        trial_components = smclient.list_trial_components(TrialName=f"my-trial-{execution_id}")
        assert len(trial_components["TrialComponentSummaries"]) == 1

        # trial details
        trial = smclient.describe_trial(TrialName=f"my-trial-{execution_id}")
        assert experiment_name == trial["ExperimentName"]
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
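# Runs like the test above leave the experiment and trial behind, since the service
# creates them at execution time and pipeline.delete() does not remove them. A cleanup
# sketch using the boto3 SageMaker client; the helper name is an assumption and the
# argument values mirror the names used in the test.
def cleanup_experiment_resources(smclient, experiment_name, trial_name):
    """Delete the trial components, trial, and experiment created by a pipeline execution."""
    components = smclient.list_trial_components(TrialName=trial_name)["TrialComponentSummaries"]
    for component in components:
        smclient.disassociate_trial_component(
            TrialComponentName=component["TrialComponentName"], TrialName=trial_name
        )
        smclient.delete_trial_component(TrialComponentName=component["TrialComponentName"])
    smclient.delete_trial(TrialName=trial_name)
    smclient.delete_experiment(ExperimentName=experiment_name)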