def _generate_baseline_job_inputs(self):
    """Build the ProcessingInput objects for the baseline job.

    Builds three inputs: the baseline dataset, the post-analytics
    processor script, and (for data quality checks only) the record
    preprocessor script.

    Returns:
        dict: with three ProcessingInput objects as baseline job inputs
    """

    def container_path(input_name):
        # All baseline-job inputs land under the container's input directory.
        return str(
            pathlib.PurePosixPath(
                _CONTAINER_BASE_PATH, _CONTAINER_INPUT_PATH, input_name
            )
        )

    config = self.quality_check_config
    dataset_destination = container_path(_BASELINE_DATASET_INPUT_NAME)

    if is_pipeline_variable(config.baseline_dataset):
        # A pipeline variable has no concrete value at definition time, so it
        # cannot be uploaded here; pass it through as a plain ProcessingInput.
        dataset_input = ProcessingInput(
            source=config.baseline_dataset,
            destination=dataset_destination,
            input_name=_BASELINE_DATASET_INPUT_NAME,
        )
    else:
        dataset_input = self._model_monitor._upload_and_convert_to_processing_input(
            source=config.baseline_dataset,
            destination=dataset_destination,
            name=_BASELINE_DATASET_INPUT_NAME,
        )

    post_processor_input = self._model_monitor._upload_and_convert_to_processing_input(
        source=config.post_analytics_processor_script,
        destination=container_path(_POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME),
        name=_POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
    )

    # Only data-quality checks accept a record preprocessor script.
    record_preprocessor_input = None
    if isinstance(config, DataQualityCheckConfig):
        record_preprocessor_input = (
            self._model_monitor._upload_and_convert_to_processing_input(
                source=config.record_preprocessor_script,
                destination=container_path(_RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME),
                name=_RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
            )
        )

    return dict(
        baseline_dataset_input=dataset_input,
        post_processor_script_input=post_processor_input,
        record_preprocessor_script_input=record_preprocessor_input,
    )
def __init__(self, values):  # pylint: disable=super-init-not-called
    """Initialize a ``CategoricalParameter``.

    Args:
        values (list or object): The possible values for the hyperparameter.
            This input will be converted into a list of strings.
    """
    if not isinstance(values, list):
        values = [values]
    converted = []
    for value in values:
        # Pipeline variables resolve at execution time, so they are rendered
        # via to_string() rather than eagerly converted with str().
        if is_pipeline_variable(value):
            converted.append(value.to_string())
        else:
            converted.append(str(value))
    self.values = converted
def prepare_container_def(self, instance_type=None, accelerator_type=None, serverless_inference_config=None):
    """Prepare the container definition.

    Args:
        instance_type: Instance type of the container.
        accelerator_type: Accelerator type, if applicable.
        serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
            Specifies configuration related to serverless endpoint. Instance type is
            not provided in serverless inference. So this is used to find image URIs.

    Returns:
        A container definition for deploying a ``Model`` to an ``Endpoint``.
    """
    # Without an explicit image URI, one of instance_type / serverless config
    # is required to select the right image.
    if not self.image_uri and instance_type is None and serverless_inference_config is None:
        raise ValueError(
            "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
        )

    resolved_image = self._get_image_uri(
        instance_type,
        accelerator_type,
        serverless_inference_config=serverless_inference_config,
    )
    container_env = self._get_container_env()

    # If self.model_data is a pipeline variable, the model artifact is not
    # available yet, so repacking is deferred to pipeline execution time.
    if self.entry_point and not is_pipeline_variable(self.model_data):
        code_prefix = sagemaker.fw_utils.model_code_key_prefix(
            self.key_prefix, self.name, resolved_image
        )
        target_bucket = self.bucket or self.sagemaker_session.default_bucket()
        repacked_data = s3.s3_path_join("s3://", target_bucket, code_prefix, "model.tar.gz")
        sagemaker.utils.repack_model(
            self.entry_point,
            self.source_dir,
            self.dependencies,
            self.model_data,
            repacked_data,
            self.sagemaker_session,
            kms_key=self.model_kms_key,
        )
        model_location = repacked_data
    else:
        model_location = self.model_data

    return sagemaker.container_def(resolved_image, model_location, container_env)
def as_tuning_range(self, name):
    """Represent the parameter range as a dictionary.

    It is suitable for a request to create an Amazon SageMaker
    hyperparameter tuning job.

    Args:
        name (str): The name of the hyperparameter.

    Returns:
        dict[str, str]: A dictionary that contains the name and values of
            the hyperparameter.
    """

    def stringify(bound):
        # Pipeline variables are resolved at execution time, so they are
        # rendered via to_string() instead of an eager str() conversion.
        if is_pipeline_variable(bound):
            return bound.to_string()
        return str(bound)

    return {
        "Name": name,
        "MinValue": stringify(self.min_value),
        "MaxValue": stringify(self.max_value),
        "ScalingType": self.scaling_type,
    }
def primitive_or_expr(
    value: Union[ExecutionVariable, Expression, PrimitiveType, Parameter, Properties]
) -> Union[Dict[str, str], PrimitiveType]:
    """Provide the expression of the value or return value if it is a primitive.

    Args:
        value (Union[ConditionValueType, PrimitiveType]): The value to evaluate.

    Returns:
        Either the expression of the value or the primitive value.
    """
    # Pipeline variables carry their request-syntax representation in ``expr``;
    # primitives are returned untouched.
    return value.expr if is_pipeline_variable(value) else value
def model_code_key_prefix(code_location_key_prefix, model_name, image):
    """Returns the s3 key prefix for uploading code during model deployment.

    The location returned is a potential concatenation of 2 parts
        1. code_location_key_prefix if it exists
        2. model_name or a name derived from the image

    Args:
        code_location_key_prefix (str): the s3 key prefix from code_location
        model_name (str): the name of the model
        image (str): the image from which a default name can be extracted

    Returns:
        str: the key prefix to be used in uploading code
    """
    if is_pipeline_variable(image):
        # The image URI is unknown until pipeline execution, so fall back to
        # a timestamp-based default name.
        derived_name = f"/model_code/{int(time.time())}"
    else:
        derived_name = sagemaker.utils.name_from_image(image)

    parts = [code_location_key_prefix, model_name or derived_name]
    return "/".join(part for part in parts if part)
def prepare_container_def(self, instance_type=None, accelerator_type=None, serverless_inference_config=None):
    """Prepare the container definition.

    Args:
        instance_type: Instance type of the container.
        accelerator_type: Accelerator type, if applicable.
        serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
            Specifies configuration related to serverless endpoint. Instance type is
            not provided in serverless inference. So this is used to find image URIs.

    Returns:
        A container definition for deploying a ``Model`` to an ``Endpoint``.
    """
    if not self.image_uri:
        # Without an explicit image URI, one of instance_type or the
        # serverless config is needed to choose the right image.
        if instance_type is None and serverless_inference_config is None:
            raise ValueError(
                "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
            )
    image_uri = self._get_image_uri(
        instance_type, accelerator_type, serverless_inference_config=serverless_inference_config
    )
    env = self._get_container_env()
    if self.entry_point and not is_pipeline_variable(self.model_data):
        # Concrete model data plus a custom entry point: repack the artifact
        # with the inference code and upload it to the session bucket now.
        key_prefix = sagemaker.fw_utils.model_code_key_prefix(
            self.key_prefix, self.name, image_uri
        )
        bucket = self.bucket or self.sagemaker_session.default_bucket()
        model_data = s3.s3_path_join("s3://", bucket, key_prefix, "model.tar.gz")
        sagemaker.utils.repack_model(
            self.entry_point,
            self.source_dir,
            self.dependencies,
            self.model_data,
            model_data,
            self.sagemaker_session,
            kms_key=self.model_kms_key,
        )
    elif self.entry_point and is_pipeline_variable(self.model_data):
        # model is not yet there, defer repacking to later during pipeline execution
        if isinstance(self.sagemaker_session, PipelineSession):
            # Record this model in the pipeline context so the repack happens
            # at pipeline runtime instead.
            self.sagemaker_session.context.need_runtime_repack.add(id(self))
        else:
            # TODO: link the doc in the warning once ready
            logging.warning(
                "The model_data is a Pipeline variable of type %s, "
                "which should be used under `PipelineSession` and "
                "leverage `ModelStep` to create or register model. "
                "Otherwise some functionalities e.g. "
                "runtime repack may be missing",
                type(self.model_data),
            )
        model_data = self.model_data
    else:
        # No entry point: deploy the model data as-is.
        model_data = self.model_data
    return sagemaker.container_def(image_uri, model_data, env)
def transform(
    self,
    data,
    data_type="S3Prefix",
    content_type=None,
    compression_type=None,
    split_type=None,
    job_name=None,
    input_filter=None,
    output_filter=None,
    join_source=None,
    experiment_config=None,
    model_client_config=None,
    wait=True,
    logs=True,
):
    """Start a new transform job.

    Args:
        data (str): Input data location in S3.
        data_type (str): What the S3 location defines (default: 'S3Prefix').
            Valid values:

            * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this
              prefix will be used as inputs for the transform job.

            * 'ManifestFile' - the S3 URI points to a single manifest file listing each
              S3 object to use as an input for the transform job.
        content_type (str): MIME type of the input data (default: None).
        compression_type (str): Compression type of the input data, if
            compressed (default: None). Valid values: 'Gzip', None.
        split_type (str): The record delimiter for the input object (default: 'None').
            Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
        job_name (str): job name (default: None). If not specified, one will be generated.
        input_filter (str): A JSONPath to select a portion of the input to pass to the
            algorithm container for inference. If you omit the field, it gets the value
            '$', representing the entire input. For CSV data, each row is taken as a JSON
            array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV
            data should follow the `RFC format <https://tools.ietf.org/html/rfc4180>`_.
            See `Supported JSONPath Operators
            <https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
            for a table of supported JSONPath operators. For more information, see the
            SageMaker API documentation for `CreateTransformJob
            <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
            Some examples: "$[1:]", "$.features" (default: None).
        output_filter (str): A JSONPath to select a portion of the joined/original output
            to return as the output. For more information, see the SageMaker API
            documentation for `CreateTransformJob
            <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
            Some examples: "$[1:]", "$.prediction" (default: None).
        join_source (str): The source of data to be joined to the transform output.
            It can be set to 'Input' meaning the entire input record will be joined to
            the inference result. You can use OutputFilter to select the useful portion
            before uploading to S3. (default: None). Valid values: Input, None.
        experiment_config (dict[str, str]): Experiment management configuration.
            Optionally, the dict can contain three keys:
            'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
            The behavior of setting these keys is as follows:

            * If `ExperimentName` is supplied but `TrialName` is not a Trial will be
              automatically created and the job's Trial Component associated with the
              Trial.

            * If `TrialName` is supplied and the Trial already exists the job's Trial
              Component will be associated with the Trial.

            * If both `ExperimentName` and `TrialName` are not supplied the trial
              component will be unassociated.

            * `TrialComponentDisplayName` is used for display in Studio.
        model_client_config (dict[str, str]): Model configuration.
            Dictionary contains two optional keys,
            'InvocationsTimeoutInSeconds', and 'InvocationsMaxRetries'.
            (default: ``None``).
        wait (bool): Whether the call should wait until the job completes
            (default: ``True``).
        logs (bool): Whether to show the logs produced by the job.
            Only meaningful when wait is ``True`` (default: ``True``).

    Raises:
        ValueError: If ``data`` is a concrete string that is not an S3 URI
            (and the session is not in local mode).
    """
    local_mode = self.sagemaker_session.local_mode
    # Fix: a pipeline variable (Parameter/Properties/Expression) has no
    # concrete value at definition time and does not implement
    # ``startswith``; validating it eagerly would raise AttributeError.
    # Skip the check for pipeline variables, consistent with the
    # is_pipeline_variable guards used elsewhere (e.g. for model_name below).
    if (
        not local_mode
        and not is_pipeline_variable(data)
        and not data.startswith("s3://")
    ):
        raise ValueError("Invalid S3 URI: {}".format(data))

    if job_name is not None:
        self._current_job_name = job_name
    else:
        base_name = self.base_transform_job_name
        if base_name is None:
            # model_name may itself be a pipeline variable, in which case no
            # concrete base name can be derived from it.
            base_name = (
                "transform-job"
                if is_pipeline_variable(self.model_name)
                else self._retrieve_base_name()
            )
        self._current_job_name = name_from_base(base_name)

    # Re-derive the default output path per job unless the user pinned one.
    if self.output_path is None or self._reset_output_path is True:
        self.output_path = "s3://{}/{}".format(
            self.sagemaker_session.default_bucket(), self._current_job_name
        )
        self._reset_output_path = True

    self.latest_transform_job = _TransformJob.start_new(
        self,
        data,
        data_type,
        content_type,
        compression_type,
        split_type,
        input_filter,
        output_filter,
        join_source,
        experiment_config,
        model_client_config,
    )

    if wait:
        self.latest_transform_job.wait(logs=logs)
def __init__(
    self,
    name: str,
    clarify_check_config: ClarifyCheckConfig,
    check_job_config: CheckJobConfig,
    skip_check: Union[bool, PipelineNonPrimitiveInputTypes] = False,
    register_new_baseline: Union[bool, PipelineNonPrimitiveInputTypes] = False,
    model_package_group_name: Union[str, PipelineNonPrimitiveInputTypes] = None,
    supplied_baseline_constraints: Union[str, PipelineNonPrimitiveInputTypes] = None,
    display_name: str = None,
    description: str = None,
    cache_config: CacheConfig = None,
    depends_on: Union[List[str], List[Step]] = None,
):
    """Constructs a ClarifyCheckStep.

    Args:
        name (str): The name of the ClarifyCheckStep step.
        clarify_check_config (ClarifyCheckConfig): A ClarifyCheckConfig instance.
        check_job_config (CheckJobConfig): A CheckJobConfig instance.
        skip_check (bool or PipelineNonPrimitiveInputTypes): Whether the check
            should be skipped (default: False).
        register_new_baseline (bool or PipelineNonPrimitiveInputTypes): Whether
            the new baseline should be registered (default: False).
        model_package_group_name (str or PipelineNonPrimitiveInputTypes): The
            name of a registered model package group, among which the baseline
            will be fetched from the latest approved model (default: None).
        supplied_baseline_constraints (str or PipelineNonPrimitiveInputTypes):
            The S3 path to the supplied constraints object representing the
            constraints JSON file which will be used for drift to check
            (default: None).
        display_name (str): The display name of the ClarifyCheckStep step
            (default: None).
        description (str): The description of the ClarifyCheckStep step
            (default: None).
        cache_config (CacheConfig): A `sagemaker.workflow.steps.CacheConfig`
            instance (default: None).
        depends_on (List[str] or List[Step]): A list of step names or step
            instances this `sagemaker.workflow.steps.ClarifyCheckStep` depends
            on (default: None).
    """
    # Only the three concrete Clarify check configs are supported.
    if (
        not isinstance(clarify_check_config, DataBiasCheckConfig)
        and not isinstance(clarify_check_config, ModelBiasCheckConfig)
        and not isinstance(clarify_check_config, ModelExplainabilityCheckConfig)
    ):
        raise RuntimeError(
            "The clarify_check_config can only be object of "
            + "DataBiasCheckConfig, ModelBiasCheckConfig or ModelExplainabilityCheckConfig"
        )

    # These two paths must be concrete strings at definition time; pipeline
    # variables are rejected because the analysis config is uploaded below,
    # before the pipeline executes.
    if is_pipeline_variable(clarify_check_config.data_config.s3_analysis_config_output_path):
        raise RuntimeError(
            "s3_analysis_config_output_path cannot be of type "
            + "ExecutionVariable/Expression/Parameter/Properties"
        )

    # When no analysis-config output path is given, s3_output_path is used in
    # its place, so it must be concrete as well.
    if (
        not clarify_check_config.data_config.s3_analysis_config_output_path
        and is_pipeline_variable(clarify_check_config.data_config.s3_output_path)
    ):
        raise RuntimeError(
            "`s3_output_path` cannot be of type ExecutionVariable/Expression/Parameter"
            + "/Properties if `s3_analysis_config_output_path` is none or empty "
        )

    super(ClarifyCheckStep, self).__init__(
        name, display_name, description, StepTypeEnum.CLARIFY_CHECK, depends_on
    )
    self.skip_check = skip_check
    self.register_new_baseline = register_new_baseline
    self.clarify_check_config = clarify_check_config
    self.check_job_config = check_job_config
    self.model_package_group_name = model_package_group_name
    self.supplied_baseline_constraints = supplied_baseline_constraints
    self.cache_config = cache_config

    # Data-bias and model-bias configs share the ModelBiasMonitor; only
    # explainability uses its dedicated monitor.
    if isinstance(self.clarify_check_config, ModelExplainabilityCheckConfig):
        self._model_monitor = self.check_job_config._generate_model_monitor(
            "ModelExplainabilityMonitor"
        )
    else:
        self._model_monitor = self.check_job_config._generate_model_monitor(
            "ModelBiasMonitor"
        )

    # Upload the monitoring analysis config now and derive the processing-job
    # parameters from it (definition-time side effect).
    self.clarify_check_config.monitoring_analysis_config_uri = (
        self._upload_monitoring_analysis_config()
    )
    self._baselining_processor = self._model_monitor._create_baselining_processor()
    self._processing_params = self._generate_processing_job_parameters(
        self._generate_processing_job_analysis_config(), self._baselining_processor
    )

    # Expose step properties (referenced as Steps.<name>.<prop>) so other
    # steps can depend on this step's outputs before execution.
    root_path = f"Steps.{name}"
    root_prop = Properties(path=root_path)
    root_prop.__dict__["CalculatedBaselineConstraints"] = Properties(
        f"{root_path}.CalculatedBaselineConstraints"
    )
    root_prop.__dict__["BaselineUsedForDriftCheckConstraints"] = Properties(
        f"{root_path}.BaselineUsedForDriftCheckConstraints"
    )
    self._properties = root_prop