Example #1
    def _generate_baseline_job_inputs(self):
        """Generates a dict with ProcessingInput objects

        Generates a dict with three ProcessingInput objects: baseline_dataset_input,
            post_processor_script_input and record_preprocessor_script_input

        Returns:
            dict: with three ProcessingInput objects as baseline job inputs
        """
        baseline_dataset = self.quality_check_config.baseline_dataset
        baseline_dataset_des = str(
            pathlib.PurePosixPath(_CONTAINER_BASE_PATH, _CONTAINER_INPUT_PATH,
                                  _BASELINE_DATASET_INPUT_NAME))
        if is_pipeline_variable(baseline_dataset):
            baseline_dataset_input = ProcessingInput(
                source=self.quality_check_config.baseline_dataset,
                destination=baseline_dataset_des,
                input_name=_BASELINE_DATASET_INPUT_NAME,
            )
        else:
            baseline_dataset_input = self._model_monitor._upload_and_convert_to_processing_input(
                source=self.quality_check_config.baseline_dataset,
                destination=baseline_dataset_des,
                name=_BASELINE_DATASET_INPUT_NAME,
            )

        post_processor_script_input = self._model_monitor._upload_and_convert_to_processing_input(
            source=self.quality_check_config.post_analytics_processor_script,
            destination=str(
                pathlib.PurePosixPath(
                    _CONTAINER_BASE_PATH,
                    _CONTAINER_INPUT_PATH,
                    _POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
                )),
            name=_POST_ANALYTICS_PROCESSOR_SCRIPT_INPUT_NAME,
        )

        record_preprocessor_script_input = None
        if isinstance(self.quality_check_config, DataQualityCheckConfig):
            record_preprocessor_script_input = (
                self._model_monitor._upload_and_convert_to_processing_input(
                    source=self.quality_check_config.record_preprocessor_script,
                    destination=str(
                        pathlib.PurePosixPath(
                            _CONTAINER_BASE_PATH,
                            _CONTAINER_INPUT_PATH,
                            _RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
                        )),
                    name=_RECORD_PREPROCESSOR_SCRIPT_INPUT_NAME,
                ))
        return dict(
            baseline_dataset_input=baseline_dataset_input,
            post_processor_script_input=post_processor_script_input,
            record_preprocessor_script_input=record_preprocessor_script_input,
        )
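
The branch above matters because a pipeline variable carries no concrete value at
definition time, so there is nothing to upload yet. A minimal sketch of the
distinction, assuming the `is_pipeline_variable` import path used by recent
SageMaker SDK versions:

from sagemaker.workflow import is_pipeline_variable
from sagemaker.workflow.parameters import ParameterString

# A concrete path or URI is known up front, so the non-pipeline branch can
# upload it eagerly via _upload_and_convert_to_processing_input.
print(is_pipeline_variable("s3://my-bucket/baseline.csv"))  # False

# A pipeline parameter resolves only at execution time, so it is wrapped
# directly in a ProcessingInput instead of being uploaded.
baseline = ParameterString(name="BaselineUri")
print(is_pipeline_variable(baseline))  # True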
Example #2
    def __init__(self, values):  # pylint: disable=super-init-not-called
        """Initialize a ``CategoricalParameter``.

        Args:
            values (list or object): The possible values for the hyperparameter.
                This input will be converted into a list of strings.
        """
        values = values if isinstance(values, list) else [values]
        self.values = [
            str(v) if not is_pipeline_variable(v) else v.to_string()
            for v in values
        ]
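
A short usage sketch of the conversion above (hypothetical values; the tuner
import path is the commonly used one):

from sagemaker.tuner import CategoricalParameter
from sagemaker.workflow.parameters import ParameterString

# Plain values are stringified.
cat = CategoricalParameter(["relu", "tanh", 3])
print(cat.values)  # ['relu', 'tanh', '3']

# A single non-list value is wrapped in a list first; a pipeline variable is
# kept as an expression via to_string() and resolved at execution time.
cat2 = CategoricalParameter(ParameterString(name="Optimizer"))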
Example #3
    def prepare_container_def(self,
                              instance_type=None,
                              accelerator_type=None,
                              serverless_inference_config=None):
        """Prepare the container definition.

        Args:
            instance_type: Instance type of the container.
            accelerator_type: Accelerator type, if applicable.
            serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
                Specifies configuration related to serverless endpoint. Instance type is
                not provided in serverless inference. So this is used to find image URIs.

        Returns:
            A container definition for deploying a ``Model`` to an ``Endpoint``.
        """
        if not self.image_uri:
            if instance_type is None and serverless_inference_config is None:
                raise ValueError(
                    "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
                )

        image_uri = self._get_image_uri(
            instance_type,
            accelerator_type,
            serverless_inference_config=serverless_inference_config)
        env = self._get_container_env()

        # If self.model_data is a pipeline variable, the model artifact is not
        # available yet, so defer repacking until pipeline execution.
        if self.entry_point and not is_pipeline_variable(self.model_data):
            key_prefix = sagemaker.fw_utils.model_code_key_prefix(
                self.key_prefix, self.name, image_uri)

            bucket = self.bucket or self.sagemaker_session.default_bucket()
            model_data = s3.s3_path_join("s3://", bucket, key_prefix,
                                         "model.tar.gz")

            sagemaker.utils.repack_model(
                self.entry_point,
                self.source_dir,
                self.dependencies,
                self.model_data,
                model_data,
                self.sagemaker_session,
                kms_key=self.model_kms_key,
            )
        else:
            model_data = self.model_data

        return sagemaker.container_def(image_uri, model_data, env)
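
When repacking runs eagerly, the destination URI is assembled with
`s3.s3_path_join`; a small sketch of the resulting layout (bucket and prefix
are illustrative):

from sagemaker import s3

# The repacked artifact (inference entry point plus the original model)
# lands under the code key prefix computed by model_code_key_prefix.
uri = s3.s3_path_join("s3://", "my-bucket", "prefix/my-model", "model.tar.gz")
print(uri)  # s3://my-bucket/prefix/my-model/model.tar.gz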
Example #4
    def as_tuning_range(self, name):
        """Represent the parameter range as a dictionary.

        It is suitable for a request to create an Amazon SageMaker hyperparameter tuning job.

        Args:
            name (str): The name of the hyperparameter.

        Returns:
            dict[str, str]: A dictionary that contains the name and values of
            the hyperparameter.
        """
        return {
            "Name": name,
            "MinValue": (
                str(self.min_value)
                if not is_pipeline_variable(self.min_value)
                else self.min_value.to_string()
            ),
            "MaxValue": (
                str(self.max_value)
                if not is_pipeline_variable(self.max_value)
                else self.max_value.to_string()
            ),
            "ScalingType": self.scaling_type,
        }
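
For a concrete (non-pipeline-variable) range, the returned request fragment
contains plain strings; for example, with a ContinuousParameter:

from sagemaker.tuner import ContinuousParameter

lr = ContinuousParameter(0.001, 0.1)
print(lr.as_tuning_range("learning-rate"))
# {'Name': 'learning-rate', 'MinValue': '0.001', 'MaxValue': '0.1',
#  'ScalingType': 'Auto'}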
Example #5
def primitive_or_expr(
    value: Union[ExecutionVariable, Expression, PrimitiveType, Parameter, Properties]
) -> Union[Dict[str, str], PrimitiveType]:
    """Provide the expression of the value or return value if it is a primitive.

    Args:
        value (Union[ConditionValueType, PrimitiveType]): The value to evaluate.

    Returns:
        Either the expression of the value or the primitive value.
    """
    if is_pipeline_variable(value):
        return value.expr
    return value
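
A quick sketch of both branches, using the function defined above (the
parameter name is illustrative):

from sagemaker.workflow.parameters import ParameterInteger

# Primitives pass through untouched.
primitive_or_expr(42)  # -> 42

# Pipeline variables are replaced by their JSON expression.
epochs = ParameterInteger(name="Epochs", default_value=10)
primitive_or_expr(epochs)  # -> {'Get': 'Parameters.Epochs'}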
Example #6
def model_code_key_prefix(code_location_key_prefix, model_name, image):
    """Returns the s3 key prefix for uploading code during model deployment.

    The returned location is the concatenation of up to two parts:
        1. code_location_key_prefix, if it exists
        2. model_name, or a name derived from the image

    Args:
        code_location_key_prefix (str): the s3 key prefix from code_location
        model_name (str): the name of the model
        image (str): the image from which a default name can be extracted

    Returns:
        str: the key prefix to be used in uploading code
    """
    name_from_image = f"/model_code/{int(time.time())}"
    if not is_pipeline_variable(image):
        name_from_image = sagemaker.utils.name_from_image(image)
    return "/".join(
        filter(None,
               [code_location_key_prefix, model_name or name_from_image]))
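
A worked sketch of the two paths through the function (arguments are
illustrative):

# With a prefix and a model name, the image-derived fallback is ignored.
model_code_key_prefix("code-location", "my-model", "my-image:latest")
# -> 'code-location/my-model'

# With no model name and a pipeline-variable image, the timestamped fallback
# is used, since name_from_image cannot inspect an unresolved variable:
# model_code_key_prefix(None, None, image_param)
# -> '/model_code/<epoch-seconds>'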
Example #7
    def prepare_container_def(self,
                              instance_type=None,
                              accelerator_type=None,
                              serverless_inference_config=None):
        """Prepare the container definition.

        Args:
            instance_type: Instance type of the container.
            accelerator_type: Accelerator type, if applicable.
            serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
                Specifies configuration related to serverless endpoint. Instance type is
                not provided in serverless inference. So this is used to find image URIs.

        Returns:
            A container definition for deploying a ``Model`` to an ``Endpoint``.
        """
        if not self.image_uri:
            if instance_type is None and serverless_inference_config is None:
                raise ValueError(
                    "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
                )

        image_uri = self._get_image_uri(
            instance_type,
            accelerator_type,
            serverless_inference_config=serverless_inference_config)
        env = self._get_container_env()

        if self.entry_point and not is_pipeline_variable(self.model_data):
            key_prefix = sagemaker.fw_utils.model_code_key_prefix(
                self.key_prefix, self.name, image_uri)

            bucket = self.bucket or self.sagemaker_session.default_bucket()
            model_data = s3.s3_path_join("s3://", bucket, key_prefix,
                                         "model.tar.gz")

            sagemaker.utils.repack_model(
                self.entry_point,
                self.source_dir,
                self.dependencies,
                self.model_data,
                model_data,
                self.sagemaker_session,
                kms_key=self.model_kms_key,
            )
        elif self.entry_point and is_pipeline_variable(self.model_data):
            # The model artifact is not available yet, so defer repacking
            # until pipeline execution.
            if isinstance(self.sagemaker_session, PipelineSession):
                self.sagemaker_session.context.need_runtime_repack.add(id(self))
            else:
                # TODO: link the doc in the warning once ready
                logging.warning(
                    "The model_data is a pipeline variable of type %s, "
                    "which should be used under `PipelineSession` and "
                    "leverage `ModelStep` to create or register the model. "
                    "Otherwise some functionality, e.g. runtime repack, "
                    "may be missing.",
                    type(self.model_data),
                )
            model_data = self.model_data
        else:
            model_data = self.model_data

        return sagemaker.container_def(image_uri, model_data, env)
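
A hedged sketch of the deferred path, following the pattern the warning
recommends (class and step names are illustrative):

from sagemaker.workflow.pipeline_context import PipelineSession

# Under a PipelineSession, prepare_container_def() skips repacking and only
# records the model in session.context.need_runtime_repack, so that a
# ModelStep can inject the repack at pipeline build time.
session = PipelineSession()  # requires AWS credentials/region to be configured
# model = MyFrameworkModel(
#     entry_point="inference.py",
#     model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
#     sagemaker_session=session,
# )
# step = ModelStep(name="CreateModel", step_args=model.create(...))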
Example #8
    def transform(
        self,
        data,
        data_type="S3Prefix",
        content_type=None,
        compression_type=None,
        split_type=None,
        job_name=None,
        input_filter=None,
        output_filter=None,
        join_source=None,
        experiment_config=None,
        model_client_config=None,
        wait=True,
        logs=True,
    ):
        """Start a new transform job.

        Args:
            data (str): Input data location in S3.
            data_type (str): What the S3 location defines (default: 'S3Prefix').
                Valid values:

                * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix
                    will be used as inputs for the transform job.

                * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3
                    object to use as an input for the transform job.

            content_type (str): MIME type of the input data (default: None).
            compression_type (str): Compression type of the input data, if
                compressed (default: None). Valid values: 'Gzip', None.
            split_type (str): The record delimiter for the input object
                (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and
                'TFRecord'.
            job_name (str): job name (default: None). If not specified, one will
                be generated.
            input_filter (str): A JSONPath to select a portion of the input to
                pass to the algorithm container for inference. If you omit the
                field, it gets the value '$', representing the entire input.
                For CSV data, each row is taken as a JSON array,
                so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
                CSV data should follow the `RFC format <https://tools.ietf.org/html/rfc4180>`_.
                See `Supported JSONPath Operators
                <https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
                for a table of supported JSONPath operators.
                For more information, see the SageMaker API documentation for
                `CreateTransformJob
                <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
                Some examples: "$[1:]", "$.features" (default: None).
            output_filter (str): A JSONPath to select a portion of the
                joined/original output to return as the output.
                For more information, see the SageMaker API documentation for
                `CreateTransformJob
                <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
                Some examples: "$[1:]", "$.prediction" (default: None).
            join_source (str): The source of data to be joined to the transform
                output. It can be set to 'Input' meaning the entire input record
                will be joined to the inference result. You can use OutputFilter
                to select the useful portion before uploading to S3. (default:
                None). Valid values: Input, None.
            experiment_config (dict[str, str]): Experiment management configuration.
                Optionally, the dict can contain three keys:
                'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
                The behavior of setting these keys is as follows:
                * If `ExperimentName` is supplied but `TrialName` is not, a Trial will be
                automatically created and the job's Trial Component associated with the Trial.
                * If `TrialName` is supplied and the Trial already exists, the job's Trial
                Component will be associated with the Trial.
                * If neither `ExperimentName` nor `TrialName` is supplied, the Trial Component
                will remain unassociated.
                * `TrialComponentDisplayName` is used for display in Studio.
            model_client_config (dict[str, str]): Model configuration.
                Dictionary contains two optional keys,
                'InvocationsTimeoutInSeconds', and 'InvocationsMaxRetries'.
                (default: ``None``).
            wait (bool): Whether the call should wait until the job completes
                (default: ``True``).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when wait is ``True`` (default: ``True``).
        """
        local_mode = self.sagemaker_session.local_mode
        if not local_mode and not data.startswith("s3://"):
            raise ValueError("Invalid S3 URI: {}".format(data))

        if job_name is not None:
            self._current_job_name = job_name
        else:
            base_name = self.base_transform_job_name

            if base_name is None:
                base_name = (
                    "transform-job"
                    if is_pipeline_variable(self.model_name)
                    else self._retrieve_base_name()
                )

            self._current_job_name = name_from_base(base_name)

        if self.output_path is None or self._reset_output_path is True:
            self.output_path = "s3://{}/{}".format(
                self.sagemaker_session.default_bucket(), self._current_job_name
            )
            self._reset_output_path = True

        self.latest_transform_job = _TransformJob.start_new(
            self,
            data,
            data_type,
            content_type,
            compression_type,
            split_type,
            input_filter,
            output_filter,
            join_source,
            experiment_config,
            model_client_config,
        )

        if wait:
            self.latest_transform_job.wait(logs=logs)
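
The job-naming fallback above can be sketched directly; `name_from_base`
appends the SDK's timestamp suffix:

from sagemaker.utils import name_from_base

# When model_name is a pipeline variable, _retrieve_base_name() cannot
# inspect the model, so the generic base is timestamped instead.
print(name_from_base("transform-job"))
# e.g. 'transform-job-2024-01-01-12-00-00-000'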
Example #9
    def __init__(
        self,
        name: str,
        clarify_check_config: ClarifyCheckConfig,
        check_job_config: CheckJobConfig,
        skip_check: Union[bool, PipelineNonPrimitiveInputTypes] = False,
        register_new_baseline: Union[bool,
                                     PipelineNonPrimitiveInputTypes] = False,
        model_package_group_name: Union[str,
                                        PipelineNonPrimitiveInputTypes] = None,
        supplied_baseline_constraints: Union[
            str, PipelineNonPrimitiveInputTypes] = None,
        display_name: str = None,
        description: str = None,
        cache_config: CacheConfig = None,
        depends_on: Union[List[str], List[Step]] = None,
    ):
        """Constructs a ClarifyCheckStep.

        Args:
            name (str): The name of the ClarifyCheckStep step.
            clarify_check_config (ClarifyCheckConfig): A ClarifyCheckConfig instance.
            check_job_config (CheckJobConfig): A CheckJobConfig instance.
            skip_check (bool or PipelineNonPrimitiveInputTypes): Whether the check
                should be skipped (default: False).
            register_new_baseline (bool or PipelineNonPrimitiveInputTypes): Whether
                the new baseline should be registered (default: False).
            model_package_group_name (str or PipelineNonPrimitiveInputTypes): The name of a
                registered model package group; the baseline is fetched from the group's
                latest approved model (default: None).
            supplied_baseline_constraints (str or PipelineNonPrimitiveInputTypes): The S3 path
                to a supplied constraints JSON file to use for the drift check (default: None).
            display_name (str): The display name of the ClarifyCheckStep step (default: None).
            description (str): The description of the ClarifyCheckStep step (default: None).
            cache_config (CacheConfig):  A `sagemaker.workflow.steps.CacheConfig` instance
                (default: None).
            depends_on (List[str] or List[Step]): A list of step names or step instances
                this `sagemaker.workflow.steps.ClarifyCheckStep` depends on (default: None).
        """
        if not isinstance(
            clarify_check_config,
            (DataBiasCheckConfig, ModelBiasCheckConfig, ModelExplainabilityCheckConfig),
        ):
            raise RuntimeError(
                "The clarify_check_config can only be an instance of "
                "DataBiasCheckConfig, ModelBiasCheckConfig, or ModelExplainabilityCheckConfig"
            )

        if is_pipeline_variable(
            clarify_check_config.data_config.s3_analysis_config_output_path
        ):
            raise RuntimeError(
                "s3_analysis_config_output_path cannot be of type "
                "ExecutionVariable/Expression/Parameter/Properties"
            )

        if (
            not clarify_check_config.data_config.s3_analysis_config_output_path
            and is_pipeline_variable(clarify_check_config.data_config.s3_output_path)
        ):
            raise RuntimeError(
                "`s3_output_path` cannot be of type ExecutionVariable/Expression/"
                "Parameter/Properties if `s3_analysis_config_output_path` is None or empty"
            )

        super(ClarifyCheckStep, self).__init__(
            name, display_name, description, StepTypeEnum.CLARIFY_CHECK, depends_on
        )
        self.skip_check = skip_check
        self.register_new_baseline = register_new_baseline
        self.clarify_check_config = clarify_check_config
        self.check_job_config = check_job_config
        self.model_package_group_name = model_package_group_name
        self.supplied_baseline_constraints = supplied_baseline_constraints
        self.cache_config = cache_config

        if isinstance(self.clarify_check_config, ModelExplainabilityCheckConfig):
            self._model_monitor = self.check_job_config._generate_model_monitor(
                "ModelExplainabilityMonitor"
            )
        else:
            self._model_monitor = self.check_job_config._generate_model_monitor(
                "ModelBiasMonitor"
            )

        self.clarify_check_config.monitoring_analysis_config_uri = (
            self._upload_monitoring_analysis_config()
        )
        self._baselining_processor = self._model_monitor._create_baselining_processor()
        self._processing_params = self._generate_processing_job_parameters(
            self._generate_processing_job_analysis_config(), self._baselining_processor
        )

        root_path = f"Steps.{name}"
        root_prop = Properties(path=root_path)
        root_prop.__dict__["CalculatedBaselineConstraints"] = Properties(
            f"{root_path}.CalculatedBaselineConstraints")
        root_prop.__dict__[
            "BaselineUsedForDriftCheckConstraints"] = Properties(
                f"{root_path}.BaselineUsedForDriftCheckConstraints")
        self._properties = root_prop
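
The Properties objects built above let downstream steps reference the check's
outputs. A minimal sketch of how such a reference serializes, assuming the
Properties constructor as used in this example:

from sagemaker.workflow.properties import Properties

prop = Properties("Steps.MyClarifyStep.CalculatedBaselineConstraints")
print(prop.expr)
# {'Get': 'Steps.MyClarifyStep.CalculatedBaselineConstraints'}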