Example No. 1
def move_to_destination(source, destination, job_name, sagemaker_session):
    """Move source to destination.

    Can handle uploading to S3.

    Args:
        source (str): root directory to move
        destination (str): file:// or s3:// URI that source will be moved to.
        job_name (str): SageMaker job name.
        sagemaker_session (sagemaker.Session): a sagemaker_session to interact
            with S3 if needed

    Returns:
        (str): destination URI
    """
    parsed_uri = urlparse(destination)
    if parsed_uri.scheme == "file":
        recursive_copy(source, parsed_uri.path)
        final_uri = destination
    elif parsed_uri.scheme == "s3":
        bucket = parsed_uri.netloc
        path = s3.s3_path_join(parsed_uri.path, job_name)
        final_uri = s3.s3_path_join("s3://", bucket, path)
        sagemaker_session.upload_data(source, bucket, path)
    else:
        raise ValueError("Invalid destination URI, must be s3:// or file://, got: %s" % destination)

    shutil.rmtree(source)
    return final_uri
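A minimal usage sketch of the helper above; the paths, job name, and destination bucket are illustrative, and a default AWS configuration is assumed for the sagemaker.Session:

import sagemaker

session = sagemaker.Session()

# Upload local job output to S3; the job name is appended to the destination prefix
# and the local source directory is removed afterwards.
final_uri = move_to_destination("/tmp/job-output", "s3://my-bucket/outputs", "my-job", session)
print(final_uri)  # expected: s3://my-bucket/outputs/my-job

# A file:// destination would instead copy the directory locally:
# move_to_destination("/tmp/job-output", "file:///tmp/archive", "my-job", session)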
Example No. 2
    def add_model(self, model_data_source, model_data_path=None):
        """Adds a model to the ``MultiDataModel``.

        The artifact at ``model_data_source`` is uploaded (or copied, if it is already
        in S3) to the S3 path ``model_data_path`` relative to ``self.model_data_prefix``.

        Args:
            model_data_source (str): Valid local file path or S3 URI of the trained
                model artifact.
            model_data_path (str): S3 path, relative to ``self.model_data_prefix``,
                where the model artifact should be stored (default: None). If None,
                the artifact is stored under ``self.model_data_prefix`` using the
                source S3 key or, for local sources, the source file name.

        Returns:
            str: S3 URI of the uploaded or copied model artifact.
        """
        parse_result = urlparse(model_data_source)

        # If the model source is an S3 path, copy the model artifact to the destination S3 path
        if parse_result.scheme == "s3":
            source_bucket, source_model_data_path = s3.parse_s3_url(
                model_data_source)
            copy_source = {
                "Bucket": source_bucket,
                "Key": source_model_data_path
            }

            if not model_data_path:
                model_data_path = source_model_data_path

            # Construct the destination path
            dst_url = s3.s3_path_join(self.model_data_prefix, model_data_path)
            destination_bucket, destination_model_data_path = s3.parse_s3_url(
                dst_url)

            # Copy the model artifact
            self.s3_client.copy(copy_source, destination_bucket,
                                destination_model_data_path)
            return s3.s3_path_join("s3://", destination_bucket,
                                   destination_model_data_path)

        # If the model source is a local path, upload the local model artifact to the destination
        # S3 path
        if os.path.exists(model_data_source):
            destination_bucket, dst_prefix = s3.parse_s3_url(
                self.model_data_prefix)
            if model_data_path:
                dst_s3_uri = s3.s3_path_join(dst_prefix, model_data_path)
            else:
                dst_s3_uri = s3.s3_path_join(
                    dst_prefix, os.path.basename(model_data_source))
            self.s3_client.upload_file(model_data_source, destination_bucket,
                                       dst_s3_uri)
            # Return the S3 URI of the uploaded artifact.
            return s3.s3_path_join("s3://", destination_bucket, dst_s3_uri)

        # Raise error if the model source is of an unexpected type
        raise ValueError(
            "model_data_source must either be a valid local file path or an S3 URI. Received: "
            '"{}"'.format(model_data_source))
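A hypothetical usage sketch, assuming mdm is an existing MultiDataModel whose model_data_prefix is s3://my-bucket/models/ (all names are illustrative):

# Copy an artifact that already lives in S3 to a chosen path under the prefix.
uri = mdm.add_model("s3://other-bucket/model-a/model.tar.gz", "model-a/model.tar.gz")
print(uri)  # expected: s3://my-bucket/models/model-a/model.tar.gz

# A local artifact is uploaded; without model_data_path, the file name becomes the key.
uri = mdm.add_model("/tmp/model-b/model.tar.gz")
print(uri)  # expected: s3://my-bucket/models/model.tar.gz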
Example No. 3
    @classmethod
    def from_string(
        cls, constraints_file_string, kms_key=None, file_name=None, sagemaker_session=None
    ):
        """Generates a Constraints object by uploading a constraints JSON string to S3.

        Args:
            constraints_file_string (str): The contents of the constraints JSON file,
                as a string.
            kms_key (str): The kms key to be used to encrypt the file in S3.
            file_name (str): The file name to use when uploading to S3.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.

        Returns:
            sagemaker.model_monitor.Constraints: The instance of Constraints generated from
                the uploaded constraints file.

        """
        sagemaker_session = sagemaker_session or Session()
        file_name = file_name or "constraints.json"
        desired_s3_uri = s3.s3_path_join(
            "s3://", sagemaker_session.default_bucket(), "monitoring", str(uuid.uuid4()), file_name
        )
        s3_uri = s3.S3Uploader.upload_string_as_file_body(
            body=constraints_file_string,
            desired_s3_uri=desired_s3_uri,
            kms_key=kms_key,
            sagemaker_session=sagemaker_session,
        )

        return Constraints.from_s3_uri(
            constraints_file_s3_uri=s3_uri, kms_key=kms_key, sagemaker_session=sagemaker_session
        )
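A minimal sketch of calling the classmethod above; the constraints JSON content and file name are illustrative, and a usable default bucket is assumed:

constraints_json = '{"version": 0.0, "features": []}'
# Uploads the string to s3://<default-bucket>/monitoring/<uuid>/my_constraints.json
# and returns a Constraints object pointing at that file.
constraints = Constraints.from_string(constraints_json, file_name="my_constraints.json")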
Example No. 4
    def prepare_container_def(self, instance_type=None, accelerator_type=None):
        """Prepare the container definition.

        Args:
            instance_type (str): Instance type of the container (default: None).
            accelerator_type (str): Accelerator type, if applicable (default: None).
        """
        if self.image_uri is None and instance_type is None:
            raise ValueError(
                "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
            )

        image_uri = self._get_image_uri(instance_type, accelerator_type)
        env = self._get_container_env()

        if self.entry_point:
            key_prefix = sagemaker.fw_utils.model_code_key_prefix(
                self.key_prefix, self.name, image_uri)

            bucket = self.bucket or self.sagemaker_session.default_bucket()
            model_data = s3.s3_path_join("s3://", bucket, key_prefix,
                                         "model.tar.gz")

            sagemaker.utils.repack_model(
                self.entry_point,
                self.source_dir,
                self.dependencies,
                self.model_data,
                model_data,
                self.sagemaker_session,
                kms_key=self.model_kms_key,
            )
        else:
            model_data = self.model_data

        return sagemaker.container_def(image_uri, model_data, env)
Example No. 5
    def _upload_code(self, code, kms_key=None):
        """Uploads a code file or directory specified as a string
        and returns the S3 URI.

        Args:
            code (str): A file or directory to be uploaded to S3.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).

        Returns:
            str: The S3 URI of the uploaded file or directory.

        """
        desired_s3_uri = s3.s3_path_join(
            "s3://",
            self.sagemaker_session.default_bucket(),
            self._current_job_name,
            "input",
            self._CODE_CONTAINER_INPUT_NAME,
        )
        return s3.S3Uploader.upload(
            local_path=code,
            desired_s3_uri=desired_s3_uri,
            sagemaker_session=self.sagemaker_session,
            kms_key=kms_key,
        )
Example No. 6
    def _default_s3_path(self, directory, mpi=False):
        """Return the default output path for ``directory``.

        Uses a shared local path in local mode with local code, the fixed model
        directory for MPI jobs, and otherwise an S3 path under ``self.output_path``
        for the current job (or ``None`` if no job is in progress).
        """
        local_code = utils.get_config_value("local.local_code", self.sagemaker_session.config)
        if self.sagemaker_session.local_mode and local_code:
            return "/opt/ml/shared/{}".format(directory)
        if mpi:
            return "/opt/ml/model"
        if self._current_job_name:
            return s3.s3_path_join(self.output_path, self._current_job_name, directory)
        return None
Example No. 7
    def _normalize_inputs(self, inputs=None, kms_key=None):
        """Ensures that all the ``ProcessingInput`` objects have names and S3 URIs.

        Args:
            inputs (list[sagemaker.processing.ProcessingInput]): A list of ``ProcessingInput``
                objects to be normalized (default: None). If not specified,
                an empty list is returned.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).

        Returns:
            list[sagemaker.processing.ProcessingInput]: The list of normalized
                ``ProcessingInput`` objects.

        Raises:
            TypeError: if the inputs are not ``ProcessingInput`` objects.
        """
        # Initialize a list of normalized ProcessingInput objects.
        normalized_inputs = []
        if inputs is not None:
            # Iterate through the provided list of inputs.
            for count, file_input in enumerate(inputs, 1):
                if not isinstance(file_input, ProcessingInput):
                    raise TypeError(
                        "Your inputs must be provided as ProcessingInput objects."
                    )
                # Generate a name for the ProcessingInput if it doesn't have one.
                if file_input.input_name is None:
                    file_input.input_name = "input-{}".format(count)

                if isinstance(file_input.source,
                              Properties) or file_input.dataset_definition:
                    normalized_inputs.append(file_input)
                    continue

                # If the source is a local path, upload it to S3
                # and save the S3 uri in the ProcessingInput source.
                parse_result = urlparse(file_input.s3_input.s3_uri)
                if parse_result.scheme != "s3":
                    desired_s3_uri = s3.s3_path_join(
                        "s3://",
                        self.sagemaker_session.default_bucket(),
                        self._current_job_name,
                        "input",
                        file_input.input_name,
                    )
                    s3_uri = s3.S3Uploader.upload(
                        local_path=file_input.s3_input.s3_uri,
                        desired_s3_uri=desired_s3_uri,
                        sagemaker_session=self.sagemaker_session,
                        kms_key=kms_key,
                    )
                    file_input.s3_input.s3_uri = s3_uri
                normalized_inputs.append(file_input)
        return normalized_inputs
Example No. 8
def test_path_join():
    test_cases = (
        ("foo/bar", ("foo", "bar")),
        ("foo/bar", ("foo/", "bar")),
        ("foo/bar", ("/foo/", "bar")),
        ("s3://foo/bar", ("s3://", "foo", "bar")),
        ("s3://foo/bar", ("s3://", "/foo", "bar")),
        ("s3://foo/bar", ("s3://foo", "bar")),
    )

    for expected, args in test_cases:
        assert expected == s3.s3_path_join(*args)
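The same joining behavior shown directly, with an illustrative bucket name:

from sagemaker import s3

print(s3.s3_path_join("s3://", "/my-bucket", "training"))  # expected: s3://my-bucket/training
print(s3.s3_path_join("/my-bucket/", "training"))          # expected: my-bucket/training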
Example No. 9
    def prepare_container_def(self,
                              instance_type=None,
                              accelerator_type=None,
                              serverless_inference_config=None):
        """Prepare the container definition.

        Args:
            instance_type: Instance type of the container.
            accelerator_type: Accelerator type, if applicable.
            serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
                Specifies configuration related to the serverless endpoint. Because no
                instance type is provided for serverless inference, this is used to
                determine the image URI.

        Returns:
            A container definition for deploying a ``Model`` to an ``Endpoint``.
        """
        if not self.image_uri:
            if instance_type is None and serverless_inference_config is None:
                raise ValueError(
                    "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
                )

        image_uri = self._get_image_uri(
            instance_type,
            accelerator_type,
            serverless_inference_config=serverless_inference_config)
        env = self._get_container_env()

        # If self.model_data is a pipeline variable, the model artifact does not exist yet,
        # so defer repacking until pipeline execution.
        if self.entry_point and not is_pipeline_variable(self.model_data):
            key_prefix = sagemaker.fw_utils.model_code_key_prefix(
                self.key_prefix, self.name, image_uri)

            bucket = self.bucket or self.sagemaker_session.default_bucket()
            model_data = s3.s3_path_join("s3://", bucket, key_prefix,
                                         "model.tar.gz")

            sagemaker.utils.repack_model(
                self.entry_point,
                self.source_dir,
                self.dependencies,
                self.model_data,
                model_data,
                self.sagemaker_session,
                kms_key=self.model_kms_key,
            )
        else:
            model_data = self.model_data

        return sagemaker.container_def(image_uri, model_data, env)
Example No. 10
    def _get_s3_base_uri_for_monitoring_analysis_config(self) -> str:
        """Generates the S3 base URI for the monitoring schedule analysis config.

        Returns:
            str: The S3 base URI of the monitoring schedule analysis config.
        """
        s3_analysis_config_output_path = (self.clarify_check_config.data_config
                                          .s3_analysis_config_output_path)
        monitoring_cfg_base_name = f"{_BIAS_MONITORING_CFG_BASE_NAME}-configuration"
        if isinstance(self.clarify_check_config,
                      ModelExplainabilityCheckConfig):
            monitoring_cfg_base_name = f"{_EXPLAINABILITY_MONITORING_CFG_BASE_NAME}-configuration"

        if s3_analysis_config_output_path:
            return s3.s3_path_join(
                s3_analysis_config_output_path,
                monitoring_cfg_base_name,
            )
        return s3.s3_path_join(
            "s3://",
            self._model_monitor.sagemaker_session.default_bucket(),
            _MODEL_MONITOR_S3_PATH,
            monitoring_cfg_base_name,
        )
Example No. 11
    def __init__(
        self,
        enable_capture,
        sampling_percentage=20,
        destination_s3_uri=None,
        kms_key_id=None,
        capture_options=None,
        csv_content_types=None,
        json_content_types=None,
        sagemaker_session=None,
    ):
        """Initialize a DataCaptureConfig object for capturing data from Amazon SageMaker Endpoints.

        Args:
            enable_capture (bool): Required. Whether data capture should be enabled or not.
            sampling_percentage (int): Optional. Default=20. The percentage of data to sample.
                Must be between 0 and 100.
            destination_s3_uri (str): Optional. Defaults to "s3://<default-session-bucket>/
                model-monitor/data-capture".
            kms_key_id (str): Optional. Default=None. The kms key to use when writing to S3.
            capture_options ([str]): Optional. Must be a list containing any combination of the
                following values: "REQUEST", "RESPONSE". Default=["REQUEST", "RESPONSE"]. Denotes
                which data to capture between request and response.
            csv_content_types ([str]): Optional. Default=["text/csv"].
            json_content_types([str]): Optional. Default=["application/json"].
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.
        """
        self.enable_capture = enable_capture
        self.sampling_percentage = sampling_percentage
        self.destination_s3_uri = destination_s3_uri
        if self.destination_s3_uri is None:
            sagemaker_session = sagemaker_session or Session()
            self.destination_s3_uri = s3.s3_path_join(
                "s3://",
                sagemaker_session.default_bucket(),
                _MODEL_MONITOR_S3_PATH,
                _DATA_CAPTURE_S3_PATH,
            )

        self.kms_key_id = kms_key_id
        self.capture_options = capture_options or ["REQUEST", "RESPONSE"]
        self.csv_content_types = csv_content_types or ["text/csv"]
        self.json_content_types = json_content_types or ["application/json"]
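A minimal construction sketch for the config above; the destination bucket is illustrative and the import path is an assumption:

from sagemaker.model_monitor import DataCaptureConfig  # assumed import path

capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=50,  # capture half of the requests/responses
    destination_s3_uri="s3://my-bucket/data-capture",  # omit to fall back to the default path
)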
Example No. 12
    def _normalize_outputs(self, outputs=None):
        """Ensures that all the outputs are ``ProcessingOutput`` objects with names and S3 URIs.

        Args:
            outputs (list[sagemaker.processing.ProcessingOutput]): A list
                of outputs to be normalized (default: None). Can be either strings or
                ``ProcessingOutput`` objects. If not specified,
                an empty list is returned.

        Returns:
            list[sagemaker.processing.ProcessingOutput]: The list of normalized
                ``ProcessingOutput`` objects.

        Raises:
            TypeError: if the outputs are not ``ProcessingOutput`` objects.
        """
        # Initialize a list of normalized ProcessingOutput objects.
        normalized_outputs = []
        if outputs is not None:
            # Iterate through the provided list of outputs.
            for count, output in enumerate(outputs, 1):
                if not isinstance(output, ProcessingOutput):
                    raise TypeError(
                        "Your outputs must be provided as ProcessingOutput objects."
                    )
                # Generate a name for the ProcessingOutput if it doesn't have one.
                if output.output_name is None:
                    output.output_name = "output-{}".format(count)
                # If the output's destination is a workflow expression, skip normalization.
                if isinstance(output.destination, Expression):
                    normalized_outputs.append(output)
                    continue
                # If the output's destination is not an s3_uri, create one.
                parse_result = urlparse(output.destination)
                if parse_result.scheme != "s3":
                    s3_uri = s3.s3_path_join(
                        "s3://",
                        self.sagemaker_session.default_bucket(),
                        self._current_job_name,
                        "output",
                        output.output_name,
                    )
                    output.destination = s3_uri
                normalized_outputs.append(output)
        return normalized_outputs
Example No. 13
    def _generate_baseline_output(self):
        """Generates a ProcessingOutput object

        Returns:
            sagemaker.processing.ProcessingOutput: The normalized ProcessingOutput object.
        """
        s3_uri = self.quality_check_config.output_s3_uri or s3.s3_path_join(
            "s3://",
            self._model_monitor.sagemaker_session.default_bucket(),
            _MODEL_MONITOR_S3_PATH,
            _BASELINING_S3_PATH,
            self._model_monitor.latest_baselining_job_name,
            _RESULTS_S3_PATH,
        )
        return ProcessingOutput(
            source=str(pathlib.PurePosixPath(_CONTAINER_BASE_PATH, _CONTAINER_OUTPUT_PATH)),
            destination=s3_uri,
            output_name=_DEFAULT_OUTPUT_NAME,
        )
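For reference, the container-side source path above is a plain POSIX join; with illustrative values for the two path constants it looks like this:

import pathlib

# _CONTAINER_BASE_PATH and _CONTAINER_OUTPUT_PATH are assumed values here.
source = str(pathlib.PurePosixPath("/opt/ml/processing", "output"))
print(source)  # /opt/ml/processing/output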
Example No. 14
    def _create_args(self, role_arn: str, description: str,
                     parallelism_config: ParallelismConfiguration):
        """Constructs the keyword argument dict for a create_pipeline call.

        Args:
            role_arn (str): The role arn that is assumed by pipelines to create step artifacts.
            description (str): A description of the pipeline.
            parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration
                that is applied to each of the executions of the pipeline. It takes precedence
                over the parallelism configuration of the parent pipeline.

        Returns:
            A keyword argument dict for calling create_pipeline.
        """
        pipeline_definition = self.definition()
        kwargs = dict(
            PipelineName=self.name,
            RoleArn=role_arn,
        )

        # Pass the definition inline if it is under 100 KB; otherwise upload it to S3
        # and reference it via PipelineDefinitionS3Location in the request.
        if len(pipeline_definition.encode("utf-8")) < 1024 * 100:
            kwargs["PipelineDefinition"] = pipeline_definition
        else:
            desired_s3_uri = s3.s3_path_join(
                "s3://", self.sagemaker_session.default_bucket(), self.name)
            s3.S3Uploader.upload_string_as_file_body(
                body=pipeline_definition,
                desired_s3_uri=desired_s3_uri,
                sagemaker_session=self.sagemaker_session,
            )
            kwargs["PipelineDefinitionS3Location"] = {
                "Bucket": self.sagemaker_session.default_bucket(),
                "ObjectKey": self.name,
            }

        update_args(kwargs,
                    PipelineDescription=description,
                    ParallelismConfiguration=parallelism_config)
        return kwargs
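A standalone sketch of the size check above, using a toy definition string (all values illustrative):

pipeline_definition = '{"Version": "2020-12-01", "Steps": []}'

if len(pipeline_definition.encode("utf-8")) < 1024 * 100:  # under 100 KB
    print("pass inline as PipelineDefinition")
else:
    print("upload to S3 and pass PipelineDefinitionS3Location")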
Example No. 15
    def prepare_container_def(self,
                              instance_type=None,
                              accelerator_type=None,
                              serverless_inference_config=None):
        """Prepare the container definition.

        Args:
            instance_type: Instance type of the container.
            accelerator_type: Accelerator type, if applicable.
            serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
                Specifies configuration related to the serverless endpoint. Because no
                instance type is provided for serverless inference, this is used to
                determine the image URI.

        Returns:
            A container definition for deploying a ``Model`` to an ``Endpoint``.
        """
        if not self.image_uri:
            if instance_type is None and serverless_inference_config is None:
                raise ValueError(
                    "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
                )

        image_uri = self._get_image_uri(
            instance_type,
            accelerator_type,
            serverless_inference_config=serverless_inference_config)
        env = self._get_container_env()

        if self.entry_point and not is_pipeline_variable(self.model_data):
            key_prefix = sagemaker.fw_utils.model_code_key_prefix(
                self.key_prefix, self.name, image_uri)

            bucket = self.bucket or self.sagemaker_session.default_bucket()
            model_data = s3.s3_path_join("s3://", bucket, key_prefix,
                                         "model.tar.gz")

            sagemaker.utils.repack_model(
                self.entry_point,
                self.source_dir,
                self.dependencies,
                self.model_data,
                model_data,
                self.sagemaker_session,
                kms_key=self.model_kms_key,
            )
        elif self.entry_point and is_pipeline_variable(self.model_data):
            # The model artifact does not exist yet; defer repacking until pipeline execution.
            if isinstance(self.sagemaker_session, PipelineSession):
                self.sagemaker_session.context.need_runtime_repack.add(
                    id(self))
            else:
                # TODO: link the doc in the warning once ready
                logging.warning(
                    "The model_data is a Pipeline variable of type %s, "
                    "which should be used under `PipelineSession` and "
                    "leverage `ModelStep` to create or register model. "
                    "Otherwise some functionalities e.g. "
                    "runtime repack may be missing",
                    type(self.model_data),
                )
            model_data = self.model_data
        else:
            model_data = self.model_data

        return sagemaker.container_def(image_uri, model_data, env)