Example #1
def prepare_framework(estimator, s3_operations):
    """Prepare S3 operations (specify where to upload `source_dir` ) and
    environment variables related to framework.

    Args:
        estimator (sagemaker.estimator.Estimator): The framework estimator to
            get information from and update.
        s3_operations (dict): The dict to specify S3 operations (upload
            ``source_dir``).
    """
    if estimator.code_location is not None:
        bucket, key = fw_utils.parse_s3_url(estimator.code_location)
        key = os.path.join(key, estimator._current_job_name, "source",
                           "sourcedir.tar.gz")
    elif estimator.uploaded_code is not None:
        bucket, key = fw_utils.parse_s3_url(estimator.uploaded_code.s3_prefix)
    else:
        bucket = estimator.sagemaker_session._default_bucket
        key = os.path.join(estimator._current_job_name, "source",
                           "sourcedir.tar.gz")

    script = os.path.basename(estimator.entry_point)

    if estimator.source_dir and estimator.source_dir.lower().startswith(
            "s3://"):
        code_dir = estimator.source_dir
        estimator.uploaded_code = fw_utils.UploadedCode(s3_prefix=code_dir,
                                                        script_name=script)
    else:
        code_dir = "s3://{}/{}".format(bucket, key)
        estimator.uploaded_code = fw_utils.UploadedCode(s3_prefix=code_dir,
                                                        script_name=script)
        s3_operations["S3Upload"] = [{
            "Path":
            estimator.source_dir or estimator.entry_point,
            "Bucket":
            bucket,
            "Key":
            key,
            "Tar":
            True,
        }]
    estimator._hyperparameters[sagemaker.model.DIR_PARAM_NAME] = code_dir
    estimator._hyperparameters[sagemaker.model.SCRIPT_PARAM_NAME] = script
    estimator._hyperparameters[sagemaker.model.CLOUDWATCH_METRICS_PARAM_NAME] = (
        estimator.enable_cloudwatch_metrics
    )
    estimator._hyperparameters[sagemaker.model.CONTAINER_LOG_LEVEL_PARAM_NAME] = (
        estimator.container_log_level
    )
    estimator._hyperparameters[sagemaker.model.JOB_NAME_PARAM_NAME] = estimator._current_job_name
    estimator._hyperparameters[sagemaker.model.SAGEMAKER_REGION_PARAM_NAME] = (
        estimator.sagemaker_session.boto_region_name
    )
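For reference, when ``source_dir`` is a local path the branch above records exactly one upload operation; a sketch of the resulting dict, with illustrative placeholder values for the bucket, key, and path:

s3_operations = {
    "S3Upload": [{
        "Path": "my_source_dir",                      # local directory (or bare entry point) to tar and upload
        "Bucket": "my-default-bucket",                # from code_location, uploaded_code, or the session default
        "Key": "my-job-name/source/sourcedir.tar.gz",
        "Tar": True,
    }]
}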
Example #2
    def __init__(self, model_data, image, role, entry_point, source_dir=None, predictor_cls=None, env=None, name=None,
                 enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None,
                 sagemaker_session=None, dependencies=None, **kwargs):
        """Initialize a ``FrameworkModel``.

        Args:
            model_data (str): The S3 location of a SageMaker model data ``.tar.gz`` file.
            image (str): A Docker image URI.
            role (str): An IAM role name or ARN for SageMaker to access AWS resources on your behalf.
            entry_point (str): Path (absolute or relative) to the Python source file which should be executed
                as the entry point to model hosting. This should be compatible with either Python 2.7 or Python 3.5.
            source_dir (str): Path (absolute or relative) to a directory with any other training
                source code dependencies aside from the entry point file (default: None). Structure within this
                directory will be preserved when training on SageMaker.
                If the directory points to S3, no code will be uploaded and the S3 location will be used instead.
            dependencies (list[str]): A list of paths to directories (absolute or relative) with
                any additional libraries that will be exported to the container (default: []).
                The library folders will be copied to SageMaker in the same folder where the entrypoint is copied.
                If the ``source_dir`` points to S3, code will be uploaded and the S3 location will be used
                instead. Example:

                    The following call
                    >>> Estimator(entry_point='train.py', dependencies=['my/libs/common', 'virtual-env'])
                    results in the following inside the container:

                    >>> $ ls

                    >>> opt/ml/code
                    >>>     |------ train.py
                    >>>     |------ common
                    >>>     |------ virtual-env

            predictor_cls (callable[string, sagemaker.session.Session]): A function to call to create
               a predictor (default: None). If not None, ``deploy`` will return the result of invoking
               this function on the created endpoint name.
            env (dict[str, str]): Environment variables to run with ``image`` when hosted in SageMaker
               (default: None).
            name (str): The model name. If None, a default model name will be selected on each ``deploy``.
            enable_cloudwatch_metrics (bool): Whether training and hosting containers will
               generate CloudWatch metrics under the AWS/SageMakerContainer namespace (default: False).
            container_log_level (int): Log level to use within the container (default: logging.INFO).
                Valid values are defined in the Python logging module.
            code_location (str): Name of the S3 bucket where custom code is uploaded (default: None).
                If not specified, default bucket created by ``sagemaker.session.Session`` is used.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session object, used for SageMaker
               interactions (default: None). If not specified, one is created using the default AWS configuration chain.
            **kwargs: Keyword arguments passed to the ``Model`` initializer.
        """
        super(FrameworkModel, self).__init__(model_data, image, role, predictor_cls=predictor_cls, env=env, name=name,
                                             sagemaker_session=sagemaker_session, **kwargs)
        self.entry_point = entry_point
        self.source_dir = source_dir
        self.dependencies = dependencies or []
        self.enable_cloudwatch_metrics = enable_cloudwatch_metrics
        self.container_log_level = container_log_level
        if code_location:
            self.bucket, self.key_prefix = fw_utils.parse_s3_url(code_location)
        else:
            self.bucket, self.key_prefix = None, None
        self.uploaded_code = None
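A minimal usage sketch, assuming a concrete ``FrameworkModel`` subclass; ``MyFrameworkModel`` and all URIs below are hypothetical placeholders:

model = MyFrameworkModel(  # hypothetical subclass of FrameworkModel
    model_data="s3://my-bucket/output/model.tar.gz",
    image="123456789012.dkr.ecr.us-east-1.amazonaws.com/my-image:latest",
    role="arn:aws:iam::123456789012:role/SageMakerRole",
    entry_point="inference.py",
    source_dir="src",
    code_location="s3://my-bucket/custom-code",  # split into bucket/key_prefix by parse_s3_url
)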
Example #3
    def __init__(self,
                 model_data,
                 image,
                 role,
                 entry_point,
                 source_dir=None,
                 predictor_cls=None,
                 env=None,
                 name=None,
                 enable_cloudwatch_metrics=False,
                 container_log_level=logging.INFO,
                 code_location=None,
                 sagemaker_session=None):
        """Initialize a ``FrameworkModel``.

        Args:
            model_data (str): The S3 location of a SageMaker model data ``.tar.gz`` file.
            image (str): A Docker image URI.
            role (str): An IAM role name or ARN for SageMaker to access AWS resources on your behalf.
            entry_point (str): Path (absolute or relative) to the Python source file which should be executed
                as the entry point to model hosting. This should be compatible with either Python 2.7 or Python 3.5.
            source_dir (str): Path (absolute or relative) to a directory with any other training
                source code dependencies aside from the entry point file (default: None). Structure within this
                directory will be preserved when training on SageMaker.
                If the directory points to S3, no code will be uploaded and the S3 location will be used instead.
            predictor_cls (callable[string, sagemaker.session.Session]): A function to call to create
               a predictor (default: None). If not None, ``deploy`` will return the result of invoking
               this function on the created endpoint name.
            env (dict[str, str]): Environment variables to run with ``image`` when hosted in SageMaker
               (default: None).
            name (str): The model name. If None, a default model name will be selected on each ``deploy``.
            enable_cloudwatch_metrics (bool): Whether training and hosting containers will
               generate CloudWatch metrics under the AWS/SageMakerContainer namespace (default: False).
            container_log_level (int): Log level to use within the container (default: logging.INFO).
                Valid values are defined in the Python logging module.
            code_location (str): Name of the S3 bucket where custom code is uploaded (default: None).
                If not specified, default bucket created by ``sagemaker.session.Session`` is used.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session object, used for SageMaker
               interactions (default: None). If not specified, one is created using the default AWS configuration chain.
        """
        super(FrameworkModel,
              self).__init__(model_data,
                             image,
                             role,
                             predictor_cls=predictor_cls,
                             env=env,
                             name=name,
                             sagemaker_session=sagemaker_session)
        self.entry_point = entry_point
        self.source_dir = source_dir
        self.enable_cloudwatch_metrics = enable_cloudwatch_metrics
        self.container_log_level = container_log_level
        if code_location:
            self.bucket, self.key_prefix = parse_s3_url(code_location)
        else:
            self.bucket, self.key_prefix = None, None
Example #4
    def fit(self, inputs, wait=True, logs=True, job_name=None):
        """Train a model using the input training dataset.

        The API calls the Amazon SageMaker CreateTrainingJob API to start model training.
        The API uses configuration you provided to create the estimator and the
        specified input training data to send the CreateTrainingJob request to Amazon SageMaker.

        This is a synchronous operation. After the model training successfully completes,
        you can call the ``deploy()`` method to host the model using the Amazon SageMaker hosting services.

        Args:
            inputs (str or dict or sagemaker.session.s3_input): Information about the training data.
                This can be one of three types:
                (str) - the S3 location where training data is saved.
                (dict[str, str] or dict[str, sagemaker.session.s3_input]) - If using multiple channels for
                    training data, you can specify a dict mapping channel names
                    to strings or :func:`~sagemaker.session.s3_input` objects.
                (sagemaker.session.s3_input) - channel configuration for S3 data sources that can provide
                    additional information about the training dataset. See :func:`sagemaker.session.s3_input`
                    for full details.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when wait is True (default: True).
            job_name (str): Training job name. If not specified, the estimator generates a default job name,
                based on the training image name and current timestamp.
        """
        # always determine new job name _here_ because it is used before base is called
        if job_name is not None:
            self._current_job_name = job_name
        else:
            # honor supplied base_job_name or generate it
            base_name = self.base_job_name or base_name_from_image(self.train_image())
            self._current_job_name = name_from_base(base_name)

        if self.code_location is None:
            code_bucket = self.sagemaker_session.default_bucket()
            code_s3_prefix = '{}/source'.format(self._current_job_name)
        else:
            code_bucket, key_prefix = parse_s3_url(self.code_location)
            code_s3_prefix = '{}/{}/source'.format(key_prefix, self._current_job_name)

        self.uploaded_code = tar_and_upload_dir(session=self.sagemaker_session.boto_session,
                                                bucket=code_bucket,
                                                s3_key_prefix=code_s3_prefix,
                                                script=self.entry_point,
                                                directory=self.source_dir)

        # Modify hyperparameters in-place to add the URLs to the uploaded code.
        self._hyperparameters[DIR_PARAM_NAME] = self.uploaded_code.s3_prefix
        self._hyperparameters[SCRIPT_PARAM_NAME] = self.uploaded_code.script_name
        self._hyperparameters[CLOUDWATCH_METRICS_PARAM_NAME] = self.enable_cloudwatch_metrics
        self._hyperparameters[CONTAINER_LOG_LEVEL_PARAM_NAME] = self.container_log_level
        self._hyperparameters[JOB_NAME_PARAM_NAME] = self._current_job_name
        self._hyperparameters[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_session.region_name
        super(Framework, self).fit(inputs, wait, logs, self._current_job_name)
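A short usage sketch of ``fit`` with the dict form of ``inputs``; the channel names and S3 URIs are illustrative:

estimator.fit(
    inputs={
        "train": "s3://my-bucket/data/train",            # one channel per key
        "validation": "s3://my-bucket/data/validation",
    },
    wait=True,                    # block until the job completes
    logs=True,                    # stream container logs while waiting
    job_name="my-training-job",   # optional; generated from the image name if omitted
)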
Example #5
    def _stage_user_code_in_s3(self):
        """ Upload the user training script to s3 and return the location.

        Returns: s3 uri

        """
        if self.code_location is None:
            code_bucket = self.sagemaker_session.default_bucket()
            code_s3_prefix = '{}/source'.format(self._current_job_name)
        else:
            code_bucket, key_prefix = parse_s3_url(self.code_location)
            code_s3_prefix = '/'.join(filter(None, [key_prefix, self._current_job_name, 'source']))

        return tar_and_upload_dir(session=self.sagemaker_session.boto_session,
                                  bucket=code_bucket,
                                  s3_key_prefix=code_s3_prefix,
                                  script=self.entry_point,
                                  directory=self.source_dir)
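The ``'/'.join(filter(None, ...))`` idiom above drops empty components, so a ``code_location`` with no key prefix still produces a key without a leading slash. A self-contained sketch (names are illustrative):

key_prefix = ""  # e.g. code_location was "s3://code-bucket" with no prefix
job_name = "my-job-2020-01-01-00-00-00-000"
code_s3_prefix = "/".join(filter(None, [key_prefix, job_name, "source"]))
assert code_s3_prefix == "my-job-2020-01-01-00-00-00-000/source"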
Example #6
    def _stage_user_code_in_s3(self):
        """Upload the user training script to s3 and return the location.

        Returns: S3 URI

        """
        if self.code_location is None:
            code_bucket = self.sagemaker_session.default_bucket()
            code_s3_prefix = '{}/source'.format(self._current_job_name)
        else:
            code_bucket, key_prefix = parse_s3_url(self.code_location)
            code_s3_prefix = '/'.join(filter(None, [key_prefix, self._current_job_name, 'source']))

        return tar_and_upload_dir(session=self.sagemaker_session.boto_session,
                                  bucket=code_bucket,
                                  s3_key_prefix=code_s3_prefix,
                                  script=self.entry_point,
                                  directory=self.source_dir)
Example #7
    def __init__(self, model_data, image, role, entry_point, source_dir=None, predictor_cls=None, env=None, name=None,
                 enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None,
                 sagemaker_session=None):
        """Initialize a ``FrameworkModel``.

        Args:
            model_data (str): The S3 location of a SageMaker model data ``.tar.gz`` file.
            image (str): A Docker image URI.
            role (str): An IAM role name or ARN for SageMaker to access AWS resources on your behalf.
            entry_point (str): Path (absolute or relative) to the Python source file which should be executed
                as the entry point to model hosting. This should be compatible with either Python 2.7 or Python 3.5.
            source_dir (str): Path (absolute or relative) to a directory with any other training
                source code dependencies aside from the entry point file (default: None). Structure within this
                directory will be preserved when training on SageMaker.
                If the directory points to S3, no code will be uploaded and the S3 location will be used instead.
            predictor_cls (callable[string, sagemaker.session.Session]): A function to call to create
               a predictor (default: None). If not None, ``deploy`` will return the result of invoking
               this function on the created endpoint name.
            env (dict[str, str]): Environment variables to run with ``image`` when hosted in SageMaker
               (default: None).
            name (str): The model name. If None, a default model name will be selected on each ``deploy``.
            enable_cloudwatch_metrics (bool): Whether training and hosting containers will
               generate CloudWatch metrics under the AWS/SageMakerContainer namespace (default: False).
            container_log_level (int): Log level to use within the container (default: logging.INFO).
                Valid values are defined in the Python logging module.
            code_location (str): Name of the S3 bucket where custom code is uploaded (default: None).
                If not specified, default bucket created by ``sagemaker.session.Session`` is used.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session object, used for SageMaker
               interactions (default: None). If not specified, one is created using the default AWS configuration chain.
        """
        super(FrameworkModel, self).__init__(model_data, image, role, predictor_cls=predictor_cls, env=env, name=name,
                                             sagemaker_session=sagemaker_session)
        self.entry_point = entry_point
        self.source_dir = source_dir
        self.enable_cloudwatch_metrics = enable_cloudwatch_metrics
        self.container_log_level = container_log_level
        if code_location:
            self.bucket, self.key_prefix = parse_s3_url(code_location)
        else:
            self.bucket, self.key_prefix = None, None
Example #8
    def _initialize_job(
        self, monitored_metrics, dataset, num_samples, quantiles, job_name
    ):
        if self.sagemaker_session.local_mode:
            # TODO implement local mode support
            raise NotImplementedError(
                "Local mode has not yet been implemented."
            )

        # set metrics to be monitored
        self.metric_definitions = make_metrics(monitored_metrics)

        self._hyperparameters.update(
            DATASET=dataset,  # pass dataset as hyper-parameter
            NUM_SAMPLES=num_samples,
            QUANTILES=str(quantiles),
        )

        # needed to set default output and code location properly
        if self.output_path is None:
            default_bucket = self.sagemaker_session.default_bucket()
            self.output_path = f"s3://{default_bucket}"

        if self.code_location is None:
            code_bucket, _ = parse_s3_url(self.output_path)
            self.code_location = (
                f"s3://{code_bucket}"  # for consistency with sagemaker API
            )

        locations = Locations(
            job_name=job_name,
            output_path=self.output_path,
            code_location=self.code_location,
        )

        logger.info(f"OUTPUT_PATH: {locations.job_output_path}")
        logger.info(f"CODE_LOCATION: {locations.job_code_location}")

        return locations
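The default-location logic above reuses the bucket of ``output_path`` for code uploads. A sketch of that derivation, assuming ``parse_s3_url`` splits an S3 URI into a (bucket, key prefix) pair as the tests further below demonstrate:

output_path = "s3://my-default-bucket"       # illustrative default output path
code_bucket, _ = parse_s3_url(output_path)   # ("my-default-bucket", "")
code_location = f"s3://{code_bucket}"        # "s3://my-default-bucket"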
Example #9
    def __init__(self,
                 model_data,
                 image,
                 role,
                 entry_point,
                 source_dir=None,
                 predictor_cls=None,
                 env=None,
                 name=None,
                 enable_cloudwatch_metrics=False,
                 container_log_level=logging.INFO,
                 code_location=None,
                 sagemaker_session=None,
                 dependencies=None,
                 git_config=None,
                 **kwargs):
        """Initialize a ``FrameworkModel``.

        Args:
            model_data (str): The S3 location of a SageMaker model data ``.tar.gz`` file.
            image (str): A Docker image URI.
            role (str): An IAM role name or ARN for SageMaker to access AWS resources on your behalf.
            entry_point (str): Path (absolute or relative) to the Python source file which should be executed
                as the entry point to model hosting. This should be compatible with either Python 2.7 or Python 3.5.
                If 'git_config' is provided, 'entry_point' should be a relative location to the Python source file in
                the Git repo.
                Example:

                    With the following GitHub repo directory structure:

                    >>> |----- README.md
                    >>> |----- src
                    >>>         |----- inference.py
                    >>>         |----- test.py

                    You can assign entry_point='src/inference.py'.
            git_config (dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch``,
                ``commit``, ``2FA_enabled``, ``username``, ``password`` and ``token``. The ``repo`` field is required.
                All other fields are optional. ``repo`` specifies the Git repository where your training script is
                stored. If you don't provide ``branch``, the default value 'master' is used. If you don't provide
                ``commit``, the latest commit in the specified branch is used.
                Example:

                    The following config:

                    >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git',
                    >>>               'branch': 'test-branch-git-config',
                    >>>               'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'}

                    results in cloning the repo specified in 'repo', then checking out the 'test-branch-git-config'
                    branch, and checking out the specified commit.
                ``2FA_enabled``, ``username``, ``password`` and ``token`` are used for authentication. For GitHub
                (or other Git) accounts, set ``2FA_enabled`` to 'True' if two-factor authentication is enabled for the
                account, otherwise set it to 'False'. If you do not provide a value for ``2FA_enabled``, a default
                value of 'False' is used. CodeCommit does not support two-factor authentication, so do not provide
                "2FA_enabled" with CodeCommit repositories.

                For GitHub and other Git repos, when SSH URLs are provided, it doesn't matter whether 2FA is
                enabled or disabled; you should either have no passphrase for the SSH key pairs, or have the ssh-agent
                configured so that you will not be prompted for the SSH passphrase when you run 'git clone' with SSH
                URLs. When HTTPS URLs are provided: if 2FA is disabled, then either token or username+password will be
                used for authentication if provided (token prioritized); if 2FA is enabled, only token will be used for
                authentication if provided. If the required authentication info is not provided, the Python SDK will
                try to use local credentials storage to authenticate. If that also fails, an error is raised.

                For CodeCommit repos, 2FA is not supported, so '2FA_enabled' should not be provided. There is no token
                in CodeCommit, so 'token' should not be provided either. When 'repo' is an SSH URL, the requirements
                are the same as for GitHub-like repos. When 'repo' is an HTTPS URL, username+password will be used for
                authentication if they are provided; otherwise, the Python SDK will try to use either the CodeCommit
                credential helper or local credential storage for authentication.
            source_dir (str): Path (absolute or relative) to a directory with any other training
                source code dependencies aside from the entry point file (default: None). Structure within this
                directory will be preserved when training on SageMaker. If 'git_config' is provided,
                'source_dir' should be a relative location to a directory in the Git repo. If the directory points
                to S3, no code will be uploaded and the S3 location will be used instead.
                Example:

                    With the following GitHub repo directory structure:

                    >>> |----- README.md
                    >>> |----- src
                    >>>         |----- inference.py
                    >>>         |----- test.py

                    You can assign entry_point='inference.py', source_dir='src'.
            dependencies (list[str]): A list of paths to directories (absolute or relative) with
                any additional libraries that will be exported to the container (default: []).
                The library folders will be copied to SageMaker in the same folder where the entrypoint is copied.
                If 'git_config' is provided, 'dependencies' should be a list of relative locations to directories
                with any additional libraries needed in the Git repo. If the ``source_dir`` points to S3, code
                will be uploaded and the S3 location will be used instead.
                Example:

                    The following call
                    >>> Estimator(entry_point='inference.py', dependencies=['my/libs/common', 'virtual-env'])
                    results in the following inside the container:

                    >>> $ ls

                    >>> opt/ml/code
                    >>>     |------ inference.py
                    >>>     |------ common
                    >>>     |------ virtual-env

            predictor_cls (callable[string, sagemaker.session.Session]): A function to call to create
               a predictor (default: None). If not None, ``deploy`` will return the result of invoking
               this function on the created endpoint name.
            env (dict[str, str]): Environment variables to run with ``image`` when hosted in SageMaker
               (default: None).
            name (str): The model name. If None, a default model name will be selected on each ``deploy``.
            enable_cloudwatch_metrics (bool): Whether training and hosting containers will
               generate CloudWatch metrics under the AWS/SageMakerContainer namespace (default: False).
            container_log_level (int): Log level to use within the container (default: logging.INFO).
                Valid values are defined in the Python logging module.
            code_location (str): Name of the S3 bucket where custom code is uploaded (default: None).
                If not specified, default bucket created by ``sagemaker.session.Session`` is used.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session object, used for SageMaker
               interactions (default: None). If not specified, one is created using the default AWS configuration chain.
            **kwargs: Keyword arguments passed to the ``Model`` initializer.
        """
        super(FrameworkModel,
              self).__init__(model_data,
                             image,
                             role,
                             predictor_cls=predictor_cls,
                             env=env,
                             name=name,
                             sagemaker_session=sagemaker_session,
                             **kwargs)
        self.entry_point = entry_point
        self.source_dir = source_dir
        self.dependencies = dependencies or []
        self.git_config = git_config
        self.enable_cloudwatch_metrics = enable_cloudwatch_metrics
        self.container_log_level = container_log_level
        if code_location:
            self.bucket, self.key_prefix = fw_utils.parse_s3_url(code_location)
        else:
            self.bucket, self.key_prefix = None, None
        if self.git_config:
            updates = git_utils.git_clone_repo(self.git_config,
                                               self.entry_point,
                                               self.source_dir,
                                               self.dependencies)
            self.entry_point = updates["entry_point"]
            self.source_dir = updates["source_dir"]
            self.dependencies = updates["dependencies"]
        self.uploaded_code = None
        self.repacked_model_data = None
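A hedged sketch of the ``git_config`` flow at the end of the constructor: ``git_utils.git_clone_repo`` returns a dict whose keys match the code above; the argument values and the commented results are hypothetical:

git_config = {
    "repo": "https://github.com/aws/sagemaker-python-sdk.git",
    "branch": "test-branch-git-config",
    "commit": "329bfcf884482002c05ff7f44f62599ebc9f445a",
}
updates = git_utils.git_clone_repo(git_config, "src/inference.py", None, [])
# updates["entry_point"]  -> path to inference.py inside the local clone
# updates["source_dir"]   -> local directory within the clone, or None
# updates["dependencies"] -> dependency paths rewritten to the clone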
Example #10
def test_parse_s3_url_fail():
    with pytest.raises(ValueError) as error:
        fw_utils.parse_s3_url("t3://code_location")
    assert "Expecting 's3' scheme" in str(error)
Example #11
def test_parse_s3_url():
    bucket, key_prefix = fw_utils.parse_s3_url("s3://bucket/code_location")
    assert "bucket" == bucket
    assert "code_location" == key_prefix
Example #12
def test_parse_s3_url_fail():
    with pytest.raises(ValueError) as error:
        fw_utils.parse_s3_url('t3://code_location')
    assert 'Expecting \'s3\' scheme' in str(error)
Example #13
def test_parse_s3_url():
    bucket, key_prefix = fw_utils.parse_s3_url('s3://bucket/code_location')
    assert 'bucket' == bucket
    assert 'code_location' == key_prefix
Example #14
def test_parse_s3_url_fail():
    with pytest.raises(ValueError) as error:
        parse_s3_url('t3://code_location')
    assert 'Expecting \'s3\' scheme' in str(error)
Example #15
def test_parse_s3_url():
    bucket, key_prefix = parse_s3_url('s3://bucket/code_location')
    assert 'bucket' == bucket
    assert 'code_location' == key_prefix
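Taken together, these tests pin down the contract of ``parse_s3_url``: it returns a (bucket, key prefix) pair for s3:// URLs and raises ``ValueError`` for any other scheme. A minimal sketch of an implementation consistent with them (the real SDK function may differ in details):

from urllib.parse import urlparse

def parse_s3_url(url):
    """Split an s3:// URL into (bucket, key prefix); reject other schemes."""
    parsed_url = urlparse(url)
    if parsed_url.scheme != "s3":
        raise ValueError("Expecting 's3' scheme, got: {} in {}".format(parsed_url.scheme, url))
    return parsed_url.netloc, parsed_url.path.lstrip("/")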