def test_create_tuning_kmeans_identical_dataset_algorithm_tuner(sagemaker_session,
                                                                kmeans_train_set,
                                                                kmeans_estimator,
                                                                hyperparameter_ranges):
    """Tests Identical dataset and algorithm use case with one parent and child job launched with
        .create_identical_dataset_and_algorithm_tuner() """

    parent_tuning_job_name = name_from_base("km-iden2-parent", max_length=32, short=True)
    child_tuning_job_name = name_from_base("km-iden2-child", max_length=32, short=True)

    parent_tuner = _tune(kmeans_estimator, kmeans_train_set, job_name=parent_tuning_job_name,
                         hyperparameter_ranges=hyperparameter_ranges, max_parallel_jobs=1, max_jobs=1)

    child_tuner = create_identical_dataset_and_algorithm_tuner(parent=parent_tuner.latest_tuning_job.name,
                                                               sagemaker_session=sagemaker_session)

    _tune(kmeans_estimator, kmeans_train_set, job_name=child_tuning_job_name, tuner=child_tuner, max_parallel_jobs=1,
          max_jobs=1)

    child_warm_start_config_response = WarmStartConfig.from_job_desc(
        sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job(
            HyperParameterTuningJobName=child_tuning_job_name)["WarmStartConfig"])

    assert child_warm_start_config_response.type == child_tuner.warm_start_config.type
    assert child_warm_start_config_response.parents == child_tuner.warm_start_config.parents
def test_create_transfer_learning_tuner(sagemaker_session,
                                        kmeans_train_set,
                                        kmeans_estimator,
                                        hyperparameter_ranges):
    """Tests Transfer learning use case with two parents and child job launched with
        create_transfer_learning_tuner() """
    parent_tuning_job_name_1 = name_from_base("km-tran2-parent1", max_length=32, short=True)
    parent_tuning_job_name_2 = name_from_base("km-tran2-parent2", max_length=32, short=True)
    child_tuning_job_name = name_from_base("km-tran2-child", max_length=32, short=True)

    parent_tuner_1 = _tune(kmeans_estimator, kmeans_train_set, job_name=parent_tuning_job_name_1,
                           hyperparameter_ranges=hyperparameter_ranges, max_parallel_jobs=1, max_jobs=1)

    parent_tuner_2 = _tune(kmeans_estimator, kmeans_train_set, job_name=parent_tuning_job_name_2,
                           hyperparameter_ranges=hyperparameter_ranges, max_parallel_jobs=1, max_jobs=1)

    child_tuner = create_transfer_learning_tuner(parent=parent_tuner_1.latest_tuning_job.name,
                                                 sagemaker_session=sagemaker_session,
                                                 estimator=kmeans_estimator,
                                                 additional_parents={parent_tuner_2.latest_tuning_job.name})
    _tune(kmeans_estimator, kmeans_train_set, job_name=child_tuning_job_name, tuner=child_tuner)

    child_warm_start_config_response = WarmStartConfig.from_job_desc(
        sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job(
            HyperParameterTuningJobName=child_tuning_job_name)["WarmStartConfig"])

    assert child_warm_start_config_response.type == child_tuner.warm_start_config.type
    assert child_warm_start_config_response.parents == child_tuner.warm_start_config.parents
Example #3
0
    def run_pre_training_bias(
        self,
        data_config,
        data_bias_config,
        methods="all",
        wait=True,
        logs=True,
        job_name=None,
        kms_key=None,
        experiment_config=None,
    ):
        """Runs a ProcessingJob to compute the requested bias 'methods' of the input data.

        Computes the requested methods that compare 'methods' (e.g. fraction of examples) for the
        sensitive group vs the other examples.

        Args:
            data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
            data_bias_config (:class:`~sagemaker.clarify.BiasConfig`): Config of sensitive groups.
            methods (str or list[str]): Selector of a subset of potential metrics:
                ["`CI <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-ci.html>`_",
                "`DPL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-dpl.html>`_",
                "`KL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-kl.html>`_",
                "`JS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-js.html>`_",
                "`LP <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-lp.html>`_",
                "`TVD <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-tvd.html>`_",
                "`KS <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-ks.html>`_",
                "`CDDL <https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-post-training-bias-metric-cdd.html>`_"].
                Defaults to computing all.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when ``wait`` is True (default: True).
            job_name (str): Processing job name. When ``job_name`` is not specified, if
                ``job_name_prefix`` in :class:`SageMakerClarifyProcessor` specified, the job name
                will be composed of ``job_name_prefix`` and current timestamp; otherwise use
                "Clarify-Pretraining-Bias" as prefix.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).
            experiment_config (dict[str, str]): Experiment management configuration.
                Dictionary contains three optional keys:
                'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
        """
        analysis_config = data_config.get_config()
        analysis_config.update(data_bias_config.get_config())
        analysis_config["methods"] = {
            "pre_training_bias": {
                "methods": methods
            }
        }
        if job_name is None:
            if self.job_name_prefix:
                job_name = utils.name_from_base(self.job_name_prefix)
            else:
                job_name = utils.name_from_base("Clarify-Pretraining-Bias")
        self._run(data_config, analysis_config, wait, logs, job_name, kms_key,
                  experiment_config)
Example #4
0
    def _prepare_for_training(self, job_name=None):
        if job_name is not None:
            self._current_job_name = job_name
        else:
            base_name = self.base_tuning_job_name or base_name_from_image(
                self.estimator.train_image())
            self._current_job_name = name_from_base(
                base_name,
                max_length=self.TUNING_JOB_NAME_MAX_LENGTH,
                short=True)

        self.static_hyperparameters = {
            to_str(k): to_str(v)
            for (k, v) in self.estimator.hyperparameters().items()
        }
        for hyperparameter_name in self._hyperparameter_ranges.keys():
            self.static_hyperparameters.pop(hyperparameter_name, None)

        # For attach() to know what estimator to use for non-1P algorithms
        # (1P algorithms don't accept extra hyperparameters)
        if not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
            self.static_hyperparameters[
                self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
                    self.estimator.__class__.__name__)
            self.static_hyperparameters[
                self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(
                    self.estimator.__module__)
Example #5
0
    def transform(self, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None,
                  job_name=None):
        """Start a new transform job.

        Args:
            data (str): Input data location in S3.
            data_type (str): What the S3 location defines (default: 'S3Prefix'). Valid values:

                * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix will be used as
                    inputs for the transform job.
                * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object to use as
                    an input for the transform job.

            content_type (str): MIME type of the input data (default: None).
            compression (str): Compression type of the input data, if compressed (default: None).
                Valid values: 'Gzip', None.
            split_type (str): The record delimiter for the input object (default: 'None').
                Valid values: 'None', 'Line', and 'RecordIO'.
            job_name (str): job name (default: None). If not specified, one will be generated.
        """
        if not data.startswith('s3://'):
            raise ValueError('Invalid S3 URI: {}'.format(data))

        if job_name is not None:
            self._current_job_name = job_name
        else:
            base_name = self.base_transform_job_name or base_name_from_image(self._retrieve_image_name())
            self._current_job_name = name_from_base(base_name)

        if self.output_path is None:
            self.output_path = 's3://{}/{}'.format(self.sagemaker_session.default_bucket(), self._current_job_name)

        self.latest_transform_job = _TransformJob.start_new(self, data, data_type, content_type, compression_type,
                                                            split_type)
def test_linear_learner_multiclass(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

        ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
                           predictor_type='multiclass_classifier', num_classes=10, sagemaker_session=sagemaker_session)

        ll.epochs = 1
        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]))

    endpoint_name = name_from_base('linear-learner')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):

        predictor = ll.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
Example #7
0
    def _create_sagemaker_model(self, *args):  # pylint: disable=unused-argument
        """Create a SageMaker Model Entity

        Args:
            *args: Arguments coming from the caller. This class
                does not require any so they are ignored.
        """
        if self.algorithm_arn:
            # When ModelPackage is created using an algorithm_arn we need to first
            # create a ModelPackage. If we had already created one then its fine to re-use it.
            if self._created_model_package_name is None:
                model_package_name = self._create_sagemaker_model_package()
                self.sagemaker_session.wait_for_model_package(
                    model_package_name)
                self._created_model_package_name = model_package_name
            model_package_name = self._created_model_package_name
        else:
            # When a ModelPackageArn is provided we just create the Model
            model_package_name = self.model_package_arn

        container_def = {"ModelPackageName": model_package_name}

        if self.env != {}:
            container_def["Environment"] = self.env

        model_package_short_name = model_package_name.split("/")[-1]
        enable_network_isolation = self.enable_network_isolation()
        self.name = self.name or utils.name_from_base(model_package_short_name)
        self.sagemaker_session.create_model(
            self.name,
            self.role,
            container_def,
            vpc_config=self.vpc_config,
            enable_network_isolation=enable_network_isolation,
        )
Example #8
0
def test_lda(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'lda')
        data_filename = 'nips-train_1.pbr'

        with open(os.path.join(data_path, data_filename), 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        feature_num = int(all_records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
                  sagemaker_session=sagemaker_session, base_job_name='test-lda')

        record_set = prepare_record_set_from_local_files(data_path, lda.data_location,
                                                         len(all_records), feature_num, sagemaker_session)
        lda.fit(record_set, 100)

    endpoint_name = name_from_base('lda')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = LDAModel(lda.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None
    def _upload_data_to_s3(
        self,
        data,
        input_path=None,
    ):
        """Upload request data to Amazon S3 for users"""
        if input_path:
            bucket, key = parse_s3_url(input_path)
        else:
            my_uuid = str(uuid.uuid4())
            timestamp = sagemaker_timestamp()
            bucket = self.sagemaker_session.default_bucket()
            key = "async-endpoint-inputs/{}/{}-{}".format(
                name_from_base(self.name, short=True),
                timestamp,
                my_uuid,
            )

        data = self.serializer.serialize(data)
        self.s3_client.put_object(Body=data,
                                  Bucket=bucket,
                                  Key=key,
                                  ContentType=self.serializer.CONTENT_TYPE)
        input_path = input_path or "s3://{}/{}".format(
            self.sagemaker_session.default_bucket(), key)

        return input_path
Example #10
0
    def run_post_training_bias(
        self,
        data_config,
        data_bias_config,
        model_config,
        model_predicted_label_config,
        methods="all",
        wait=True,
        logs=True,
        job_name=None,
        kms_key=None,
        experiment_config=None,
    ):
        """Runs a ProcessingJob to compute the requested bias 'methods' of the model predictions.

        Spins up a model endpoint, runs inference over the input example in the
        's3_data_input_path' to obtain predicted labels. Computes a the requested methods that
        compare 'methods' (e.g. accuracy, precision, recall) for the sensitive group vs the other
        examples.

        Args:
            data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
            data_bias_config (:class:`~sagemaker.clarify.BiasConfig`): Config of sensitive groups.
            model_config (:class:`~sagemaker.clarify.ModelConfig`): Config of the model and its
                endpoint to be created.
            model_predicted_label_config (:class:`~sagemaker.clarify.ModelPredictedLabelConfig`):
                Config of how to extract the predicted label from the model output.
            methods (str or list[str]): Selector of a subset of potential metrics:
                # TODO: Provide a pointer to the official documentation of those.
                ["DPPL", "DI", "DCA", "DCR", "RD", "DAR", "DRR", "AD", "CDDPL", "TE", "FT"].
                Defaults to computing all.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when ``wait`` is True (default: True).
            job_name (str): Processing job name. If not specified, a name is composed of
                "Clarify-Posttraining-Bias" and current timestamp.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).
            experiment_config (dict[str, str]): Experiment management configuration.
                Dictionary contains three optional keys:
                'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
        """
        analysis_config = data_config.get_config()
        analysis_config.update(data_bias_config.get_config())
        (
            probability_threshold,
            predictor_config,
        ) = model_predicted_label_config.get_predictor_config()
        predictor_config.update(model_config.get_predictor_config())
        analysis_config["methods"] = {
            "post_training_bias": {
                "methods": methods
            }
        }
        analysis_config["predictor"] = predictor_config
        _set(probability_threshold, "probability_threshold", analysis_config)
        if job_name is None:
            job_name = utils.name_from_base("Clarify-Posttraining-Bias")
        self._run(data_config, analysis_config, wait, logs, job_name, kms_key,
                  experiment_config)
def test_randomcutforest(sagemaker_session):
    with timeout(minutes=15):
        # Generate a thousand 14-dimensional datapoints.
        feature_num = 14
        train_input = np.random.rand(1000, feature_num)

        rcf = RandomCutForest(role='SageMakerRole',
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              num_trees=50,
                              num_samples_per_tree=20,
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-randomcutforest')

        rcf.fit(rcf.record_set(train_input))

    endpoint_name = name_from_base('randomcutforest')
    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=20):
        model = RandomCutForestModel(rcf.model_data,
                                     role='SageMakerRole',
                                     sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
Example #12
0
    def run_pre_training_bias(
        self,
        data_config,
        data_bias_config,
        methods="all",
        wait=True,
        logs=True,
        job_name=None,
        kms_key=None,
    ):
        """Runs a ProcessingJob to compute the requested bias 'methods' of the input data.

        Computes the requested methods that compare 'methods' (e.g. fraction of examples) for the
        sensitive group vs the other examples.

        Args:
            data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
            data_bias_config (:class:`~sagemaker.clarify.BiasConfig`): Config of sensitive groups.
            methods (str or list[str]): Selector of a subset of potential metrics:
                ["CI", "DPL", "KL", "JS", "LP", "TVD", "KS", "CDDL"]. Defaults to computing all.
                # TODO: Provide a pointer to the official documentation of those.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when ``wait`` is True (default: True).
            job_name (str): Processing job name. If not specified, a name is composed of
                "Clarify-Pretraining-Bias" and current timestamp.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).
        """
        analysis_config = data_config.get_config()
        analysis_config.update(data_bias_config.get_config())
        analysis_config["methods"] = {"pre_training_bias": {"methods": methods}}
        if job_name is None:
            job_name = utils.name_from_base("Clarify-Pretraining-Bias")
        self._run(data_config, analysis_config, wait, logs, job_name, kms_key)
Example #13
0
def test_knn_regressor(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        knn = KNN(role='SageMakerRole', train_instance_count=1,
                  train_instance_type='ml.c4.xlarge',
                  k=10, predictor_type='regressor', sample_size=500,
                  sagemaker_session=sagemaker_session, base_job_name='test-knn-rr')

        # training labels must be 'float32'
        knn.fit(knn.record_set(train_set[0][:200], train_set[1][:200].astype('float32')))

    endpoint_name = name_from_base('knn')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = KNNModel(knn.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
Example #14
0
def prepare_framework_container_def(model, instance_type, s3_operations):
    """Prepare the framework model container information. Specify related S3
    operations for Airflow to perform. (Upload `source_dir` )

    Args:
        model (sagemaker.model.FrameworkModel): The framework model
        instance_type (str): The EC2 instance type to deploy this Model to. For
            example, 'ml.p2.xlarge'.
        s3_operations (dict): The dict to specify S3 operations (upload
            `source_dir` ).

    Returns:
        dict: The container information of this framework model.
    """
    deploy_image = model.image
    if not deploy_image:
        region_name = model.sagemaker_session.boto_session.region_name
        deploy_image = fw_utils.create_image_uri(
            region_name,
            model.__framework_name__,
            instance_type,
            model.framework_version,
            model.py_version,
        )

    base_name = utils.base_name_from_image(deploy_image)
    model.name = model.name or utils.name_from_base(base_name)

    bucket = model.bucket or model.sagemaker_session._default_bucket
    script = os.path.basename(model.entry_point)
    key = "{}/source/sourcedir.tar.gz".format(model.name)

    if model.source_dir and model.source_dir.lower().startswith("s3://"):
        code_dir = model.source_dir
        model.uploaded_code = fw_utils.UploadedCode(s3_prefix=code_dir,
                                                    script_name=script)
    else:
        code_dir = "s3://{}/{}".format(bucket, key)
        model.uploaded_code = fw_utils.UploadedCode(s3_prefix=code_dir,
                                                    script_name=script)
        s3_operations["S3Upload"] = [{
            "Path": model.source_dir or script,
            "Bucket": bucket,
            "Key": key,
            "Tar": True
        }]

    deploy_env = dict(model.env)
    deploy_env.update(model._framework_env_vars())

    try:
        if model.model_server_workers:
            deploy_env[
                sagemaker.model.MODEL_SERVER_WORKERS_PARAM_NAME.upper()] = str(
                    model.model_server_workers)
    except AttributeError:
        # This applies to a FrameworkModel which is not SageMaker Deep Learning Framework Model
        pass

    return sagemaker.container_def(deploy_image, model.model_data, deploy_env)
Example #15
0
def test_factorization_machines(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        fm = FactorizationMachines(role='SageMakerRole', train_instance_count=1,
                                   train_instance_type='ml.c4.xlarge',
                                   num_factors=10, predictor_type='regressor',
                                   epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0/100,
                                   sagemaker_session=sagemaker_session, base_job_name='test-fm')

        # training labels must be 'float32'
        fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')))

    endpoint_name = name_from_base('fm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = FactorizationMachinesModel(fm.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
def test_pca(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        pca = sagemaker.amazon.pca.PCA(role='SageMakerRole', train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       num_components=48, sagemaker_session=sagemaker_session, base_job_name='test-pca')

        pca.algorithm_mode = 'randomized'
        pca.subtract_mean = True
        pca.extra_components = 5
        pca.fit(pca.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('pca')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        pca_model = sagemaker.amazon.pca.PCAModel(model_data=pca.model_data, role='SageMakerRole',
                                                  sagemaker_session=sagemaker_session)
        predictor = pca_model.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge",
                                     endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][:5])

        assert len(result) == 5
        for record in result:
            assert record.label["projection"] is not None
    def update_data_capture_config(self, data_capture_config):
        """Updates the DataCaptureConfig for the Predictor's associated Amazon SageMaker Endpoint
        with the provided DataCaptureConfig.

        Args:
            data_capture_config (sagemaker.model_monitor.DataCaptureConfig): The
                DataCaptureConfig to update the predictor's endpoint to use.
        """
        endpoint_desc = self.sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=self.endpoint_name)

        new_config_name = name_from_base(base=self.endpoint_name)

        data_capture_config_dict = None
        if data_capture_config is not None:
            data_capture_config_dict = data_capture_config._to_request_dict()

        self.sagemaker_session.create_endpoint_config_from_existing(
            existing_config_name=endpoint_desc["EndpointConfigName"],
            new_config_name=new_config_name,
            new_data_capture_config_dict=data_capture_config_dict,
        )

        self.sagemaker_session.update_endpoint(
            endpoint_name=self.endpoint_name,
            endpoint_config_name=new_config_name)
Example #18
0
    def _prepare_for_training(self, job_name=None):
        """Set any values in the estimator that need to be set before training.

        Args:
            * job_name (str): Name of the training job to be created. If not specified, one is generated,
                using the base name given to the constructor if applicable.
        """
        if job_name is not None:
            self._current_job_name = job_name
        else:
            # honor supplied base_job_name or generate it
            base_name = self.base_job_name or base_name_from_image(
                self.train_image())
            self._current_job_name = name_from_base(base_name)

        # if output_path was specified we use it otherwise initialize here.
        # For Local Mode with local_code=True we don't need an explicit output_path
        if self.output_path is None:
            local_code = get_config_value('local.local_code',
                                          self.sagemaker_session.config)
            if self.sagemaker_session.local_mode and local_code:
                self.output_path = ''
            else:
                self.output_path = 's3://{}/'.format(
                    self.sagemaker_session.default_bucket())
def test_linear_learner_multiclass(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

        ll = LinearLearner('SageMakerRole',
                           1,
                           'ml.c4.2xlarge',
                           base_job_name='test-linear-learner',
                           predictor_type='multiclass_classifier',
                           num_classes=10,
                           sagemaker_session=sagemaker_session)

        ll.epochs = 1
        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]))

    endpoint_name = name_from_base('linear-learner')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):

        predictor = ll.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
    def transform(self, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None,
                  job_name=None):
        """Start a new transform job.

        Args:
            data (str): Input data location in S3.
            data_type (str): What the S3 location defines (default: 'S3Prefix'). Valid values:

                * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix will be used as
                    inputs for the transform job.
                * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object to use as
                    an input for the transform job.

            content_type (str): MIME type of the input data (default: None).
            compression (str): Compression type of the input data, if compressed (default: None).
                Valid values: 'Gzip', None.
            split_type (str): The record delimiter for the input object (default: 'None').
                Valid values: 'None', 'Line', and 'RecordIO'.
            job_name (str): job name (default: None). If not specified, one will be generated.
        """
        if not data.startswith('s3://'):
            raise ValueError('Invalid S3 URI: {}'.format(data))

        if job_name is not None:
            self._current_job_name = job_name
        else:
            base_name = self.base_transform_job_name or base_name_from_image(self._retrieve_image_name())
            self._current_job_name = name_from_base(base_name)

        if self.output_path is None:
            self.output_path = 's3://{}/{}'.format(self.sagemaker_session.default_bucket(), self._current_job_name)

        self.latest_transform_job = _TransformJob.start_new(self, data, data_type, content_type, compression_type,
                                                            split_type)
def test_kmeans(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterators = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example #22
0
def test_pca():
    with timeout(minutes=15):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        pca = sagemaker.amazon.pca.PCA(role='SageMakerRole', train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       num_components=48, sagemaker_session=sagemaker_session, base_job_name='test-pca')

        pca.algorithm_mode = 'randomized'
        pca.subtract_mean = True
        pca.extra_components = 5
        pca.fit(pca.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('pca')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        pca_model = sagemaker.amazon.pca.PCAModel(model_data=pca.model_data, role='SageMakerRole',
                                                  sagemaker_session=sagemaker_session)
        predictor = pca_model.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge",
                                     endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][:5])

        assert len(result) == 5
        for record in result:
            assert record.label["projection"] is not None
Example #23
0
def test_ntm(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'ntm')
        data_filename = 'nips-train_1.pbr'

        with open(os.path.join(data_path, data_filename), 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        feature_num = int(all_records[0].features['values'].float32_tensor.shape[0])

        ntm = NTM(role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', num_topics=10,
                  sagemaker_session=sagemaker_session, base_job_name='test-ntm')

        record_set = prepare_record_set_from_local_files(data_path, ntm.data_location,
                                                         len(all_records), feature_num, sagemaker_session)
        ntm.fit(record_set, None)

    endpoint_name = name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = NTMModel(ntm.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["topic_weights"] is not None
def test_factorization_machines(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        fm = FactorizationMachines(role='SageMakerRole', train_instance_count=1,
                                   train_instance_type='ml.c4.xlarge',
                                   num_factors=10, predictor_type='regressor',
                                   epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0 / 100,
                                   sagemaker_session=sagemaker_session, base_job_name='test-fm')

        # training labels must be 'float32'
        fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')))

    endpoint_name = name_from_base('fm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = FactorizationMachinesModel(fm.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
    def _prepare_for_training(self, job_name=None, include_cls_metadata=False):
        """
        Args:
            job_name:
            include_cls_metadata:
        """
        if job_name is not None:
            self._current_job_name = job_name
        else:
            base_name = self.base_tuning_job_name or base_name_from_image(
                self.estimator.train_image()
            )
            self._current_job_name = name_from_base(
                base_name, max_length=self.TUNING_JOB_NAME_MAX_LENGTH, short=True
            )

        self.static_hyperparameters = {
            to_str(k): to_str(v) for (k, v) in self.estimator.hyperparameters().items()
        }
        for hyperparameter_name in self._hyperparameter_ranges.keys():
            self.static_hyperparameters.pop(hyperparameter_name, None)

        # For attach() to know what estimator to use for frameworks
        # (other algorithms may not accept extra hyperparameters)
        if include_cls_metadata or isinstance(self.estimator, Framework):
            self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
                self.estimator.__class__.__name__
            )
            self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(
                self.estimator.__module__
            )
def test_tuning_kmeans_identical_dataset_algorithm_tuner_from_non_terminal_parent(sagemaker_session,
                                                                                  kmeans_train_set,
                                                                                  kmeans_estimator,
                                                                                  hyperparameter_ranges):
    """Tests Identical dataset and algorithm use case with one non terminal parent and child job launched with
    .identical_dataset_and_algorithm_tuner() """
    parent_tuning_job_name = name_from_base("km-non-term", max_length=32, short=True)
    child_tuning_job_name = name_from_base("km-non-term-child", max_length=32, short=True)

    parent_tuner = _tune(kmeans_estimator, kmeans_train_set, job_name=parent_tuning_job_name,
                         hyperparameter_ranges=hyperparameter_ranges, wait_till_terminal=False, max_parallel_jobs=1,
                         max_jobs=1)

    child_tuner = parent_tuner.identical_dataset_and_algorithm_tuner()
    with pytest.raises(ClientError):
        _tune(kmeans_estimator, kmeans_train_set, job_name=child_tuning_job_name, tuner=child_tuner,
              max_parallel_jobs=1, max_jobs=1)
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, 'train',
                                                          key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole',
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name,
                                     sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1,
                                 'ml.m4.xlarge',
                                 endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def test_async_kmeans():

    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(
            region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterators = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=35):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example #29
0
def test_async_factorization_machines():

    training_job_name = ""
    endpoint_name = name_from_base('factorizationMachines')
    sagemaker_session = sagemaker.Session(boto_session=boto3.Session(
        region_name=REGION))

    with timeout(minutes=5):

        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        fm = FactorizationMachines(role='SageMakerRole',
                                   train_instance_count=1,
                                   train_instance_type='ml.c4.xlarge',
                                   num_factors=10,
                                   predictor_type='regressor',
                                   epochs=2,
                                   clip_gradient=1e2,
                                   eps=0.001,
                                   rescale_grad=1.0 / 100,
                                   sagemaker_session=sagemaker_session,
                                   base_job_name='test-fm')

        # training labels must be 'float32'
        fm.fit(fm.record_set(train_set[0][:200],
                             train_set[1][:200].astype('float32')),
               wait=False)
        training_job_name = fm.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=35):
        estimator = FactorizationMachines.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session)
        model = FactorizationMachinesModel(estimator.model_data,
                                           role='SageMakerRole',
                                           sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
Example #30
0
    def _create_sagemaker_model_package(self):
        if self.algorithm_arn is None:
            raise ValueError('No algorithm_arn was provided to create a SageMaker Model Pacakge')

        name = self.name or utils.name_from_base(self.algorithm_arn.split('/')[-1])
        description = 'Model Package created from training with %s' % self.algorithm_arn
        self.sagemaker_session.create_model_package_from_algorithm(name, description,
                                                                   self.algorithm_arn,
                                                                   self.model_data)
        return name
    def fit(self, inputs, wait=True, logs=True, job_name=None):
        """Train a model using the input training dataset.

        The API calls the Amazon SageMaker CreateTrainingJob API to start model training.
        The API uses configuration you provided to create the estimator and the
        specified input training data to send the CreatingTrainingJob request to Amazon SageMaker.

        This is a synchronous operation. After the model training successfully completes,
        you can call the ``deploy()`` method to host the model using the Amazon SageMaker hosting services.

        Args:
            inputs (str or dict or sagemaker.session.s3_input): Information about the training data.
                This can be one of three types:
                (str) - the S3 location where training data is saved.
                (dict[str, str] or dict[str, sagemaker.session.s3_input]) - If using multiple channels for
                    training data, you can specify a dict mapping channel names
                    to strings or :func:`~sagemaker.session.s3_input` objects.
                (sagemaker.session.s3_input) - channel configuration for S3 data sources that can provide
                    additional information about the training dataset. See :func:`sagemaker.session.s3_input`
                    for full details.
            wait (bool): Whether the call shouldl wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when wait is True (default: True).
            job_name (str): Training job name. If not specified, the estimator generates a default job name,
                based on the training image name and current timestamp.
        """
        # always determine new job name _here_ because it is used before base is called
        if job_name is not None:
            self._current_job_name = job_name
        else:
            # honor supplied base_job_name or generate it
            base_name = self.base_job_name or base_name_from_image(self.train_image())
            self._current_job_name = name_from_base(base_name)

        if self.code_location is None:
            code_bucket = self.sagemaker_session.default_bucket()
            code_s3_prefix = '{}/source'.format(self._current_job_name)
        else:
            code_bucket, key_prefix = parse_s3_url(self.code_location)
            code_s3_prefix = '{}/{}/source'.format(key_prefix, self._current_job_name)

        self.uploaded_code = tar_and_upload_dir(session=self.sagemaker_session.boto_session,
                                                bucket=code_bucket,
                                                s3_key_prefix=code_s3_prefix,
                                                script=self.entry_point,
                                                directory=self.source_dir)

        # Modify hyperparameters in-place to add the URLs to the uploaded code.
        self._hyperparameters[DIR_PARAM_NAME] = self.uploaded_code.s3_prefix
        self._hyperparameters[SCRIPT_PARAM_NAME] = self.uploaded_code.script_name
        self._hyperparameters[CLOUDWATCH_METRICS_PARAM_NAME] = self.enable_cloudwatch_metrics
        self._hyperparameters[CONTAINER_LOG_LEVEL_PARAM_NAME] = self.container_log_level
        self._hyperparameters[JOB_NAME_PARAM_NAME] = self._current_job_name
        self._hyperparameters[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_session.region_name
        super(Framework, self).fit(inputs, wait, logs, self._current_job_name)
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
Example #33
0
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(prefix, 'train', key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
def test_tuning_kmeans_identical_dataset_algorithm_tuner_raw(sagemaker_session,
                                                             kmeans_train_set,
                                                             kmeans_estimator,
                                                             hyperparameter_ranges):
    parent_tuning_job_name = name_from_base("kmeans-identical", max_length=32, short=True)
    child_tuning_job_name = name_from_base("c-kmeans-identical", max_length=32, short=True)
    _tune(kmeans_estimator, kmeans_train_set, job_name=parent_tuning_job_name,
          hyperparameter_ranges=hyperparameter_ranges, max_parallel_jobs=1, max_jobs=1)
    child_tuner = _tune(kmeans_estimator, kmeans_train_set, job_name=child_tuning_job_name,
                        hyperparameter_ranges=hyperparameter_ranges,
                        warm_start_config=WarmStartConfig(warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM,
                                                          parents=[parent_tuning_job_name]), max_parallel_jobs=1,
                        max_jobs=1)

    child_warm_start_config_response = WarmStartConfig.from_job_desc(
        sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job(
            HyperParameterTuningJobName=child_tuning_job_name)["WarmStartConfig"])

    assert child_warm_start_config_response.type == child_tuner.warm_start_config.type
    assert child_warm_start_config_response.parents == child_tuner.warm_start_config.parents
Example #37
0
    def _create_sagemaker_model_package(self):
        """Placeholder docstring"""
        if self.algorithm_arn is None:
            raise ValueError("No algorithm_arn was provided to create a SageMaker Model Pacakge")

        name = self.name or utils.name_from_base(self.algorithm_arn.split("/")[-1])
        description = "Model Package created from training with %s" % self.algorithm_arn
        self.sagemaker_session.create_model_package_from_algorithm(
            name, description, self.algorithm_arn, self.model_data
        )
        return name
Example #38
0
    def _upload_monitoring_analysis_config(self) -> str:
        """Generate and upload monitoring schedule analysis config to s3

        Returns:
            str: The S3 uri of the uploaded monitoring schedule analysis config
        """

        output_s3_uri = self._get_s3_base_uri_for_monitoring_analysis_config()

        if isinstance(self.clarify_check_config,
                      ModelExplainabilityCheckConfig):
            # Explainability analysis doesn't need label
            headers = copy.deepcopy(
                self.clarify_check_config.data_config.headers)
            if headers and self.clarify_check_config.data_config.label in headers:
                headers.remove(self.clarify_check_config.data_config.label)
            explainability_analysis_config = ExplainabilityAnalysisConfig(
                explainability_config=self.clarify_check_config.
                explainability_config,
                model_config=self.clarify_check_config.model_config,
                headers=headers,
            )
            analysis_config = explainability_analysis_config._to_dict()
            if "predictor" in analysis_config and "model_name" in analysis_config[
                    "predictor"]:
                analysis_config["predictor"].pop("model_name")
            job_definition_name = name_from_base(
                f"{_EXPLAINABILITY_MONITORING_CFG_BASE_NAME}-config")

        else:
            bias_analysis_config = BiasAnalysisConfig(
                bias_config=self.clarify_check_config.data_bias_config,
                headers=self.clarify_check_config.data_config.headers,
                label=self.clarify_check_config.data_config.label,
            )
            analysis_config = bias_analysis_config._to_dict()
            job_definition_name = name_from_base(
                f"{_BIAS_MONITORING_CFG_BASE_NAME}-config")

        return self._model_monitor._upload_analysis_config(
            analysis_config, output_s3_uri, job_definition_name)
Example #39
0
    def run_explainability(
        self,
        data_config,
        model_config,
        explainability_config,
        model_scores=None,
        wait=True,
        logs=True,
        job_name=None,
        kms_key=None,
        experiment_config=None,
    ):
        """Runs a ProcessingJob computing for each example in the input the feature importance.

        Currently, only SHAP is supported as explainability method.

        Spins up a model endpoint.
        For each input example in the 's3_data_input_path' the SHAP algorithm determines
        feature importance, by creating 'num_samples' copies of the example with a subset
        of features replaced with values from the 'baseline'.
        Model inference is run to see how the prediction changes with the replaced features.
        If the model output returns multiple scores importance is computed for each of them.
        Across examples, feature importance is aggregated using 'agg_method'.

        Args:
            data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data.
            model_config (:class:`~sagemaker.clarify.ModelConfig`): Config of the model and its
                endpoint to be created.
            explainability_config (:class:`~sagemaker.clarify.ExplainabilityConfig`): Config of the
                specific explainability method. Currently, only SHAP is supported.
            model_scores:  Index or JSONPath location in the model output for the predicted scores
                to be explained. This is not required if the model output is a single score.
            wait (bool): Whether the call should wait until the job completes (default: True).
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when ``wait`` is True (default: True).
            job_name (str): Processing job name. If not specified, a name is composed of
                "Clarify-Explainability" and current timestamp.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).
            experiment_config (dict[str, str]): Experiment management configuration.
                Dictionary contains three optional keys:
                'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
        """
        analysis_config = data_config.get_config()
        predictor_config = model_config.get_predictor_config()
        _set(model_scores, "label", predictor_config)
        analysis_config[
            "methods"] = explainability_config.get_explainability_config()
        analysis_config["predictor"] = predictor_config
        if job_name is None:
            job_name = utils.name_from_base("Clarify-Explainability")
        self._run(data_config, analysis_config, wait, logs, job_name, kms_key,
                  experiment_config)
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(prefix, 'train', key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
Example #41
0
    def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
        if job_name is not None:
            self._current_job_name = job_name
        else:
            base_name = self.base_tuning_job_name or base_name_from_image(self.estimator.train_image())
            self._current_job_name = name_from_base(base_name, max_length=self.TUNING_JOB_NAME_MAX_LENGTH, short=True)

        self.static_hyperparameters = {to_str(k): to_str(v) for (k, v) in self.estimator.hyperparameters().items()}
        for hyperparameter_name in self._hyperparameter_ranges.keys():
            self.static_hyperparameters.pop(hyperparameter_name, None)

        # For attach() to know what estimator to use for non-1P algorithms
        # (1P algorithms don't accept extra hyperparameters)
        if include_cls_metadata and not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
            self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
                self.estimator.__class__.__name__)
            self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(self.estimator.__module__)
    def _prepare_for_training(self, job_name=None):
        """Set any values in the estimator that need to be set before training.

        Args:
            * job_name (str): Name of the training job to be created. If not specified, one is generated,
                using the base name given to the constructor if applicable.
        """
        if job_name is not None:
            self._current_job_name = job_name
        else:
            # honor supplied base_job_name or generate it
            base_name = self.base_job_name or base_name_from_image(self.train_image())
            self._current_job_name = name_from_base(base_name)

        # if output_path was specified we use it otherwise initialize here.
        # For Local Mode with local_code=True we don't need an explicit output_path
        if self.output_path is None:
            local_code = get_config_value('local.local_code', self.sagemaker_session.config)
            if self.sagemaker_session.local_mode and local_code:
                self.output_path = ''
            else:
                self.output_path = 's3://{}/'.format(self.sagemaker_session.default_bucket())
Example #43
0
def test_async_knn_classifier(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('knn')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        knn = KNN(role='SageMakerRole',
                  train_instance_count=1, train_instance_type='ml.c4.xlarge',
                  k=10, predictor_type='classifier', sample_size=500,
                  index_type='faiss.IVFFlat', index_metric='L2',
                  sagemaker_session=sagemaker_session, base_job_name='test-knn-cl')

        # training labels must be 'float32'
        knn.fit(knn.record_set(train_set[0][:200], train_set[1][:200].astype('float32')), wait=False)
        training_job_name = knn.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KNN.attach(training_job_name=training_job_name,
                               sagemaker_session=sagemaker_session)
        model = KNNModel(estimator.model_data, role='SageMakerRole',
                         sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
def test_randomcutforest(sagemaker_session):
    with timeout(minutes=15):
        # Generate a thousand 14-dimensional datapoints.
        feature_num = 14
        train_input = np.random.rand(1000, feature_num)

        rcf = RandomCutForest(role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge',
                              num_trees=50, num_samples_per_tree=20, sagemaker_session=sagemaker_session,
                              base_job_name='test-randomcutforest')

        rcf.fit(rcf.record_set(train_input))

    endpoint_name = name_from_base('randomcutforest')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        model = RandomCutForestModel(rcf.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["score"] is not None
            assert len(record.label["score"].float32_tensor.values) == 1
def test_name_from_base(sagemaker_timestamp):
    name_from_base(NAME, short=False)
    assert sagemaker_timestamp.called_once
def test_name_from_base_short(sagemaker_short_timestamp):
    name_from_base(NAME, short=True)
    assert sagemaker_short_timestamp.called_once
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"

    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
def test_linear_learner(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

        ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
                           predictor_type='binary_classifier', sagemaker_session=sagemaker_session)
        ll.binary_classifier_model_selection_criteria = 'accuracy'
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = 'uniform'
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = 'adam'
        ll.loss = 'logistic'
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]))

    endpoint_name = name_from_base('linear-learner')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):

        predictor = ll.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None