Example #1
def test_deploy_model(
    sklearn_training_job,
    sagemaker_session,
    cpu_instance_type,
    sklearn_latest_version,
    sklearn_latest_py_version,
):
    endpoint_name = "test-sklearn-deploy-model-{}".format(
        sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=sklearn_training_job)
        model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
        script_path = os.path.join(DATA_DIR, "sklearn_mnist", "mnist.py")
        model = SKLearnModel(
            model_data,
            "SageMakerRole",
            entry_point=script_path,
            framework_version=sklearn_latest_version,
            sagemaker_session=sagemaker_session,
        )
        predictor = model.deploy(1,
                                 cpu_instance_type,
                                 endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_model(sagemaker_session):
    model = SKLearnModel("s3://some/data.tar.gz",
                         role=ROLE,
                         entry_point=SCRIPT_PATH,
                         sagemaker_session=sagemaker_session)
    predictor = model.deploy(1, CPU)
    assert isinstance(predictor, SKLearnPredictor)
Example #3
def main():
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    dummy_model_file = Path("dummy.model")
    dummy_model_file.touch()

    with tarfile.open("model.tar.gz", "w:gz") as tar:
        tar.add(dummy_model_file.as_posix())

    # For local training a dummy role will be sufficient
    role = DUMMY_IAM_ROLE

    model = SKLearnModel(role=role,
                         model_data='file://./model.tar.gz',
                         framework_version='0.23-1',
                         py_version='py3',
                         source_dir='code',
                         entry_point='inference.py')

    print('Deploying endpoint in local mode')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )
    predictor = model.deploy(
        initial_instance_count=1,
        instance_type='local',
    )

    do_inference_on_local_endpoint(predictor)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint(predictor.endpoint_name)
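
The helper do_inference_on_local_endpoint is called above but not shown in this snippet. A minimal sketch of what such a helper could look like, assuming the local endpoint accepts CSV-serialized rows and returns JSON (the payload shape and wire formats are assumptions that depend on inference.py):

from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer


def do_inference_on_local_endpoint(predictor):
    # Use simple CSV-in / JSON-out wire formats for the local endpoint.
    predictor.serializer = CSVSerializer()
    predictor.deserializer = JSONDeserializer()

    # Send one placeholder row and print the response; real inputs depend on the model.
    sample = '0.5,1.2,3.4,0.0'
    print('Prediction:', predictor.predict(sample))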
Example #4
def test_deploy_model(sklearn_training_job, sagemaker_session):
    endpoint_name = 'test-sklearn-deploy-model-{}'.format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=sklearn_training_job)
        model_data = desc['ModelArtifacts']['S3ModelArtifacts']
        script_path = os.path.join(DATA_DIR, 'sklearn_mnist', 'mnist.py')
        model = SKLearnModel(model_data, 'SageMakerRole', entry_point=script_path, sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_create_model(sagemaker_session):
    source_dir = 's3://mybucket/source'

    sklearn_model = SKLearnModel(model_data=source_dir,
                                 role=ROLE,
                                 sagemaker_session=sagemaker_session,
                                 entry_point=SCRIPT_PATH)
    default_image_uri = _get_full_cpu_image_uri('0.20.0')
    model_values = sklearn_model.prepare_container_def(CPU)
    assert model_values['Image'] == default_image_uri
def test_create_model(sagemaker_session, sklearn_version):
    source_dir = "s3://mybucket/source"

    sklearn_model = SKLearnModel(
        model_data=source_dir,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        entry_point=SCRIPT_PATH,
        framework_version=sklearn_version,
    )
    image_uri = _get_full_cpu_image_uri(sklearn_version)
    model_values = sklearn_model.prepare_container_def(CPU)
    assert model_values["Image"] == image_uri
def test_create_model_with_network_isolation(upload, sagemaker_session):
    source_dir = "s3://mybucket/source"
    repacked_model_data = "s3://mybucket/prefix/model.tar.gz"

    sklearn_model = SKLearnModel(
        model_data=source_dir,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        entry_point=SCRIPT_PATH,
        enable_network_isolation=True,
    )
    sklearn_model.uploaded_code = UploadedCode(s3_prefix=repacked_model_data, script_name="script")
    sklearn_model.repacked_model_data = repacked_model_data
    model_values = sklearn_model.prepare_container_def(CPU)
    assert model_values["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"] == "/opt/ml/model/code"
    assert model_values["ModelDataUrl"] == repacked_model_data
def test_model_custom_serialization(sagemaker_session, sklearn_version):
    model = SKLearnModel(
        "s3://some/data.tar.gz",
        role=ROLE,
        entry_point=SCRIPT_PATH,
        framework_version=sklearn_version,
        sagemaker_session=sagemaker_session,
    )
    custom_serializer = Mock()
    custom_deserializer = Mock()
    predictor = model.deploy(
        1,
        CPU,
        serializer=custom_serializer,
        deserializer=custom_deserializer,
    )
    assert isinstance(predictor, SKLearnPredictor)
    assert predictor.serializer is custom_serializer
    assert predictor.deserializer is custom_deserializer
def test_model_py2_raises(sagemaker_session, sklearn_version):
    source_dir = "s3://mybucket/source"

    with pytest.raises(AttributeError):
        SKLearnModel(
            model_data=source_dir,
            role=ROLE,
            entry_point=SCRIPT_PATH,
            sagemaker_session=sagemaker_session,
            framework_version=sklearn_version,
            py_version="py2",
        )
Example #10
def test_model_py2_warning(warning, sagemaker_session):
    source_dir = "s3://mybucket/source"

    model = SKLearnModel(
        model_data=source_dir,
        role=ROLE,
        entry_point=SCRIPT_PATH,
        sagemaker_session=sagemaker_session,
        py_version="py2",
    )
    assert model.py_version == "py2"
    warning.assert_called_with(model.__framework_name__, defaults.LATEST_PY2_VERSION)
Example #11
def main():

    # Prepare data for model inference - we use the California housing dataset
    print('Preparing data for model inference')
    data = fetch_california_housing()
    X_train, X_test, y_train, y_test = train_test_split(data.data,
                                                        data.target,
                                                        test_size=0.25,
                                                        random_state=42)

    # we don't train a model, so we will need only the testing data
    testX = pd.DataFrame(X_test, columns=data.feature_names)

    # Download a pre-trained model file
    print('Downloading a pre-trained model file')
    s3.download_file(
        'aws-ml-blog',
        'artifacts/scikit_learn_bring_your_own_model/model.joblib',
        'model.joblib')

    # Creating a model.tar.gz file
    tar = tarfile.open('model.tar.gz', 'w:gz')
    tar.add('model.joblib')
    tar.close()

    model = SKLearnModel(role=DUMMY_IAM_ROLE,
                         model_data='file://./model.tar.gz',
                         framework_version='0.23-1',
                         py_version='py3',
                         source_dir='code',
                         entry_point='inference.py')

    print('Deploying endpoint in local mode')
    predictor = model.deploy(initial_instance_count=1, instance_type='local')

    predictions = predictor.predict(testX[data.feature_names].head(5))
    print(f"Predictions: {predictions}")

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint(predictor.endpoint_name)
    def create_model(
        self, estimator, role, sagemaker_session, transform_mode, **kwargs
    ):
        """Create a deployable data transformer model
        Args:
            estimator: an estimator attached from trainer
            sagemaker_session:
        :return: an SKLearnModel
        """

        environment = dict(self.DEFAULT_TRANSFORMER_ENV)
        environment["AUTOML_TRANSFORM_MODE"] = transform_mode or "feature-transform"

        return SKLearnModel(
            model_data=estimator.model_data,
            role=role,
            entry_point=f"{self.source_module_path}/{self.SERVE_ENTRY_POINT}",
            env=environment,
            image=self.transformer_image_uri,
            sagemaker_session=sagemaker_session,
            **kwargs,
        )
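
A hedged usage sketch for the method above, assuming builder is an instance of the enclosing helper class and attached_estimator is an estimator attached to a completed training job (both names are placeholders):

model = builder.create_model(
    estimator=attached_estimator,
    role="SageMakerRole",                 # placeholder role name
    sagemaker_session=sagemaker_session,
    transform_mode=None,                  # falls back to "feature-transform"
)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")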
def test_sklearn_xgboost_sip_model_registration(sagemaker_session, role,
                                                pipeline_name, region_name):
    prefix = "sip"
    bucket_name = sagemaker_session.default_bucket()
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")

    sklearn_processor = SKLearnProcessor(
        role=role,
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.20.0",
        sagemaker_session=sagemaker_session,
    )

    # The path to the raw data.
    raw_data_path = "s3://{0}/{1}/data/raw/".format(bucket_name, prefix)
    raw_data_path_param = ParameterString(name="raw_data_path",
                                          default_value=raw_data_path)

    # The output path to the training data.
    train_data_path = "s3://{0}/{1}/data/preprocessed/train/".format(
        bucket_name, prefix)
    train_data_path_param = ParameterString(name="train_data_path",
                                            default_value=train_data_path)

    # The output path to the validation data.
    val_data_path = "s3://{0}/{1}/data/preprocessed/val/".format(
        bucket_name, prefix)
    val_data_path_param = ParameterString(name="val_data_path",
                                          default_value=val_data_path)

    # The training output path for the model.
    output_path = "s3://{0}/{1}/output/".format(bucket_name, prefix)
    output_path_param = ParameterString(name="output_path",
                                        default_value=output_path)

    # The output path to the featurizer model.
    model_path = "s3://{0}/{1}/output/sklearn/".format(bucket_name, prefix)
    model_path_param = ParameterString(name="model_path",
                                       default_value=model_path)

    inputs = [
        ProcessingInput(
            input_name="raw_data",
            source=raw_data_path_param,
            destination="/opt/ml/processing/input",
        )
    ]

    outputs = [
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/train",
            destination=train_data_path_param,
        ),
        ProcessingOutput(
            output_name="val_data",
            source="/opt/ml/processing/val",
            destination=val_data_path_param,
        ),
        ProcessingOutput(
            output_name="model",
            source="/opt/ml/processing/model",
            destination=model_path_param,
        ),
    ]

    base_dir = os.path.join(DATA_DIR, "sip")
    code_path = os.path.join(base_dir, "preprocessor.py")

    processing_step = ProcessingStep(
        name="Processing",
        code=code_path,
        processor=sklearn_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=["--train-test-split-ratio", "0.2"],
    )

    entry_point = "training.py"
    source_dir = base_dir
    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)

    estimator = XGBoost(
        entry_point=entry_point,
        source_dir=source_dir,
        output_path=output_path_param,
        code_location=code_location,
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )

    training_step = TrainingStep(
        name="Training",
        estimator=estimator,
        inputs={
            "train":
            TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.
                Outputs["train_data"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.
                Outputs["val_data"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
    source_dir = os.path.join(base_dir, "sklearn_source_dir")

    sklearn_model = SKLearnModel(
        name="sklearn-model",
        model_data=processing_step.properties.ProcessingOutputConfig.
        Outputs["model"].S3Output.S3Uri,
        entry_point="inference.py",
        source_dir=source_dir,
        code_location=code_location,
        role=role,
        sagemaker_session=sagemaker_session,
        framework_version="0.20.0",
        py_version="py3",
    )

    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
    source_dir = os.path.join(base_dir, "xgboost_source_dir")

    xgboost_model = XGBoostModel(
        name="xgboost-model",
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        entry_point="inference.py",
        source_dir=source_dir,
        code_location=code_location,
        framework_version="0.90-2",
        py_version="py3",
        role=role,
        sagemaker_session=sagemaker_session,
    )

    pipeline_model = PipelineModel([xgboost_model, sklearn_model],
                                   role,
                                   sagemaker_session=sagemaker_session)

    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        model=pipeline_model,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="windturbine",
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            raw_data_path_param,
            train_data_path_param,
            val_data_path_param,
            model_path_param,
            instance_type,
            instance_count,
            output_path_param,
        ],
        steps=[processing_step, training_step, step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.upsert(role_arn=role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        execution = pipeline.start()
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
# configuration) during stack deletion if the `endpoint_name` is kept as
# is. You will need to manually delete the endpoint (and endpoint
# configuration) after stack deletion if you change this.

# %%
explainer_name = "{}-explainer".format(config.SOLUTION_PREFIX)

# %% [markdown]
# We define the model to deploy, which includes the explainer logic.

# %%
model = SKLearnModel(
    name=explainer_name,
    model_data=model_data,
    role=config.IAM_ROLE,
    entry_point='entry_point.py',
    source_dir=str(Path(current_folder, '../containers/model/src').resolve()),
    dependencies=[str(Path(current_folder, '../package/package').resolve())],
    image=config.ECR_IMAGE,
    code_location='s3://' +
    str(Path(config.S3_BUCKET, config.OUTPUTS_S3_PREFIX)))

# %% [markdown]
# Calling `deploy` will start a container to host the model.
# You can expect this step to take approximately 5 minutes.

# %%
model.deploy(endpoint_name=explainer_name,
             instance_type='ml.c5.xlarge',
             initial_instance_count=1,
             tags=[{
                 'Key': config.TAG_KEY,
Example #15
def get_pipeline(
        region,
        sagemaker_project_arn=None,
        role=None,
        default_bucket='',
        pipeline_name='end-to-end-ml-sagemaker-pipeline',
        model_package_group_name='end-to-end-ml-sm-model-package-group',
        base_job_prefix='endtoendmlsm') -> Pipeline:
    """
    Gets the SM Pipeline.

    :param role: The execution role.
    :param bucket_name: The bucket where pipeline artifacts are stored.
    :param prefix: The prefix where pipeline artifacts are stored.
    :return: A Pipeline instance.
    """

    bucket_name = default_bucket
    prefix = 'endtoendmlsm'
    sagemaker_session = get_session(region, bucket_name)

    # ---------------------
    # Processing parameters
    # ---------------------
    # The path to the raw data.
    raw_data_path = 's3://gianpo-public/endtoendml/data/raw/predmain_raw_data_header.csv'
    raw_data_path_param = ParameterString(name="raw_data_path",
                                          default_value=raw_data_path)
    # The output path to the training data.
    train_data_path = 's3://{0}/{1}/data/preprocessed/train/'.format(
        bucket_name, prefix)
    train_data_path_param = ParameterString(name="train_data_path",
                                            default_value=train_data_path)
    # The output path to the validation data.
    val_data_path = 's3://{0}/{1}/data/preprocessed/val/'.format(
        bucket_name, prefix)
    val_data_path_param = ParameterString(name="val_data_path",
                                          default_value=val_data_path)
    # The output path to the featurizer model.
    model_path = 's3://{0}/{1}/output/sklearn/'.format(bucket_name, prefix)
    model_path_param = ParameterString(name="model_path",
                                       default_value=model_path)
    # The instance type for the processing job.
    processing_instance_type_param = ParameterString(
        name="processing_instance_type", default_value='ml.m5.large')
    # The instance count for the processing job.
    processing_instance_count_param = ParameterInteger(
        name="processing_instance_count", default_value=1)
    # The train/test split ratio parameter.
    train_test_split_ratio_param = ParameterString(
        name="train_test_split_ratio", default_value='0.2')
    # -------------------
    # Training parameters
    # -------------------
    # XGB hyperparameters.
    max_depth_param = ParameterString(name="max_depth", default_value='3')
    eta_param = ParameterString(name="eta", default_value='0.1')
    gamma_param = ParameterString(name="gamma", default_value='0')
    min_child_weight_param = ParameterString(name="min_child_weight",
                                             default_value='1')
    objective_param = ParameterString(name="objective",
                                      default_value='binary:logistic')
    num_round_param = ParameterString(name="num_round", default_value='10')
    eval_metric_param = ParameterString(name="eval_metric",
                                        default_value='auc')
    # The instance type for the training job.
    training_instance_type_param = ParameterString(
        name="training_instance_type", default_value='ml.m5.xlarge')
    # The instance count for the training job.
    training_instance_count_param = ParameterInteger(
        name="training_instance_count", default_value=1)
    # The training output path for the model.
    output_path = 's3://{0}/{1}/output/'.format(bucket_name, prefix)
    output_path_param = ParameterString(name="output_path",
                                        default_value=output_path)
    # --------------------------
    # Register model parameters
    # --------------------------
    # The default instance type for deployment.
    deploy_instance_type_param = ParameterString(name="deploy_instance_type",
                                                 default_value='ml.m5.2xlarge')
    # The approval status for models added to the registry.
    model_approval_status_param = ParameterString(
        name="model_approval_status", default_value='PendingManualApproval')
    # --------------------------
    # Processing Step
    # --------------------------
    sklearn_processor = SKLearnProcessor(
        role=role,
        instance_type=processing_instance_type_param,
        instance_count=processing_instance_count_param,
        framework_version='0.20.0')
    inputs = [
        ProcessingInput(input_name='raw_data',
                        source=raw_data_path_param,
                        destination='/opt/ml/processing/input')
    ]
    outputs = [
        ProcessingOutput(output_name='train_data',
                         source='/opt/ml/processing/train',
                         destination=train_data_path_param),
        ProcessingOutput(output_name='val_data',
                         source='/opt/ml/processing/val',
                         destination=val_data_path_param),
        ProcessingOutput(output_name='model',
                         source='/opt/ml/processing/model',
                         destination=model_path_param)
    ]
    code_path = os.path.join(BASE_DIR, 'dataprep/preprocess.py')
    processing_step = ProcessingStep(name='Processing',
                                     code=code_path,
                                     processor=sklearn_processor,
                                     inputs=inputs,
                                     outputs=outputs,
                                     job_arguments=[
                                         '--train-test-split-ratio',
                                         train_test_split_ratio_param
                                     ])
    # --------------------------
    # Training Step
    # --------------------------
    hyperparameters = {
        "max_depth": max_depth_param,
        "eta": eta_param,
        "gamma": gamma_param,
        "min_child_weight": min_child_weight_param,
        "silent": 0,
        "objective": objective_param,
        "num_round": num_round_param,
        "eval_metric": eval_metric_param
    }
    entry_point = 'train.py'
    source_dir = os.path.join(BASE_DIR, 'train/')
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    estimator = XGBoost(entry_point=entry_point,
                        source_dir=source_dir,
                        output_path=output_path_param,
                        code_location=code_location,
                        hyperparameters=hyperparameters,
                        instance_type=training_instance_type_param,
                        instance_count=training_instance_count_param,
                        framework_version="0.90-2",
                        py_version="py3",
                        role=role)
    training_step = TrainingStep(
        name='Training',
        estimator=estimator,
        inputs={
            'train':
            TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.
                Outputs['train_data'].S3Output.S3Uri,
                content_type='text/csv'),
            'validation':
            TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.
                Outputs['val_data'].S3Output.S3Uri,
                content_type='text/csv')
        })
    # --------------------------
    # Register Model Step
    # --------------------------
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    sklearn_model = SKLearnModel(
        name='end-to-end-ml-sm-skl-model-{0}'.format(str(int(time.time()))),
        model_data=processing_step.properties.ProcessingOutputConfig.
        Outputs['model'].S3Output.S3Uri,
        entry_point='inference.py',
        source_dir=os.path.join(BASE_DIR, 'deploy/sklearn/'),
        code_location=code_location,
        role=role,
        sagemaker_session=sagemaker_session,
        framework_version='0.20.0',
        py_version='py3')
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    xgboost_model = XGBoostModel(
        name='end-to-end-ml-sm-xgb-model-{0}'.format(str(int(time.time()))),
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        entry_point='inference.py',
        source_dir=os.path.join(BASE_DIR, 'deploy/xgboost/'),
        code_location=code_location,
        framework_version='0.90-2',
        py_version='py3',
        role=role,
        sagemaker_session=sagemaker_session)
    pipeline_model_name = 'end-to-end-ml-sm-xgb-skl-pipeline-{0}'.format(
        str(int(time.time())))
    pipeline_model = PipelineModel(name=pipeline_model_name,
                                   role=role,
                                   models=[sklearn_model, xgboost_model],
                                   sagemaker_session=sagemaker_session)

    register_model_step = RegisterModel(
        name='RegisterModel',
        content_types=['text/csv'],
        response_types=['application/json', 'text/csv'],
        inference_instances=[deploy_instance_type_param, 'ml.m5.large'],
        transform_instances=['ml.c5.4xlarge'],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status_param,
        model=pipeline_model)
    # --------------------------
    # Pipeline
    # --------------------------

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            raw_data_path_param, train_data_path_param, val_data_path_param,
            model_path_param, processing_instance_type_param,
            processing_instance_count_param, train_test_split_ratio_param,
            max_depth_param, eta_param, gamma_param, min_child_weight_param,
            objective_param, num_round_param, eval_metric_param,
            training_instance_type_param, training_instance_count_param,
            output_path_param, deploy_instance_type_param,
            model_approval_status_param
        ],
        steps=[processing_step, training_step, register_model_step],
        sagemaker_session=sagemaker_session,
    )
    response = pipeline.upsert(role_arn=role)
    print(response["PipelineArn"])
    return pipeline
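
A hedged usage sketch for get_pipeline, assuming placeholder values for the region, execution role ARN, and bucket, and that the caller starts an execution and waits for it to complete:

if __name__ == '__main__':
    pipeline = get_pipeline(
        region='eu-west-1',                                    # placeholder region
        role='arn:aws:iam::123456789012:role/SageMakerRole',   # placeholder role ARN
        default_bucket='my-sagemaker-bucket',                  # placeholder bucket
    )
    execution = pipeline.start()
    execution.wait()
    print(execution.describe())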