def pipeline_name():
    return utils.unique_name_from_base("my-pipeline-model-regis")
Example 2
def test_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

    predictor.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
    # the assertion must sit outside the raises block, otherwise it never runs
    assert "Could not find model" in str(exception.value)
def test_disabling_data_capture_on_endpoint_shows_correct_data_capture_status(
        sagemaker_session, tensorflow_inference_latest_version):
    endpoint_name = unique_name_from_base("sagemaker-tensorflow-serving")
    model_data = sagemaker_session.upload_data(
        path=os.path.join(tests.integ.DATA_DIR,
                          "tensorflow-serving-test-model.tar.gz"),
        key_prefix="tensorflow-serving/models",
    )
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(
            endpoint_name, sagemaker_session):
        model = TensorFlowModel(
            model_data=model_data,
            role=ROLE,
            framework_version=tensorflow_inference_latest_version,
            sagemaker_session=sagemaker_session,
        )
        destination_s3_uri = os.path.join("s3://",
                                          sagemaker_session.default_bucket(),
                                          endpoint_name, "custom")
        predictor = model.deploy(
            initial_instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            endpoint_name=endpoint_name,
            data_capture_config=DataCaptureConfig(
                enable_capture=True,
                sampling_percentage=CUSTOM_SAMPLING_PERCENTAGE,
                destination_s3_uri=destination_s3_uri,
                capture_options=CUSTOM_CAPTURE_OPTIONS,
                csv_content_types=CUSTOM_CSV_CONTENT_TYPES,
                json_content_types=CUSTOM_JSON_CONTENT_TYPES,
                sagemaker_session=sagemaker_session,
            ),
        )

        endpoint_desc = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=predictor.endpoint_name)

        endpoint_config_desc = sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=endpoint_desc["EndpointConfigName"])

        assert endpoint_config_desc["DataCaptureConfig"]["EnableCapture"]
        assert (endpoint_config_desc["DataCaptureConfig"]
                ["InitialSamplingPercentage"] == CUSTOM_SAMPLING_PERCENTAGE)
        assert endpoint_config_desc["DataCaptureConfig"]["CaptureOptions"] == [
            {
                "CaptureMode": "Input"
            }
        ]
        assert (endpoint_config_desc["DataCaptureConfig"]
                ["CaptureContentTypeHeader"]["CsvContentTypes"] ==
                CUSTOM_CSV_CONTENT_TYPES)
        assert (endpoint_config_desc["DataCaptureConfig"]
                ["CaptureContentTypeHeader"]["JsonContentTypes"] ==
                CUSTOM_JSON_CONTENT_TYPES)

        predictor.disable_data_capture()

        # Wait for endpoint to finish updating
        # Endpoint update takes ~7min. 25 retries * 60s sleeps = 25min timeout
        for _ in retries(
                max_retry_count=25,
                exception_message_prefix="Waiting for 'InService' endpoint status",
                seconds_to_sleep=60,
        ):
            new_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
                EndpointName=predictor.endpoint_name)
            if new_endpoint["EndpointStatus"] == "InService":
                break

        endpoint_desc = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=predictor.endpoint_name)

        endpoint_config_desc = sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=endpoint_desc["EndpointConfigName"])

        assert not endpoint_config_desc["DataCaptureConfig"]["EnableCapture"]
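A note on the polling loop above: the retries() helper comes from the integration-test utilities and is not shown in this excerpt. A minimal sketch of such a generator, assuming it simply yields once per attempt and sleeps in between (the real helper may differ), could look like this:

import time

def retries(max_retry_count, exception_message_prefix, seconds_to_sleep=30):
    # Yield once per attempt, sleeping between attempts; fail loudly when exhausted.
    for attempt in range(max_retry_count):
        yield attempt
        time.sleep(seconds_to_sleep)
    raise RuntimeError(
        "Exhausted {} retries: {}".format(max_retry_count, exception_message_prefix))

With max_retry_count=25 and seconds_to_sleep=60, this matches the ~25 minute budget mentioned in the comment above.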
Example 4
def test_attach_tuning_pytorch(
    sagemaker_session,
    cpu_instance_type,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    mnist_script = os.path.join(mnist_dir, "mnist.py")

    estimator = PyTorch(
        entry_point=mnist_script,
        role="SageMakerRole",
        instance_count=1,
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = "evaluation-accuracy"
        metric_definitions = [{
            "Name": "evaluation-accuracy",
            "Regex": r"Overall test accuracy: (\d+)"
        }]
        hyperparameter_ranges = {"batch-size": IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "training"),
            key_prefix="integ-test-data/pytorch_mnist/training",
        )

        tuning_job_name = unique_name_from_base("pytorch", max_length=32)
        print("Started hyperparameter tuning job with name: {}".format(
            tuning_job_name))
        tuner.fit({"training": training_data}, job_name=tuning_job_name)

    endpoint_name = tuning_job_name
    model_name = "model-name-1"
    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    assert attached_tuner.early_stopping_type == "Auto"

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = attached_tuner.deploy(1,
                                          cpu_instance_type,
                                          endpoint_name=endpoint_name,
                                          model_name=model_name)
    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = attached_tuner.deploy(1, cpu_instance_type)
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
        _assert_model_name_match(sagemaker_session.sagemaker_client,
                                 endpoint_name, model_name)
Example 5
def test_processing_step_with_clarify_processor(pipeline_session):
    def headers():
        return [
            "Label",
            "F1",
            "F2",
            "F3",
            "F4",
        ]

    def data_bias_config():
        return BiasConfig(
            label_values_or_threshold=[1],
            facet_name="F1",
            facet_values_or_threshold=[0.5],
            group_name="F2",
        )

    def model_config(model_name):
        return ModelConfig(
            model_name=model_name,
            instance_type="ml.c5.xlarge",
            instance_count=1,
            accept_type="application/jsonlines",
            endpoint_name_prefix="myprefix",
        )

    def shap_config():
        return SHAPConfig(
            baseline=[
                [
                    0.94672389,
                    0.47108862,
                    0.63350081,
                    0.00604642,
                ]
            ],
            num_samples=2,
            agg_method="mean_sq",
            seed=123,
        )

    def verify(step_args):
        step = ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
        )
        pipeline = Pipeline(
            name="MyPipeline",
            steps=[step],
            sagemaker_session=pipeline_session,
        )
        assert json.loads(pipeline.definition())["Steps"][0] == {
            "Name": "MyProcessingStep",
            "Type": "Processing",
            "Arguments": step_args,
        }

    test_run = utils.unique_name_from_base("test_run")
    output_path = "s3://{}/{}/{}".format(
        pipeline_session.default_bucket(), "linear_learner_analysis_result", test_run
    )
    data_config = DataConfig(
        s3_data_input_path=f"s3://{pipeline_session.default_bucket()}/{input}/train.csv",
        s3_output_path=output_path,
        label="Label",
        headers=headers(),
        dataset_type="text/csv",
    )

    clarify_processor = SageMakerClarifyProcessor(
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        sagemaker_session=pipeline_session,
        role=sagemaker.get_execution_role(),
    )

    run_bias_args = clarify_processor.run_bias(
        data_config=data_config,
        bias_config=data_bias_config(),
        model_config=model_config("1st-model-rpyndy0uyo"),
    )
    verify(run_bias_args)

    run_pre_training_bias_args = clarify_processor.run_pre_training_bias(
        data_config=data_config,
        data_bias_config=data_bias_config(),
    )
    verify(run_pre_training_bias_args)

    run_post_training_bias_args = clarify_processor.run_post_training_bias(
        data_config=data_config,
        data_bias_config=data_bias_config(),
        model_config=model_config("1st-model-rpyndy0uyo"),
        model_predicted_label_config=ModelPredictedLabelConfig(probability_threshold=0.9),
    )
    verify(run_post_training_bias_args)

    run_explainability_args = clarify_processor.run_explainability(
        data_config=data_config,
        model_config=model_config("1st-model-rpyndy0uyo"),
        explainability_config=shap_config(),
    )
    verify(run_explainability_args)
Example 6
def test_mnist_with_checkpoint_config(
    sagemaker_session,
    instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(
        sagemaker_session.default_bucket(), sagemaker_timestamp())
    checkpoint_local_path = "/test/checkpoint/path"
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        metric_definitions=[{
            "Name": "train:global_steps",
            "Regex": r"global_step\/sec:\s(.*)"
        }],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
        environment=ENV_INPUT,
        max_wait=24 * 60 * 60,
        max_retry_attempts=2,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"),
        key_prefix="scriptmode/mnist")

    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(
            minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=training_job_name)
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
    # remove dataframe assertion to unblock PR build
    # TODO: add independent integration test for `training_job_analytics`

    expected_training_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    actual_training_checkpoint_config = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name)["CheckpointConfig"]
    actual_training_environment_variable_config = (
        sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=training_job_name)["Environment"])

    expected_retry_strategy = {
        "MaximumRetryAttempts": 2,
    }
    actual_retry_strategy = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name)["RetryStrategy"]
    assert actual_training_checkpoint_config == expected_training_checkpoint_config
    assert actual_training_environment_variable_config == ENV_INPUT
    assert actual_retry_strategy == expected_retry_strategy
Example 7
def test_tuning_lda(sagemaker_session, cpu_instance_type):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "lda")
        data_filename = "nips-train_1.pbr"

        with open(os.path.join(data_path, data_filename), "rb") as f:
            all_records = read_records(f)

        # all records must have the same feature dimension
        feature_num = int(
            all_records[0].features["values"].float32_tensor.shape[0])

        lda = LDA(
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num,
            sagemaker_session)
        test_record_set.channel = "test"

        # specify which hp you want to optimize over
        hyperparameter_ranges = {
            "alpha0": ContinuousParameter(1, 10),
            "num_topics": IntegerParameter(1, 2),
        }
        objective_metric_name = "test:pwll"

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Maximize",
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        tuning_job_name = unique_name_from_base("test-lda", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit([record_set, test_record_set],
                  mini_batch_size=1,
                  job_name=tuning_job_name)

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session)
    assert attached_tuner.early_stopping_type == "Auto"
    assert attached_tuner.estimator.alpha0 == 1.0
    assert attached_tuner.estimator.num_topics == 1

    best_training_job = attached_tuner.best_training_job()

    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None
def multi_variant_endpoint(sagemaker_session):
    """
    Sets up the multi-variant endpoint before the integration tests run.
    Cleans up the multi-variant endpoint after the integration tests run.
    """
    multi_variant_endpoint.endpoint_name = unique_name_from_base(
        "integ-test-multi-variant-endpoint")
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(
            endpoint_name=multi_variant_endpoint.endpoint_name,
            sagemaker_session=sagemaker_session,
            hours=2,
    ):

        # Creating a model
        bucket = sagemaker_session.default_bucket()
        prefix = "sagemaker/DEMO-VariantTargeting"
        model_url = S3Uploader.upload(
            local_path=XG_BOOST_MODEL_LOCAL_PATH,
            desired_s3_uri="s3://" + bucket + "/" + prefix,
            session=sagemaker_session,
        )

        image_uri = get_image_uri(sagemaker_session.boto_session.region_name,
                                  "xgboost", "0.90-1")

        multi_variant_endpoint_model = sagemaker_session.create_model(
            name=MODEL_NAME,
            role=ROLE,
            container_defs={
                "Image": image_uri,
                "ModelDataUrl": model_url
            },
        )

        # Creating a multi variant endpoint
        variant1 = production_variant(
            model_name=MODEL_NAME,
            instance_type=DEFAULT_INSTANCE_TYPE,
            initial_instance_count=DEFAULT_INSTANCE_COUNT,
            variant_name=TEST_VARIANT_1,
            initial_weight=TEST_VARIANT_1_WEIGHT,
        )
        variant2 = production_variant(
            model_name=MODEL_NAME,
            instance_type=DEFAULT_INSTANCE_TYPE,
            initial_instance_count=DEFAULT_INSTANCE_COUNT,
            variant_name=TEST_VARIANT_2,
            initial_weight=TEST_VARIANT_2_WEIGHT,
        )
        sagemaker_session.endpoint_from_production_variants(
            name=multi_variant_endpoint.endpoint_name,
            production_variants=[variant1, variant2])

        # Yield to run the integration tests
        yield multi_variant_endpoint

        # Cleanup resources
        sagemaker_session.delete_model(multi_variant_endpoint_model)
        sagemaker_session.sagemaker_client.delete_endpoint_config(
            EndpointConfigName=multi_variant_endpoint.endpoint_name)

    # Validate resource cleanup
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(
            ModelName=multi_variant_endpoint_model)
    assert "Could not find model" in str(exception.value)

    # each describe call needs its own raises block; statements after a raising
    # call inside a single block never execute
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=multi_variant_endpoint.endpoint_name)
    assert "Could not find endpoint" in str(exception.value)
def test_async_linear_learner(sagemaker_session):
    job_name = unique_name_from_base('linear-learner')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

        ll = LinearLearner('SageMakerRole',
                           1,
                           'ml.c4.2xlarge',
                           predictor_type='binary_classifier',
                           sagemaker_session=sagemaker_session)
        ll.binary_classifier_model_selection_criteria = 'accuracy'
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = 'uniform'
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = 'adam'
        ll.loss = 'logistic'
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]),
               wait=False,
               job_name=job_name)

        print("Waiting to re-attach to the training job: %s" % job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = LinearLearner.attach(training_job_name=job_name,
                                         sagemaker_session=sagemaker_session)
        model = LinearLearnerModel(estimator.model_data,
                                   role='SageMakerRole',
                                   sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
def test_async_linear_learner(sagemaker_session, cpu_instance_type,
                              training_set):
    job_name = unique_name_from_base("linear-learner")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        training_set[1][:100] = 1
        training_set[1][100:200] = 0
        training_set = training_set[0], training_set[1].astype(
            np.dtype("float32"))

        ll = LinearLearner(
            "SageMakerRole",
            1,
            cpu_instance_type,
            predictor_type="binary_classifier",
            sagemaker_session=sagemaker_session,
        )
        ll.binary_classifier_model_selection_criteria = "accuracy"
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = "uniform"
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = "adam"
        ll.loss = "logistic"
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(
            ll.record_set(training_set[0][:200], training_set[1][:200]),
            wait=False,
            job_name=job_name,
        )

        print("Waiting to re-attach to the training job: %s" % job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = LinearLearner.attach(training_job_name=job_name,
                                         sagemaker_session=sagemaker_session)
        model = LinearLearnerModel(estimator.model_data,
                                   role="SageMakerRole",
                                   sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        result = predictor.predict(training_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
Example 11
def test_training(sagemaker_session, ecr_image, instance_type,
                  framework_version):

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "tf-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "tf-container-integ-test-{}".format(int(time.time()))

    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    training_job_name = utils.unique_name_from_base(
        "test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=DEFAULT_TIMEOUT):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..",
                                     "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            entry_point=script,
            role="SageMakerRole",
            train_instance_type=instance_type,
            train_instance_count=1,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            framework_version=framework_version,
            script_mode=True,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist")
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
def test_choice_state_machine_creation(sfn_client, sfn_role_arn):
    choice_state_name = "ChoiceState"
    first_match_name = "FirstMatchState"
    second_match_name = "SecondMatchState"
    default_state_name = "DefaultState"
    variable = "$.choice"
    first_choice_value = 1
    second_choice_value = 2
    default_error = "DefaultStateError"
    default_cause = "No Matches"
    first_choice_state_result = "First Choice State"
    second_choice_state_result = "Second Choice State"
    state_machine_input = {"choice": first_choice_value}

    asl_state_machine_definition = {
        "StartAt": choice_state_name,
        "States": {
            choice_state_name: {
                "Type":
                "Choice",
                "Choices": [{
                    "Variable": variable,
                    "NumericEquals": first_choice_value,
                    "Next": first_match_name
                }, {
                    "Variable": variable,
                    "NumericEquals": second_choice_value,
                    "Next": second_match_name
                }],
                "Default":
                default_state_name
            },
            default_state_name: {
                "Error": default_error,
                "Cause": default_cause,
                "Type": "Fail"
            },
            first_match_name: {
                "Type": "Pass",
                "Result": first_choice_state_result,
                "End": True
            },
            second_match_name: {
                "Type": "Pass",
                "Result": second_choice_state_result,
                "End": True
            }
        }
    }

    definition = steps.Choice(choice_state_name)

    definition.default_choice(
        steps.Fail(default_state_name,
                   error=default_error,
                   cause=default_cause))
    definition.add_choice(
        steps.ChoiceRule.NumericEquals(variable=variable,
                                       value=first_choice_value),
        steps.Pass(first_match_name, result=first_choice_state_result))
    definition.add_choice(
        steps.ChoiceRule.NumericEquals(variable=variable,
                                       value=second_choice_value),
        steps.Pass(second_match_name, result=second_choice_state_result))

    workflow = Workflow(unique_name_from_base('Test_Choice_Workflow'),
                        definition=definition,
                        role=sfn_role_arn)

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        first_choice_state_result, state_machine_input)
def test_wait_state_machine_creation(sfn_client, sfn_role_arn):
    first_state_name = "FirstState"
    first_wait_state_name = "WaitInSeconds"
    second_wait_state_name = "WaitTimestamp"
    third_wait_state_name = "WaitTimestampPath"
    fourth_wait_state_name = "WaitInSecondsPath"
    final_state_name = "FinalState"
    timestamp = "2019-09-04T01:59:00Z"
    timestamp_path = "$.expirydate"
    seconds = 2
    seconds_path = "$.expiryseconds"
    wait_state_result = "Wait Result"
    parameters = {'expirydate': timestamp, 'expiryseconds': seconds}

    asl_state_machine_definition = {
        "StartAt": first_state_name,
        "States": {
            first_state_name: {
                "Type": "Pass",
                "Next": first_wait_state_name,
                "Parameters": parameters
            },
            first_wait_state_name: {
                "Seconds": seconds,
                "Type": "Wait",
                "Next": second_wait_state_name
            },
            second_wait_state_name: {
                "Timestamp": timestamp,
                "Type": "Wait",
                "Next": third_wait_state_name
            },
            third_wait_state_name: {
                "TimestampPath": timestamp_path,
                "Type": "Wait",
                "Next": fourth_wait_state_name
            },
            fourth_wait_state_name: {
                "SecondsPath": seconds_path,
                "Type": "Wait",
                "Next": final_state_name
            },
            final_state_name: {
                "Type": "Pass",
                "Result": wait_state_result,
                "End": True
            }
        }
    }

    definition = steps.Chain([
        steps.Pass(first_state_name, parameters=parameters),
        steps.Wait(first_wait_state_name, seconds=seconds),
        steps.Wait(second_wait_state_name, timestamp=timestamp),
        steps.Wait(third_wait_state_name, timestamp_path=timestamp_path),
        steps.Wait(fourth_wait_state_name, seconds_path=seconds_path),
        steps.Pass(final_state_name, result=wait_state_result)
    ])

    workflow = Workflow(unique_name_from_base('Test_Wait_Workflow'),
                        definition=definition,
                        role=sfn_role_arn)

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition,
                        wait_state_result)
import os

import pytest
import scipy.stats as st

from sagemaker import image_uris
from sagemaker.deserializers import CSVDeserializer
from sagemaker.s3 import S3Uploader
from sagemaker.session import production_variant
from sagemaker.sparkml import SparkMLModel
from sagemaker.utils import unique_name_from_base
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
import tests.integ

ROLE = "SageMakerRole"
MODEL_NAME = unique_name_from_base("test-xgboost-model")
DEFAULT_REGION = "us-west-2"
DEFAULT_INSTANCE_TYPE = "ml.m5.xlarge"
DEFAULT_INSTANCE_COUNT = 1
XG_BOOST_MODEL_LOCAL_PATH = os.path.join(tests.integ.DATA_DIR, "xgboost_model",
                                         "xgb_model.tar.gz")

TEST_VARIANT_1 = "Variant1"
TEST_VARIANT_1_WEIGHT = 0.3

TEST_VARIANT_2 = "Variant2"
TEST_VARIANT_2_WEIGHT = 0.7

VARIANT_TRAFFIC_SAMPLING_COUNT = 100
DESIRED_CONFIDENCE_FOR_VARIANT_TRAFFIC_DISTRIBUTION = 0.999
def test_run_bias_monitor_baseline(
    sagemaker_session,
    data_config,
    model_config,
    bias_config,
    model_predicted_label_config,
    endpoint_name,
    ground_truth_input,
    upload_actual_data,
):
    monitor = ModelBiasMonitor(
        role=ROLE,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        volume_size_in_gb=VOLUME_SIZE_IN_GB,
        max_runtime_in_seconds=MAX_RUNTIME_IN_SECONDS,
        sagemaker_session=sagemaker_session,
        tags=TEST_TAGS,
    )

    baselining_job_name = utils.unique_name_from_base("bias-baselining-job")
    print("Creating baselining job: {}".format(baselining_job_name))
    monitor.suggest_baseline(
        data_config=data_config,
        bias_config=bias_config,
        model_config=model_config,
        model_predicted_label_config=model_predicted_label_config,
        job_name=baselining_job_name,
    )
    assert (
        monitor.latest_baselining_job_config.probability_threshold_attribute
        == BIAS_PROBABILITY_THRESHOLD
    )
    monitoring_schedule_name = utils.unique_name_from_base(
        "bias-suggest-baseline")
    s3_uri_monitoring_output = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        endpoint_name,
        monitoring_schedule_name,
        "monitor_output",
    )
    # Let's test if the schedule can pick up analysis_config from baselining job
    monitor.create_monitoring_schedule(
        output_s3_uri=s3_uri_monitoring_output,
        monitor_schedule_name=monitoring_schedule_name,
        endpoint_input=EndpointInput(
            endpoint_name=endpoint_name,
            destination=ENDPOINT_INPUT_LOCAL_PATH,
            start_time_offset=START_TIME_OFFSET,
            end_time_offset=END_TIME_OFFSET,
        ),
        ground_truth_input=ground_truth_input,
        schedule_cron_expression=CRON,
    )
    _verify_execution_status(monitor)

    _verify_bias_job_description(
        sagemaker_session=sagemaker_session,
        monitor=monitor,
        endpoint_name=endpoint_name,
        ground_truth_input=ground_truth_input,
    )

    monitor.delete_monitoring_schedule()
def test_run_model_quality_monitor_baseline(
    sagemaker_session,
    endpoint_name,
    data_path,
    ground_truth_input,
    upload_actual_data,
):
    monitor = ModelQualityMonitor(
        role=ROLE,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        volume_size_in_gb=VOLUME_SIZE_IN_GB,
        max_runtime_in_seconds=MAX_RUNTIME_IN_SECONDS,
        sagemaker_session=sagemaker_session,
        env=TEST_ENV,
        tags=TEST_TAGS,
    )

    baselining_job_name = utils.unique_name_from_base("model-quality-baselining-job")
    print("Creating baselining job: {}".format(baselining_job_name))
    monitor.suggest_baseline(
        baseline_dataset=data_path,
        dataset_format=DatasetFormat.csv(),
        problem_type=PROBLEM_TYPE,
        job_name=baselining_job_name,
        ground_truth_attribute=HEADER_OF_LABEL,
        inference_attribute=HEADER_OF_PREDICTED_LABEL,
    )

    monitoring_schedule_name = utils.unique_name_from_base("model-quality-suggest-baseline")
    s3_uri_monitoring_output = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        endpoint_name,
        monitoring_schedule_name,
        "monitor_output",
    )
    monitor.create_monitoring_schedule(
        endpoint_input=EndpointInput(
            endpoint_name=endpoint_name,
            destination=ENDPOINT_INPUT_LOCAL_PATH,
            start_time_offset=START_TIME_OFFSET,
            end_time_offset=END_TIME_OFFSET,
            inference_attribute=INFERENCE_ATTRIBUTE,
        ),
        ground_truth_input=ground_truth_input,
        problem_type=PROBLEM_TYPE,
        output_s3_uri=s3_uri_monitoring_output,
        monitor_schedule_name=monitoring_schedule_name,
        schedule_cron_expression=CRON,
    )
    _verify_execution_status(monitor)

    _verify_model_quality_job_description(
        sagemaker_session=sagemaker_session,
        monitor=monitor,
        endpoint_name=endpoint_name,
        ground_truth_input=ground_truth_input,
    )

    monitor.delete_monitoring_schedule()
def test_linear_learner(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("linear-learner")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))

        ll = LinearLearner(
            "SageMakerRole",
            1,
            cpu_instance_type,
            predictor_type="binary_classifier",
            sagemaker_session=sagemaker_session,
        )
        ll.binary_classifier_model_selection_criteria = "accuracy"
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = "uniform"
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = "adam"
        ll.loss = "logistic"
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]),
               job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        predictor = ll.deploy(1, cpu_instance_type, endpoint_name=job_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
Example 18
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "mxnet-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-mxnet-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "mxnet-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    hyperparameters = {
        "random_seed": True,
        "num_steps": 50,
        "smdebug_path": "/opt/ml/output/tensors",
        "epochs": 1,
    }

    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role="SageMakerRole",
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters=hyperparameters,
    )

    training_job_name = utils.unique_name_from_base("test-mxnet-image")

    # create a training job; fit() is called with wait=False, so it does not block on completion
    with timeout(minutes=15):
        prefix = "mxnet_mnist_gluon_basic_hook_demo/{}".format(
            utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "train"),
            key_prefix=prefix + "/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "test"), key_prefix=prefix + "/test")

        mx.fit({
            "train": train_input,
            "test": test_input
        },
               job_name=training_job_name,
               wait=False)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_component_summary = None
    attempts = 0
    while True:
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))

        if len(trial_components) > 0:
            trial_component_summary = trial_components[0]
            break

        if attempts < 10:
            attempts += 1
            sleep(500)
        else:
            # give up once the retry budget is exhausted instead of looping forever
            break

    assert trial_component_summary is not None

    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
def test_smmodelparallel_mnist_multigpu_singlenode(ecr_image, instance_type,
                                                   sagemaker_regions,
                                                   test_script, num_processes):
    """
    Tests pt gpt2 command via script mode
    """
    framework, framework_version = get_framework_and_version_from_tag(
        ecr_image)
    if framework == "pytorch" and Version(framework_version) in SpecifierSet(
            "==1.9.*"):
        pytest.skip("Skipping the test for PT1.9")
    instance_type = "ml.p3.16xlarge"
    hyperparameters = {
        'training_dir': '/opt/ml/input/data/train',
        'max_steps': 100,
        'seed': 12345,
        'fp16': 1,
        'lr': 2.e-4,
        'lr_decay_iters': 125000,
        'min_lr': 0.00001,
        'lr-decay-style': 'linear',
        'warmup': 0.01,
        'logging_freq': 1,
        'max_context_width': 1024,
        'hidden_width': 768,
        'num_layers': 12,
        'num_heads': 12,
        'n_gpus': 8,
        'train_batch_size': 32,
        'microbatches': 1,
        'tensor_parallel_degree': 4,
        'pipeline_parallel_degree': 2,
        'activation_checkpointing': 1,
        'activation_strategy': "group_2",
        'manual_partition': 1
    }
    train = sagemaker.session.s3_input(
        "s3://gpt2-data/train_synthetic_small/",
        distribution="FullyReplicated",
        content_type="application/tfrecord",
        s3_data_type="S3Prefix",
    )
    inputs = {"train": train, "test": train}
    validate_or_skip_smmodelparallel(ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator_parameter = {
            'entry_point': test_script,
            'role': 'SageMakerRole',
            'source_dir': gpt2_path,
            'instance_count': 1,
            'instance_type': instance_type,
            'hyperparameters': hyperparameters,
            'distribution': {
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "tensor_parallel_degree": 4,
                            "microbatches": 1,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled":
                    True,
                    "processes_per_host":
                    num_processes,
                    "custom_mpi_options":
                    "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
            },
        }
        job_name = utils.unique_name_from_base('test-pt-smdmp-gpt2-singlenode')
        invoke_pytorch_estimator(ecr_image,
                                 sagemaker_regions,
                                 estimator_parameter,
                                 inputs=inputs,
                                 job_name=job_name)
Example 20
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version, py_version):
    if py_version is None or '2' in py_version:
        pytest.skip('Skipping python2 {}'.format(py_version))
        return

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    training_job_name = utils.unique_name_from_base(
        "test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=15):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..",
                                     "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            entry_point=script,
            role="SageMakerRole",
            instance_type=instance_type,
            instance_count=1,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            script_mode=True,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist")
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
Example 21
def test_tuning_chainer(sagemaker_session, chainer_latest_version,
                        chainer_latest_py_version, cpu_instance_type):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        estimator = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        hyperparameter_ranges = {"alpha": ContinuousParameter(0.001, 0.005)}

        objective_metric_name = "Validation-accuracy"
        metric_definitions = [{
            "Name":
            "Validation-accuracy",
            "Regex":
            r"\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)",
        }]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("chainer", max_length=32)
        print("Started hyperparameter tuning job with name: {}".format(
            tuning_job_name))
        tuner.fit({
            "train": train_input,
            "test": test_input
        },
                  job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)

        batch_size = 100
        data = np.zeros((batch_size, 784), dtype="float32")
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype="float32")
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype="float32")
        output = predictor.predict(data)
        assert len(output) == batch_size
Example 22
def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_name = get_image_uri(sagemaker_session.boto_session.region_name,
                               "factorization-machines")
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        hyperparameter_ranges = {"mini_batch_size": IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name="test:binary_classification_accuracy",
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuner.fit(
            {
                "train": s3_train_data,
                "test": s3_train_data
            },
            include_cls_metadata=False,
            job_name=unique_name_from_base("byo", 32),
        )

        print("Started hyperparameter tuning job with name:" +
              tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1,
                                 cpu_instance_type,
                                 endpoint_name=best_training_job)
        predictor.serializer = _fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
Example 23
def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_uri = image_uris.retrieve("factorization-machines",
                                    sagemaker_session.boto_region_name)
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_uri=image_uri,
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        hyperparameter_ranges = {"mini_batch_size": IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name="test:binary_classification_accuracy",
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("byo", 32)
        print("Started hyperparameter tuning job with name {}:".format(
            tuning_job_name))
        tuner.fit(
            {
                "train": s3_train_data,
                "test": s3_train_data
            },
            include_cls_metadata=False,
            job_name=tuning_job_name,
        )

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(
            1,
            cpu_instance_type,
            endpoint_name=best_training_job,
            serializer=_FactorizationMachineSerializer(),
            deserializer=JSONDeserializer(),
        )

        result = predictor.predict(datasets.one_p_mnist()[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
Example 24
def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
        )
        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description.get(
            "ProfilerConfig") == profiler_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        mx.update_profiler(
            rules=[ProfilerRule.sagemaker(rule_configs.CPUBottleneck())],
            system_monitor_interval_millis=500,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"][
            "S3OutputPath"] == profiler_config.s3_output_path
        assert job_description["ProfilerConfig"][
            "ProfilingIntervalInMilliseconds"] == 500

        profiler_report_rule_config = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_report_rule_config["RuleConfigurationName"])
        assert profiler_report_rule_config[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_report_rule_config["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }
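# Several of the profiler tests call `_wait_until_training_can_be_updated`
# before mutating a running job; the helper is not included in this excerpt.
# A hedged sketch of the idea: poll DescribeTrainingJob until the job has
# actually started training, since profiler updates are rejected while it is
# still launching. The polling interval and the exact statuses checked here
# are assumptions.
import time


def _wait_until_training_can_be_updated(sagemaker_client, job_name, poll=5):
    while True:
        status = sagemaker_client.describe_training_job(
            TrainingJobName=job_name)["SecondaryStatus"]
        # Stop on terminal states too, so a failed job does not hang the test.
        if status in ("Training", "Completed", "Failed", "Stopped"):
            return
        time.sleep(poll)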
Example 25
def test_transform_tf_kms_network_isolation(sagemaker_session,
                                            cpu_instance_type, tmpdir,
                                            tf_full_version, py_version):
    data_path = os.path.join(DATA_DIR, "tensorflow_mnist")

    tf = TensorFlow(
        entry_point=os.path.join(data_path, "mnist.py"),
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        framework_version=tf_full_version,
        script_mode=True,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
    )

    s3_prefix = "integ-test-data/tf-scriptmode/mnist"
    training_input = sagemaker_session.upload_data(
        path=os.path.join(data_path, "data"),
        key_prefix="{}/training".format(s3_prefix))

    job_name = unique_name_from_base("test-tf-transform")
    tf.fit(inputs=training_input, job_name=job_name)

    transform_input = sagemaker_session.upload_data(
        path=os.path.join(data_path, "transform"),
        key_prefix="{}/transform".format(s3_prefix))

    with bucket_with_encryption(sagemaker_session,
                                "SageMakerRole") as (bucket_with_kms, kms_key):
        output_path = "{}/{}/output".format(bucket_with_kms, job_name)

        transformer = tf.transformer(
            instance_count=1,
            instance_type=cpu_instance_type,
            output_path=output_path,
            output_kms_key=kms_key,
            volume_kms_key=kms_key,
            enable_network_isolation=True,
        )

        with timeout_and_delete_model_with_transformer(
                transformer,
                sagemaker_session,
                minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
            transformer.transform(transform_input,
                                  job_name=job_name,
                                  content_type="text/csv",
                                  wait=True)

            model_desc = sagemaker_session.sagemaker_client.describe_model(
                ModelName=transformer.model_name)
            assert model_desc["EnableNetworkIsolation"]

        job_desc = sagemaker_session.describe_transform_job(job_name=job_name)
        assert job_desc["TransformOutput"]["S3OutputPath"] == output_path
        assert job_desc["TransformOutput"]["KmsKeyId"] == kms_key
        assert job_desc["TransformResources"]["VolumeKmsKeyId"] == kms_key

        s3.S3Downloader.download(
            s3_uri=output_path,
            local_path=os.path.join(tmpdir, "tf-batch-output"),
            session=sagemaker_session,
        )

        with open(os.path.join(tmpdir, "tf-batch-output",
                               "data.csv.out")) as f:
            result = json.load(f)
            assert len(result["predictions"][0]["probabilities"]) == 10
            assert result["predictions"][0]["classes"] == 1
Example 26
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            ProfilerRule.sagemaker(rule_configs.ProfilerReport(),
                                   name="CustomProfilerReportRule"),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description[
            "ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration[
            "RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[
            0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for index, rule in enumerate(mx.debugger_rules):
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleConfigurationName"] == rule.name)
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleEvaluatorImage"] == rule.image_uri)

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        mx.update_profiler(disable_framework_metrics=True)
        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
Example 27
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]),
                   wait=False,
                   job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example 28
def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            framework_profile_params=FrameworkProfile(start_step=1,
                                                      num_steps=5))

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"]["ProfilingParameters"] ==
                profiler_config._to_request_dict()["ProfilingParameters"])
        assert job_description.get("ProfilingStatus") == "Enabled"

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        updated_framework_profile = FrameworkProfile(
            detailed_profiling_config=DetailedProfilingConfig(
                profile_default_steps=True))
        mx.update_profiler(framework_profile_params=updated_framework_profile)

        job_description = mx.latest_training_job.describe()
        assert (job_description["ProfilerConfig"]["ProfilingParameters"] ==
                updated_framework_profile.profiling_parameters)

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*",
                        profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration[
            "RuleEvaluatorImage"] == get_rule_container_image_uri(
                mx.sagemaker_session.boto_region_name)
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport"
        }
@pytest.fixture
def pipeline_name():
    # The test below requests pipeline_name as an argument, so it is defined
    # here as a pytest fixture that returns a unique pipeline name.
    return utils.unique_name_from_base("my-pipeline-clarify")
def test_model_registration_with_drift_check_baselines(
    sagemaker_session,
    role,
    pipeline_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")

    # upload model data to s3
    model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz")
    model_base_uri = "s3://{}/{}/input/model/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("model"),
    )
    model_uri = S3Uploader.upload(model_local_path,
                                  model_base_uri,
                                  sagemaker_session=sagemaker_session)
    model_uri_param = ParameterString(name="model_uri",
                                      default_value=model_uri)

    # upload metrics to s3
    metrics_data = (
        '{"regression_metrics": {"mse": {"value": 4.925353410353891, '
        '"standard_deviation": 2.219186917819692}}}')
    metrics_base_uri = "s3://{}/{}/input/metrics/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("metrics"),
    )
    metrics_uri = S3Uploader.upload_string_as_file_body(
        body=metrics_data,
        desired_s3_uri=metrics_base_uri,
        sagemaker_session=sagemaker_session,
    )
    metrics_uri_param = ParameterString(name="metrics_uri",
                                        default_value=metrics_uri)

    model_metrics = ModelMetrics(
        bias=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    drift_check_baselines = DriftCheckBaselines(
        model_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    customer_metadata_properties = {"key1": "value1"}
    estimator = XGBoost(
        entry_point="training.py",
        source_dir=os.path.join(DATA_DIR, "sip"),
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )
    step_register = RegisterModel(
        name="MyRegisterModelStep",
        estimator=estimator,
        model_data=model_uri_param,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="testModelPackageGroup",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        customer_metadata_properties=customer_metadata_properties,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            model_uri_param,
            metrics_uri_param,
            instance_type,
            instance_count,
        ],
        steps=[step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        for _ in retries(
                max_retry_count=5,
                exception_message_prefix=
                "Waiting for a successful execution of pipeline",
                seconds_to_sleep=10,
        ):
            execution = pipeline.start(parameters={
                "model_uri": model_uri,
                "metrics_uri": metrics_uri
            })
            response = execution.describe()

            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if failure_reason != "":
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}."
                    " Retrying..")
                continue
            assert execution_steps[0]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["StepName"] == "MyRegisterModelStep"

            response = sagemaker_session.sagemaker_client.describe_model_package(
                ModelPackageName=execution_steps[0]["Metadata"]
                ["RegisterModel"]["Arn"])

            assert (response["ModelMetrics"]["Explainability"]["Report"]
                    ["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["Bias"][
                "PreTrainingConstraints"]["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["Explainability"]
                    ["Constraints"]["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["ModelQuality"]
                    ["Statistics"]["ContentType"] == "application/json")
            assert (response["DriftCheckBaselines"]["ModelDataQuality"]
                    ["Statistics"]["ContentType"] == "application/json")
            assert response[
                "CustomerMetadataProperties"] == customer_metadata_properties
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
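# `retries(...)` used in the pipeline test above is an integ-test helper that
# is not defined in this excerpt. A minimal sketch of the assumed contract: a
# generator that yields up to max_retry_count attempts, sleeping between them,
# and raises once the attempts are exhausted. The names and the exception type
# are assumptions, not the original helper.
import time


def retries(max_retry_count, exception_message_prefix, seconds_to_sleep):
    for attempt in range(max_retry_count):
        yield attempt
        # Only reached when the caller continues to the next attempt.
        time.sleep(seconds_to_sleep)
    raise Exception("{} (gave up after {} attempts)".format(
        exception_message_prefix, max_retry_count))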