def pipeline_name():
    return utils.unique_name_from_base("my-pipeline-model-regis")
def test_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(
            kmeans.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        # the assertion must run after the context exits, or it would never execute
        assert "Could not find model" in str(exception.value)
def test_disabling_data_capture_on_endpoint_shows_correct_data_capture_status(
    sagemaker_session, tensorflow_inference_latest_version
):
    endpoint_name = unique_name_from_base("sagemaker-tensorflow-serving")
    model_data = sagemaker_session.upload_data(
        path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"),
        key_prefix="tensorflow-serving/models",
    )
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(
        endpoint_name, sagemaker_session
    ):
        model = TensorFlowModel(
            model_data=model_data,
            role=ROLE,
            framework_version=tensorflow_inference_latest_version,
            sagemaker_session=sagemaker_session,
        )
        destination_s3_uri = os.path.join(
            "s3://", sagemaker_session.default_bucket(), endpoint_name, "custom"
        )
        predictor = model.deploy(
            initial_instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            endpoint_name=endpoint_name,
            data_capture_config=DataCaptureConfig(
                enable_capture=True,
                sampling_percentage=CUSTOM_SAMPLING_PERCENTAGE,
                destination_s3_uri=destination_s3_uri,
                capture_options=CUSTOM_CAPTURE_OPTIONS,
                csv_content_types=CUSTOM_CSV_CONTENT_TYPES,
                json_content_types=CUSTOM_JSON_CONTENT_TYPES,
                sagemaker_session=sagemaker_session,
            ),
        )

        endpoint_desc = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=predictor.endpoint_name
        )
        endpoint_config_desc = sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=endpoint_desc["EndpointConfigName"]
        )

        assert endpoint_config_desc["DataCaptureConfig"]["EnableCapture"]
        assert (
            endpoint_config_desc["DataCaptureConfig"]["InitialSamplingPercentage"]
            == CUSTOM_SAMPLING_PERCENTAGE
        )
        assert endpoint_config_desc["DataCaptureConfig"]["CaptureOptions"] == [
            {"CaptureMode": "Input"}
        ]
        assert (
            endpoint_config_desc["DataCaptureConfig"]["CaptureContentTypeHeader"]["CsvContentTypes"]
            == CUSTOM_CSV_CONTENT_TYPES
        )
        assert (
            endpoint_config_desc["DataCaptureConfig"]["CaptureContentTypeHeader"]["JsonContentTypes"]
            == CUSTOM_JSON_CONTENT_TYPES
        )

        predictor.disable_data_capture()

        # Wait for endpoint to finish updating.
        # Endpoint update takes ~7min. 25 retries * 60s sleeps = 25min timeout.
        for _ in retries(
            max_retry_count=25,
            exception_message_prefix="Waiting for 'InService' endpoint status",
            seconds_to_sleep=60,
        ):
            new_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
                EndpointName=predictor.endpoint_name
            )
            if new_endpoint["EndpointStatus"] == "InService":
                break

        endpoint_desc = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=predictor.endpoint_name
        )
        endpoint_config_desc = sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=endpoint_desc["EndpointConfigName"]
        )
        assert not endpoint_config_desc["DataCaptureConfig"]["EnableCapture"]
def test_attach_tuning_pytorch(
    sagemaker_session,
    cpu_instance_type,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    mnist_script = os.path.join(mnist_dir, "mnist.py")

    estimator = PyTorch(
        entry_point=mnist_script,
        role="SageMakerRole",
        instance_count=1,
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        objective_metric_name = "evaluation-accuracy"
        metric_definitions = [
            {"Name": "evaluation-accuracy", "Regex": r"Overall test accuracy: (\d+)"}
        ]
        hyperparameter_ranges = {"batch-size": IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        training_data = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "training"),
            key_prefix="integ-test-data/pytorch_mnist/training",
        )

        tuning_job_name = unique_name_from_base("pytorch", max_length=32)
        print("Started hyperparameter tuning job with name: {}".format(tuning_job_name))
        tuner.fit({"training": training_data}, job_name=tuning_job_name)

    endpoint_name = tuning_job_name
    model_name = "model-name-1"
    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session
    )
    assert attached_tuner.early_stopping_type == "Auto"

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = attached_tuner.deploy(
            1, cpu_instance_type, endpoint_name=endpoint_name, model_name=model_name
        )

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, cpu_instance_type)
        data = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(data)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
        _assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)
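# NOTE: `_assert_model_name_match` is referenced above but defined elsewhere in the
# test module. A minimal sketch of such a helper, assuming it checks that the
# endpoint config's first production variant references the expected model
# (hypothetical reconstruction, not necessarily the repository's exact code):
def _assert_model_name_match(sagemaker_client, endpoint_config_name, model_name):
    endpoint_config_description = sagemaker_client.describe_endpoint_config(
        EndpointConfigName=endpoint_config_name
    )
    assert model_name == endpoint_config_description["ProductionVariants"][0]["ModelName"]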
def test_processing_step_with_clarify_processor(pipeline_session):
    def headers():
        return ["Label", "F1", "F2", "F3", "F4"]

    def data_bias_config():
        return BiasConfig(
            label_values_or_threshold=[1],
            facet_name="F1",
            facet_values_or_threshold=[0.5],
            group_name="F2",
        )

    def model_config(model_name):
        return ModelConfig(
            model_name=model_name,
            instance_type="ml.c5.xlarge",
            instance_count=1,
            accept_type="application/jsonlines",
            endpoint_name_prefix="myprefix",
        )

    def shap_config():
        return SHAPConfig(
            baseline=[[0.94672389, 0.47108862, 0.63350081, 0.00604642]],
            num_samples=2,
            agg_method="mean_sq",
            seed=123,
        )

    def verify(step_args):
        step = ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
        )
        pipeline = Pipeline(
            name="MyPipeline",
            steps=[step],
            sagemaker_session=pipeline_session,
        )
        assert json.loads(pipeline.definition())["Steps"][0] == {
            "Name": "MyProcessingStep",
            "Type": "Processing",
            "Arguments": step_args,
        }

    test_run = utils.unique_name_from_base("test_run")
    output_path = "s3://{}/{}/{}".format(
        pipeline_session.default_bucket(), "linear_learner_analysis_result", test_run
    )
    data_config = DataConfig(
        s3_data_input_path=f"s3://{pipeline_session.default_bucket()}/{input}/train.csv",
        s3_output_path=output_path,
        label="Label",
        headers=headers(),
        dataset_type="text/csv",
    )

    clarify_processor = SageMakerClarifyProcessor(
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        sagemaker_session=pipeline_session,
        role=sagemaker.get_execution_role(),
    )

    run_bias_args = clarify_processor.run_bias(
        data_config=data_config,
        bias_config=data_bias_config(),
        model_config=model_config("1st-model-rpyndy0uyo"),
    )
    verify(run_bias_args)

    run_pre_training_bias_args = clarify_processor.run_pre_training_bias(
        data_config=data_config,
        data_bias_config=data_bias_config(),
    )
    verify(run_pre_training_bias_args)

    run_post_training_bias_args = clarify_processor.run_post_training_bias(
        data_config=data_config,
        data_bias_config=data_bias_config(),
        model_config=model_config("1st-model-rpyndy0uyo"),
        model_predicted_label_config=ModelPredictedLabelConfig(probability_threshold=0.9),
    )
    verify(run_post_training_bias_args)

    run_explainability_args = clarify_processor.run_explainability(
        data_config=data_config,
        model_config=model_config("1st-model-rpyndy0uyo"),
        explainability_config=shap_config(),
    )
    verify(run_explainability_args)
def test_mnist_with_checkpoint_config(
    sagemaker_session,
    instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(
        sagemaker_session.default_bucket(), sagemaker_timestamp()
    )
    checkpoint_local_path = "/test/checkpoint/path"
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
        environment=ENV_INPUT,
        max_wait=24 * 60 * 60,
        max_retry_attempts=2,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )

    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=training_job_name)

    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
    # remove dataframe assertion to unblock PR build
    # TODO: add independent integration test for `training_job_analytics`

    expected_training_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    actual_training_checkpoint_config = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )["CheckpointConfig"]
    actual_training_environment_variable_config = (
        sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=training_job_name
        )["Environment"]
    )

    expected_retry_strategy = {"MaximumRetryAttempts": 2}
    actual_retry_strategy = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )["RetryStrategy"]

    assert actual_training_checkpoint_config == expected_training_checkpoint_config
    assert actual_training_environment_variable_config == ENV_INPUT
    assert actual_retry_strategy == expected_retry_strategy
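# NOTE: `assert_s3_files_exist` comes from the shared integration-test utilities.
# A minimal sketch, assuming it lists the objects under the S3 prefix and fails if
# any expected file name is missing (hypothetical reconstruction):
from urllib.parse import urlparse


def assert_s3_files_exist(sagemaker_session, s3_url, files):
    parsed_url = urlparse(s3_url)
    s3 = sagemaker_session.boto_session.resource("s3")
    prefix = parsed_url.path.lstrip("/")
    contents = list(s3.Bucket(parsed_url.netloc).objects.filter(Prefix=prefix))
    for f in files:
        # match on suffix so files nested under the prefix still count
        if not any(obj.key.endswith(f) for obj in contents):
            raise ValueError("File {} is not found under {}".format(f, s3_url))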
def test_tuning_lda(sagemaker_session, cpu_instance_type):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "lda")
        data_filename = "nips-train_1.pbr"

        with open(os.path.join(data_path, data_filename), "rb") as f:
            all_records = read_records(f)

        # all records must have the same number of features
        feature_num = int(all_records[0].features["values"].float32_tensor.shape[0])

        lda = LDA(
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num, sagemaker_session
        )
        test_record_set = prepare_record_set_from_local_files(
            data_path, lda.data_location, len(all_records), feature_num, sagemaker_session
        )
        test_record_set.channel = "test"

        # specify which hyperparameters to optimize over
        hyperparameter_ranges = {
            "alpha0": ContinuousParameter(1, 10),
            "num_topics": IntegerParameter(1, 2),
        }
        objective_metric_name = "test:pwll"

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Maximize",
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        tuning_job_name = unique_name_from_base("test-lda", max_length=32)
        print("Started hyperparameter tuning job with name: " + tuning_job_name)
        tuner.fit([record_set, test_record_set], mini_batch_size=1, job_name=tuning_job_name)

    attached_tuner = HyperparameterTuner.attach(
        tuning_job_name, sagemaker_session=sagemaker_session
    )
    assert attached_tuner.early_stopping_type == "Auto"
    assert attached_tuner.estimator.alpha0 == 1.0
    assert attached_tuner.estimator.num_topics == 1

    best_training_job = attached_tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None
def multi_variant_endpoint(sagemaker_session):
    """
    Sets up the multi variant endpoint before the integration tests run.
    Cleans up the multi variant endpoint after the integration tests run.
    """
    multi_variant_endpoint.endpoint_name = unique_name_from_base(
        "integ-test-multi-variant-endpoint"
    )
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(
        endpoint_name=multi_variant_endpoint.endpoint_name,
        sagemaker_session=sagemaker_session,
        hours=2,
    ):
        # Creating a model
        bucket = sagemaker_session.default_bucket()
        prefix = "sagemaker/DEMO-VariantTargeting"
        model_url = S3Uploader.upload(
            local_path=XG_BOOST_MODEL_LOCAL_PATH,
            desired_s3_uri="s3://" + bucket + "/" + prefix,
            session=sagemaker_session,
        )

        image_uri = get_image_uri(sagemaker_session.boto_session.region_name, "xgboost", "0.90-1")
        multi_variant_endpoint_model = sagemaker_session.create_model(
            name=MODEL_NAME,
            role=ROLE,
            container_defs={"Image": image_uri, "ModelDataUrl": model_url},
        )

        # Creating a multi variant endpoint
        variant1 = production_variant(
            model_name=MODEL_NAME,
            instance_type=DEFAULT_INSTANCE_TYPE,
            initial_instance_count=DEFAULT_INSTANCE_COUNT,
            variant_name=TEST_VARIANT_1,
            initial_weight=TEST_VARIANT_1_WEIGHT,
        )
        variant2 = production_variant(
            model_name=MODEL_NAME,
            instance_type=DEFAULT_INSTANCE_TYPE,
            initial_instance_count=DEFAULT_INSTANCE_COUNT,
            variant_name=TEST_VARIANT_2,
            initial_weight=TEST_VARIANT_2_WEIGHT,
        )
        sagemaker_session.endpoint_from_production_variants(
            name=multi_variant_endpoint.endpoint_name,
            production_variants=[variant1, variant2],
        )

        # Yield to run the integration tests
        yield multi_variant_endpoint

        # Cleanup resources
        sagemaker_session.delete_model(multi_variant_endpoint_model)
        sagemaker_session.sagemaker_client.delete_endpoint_config(
            EndpointConfigName=multi_variant_endpoint.endpoint_name
        )

        # Validate resource cleanup; each describe call must raise, and the
        # assertions must run outside the raises-block to actually execute
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(
                ModelName=multi_variant_endpoint_model.name
            )
        assert "Could not find model" in str(exception.value)

        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_endpoint_config(
                EndpointConfigName=multi_variant_endpoint.endpoint_name
            )
        assert "Could not find endpoint" in str(exception.value)
def test_async_linear_learner(sagemaker_session):
    job_name = unique_name_from_base('linear-learner')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

        ll = LinearLearner(
            'SageMakerRole',
            1,
            'ml.c4.2xlarge',
            predictor_type='binary_classifier',
            sagemaker_session=sagemaker_session,
        )
        ll.binary_classifier_model_selection_criteria = 'accuracy'
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = 'uniform'
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = 'adam'
        ll.loss = 'logistic'
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(
            ll.record_set(train_set[0][:200], train_set[1][:200]),
            wait=False,
            job_name=job_name,
        )

    print("Waiting to re-attach to the training job: %s" % job_name)
    time.sleep(20)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = LinearLearner.attach(
            training_job_name=job_name, sagemaker_session=sagemaker_session
        )
        model = LinearLearnerModel(
            estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
def test_async_linear_learner(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("linear-learner")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        training_set[1][:100] = 1
        training_set[1][100:200] = 0
        training_set = training_set[0], training_set[1].astype(np.dtype("float32"))

        ll = LinearLearner(
            "SageMakerRole",
            1,
            cpu_instance_type,
            predictor_type="binary_classifier",
            sagemaker_session=sagemaker_session,
        )
        ll.binary_classifier_model_selection_criteria = "accuracy"
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = "uniform"
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = "adam"
        ll.loss = "logistic"
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(
            ll.record_set(training_set[0][:200], training_set[1][:200]),
            wait=False,
            job_name=job_name,
        )

    print("Waiting to re-attach to the training job: %s" % job_name)
    time.sleep(20)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = LinearLearner.attach(
            training_job_name=job_name, sagemaker_session=sagemaker_session
        )
        model = LinearLearnerModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        result = predictor.predict(training_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
def test_training(sagemaker_session, ecr_image, instance_type, framework_version):
    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "tf-container-integ-test-{}".format(int(time.time()))
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "tf-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(
        experiment_name=experiment_name,
        trial_name=trial_name,
        sagemaker_boto_client=sm_client,
    )

    training_job_name = utils.unique_name_from_base("test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=DEFAULT_TIMEOUT):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            entry_point=script,
            role="SageMakerRole",
            train_instance_type=instance_type,
            train_instance_count=1,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            framework_version=framework_version,
            script_mode=True,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist",
        )
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn, sagemaker_boto_client=sm_client)
    )
    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
def test_choice_state_machine_creation(sfn_client, sfn_role_arn):
    choice_state_name = "ChoiceState"
    first_match_name = "FirstMatchState"
    second_match_name = "SecondMatchState"
    default_state_name = "DefaultState"
    variable = "$.choice"
    first_choice_value = 1
    second_choice_value = 2
    default_error = "DefaultStateError"
    default_cause = "No Matches"
    first_choice_state_result = "First Choice State"
    second_choice_state_result = "Second Choice State"
    state_machine_input = {"choice": first_choice_value}

    asl_state_machine_definition = {
        "StartAt": choice_state_name,
        "States": {
            choice_state_name: {
                "Type": "Choice",
                "Choices": [
                    {
                        "Variable": variable,
                        "NumericEquals": first_choice_value,
                        "Next": first_match_name,
                    },
                    {
                        "Variable": variable,
                        "NumericEquals": second_choice_value,
                        "Next": second_match_name,
                    },
                ],
                "Default": default_state_name,
            },
            default_state_name: {
                "Error": default_error,
                "Cause": default_cause,
                "Type": "Fail",
            },
            first_match_name: {
                "Type": "Pass",
                "Result": first_choice_state_result,
                "End": True,
            },
            second_match_name: {
                "Type": "Pass",
                "Result": second_choice_state_result,
                "End": True,
            },
        },
    }

    definition = steps.Choice(choice_state_name)
    definition.default_choice(
        steps.Fail(default_state_name, error=default_error, cause=default_cause)
    )
    definition.add_choice(
        steps.ChoiceRule.NumericEquals(variable=variable, value=first_choice_value),
        steps.Pass(first_match_name, result=first_choice_state_result),
    )
    definition.add_choice(
        steps.ChoiceRule.NumericEquals(variable=variable, value=second_choice_value),
        steps.Pass(second_match_name, result=second_choice_state_result),
    )

    workflow = Workflow(
        unique_name_from_base('Test_Choice_Workflow'),
        definition=definition,
        role=sfn_role_arn,
    )

    workflow_test_suite(
        sfn_client,
        workflow,
        asl_state_machine_definition,
        first_choice_state_result,
        state_machine_input,
    )
def test_wait_state_machine_creation(sfn_client, sfn_role_arn):
    first_state_name = "FirstState"
    first_wait_state_name = "WaitInSeconds"
    second_wait_state_name = "WaitTimestamp"
    third_wait_state_name = "WaitTimestampPath"
    fourth_wait_state_name = "WaitInSecondsPath"
    final_state_name = "FinalState"
    timestamp = "2019-09-04T01:59:00Z"
    timestamp_path = "$.expirydate"
    seconds = 2
    seconds_path = "$.expiryseconds"
    wait_state_result = "Wait Result"
    parameters = {'expirydate': timestamp, 'expiryseconds': seconds}

    asl_state_machine_definition = {
        "StartAt": first_state_name,
        "States": {
            first_state_name: {
                "Type": "Pass",
                "Next": first_wait_state_name,
                "Parameters": parameters,
            },
            first_wait_state_name: {
                "Seconds": seconds,
                "Type": "Wait",
                "Next": second_wait_state_name,
            },
            second_wait_state_name: {
                "Timestamp": timestamp,
                "Type": "Wait",
                "Next": third_wait_state_name,
            },
            third_wait_state_name: {
                "TimestampPath": timestamp_path,
                "Type": "Wait",
                "Next": fourth_wait_state_name,
            },
            fourth_wait_state_name: {
                "SecondsPath": seconds_path,
                "Type": "Wait",
                "Next": final_state_name,
            },
            final_state_name: {
                "Type": "Pass",
                "Result": wait_state_result,
                "End": True,
            },
        },
    }

    definition = steps.Chain([
        steps.Pass(first_state_name, parameters=parameters),
        steps.Wait(first_wait_state_name, seconds=seconds),
        steps.Wait(second_wait_state_name, timestamp=timestamp),
        steps.Wait(third_wait_state_name, timestamp_path=timestamp_path),
        steps.Wait(fourth_wait_state_name, seconds_path=seconds_path),
        steps.Pass(final_state_name, result=wait_state_result),
    ])

    workflow = Workflow(
        unique_name_from_base('Test_Wait_Workflow'),
        definition=definition,
        role=sfn_role_arn,
    )

    workflow_test_suite(sfn_client, workflow, asl_state_machine_definition, wait_state_result)
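# NOTE: `workflow_test_suite` is the shared harness both state-machine tests above
# call. A minimal sketch, assuming it creates the workflow, compares the rendered
# definition with the expected ASL, runs one execution, and checks the output; the
# exact output assertion is an assumption (hypothetical reconstruction):
def workflow_test_suite(sfn_client, workflow, asl_definition, expected_result, execution_input=None):
    state_machine_arn = workflow.create()
    # Step Functions stores the definition as a JSON string; parse before comparing.
    described = sfn_client.describe_state_machine(stateMachineArn=state_machine_arn)
    assert json.loads(described["definition"]) == asl_definition

    execution = workflow.execute(inputs=execution_input)
    execution_output = execution.get_output(wait=True)
    assert expected_result in json.dumps(execution_output)

    workflow.delete()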
import os

import pytest
import scipy.stats as st

from sagemaker import image_uris
from sagemaker.deserializers import CSVDeserializer
from sagemaker.s3 import S3Uploader
from sagemaker.session import production_variant
from sagemaker.sparkml import SparkMLModel
from sagemaker.utils import unique_name_from_base
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

import tests.integ

ROLE = "SageMakerRole"
MODEL_NAME = unique_name_from_base("test-xgboost-model")
DEFAULT_REGION = "us-west-2"
DEFAULT_INSTANCE_TYPE = "ml.m5.xlarge"
DEFAULT_INSTANCE_COUNT = 1
XG_BOOST_MODEL_LOCAL_PATH = os.path.join(tests.integ.DATA_DIR, "xgboost_model", "xgb_model.tar.gz")

TEST_VARIANT_1 = "Variant1"
TEST_VARIANT_1_WEIGHT = 0.3

TEST_VARIANT_2 = "Variant2"
TEST_VARIANT_2_WEIGHT = 0.7

VARIANT_TRAFFIC_SAMPLING_COUNT = 100
DESIRED_CONFIDENCE_FOR_VARIANT_TRAFFIC_DISTRIBUTION = 0.999
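# NOTE: `scipy.stats` and the two sampling constants above are used by the
# variant-traffic tests in this module. A minimal sketch of the kind of check they
# support, assuming per-variant invocation counts are modeled as a binomial sample
# (hypothetical helper name, not part of this module's public API):
def _variant_traffic_is_plausible(variant_1_count, total_count=VARIANT_TRAFFIC_SAMPLING_COUNT):
    # Two-sided binomial interval around the configured weight for Variant1.
    low, high = st.binom.interval(
        DESIRED_CONFIDENCE_FOR_VARIANT_TRAFFIC_DISTRIBUTION,
        total_count,
        TEST_VARIANT_1_WEIGHT,
    )
    return low <= variant_1_count <= high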
def test_run_bias_monitor_baseline(
    sagemaker_session,
    data_config,
    model_config,
    bias_config,
    model_predicted_label_config,
    endpoint_name,
    ground_truth_input,
    upload_actual_data,
):
    monitor = ModelBiasMonitor(
        role=ROLE,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        volume_size_in_gb=VOLUME_SIZE_IN_GB,
        max_runtime_in_seconds=MAX_RUNTIME_IN_SECONDS,
        sagemaker_session=sagemaker_session,
        tags=TEST_TAGS,
    )

    baselining_job_name = utils.unique_name_from_base("bias-baselining-job")
    print("Creating baselining job: {}".format(baselining_job_name))
    monitor.suggest_baseline(
        data_config=data_config,
        bias_config=bias_config,
        model_config=model_config,
        model_predicted_label_config=model_predicted_label_config,
        job_name=baselining_job_name,
    )
    assert (
        monitor.latest_baselining_job_config.probability_threshold_attribute
        == BIAS_PROBABILITY_THRESHOLD
    )

    monitoring_schedule_name = utils.unique_name_from_base("bias-suggest-baseline")
    s3_uri_monitoring_output = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        endpoint_name,
        monitoring_schedule_name,
        "monitor_output",
    )
    # Let's test if the schedule can pick up analysis_config from baselining job
    monitor.create_monitoring_schedule(
        output_s3_uri=s3_uri_monitoring_output,
        monitor_schedule_name=monitoring_schedule_name,
        endpoint_input=EndpointInput(
            endpoint_name=endpoint_name,
            destination=ENDPOINT_INPUT_LOCAL_PATH,
            start_time_offset=START_TIME_OFFSET,
            end_time_offset=END_TIME_OFFSET,
        ),
        ground_truth_input=ground_truth_input,
        schedule_cron_expression=CRON,
    )

    _verify_execution_status(monitor)

    _verify_bias_job_description(
        sagemaker_session=sagemaker_session,
        monitor=monitor,
        endpoint_name=endpoint_name,
        ground_truth_input=ground_truth_input,
    )

    monitor.delete_monitoring_schedule()
def test_run_model_quality_monitor_baseline(
    sagemaker_session,
    endpoint_name,
    data_path,
    ground_truth_input,
    upload_actual_data,
):
    monitor = ModelQualityMonitor(
        role=ROLE,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        volume_size_in_gb=VOLUME_SIZE_IN_GB,
        max_runtime_in_seconds=MAX_RUNTIME_IN_SECONDS,
        sagemaker_session=sagemaker_session,
        env=TEST_ENV,
        tags=TEST_TAGS,
    )

    baselining_job_name = utils.unique_name_from_base("model-quality-baselining-job")
    print("Creating baselining job: {}".format(baselining_job_name))
    monitor.suggest_baseline(
        baseline_dataset=data_path,
        dataset_format=DatasetFormat.csv(),
        problem_type=PROBLEM_TYPE,
        job_name=baselining_job_name,
        ground_truth_attribute=HEADER_OF_LABEL,
        inference_attribute=HEADER_OF_PREDICTED_LABEL,
    )

    monitoring_schedule_name = utils.unique_name_from_base("model-quality-suggest-baseline")
    s3_uri_monitoring_output = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        endpoint_name,
        monitoring_schedule_name,
        "monitor_output",
    )
    monitor.create_monitoring_schedule(
        endpoint_input=EndpointInput(
            endpoint_name=endpoint_name,
            destination=ENDPOINT_INPUT_LOCAL_PATH,
            start_time_offset=START_TIME_OFFSET,
            end_time_offset=END_TIME_OFFSET,
            inference_attribute=INFERENCE_ATTRIBUTE,
        ),
        ground_truth_input=ground_truth_input,
        problem_type=PROBLEM_TYPE,
        output_s3_uri=s3_uri_monitoring_output,
        monitor_schedule_name=monitoring_schedule_name,
        schedule_cron_expression=CRON,
    )

    _verify_execution_status(monitor)

    _verify_model_quality_job_description(
        sagemaker_session=sagemaker_session,
        monitor=monitor,
        endpoint_name=endpoint_name,
        ground_truth_input=ground_truth_input,
    )

    monitor.delete_monitoring_schedule()
def test_linear_learner(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("linear-learner")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))

        ll = LinearLearner(
            "SageMakerRole",
            1,
            cpu_instance_type,
            predictor_type="binary_classifier",
            sagemaker_session=sagemaker_session,
        )
        ll.binary_classifier_model_selection_criteria = "accuracy"
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = "uniform"
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = "adam"
        ll.loss = "logistic"
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3
        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        predictor = ll.deploy(1, cpu_instance_type, endpoint_name=job_name)

        result = predictor.predict(train_set[0][0:100])
        assert len(result) == 100
        for record in result:
            assert record.label["predicted_label"] is not None
            assert record.label["score"] is not None
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "mxnet-container-integ-test-{}".format(int(time.time()))
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-mxnet-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "mxnet-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(
        experiment_name=experiment_name,
        trial_name=trial_name,
        sagemaker_boto_client=sm_client,
    )

    hyperparameters = {
        "random_seed": True,
        "num_steps": 50,
        "smdebug_path": "/opt/ml/output/tensors",
        "epochs": 1,
    }

    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role="SageMakerRole",
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters=hyperparameters,
    )

    training_job_name = utils.unique_name_from_base("test-mxnet-image")

    # create a training job and wait for it to complete
    with timeout(minutes=15):
        prefix = "mxnet_mnist_gluon_basic_hook_demo/{}".format(utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "train"), key_prefix=prefix + "/train"
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "test"), key_prefix=prefix + "/test"
        )
        mx.fit({"train": train_input, "test": test_input}, job_name=training_job_name, wait=False)

    training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job, retrying
    # because the component may take a while to appear
    trial_component_summary = None
    attempts = 0
    while True:
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn, sagemaker_boto_client=sm_client)
        )
        if len(trial_components) > 0:
            trial_component_summary = trial_components[0]
            break
        if attempts >= 10:
            # give up; the assertion below fails with a clear message
            break
        attempts += 1
        sleep(500)

    assert trial_component_summary is not None

    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
def test_smmodelparallel_mnist_multigpu_singlenode(
    ecr_image, instance_type, sagemaker_regions, test_script, num_processes
):
    """Test SMModelParallel PT GPT-2 training via script mode."""
    framework, framework_version = get_framework_and_version_from_tag(ecr_image)
    if framework == "pytorch" and Version(framework_version) in SpecifierSet("==1.9.*"):
        pytest.skip("Skipping the test for PT1.9")

    # override: this test requires an 8-GPU instance regardless of the fixture value
    instance_type = "ml.p3.16xlarge"
    hyperparameters = {
        'training_dir': '/opt/ml/input/data/train',
        'max_steps': 100,
        'seed': 12345,
        'fp16': 1,
        'lr': 2.e-4,
        'lr_decay_iters': 125000,
        'min_lr': 0.00001,
        'lr-decay-style': 'linear',
        'warmup': 0.01,
        'logging_freq': 1,
        'max_context_width': 1024,
        'hidden_width': 768,
        'num_layers': 12,
        'num_heads': 12,
        'n_gpus': 8,
        'train_batch_size': 32,
        'microbatches': 1,
        'tensor_parallel_degree': 4,
        'pipeline_parallel_degree': 2,
        'activation_checkpointing': 1,
        'activation_strategy': "group_2",
        'manual_partition': 1,
    }

    train = sagemaker.session.s3_input(
        "s3://gpt2-data/train_synthetic_small/",
        distribution="FullyReplicated",
        content_type="application/tfrecord",
        s3_data_type="S3Prefix",
    )
    inputs = {"train": train, "test": train}

    validate_or_skip_smmodelparallel(ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator_parameter = {
            'entry_point': test_script,
            'role': 'SageMakerRole',
            'source_dir': gpt2_path,
            'instance_count': 1,
            'instance_type': instance_type,
            'hyperparameters': hyperparameters,
            'distribution': {
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "tensor_parallel_degree": 4,
                            "microbatches": 1,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled": True,
                    "processes_per_host": num_processes,
                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
            },
        }

        job_name = utils.unique_name_from_base('test-pt-smdmp-gpt2-singlenode')
        invoke_pytorch_estimator(
            ecr_image, sagemaker_regions, estimator_parameter, inputs=inputs, job_name=job_name
        )
def _test_training_function(
    ecr_image, sagemaker_session, instance_type, framework_version, py_version
):
    if py_version is None or '2' in py_version:
        pytest.skip('Skipping python2 {}'.format(py_version))
        return

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
    trial = Trial.create(
        experiment_name=experiment_name,
        trial_name=trial_name,
        sagemaker_boto_client=sm_client,
    )

    training_job_name = utils.unique_name_from_base("test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=15):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            entry_point=script,
            role="SageMakerRole",
            instance_type=instance_type,
            instance_count=1,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            script_mode=True,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist",
        )
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn, sagemaker_boto_client=sm_client)
    )
    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    experiment.delete()
def test_tuning_chainer(
    sagemaker_session, chainer_latest_version, chainer_latest_py_version, cpu_instance_type
):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        estimator = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train",
        )
        test_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test",
        )

        hyperparameter_ranges = {"alpha": ContinuousParameter(0.001, 0.005)}

        objective_metric_name = "Validation-accuracy"
        metric_definitions = [
            {
                "Name": "Validation-accuracy",
                "Regex": r"\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)",
            }
        ]

        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("chainer", max_length=32)
        print("Started hyperparameter tuning job with name: {}".format(tuning_job_name))
        tuner.fit({"train": train_input, "test": test_input}, job_name=tuning_job_name)

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)

        batch_size = 100
        data = np.zeros((batch_size, 784), dtype="float32")
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype="float32")
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype="float32")
        output = predictor.predict(data)
        assert len(output) == batch_size
def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
    """Use the Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take a standard data set,
    convert it to the format that the algorithm can process, and upload it to S3.
    Then we create the Estimator and set the hyperparameters required by the
    algorithm. Next, we can call fit() with the S3 path to the data. Later the
    trained model is deployed, and predictions are requested against the endpoint.
    The default predictor is updated with a JSON serializer and deserializer.
    """
    image_name = get_image_uri(sagemaker_session.boto_session.region_name, "factorization-machines")
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(
            path=training_data_path, key_prefix=os.path.join(prefix, "train", key)
        )

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )
        estimator.set_hyperparameters(
            num_factors=10,
            feature_dim=784,
            mini_batch_size=100,
            predictor_type="binary_classifier",
        )

        hyperparameter_ranges = {"mini_batch_size": IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name="test:binary_classification_accuracy",
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuner.fit(
            {"train": s3_train_data, "test": s3_train_data},
            include_cls_metadata=False,
            job_name=unique_name_from_base("byo", 32),
        )
        print("Started hyperparameter tuning job with name: " + tuner.latest_tuning_job.name)

        # give the tuning job time to register before waiting on it
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type, endpoint_name=best_training_job)
        predictor.serializer = _fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
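# NOTE: `_fm_serializer` and `json_deserializer` are defined/imported elsewhere in
# this module. A minimal sketch of the serializer, assuming the Factorization
# Machines JSON inference format of {"instances": [{"features": [...]}, ...]}
# (hypothetical reconstruction):
def _fm_serializer(data):
    js = {"instances": []}
    for row in data:
        js["instances"].append({"features": row.tolist()})
    return json.dumps(js)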
def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
    """Use the Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take a standard data set,
    convert it to the format that the algorithm can process, and upload it to S3.
    Then we create the Estimator and set the hyperparameters required by the
    algorithm. Next, we can call fit() with the S3 path to the data. Later the
    trained model is deployed, and predictions are requested against the endpoint.
    The default predictor is updated with a JSON serializer and deserializer.
    """
    image_uri = image_uris.retrieve("factorization-machines", sagemaker_session.boto_region_name)
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(
            path=training_data_path, key_prefix=os.path.join(prefix, "train", key)
        )

        estimator = Estimator(
            image_uri=image_uri,
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )
        estimator.set_hyperparameters(
            num_factors=10,
            feature_dim=784,
            mini_batch_size=100,
            predictor_type="binary_classifier",
        )

        hyperparameter_ranges = {"mini_batch_size": IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name="test:binary_classification_accuracy",
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("byo", 32)
        print("Started hyperparameter tuning job with name: {}".format(tuning_job_name))
        tuner.fit(
            {"train": s3_train_data, "test": s3_train_data},
            include_cls_metadata=False,
            job_name=tuning_job_name,
        )

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(
            1,
            cpu_instance_type,
            endpoint_name=best_training_job,
            serializer=_FactorizationMachineSerializer(),
            deserializer=JSONDeserializer(),
        )

        result = predictor.predict(datasets.one_p_mnist()[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
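# NOTE: `_FactorizationMachineSerializer` is the v2-style counterpart of the
# `_fm_serializer` function sketched earlier: a Serializer subclass passed directly
# to `tuner.deploy`. A minimal sketch, assuming it extends JSONSerializer and emits
# the same {"instances": [{"features": [...]}]} payload (hypothetical reconstruction):
from sagemaker.serializers import JSONSerializer


class _FactorizationMachineSerializer(JSONSerializer):
    def serialize(self, data):
        js = {"instances": []}
        for row in data:
            js["instances"].append({"features": row.tolist()})
        return json.dumps(js)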
def test_mxnet_with_custom_profiler_config_then_update_rule_and_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description.get("ProfilerConfig") == profiler_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

        mx.update_profiler(
            rules=[ProfilerRule.sagemaker(rule_configs.CPUBottleneck())],
            system_monitor_interval_millis=500,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["S3OutputPath"] == profiler_config.s3_output_path
        assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500

        profiler_report_rule_config = job_description.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", profiler_report_rule_config["RuleConfigurationName"])
        assert profiler_report_rule_config["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert profiler_report_rule_config["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
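# NOTE: `_wait_until_training_can_be_updated` is a module-level helper shared by the
# profiler tests. A minimal polling sketch, assuming a job accepts profiler updates
# once it is "InProgress" with a "Training" secondary status (the exact condition is
# an assumption; the repository's helper may differ):
def _wait_until_training_can_be_updated(sagemaker_client, job_name, poll=5):
    while not _training_job_is_updatable(sagemaker_client, job_name):
        time.sleep(poll)


def _training_job_is_updatable(sagemaker_client, job_name):
    describe_response = sagemaker_client.describe_training_job(TrainingJobName=job_name)
    return (
        describe_response["TrainingJobStatus"] == "InProgress"
        and describe_response["SecondaryStatus"] == "Training"
    )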
def test_transform_tf_kms_network_isolation(
    sagemaker_session, cpu_instance_type, tmpdir, tf_full_version, py_version
):
    data_path = os.path.join(DATA_DIR, "tensorflow_mnist")

    tf = TensorFlow(
        entry_point=os.path.join(data_path, "mnist.py"),
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        framework_version=tf_full_version,
        script_mode=True,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
    )

    s3_prefix = "integ-test-data/tf-scriptmode/mnist"
    training_input = sagemaker_session.upload_data(
        path=os.path.join(data_path, "data"), key_prefix="{}/training".format(s3_prefix)
    )

    job_name = unique_name_from_base("test-tf-transform")
    tf.fit(inputs=training_input, job_name=job_name)

    transform_input = sagemaker_session.upload_data(
        path=os.path.join(data_path, "transform"), key_prefix="{}/transform".format(s3_prefix)
    )

    with bucket_with_encryption(sagemaker_session, "SageMakerRole") as (bucket_with_kms, kms_key):
        output_path = "{}/{}/output".format(bucket_with_kms, job_name)

        transformer = tf.transformer(
            instance_count=1,
            instance_type=cpu_instance_type,
            output_path=output_path,
            output_kms_key=kms_key,
            volume_kms_key=kms_key,
            enable_network_isolation=True,
        )

        with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
        ):
            transformer.transform(
                transform_input, job_name=job_name, content_type="text/csv", wait=True
            )

            model_desc = sagemaker_session.sagemaker_client.describe_model(
                ModelName=transformer.model_name
            )
            assert model_desc["EnableNetworkIsolation"]

        job_desc = sagemaker_session.describe_transform_job(job_name=job_name)
        assert job_desc["TransformOutput"]["S3OutputPath"] == output_path
        assert job_desc["TransformOutput"]["KmsKeyId"] == kms_key
        assert job_desc["TransformResources"]["VolumeKmsKeyId"] == kms_key

        s3.S3Downloader.download(
            s3_uri=output_path,
            local_path=os.path.join(tmpdir, "tf-batch-output"),
            session=sagemaker_session,
        )

        with open(os.path.join(tmpdir, "tf-batch-output", "data.csv.out")) as f:
            result = json.load(f)
            assert len(result["predictions"][0]["probabilities"]) == 10
            assert result["predictions"][0]["classes"] == 1
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}
            ),
            ProfilerRule.sagemaker(rule_configs.ProfilerReport(), name="CustomProfilerReportRule"),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for index, rule in enumerate(mx.debugger_rules):
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleConfigurationName"]
                == rule.name
            )
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleEvaluatorImage"]
                == rule.image_uri
            )

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

        mx.update_profiler(disable_framework_metrics=True)

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_mxnet_with_enable_framework_metrics_then_update_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        profiler_config = ProfilerConfig(
            framework_profile_params=FrameworkProfile(start_step=1, num_steps=5)
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert (
            job_description["ProfilerConfig"]["ProfilingParameters"]
            == profiler_config._to_request_dict()["ProfilingParameters"]
        )
        assert job_description.get("ProfilingStatus") == "Enabled"

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

        updated_framework_profile = FrameworkProfile(
            detailed_profiling_config=DetailedProfilingConfig(profile_default_steps=True)
        )
        mx.update_profiler(framework_profile_params=updated_framework_profile)

        job_description = mx.latest_training_job.describe()
        assert (
            job_description["ProfilerConfig"]["ProfilingParameters"]
            == updated_framework_profile.profiling_parameters
        )

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert re.match(r"ProfilerReport-\d*", profiler_rule_configuration["RuleConfigurationName"])
        assert profiler_rule_configuration["RuleEvaluatorImage"] == get_rule_container_image_uri(
            mx.sagemaker_session.boto_region_name
        )
        assert profiler_rule_configuration["RuleParameters"] == {"rule_to_invoke": "ProfilerReport"}
def pipeline_name():
    return utils.unique_name_from_base("my-pipeline-clarify")
def test_model_registration_with_drift_check_baselines(
    sagemaker_session,
    role,
    pipeline_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    # upload model data to s3
    model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz")
    model_base_uri = "s3://{}/{}/input/model/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("model"),
    )
    model_uri = S3Uploader.upload(
        model_local_path, model_base_uri, sagemaker_session=sagemaker_session
    )
    model_uri_param = ParameterString(name="model_uri", default_value=model_uri)

    # upload metrics to s3
    metrics_data = (
        '{"regression_metrics": {"mse": {"value": 4.925353410353891, '
        '"standard_deviation": 2.219186917819692}}}'
    )
    metrics_base_uri = "s3://{}/{}/input/metrics/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("metrics"),
    )
    metrics_uri = S3Uploader.upload_string_as_file_body(
        body=metrics_data,
        desired_s3_uri=metrics_base_uri,
        sagemaker_session=sagemaker_session,
    )
    metrics_uri_param = ParameterString(name="metrics_uri", default_value=metrics_uri)

    model_metrics = ModelMetrics(
        bias=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    drift_check_baselines = DriftCheckBaselines(
        model_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    customer_metadata_properties = {"key1": "value1"}

    estimator = XGBoost(
        entry_point="training.py",
        source_dir=os.path.join(DATA_DIR, "sip"),
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )
    step_register = RegisterModel(
        name="MyRegisterModelStep",
        estimator=estimator,
        model_data=model_uri_param,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="testModelPackageGroup",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        customer_metadata_properties=customer_metadata_properties,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            model_uri_param,
            metrics_uri_param,
            instance_type,
            instance_count,
        ],
        steps=[step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(
                parameters={"model_uri": model_uri, "metrics_uri": metrics_uri}
            )
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if failure_reason != "":
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}. Retrying.."
                )
                continue
            assert execution_steps[0]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["StepName"] == "MyRegisterModelStep"

            response = sagemaker_session.sagemaker_client.describe_model_package(
                ModelPackageName=execution_steps[0]["Metadata"]["RegisterModel"]["Arn"]
            )

            assert (
                response["ModelMetrics"]["Explainability"]["Report"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["Bias"]["PreTrainingConstraints"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["Explainability"]["Constraints"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["ModelQuality"]["Statistics"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["ModelDataQuality"]["Statistics"]["ContentType"]
                == "application/json"
            )
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass