def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )
    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)
    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )

    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()
    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
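# --- Hedged sketch: the fixtures and the TMP_BASE_PATH constant used by the test
# --- above are not shown here; this is one plausible conftest-style definition,
# --- labeled as an assumption, not the repository's actual wiring.
import tempfile

import pytest
import sagemaker

TMP_BASE_PATH = tempfile.mkdtemp()  # assumption: any writable local directory works


@pytest.fixture(scope="session")
def sagemaker_session():
    # Build a session from the default AWS credential chain and region.
    return sagemaker.Session()


@pytest.fixture(scope="session")
def s3_files_kms_key():
    # Hypothetical: return a KMS key id/ARN used to encrypt the uploaded files,
    # or None to fall back to the bucket's default encryption.
    return None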
def from_s3_uri(cls, constraints_file_s3_uri, kms_key=None, sagemaker_session=None):
    """Generates a Constraints object from an s3 uri.

    Args:
        constraints_file_s3_uri (str): The uri of the constraints JSON file.
        kms_key (str): The kms key to be used to decrypt the file in S3.
        sagemaker_session (sagemaker.session.Session): A SageMaker Session object, used for
            SageMaker interactions (default: None). If not specified, one is created using
            the default AWS configuration chain.

    Returns:
        sagemaker.model_monitor.Constraints: The instance of Constraints generated from
            the s3 uri.

    """
    try:
        body_dict = json.loads(
            S3Downloader.read_file(
                s3_uri=constraints_file_s3_uri, sagemaker_session=sagemaker_session
            )
        )
    except ClientError as error:
        print(
            "\nCould not retrieve constraints file at location '{}'. "
            "To manually retrieve Constraints object from a given uri, "
            "use 'my_model_monitor.constraints(my_s3_uri)' or "
            "'Constraints.from_s3_uri(my_s3_uri)'".format(constraints_file_s3_uri)
        )
        raise error

    return cls(
        body_dict=body_dict,
        constraints_file_s3_uri=constraints_file_s3_uri,
        kms_key=kms_key,
    )
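# --- Illustrative usage of from_s3_uri; the bucket, key, and KMS key below are
# --- placeholders, not values from this repository.
from sagemaker.model_monitor import Constraints

constraints = Constraints.from_s3_uri(
    constraints_file_s3_uri="s3://my-bucket/monitoring/constraints.json",
    kms_key=None,  # or a KMS key ARN if the object is encrypted with a CMK
)
print(constraints.body_dict)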
def test_one_step_data_bias_pipeline_constraint_violation(
    sagemaker_session,
    role,
    pipeline_name,
    check_job_config,
    data_bias_check_config,
    supplied_baseline_constraints_uri_param,
):
    data_bias_supplied_baseline_constraints = Constraints.from_file_path(
        constraints_file_path=os.path.join(
            DATA_DIR, "pipeline/clarify_check_step/data_bias/bad_cases/analysis.json"
        ),
        sagemaker_session=sagemaker_session,
    ).file_s3_uri
    data_bias_check_step = ClarifyCheckStep(
        name="DataBiasCheckStep",
        clarify_check_config=data_bias_check_config,
        check_job_config=check_job_config,
        skip_check=False,
        register_new_baseline=False,
        supplied_baseline_constraints=supplied_baseline_constraints_uri_param,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[data_bias_check_step],
        parameters=[supplied_baseline_constraints_uri_param],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        monitoring_analysis_cfg_json = S3Downloader.read_file(
            data_bias_check_config.monitoring_analysis_config_uri,
            sagemaker_session,
        )
        monitoring_analysis_cfg = json.loads(monitoring_analysis_cfg_json)

        assert monitoring_analysis_cfg is not None and len(monitoring_analysis_cfg) > 0

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(
                parameters={
                    "SuppliedBaselineConstraintsUri": data_bias_supplied_baseline_constraints
                },
            )
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if _CHECK_FAIL_ERROR_MSG not in failure_reason:
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}. Retrying.."
                )
                continue
            assert execution_steps[0]["StepName"] == "DataBiasCheckStep"
            assert execution_steps[0]["StepStatus"] == "Failed"
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
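# --- Hedged sketch of one way the supplied_baseline_constraints_uri_param fixture
# --- used above could be defined; the default value is a placeholder and this is
# --- not necessarily the repository's actual fixture.
import pytest
from sagemaker.workflow.parameters import ParameterString


@pytest.fixture
def supplied_baseline_constraints_uri_param():
    # The parameter name must match the key passed to pipeline.start(parameters=...).
    return ParameterString(name="SuppliedBaselineConstraintsUri", default_value="")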
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same prefix used in testendpoint.py

    # Load config from the environment and set required defaults
    # AWS-specific settings
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(
        train_data['train']['test'], 'test.csv', sagemaker_session=sm_session)
    print(f"Loading {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # List capture files. This is just an example; the full list is not used
    # right now but could be.
    capture_files = sorted(
        S3Downloader.list(
            "{}/{}".format(
                deploy_data['monitor']['s3_capture_upload_path'],
                deploy_data['endpoint']['name']),
            sagemaker_session=sm_session))
    # keep just the files with the requested prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(l) for l in content.split("\n")[:-1]]
        capture_records.extend(records)
    print(f"No. of records captured: {len(capture_records)}")

    captured_predictions = {}
    for obj in capture_records:
        # Extract the inference ID
        inference_id = obj["eventMetadata"]["inferenceId"]
        # the current version of the script starts at 1 when id=0;
        # remove the prefix and get the numeric id
        req_id = int(inference_id[len(inference_id_prefix):])
        # Extract the result produced by the model. The content type is fixed
        # here because obj["captureData"]["endpointOutput"]["observedContentType"]
        # sometimes includes the encoding (e.g. "text/csv; utf-8"), which makes
        # encoders.decode() raise an error.
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'],
        capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(
        data_to_upload, target_s3_uri, sagemaker_session=sm_session)
    print("Done!")
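# --- Hypothetical entry point for the script above; the JSON file names are
# --- assumptions about how the deploy/train metadata is persisted by earlier steps.
if __name__ == "__main__":
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--capture-prefix", required=True,
        help="Prefix of the data-capture records to process (e.g. 'sts_')")
    args = parser.parse_args()

    with open("deploymodel_out.json") as f:   # assumed output of the deploy step
        deploy_data = json.load(f)
    with open("trainmodel_out.json") as f:    # assumed output of the training step
        train_data = json.load(f)

    main(deploy_data, train_data, args.capture_prefix)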
def example_run_training_pipeline(workflow_arn, region):
    """
    Execute the workflow, which consists of five steps:

    1. Define job names for pre-processing, training, and evaluation
    2. Upload source code for pre-processing, training, and evaluation
    3. Define URLs for the input, output, and intermediary data
    4. Execute the workflow with populated parameters, and monitor the progress
    5. Inspect the evaluation result when the execution is completed
    """
    training_pipeline = get_existing_training_pipeline(workflow_arn)

    # Step 1 - Generate unique names for the Pre-Processing, Training, and
    # Evaluation jobs
    unique_id = uuid.uuid1().hex
    # pipeline_job_name = f"pipeline-job-{unique_id}"
    training_job_name = f"scikit-learn-training-{unique_id}"
    preprocessing_job_name = f"scikit-learn-sm-preprocessing-{unique_id}"
    evaluation_job_name = f"scikit-learn-sm-evaluation-{unique_id}"

    # Step 2 - Upload source code (pre-processing, evaluation, and train) to
    # the SageMaker default S3 bucket
    PREPROCESSING_SCRIPT_LOCATION = "../../src/mlmax/preprocessing.py"
    EVALUATION_SCRIPT_LOCATION = "../../src/mlmax/evaluation.py"
    TRAINING_SCRIPT_LOCATION = "../../src/mlmax/train.py"

    sagemaker_session = sagemaker.Session()
    input_preprocessing_code = sagemaker_session.upload_data(
        PREPROCESSING_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{preprocessing_job_name}/source",
    )
    input_evaluation_code = sagemaker_session.upload_data(
        EVALUATION_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{evaluation_job_name}/source",
    )
    s3_bucket_base_uri = f"s3://{sagemaker_session.default_bucket()}"
    sm_submit_dir_url = (
        f"{s3_bucket_base_uri}/{training_job_name}/source/sourcedir.tar.gz"
    )
    tar = tarfile.open("/tmp/sourcedir.tar.gz", "w:gz")
    # TODO need to add directory if source_dir is specified.
    tar.add(TRAINING_SCRIPT_LOCATION, arcname="train.py")
    tar.close()
    sagemaker_session.upload_data(
        "/tmp/sourcedir.tar.gz",
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{training_job_name}/source",
    )

    # Step 3 - Define data URLs; preprocessed data URLs can be made
    # specific to this training job
    input_data = (
        f"s3://sagemaker-sample-data-{region}/processing/census/census-income.csv"
    )
    output_data = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"
    preprocessed_training_data = f"{output_data}/train_data"
    preprocessed_test_data = f"{output_data}/test_data"
    preprocessed_model_url = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"

    # Step 4 - Execute the workflow
    print(f"Training Job Name is {training_job_name}")
    execution = training_pipeline.execute(
        inputs={
            "InputDataURL": input_data,
            # Each pre-processing job (SageMaker processing job) requires a unique name
            "PreprocessingJobName": preprocessing_job_name,
            "PreprocessingCodeURL": input_preprocessing_code,
            # Each SageMaker training job requires a unique name
            "TrainingJobName": training_job_name,
            "SMSubmitDirURL": sm_submit_dir_url,
            "SMRegion": region,
            # Each SageMaker processing job requires a unique name
            "EvaluationProcessingJobName": evaluation_job_name,
            "EvaluationCodeURL": input_evaluation_code,
            "EvaluationResultURL": (
                f"{s3_bucket_base_uri}/{training_job_name}/evaluation"
            ),
            "PreprocessedTrainDataURL": preprocessed_training_data,
            "PreprocessedTestDataURL": preprocessed_test_data,
            "PreprocessedModelURL": preprocessed_model_url,
            "SMOutputDataURL": f"{s3_bucket_base_uri}/",
            "SMDebugOutputURL": f"{s3_bucket_base_uri}/",
        }
    )
    execution.get_output(wait=True)
    execution.render_progress()

    # Step 5 - Inspect the output of the workflow execution
    workflow_execution_output_json = execution.get_output(wait=True)

    import json

    from sagemaker.s3 import S3Downloader

    evaluation_output_config = workflow_execution_output_json["ProcessingOutputConfig"]
    for output in evaluation_output_config["Outputs"]:
        if output["OutputName"] == "evaluation":
            evaluation_s3_uri = "{}/{}".format(
                output["S3Output"]["S3Uri"], "evaluation.json"
            )
            break

    evaluation_output = S3Downloader.read_file(evaluation_s3_uri)
    evaluation_output_dict = json.loads(evaluation_output)
    print(json.dumps(evaluation_output_dict, sort_keys=True, indent=4))
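# --- Illustrative invocation of the function above; the argument names and the
# --- default region are assumptions, and the workflow ARN is a placeholder.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--workflow-arn", required=True,
        help="ARN of an existing Step Functions training pipeline")
    parser.add_argument("--region", default="us-east-1")
    args = parser.parse_args()

    example_run_training_pipeline(args.workflow_arn, args.region)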