def get_node_to_idx_mapping(training_job_name: str = None, dataprocessing_job_name: str = None,
                            model_artifacts_location: str = './model-artifacts', vertex_label: str = None):
    assert training_job_name is not None or dataprocessing_job_name is not None, \
        "You must provide either a modeltraining job id or a dataprocessing job id to obtain node to index mappings"

    job_name = training_job_name if training_job_name is not None else dataprocessing_job_name
    job_type = "modeltraining" if training_job_name == job_name else "dataprocessing"
    filename = "mapping.info" if training_job_name == job_name else "info.pkl"
    mapping_key = "node2id" if training_job_name == job_name else "node_id_map"

    # get mappings
    model_artifacts_location = os.path.join(model_artifacts_location, job_name)
    if not os.path.exists(os.path.join(model_artifacts_location, filename)):
        job_s3_output = get_neptune_ml_job_output_location(job_name, job_type)
        print(job_s3_output)
        if not job_s3_output:
            return
        S3Downloader.download(os.path.join(job_s3_output, filename), model_artifacts_location)

    with open(os.path.join(model_artifacts_location, filename), "rb") as f:
        mapping = pickle.load(f)[mapping_key]
        if vertex_label is not None:
            if vertex_label in mapping:
                mapping = mapping[vertex_label]
            else:
                print("Mapping for vertex label: {} not found.".format(vertex_label))
                print("valid vertex labels which have vertices mapped to embeddings: {} ".format(list(mapping.keys())))
                print("Returning mapping for all valid vertex labels")

    return mapping
def copy_sample_flow_to_local(workspace, local_dir):
    config = BPConfig.get_config(workspace, local_dir)
    fname = f"{local_dir}/{FLOW_NAME}"
    flow_uri = f"s3://{workspace}/{config.ws_prefix()}/meta/{FLOW_NAME}"
    S3Downloader.download(flow_uri, local_dir)

    # Change the flow definition so that it references the dataset copied over by the user
    def _update_sample_flow_def(fname, s3_uri):
        with open(fname, 'r+') as f:
            flow_def = json.loads(f.read())
            nodes = flow_def["nodes"]
            for n in nodes:
                if n["type"] == "SOURCE":
                    data_def = n["parameters"]["dataset_definition"]
                    dstype = data_def["datasetSourceType"]
                    if dstype == "S3":
                        data_def["s3ExecutionContext"]["s3Uri"] = s3_uri
            f.seek(0)
            f.write(json.dumps(flow_def))
            f.truncate()

    _update_sample_flow_def(fname, config.sample_data_uri())
    return fname
def download_folder(Filename, Bucket, Key, session):
    if isinstance(session, boto3.Session):
        session = sagemaker.Session(boto_session=session)
    ensure_path(Filename)
    S3Downloader.download(s3_uri=f"s3://{Bucket}/{Key}", local_path=Filename, sagemaker_session=session)
def download_model(model_data, local_path=".", unzip=False, sagemaker_session=None, model_dir="model"):
    """Download the model file from a SageMaker training job to a local directory and optionally unzip it there."""
    S3Downloader.download(
        s3_uri=model_data,
        local_path=os.path.join(local_path, model_dir),
        sagemaker_session=sagemaker_session,
    )
    if unzip:
        with tarfile.open(os.path.join(local_path, model_dir, "model.tar.gz"), "r:gz") as model_zip:
            model_zip.extractall(path=os.path.join(local_path, model_dir))
        os.remove(os.path.join(local_path, model_dir, "model.tar.gz"))
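# Usage sketch (an illustration, not from the original source): how download_model above
# might be called. The S3 URI is a hypothetical placeholder; running this requires valid
# AWS credentials and the imports used by download_model (os, tarfile, S3Downloader).
model_artifacts = "s3://my-example-bucket/my-training-job/output/model.tar.gz"  # hypothetical URI
download_model(model_data=model_artifacts, local_path="./artifacts", unzip=True)
# After the call, the extracted model files live under ./artifacts/model/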
def _download_bp_config(cls, config_uri=None):
    if not config_uri:
        config_uri = cls.default_config_uri(cls.workspace)
    S3Downloader.download(config_uri, cls.local_dir)
    fname = f"{cls.local_dir}/blueprint-config.json"
    return fname
def load_dataset(s3_uri: str, filename: str, sagemaker_session=None) -> pd.DataFrame:
    """Load a dataset from an S3 uri."""
    S3Downloader.download(s3_uri, tempfile.gettempdir(), sagemaker_session=sagemaker_session)
    dataset_filename = os.path.join(tempfile.gettempdir(), filename)
    return pd.read_csv(dataset_filename, header=None)
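# Usage sketch (an illustration, not from the original source): load_dataset above downloads
# everything under the given prefix into the temp dir and reads the named CSV without a header
# row. The prefix and file name are hypothetical placeholders.
df = load_dataset("s3://my-example-bucket/datasets/test/", "test.csv")
print(df.shape)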
def test_download(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri, local_path="/path/for/download/", session=sagemaker_session)
    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args=None,
    )
def test_download_with_kms_key(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri, local_path="/path/for/download/", kms_key=KMS_KEY, session=sagemaker_session)
    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args={"SSECustomerKey": KMS_KEY},
    )
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )
    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )
    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()
    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
def _download_clarify_xai_summary(self):
    try:
        summary_uri = f"s3://{self.bucket}/{self.xai_prefix}/analysis.json"
        S3Downloader.download(summary_uri, os.getcwd())
        with open('analysis.json', 'r') as f:
            summary = json.loads(f.read())
        return summary
    except Exception as e:
        print(f"{e}: Failed to download {summary_uri}")
def get_embeddings(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)

    # download embeddings and mapping info
    S3Downloader.download(os.path.join(training_job_s3_output, "embeddings/"),
                          os.path.join(download_location, "embeddings/"))

    entity_emb = np.load(os.path.join(download_location, "embeddings", "entity.npy"))

    return entity_emb
def from_s3_uri(cls, constraints_file_s3_uri, kms_key=None, sagemaker_session=None):
    """Generates a Constraints object from an s3 uri.

    Args:
        constraints_file_s3_uri (str): The uri of the constraints JSON file.
        kms_key (str): The kms key to be used to decrypt the file in S3.
        sagemaker_session (sagemaker.session.Session): A SageMaker Session object, used for
            SageMaker interactions (default: None). If not specified, one is created using
            the default AWS configuration chain.

    Returns:
        sagemaker.model_monitor.Constraints: The instance of Constraints generated from
            the s3 uri.

    """
    try:
        body_dict = json.loads(
            S3Downloader.read_file(s3_uri=constraints_file_s3_uri, session=sagemaker_session))
    except ClientError as error:
        print("\nCould not retrieve constraints file at location '{}'. "
              "To manually retrieve Constraints object from a given uri, "
              "use 'my_model_monitor.constraints(my_s3_uri)' or "
              "'Constraints.from_s3_uri(my_s3_uri)'".format(constraints_file_s3_uri))
        raise error

    return cls(body_dict=body_dict, constraints_file_s3_uri=constraints_file_s3_uri, kms_key=kms_key)
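# Usage sketch (an illustration, not from the original source): calling the classmethod above
# through the public Constraints API. The S3 URI is a hypothetical placeholder and the call
# requires valid AWS credentials.
from sagemaker.model_monitor import Constraints

constraints = Constraints.from_s3_uri("s3://my-example-bucket/monitoring/constraints.json")
print(constraints.body_dict)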
def get_performance_metrics(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)

    # download evaluation metrics info
    S3Downloader.download(os.path.join(training_job_s3_output, "eval_metrics_info.json"),
                          download_location)

    with open(os.path.join(download_location, "eval_metrics_info.json")) as f:
        metrics = json.load(f)

    return metrics
def get_predictions(training_job_name: str, download_location: str = './model-artifacts', class_preds: bool = False):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)

    # download predictions
    S3Downloader.download(os.path.join(training_job_s3_output, "predictions/"),
                          os.path.join(download_location, "predictions/"))

    preds = np.load(os.path.join(download_location, "predictions", "result.npz"))['infer_scores']

    if class_preds:
        return preds.argmax(axis=1)
    return preds
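# Usage sketch (an illustration, not part of the original helpers): ties together the Neptune ML
# helpers above for a hypothetical model training job id. Assumes valid AWS credentials and that
# the job's output artifacts exist in S3.
training_job = "my-neptune-ml-training-job"  # hypothetical job id
embeddings = get_embeddings(training_job)                         # entity embeddings (numpy array)
metrics = get_performance_metrics(training_job)                   # dict parsed from eval_metrics_info.json
class_predictions = get_predictions(training_job, class_preds=True)
node_mapping = get_node_to_idx_mapping(training_job_name=training_job)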
def test_sagemaker_pyspark_sse_s3(tag, role, image_uri, sagemaker_session, region, sagemaker_client):
    """Test that Spark container can read and write S3 data encrypted with SSE-S3 (default AES256 encryption)"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    input_data_key = f"spark/input/sales/{timestamp}/data.jsonl"
    input_data_uri = f"s3://{bucket}/{input_data_key}"
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    s3_client = sagemaker_session.boto_session.client("s3", region_name=region)
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        s3_client.put_object(Body=body, Bucket=bucket, Key=input_data_key, ServerSideEncryption="AES256")

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration={
            "Classification": "core-site",
            "Properties": {"fs.s3a.server-side-encryption-algorithm": "AES256"},
        },
    )
    processing_job = spark.latest_job

    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={"Delay": 15, "MaxAttempts": 60},
    )

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def get_last_object_by_name(s3_location):
    import os
    from sagemaker.s3 import S3Downloader as s3down

    object_list = s3down.list(s3_location)
    object_list.sort()
    obj = object_list.pop()

    return obj
def get_object_path_by_filename(s3_location, filename):
    import os
    from sagemaker.s3 import S3Downloader as s3down

    object_list = s3down.list(s3_location)
    for url in object_list:
        if os.path.basename(url) == filename:
            return url

    return None
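# Usage sketch (an illustration, not from the original source): both helpers above list a prefix
# with S3Downloader.list; the URIs below are hypothetical placeholders.
latest_uri = get_last_object_by_name("s3://my-example-bucket/exports/")  # lexicographically last key
report_uri = get_object_path_by_filename("s3://my-example-bucket/exports/", "report.json")  # exact match or None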
def test_sagemaker_scala_jar_multinode(tag, role, image_uri, configuration, sagemaker_session, sagemaker_client):
    """Test SparkJarProcessor using Scala application jar with external runtime dependency jars staged by SDK"""
    spark = SparkJarProcessor(
        base_job_name="sm-spark-scala",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    bucket = spark.sagemaker_session.default_bucket()
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session)
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, datetime.now().isoformat())

    scala_project_dir = "test/resources/code/scala/hello-scala-spark"
    spark.run(
        submit_app="{}/target/scala-2.11/hello-scala-spark_2.11-1.0.jar".format(scala_project_dir),
        submit_class="com.amazonaws.sagemaker.spark.test.HelloScalaSparkApp",
        submit_jars=[
            "{}/lib_managed/jars/org.json4s/json4s-native_2.11/json4s-native_2.11-3.6.9.jar".format(scala_project_dir)
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )
    processing_job = spark.latest_job

    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={"Delay": 15, "MaxAttempts": 60},
    )

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def test_sagemaker_spark_processor_default_tag(spark_version, role, sagemaker_session, sagemaker_client):
    """Test that spark processor works with default tag"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=spark_version,
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket, spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session)

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=True,
    )
    processing_job = spark.latest_job

    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # poll every 15 seconds. timeout after 15 minutes.
        WaiterConfig={"Delay": 15, "MaxAttempts": 60},
    )

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def _inject_repack_script(self):
    """Injects the _repack_model.py script where it belongs.

    If the source_dir is an S3 path:
        1) downloads the source_dir tar.gz
        2) copies the _repack_model.py script where it belongs
        3) uploads the mutated source_dir

    If the source_dir is a local path:
        1) copies the _repack_model.py script into the source dir
    """
    fname = os.path.join(os.path.dirname(__file__), REPACK_SCRIPT)
    if self._source_dir.lower().startswith("s3://"):
        with tempfile.TemporaryDirectory() as tmp:
            local_path = os.path.join(tmp, "local.tar.gz")

            S3Downloader.download(
                s3_uri=self._source_dir,
                local_path=local_path,
                sagemaker_session=self._estimator.sagemaker_session,
            )

            src_dir = os.path.join(tmp, "src")
            with tarfile.open(name=local_path, mode="r:gz") as tf:
                tf.extractall(path=src_dir)

            shutil.copy2(fname, os.path.join(src_dir, REPACK_SCRIPT))
            with tarfile.open(name=local_path, mode="w:gz") as tf:
                tf.add(src_dir, arcname=".")

            S3Uploader.upload(
                local_path=local_path,
                desired_s3_uri=self._source_dir,
                sagemaker_session=self._estimator.sagemaker_session,
            )
    else:
        shutil.copy2(fname, os.path.join(self._source_dir, REPACK_SCRIPT))
def test_one_step_data_bias_pipeline_constraint_violation(
    sagemaker_session,
    role,
    pipeline_name,
    check_job_config,
    data_bias_check_config,
    supplied_baseline_constraints_uri_param,
):
    data_bias_supplied_baseline_constraints = Constraints.from_file_path(
        constraints_file_path=os.path.join(
            DATA_DIR, "pipeline/clarify_check_step/data_bias/bad_cases/analysis.json"),
        sagemaker_session=sagemaker_session,
    ).file_s3_uri
    data_bias_check_step = ClarifyCheckStep(
        name="DataBiasCheckStep",
        clarify_check_config=data_bias_check_config,
        check_job_config=check_job_config,
        skip_check=False,
        register_new_baseline=False,
        supplied_baseline_constraints=supplied_baseline_constraints_uri_param,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[data_bias_check_step],
        parameters=[supplied_baseline_constraints_uri_param],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        monitoring_analysis_cfg_json = S3Downloader.read_file(
            data_bias_check_config.monitoring_analysis_config_uri,
            sagemaker_session,
        )
        monitoring_analysis_cfg = json.loads(monitoring_analysis_cfg_json)

        assert monitoring_analysis_cfg is not None and len(monitoring_analysis_cfg) > 0

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(
                parameters={"SuppliedBaselineConstraintsUri": data_bias_supplied_baseline_constraints},
            )
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if _CHECK_FAIL_ERROR_MSG not in failure_reason:
                logging.error(f"Pipeline execution failed with error: {failure_reason}. Retrying..")
                continue
            assert execution_steps[0]["StepName"] == "DataBiasCheckStep"
            assert execution_steps[0]["StepStatus"] == "Failed"
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_sagemaker_pyspark_multinode(tag, role, image_uri, configuration, sagemaker_session, region, sagemaker_client):
    """Test that the basic multinode case works on 32KB of data"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket, spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session)

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark.latest_job

    s3_client = boto3.client("s3", region_name=region)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(sagemaker_client, processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket, Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # For some reason the first file size returned by list_objects is always 0;
            # this loop skips it and keeps the size of the latest non-empty event log file.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    print("\n##### Latest file size is " + str(event_log_file["Size"]))
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            print("\n##### S3 file updated.")
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    print("\n##### file_size {} updated_times_count {}".format(file_size, updated_times_count))
    assert file_size != 0
    # Commenting this assert because it's flaky.
    # assert updated_times_count > 1

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def example_run_training_pipeline(workflow_arn, region):
    """
    Execute the workflow, which consists of five steps:

    1. Define job names for pre-processing, training, and evaluation
    2. Upload source code for pre-processing, training, and evaluation
    3. Define URLs for the input, output, and intermediary data
    4. Execute the workflow with populated parameters, and monitor the progress
    5. Inspect the evaluation result when the execution is completed
    """
    training_pipeline = get_existing_training_pipeline(workflow_arn)

    # Step 1 - Generate unique names for the pre-processing, training, and evaluation jobs
    unique_id = uuid.uuid1().hex
    # pipeline_job_name = f"pipeline-job-{unique_id}"
    training_job_name = f"scikit-learn-training-{unique_id}"
    preprocessing_job_name = f"scikit-learn-sm-preprocessing-{unique_id}"
    evaluation_job_name = f"scikit-learn-sm-evaluation-{unique_id}"

    # Step 2 - Upload source code (pre-processing, evaluation, and train) to SageMaker
    PREPROCESSING_SCRIPT_LOCATION = "../../src/mlmax/preprocessing.py"
    EVALUATION_SCRIPT_LOCATION = "../../src/mlmax/evaluation.py"
    TRAINING_SCRIPT_LOCATION = "../../src/mlmax/train.py"

    sagemaker_session = sagemaker.Session()
    input_preprocessing_code = sagemaker_session.upload_data(
        PREPROCESSING_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{preprocessing_job_name}/source",
    )
    input_evaluation_code = sagemaker_session.upload_data(
        EVALUATION_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{evaluation_job_name}/source",
    )
    s3_bucket_base_uri = f"s3://{sagemaker_session.default_bucket()}"
    sm_submit_dir_url = (
        f"{s3_bucket_base_uri}/{training_job_name}/source/sourcedir.tar.gz")
    tar = tarfile.open("/tmp/sourcedir.tar.gz", "w:gz")
    # TODO need to add directory if source_dir is specified.
    tar.add(TRAINING_SCRIPT_LOCATION, arcname="train.py")
    tar.close()
    sagemaker_session.upload_data(
        "/tmp/sourcedir.tar.gz",
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{training_job_name}/source",
    )

    # Step 3 - Define data URLs; preprocessed data URLs can be made
    # specific to this training job
    input_data = (
        f"s3://sagemaker-sample-data-{region}/processing/census/census-income.csv"
    )
    output_data = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"
    preprocessed_training_data = f"{output_data}/train_data"
    preprocessed_test_data = f"{output_data}/test_data"
    preprocessed_model_url = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"

    # Step 4 - Execute the workflow
    print(f"Training Job Name is {training_job_name}")
    execution = training_pipeline.execute(
        inputs={
            "InputDataURL": input_data,
            # Each pre-processing job (SageMaker processing job) requires a unique name
            "PreprocessingJobName": preprocessing_job_name,
            "PreprocessingCodeURL": input_preprocessing_code,
            # Each SageMaker training job requires a unique name
            "TrainingJobName": training_job_name,
            "SMSubmitDirURL": sm_submit_dir_url,
            "SMRegion": region,
            # Each SageMaker processing job requires a unique name
            "EvaluationProcessingJobName": evaluation_job_name,
            "EvaluationCodeURL": input_evaluation_code,
            "EvaluationResultURL": (
                f"{s3_bucket_base_uri}/{training_job_name}/evaluation"),
            "PreprocessedTrainDataURL": preprocessed_training_data,
            "PreprocessedTestDataURL": preprocessed_test_data,
            "PreprocessedModelURL": preprocessed_model_url,
            "SMOutputDataURL": f"{s3_bucket_base_uri}/",
            "SMDebugOutputURL": f"{s3_bucket_base_uri}/",
        })
    execution.get_output(wait=True)
    execution.render_progress()

    # Step 5 - Inspect the output of the workflow execution
    workflow_execution_output_json = execution.get_output(wait=True)

    import json
    from sagemaker.s3 import S3Downloader

    evaluation_output_config = workflow_execution_output_json["ProcessingOutputConfig"]
    for output in evaluation_output_config["Outputs"]:
        if output["OutputName"] == "evaluation":
            evaluation_s3_uri = "{}/{}".format(output["S3Output"]["S3Uri"], "evaluation.json")
            break

    evaluation_output = S3Downloader.read_file(evaluation_s3_uri)
    evaluation_output_dict = json.loads(evaluation_output)
    print(json.dumps(evaluation_output_dict, sort_keys=True, indent=4))
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same prefix used in testendpoint.py

    # Load config from environment and set required defaults
    # AWS specific
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(
        train_data['train']['test'], 'test.csv', sagemaker_session=sm_session)
    print(f"Loading {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # list capture files; this is just an example. Not used right
    # now but could be.
    capture_files = sorted(
        S3Downloader.list("{}/{}".format(
            deploy_data['monitor']['s3_capture_upload_path'],
            deploy_data['endpoint']['name']),
            sagemaker_session=sm_session))
    # keep just the files with the prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(l) for l in content.split("\n")[:-1]]
        capture_records.extend(records)
    print(f"No. of records {len(capture_records)} captured")

    captured_predictions = {}
    for obj in capture_records:
        # Extract the inference ID
        inference_id = obj["eventMetadata"]["inferenceId"]
        # the current version of the script starts at 1 when id=0;
        # remove the prefix and get the id
        req_id = int(inference_id[len(inference_id_prefix):])
        # Extract the result given by the model
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            # the content type is fixed here because
            # obj["captureData"]["endpointOutput"]["observedContentType"]
            # sometimes includes the encoding, like: text/csv; utf-8,
            # and encoders.decode() would raise an error.
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'],
        capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(
        data_to_upload, target_s3_uri, sagemaker_session=sm_session)
    print("Done !")
def test_sagemaker_pyspark_multinode(spark_py_processor, sagemaker_session, configuration):
    """Test that the basic multinode case works on 32KB of data"""
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    spark_event_logs_key_prefix = f"spark/spark-events/{timestamp}"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session)

    spark_py_processor.run(
        submit_app=os.path.join(SPARK_PATH, "code", "python", "hello_py_spark", "hello_py_spark_app.py"),
        submit_py_files=[
            os.path.join(SPARK_PATH, "code", "python", "hello_py_spark", "hello_py_spark_udfs.py")
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark_py_processor.latest_job

    s3_client = boto3.client(
        "s3", region_name=spark_py_processor.sagemaker_session.boto_region_name)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(
            sagemaker_session.sagemaker_client, processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket, Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # For some reason the first file size returned by list_objects is always 0;
            # this loop skips it and keeps the size of the latest non-empty event log file.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    assert file_size != 0

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0