def test_sagemaker_pyspark_sse_s3(tag, role, image_uri, sagemaker_session, region, sagemaker_client):
    """Test that Spark container can read and write S3 data encrypted with SSE-S3 (default AES256 encryption)"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    input_data_key = f"spark/input/sales/{timestamp}/data.jsonl"
    input_data_uri = f"s3://{bucket}/{input_data_key}"
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"

    s3_client = sagemaker_session.boto_session.client("s3", region_name=region)
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        s3_client.put_object(
            Body=body, Bucket=bucket, Key=input_data_key, ServerSideEncryption="AES256"
        )

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration={
            "Classification": "core-site",
            "Properties": {"fs.s3a.server-side-encryption-algorithm": "AES256"},
        },
    )

    processing_job = spark.latest_job
    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # Poll every 15 seconds; time out after 15 minutes.
        WaiterConfig={"Delay": 15, "MaxAttempts": 60},
    )

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0

def get_last_object_by_name(s3_location):
    """Return the lexicographically last object URI under the given S3 location."""
    from sagemaker.s3 import S3Downloader as s3down

    object_list = s3down.list(s3_location)
    object_list.sort()
    return object_list.pop()

def get_object_path_by_filename(s3_location, filename):
    """Return the URI under the given S3 location whose basename matches filename, or None."""
    import os

    from sagemaker.s3 import S3Downloader as s3down

    object_list = s3down.list(s3_location)
    for url in object_list:
        if os.path.basename(url) == filename:
            return url
    return None

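# A minimal usage sketch for the two helpers above. The bucket, prefix, and file
# name are hypothetical placeholders, not paths from this repository.
def _example_helper_usage():
    output_prefix = "s3://my-example-bucket/spark/output/sales"  # hypothetical prefix
    # URI of the lexicographically last object under the prefix
    last_uri = get_last_object_by_name(output_prefix)
    # URI of a specific file under the prefix (hypothetical filename), or None
    data_uri = get_object_path_by_filename(output_prefix, "data.jsonl")
    return last_uri, data_uri
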
def test_sagemaker_scala_jar_multinode(tag, role, image_uri, configuration, sagemaker_session, sagemaker_client):
    """Test SparkJarProcessor using a Scala application jar with external runtime dependency jars staged by the SDK"""
    spark = SparkJarProcessor(
        base_job_name="sm-spark-scala",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, datetime.now().isoformat())

    scala_project_dir = "test/resources/code/scala/hello-scala-spark"
    spark.run(
        submit_app="{}/target/scala-2.11/hello-scala-spark_2.11-1.0.jar".format(scala_project_dir),
        submit_class="com.amazonaws.sagemaker.spark.test.HelloScalaSparkApp",
        submit_jars=[
            "{}/lib_managed/jars/org.json4s/json4s-native_2.11/json4s-native_2.11-3.6.9.jar".format(
                scala_project_dir
            )
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    processing_job = spark.latest_job
    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # Poll every 15 seconds; time out after 15 minutes.
        WaiterConfig={"Delay": 15, "MaxAttempts": 60},
    )

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0

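# Several tests here receive a `configuration` fixture that is defined elsewhere
# (e.g. in conftest.py) and is not shown in this file. A plausible value, assuming
# the EMR-style classification format that the Spark processors accept (the same
# shape used inline in test_sagemaker_pyspark_sse_s3 above); this is an assumption,
# not the original fixture.
_example_configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"},
    },
]
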
def test_sagemaker_spark_processor_default_tag(spark_version, role, sagemaker_session, sagemaker_client):
    """Test that the Spark processor works with the default image tag"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=spark_version,
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket, spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=True,
    )

    processing_job = spark.latest_job
    waiter = sagemaker_client.get_waiter("processing_job_completed_or_stopped")
    waiter.wait(
        ProcessingJobName=processing_job.job_name,
        # Poll every 15 seconds; time out after 15 minutes.
        WaiterConfig={"Delay": 15, "MaxAttempts": 60},
    )

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0

def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )
    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)
    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )

    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()

    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()

def test_sagemaker_pyspark_multinode(tag, role, image_uri, configuration, sagemaker_session, region, sagemaker_client):
    """Test that the basic multinode case works on 32KB of data"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket, spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark.latest_job

    s3_client = boto3.client("s3", region_name=region)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(sagemaker_client, processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket, Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # For some reason the first object returned by list_objects always has size 0;
            # this loop skips it and records the size of the latest non-empty log file.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    print("\n##### Latest file size is " + str(event_log_file["Size"]))
                    latest_file_size = event_log_file["Size"]

        # Record the new file size if it increased.
        if latest_file_size and latest_file_size > file_size:
            print("\n##### S3 file updated.")
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # Verify that Spark event logs were periodically written to S3.
    print("\n##### file_size {} updated_times_count {}".format(file_size, updated_times_count))
    assert file_size != 0

    # Commented out because this assertion is flaky.
    # assert updated_times_count > 1

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0

def test_sagemaker_pyspark_multinode(spark_py_processor, sagemaker_session, configuration):
    """Test that the basic multinode case works on 32KB of data"""
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    spark_event_logs_key_prefix = f"spark/spark-events/{timestamp}"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    spark_py_processor.run(
        submit_app=os.path.join(
            SPARK_PATH, "code", "python", "hello_py_spark", "hello_py_spark_app.py"
        ),
        submit_py_files=[
            os.path.join(SPARK_PATH, "code", "python", "hello_py_spark", "hello_py_spark_udfs.py")
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark_py_processor.latest_job

    s3_client = boto3.client(
        "s3", region_name=spark_py_processor.sagemaker_session.boto_region_name
    )

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(
        sagemaker_session.sagemaker_client, processing_job.job_name
    ):
        response = s3_client.list_objects(Bucket=bucket, Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # For some reason the first object returned by list_objects always has size 0;
            # this loop skips it and records the size of the latest non-empty log file.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    latest_file_size = event_log_file["Size"]

        # Record the new file size if it increased.
        if latest_file_size and latest_file_size > file_size:
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # Verify that Spark event logs were periodically written to S3.
    assert file_size != 0

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0

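# The two multinode tests above poll processing_job_not_fail_or_complete(), which is
# defined elsewhere in the test suite. A minimal sketch of such a helper, assuming it
# returns True once the processing job reaches a terminal state (this body is an
# assumption, not necessarily the original implementation):
def processing_job_not_fail_or_complete(sagemaker_client, job_name):
    response = sagemaker_client.describe_processing_job(ProcessingJobName=job_name)
    status = response["ProcessingJobStatus"]
    # Terminal states for a SageMaker processing job.
    return status in ("Completed", "Failed", "Stopped")
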
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same prefix used in testendpoint.py

    # Load config from the environment and set required defaults
    # AWS specific
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(
        train_data['train']['test'], 'test.csv', sagemaker_session=sm_session)
    print(f"Loading {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # List the capture files. This is just an example: the full list is not used
    # right now, but it could be.
    capture_files = sorted(
        S3Downloader.list(
            "{}/{}".format(
                deploy_data['monitor']['s3_capture_upload_path'],
                deploy_data['endpoint']['name']),
            sagemaker_session=sm_session))
    # keep only the files with the requested prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(line) for line in content.split("\n")[:-1]]
        capture_records.extend(records)
    print(f"No. of records captured: {len(capture_records)}")

    captured_predictions = {}
    for obj in capture_records:
        # Extract the inference ID. The current version of the script starts at 1
        # when id=0; remove the prefix to get the numeric id.
        inference_id = obj["eventMetadata"]["inferenceId"]
        req_id = int(inference_id[len(inference_id_prefix):])
        # Extract the result returned by the model. The content type is fixed to CSV
        # here because obj["captureData"]["endpointOutput"]["observedContentType"]
        # sometimes includes the encoding (e.g. "text/csv; utf-8"), which makes
        # encoders.decode() raise an error.
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'],
        capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(
        data_to_upload, target_s3_uri, sagemaker_session=sm_session)
    print("Done!")

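# ground_truth_with_id() is defined elsewhere in this project. A minimal sketch of
# what such a record builder might look like, assuming the SageMaker Model Monitor
# ground-truth JSONL format (groundTruthData / eventMetadata / eventVersion). The
# name, body, and fallback behaviour below are assumptions for illustration, not the
# original implementation.
def _ground_truth_with_id_sketch(record_id, label, y_val, prefix):
    return {
        "groundTruthData": {
            # Use the real label from the test set when available; otherwise fall back
            # to the captured prediction (hypothetical behaviour).
            "data": str(y_val[record_id]) if record_id < len(y_val) else str(label),
            "encoding": "CSV",
        },
        "eventMetadata": {
            # Must match the inferenceId sent with the original request.
            "eventId": f"{prefix}{record_id}",
        },
        "eventVersion": "0",
    }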