def create_workteamjob(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir, download_dir
):
    """Compile and run the create-workteam pipeline.

    Returns a tuple of (workteam_name, workflow_json) for the created workteam.
    """
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Fill in the account/region specific user_pool, client_id and user_groups
    # for the SageMaker Workforce.
    (
        test_params["Arguments"]["user_pool"],
        test_params["Arguments"]["client_id"],
        test_params["Arguments"]["user_groups"],
    ) = sagemaker_utils.get_cognito_member_definitions(sagemaker_client)

    # Random prefix avoids failures when a resource with the same name exists.
    workteam_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["team_name"]
    )
    test_params["Arguments"]["team_name"] = workteam_name

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )
    return workteam_name, workflow_json
def test_terminate_trainingjob(kfp_client, experiment_id, region, sagemaker_client):
    """Start a training run, terminate it, and verify SageMaker stops the job."""
    test_file_dir = "resources/config/simple-mnist-training"
    download_dir = utils.mkdir(
        os.path.join(test_file_dir + "/generated_test_terminate")
    )
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Random suffix keeps job names unique across runs.
    input_job_name = utils.generate_random_string(4) + "-terminate-job"
    test_params["Arguments"]["job_name"] = input_job_name

    # Only wait for the run to reach the "running" state before terminating.
    run_id, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        60,
        "running",
    )

    print(f"Terminating run: {run_id} where Training job_name: {input_job_name}")
    kfp_client_utils.terminate_run(kfp_client, run_id)

    response = sagemaker_utils.describe_training_job(sagemaker_client, input_job_name)
    assert response["TrainingJobStatus"] in ["Stopping", "Stopped"]

    utils.remove_dir(download_dir)
def test_transform_job(
    kfp_client,
    experiment_id,
    s3_client,
    sagemaker_client,
    s3_data_bucket,
    test_file_dir,
):
    """Run a batch-transform pipeline and verify the SageMaker job and its S3 output."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Random prefix for the model/job name avoids clashes with existing resources.
    input_job_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["model_name"]
    )
    test_params["Arguments"]["model_name"] = input_job_name
    test_params["Arguments"]["job_name"] = input_job_name
    print(f"running test with model/job name: {input_job_name}")

    # Unique output location, since the output filename is generated from content_type.
    test_params["Arguments"]["output_location"] = os.path.join(
        test_params["Arguments"]["output_location"], input_job_name
    )

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    output_files = minio_utils.artifact_download_iterator(
        workflow_json, {"sagemaker-batch-transformation": ["output_location"]}, download_dir
    )

    # Verify the job succeeded on SageMaker.
    response = sagemaker_utils.describe_transform_job(sagemaker_client, input_job_name)
    assert response["TransformJobStatus"] == "Completed"
    assert response["TransformJobName"] == input_job_name

    # The output location from the pipeline must match the job's, and the
    # transformed file must exist.
    output_location = utils.read_from_file_in_tar(
        output_files["sagemaker-batch-transformation"]["output_location"]
    )
    print(f"output location: {output_location}")
    assert output_location == response["TransformOutput"]["S3OutputPath"]

    # The URI has the form s3://<bucket_name>/relative/path/to/file; drop the
    # scheme and bucket to get the object key.
    file_key = os.path.join(
        "/".join(output_location.split("/")[3:]), test_params["ExpectedOutputFile"]
    )
    assert s3_utils.check_object_exists(s3_client, s3_data_bucket, file_key)

    utils.remove_dir(download_dir)
def test_create_endpoint(
    kfp_client, experiment_id, boto3_session, sagemaker_client, test_file_dir
):
    """Deploy a model to an endpoint, validate it with a prediction, then tear it down."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Random prefix for model, endpoint config and endpoint name avoids
    # collisions with pre-existing resources.
    input_endpoint_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["model_name"]
    )
    test_params["Arguments"]["model_name"] = input_endpoint_name
    test_params["Arguments"]["endpoint_config_name"] = input_endpoint_name
    test_params["Arguments"]["endpoint_name"] = input_endpoint_name
    print(f"running test with model/endpoint name: {input_endpoint_name}")

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    try:
        output_files = minio_utils.artifact_download_iterator(
            workflow_json, {"sagemaker-deploy-model": ["endpoint_name"]}, download_dir
        )
        output_endpoint_name = utils.read_from_file_in_tar(
            output_files["sagemaker-deploy-model"]["endpoint_name"],
            "endpoint_name.txt",
        )
        print(f"endpoint name: {output_endpoint_name}")

        # The pipeline output must echo the endpoint name we supplied.
        assert output_endpoint_name == input_endpoint_name

        # The endpoint must be live.
        assert (
            sagemaker_utils.describe_endpoint(sagemaker_client, input_endpoint_name)[
                "EndpointStatus"
            ]
            == "InService"
        )

        # Smoke-test the deployed model with a prediction.
        result = run_predict_mnist(boto3_session, input_endpoint_name, download_dir)
        print(f"prediction result: {result}")
        assert json.dumps(result, sort_keys=True) == json.dumps(
            test_params["ExpectedPrediction"], sort_keys=True
        )
        utils.remove_dir(download_dir)
    finally:
        # Always delete the endpoint, even when an assertion above fails.
        sagemaker_utils.delete_endpoint(sagemaker_client, input_endpoint_name)
def test_trainingjob(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    """Run a training pipeline and verify the SageMaker job, artifacts and image."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    output_files = minio_utils.artifact_download_iterator(
        workflow_json,
        {
            "sagemaker-training-job": [
                "job_name",
                "model_artifact_url",
                "training_image",
            ]
        },
        download_dir,
    )
    job_outputs = output_files["sagemaker-training-job"]

    # Verify the training job was successful on SageMaker.
    training_job_name = utils.read_from_file_in_tar(job_outputs["job_name"])
    print(f"training job name: {training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, training_job_name
    )
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify the model artifacts output was generated by this run.
    model_artifact_url = utils.read_from_file_in_tar(job_outputs["model_artifact_url"])
    print(f"model_artifact_url: {model_artifact_url}")
    assert model_artifact_url == train_response["ModelArtifacts"]["S3ModelArtifacts"]
    assert training_job_name in model_artifact_url

    # Verify the training image output is the expected image (or at least ECR).
    training_image = utils.read_from_file_in_tar(job_outputs["training_image"])
    print(f"Training image used: {training_image}")
    if "ExpectedTrainingImage" in test_params.keys():
        assert test_params["ExpectedTrainingImage"] == training_image
    else:
        assert f"dkr.ecr.{region}.amazonaws.com" in training_image

    assert not argo_utils.error_in_cw_logs(
        workflow_json["metadata"]["name"]
    ), "Found the CloudWatch error message in the log output. Check SageMaker to see if the job has failed."

    utils.remove_dir(download_dir)
def test_trainingjob(kfp_client, experiment_id, sagemaker_client, test_file_dir):
    """Run a training pipeline (JSON-argument components) and verify the SageMaker job.

    Checks that the job completed and that its model-artifact output belongs to
    this run.
    """
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # These arguments are passed to the component as JSON strings.
    test_params["Arguments"]["hyperparameters"] = json.dumps(
        test_params["Arguments"]["hyperparameters"]
    )
    test_params["Arguments"]["channels"] = json.dumps(
        test_params["Arguments"]["channels"]
    )

    run_id, status, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {"sagemaker-training-job": ["job_name", "model_artifact_url"]}
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir
    )

    # Verify the training job was successful on SageMaker.
    # utils.extract_information returns bytes, hence the .decode() calls below.
    training_job_name = utils.extract_information(
        output_files["sagemaker-training-job"]["job_name"], "job_name.txt"
    )
    print(f"training job name: {training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, training_job_name.decode()
    )
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify the model artifacts output was generated by this run.
    model_artifact_url = utils.extract_information(
        output_files["sagemaker-training-job"]["model_artifact_url"],
        "model_artifact_url.txt",
    )
    print(f"model_artifact_url: {model_artifact_url}")
    assert (
        model_artifact_url.decode()
        == train_response["ModelArtifacts"]["S3ModelArtifacts"]
    )
    # BUGFIX: the original asserted S3ModelArtifacts in model_artifact_url,
    # which is trivially true after the equality assert above. The intent
    # (matching the sibling training test) is that the artifact URL contains
    # this run's job name.
    assert training_job_name.decode() in model_artifact_url.decode()

    # Clean up generated files, consistent with the other tests in this suite.
    utils.remove_dir(download_dir)
def create_workteamjob(
    kfp_client, test_params, experiment_id, region, sagemaker_client, download_dir
):
    """Run the create-workteam pipeline described by ``test_params``.

    Returns the workflow JSON of the completed run.
    """
    # Fill in the account/region specific user_pool, client_id and user_groups
    # for the SageMaker Workforce.
    (
        test_params["Arguments"]["user_pool"],
        test_params["Arguments"]["client_id"],
        test_params["Arguments"]["user_groups"],
    ) = sagemaker_utils.get_cognito_member_definitions(sagemaker_client)

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )
    return workflow_json
def test_createmodel(kfp_client, experiment_id, sagemaker_client, test_file_dir):
    """Run the create-model pipeline and verify the model exists in SageMaker."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Random prefix avoids failures when a model with the same name already exists.
    input_model_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["model_name"]
    )
    test_params["Arguments"]["model_name"] = input_model_name
    print(f"running test with model_name: {input_model_name}")

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    output_files = minio_utils.artifact_download_iterator(
        workflow_json, {"sagemaker-create-model": ["model_name"]}, download_dir
    )
    output_model_name = utils.read_from_file_in_tar(
        output_files["sagemaker-create-model"]["model_name"]
    )
    print(f"model_name: {output_model_name}")

    # The pipeline output must echo the model name, and the model must exist.
    assert output_model_name == input_model_name
    assert (
        sagemaker_utils.describe_model(sagemaker_client, input_model_name) is not None
    )

    utils.remove_dir(download_dir)
def test_processingjob(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    """Run a processing-job pipeline and verify job status and output artifacts."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Random prefix avoids failures when a job with the same name already exists.
    input_job_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["job_name"]
    )
    test_params["Arguments"]["job_name"] = input_job_name
    print(f"running test with job_name: {input_job_name}")

    # Give every S3 output a job-unique prefix.
    for index, output in enumerate(test_params["Arguments"]["output_config"]):
        if "S3Output" in output:
            test_params["Arguments"]["output_config"][index]["S3Output"]["S3Uri"] = (
                os.path.join(output["S3Output"]["S3Uri"], input_job_name)
            )

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    output_files = minio_utils.artifact_download_iterator(
        workflow_json,
        {"sagemaker-processing-job": ["job_name", "output_artifacts"]},
        download_dir,
    )

    # Verify the processing job was successful on SageMaker.
    processing_job_name = utils.read_from_file_in_tar(
        output_files["sagemaker-processing-job"]["job_name"]
    )
    print(f"processing job name: {processing_job_name}")
    process_response = sagemaker_utils.describe_processing_job(
        sagemaker_client, processing_job_name
    )
    assert process_response["ProcessingJobStatus"] == "Completed"
    assert process_response["ProcessingJobArn"].split("/")[1] == input_job_name

    # Verify the processing job produced the correct outputs.
    processing_outputs = json.loads(
        utils.read_from_file_in_tar(
            output_files["sagemaker-processing-job"]["output_artifacts"],
        )
    )
    print(
        f"processing job outputs: {json.dumps(processing_outputs, indent = 2)}"
    )
    assert processing_outputs is not None
    for output in process_response["ProcessingOutputConfig"]["Outputs"]:
        assert processing_outputs[output["OutputName"]] == output["S3Output"]["S3Uri"]

    assert not argo_utils.error_in_cw_logs(
        workflow_json["metadata"]["name"]
    ), "Found the CloudWatch error message in the log output. Check SageMaker to see if the job has failed."

    utils.remove_dir(download_dir)
def test_hyperparameter_tuning(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    """Run an HPO pipeline and verify the tuning job, best training job and artifacts."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # These arguments are passed to the component as JSON strings.
    for json_arg in (
        "channels",
        "static_parameters",
        "integer_parameters",
        "categorical_parameters",
    ):
        test_params["Arguments"][json_arg] = json.dumps(test_params["Arguments"][json_arg])

    _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
    )

    outputs = {
        "sagemaker-hyperparameter-tuning": [
            "best_hyperparameters",
            "best_job_name",
            "hpo_job_name",
            "model_artifact_url",
            "training_image",
        ]
    }
    output_files = minio_utils.artifact_download_iterator(
        workflow_json, outputs, download_dir
    )
    hpo_outputs = output_files["sagemaker-hyperparameter-tuning"]

    # Verify the HPO job was successful on SageMaker.
    hpo_job_name = utils.read_from_file_in_tar(
        hpo_outputs["hpo_job_name"], "hpo_job_name.txt"
    )
    print(f"HPO job name: {hpo_job_name}")
    hpo_response = sagemaker_utils.describe_hpo_job(sagemaker_client, hpo_job_name)
    assert hpo_response["HyperParameterTuningJobStatus"] == "Completed"

    # Verify the training image output is the expected image (or at least ECR).
    training_image = utils.read_from_file_in_tar(
        hpo_outputs["training_image"], "training_image.txt"
    )
    print(f"Training image used: {training_image}")
    if "ExpectedTrainingImage" in test_params.keys():
        assert test_params["ExpectedTrainingImage"] == training_image
    else:
        assert f"dkr.ecr.{region}.amazonaws.com" in training_image

    # Verify the training job was part of this HPO job, was returned as the
    # best job, and completed successfully.
    best_training_job_name = utils.read_from_file_in_tar(
        hpo_outputs["best_job_name"], "best_job_name.txt"
    )
    print(f"best training job name: {best_training_job_name}")
    train_response = sagemaker_utils.describe_training_job(
        sagemaker_client, best_training_job_name
    )
    assert train_response["TuningJobArn"] == hpo_response["HyperParameterTuningJobArn"]
    assert (
        train_response["TrainingJobName"]
        == hpo_response["BestTrainingJob"]["TrainingJobName"]
    )
    assert train_response["TrainingJobStatus"] == "Completed"

    # Verify the model artifacts output was generated from this run.
    model_artifact_url = utils.read_from_file_in_tar(
        hpo_outputs["model_artifact_url"], "model_artifact_url.txt"
    )
    print(f"model_artifact_url: {model_artifact_url}")
    assert model_artifact_url == train_response["ModelArtifacts"]["S3ModelArtifacts"]
    assert best_training_job_name in model_artifact_url

    # Verify the best-hyperparameters output is not empty.
    hyper_parameters = json.loads(
        utils.read_from_file_in_tar(
            hpo_outputs["best_hyperparameters"], "best_hyperparameters.txt"
        )
    )
    print(
        f"HPO best hyperparameters: {json.dumps(hyper_parameters, indent = 2)}"
    )
    assert hyper_parameters is not None

    utils.remove_dir(download_dir)
def test_groundtruth_labeling_job(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    """Create a workteam, start a Ground Truth labeling job against it, verify
    it is InProgress, then terminate the run and clean everything up.

    TODO: Add a bot to complete the labeling job and check for completion instead.
    """
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # BUGFIX: these names were previously bound only inside the `try`, so an
    # early setup failure made the `finally` block raise NameError and mask
    # the original exception. Initialize them up front and guard the cleanup.
    workteam_name = workteam_arn = ground_truth_train_job_name = None
    try:
        workteam_name, workteam_arn = create_initial_workteam(
            kfp_client,
            experiment_id,
            region,
            sagemaker_client,
            "resources/config/create-workteam",
            download_dir,
        )
        test_params["Arguments"]["workteam_arn"] = workteam_arn

        # Name the labeling job after the workteam that will perform the labeling.
        ground_truth_train_job_name = (
            test_params["Arguments"]["ground_truth_train_job_name"]
            + "-by-"
            + workteam_name
        )
        test_params["Arguments"][
            "ground_truth_train_job_name"
        ] = ground_truth_train_job_name

        run_id, _, _ = kfp_client_utils.compile_run_monitor_pipeline(
            kfp_client,
            experiment_id,
            test_params["PipelineDefinition"],
            test_params["Arguments"],
            download_dir,
            test_params["TestName"],
            test_params["Timeout"],
            test_params["StatusToCheck"],
        )

        # Verify the GroundTruthJob was created in SageMaker and is InProgress.
        response = sagemaker_utils.describe_labeling_job(
            sagemaker_client, ground_truth_train_job_name
        )
        assert response["LabelingJobStatus"] == "InProgress"

        # Verify that the workteam has the specified labeling job.
        labeling_jobs = sagemaker_utils.list_labeling_jobs_for_workteam(
            sagemaker_client, workteam_arn
        )
        assert len(labeling_jobs["LabelingJobSummaryList"]) == 1
        assert (
            labeling_jobs["LabelingJobSummaryList"][0]["LabelingJobName"]
            == ground_truth_train_job_name
        )

        # Test terminate functionality.
        print(
            f"Terminating run: {run_id} where GT job_name: {ground_truth_train_job_name}"
        )
        kfp_client_utils.terminate_run(kfp_client, run_id)
        response = sagemaker_utils.describe_labeling_job(
            sagemaker_client, ground_truth_train_job_name
        )
        assert response["LabelingJobStatus"] in ["Stopping", "Stopped"]
    finally:
        # If terminate failed (or we never got that far), stop the labeling job.
        if workteam_arn is not None:
            labeling_jobs = sagemaker_utils.list_labeling_jobs_for_workteam(
                sagemaker_client, workteam_arn
            )
            if len(labeling_jobs["LabelingJobSummaryList"]) > 0:
                sagemaker_utils.stop_labeling_job(
                    sagemaker_client, ground_truth_train_job_name
                )
        # Cleanup the workteam if it was created.
        if workteam_name is not None:
            workteams = sagemaker_utils.list_workteams(sagemaker_client)["Workteams"]
            workteam_names = list(map((lambda x: x["WorkteamName"]), workteams))
            if workteam_name in workteam_names:
                sagemaker_utils.delete_workteam(sagemaker_client, workteam_name)
        # Delete generated files.
        utils.remove_dir(download_dir)
def test_groundtruth_labeling_job(
    kfp_client, experiment_id, region, sagemaker_client, test_file_dir
):
    """Create a workteam, launch a Ground Truth labeling job with it, and verify
    the job is InProgress before cleaning up.

    TODO: Add a bot to complete the labeling job and check for completion instead.
    """
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # First create a workteam via a separate pipeline, then look up its ARN.
    workteam_name, _ = create_workteamjob(
        kfp_client,
        experiment_id,
        region,
        sagemaker_client,
        "resources/config/create-workteam",
        download_dir,
    )
    workteam_arn = sagemaker_utils.get_workteam_arn(sagemaker_client, workteam_name)
    test_params["Arguments"]["workteam_arn"] = workteam_arn

    # Name the labeling job after the workteam that will perform the labeling.
    ground_truth_train_job_name = (
        test_params["Arguments"]["ground_truth_train_job_name"]
        + "-by-"
        + workteam_name
    )
    test_params["Arguments"][
        "ground_truth_train_job_name"
    ] = ground_truth_train_job_name

    _ = kfp_client_utils.compile_run_monitor_pipeline(
        kfp_client,
        experiment_id,
        test_params["PipelineDefinition"],
        test_params["Arguments"],
        download_dir,
        test_params["TestName"],
        test_params["Timeout"],
        test_params["StatusToCheck"],
    )

    try:
        # Verify the GroundTruthJob was created in SageMaker and is InProgress.
        response = sagemaker_utils.describe_labeling_job(
            sagemaker_client, ground_truth_train_job_name
        )
        assert response["LabelingJobStatus"] == "InProgress"

        # Verify that the workteam has the specified labeling job.
        labeling_jobs = sagemaker_utils.list_labeling_jobs_for_workteam(
            sagemaker_client, workteam_arn
        )
        assert len(labeling_jobs["LabelingJobSummaryList"]) == 1
        assert (
            labeling_jobs["LabelingJobSummaryList"][0]["LabelingJobName"]
            == ground_truth_train_job_name
        )
    finally:
        # Cleanup the SageMaker resources.
        sagemaker_utils.stop_labeling_job(sagemaker_client, ground_truth_train_job_name)
        sagemaker_utils.delete_workteam(sagemaker_client, workteam_name)
        # Delete generated files.
        utils.remove_dir(download_dir)
def test_create_endpoint(
    kfp_client, experiment_id, boto3_session, sagemaker_client, test_file_dir
):
    """Deploy (or update) an endpoint, verify its config and predictions, then delete it."""
    download_dir = utils.mkdir(os.path.join(test_file_dir + "/generated"))
    test_params = utils.load_params(
        utils.replace_placeholders(
            os.path.join(test_file_dir, "config.yaml"),
            os.path.join(download_dir, "config.yaml"),
        )
    )

    # Random prefix for model, endpoint config and endpoint name avoids
    # collisions with pre-existing resources.
    input_endpoint_name = (
        utils.generate_random_string(5) + "-" + test_params["Arguments"]["model_name"]
    )
    test_params["Arguments"]["model_name"] = input_endpoint_name
    test_params["Arguments"]["endpoint_config_name"] = input_endpoint_name
    test_params["Arguments"]["endpoint_name"] = input_endpoint_name

    try:
        print(f"running test with model/endpoint name: {input_endpoint_name}")
        _, _, workflow_json = kfp_client_utils.compile_run_monitor_pipeline(
            kfp_client,
            experiment_id,
            test_params["PipelineDefinition"],
            test_params["Arguments"],
            download_dir,
            test_params["TestName"],
            test_params["Timeout"],
        )

        output_files = minio_utils.artifact_download_iterator(
            workflow_json, {"sagemaker-deploy-model": ["endpoint_name"]}, download_dir
        )
        output_endpoint_name = utils.read_from_file_in_tar(
            output_files["sagemaker-deploy-model"]["endpoint_name"]
        )
        print(f"endpoint name: {output_endpoint_name}")

        # The pipeline output must echo the endpoint name we supplied.
        assert output_endpoint_name == input_endpoint_name

        # The endpoint must be live.
        assert (
            sagemaker_utils.describe_endpoint(sagemaker_client, input_endpoint_name)[
                "EndpointStatus"
            ]
            == "InService"
        )

        # For update runs, confirm the InstanceType actually changed.
        if "ExpectedInstanceType" in test_params.keys():
            new_endpoint_config_name = sagemaker_utils.describe_endpoint(
                sagemaker_client, input_endpoint_name
            )["EndpointConfigName"]
            response = sagemaker_utils.describe_endpoint_config(
                sagemaker_client, new_endpoint_config_name
            )
            prod_variant = response["ProductionVariants"][0]
            print(f"Production Variant item: {prod_variant}")
            instance_type = prod_variant["InstanceType"]
            print(f"Production Variant item InstanceType: {instance_type}")
            assert instance_type == test_params["ExpectedInstanceType"]

        # Smoke-test the deployed model with a prediction.
        result = run_predict_mnist(boto3_session, input_endpoint_name, download_dir)
        print(f"prediction result: {result}")
        assert json.dumps(result, sort_keys=True) == json.dumps(
            test_params["ExpectedPrediction"], sort_keys=True
        )
        utils.remove_dir(download_dir)
    finally:
        # Delete the endpoint only if it was actually created.
        endpoints = sagemaker_utils.list_endpoints(
            sagemaker_client, name_contains=input_endpoint_name
        )["Endpoints"]
        endpoint_names = list(map((lambda x: x["EndpointName"]), endpoints))
        if input_endpoint_name in endpoint_names:
            sagemaker_utils.delete_endpoint(sagemaker_client, input_endpoint_name)