Code example #1
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )

    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()

    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
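
The integration test above relies on two pytest fixtures, sagemaker_session and s3_files_kms_key, plus a TMP_BASE_PATH constant, all defined elsewhere in the test suite. A minimal sketch of what they could look like follows; the environment variable name and the temp path are assumptions for illustration.

import os
import pytest
import sagemaker

# Hypothetical local download target used by the test.
TMP_BASE_PATH = "/tmp/integ-test-s3"


@pytest.fixture(scope="session")
def sagemaker_session():
    # Builds a Session from the default AWS configuration chain.
    return sagemaker.Session()


@pytest.fixture(scope="session")
def s3_files_kms_key():
    # Assumes an existing KMS key ARN is supplied via an environment variable;
    # returning None makes the uploads fall back to default S3 encryption.
    return os.getenv("S3_FILES_KMS_KEY_ARN")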
Code example #2
    def from_s3_uri(cls,
                    constraints_file_s3_uri,
                    kms_key=None,
                    sagemaker_session=None):
        """Generates a Constraints object from an s3 uri.

        Args:
            constraints_file_s3_uri (str): The uri of the constraints JSON file.
            kms_key (str): The kms key to be used to decrypt the file in S3.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.

        Returns:
            sagemaker.model_monitor.Constraints: The instance of Constraints generated from
                the s3 uri.

        """
        try:
            body_dict = json.loads(
                S3Downloader.read_file(
                    s3_uri=constraints_file_s3_uri,
                    sagemaker_session=sagemaker_session,
                )
            )
        except ClientError as error:
            print("\nCould not retrieve constraints file at location '{}'. "
                  "To manually retrieve Constraints object from a given uri, "
                  "use 'my_model_monitor.constraints(my_s3_uri)' or "
                  "'Constraints.from_s3_uri(my_s3_uri)'".format(
                      constraints_file_s3_uri))
            raise error

        return cls(body_dict=body_dict,
                   constraints_file_s3_uri=constraints_file_s3_uri,
                   kms_key=kms_key)
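
A call to this classmethod could look like the following sketch; the bucket and key are placeholders, and the body_dict attribute is assumed to be populated by the Constraints constructor.

import sagemaker
from sagemaker.model_monitor import Constraints

session = sagemaker.Session()
# Placeholder S3 location of a previously generated constraints file.
constraints = Constraints.from_s3_uri(
    constraints_file_s3_uri="s3://my-bucket/monitoring/baseline/constraints.json",
    sagemaker_session=session,
)
print(constraints.body_dict)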
Code example #3
def test_one_step_data_bias_pipeline_constraint_violation(
    sagemaker_session,
    role,
    pipeline_name,
    check_job_config,
    data_bias_check_config,
    supplied_baseline_constraints_uri_param,
):
    data_bias_supplied_baseline_constraints = Constraints.from_file_path(
        constraints_file_path=os.path.join(
            DATA_DIR,
            "pipeline/clarify_check_step/data_bias/bad_cases/analysis.json"),
        sagemaker_session=sagemaker_session,
    ).file_s3_uri
    data_bias_check_step = ClarifyCheckStep(
        name="DataBiasCheckStep",
        clarify_check_config=data_bias_check_config,
        check_job_config=check_job_config,
        skip_check=False,
        register_new_baseline=False,
        supplied_baseline_constraints=supplied_baseline_constraints_uri_param,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        steps=[data_bias_check_step],
        parameters=[supplied_baseline_constraints_uri_param],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        monitoring_analysis_cfg_json = S3Downloader.read_file(
            data_bias_check_config.monitoring_analysis_config_uri,
            sagemaker_session,
        )
        monitoring_analysis_cfg = json.loads(monitoring_analysis_cfg_json)

        assert monitoring_analysis_cfg is not None and len(monitoring_analysis_cfg) > 0

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(
                parameters={
                    "SuppliedBaselineConstraintsUri": data_bias_supplied_baseline_constraints
                }
            )
            response = execution.describe()

            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if _CHECK_FAIL_ERROR_MSG not in failure_reason:
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}. Retrying.."
                )
                continue
            assert execution_steps[0]["StepName"] == "DataBiasCheckStep"
            assert execution_steps[0]["StepStatus"] == "Failed"
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
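
The retries helper (and the _CHECK_FAIL_ERROR_MSG constant) come from the test suite's shared utilities; a minimal sketch of what the retries generator is assumed to do is shown below.

import time


def retries(max_retry_count, exception_message_prefix, seconds_to_sleep):
    # Yield once per attempt, sleeping between attempts; raise if the caller
    # never breaks out of the loop before the retries are exhausted.
    for attempt in range(1, max_retry_count + 1):
        yield attempt
        time.sleep(seconds_to_sleep)
    raise RuntimeError(
        "{} exceeded {} retries.".format(exception_message_prefix, max_retry_count)
    )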
Code example #4
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same used in testendpoint.py

    # Load config from environment and set required defaults
    # AWS specific
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(train_data['train']['test'],
                             'test.csv',
                             sagemaker_session=sm_session)
    print(f"Loadding {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # List the capture files. This is just an example; they are not used
    # right now but could be.
    capture_files = sorted(
        S3Downloader.list(
            "{}/{}".format(
                deploy_data['monitor']['s3_capture_upload_path'],
                deploy_data['endpoint']['name'],
            ),
            sagemaker_session=sm_session,
        )
    )
    # keep just the files with the requested prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(line) for line in content.split("\n")[:-1]]

        capture_records.extend(records)

    print(f"No. of records {len(capture_records)} captured")
    captured_predictions = {}

    for obj in capture_records:
        # Extract inference ID
        inference_id = obj["eventMetadata"]["inferenceId"]
        # the current version of the script starts at 1 when id=0;
        # remove the prefix and get the numeric id
        req_id = int(inference_id[len(inference_id_prefix):])

        # Extract result given by the model
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            # I have fixed this value here because
            # obj["captureData"]["endpointOutput"]["observedContentType"]
            # sometimes includes the encoding, like: text/csv; utf-8,
            # and encoders.decode() will raise an error.
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'], capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(data_to_upload,
                                          target_s3_uri,
                                          sagemaker_session=sm_session)
    print("Done !")
Code example #5
def example_run_training_pipeline(workflow_arn, region):
    """
    Execute the workflow, which consists of five steps:

    1. Define job names for pre-processing, training, and evaluation
    2. Upload source code for pre-processing, training, and evaluation
    3. Define URLs for the input, output, and intermediary data
    4. Execute the workflow with populated parameters, and monitor the progress
    5. Inspect the evaluation result when the execution is completed
    """

    training_pipeline = get_existing_training_pipeline(workflow_arn)

    # Step 1 - Generate unique names for Pre-Processing Job, Training Job, and Evaluation Job
    unique_id = uuid.uuid1().hex
    # pipeline_job_name = f"pipeline-job-{unique_id}"
    training_job_name = f"scikit-learn-training-{unique_id}"
    preprocessing_job_name = f"scikit-learn-sm-preprocessing-{unique_id}"
    evaluation_job_name = f"scikit-learn-sm-evaluation-{unique_id}"

    # Step 2 - Upload source code (pre-processing, evaluation, and training) to SageMaker
    PREPROCESSING_SCRIPT_LOCATION = "../../src/mlmax/preprocessing.py"
    EVALUATION_SCRIPT_LOCATION = "../../src/mlmax/evaluation.py"
    TRAINING_SCRIPT_LOCATION = "../../src/mlmax/train.py"

    sagemaker_session = sagemaker.Session()
    input_preprocessing_code = sagemaker_session.upload_data(
        PREPROCESSING_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{preprocessing_job_name}/source",
    )
    input_evaluation_code = sagemaker_session.upload_data(
        EVALUATION_SCRIPT_LOCATION,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{evaluation_job_name}/source",
    )
    s3_bucket_base_uri = f"s3://{sagemaker_session.default_bucket()}"
    sm_submit_dir_url = (
        f"{s3_bucket_base_uri}/{training_job_name}/source/sourcedir.tar.gz")
    tar = tarfile.open("/tmp/sourcedir.tar.gz", "w:gz")
    # TODO need to add directory if source_dir is specified.
    tar.add(TRAINING_SCRIPT_LOCATION, arcname="train.py")
    tar.close()
    sagemaker_session.upload_data(
        "/tmp/sourcedir.tar.gz",
        bucket=sagemaker_session.default_bucket(),
        key_prefix=f"{training_job_name}/source",
    )

    # Step 3 - Define data URLs, preprocessed data URLs can be made
    # specifically to this training job
    input_data = (
        f"s3://sagemaker-sample-data-{region}/processing/census/census-income.csv"
    )
    output_data = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"
    preprocessed_training_data = f"{output_data}/train_data"
    preprocessed_test_data = f"{output_data}/test_data"
    preprocessed_model_url = f"{s3_bucket_base_uri}/{preprocessing_job_name}/output"
    # Step 4 - Execute workflow
    print(f"Training Job Name is {training_job_name}")
    execution = training_pipeline.execute(
        inputs={
            "InputDataURL": input_data,
            # Each pre-processing job (SageMaker processing job) requires a unique name,
            "PreprocessingJobName": preprocessing_job_name,
            "PreprocessingCodeURL": input_preprocessing_code,
            # Each SageMaker training job requires a unique name,
            "TrainingJobName": training_job_name,
            "SMSubmitDirURL": sm_submit_dir_url,
            "SMRegion": region,
            # Each SageMaker processing job requires a unique name,
            "EvaluationProcessingJobName": evaluation_job_name,
            "EvaluationCodeURL": input_evaluation_code,
            "EvaluationResultURL": f"{s3_bucket_base_uri}/{training_job_name}/evaluation",
            "PreprocessedTrainDataURL": preprocessed_training_data,
            "PreprocessedTestDataURL": preprocessed_test_data,
            "PreprocessedModelURL": preprocessed_model_url,
            "SMOutputDataURL": f"{s3_bucket_base_uri}/",
            "SMDebugOutputURL": f"{s3_bucket_base_uri}/",
        }
    )
    workflow_execution_output_json = execution.get_output(wait=True)
    execution.render_progress()

    # Step 5 - Inspect the output of the Workflow execution
    import json
    from sagemaker.s3 import S3Downloader

    evaluation_output_config = workflow_execution_output_json["ProcessingOutputConfig"]
    for output in evaluation_output_config["Outputs"]:
        if output["OutputName"] == "evaluation":
            evaluation_s3_uri = "{}/{}".format(
                output["S3Output"]["S3Uri"], "evaluation.json")
            break

    evaluation_output = S3Downloader.read_file(evaluation_s3_uri)
    evaluation_output_dict = json.loads(evaluation_output)
    print(json.dumps(evaluation_output_dict, sort_keys=True, indent=4))
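
The get_existing_training_pipeline helper is assumed to attach to a Step Functions state machine that was deployed earlier; a minimal sketch using the AWS Step Functions Data Science SDK might look like this.

from stepfunctions.workflow import Workflow


def get_existing_training_pipeline(workflow_arn):
    # Attach to an existing state machine so the pipeline can be executed
    # without redefining its steps.
    return Workflow.attach(state_machine_arn=workflow_arn)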