def monitor_frequency(interval="daily", hour_interval=None, starting_hour=0):

    # this allows users to define the frequency of data drift monitoring

    if interval == "daily":
        monitoring_frequency = CronExpressionGenerator.daily()
    elif interval == "hourly":
        monitoring_frequency = CronExpressionGenerator.hourly()
    elif interval == "others":
        monitoring_frequency = CronExpressionGenerator.daily_every_x_hours(
            hour_interval, starting_hour
        )
    else:
        raise ValueError(f"Unsupported interval: {interval}")
    return monitoring_frequency
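# Example usage (illustrative, not part of the original snippet): the "others"
# interval maps onto CronExpressionGenerator.daily_every_x_hours().
print(monitor_frequency("hourly"))                    # cron(0 * ? * * *)
print(monitor_frequency("others", hour_interval=6))   # cron(0 0/6 ? * * *)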
from tests.integ.retry import retries

XGBOOST_DATA_PATH = os.path.join(DATA_DIR, "xgboost_model")
ENDPOINT_INPUT_LOCAL_PATH = "/opt/ml/processing/input/endpoint"
PROBABILITY_THRESHOLD = 0.5005
PROBLEM_TYPE = "Regression"
INFERENCE_ATTRIBUTE = "0"
HEADER_OF_LABEL = "Label"
HEADER_OF_PREDICTED_LABEL = "Prediction"
HEADERS_OF_FEATURES = ["F1", "F2", "F3", "F4", "F5", "F6", "F7"]
ALL_HEADERS = [
    *HEADERS_OF_FEATURES, HEADER_OF_LABEL, HEADER_OF_PREDICTED_LABEL
]

CRON = "cron(0 * * * ? *)"
UPDATED_CRON = CronExpressionGenerator.daily()
MAX_RUNTIME_IN_SECONDS = 30 * 60
UPDATED_MAX_RUNTIME_IN_SECONDS = 25 * 60
ROLE = "SageMakerRole"
INSTANCE_COUNT = 1
INSTANCE_TYPE = "ml.c5.xlarge"
VOLUME_SIZE_IN_GB = 100
START_TIME_OFFSET = "-PT1H"
END_TIME_OFFSET = "-PT0H"
TEST_TAGS = [{"Key": "integration", "Value": "test"}]
# TODO: Remove this workaround once the API service fix is deployed to Prod
TEST_ENV = {"problem_type": PROBLEM_TYPE}


@pytest.fixture(scope="module")
def endpoint_name(sagemaker_session):
Example #3
def main(resources, train_data):

    # configuration
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
    BASE_JOB_PREFIX = os.getenv('BASE_JOB_PREFIX', 'sts')
    # only fall back to get_execution_role() when AWS_ROLE is not set
    ROLE_ARN = os.getenv('AWS_ROLE') or sagemaker.get_execution_role()
    outputs = resources

    bucket = sm_session.default_bucket()
    prefix = "{}/{}".format(BASE_JOB_PREFIX, resources['endpoint']['name'])
    if 'monitor' not in resources:
        raise ValueError("Monitoring not enabled")

    if 's3_capture_upload_path' not in resources['monitor']:
        raise ValueError("Data capture upload path not configured")

    baseline_prefix = prefix + "/baselining"
    baseline_data_prefix = baseline_prefix + "/data"
    baseline_results_prefix = baseline_prefix + "/results"
    baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)
    baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)
    outputs['monitor'].update({
        'baseline': {
            'data_uri': baseline_data_uri,
            'results_uri': baseline_results_uri
        }
    })
    _l.info("Baseline data uri: {}".format(baseline_data_uri))
    _l.info("Baseline results uri: {}".format(baseline_results_uri))

    ground_truth_upload_path = f"s3://{bucket}/{prefix}/ground_truth_data"
    _l.info(f"Ground truth uri: {ground_truth_upload_path}")
    outputs['monitor'].update({'ground_truth_uri': ground_truth_upload_path})

    # Create a baselining job with training dataset
    _l.info("Executing a baselining job with training dataset")
    _l.info(f"baseline_data_uri: {train_data['baseline']['validate']}")
    my_monitor = ModelQualityMonitor(
        role=ROLE_ARN,
        sagemaker_session=sm_session,
        max_runtime_in_seconds=1800  # 30 minutes
    )
    my_monitor.suggest_baseline(
        baseline_dataset=train_data['baseline']['validate'] + "/baseline.csv",
        dataset_format=DatasetFormat.csv(header=True),
        problem_type="Regression",
        inference_attribute="prediction",
        ground_truth_attribute="label",
        output_s3_uri=baseline_results_uri,
        wait=True)
    baseline_job = my_monitor.latest_baselining_job
    _l.info("suggested baseline contrains")
    _l.info(
        pprint.pformat(baseline_job.suggested_constraints().
                       body_dict["regression_constraints"]))
    _l.info("suggested baseline statistics")
    _l.info(
        pprint.pformat(baseline_job.baseline_statistics().
                       body_dict["regression_metrics"]))

    monitor_schedule_name = (
        f"{BASE_JOB_PREFIX}-mq-sch-{datetime.datetime.utcnow():%Y-%m-%d-%H%M}")
    _l.info(f"Monitoring schedule name: {monitor_schedule_name}")
    outputs['monitor'].update({'schedule_name': monitor_schedule_name})
    endpointInput = EndpointInput(
        resources['endpoint']['name'],
        "/opt/ml/processing/input_data",
        inference_attribute='0'  # REVIEW:
    )

    my_monitor.create_monitoring_schedule(
        monitor_schedule_name=monitor_schedule_name,
        endpoint_input=endpointInput,
        output_s3_uri=baseline_results_uri,
        problem_type="Regression",
        ground_truth_input=ground_truth_upload_path,
        constraints=baseline_job.suggested_constraints(),
        # run the scheduler hourly
        schedule_cron_expression=CronExpressionGenerator.hourly(),
        enable_cloudwatch_metrics=True,
    )
    mq_schedule_details = my_monitor.describe_schedule()
    while mq_schedule_details['MonitoringScheduleStatus'] == 'Pending':
        _l.info(f'Waiting for {monitor_schedule_name}')
        time.sleep(3)
        mq_schedule_details = my_monitor.describe_schedule()
    _l.debug(
        f"Model Quality Monitor - schedule details: {pprint.pformat(mq_schedule_details)}"
    )
    _l.info(
        f"Model Quality Monitor - schedule status: {mq_schedule_details['MonitoringScheduleStatus']}"
    )
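
    # Illustrative addition (assumption, not in the original example): past
    # monitoring runs can be inspected via list_executions(), inherited from
    # the ModelMonitor base class.
    executions = my_monitor.list_executions()
    if executions:
        last_status = executions[-1].describe()['ProcessingJobStatus']
        _l.info(f"Latest monitoring execution status: {last_status}")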

    # save outputs to a file
    with open('deploymodel_out.json', 'w') as f:
        json.dump(outputs, f, default=json_default)
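
# A hypothetical sketch of the json_default helper used above (its definition
# is not shown in the original snippet); json.dump needs such a hook to
# serialize the datetime values stored in the outputs dict. Assumes the
# datetime module is already imported, as it is elsewhere in this example.
def json_default(obj):
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")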
Example #4
print('Model data baseline suggested at {}'.format(baseline_results_uri))

import datetime
from time import gmtime, strftime

mon_schedule_name = '{}-{}'.format(mon_schedule_name_base, datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S"))

s3_report_path = f's3://{bucket}/{prefix}/monitor/report'

# Set up a daily cron schedule
print(f"Attempting to create monitoring schedule as {mon_schedule_name} \n")

try:
    my_default_monitor.create_monitoring_schedule(
        monitor_schedule_name=mon_schedule_name,
        endpoint_input=endpoint_name,
        output_s3_uri=s3_report_path,
        statistics=my_default_monitor.baseline_statistics(),
        constraints=my_default_monitor.suggested_constraints(),
        schedule_cron_expression=CronExpressionGenerator.daily(),
        enable_cloudwatch_metrics=True,
    )
    desc_schedule_result = my_default_monitor.describe_schedule()
    print('Created monitoring schedule. Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))
    
except Exception:
    # a schedule with this name already exists for the endpoint; update it instead
    print("Monitoring schedule already exists for endpoint. Updating schedule.")
    my_default_monitor.update_monitoring_schedule(
        endpoint_input=endpoint_name,
        schedule_cron_expression=CronExpressionGenerator.daily()
    )
def test_cron_expression_generator_daily_every_x_hours_returns_expected_value_when_called_with_customizations():
    assert (
        CronExpressionGenerator.daily_every_x_hours(hour_interval=7, starting_hour=8)
        == "cron(0 8/7 ? * * *)"
    )


def test_cron_expression_generator_daily_every_x_hours_returns_expected_value_when_called_without_customizations():
    assert CronExpressionGenerator.daily_every_x_hours(hour_interval=6) == "cron(0 0/6 ? * * *)"


def test_cron_expression_generator_daily_returns_expected_value_when_called_with_parameters():
    assert CronExpressionGenerator.daily(hour=5) == "cron(0 5 ? * * *)"


def test_cron_expression_generator_hourly_returns_expected_value():
    assert CronExpressionGenerator.hourly() == "cron(0 * ? * * *)"
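
# An additional check (an assumption based on the generator's defaults, which
# emit a daily schedule at midnight UTC when no hour is given):
def test_cron_expression_generator_daily_returns_expected_value_when_called_without_parameters():
    assert CronExpressionGenerator.daily() == "cron(0 0 ? * * *)"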
Example #9
# NOTE: the head of this call was cut off in the original example; the
# reconstruction below assumes a DefaultModelMonitor with placeholder settings.
my_default_monitor = DefaultModelMonitor(
    role=role,                     # assumed: an execution role defined earlier
    instance_count=1,              # assumed
    instance_type='ml.m5.xlarge',  # assumed
    max_runtime_in_seconds=3600,
)
def create_baseline():
    print(f'Baseline data uri: {baseline_data_uri}')
    print(f'Baseline results uri: {baseline_results_uri}')

    my_default_monitor.suggest_baseline(
        baseline_dataset=baseline_data_uri,
        dataset_format=DatasetFormat.csv(header=False),
        output_s3_uri=baseline_results_uri,
        wait=True
    )

mon_schedule_name = 'xgb-boston-pred-model-monitor-schedule-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
my_default_monitor.create_monitoring_schedule(
    monitor_schedule_name=mon_schedule_name,
    endpoint_input=endpoint_name,
    output_s3_uri=baseline_results_uri.replace('baseline_results', 'monitor_reports'),
    statistics=baseline_results_uri + '/statistics.json',
    constraints=baseline_results_uri + '/constraints.json',
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)