import os

import boto3
import sagemaker
from sagemaker.pipeline import PipelineModel
from sagemaker.sparkml.model import SparkMLModel

s3 = boto3.resource('s3')


def lambda_handler(event, context):
    # Input schema for the SparkML container; get_schema_json() is a helper
    # defined elsewhere in the deployment package.
    schema_json = get_schema_json()
    bucket = os.environ['bucket']
    role = os.environ['role']
    sess = sagemaker.Session()

    # Read the execution ID that an earlier pipeline step wrote to S3
    obj = s3.Object(bucket, 'execution.txt')
    exec_id = obj.get()['Body'].read().decode('utf-8')
    print(exec_id)

    # Derive names and S3 locations from the execution ID
    training_job = f'{exec_id}-job'
    mleap_model_prefix = f'sagemaker/spark-preprocess-demo/{exec_id}/mleap-model'

    # Create the models that make up the pipeline
    xgb_model = sagemaker.estimator.Estimator.attach(
        training_job, sagemaker_session=sess).create_model()
    sparkml_data = f's3://{bucket}/{mleap_model_prefix}/model.tar.gz'
    sparkml_model = SparkMLModel(model_data=sparkml_data,
                                 env={'SAGEMAKER_SPARKML_SCHEMA': schema_json})

    # Chain the SparkML preprocessor and the XGBoost model into one
    # inference pipeline
    model_name = 'inference-pipeline-' + exec_id
    sm_model = PipelineModel(name=model_name,
                             role=role,
                             models=[sparkml_model, xgb_model])
    # Requesting a transformer creates (registers) the pipeline model in
    # SageMaker; the returned Transformer itself is not used here.
    sm_model.transformer(1, 'ml.m4.xlarge')

    event['model_name'] = model_name
    event['timestamp_prefix'] = exec_id
    return event
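
For reference, SAGEMAKER_SPARKML_SCHEMA carries the JSON input/output schema used by the SageMaker SparkML Serving container. The helper below is a hypothetical sketch of what get_schema_json() might return; the column names and types are placeholders and must match the features the MLeap model was trained on.

import json

def get_schema_json():
    # Hypothetical schema: replace the input columns with the actual
    # features your SparkML (MLeap) pipeline expects.
    schema = {
        "input": [
            {"name": "sex", "type": "string"},
            {"name": "length", "type": "double"},
        ],
        "output": {"name": "features", "type": "double", "struct": "vector"},
    }
    return json.dumps(schema)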
Example #2
# Integration test, SageMaker Python SDK v1 style (get_image_uri and the
# Model(image=...) argument); see Example #4 for the SDK v2 equivalent.
def test_inference_pipeline_batch_transform(sagemaker_session):
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(SPARKML_DATA_PATH, 'mleap_model.tar.gz'),
        key_prefix='integ-test-data/sparkml/model')
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(XGBOOST_DATA_PATH, 'xgb_model.tar.gz'),
        key_prefix='integ-test-data/xgboost/model')
    batch_job_name = 'test-inference-pipeline-batch-{}'.format(
        sagemaker_timestamp())
    sparkml_model = SparkMLModel(model_data=sparkml_model_data,
                                 env={'SAGEMAKER_SPARKML_SCHEMA': SCHEMA},
                                 sagemaker_session=sagemaker_session)
    xgb_image = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')
    xgb_model = Model(model_data=xgb_model_data,
                      image=xgb_image,
                      sagemaker_session=sagemaker_session)
    model = PipelineModel(models=[sparkml_model, xgb_model],
                          role='SageMakerRole',
                          sagemaker_session=sagemaker_session,
                          name=batch_job_name)
    transformer = model.transformer(1, 'ml.m4.xlarge')
    transform_input_key_prefix = 'integ-test-data/sparkml_xgboost/transform'
    transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix=transform_input_key_prefix)

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.transform(transform_input,
                              content_type=CONTENT_TYPE_CSV,
                              job_name=batch_job_name)
        transformer.wait()
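
As a follow-up sketch (not part of the original test): once transformer.wait() returns, batch output is written under transformer.output_path, one "<input-object-name>.out" object per input file. Assuming that default naming, the results could be listed with boto3 like this:

import boto3
from urllib.parse import urlparse

parsed = urlparse(transformer.output_path)  # e.g. s3://bucket/prefix
s3_client = boto3.client('s3')
response = s3_client.list_objects_v2(Bucket=parsed.netloc,
                                     Prefix=parsed.path.lstrip('/'))
for entry in response.get('Contents', []):
    print(entry['Key'])  # each input object gets a matching "<name>.out"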
Example #3
# Unit test: "tfo" and "time" are mock objects, presumably injected by
# @patch decorators omitted from this excerpt; no AWS calls are made.
def test_transformer(tfo, time, sagemaker_session):
    framework_model = DummyFrameworkModel(sagemaker_session)
    sparkml_model = SparkMLModel(model_data=MODEL_DATA_2,
                                 role=ROLE,
                                 sagemaker_session=sagemaker_session)
    model_name = "ModelName"
    model = PipelineModel(
        models=[framework_model, sparkml_model],
        role=ROLE,
        sagemaker_session=sagemaker_session,
        name=model_name,
    )

    instance_count = 55
    strategy = "MultiRecord"
    assemble_with = "Line"
    output_path = "s3://output/path"
    output_kms_key = "output:kms:key"
    accept = "application/jsonlines"
    env = {"my_key": "my_value"}
    max_concurrent_transforms = 20
    max_payload = 5
    tags = [{"my_tag": "my_value"}]
    volume_kms_key = "volume:kms:key"
    transformer = model.transformer(
        instance_type=INSTANCE_TYPE,
        instance_count=instance_count,
        strategy=strategy,
        assemble_with=assemble_with,
        output_path=output_path,
        output_kms_key=output_kms_key,
        accept=accept,
        env=env,
        max_concurrent_transforms=max_concurrent_transforms,
        max_payload=max_payload,
        tags=tags,
        volume_kms_key=volume_kms_key,
    )
    assert transformer.instance_type == INSTANCE_TYPE
    assert transformer.instance_count == instance_count
    assert transformer.strategy == strategy
    assert transformer.assemble_with == assemble_with
    assert transformer.output_path == output_path
    assert transformer.output_kms_key == output_kms_key
    assert transformer.accept == accept
    assert transformer.env == env
    assert transformer.max_concurrent_transforms == max_concurrent_transforms
    assert transformer.max_payload == max_payload
    assert transformer.tags == tags
    assert transformer.volume_kms_key == volume_kms_key
    assert transformer.model_name == model_name
Example #4

# SageMaker Python SDK v2 variant of Example #2: image_uris.retrieve and
# the image_uri argument replace the v1 get_image_uri / image pair.
def test_inference_pipeline_batch_transform(sagemaker_session,
                                            cpu_instance_type):
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(SPARKML_DATA_PATH, "mleap_model.tar.gz"),
        key_prefix="integ-test-data/sparkml/model",
    )
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(XGBOOST_DATA_PATH, "xgb_model.tar.gz"),
        key_prefix="integ-test-data/xgboost/model",
    )
    batch_job_name = "test-inference-pipeline-batch-{}".format(
        sagemaker_timestamp())
    sparkml_model = SparkMLModel(
        model_data=sparkml_model_data,
        env={"SAGEMAKER_SPARKML_SCHEMA": SCHEMA},
        sagemaker_session=sagemaker_session,
    )
    xgb_image = image_uris.retrieve("xgboost",
                                    sagemaker_session.boto_region_name,
                                    version="1",
                                    image_scope="inference")
    xgb_model = Model(model_data=xgb_model_data,
                      image_uri=xgb_image,
                      sagemaker_session=sagemaker_session)
    model = PipelineModel(
        models=[sparkml_model, xgb_model],
        role="SageMakerRole",
        sagemaker_session=sagemaker_session,
        name=batch_job_name,
    )
    transformer = model.transformer(1, cpu_instance_type)
    transform_input_key_prefix = "integ-test-data/sparkml_xgboost/transform"
    transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix=transform_input_key_prefix)

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.transform(transform_input,
                              content_type="text/csv",
                              job_name=batch_job_name)
        transformer.wait()
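
The same PipelineModel can also serve real-time traffic instead of batch jobs. A minimal sketch, assuming the "model" object from Example #4 and the v2 SDK; the endpoint name, instance type, and CSV feature values are placeholders:

from sagemaker.serializers import CSVSerializer

# deploy() registers the pipeline model and stands up a real-time endpoint
predictor = model.deploy(initial_instance_count=1,
                         instance_type='ml.m4.xlarge',
                         endpoint_name='inference-pipeline-endpoint')
predictor.serializer = CSVSerializer()

# One CSV record passes through the SparkML container first, then XGBoost
# (the feature values below are placeholders)
print(predictor.predict('F,0.515,0.425,0.14,0.766,0.304,0.1725,0.255'))

predictor.delete_endpoint()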