def test_transform_config_from_amazon_alg_estimator(sagemaker_session):
    knn_estimator = knn.KNN(
        role="{{ role }}",
        train_instance_count="{{ instance_count }}",
        train_instance_type="ml.m4.xlarge",
        k=16,
        sample_size=128,
        predictor_type="regressor",
        sagemaker_session=sagemaker_session,
    )

    record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, "S3Prefix")
    transform_data = "{{ transform_data }}"

    # simulate training
    airflow.training_config(knn_estimator, record, mini_batch_size=256)

    config = airflow.transform_config_from_estimator(
        estimator=knn_estimator,
        task_id="task_id",
        task_type="training",
        instance_count="{{ instance_count }}",
        instance_type="ml.p2.xlarge",
        data=transform_data,
    )
    expected_config = {
        "Model": {
            "ModelName": "knn-%s" % TIME_STAMP,
            "PrimaryContainer": {
                "Image":
                "174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1",
                "Environment": {},
                "ModelDataUrl":
                "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']['TrainingJobName'] }}"
                "/output/model.tar.gz",
            },
            "ExecutionRoleArn": "{{ role }}",
        },
        "Transform": {
            "TransformJobName": "knn-%s" % TIME_STAMP,
            "ModelName": "knn-%s" % TIME_STAMP,
            "TransformInput": {
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        "S3Uri": "{{ transform_data }}"
                    }
                }
            },
            "TransformOutput": {
                "S3OutputPath": "s3://output/knn-%s" % TIME_STAMP
            },
            "TransformResources": {
                "InstanceCount": "{{ instance_count }}",
                "InstanceType": "ml.p2.xlarge",
            },
        },
    }

    assert config == expected_config
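
The returned config is meant to be handed to the Airflow SageMaker operators. A minimal sketch of that hand-off, assuming the Airflow 1.10 contrib operators used in the integration tests below; the DAG boilerplate is illustrative and not part of the original test:

# Sketch: the training operator's task_id must equal the task_id passed to
# transform_config_from_estimator, so the "{{ ti.xcom_pull(task_ids='task_id') }}"
# template in ModelDataUrl can resolve at runtime.
from airflow import DAG
from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
from airflow.utils.dates import days_ago

dag = DAG(
    "knn_example",
    default_args={"owner": "airflow", "start_date": days_ago(2)},
    schedule_interval="@once",
)
train_op = SageMakerTrainingOperator(
    task_id="task_id",
    config=airflow.training_config(knn_estimator, record, mini_batch_size=256),
    dag=dag,
)
transform_op = SageMakerTransformOperator(task_id="transform", config=config, dag=dag)
transform_op.set_upstream(train_op)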
Example #3
def test_pytorch_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        estimator = PyTorch(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version="1.1.0",
            train_instance_count=2,
            train_instance_type=cpu_instance_type,
            hyperparameters={
                "epochs": 6,
                "backend": "gloo"
            },
        )

        train_config = sm_airflow.training_config(estimator=estimator)

        uploaded_s3_data = train_config["HyperParameters"][
            "sagemaker_submit_directory"].strip('"')

        transform_config = sm_airflow.transform_config_from_estimator(
            estimator=estimator,
            task_id="transform_config",
            task_type="training",
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            data=uploaded_s3_data,
            content_type="text/csv",
        )

        default_args = {
            "owner": "airflow",
            "start_date": airflow.utils.dates.days_ago(2),
            "provide_context": True,
        }

        dag = DAG("tensorflow_example",
                  default_args=default_args,
                  schedule_interval="@once")

        train_op = SageMakerTrainingOperator(task_id="tf_training",
                                             config=train_config,
                                             wait_for_completion=True,
                                             dag=dag)

        transform_op = SageMakerTransformOperator(task_id="transform_operator",
                                                  config=transform_config,
                                                  wait_for_completion=True,
                                                  dag=dag)

        transform_op.set_upstream(train_op)

        _assert_that_s3_url_contains_data(sagemaker_session, uploaded_s3_data)
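
_assert_that_s3_url_contains_data is defined elsewhere in the test module; a minimal sketch of such a helper, assuming it only verifies that objects exist under the uploaded prefix (the actual implementation may differ):

from urllib.parse import urlparse

def _assert_that_s3_url_contains_data(sagemaker_session, s3_url):
    # List objects under the S3 prefix and assert the upload actually produced keys.
    parsed = urlparse(s3_url)
    response = sagemaker_session.boto_session.client("s3").list_objects_v2(
        Bucket=parsed.netloc, Prefix=parsed.path.lstrip("/")
    )
    assert response["KeyCount"] >= 1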
def _build_airflow_workflow(estimator,
                            instance_type,
                            inputs=None,
                            mini_batch_size=None):
    training_config = sm_airflow.training_config(
        estimator=estimator, inputs=inputs, mini_batch_size=mini_batch_size)

    model = estimator.create_model()
    assert model is not None

    model_config = sm_airflow.model_config(instance_type, model)
    assert model_config is not None

    transform_config = sm_airflow.transform_config_from_estimator(
        estimator=estimator,
        task_id="transform_config",
        task_type="training",
        instance_count=SINGLE_INSTANCE_COUNT,
        instance_type=estimator.train_instance_type,
        data=inputs,
        content_type="text/csv",
        input_filter="$",
        output_filter="$",
    )

    default_args = {
        "owner": "airflow",
        "start_date": airflow.utils.dates.days_ago(2),
        "provide_context": True,
    }

    dag = DAG("tensorflow_example",
              default_args=default_args,
              schedule_interval="@once")

    train_op = SageMakerTrainingOperator(task_id="tf_training",
                                         config=training_config,
                                         wait_for_completion=True,
                                         dag=dag)

    transform_op = SageMakerTransformOperator(task_id="transform_operator",
                                              config=transform_config,
                                              wait_for_completion=True,
                                              dag=dag)

    transform_op.set_upstream(train_op)

    return training_config
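
A hypothetical call site for this helper; the estimator settings and the S3 input below are illustrative placeholders, not values taken from the test suite:

# Hypothetical usage of _build_airflow_workflow; the S3 input path is a placeholder.
estimator = PyTorch(
    entry_point=PYTORCH_MNIST_SCRIPT,
    role=ROLE,
    framework_version="1.1.0",
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
)
training_config = _build_airflow_workflow(
    estimator, instance_type="ml.c4.xlarge", inputs="s3://my-bucket/mnist/train"
)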
# train_config specifies SageMaker training configuration
train_config = training_config(estimator=fm_estimator,
                               inputs=config["train_model"]["inputs"])

# create tuner
fm_tuner = HyperparameterTuner(estimator=fm_estimator,
                               **config["tune_model"]["tuner_config"])

# create tuning config
tuner_config = tuning_config(tuner=fm_tuner,
                             inputs=config["tune_model"]["inputs"])

# create transform config
transform_config = transform_config_from_estimator(
    estimator=fm_estimator,
    task_id="model_tuning" if hpo_enabled else "model_training",
    task_type="tuning" if hpo_enabled else "training",
    **config["batch_transform"]["transform_config"])

# =============================================================================
# define airflow DAG and tasks
# =============================================================================

# define airflow DAG

args = {"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)}

dag = DAG(
    dag_id="sagemaker-ml-pipeline",
    default_args=args,
    schedule_interval=None,
)
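
The snippet stops after defining the DAG; a minimal sketch of wiring the three configs above into operators follows (the imports, task ids, and hpo_enabled branching are assumptions based on the surrounding examples, and the task ids must match the task_id passed to transform_config_from_estimator):

from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator
from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator

# Task ids mirror the task_id given to transform_config_from_estimator so its
# xcom_pull template can locate the upstream training or tuning result.
train_op = SageMakerTrainingOperator(
    task_id="model_training", config=train_config, wait_for_completion=True, dag=dag
)
tune_op = SageMakerTuningOperator(
    task_id="model_tuning", config=tuner_config, wait_for_completion=True, dag=dag
)
transform_op = SageMakerTransformOperator(
    task_id="batch_transform", config=transform_config, wait_for_completion=True, dag=dag
)
transform_op.set_upstream(tune_op if hpo_enabled else train_op)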
def test_transform_config_from_framework_estimator(sagemaker_session):
    mxnet_estimator = mxnet.MXNet(
        entry_point="{{ entry_point }}",
        source_dir="{{ source_dir }}",
        py_version="py3",
        framework_version="1.3.0",
        role="{{ role }}",
        train_instance_count=1,
        train_instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        base_job_name="{{ base_job_name }}",
        hyperparameters={"batch_size": 100},
    )

    train_data = "{{ train_data }}"
    transform_data = "{{ transform_data }}"

    # simulate training
    airflow.training_config(mxnet_estimator, train_data)

    config = airflow.transform_config_from_estimator(
        estimator=mxnet_estimator,
        task_id="task_id",
        task_type="training",
        instance_count="{{ instance_count }}",
        instance_type="ml.p2.xlarge",
        data=transform_data,
    )
    expected_config = {
        "Model": {
            "ModelName": "sagemaker-mxnet-%s" % TIME_STAMP,
            "PrimaryContainer": {
                "Image":
                "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:1.3.0-gpu-py3",
                "Environment": {
                    "SAGEMAKER_PROGRAM":
                    "{{ entry_point }}",
                    "SAGEMAKER_SUBMIT_DIRECTORY":
                    "s3://output/{{ ti.xcom_pull(task_ids='task_id')"
                    "['Training']['TrainingJobName'] }}"
                    "/source/sourcedir.tar.gz",
                    "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS":
                    "false",
                    "SAGEMAKER_CONTAINER_LOG_LEVEL":
                    "20",
                    "SAGEMAKER_REGION":
                    "us-west-2",
                },
                "ModelDataUrl":
                "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']['TrainingJobName'] }}"
                "/output/model.tar.gz",
            },
            "ExecutionRoleArn": "{{ role }}",
        },
        "Transform": {
            "TransformJobName": "{{ base_job_name }}-%s" % TIME_STAMP,
            "ModelName": "sagemaker-mxnet-%s" % TIME_STAMP,
            "TransformInput": {
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        "S3Uri": "{{ transform_data }}"
                    }
                }
            },
            "TransformOutput": {
                "S3OutputPath":
                "s3://output/{{ base_job_name }}-%s" % TIME_STAMP
            },
            "TransformResources": {
                "InstanceCount": "{{ instance_count }}",
                "InstanceType": "ml.p2.xlarge",
            },
            "Environment": {},
        },
    }

    assert config == expected_config