def test_model_config_from_amazon_alg_estimator(sagemaker_session):
    knn_estimator = knn.KNN(role="{{ role }}",
                            train_instance_count="{{ instance_count }}",
                            train_instance_type='ml.m4.xlarge',
                            k=16, sample_size=128,
                            predictor_type='regressor',
                            sagemaker_session=sagemaker_session)

    record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, 'S3Prefix')

    # simulate training
    airflow.training_config(knn_estimator, record, mini_batch_size=256)

    config = airflow.model_config_from_estimator(instance_type='ml.c4.xlarge',
                                                 estimator=knn_estimator,
                                                 task_id='task_id',
                                                 task_type='tuning')
    expected_config = {
        'ModelName': "knn-%s" % TIME_STAMP,
        'PrimaryContainer': {
            'Image': '174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1',
            'Environment': {},
            'ModelDataUrl': "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Tuning']['BestTrainingJob']"
                            "['TrainingJobName'] }}/output/model.tar.gz"
        },
        'ExecutionRoleArn': '{{ role }}'
    }

    assert config == expected_config
def test_transform_config_from_amazon_alg_estimator(sagemaker_session): knn_estimator = knn.KNN( role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.m4.xlarge", k=16, sample_size=128, predictor_type="regressor", sagemaker_session=sagemaker_session, ) record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, "S3Prefix") transform_data = "{{ transform_data }}" # simulate training airflow.training_config(knn_estimator, record, mini_batch_size=256) config = airflow.transform_config_from_estimator( estimator=knn_estimator, task_id="task_id", task_type="training", instance_count="{{ instance_count }}", instance_type="ml.p2.xlarge", data=transform_data, ) expected_config = { "Model": { "ModelName": "knn-%s" % TIME_STAMP, "PrimaryContainer": { "Image": "174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1", "Environment": {}, "ModelDataUrl": "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']['TrainingJobName'] }}" "/output/model.tar.gz", }, "ExecutionRoleArn": "{{ role }}", }, "Transform": { "TransformJobName": "knn-%s" % TIME_STAMP, "ModelName": "knn-%s" % TIME_STAMP, "TransformInput": { "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "{{ transform_data }}" } } }, "TransformOutput": { "S3OutputPath": "s3://output/knn-%s" % TIME_STAMP }, "TransformResources": { "InstanceCount": "{{ instance_count }}", "InstanceType": "ml.p2.xlarge", }, }, } assert config == expected_config
def test_model_config_from_amazon_alg_estimator(sagemaker_session):
    job_name = get_job_name('knn')

    knn_estimator = knn.KNN(role="{{ role }}",
                            train_instance_count="{{ instance_count }}",
                            train_instance_type='ml.m4.xlarge',
                            k=16, sample_size=128,
                            predictor_type='regressor',
                            sagemaker_session=sagemaker_session)

    record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, 'S3Prefix')

    # simulate training
    airflow.training_config(knn_estimator, record, mini_batch_size=256)

    config = airflow.model_config_from_estimator(instance_type='ml.c4.xlarge',
                                                 estimator=knn_estimator)
    expected_config = {
        'ModelName': job_name,
        'PrimaryContainer': {
            'Image': '174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1',
            'Environment': {},
            'ModelDataUrl': "s3://output/{}/output/model.tar.gz".format(job_name)
        },
        'ExecutionRoleArn': '{{ role }}'
    }

    assert config == expected_config
def test_model_config_from_amazon_alg_estimator(sagemaker_session): knn_estimator = knn.KNN( role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.m4.xlarge", k=16, sample_size=128, predictor_type="regressor", sagemaker_session=sagemaker_session, ) record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, "S3Prefix") # simulate training airflow.training_config(knn_estimator, record, mini_batch_size=256) config = airflow.model_config_from_estimator(instance_type="ml.c4.xlarge", estimator=knn_estimator, task_id="task_id", task_type="tuning") expected_config = { "ModelName": "knn-%s" % TIME_STAMP, "PrimaryContainer": { "Image": "174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1", "Environment": {}, "ModelDataUrl": "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Tuning']['BestTrainingJob']" "['TrainingJobName'] }}/output/model.tar.gz", }, "ExecutionRoleArn": "{{ role }}", } assert config == expected_config
def test_transform_config_from_amazon_alg_estimator(sagemaker_session): knn_estimator = knn.KNN(role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type='ml.m4.xlarge', k=16, sample_size=128, predictor_type='regressor', sagemaker_session=sagemaker_session) record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, 'S3Prefix') transform_data = "{{ transform_data }}" # simulate training airflow.training_config(knn_estimator, record, mini_batch_size=256) config = airflow.transform_config_from_estimator( estimator=knn_estimator, instance_count="{{ instance_count }}", instance_type="ml.p2.xlarge", data=transform_data) expected_config = { 'Model': { 'ModelName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'PrimaryContainer': { 'Image': '174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1', 'Environment': {}, 'ModelDataUrl': "s3://output/knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" "/output/model.tar.gz" }, 'ExecutionRoleArn': '{{ role }}' }, 'Transform': { 'TransformJobName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'ModelName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'TransformInput': { 'DataSource': { 'S3DataSource': { 'S3DataType': 'S3Prefix', 'S3Uri': '{{ transform_data }}' } } }, 'TransformOutput': { 'S3OutputPath': "s3://output/knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" }, 'TransformResources': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.p2.xlarge' } } } assert config == expected_config
def test_deploy_config_from_framework_estimator(sagemaker_session): mxnet_estimator = mxnet.MXNet( entry_point="{{ entry_point }}", source_dir="{{ source_dir }}", py_version='py3', framework_version='1.3.0', role="{{ role }}", train_instance_count=1, train_instance_type='ml.m4.xlarge', sagemaker_session=sagemaker_session, base_job_name="{{ base_job_name }}", hyperparameters={'batch_size': 100}) train_data = "{{ train_data }}" # simulate training airflow.training_config(mxnet_estimator, train_data) config = airflow.deploy_config_from_estimator(estimator=mxnet_estimator, task_id='task_id', task_type='training', initial_instance_count="{{ instance_count}}", instance_type="ml.c4.large", endpoint_name="mxnet-endpoint") expected_config = { 'Model': { 'ModelName': "sagemaker-mxnet-%s" % TIME_STAMP, 'PrimaryContainer': { 'Image': '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:1.3.0-cpu-py3', 'Environment': { 'SAGEMAKER_PROGRAM': '{{ entry_point }}', 'SAGEMAKER_SUBMIT_DIRECTORY': "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']" "['TrainingJobName'] }}/source/sourcedir.tar.gz", 'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', 'SAGEMAKER_CONTAINER_LOG_LEVEL': '20', 'SAGEMAKER_REGION': 'us-west-2'}, 'ModelDataUrl': "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']['TrainingJobName'] }}" "/output/model.tar.gz" }, 'ExecutionRoleArn': '{{ role }}' }, 'EndpointConfig': { 'EndpointConfigName': "sagemaker-mxnet-%s" % TIME_STAMP, 'ProductionVariants': [{ 'InstanceType': 'ml.c4.large', 'InitialInstanceCount': '{{ instance_count}}', 'ModelName': "sagemaker-mxnet-%s" % TIME_STAMP, 'VariantName': 'AllTraffic', 'InitialVariantWeight': 1 }] }, 'Endpoint': { 'EndpointName': 'mxnet-endpoint', 'EndpointConfigName': "sagemaker-mxnet-%s" % TIME_STAMP } } assert config == expected_config
def test_deploy_config_from_amazon_alg_estimator(sagemaker_session): knn_estimator = knn.KNN(role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type='ml.m4.xlarge', k=16, sample_size=128, predictor_type='regressor', sagemaker_session=sagemaker_session) record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, 'S3Prefix') # simulate training airflow.training_config(knn_estimator, record, mini_batch_size=256) config = airflow.deploy_config_from_estimator( estimator=knn_estimator, initial_instance_count="{{ instance_count }}", instance_type="ml.p2.xlarge") expected_config = { 'Model': { 'ModelName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'PrimaryContainer': { 'Image': '174872318107.dkr.ecr.us-west-2.amazonaws.com/knn:1', 'Environment': {}, 'ModelDataUrl': "s3://output/knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" "/output/model.tar.gz" }, 'ExecutionRoleArn': '{{ role }}' }, 'EndpointConfig': { 'EndpointConfigName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'ProductionVariants': [{ 'InstanceType': 'ml.p2.xlarge', 'InitialInstanceCount': '{{ instance_count }}', 'ModelName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'VariantName': 'AllTraffic', 'InitialVariantWeight': 1 }] }, 'Endpoint': { 'EndpointName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'EndpointConfigName': "knn-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" } } assert config == expected_config
def test_model_config_from_framework_estimator(sagemaker_session): mxnet_estimator = mxnet.MXNet( entry_point="{{ entry_point }}", source_dir="{{ source_dir }}", py_version="py3", framework_version="1.3.0", role="{{ role }}", train_instance_count=1, train_instance_type="ml.m4.xlarge", sagemaker_session=sagemaker_session, base_job_name="{{ base_job_name }}", hyperparameters={"batch_size": 100}, ) data = "{{ training_data }}" # simulate training airflow.training_config(mxnet_estimator, data) config = airflow.model_config_from_estimator( instance_type="ml.c4.xlarge", estimator=mxnet_estimator, task_id="task_id", task_type="training", ) expected_config = { "ModelName": "sagemaker-mxnet-%s" % TIME_STAMP, "PrimaryContainer": { "Image": "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:1.3.0-cpu-py3", "Environment": { "SAGEMAKER_PROGRAM": "{{ entry_point }}", "SAGEMAKER_SUBMIT_DIRECTORY": "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']" "['TrainingJobName'] }}/source/sourcedir.tar.gz", "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_REGION": "us-west-2", }, "ModelDataUrl": "s3://output/{{ ti.xcom_pull(task_ids='task_id')['Training']['TrainingJobName'] }}" "/output/model.tar.gz", }, "ExecutionRoleArn": "{{ role }}", } assert config == expected_config
def __init__(self, state_id, estimator, job_name, data=None, hyperparameters=None,
             mini_batch_size=None, wait_for_completion=True, **kwargs):
    """
    Args:
        state_id (str): State name whose length **must be** less than or equal to 128 unicode characters.
            State names **must be** unique within the scope of the whole state machine.
        estimator (sagemaker.estimator.EstimatorBase): The estimator for the training step. Can be a
            `BYO estimator, Framework estimator <https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html>`_
            or `Amazon built-in algorithm estimator <https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html>`_.
        job_name (str or Placeholder): Specify a training job name, this is required for the training job to run.
            We recommend to use :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass
            the value dynamically in each execution.
        data: Information about the training data. Please refer to the ``fit()`` method of the associated estimator,
            as this can take any of the following forms:

            * (str) - The S3 location where training data is saved.
            * (dict[str, str] or dict[str, sagemaker.session.s3_input]) - If using multiple channels for training
              data, you can specify a dict mapping channel names to strings or :func:`~sagemaker.session.s3_input`
              objects.
            * (sagemaker.session.s3_input) - Channel configuration for S3 data sources that can provide additional
              information about the training dataset. See :func:`sagemaker.session.s3_input` for full details.
            * (sagemaker.amazon.amazon_estimator.RecordSet) - A collection of Amazon :class:`Record` objects
              serialized and stored in S3. For use with an estimator for an Amazon algorithm.
            * (list[sagemaker.amazon.amazon_estimator.RecordSet]) - A list of
              :class:`sagemaker.amazon.amazon_estimator.RecordSet` objects, where each instance is a different
              channel of training data.
        hyperparameters (dict, optional): Specify the hyper parameters for the training. (Default: None)
        mini_batch_size (int): Specify this argument only when estimator is a built-in estimator of an Amazon
            algorithm. For other estimators, batch size should be specified in the estimator.
        wait_for_completion (bool, optional): Boolean value set to `True` if the Task state should wait for the
            training job to complete before proceeding to the next step in the workflow. Set to `False` if the Task
            state should submit the training job and proceed to the next step. (default: True)
    """
    self.estimator = estimator
    self.job_name = job_name

    if wait_for_completion:
        kwargs[Field.Resource.value] = 'arn:aws:states:::sagemaker:createTrainingJob.sync'
    else:
        kwargs[Field.Resource.value] = 'arn:aws:states:::sagemaker:createTrainingJob'

    if isinstance(job_name, str):
        parameters = training_config(estimator=estimator, inputs=data, job_name=job_name,
                                     mini_batch_size=mini_batch_size)
    else:
        parameters = training_config(estimator=estimator, inputs=data,
                                     mini_batch_size=mini_batch_size)

    if isinstance(job_name, (ExecutionInput, StepInput)):
        parameters['TrainingJobName'] = job_name

    if hyperparameters is not None:
        parameters['HyperParameters'] = hyperparameters

    if 'S3Operations' in parameters:
        del parameters['S3Operations']

    kwargs[Field.Parameters.value] = parameters

    super(TrainingStep, self).__init__(state_id, **kwargs)
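# Usage sketch (illustrative, not from the original source): a minimal example of how this
# TrainingStep constructor is typically invoked, using SageMaker Python SDK v1 argument names
# to match the snippets above. The image URI, role ARN, bucket, and schema key are placeholders.
import sagemaker
from stepfunctions.inputs import ExecutionInput
from stepfunctions.steps import TrainingStep

execution_input = ExecutionInput(schema={"JobName": str})  # placeholder schema

estimator = sagemaker.estimator.Estimator(
    "111111111111.dkr.ecr.us-east-1.amazonaws.com/my-algo:latest",  # placeholder image
    role="arn:aws:iam::111111111111:role/SageMakerRole",            # placeholder role
    train_instance_count=1,
    train_instance_type="ml.m5.xlarge",
    output_path="s3://my-bucket/output/",                           # placeholder bucket
)

# Because job_name is a Placeholder (not a str), training_config() is called without a
# job_name, and 'TrainingJobName' is filled in from the placeholder at execution time.
training_step = TrainingStep(
    "Train Model",
    estimator=estimator,
    data={"train": "s3://my-bucket/train/"},  # placeholder channel
    job_name=execution_input["JobName"],
    wait_for_completion=True,
)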
def test_model_config_from_framework_estimator(sagemaker_session): mxnet_estimator = mxnet.MXNet(entry_point="{{ entry_point }}", source_dir="{{ source_dir }}", py_version='py3', framework_version='1.3.0', role="{{ role }}", train_instance_count=1, train_instance_type='ml.m4.xlarge', sagemaker_session=sagemaker_session, base_job_name="{{ base_job_name }}", hyperparameters={'batch_size': 100}) data = "{{ training_data }}" # simulate training airflow.training_config(mxnet_estimator, data) config = airflow.model_config_from_estimator(instance_type='ml.c4.xlarge', estimator=mxnet_estimator) expected_config = { 'ModelName': "{{ base_job_name }}-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'PrimaryContainer': { 'Image': '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet:1.3.0-cpu-py3', 'Environment': { 'SAGEMAKER_PROGRAM': '{{ entry_point }}', 'SAGEMAKER_SUBMIT_DIRECTORY': "s3://output/{{ base_job_name }}-" "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" "/source/sourcedir.tar.gz", 'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', 'SAGEMAKER_CONTAINER_LOG_LEVEL': '20', 'SAGEMAKER_REGION': 'us-west-2' }, 'ModelDataUrl': "s3://output/{{ base_job_name }}-" "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}/output/model.tar.gz" }, 'ExecutionRoleArn': '{{ role }}' } assert config == expected_config
def test_amazon_alg_training_config_required_args(sagemaker_session): ntm_estimator = ntm.NTM( role="{{ role }}", num_topics=10, train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", sagemaker_session=sagemaker_session, ) ntm_estimator.epochs = 32 record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, "S3Prefix") config = airflow.training_config(ntm_estimator, record, mini_batch_size=256) expected_config = { "AlgorithmSpecification": { "TrainingImage": "174872318107.dkr.ecr.us-west-2.amazonaws.com/ntm:1", "TrainingInputMode": "File", }, "OutputDataConfig": { "S3OutputPath": "s3://output/" }, "TrainingJobName": "ntm-%s" % TIME_STAMP, "StoppingCondition": { "MaxRuntimeInSeconds": 86400 }, "ResourceConfig": { "InstanceCount": "{{ instance_count }}", "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 30, }, "RoleArn": "{{ role }}", "InputDataConfig": [{ "DataSource": { "S3DataSource": { "S3DataDistributionType": "ShardedByS3Key", "S3DataType": "S3Prefix", "S3Uri": "{{ record }}", } }, "ChannelName": "train", }], "HyperParameters": { "num_topics": "10", "epochs": "32", "mini_batch_size": "256", "feature_dim": "100", }, } assert config == expected_config
def test_amazon_alg_training_config_required_args(sagemaker_session): job_name = get_job_name('ntm') ntm_estimator = ntm.NTM(role="{{ role }}", num_topics=10, train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", sagemaker_session=sagemaker_session) ntm_estimator.epochs = 32 record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, 'S3Prefix') config = airflow.training_config(ntm_estimator, record, mini_batch_size=256) expected_config = { 'AlgorithmSpecification': { 'TrainingImage': '174872318107.dkr.ecr.us-west-2.amazonaws.com/ntm:1', 'TrainingInputMode': 'File' }, 'OutputDataConfig': { 'S3OutputPath': 's3://output/' }, 'TrainingJobName': job_name, 'StoppingCondition': { 'MaxRuntimeInSeconds': 86400 }, 'ResourceConfig': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': 30 }, 'RoleArn': '{{ role }}', 'InputDataConfig': [{ 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'ShardedByS3Key', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ record }}' } }, 'ChannelName': 'train' }], 'HyperParameters': { 'num_topics': '10', 'epochs': '32', 'mini_batch_size': '256', 'feature_dim': '100' } } assert config == expected_config
def test_pytorch_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        estimator = PyTorch(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version="1.1.0",
            train_instance_count=2,
            train_instance_type=cpu_instance_type,
            hyperparameters={"epochs": 6, "backend": "gloo"},
        )

        train_config = sm_airflow.training_config(estimator=estimator)

        uploaded_s3_data = train_config["HyperParameters"]["sagemaker_submit_directory"].strip('"')

        transform_config = sm_airflow.transform_config_from_estimator(
            estimator=estimator,
            task_id="transform_config",
            task_type="training",
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            data=uploaded_s3_data,
            content_type="text/csv",
        )

        default_args = {
            "owner": "airflow",
            "start_date": airflow.utils.dates.days_ago(2),
            "provide_context": True,
        }

        dag = DAG("tensorflow_example", default_args=default_args, schedule_interval="@once")

        train_op = SageMakerTrainingOperator(
            task_id="tf_training", config=train_config, wait_for_completion=True, dag=dag
        )

        transform_op = SageMakerTransformOperator(
            task_id="transform_operator", config=transform_config, wait_for_completion=True, dag=dag
        )

        transform_op.set_upstream(train_op)

        _assert_that_s3_url_contains_data(sagemaker_session, uploaded_s3_data)
def test_byo_training_config_required_args(sagemaker_session): byo = estimator.Estimator( image_name="byo", role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", sagemaker_session=sagemaker_session, ) byo.set_hyperparameters(epochs=32, feature_dim=1024, mini_batch_size=256) data = {"train": "{{ training_data }}"} config = airflow.training_config(byo, data) expected_config = { "AlgorithmSpecification": { "TrainingImage": "byo", "TrainingInputMode": "File" }, "OutputDataConfig": { "S3OutputPath": "s3://output/" }, "TrainingJobName": "byo-%s" % TIME_STAMP, "StoppingCondition": { "MaxRuntimeInSeconds": 86400 }, "ResourceConfig": { "InstanceCount": "{{ instance_count }}", "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 30, }, "RoleArn": "{{ role }}", "InputDataConfig": [{ "DataSource": { "S3DataSource": { "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": "{{ training_data }}", } }, "ChannelName": "train", }], "HyperParameters": { "epochs": "32", "feature_dim": "1024", "mini_batch_size": "256" }, } assert config == expected_config
def get_training_params(
    model_name,
    job_id,
    role,
    image_uri,
    training_uri,
    validation_uri,
    output_uri,
    hyperparameters,
    kms_key_id,
):
    # Create the estimator
    xgb = sagemaker.estimator.Estimator(
        image_uri,
        role,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        output_path=output_uri,
    )

    # Set the hyperparameters overriding with any defaults
    params = {
        "alpha": "0.2",
        "max_depth": "10",
        "eta": "0.12",
        "gamma": "2.0",
        "min_child_weight": "8.5",
        "subsample": "0.6",
        "objective": "binary:logistic",
        "num_round": "20",
    }
    xgb.set_hyperparameters(**{**params, **hyperparameters})

    # Specify the data source
    s3_input_train = sagemaker.inputs.TrainingInput(s3_data=training_uri, content_type="csv")
    s3_input_val = sagemaker.inputs.TrainingInput(s3_data=validation_uri, content_type="csv")
    data = {"train": s3_input_train, "validation": s3_input_val}

    # Get the training request
    request = training_config(xgb, inputs=data, job_name=job_id)
    return {
        "Parameters": {
            "ModelName": model_name,
            "TrainJobId": job_id,
            "TrainJobRequest": json.dumps(request),
            "KmsKeyId": kms_key_id,
        }
    }
def test_byo_training_config_required_args(sagemaker_session): job_name = get_job_name('byo') byo = estimator.Estimator(image_name="byo", role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", sagemaker_session=sagemaker_session) byo.set_hyperparameters(epochs=32, feature_dim=1024, mini_batch_size=256) data = {'train': "{{ training_data }}"} config = airflow.training_config(byo, data) expected_config = { 'AlgorithmSpecification': { 'TrainingImage': 'byo', 'TrainingInputMode': 'File' }, 'OutputDataConfig': { 'S3OutputPath': 's3://output/' }, 'TrainingJobName': job_name, 'StoppingCondition': { 'MaxRuntimeInSeconds': 86400 }, 'ResourceConfig': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': 30 }, 'RoleArn': '{{ role }}', 'InputDataConfig': [{ 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ training_data }}' } }, 'ChannelName': 'train' }], 'HyperParameters': { 'epochs': '32', 'feature_dim': '1024', 'mini_batch_size': '256' } } assert config == expected_config
def _build_airflow_workflow(estimator, instance_type, inputs=None, mini_batch_size=None):
    training_config = sm_airflow.training_config(
        estimator=estimator, inputs=inputs, mini_batch_size=mini_batch_size
    )

    model = estimator.create_model()
    assert model is not None

    model_config = sm_airflow.model_config(instance_type, model)
    assert model_config is not None

    transform_config = sm_airflow.transform_config_from_estimator(
        estimator=estimator,
        task_id="transform_config",
        task_type="training",
        instance_count=SINGLE_INSTANCE_COUNT,
        instance_type=estimator.train_instance_type,
        data=inputs,
        content_type="text/csv",
        input_filter="$",
        output_filter="$",
    )

    default_args = {
        "owner": "airflow",
        "start_date": airflow.utils.dates.days_ago(2),
        "provide_context": True,
    }

    dag = DAG("tensorflow_example", default_args=default_args, schedule_interval="@once")

    train_op = SageMakerTrainingOperator(
        task_id="tf_training", config=training_config, wait_for_completion=True, dag=dag
    )

    transform_op = SageMakerTransformOperator(
        task_id="transform_operator", config=transform_config, wait_for_completion=True, dag=dag
    )

    transform_op.set_upstream(train_op)

    return training_config
def get_training_params(model_name, job_id, role, image_uri, training_uri, validation_uri,
                        output_uri, hyperparameters):
    # Create the estimator
    xgb = sagemaker.estimator.Estimator(image_uri, role,
                                        train_instance_count=1,
                                        train_instance_type='ml.m4.xlarge',
                                        output_path=output_uri)

    # Set the hyperparameters overriding with any defaults
    params = {
        'max_depth': '9',
        'eta': '0.2',
        'gamma': '4',
        'min_child_weight': '300',
        'subsample': '0.8',
        'objective': 'reg:linear',
        'early_stopping_rounds': '10',
        'num_round': '100'
    }
    xgb.set_hyperparameters(**{**params, **hyperparameters})

    # Specify the data source
    s3_input_train = sagemaker.s3_input(s3_data=training_uri, content_type='csv')
    s3_input_val = sagemaker.s3_input(s3_data=validation_uri, content_type='csv')
    data = {'train': s3_input_train, 'validation': s3_input_val}

    # Get the training request
    request = training_config(xgb, inputs=data, job_name=job_id)
    return {
        "Parameters": {
            "ModelName": model_name,
            "TrainJobId": job_id,
            "TrainJobRequest": json.dumps(request),
        }
    }
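# Illustrative usage sketch (not part of the original source): one plausible way the returned
# structure is consumed is as a CloudFormation parameter file. The argument values and the
# output file name below are placeholders.
import json

params = get_training_params(
    model_name="my-model",                                               # placeholder
    job_id="my-model-2020-01-01-00-00-00",                               # placeholder
    role="arn:aws:iam::111111111111:role/SageMakerRole",                 # placeholder
    image_uri="111111111111.dkr.ecr.us-east-1.amazonaws.com/xgboost:1",  # placeholder
    training_uri="s3://my-bucket/train/",
    validation_uri="s3://my-bucket/validation/",
    output_uri="s3://my-bucket/output/",
    hyperparameters={"num_round": "50"},
)

with open("training-params.json", "w") as f:
    json.dump(params, f, indent=2)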
def get_training_request(
    model_name,
    model_id,
    stage,
    role,
    image_uri,
    training_uri,
    training_bucket,
    hyperparameters,
):
    model_uri = "s3://{0}/{1}".format(training_bucket, model_name)

    # include location of tarfile and name of training script
    hyperparameters["sagemaker_program"] = "train.py"
    hyperparameters["sagemaker_submit_directory"] = model_uri + "/code/train.tar.gz"
    params = json_encode_hyperparameters(hyperparameters)

    # Create the estimator
    estimator = sagemaker.estimator.Estimator(
        image_uri,
        role,
        train_instance_count=1,
        train_instance_type="ml.c5.xlarge",
        base_job_name=model_name,
        output_path=model_uri + "/model",
        hyperparameters=params,
    )

    # Specify the data source
    s3_input_train = sagemaker.inputs.TrainingInput(s3_data=training_uri, content_type="csv")
    data = {"train": s3_input_train}

    # Get the training request
    request = training_config(estimator, inputs=data,
                              job_name=get_training_job_name(model_name, model_id))
    return json.dumps(request)
def test_framework_training_config_all_args(sagemaker_session): tf = tensorflow.TensorFlow( entry_point="{{ entry_point }}", source_dir="{{ source_dir }}", enable_cloudwatch_metrics=False, container_log_level="{{ log_level }}", code_location="{{ bucket_name }}", training_steps=1000, evaluation_steps=100, checkpoint_path="{{ checkpoint_path }}", py_version='py2', framework_version='1.10.0', requirements_file="", role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", train_volume_size="{{ train_volume_size }}", train_volume_kms_key="{{ train_volume_kms_key }}", train_max_run="{{ train_max_run }}", input_mode='Pipe', output_path="{{ output_path }}", output_kms_key="{{ output_volume_kms_key }}", base_job_name="{{ base_job_name }}", tags=[{"{{ key }}": "{{ value }}"}], subnets=["{{ subnet }}"], security_group_ids=["{{ security_group_ids }}"], sagemaker_session=sagemaker_session) data = "{{ training_data }}" config = airflow.training_config(tf, data) expected_config = { 'AlgorithmSpecification': { 'TrainingImage': '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2', 'TrainingInputMode': 'Pipe' }, 'OutputDataConfig': { 'S3OutputPath': '{{ output_path }}', 'KmsKeyId': '{{ output_volume_kms_key }}' }, 'TrainingJobName': "{{ base_job_name }}-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'StoppingCondition': { 'MaxRuntimeInSeconds': '{{ train_max_run }}' }, 'ResourceConfig': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': '{{ train_volume_size }}', 'VolumeKmsKeyId': '{{ train_volume_kms_key }}' }, 'RoleArn': '{{ role }}', 'InputDataConfig': [{ 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ training_data }}' } }, 'ChannelName': 'training' }], 'VpcConfig': { 'Subnets': ['{{ subnet }}'], 'SecurityGroupIds': ['{{ security_group_ids }}'] }, 'HyperParameters': { 'sagemaker_submit_directory': '"s3://{{ bucket_name }}/{{ base_job_name }}-' '{{ execution_date.strftime(\'%Y-%m-%d-%H-%M-%S\') }}' '/source/sourcedir.tar.gz"', 'sagemaker_program': '"{{ entry_point }}"', 'sagemaker_enable_cloudwatch_metrics': 'false', 'sagemaker_container_log_level': '"{{ log_level }}"', 'sagemaker_job_name': '"{{ base_job_name }}-{{ execution_date.strftime(\'%Y-%m-%d-%H-%M-%S\') }}"', 'sagemaker_region': '"us-west-2"', 'checkpoint_path': '"{{ checkpoint_path }}"', 'training_steps': '1000', 'evaluation_steps': '100', 'sagemaker_requirements': '""' }, 'Tags': [{'{{ key }}': '{{ value }}'}], 'S3Operations': { 'S3Upload': [{ 'Path': '{{ source_dir }}', 'Bucket': '{{ bucket_name }}', 'Key': "{{ base_job_name }}-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" "/source/sourcedir.tar.gz", 'Tar': True}] } } assert config == expected_config
def test_framework_training_config_required_args(sagemaker_session): tf = tensorflow.TensorFlow( entry_point="{{ entry_point }}", framework_version='1.10.0', training_steps=1000, evaluation_steps=100, role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", sagemaker_session=sagemaker_session) data = "{{ training_data }}" config = airflow.training_config(tf, data) expected_config = { 'AlgorithmSpecification': { 'TrainingImage': '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2', 'TrainingInputMode': 'File' }, 'OutputDataConfig': { 'S3OutputPath': 's3://output/' }, 'TrainingJobName': "sagemaker-tensorflow-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'StoppingCondition': { 'MaxRuntimeInSeconds': 86400 }, 'ResourceConfig': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': 30 }, 'RoleArn': '{{ role }}', 'InputDataConfig': [{ 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ training_data }}' } }, 'ChannelName': 'training' }], 'HyperParameters': { 'sagemaker_submit_directory': '"s3://output/sagemaker-tensorflow-' '{{ execution_date.strftime(\'%Y-%m-%d-%H-%M-%S\') }}' '/source/sourcedir.tar.gz"', 'sagemaker_program': '"{{ entry_point }}"', 'sagemaker_enable_cloudwatch_metrics': 'false', 'sagemaker_container_log_level': '20', 'sagemaker_job_name': '"sagemaker-tensorflow-{{ execution_date.strftime(\'%Y-%m-%d-%H-%M-%S\') }}"', 'sagemaker_region': '"us-west-2"', 'checkpoint_path': '"s3://output/sagemaker-tensorflow-{{ execution_date.strftime(\'%Y-%m-%d-%H-%M-%S\') }}' '/checkpoints"', 'training_steps': '1000', 'evaluation_steps': '100', 'sagemaker_requirements': '""'}, 'S3Operations': { 'S3Upload': [{ 'Path': '{{ entry_point }}', 'Bucket': 'output', 'Key': "sagemaker-tensorflow-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" "/source/sourcedir.tar.gz", 'Tar': True}] } } assert config == expected_config
xgb_container = get_image_uri(sess.region_name, 'xgboost', repo_version="0.90-1")

xgb_estimator = Estimator(image_name=xgb_container,
                          role=role,
                          sagemaker_session=sagemaker.session.Session(sess),
                          **config["train_model"]["estimator_config"])

# train_config specifies SageMaker training configuration
train_data = create_s3_input(config['train_model']['inputs']['train'])
validation_data = create_s3_input(config['train_model']['inputs']['validation'])
data_channels = {'train': train_data, 'validation': validation_data}

train_config = training_config(estimator=xgb_estimator,
                               inputs=data_channels)

# Batch inference
xgb_transformer = Transformer(
    model_name=config['batch_transform']['model_name'],
    sagemaker_session=sagemaker.session.Session(sess),
    **config['batch_transform']['transformer_config'])

# Note: this assignment rebinds the imported transform_config helper name to the
# resulting config dict, so the helper cannot be called again afterwards.
transform_config = transform_config(
    transformer=xgb_transformer,
    **config['batch_transform']['transform_config'])

# =============================================================================
# define airflow DAG and tasks
# =============================================================================

# define airflow DAG
region = config["job_level"]["region_name"] sess = hook.get_session(region_name=region) role = get_sagemaker_role_arn(config["job_level"]["sagemaker_role"], sess.region_name) # define KG estimator # define DKN estimator train_dkn_estimator = Estimator( image_name= '662566784674.dkr.ecr.ap-northeast-1.amazonaws.com/gw-dkn:20201114025113', role=role, sagemaker_session=sagemaker.session.Session(sess), **config["train_dkn"]["estimator_config"]) train_dkn_config = training_config(estimator=train_dkn_estimator, inputs=config["train_dkn"]["inputs"]) def mock_train(data, **context): S3ModelArtifacts = 's3://leigh-gw/dkn_model/dkn-2020-11-24-05-16-33-890/output/model.tar.gz' return S3ModelArtifacts # trigger CDK to deploy model as ECS service using Airflow Python Operator def task_def(data, **context): print('in deploy ...') model_key = context['ti'].xcom_pull(key='return_value') print(model_key) task_def = config["ecs_task_definition"] task_def['containerDefinitions'][0]['environment'][0]['value'] = model_key
def __init__(self, state_id, estimator, job_name, data=None, hyperparameters=None, mini_batch_size=None, experiment_config=None, wait_for_completion=True, tags=None, output_data_config_path=None, **kwargs): """ Args: state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine. estimator (sagemaker.estimator.EstimatorBase): The estimator for the training step. Can be a `BYO estimator, Framework estimator <https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html>`_ or `Amazon built-in algorithm estimator <https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html>`_. job_name (str or Placeholder): Specify a training job name, this is required for the training job to run. We recommend to use :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution. data: Information about the training data. Please refer to the ``fit()`` method of the associated estimator, as this can take any of the following forms: * (str or Placeholder) - The S3 location where training data is saved. * (dict[str, str] or dict[str, sagemaker.inputs.TrainingInput]) - If using multiple channels for training data, you can specify a dict mapping channel names to strings or :func:`~sagemaker.inputs.TrainingInput` objects. * (sagemaker.inputs.TrainingInput) - Channel configuration for S3 data sources that can provide additional information about the training dataset. See :func:`sagemaker.inputs.TrainingInput` for full details. * (sagemaker.amazon.amazon_estimator.RecordSet) - A collection of Amazon :class:`Record` objects serialized and stored in S3. For use with an estimator for an Amazon algorithm. * (list[sagemaker.amazon.amazon_estimator.RecordSet]) - A list of :class:`sagemaker.amazon.amazon_estimator.RecordSet` objects, where each instance is a different channel of training data. hyperparameters: Parameters used for training. * (dict, optional) - Hyperparameters supplied will be merged with the Hyperparameters specified in the estimator. If there are duplicate entries, the value provided through this property will be used. (Default: Hyperparameters specified in the estimator.) * (Placeholder, optional) - The TrainingStep will use the hyperparameters specified by the Placeholder's value instead of the hyperparameters specified in the estimator. mini_batch_size (int): Specify this argument only when estimator is a built-in estimator of an Amazon algorithm. For other estimators, batch size should be specified in the estimator. experiment_config (dict, optional): Specify the experiment config for the training. (Default: None) wait_for_completion (bool, optional): Boolean value set to `True` if the Task state should wait for the training job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the training job and proceed to the next step. (default: True) tags (list[dict], optional): `List to tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource. output_data_config_path (str or Placeholder, optional): S3 location for saving the training result (model artifacts and output files). If specified, it overrides the `output_path` property of `estimator`. 
""" self.estimator = estimator self.job_name = job_name if wait_for_completion: """ Example resource arn: arn:aws:states:::sagemaker:createTrainingJob.sync """ kwargs[Field.Resource.value] = get_service_integration_arn( SAGEMAKER_SERVICE_NAME, SageMakerApi.CreateTrainingJob, IntegrationPattern.WaitForCompletion) else: """ Example resource arn: arn:aws:states:::sagemaker:createTrainingJob """ kwargs[Field.Resource.value] = get_service_integration_arn( SAGEMAKER_SERVICE_NAME, SageMakerApi.CreateTrainingJob) # Convert `data` Placeholder to a JSONPath string because sagemaker.workflow.airflow.training_config does not # accept Placeholder in the `input` argument. We will suffix the 'S3Uri' key in `parameters` with ".$" later. is_data_placeholder = isinstance(data, Placeholder) if is_data_placeholder: data = data.to_jsonpath() if isinstance(job_name, str): parameters = training_config(estimator=estimator, inputs=data, job_name=job_name, mini_batch_size=mini_batch_size) else: parameters = training_config(estimator=estimator, inputs=data, mini_batch_size=mini_batch_size) if estimator.debugger_hook_config != None and estimator.debugger_hook_config is not False: parameters[ 'DebugHookConfig'] = estimator.debugger_hook_config._to_request_dict( ) if estimator.rules != None: parameters['DebugRuleConfigurations'] = [ rule.to_debugger_rule_config_dict() for rule in estimator.rules ] if isinstance(job_name, Placeholder): parameters['TrainingJobName'] = job_name if output_data_config_path is not None: parameters['OutputDataConfig'][ 'S3OutputPath'] = output_data_config_path if data is not None and is_data_placeholder: # Replace the 'S3Uri' key with one that supports JSONpath value. # Support for uri str only: The list will only contain 1 element data_uri = parameters['InputDataConfig'][0]['DataSource'][ 'S3DataSource'].pop('S3Uri', None) parameters['InputDataConfig'][0]['DataSource']['S3DataSource'][ 'S3Uri.$'] = data_uri if hyperparameters is not None: if not isinstance(hyperparameters, Placeholder): if estimator.hyperparameters() is not None: hyperparameters = self.__merge_hyperparameters( hyperparameters, estimator.hyperparameters()) parameters['HyperParameters'] = hyperparameters if experiment_config is not None: parameters['ExperimentConfig'] = experiment_config if 'S3Operations' in parameters: del parameters['S3Operations'] if tags: parameters['Tags'] = tags_dict_to_kv_list(tags) kwargs[Field.Parameters.value] = parameters super(TrainingStep, self).__init__(state_id, **kwargs)
def __init__( self, state_id, estimator, job_name, data=None, hyperparameters=None, mini_batch_size=None, experiment_config=None, wait_for_completion=True, tags=None, train_data=None, test_data=None, sm_submit_url=None, sm_region=None, sm_output_data=None, sm_debug_output_data=None, **kwargs, ): """ Args: state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine. estimator (sagemaker.estimator.EstimatorBase): The estimator for the training step. Can be a `BYO estimator, Framework estimator <https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html>`_ or `Amazon built-in algorithm estimator <https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html>`_. job_name (str or Placeholder): Specify a training job name, this is required for the training job to run. We recommend to use :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution. data: Information about the training data. Please refer to the ``fit()`` method of the associated estimator, as this can take any of the following forms: * (str) - The S3 location where training data is saved. * (dict[str, str] or dict[str, sagemaker.session.s3_input]) - If using multiple channels for training data, you can specify a dict mapping channel names to strings or :func:`~sagemaker.session.s3_input` objects. * (sagemaker.session.s3_input) - Channel configuration for S3 data sources that can provide additional information about the training dataset. See :func:`sagemaker.session.s3_input` for full details. * (sagemaker.amazon.amazon_estimator.RecordSet) - A collection of Amazon :class:`Record` objects serialized and stored in S3. For use with an estimator for an Amazon algorithm. * (list[sagemaker.amazon.amazon_estimator.RecordSet]) - A list of :class:`sagemaker.amazon.amazon_estimator.RecordSet` objects, where each instance is a different channel of training data. hyperparameters (dict, optional): Specify the hyper parameters for the training. (Default: None) mini_batch_size (int): Specify this argument only when estimator is a built-in estimator of an Amazon algorithm. For other estimators, batch size should be specified in the estimator. experiment_config (dict, optional): Specify the experiment config for the training. (Default: None) wait_for_completion (bool, optional): Boolean value set to `True` if the Task state should wait for the training job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the training job and proceed to the next step. (default: True) tags (list[dict], optional): `List to tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource. 
""" self.estimator = estimator self.job_name = job_name if wait_for_completion: kwargs[ Field.Resource.value ] = "arn:aws:states:::sagemaker:createTrainingJob.sync" else: kwargs[ Field.Resource.value ] = "arn:aws:states:::sagemaker:createTrainingJob" if isinstance(job_name, str): parameters = training_config( estimator=estimator, inputs=data, job_name=job_name, mini_batch_size=mini_batch_size, ) else: parameters = training_config( estimator=estimator, inputs=data, mini_batch_size=mini_batch_size ) if data is None and train_data is not None and test_data is not None: if isinstance(train_data, (ExecutionInput, StepInput)) and isinstance( test_data, (ExecutionInput, StepInput) ): parameters["InputDataConfig"] = [ { "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": train_data, "S3DataDistributionType": "FullyReplicated", } }, "ChannelName": "train", }, { "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": test_data, "S3DataDistributionType": "FullyReplicated", } }, "ChannelName": "test", }, ] if sm_output_data is not None: parameters["OutputDataConfig"]["S3OutputPath"] = sm_output_data if estimator.debugger_hook_config is not None: parameters[ "DebugHookConfig" ] = estimator.debugger_hook_config._to_request_dict() if estimator.rules is not None: parameters["DebugRuleConfigurations"] = [ rule.to_debugger_rule_config_dict() for rule in estimator.rules ] if sm_debug_output_data is not None: parameters["DebugHookConfig"]["S3OutputPath"] = sm_debug_output_data if isinstance(job_name, (ExecutionInput, StepInput)): parameters["TrainingJobName"] = job_name if hyperparameters is not None: if "HyperParameters" in parameters: # try to void overwriting reserved hyperparameters: # github.com/aws/sagemaker-training-toolkit/blob/ # master/src/sagemaker_training/params.py parameters["HyperParameters"].update(hyperparameters) else: parameters["HyperParameters"] = hyperparameters if isinstance(job_name, (ExecutionInput, StepInput)): parameters["HyperParameters"]["sagemaker_job_name"] = job_name if sm_submit_url is not None and isinstance( sm_submit_url, (ExecutionInput, StepInput) ): parameters["HyperParameters"]["sagemaker_submit_directory"] = sm_submit_url if sm_region is not None and isinstance(sm_region, (ExecutionInput, StepInput)): parameters["HyperParameters"]["sagemaker_region"] = sm_region if experiment_config is not None: parameters["ExperimentConfig"] = experiment_config if "S3Operations" in parameters: del parameters["S3Operations"] if tags: parameters["Tags"] = tags_dict_to_kv_list(tags) kwargs[Field.Parameters.value] = parameters # print(kwargs) super(MLMaxTrainingStep, self).__init__(state_id, **kwargs)
def test_amazon_alg_training_config_all_args(sagemaker_session): ntm_estimator = ntm.NTM( role="{{ role }}", num_topics=10, train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", train_volume_size="{{ train_volume_size }}", train_volume_kms_key="{{ train_volume_kms_key }}", train_max_run="{{ train_max_run }}", input_mode='Pipe', output_path="{{ output_path }}", output_kms_key="{{ output_volume_kms_key }}", base_job_name="{{ base_job_name }}", tags=[{"{{ key }}": "{{ value }}"}], subnets=["{{ subnet }}"], security_group_ids=["{{ security_group_ids }}"], sagemaker_session=sagemaker_session) ntm_estimator.epochs = 32 ntm_estimator.mini_batch_size = 256 record = amazon_estimator.RecordSet("{{ record }}", 10000, 100, 'S3Prefix') config = airflow.training_config(ntm_estimator, record) expected_config = { 'AlgorithmSpecification': { 'TrainingImage': '174872318107.dkr.ecr.us-west-2.amazonaws.com/ntm:1', 'TrainingInputMode': 'Pipe' }, 'OutputDataConfig': { 'S3OutputPath': '{{ output_path }}', 'KmsKeyId': '{{ output_volume_kms_key }}' }, 'TrainingJobName': "{{ base_job_name }}-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'StoppingCondition': { 'MaxRuntimeInSeconds': '{{ train_max_run }}' }, 'ResourceConfig': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': '{{ train_volume_size }}', 'VolumeKmsKeyId': '{{ train_volume_kms_key }}' }, 'RoleArn': '{{ role }}', 'InputDataConfig': [{ 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'ShardedByS3Key', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ record }}' } }, 'ChannelName': 'train' }], 'VpcConfig': { 'Subnets': ['{{ subnet }}'], 'SecurityGroupIds': ['{{ security_group_ids }}'] }, 'HyperParameters': { 'num_topics': '10', 'epochs': '32', 'mini_batch_size': '256', 'feature_dim': '100' }, 'Tags': [{'{{ key }}': '{{ value }}'}] } assert config == expected_config
def test_byo_training_config_all_args(sagemaker_session): byo = estimator.Estimator( image_name="byo", role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", train_volume_size="{{ train_volume_size }}", train_volume_kms_key="{{ train_volume_kms_key }}", train_max_run="{{ train_max_run }}", input_mode='Pipe', output_path="{{ output_path }}", output_kms_key="{{ output_volume_kms_key }}", base_job_name="{{ base_job_name }}", tags=[{"{{ key }}": "{{ value }}"}], subnets=["{{ subnet }}"], security_group_ids=["{{ security_group_ids }}"], model_uri="{{ model_uri }}", model_channel_name="{{ model_chanel }}", sagemaker_session=sagemaker_session) byo.set_hyperparameters(epochs=32, feature_dim=1024, mini_batch_size=256) data = {'train': "{{ training_data }}"} config = airflow.training_config(byo, data) expected_config = { 'AlgorithmSpecification': { 'TrainingImage': 'byo', 'TrainingInputMode': 'Pipe' }, 'OutputDataConfig': { 'S3OutputPath': '{{ output_path }}', 'KmsKeyId': '{{ output_volume_kms_key }}' }, 'TrainingJobName': "{{ base_job_name }}-{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}", 'StoppingCondition': { 'MaxRuntimeInSeconds': '{{ train_max_run }}' }, 'ResourceConfig': { 'InstanceCount': '{{ instance_count }}', 'InstanceType': 'ml.c4.2xlarge', 'VolumeSizeInGB': '{{ train_volume_size }}', 'VolumeKmsKeyId': '{{ train_volume_kms_key }}' }, 'RoleArn': '{{ role }}', 'InputDataConfig': [ { 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ training_data }}' } }, 'ChannelName': 'train' }, { 'DataSource': { 'S3DataSource': { 'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': '{{ model_uri }}' } }, 'ContentType': 'application/x-sagemaker-model', 'InputMode': 'File', 'ChannelName': '{{ model_chanel }}' } ], 'VpcConfig': { 'Subnets': ['{{ subnet }}'], 'SecurityGroupIds': ['{{ security_group_ids }}'] }, 'HyperParameters': { 'epochs': '32', 'feature_dim': '1024', 'mini_batch_size': '256'}, 'Tags': [{'{{ key }}': '{{ value }}'}] } assert config == expected_config
    'checkpointPath': '/opt/ml/checkpoints'
}

byoc_est = sagemaker.estimator.Estimator(
    '662566784674.dkr.ecr.ap-northeast-1.amazonaws.com/gw-dkn:20201114025113',
    role=sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type=train_instance_type,
    base_job_name='dkn-byoc',
    hyperparameters=hyperparameters)

train_s3 = "s3://leigh-gw/train.csv/"
test_s3 = "s3://leigh-gw/test.csv/"
inputs = {'train': train_s3, 'eval': test_s3}

train_config = training_config(estimator=byoc_est, inputs=inputs)


# step - trigger CDK to deploy model as ECS service using Airflow Python Operator
def dkn_model_deploy(data, **context):
    print("mock for dkn deployment")


default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'provide_context': True
}

dag = DAG('tensorflow_example', default_args=default_args,
hook = AwsHook(aws_conn_id='airflow-sagemaker')
region = config["job_level"]["region_name"]
sess = hook.get_session(region_name=region)
role = get_sagemaker_role_arn(config["train_model"]["sagemaker_role"],
                              sess.region_name)
container = get_image_uri(sess.region_name, 'factorization-machines')
hpo_enabled = is_hpo_enabled()

# create estimator
fm_estimator = Estimator(image_name=container,
                         role=role,
                         sagemaker_session=sagemaker.session.Session(sess),
                         **config["train_model"]["estimator_config"])

# train_config specifies SageMaker training configuration
train_config = training_config(estimator=fm_estimator,
                               inputs=config["train_model"]["inputs"])

# create tuner
fm_tuner = HyperparameterTuner(estimator=fm_estimator,
                               **config["tune_model"]["tuner_config"])

# create tuning config
tuner_config = tuning_config(tuner=fm_tuner,
                             inputs=config["tune_model"]["inputs"])

# create transform config
transform_config = transform_config_from_estimator(
    estimator=fm_estimator,
    task_id="model_tuning" if hpo_enabled else "model_training",
    task_type="tuning" if hpo_enabled else "training",
    **config["batch_transform"]["transform_config"])
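# Illustrative wiring (not from the original snippet): the configs built above are typically
# attached to Airflow SageMaker operators inside the DAG; `dag` and the task ids below are
# placeholders.
from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator
from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator

train_op = SageMakerTrainingOperator(
    task_id="model_training", config=train_config, wait_for_completion=True, dag=dag)

tune_op = SageMakerTuningOperator(
    task_id="model_tuning", config=tuner_config, wait_for_completion=True, dag=dag)

transform_op = SageMakerTransformOperator(
    task_id="batch_transform", config=transform_config, wait_for_completion=True, dag=dag)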
        # 3 m4.2xl with 8 cores each. We have to leave 1 core for ray scheduler.
        # Don't forget to change this on the basis of instance type.
        "rl.training.config.num_workers": (8 * 2) - 1
        # "rl.training.config.horizon": 5000,
        # "rl.training.config.num_sgd_iter": 10,
    }
)

# estimator.fit(wait=local_mode)
# job_name = estimator.latest_training_job.job_name
# print("Training job: %s" % job_name)

# train_config specifies SageMaker training configuration
train_config = training_config(
    estimator=estimator,
    inputs="s3://sagemaker-us-east-1-450145409201/sagemaker/DEMO-pytorch-mnist")  # MOCK
    # inputs=config["train_model"]["inputs"])

# # create tuner
# fm_tuner = HyperparameterTuner(
#     estimator=fm_estimator,
#     **config["tune_model"]["tuner_config"]
# )

# # create tuning config
# tuner_config = tuning_config(
#     tuner=fm_tuner,
#     inputs=config["tune_model"]["inputs"])

# # create transform config