Example #1
    def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, wait_for_completion=True, **kwargs):
        """
        Args:
            state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine.
            transformer (sagemaker.transformer.Transformer): The SageMaker transformer to use in the TransformStep.
            job_name (str or Placeholder): Specify a transform job name. We recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
            model_name (str or Placeholder): Specify a model name for the transform job to use. We recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
            data (str): Input data location in S3.
            data_type (str): What the S3 location defines (default: 'S3Prefix').
                Valid values:

                * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix will
                    be used as inputs for the transform job.
                * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object
                    to use as an input for the transform job.
            content_type (str): MIME type of the input data (default: None).
            compression_type (str): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
            split_type (str): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
            wait_for_completion (bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True)
        """
        if wait_for_completion:
            kwargs[Field.Resource.value] = 'arn:aws:states:::sagemaker:createTransformJob.sync'
        else:
            kwargs[Field.Resource.value] = 'arn:aws:states:::sagemaker:createTransformJob'

        if isinstance(job_name, str):
            parameters = transform_config(
                transformer=transformer,
                data=data,
                data_type=data_type,
                content_type=content_type,
                compression_type=compression_type,
                split_type=split_type,
                job_name=job_name
            )
        else:
            parameters = transform_config(
                transformer=transformer,
                data=data,
                data_type=data_type,
                content_type=content_type,
                compression_type=compression_type,
                split_type=split_type
            )

        if isinstance(job_name, (ExecutionInput, StepInput)):
            parameters['TransformJobName'] = job_name

        parameters['ModelName'] = model_name

        kwargs[Field.Parameters.value] = parameters
        super(TransformStep, self).__init__(state_id, **kwargs)
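A minimal usage sketch for this step (the state name, model name, and S3 paths below are hypothetical), showing the ExecutionInput placeholder pattern the docstring recommends for job_name and model_name:

from sagemaker.transformer import Transformer
from stepfunctions.inputs import ExecutionInput
from stepfunctions.steps import TransformStep

# Placeholders resolved from the Step Functions execution input at run time.
execution_input = ExecutionInput(schema={'JobName': str, 'ModelName': str})

transformer = Transformer(
    model_name='my-model',
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://my-bucket/transform-output')

transform_step = TransformStep(
    'Batch Transform',
    transformer=transformer,
    job_name=execution_input['JobName'],    # unique job name per execution
    model_name=execution_input['ModelName'],
    data='s3://my-bucket/transform-input',
    content_type='text/csv')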
Example #2
def test_transform_config(sagemaker_session):
    tf_transformer = transformer.Transformer(
        model_name="tensorflow-model",
        instance_count="{{ instance_count }}",
        instance_type="ml.p2.xlarge",
        strategy="SingleRecord",
        assemble_with='Line',
        output_path="{{ output_path }}",
        output_kms_key="{{ kms_key }}",
        accept="{{ accept }}",
        max_concurrent_transforms="{{ max_parallel_job }}",
        max_payload="{{ max_payload }}",
        tags=[{"{{ key }}": "{{ value }}"}],
        env={"{{ key }}": "{{ value }}"},
        base_transform_job_name="tensorflow-transform",
        sagemaker_session=sagemaker_session,
        volume_kms_key="{{ kms_key }}")

    data = "{{ transform_data }}"

    config = airflow.transform_config(tf_transformer, data, data_type='S3Prefix', content_type="{{ content_type }}",
                                      compression_type="{{ compression_type }}", split_type="{{ split_type }}")
    expected_config = {
        'TransformJobName': "tensorflow-transform-%s" % TIME_STAMP,
        'ModelName': 'tensorflow-model',
        'TransformInput': {
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': '{{ transform_data }}'
                }
            },
            'ContentType': '{{ content_type }}',
            'CompressionType': '{{ compression_type }}',
            'SplitType': '{{ split_type }}'},
        'TransformOutput': {
            'S3OutputPath': '{{ output_path }}',
            'KmsKeyId': '{{ kms_key }}',
            'AssembleWith': 'Line',
            'Accept': '{{ accept }}'
        },
        'TransformResources': {
            'InstanceCount': '{{ instance_count }}',
            'InstanceType': 'ml.p2.xlarge',
            'VolumeKmsKeyId': '{{ kms_key }}'
        },
        'BatchStrategy': 'SingleRecord',
        'MaxConcurrentTransforms': '{{ max_parallel_job }}',
        'MaxPayloadInMB': '{{ max_payload }}',
        'Environment': {'{{ key }}': '{{ value }}'},
        'Tags': [{'{{ key }}': '{{ value }}'}]
    }

    assert config == expected_config
Example #3
    def __init__(
        self,
        state_id,
        transformer,
        job_name,
        model_name,
        data,
        outputpath,
        data_type="S3Prefix",
        content_type=None,
        compression_type=None,
        split_type=None,
        experiment_config=None,
        wait_for_completion=True,
        tags=None,
        input_filter=None,
        output_filter=None,
        join_source=None,
        **kwargs,
    ):
        """
        Args:
            state_id (str): State name whose length **must be** less than or
            equal to 128 unicode characters. State names **must be** unique
            within the scope of the whole state machine.

            transformer (sagemaker.transformer.Transformer): The SageMaker
            transformer to use in the TransformStep.

            job_name (str or Placeholder): Specify a transform job name. We
            recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput`
            placeholder collection to pass the value dynamically in each
            execution.

            model_name (str or Placeholder): Specify a model name for the
            transform job to use. We recommend using the
            :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder
            collection to pass the value dynamically in each execution.

            data (str): Input data location in S3.

            data_type (str): What the S3 location defines (default: 'S3Prefix').
                Valid values:

                * 'S3Prefix' - the S3 URI defines a key name prefix. All
                objects with this prefix will be used as inputs for the
                transform job.
                * 'ManifestFile' - the S3 URI points to a single manifest file
                listing each S3 object to use as an input for the transform job.

            content_type (str): MIME type of the input data (default: None).

            compression_type (str): Compression type of the input data, if
            compressed (default: None). Valid values: 'Gzip', None.

            split_type (str): The record delimiter for the input object
            (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and
            'TFRecord'.

            experiment_config (dict, optional): Specify the experiment config
            for the transform. (Default: None)

            wait_for_completion (bool, optional): Boolean value set to `True` if
            the Task state should wait for the transform job to complete before
            proceeding to the next step in the workflow. Set to `False` if the
            Task state should submit the transform job and proceed to the next
            step. (default: True)

            tags (list[dict], optional): `List of tags
            <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to
            associate with the resource.

            input_filter (str): A JSONPath to select a portion of the input to
            pass to the algorithm container for inference. If you omit the
            field, it gets the value ‘$’, representing the entire input. For
            CSV data, each row is taken as a JSON array, so only index-based
            JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow
            the RFC format. See Supported JSONPath Operators for a table of
            supported JSONPath operators. For more information, see the
            SageMaker API documentation for CreateTransformJob. Some examples:
                “$[1:]”, “$.features” (default: None).

            output_filter (str): A JSONPath to select a portion of the
            joined/original output to return as the output. For more
            information, see the SageMaker API documentation for
            CreateTransformJob. Some examples: “$[1:]”, “$.prediction”
            (default: None).

            join_source (str): The source of data to be joined to the transform
            output. It can be set to ‘Input’ meaning the entire input record
            will be joined to the inference result. You can use OutputFilter to
            select the useful portion before uploading to S3. (default: None).
            Valid values: Input, None.
        """
        if wait_for_completion:
            kwargs[
                Field.Resource.value
            ] = "arn:aws:states:::sagemaker:createTransformJob.sync"
        else:
            kwargs[
                Field.Resource.value
            ] = "arn:aws:states:::sagemaker:createTransformJob"

        if isinstance(job_name, str):
            parameters = transform_config(
                transformer=transformer,
                data=data,
                data_type=data_type,
                content_type=content_type,
                compression_type=compression_type,
                split_type=split_type,
                job_name=job_name,
                input_filter=input_filter,
                output_filter=output_filter,
                join_source=join_source,
            )
        else:
            parameters = transform_config(
                transformer=transformer,
                data=data,
                data_type=data_type,
                content_type=content_type,
                compression_type=compression_type,
                split_type=split_type,
                input_filter=input_filter,
                output_filter=output_filter,
                join_source=join_source,
            )

        if isinstance(job_name, (ExecutionInput, StepInput)):
            parameters["TransformJobName"] = job_name

        parameters["ModelName"] = model_name
        parameters["TransformOutput"]["S3OutputPath"] = outputpath

        if experiment_config is not None:
            parameters["ExperimentConfig"] = experiment_config

        if tags:
            parameters["Tags"] = tags_dict_to_kv_list(tags)

        # print(parameters)

        kwargs[Field.Parameters.value] = parameters
        super(MLMaxBatchTransformStep, self).__init__(state_id, **kwargs)
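The tags_dict_to_kv_list helper used above is not included in this snippet; a plausible sketch (an assumption, based on the Key/Value Tag shape the SageMaker CreateTransformJob API expects) looks like this:

def tags_dict_to_kv_list(tags):
    # Assumed behaviour: convert {'team': 'ml'} into the
    # [{'Key': 'team', 'Value': 'ml'}] list the SageMaker API expects for Tags.
    return [{'Key': key, 'Value': value} for key, value in tags.items()]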
Example #4
# train_config specifies SageMaker training configuration
train_data = create_s3_input(config['train_model']['inputs']['train'])
validation_data = create_s3_input(
    config['train_model']['inputs']['validation'])
data_channels = {'train': train_data, 'validation': validation_data}

train_config = training_config(estimator=xgb_estimator, inputs=data_channels)

# Batch inference
xgb_transformer = Transformer(
    model_name=config['batch_transform']['model_name'],
    sagemaker_session=sagemaker.session.Session(sess),
    **config['batch_transform']['transformer_config'])

transform_config = transform_config(
    transformer=xgb_transformer,
    **config['batch_transform']['transform_config'])

# =============================================================================
# define airflow DAG and tasks
# =============================================================================
# define airflow DAG
args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(
    'sagemaker-ml-pipeline',
    default_args=args,
    schedule_interval=None,
    concurrency=1,
    max_active_runs=1,
    user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)})
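The snippet stops before any operators are attached to the DAG; a sketch of the usual continuation is below (task ids are hypothetical, and the imports are the older Airflow 1.10 contrib paths; on Airflow 2 the operators live under airflow.providers.amazon.aws.operators.sagemaker):

from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator

# Launch the training job described by train_config.
train_op = SageMakerTrainingOperator(
    task_id='sagemaker_training',
    config=train_config,
    wait_for_completion=True,
    dag=dag)

# Run batch inference with the trained model once training finishes.
transform_op = SageMakerTransformOperator(
    task_id='sagemaker_transform',
    config=transform_config,
    wait_for_completion=True,
    dag=dag)

transform_op.set_upstream(train_op)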
Example #5
    def __init__(self,
                 state_id,
                 transformer,
                 job_name,
                 model_name,
                 data,
                 data_type='S3Prefix',
                 content_type=None,
                 compression_type=None,
                 split_type=None,
                 experiment_config=None,
                 wait_for_completion=True,
                 tags=None,
                 input_filter=None,
                 output_filter=None,
                 join_source=None,
                 **kwargs):
        """
        Args:
            state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine.
            transformer (sagemaker.transformer.Transformer): The SageMaker transformer to use in the TransformStep.
            job_name (str or Placeholder): Specify a transform job name. We recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
            model_name (str or Placeholder): Specify a model name for the transform job to use. We recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
            data (str or Placeholder): Input data location in S3.
            data_type (str or Placeholder): What the S3 location defines (default: 'S3Prefix').
                Valid values:

                * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix will
                    be used as inputs for the transform job.
                * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object
                    to use as an input for the transform job.
            content_type (str or Placeholder): MIME type of the input data (default: None).
            compression_type (str or Placeholder): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
            split_type (str or Placeholder): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
            experiment_config (dict or Placeholder, optional): Specify the experiment config for the transform. (Default: None)
            wait_for_completion (bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True)
            tags (list[dict] or Placeholder, optional): `List of tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource.
            input_filter (str or Placeholder): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None).
            output_filter (str or Placeholder): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None).
            join_source (str or Placeholder): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
            parameters (dict, optional): The value of this field is merged with other arguments to become the request payload for SageMaker `CreateTransformJob <https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTransformJob.html>`_.
                You can use `parameters` to override the value provided by other arguments and specify any field's value dynamically using `Placeholders <https://aws-step-functions-data-science-sdk.readthedocs.io/en/stable/placeholders.html?highlight=placeholder#stepfunctions.inputs.Placeholder>`_.

        """
        if wait_for_completion:
            """
            Example resource arn: arn:aws:states:::sagemaker:createTransformJob.sync
            """

            kwargs[Field.Resource.value] = get_service_integration_arn(
                SAGEMAKER_SERVICE_NAME, SageMakerApi.CreateTransformJob,
                IntegrationPattern.WaitForCompletion)
        else:
            """
            Example resource arn: arn:aws:states:::sagemaker:createTransformJob
            """

            kwargs[Field.Resource.value] = get_service_integration_arn(
                SAGEMAKER_SERVICE_NAME, SageMakerApi.CreateTransformJob)

        if isinstance(job_name, str):
            transform_parameters = transform_config(
                transformer=transformer,
                data=data,
                data_type=data_type,
                content_type=content_type,
                compression_type=compression_type,
                split_type=split_type,
                job_name=job_name,
                input_filter=input_filter,
                output_filter=output_filter,
                join_source=join_source)
        else:
            transform_parameters = transform_config(
                transformer=transformer,
                data=data,
                data_type=data_type,
                content_type=content_type,
                compression_type=compression_type,
                split_type=split_type,
                input_filter=input_filter,
                output_filter=output_filter,
                join_source=join_source)

        if isinstance(job_name, Placeholder):
            transform_parameters['TransformJobName'] = job_name

        transform_parameters['ModelName'] = model_name

        if experiment_config is not None:
            transform_parameters['ExperimentConfig'] = experiment_config

        if tags:
            transform_parameters['Tags'] = tags if isinstance(
                tags, Placeholder) else tags_dict_to_kv_list(tags)

        if Field.Parameters.value in kwargs and isinstance(
                kwargs[Field.Parameters.value], dict):
            # Update transform_parameters with input parameters
            merge_dicts(transform_parameters, kwargs[Field.Parameters.value])

        kwargs[Field.Parameters.value] = transform_parameters
        super(TransformStep, self).__init__(state_id, **kwargs)
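Because merge_dicts applies the caller-supplied parameters on top of the generated request, any CreateTransformJob field can be overridden through the parameters kwarg, including with placeholders. A brief sketch (the transformer object, model name, and S3 path are hypothetical):

from stepfunctions.inputs import ExecutionInput
from stepfunctions.steps import TransformStep

execution_input = ExecutionInput(schema={'MaxPayload': int})

step = TransformStep(
    'Batch Transform',
    transformer=transformer,    # a pre-built sagemaker.transformer.Transformer
    job_name='my-transform-job',
    model_name='my-model',
    data='s3://my-bucket/transform-input',
    # Merged on top of the generated request, so this placeholder overrides
    # the MaxPayloadInMB value taken from the Transformer object.
    parameters={'MaxPayloadInMB': execution_input['MaxPayload']})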
Example #6
def test_transform_config(sagemaker_session):
    tf_transformer = transformer.Transformer(
        model_name="tensorflow-model",
        instance_count="{{ instance_count }}",
        instance_type="ml.p2.xlarge",
        strategy="SingleRecord",
        assemble_with="Line",
        output_path="{{ output_path }}",
        output_kms_key="{{ kms_key }}",
        accept="{{ accept }}",
        max_concurrent_transforms="{{ max_parallel_job }}",
        max_payload="{{ max_payload }}",
        tags=[{
            "{{ key }}": "{{ value }}"
        }],
        env={"{{ key }}": "{{ value }}"},
        base_transform_job_name="tensorflow-transform",
        sagemaker_session=sagemaker_session,
        volume_kms_key="{{ kms_key }}",
    )

    data = "{{ transform_data }}"

    config = airflow.transform_config(
        tf_transformer,
        data,
        data_type="S3Prefix",
        content_type="{{ content_type }}",
        compression_type="{{ compression_type }}",
        split_type="{{ split_type }}",
    )
    expected_config = {
        "TransformJobName": "tensorflow-transform-%s" % TIME_STAMP,
        "ModelName": "tensorflow-model",
        "TransformInput": {
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "{{ transform_data }}"
                }
            },
            "ContentType": "{{ content_type }}",
            "CompressionType": "{{ compression_type }}",
            "SplitType": "{{ split_type }}",
        },
        "TransformOutput": {
            "S3OutputPath": "{{ output_path }}",
            "KmsKeyId": "{{ kms_key }}",
            "AssembleWith": "Line",
            "Accept": "{{ accept }}",
        },
        "TransformResources": {
            "InstanceCount": "{{ instance_count }}",
            "InstanceType": "ml.p2.xlarge",
            "VolumeKmsKeyId": "{{ kms_key }}",
        },
        "BatchStrategy": "SingleRecord",
        "MaxConcurrentTransforms": "{{ max_parallel_job }}",
        "MaxPayloadInMB": "{{ max_payload }}",
        "Environment": {
            "{{ key }}": "{{ value }}"
        },
        "Tags": [{
            "{{ key }}": "{{ value }}"
        }],
    }

    assert config == expected_config