Example No. 1
    def test_datajob_stack_creates_resources_on_exit_only_when_no_error_occurs(
            self, m_create_resources):
        exception_ = None
        try:
            with DataJobStack(scope=self.app,
                              id="datajob-stack-with-error") as djs:
                raise Exception("some exception")
        except Exception as e:
            exception_ = e
        self.assertEqual(m_create_resources.call_count, 0)
        self.assertIsNotNone(exception_)

        with DataJobStack(scope=self.app,
                          id="datajob-stack-without-error") as djs:
            pass
        self.assertEqual(m_create_resources.call_count, 1)
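
The test above pins down the context-manager contract of DataJobStack: resources are created only when the with block exits cleanly, and exactly once. A minimal, self-contained sketch of that pattern (a toy illustration with assumed names, not datajob's actual implementation):

class ResourceStack:
    """Toy context manager mirroring the behaviour asserted above."""

    def __init__(self):
        self.create_resources_call_count = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Create resources only when the block exited without an exception.
        if exc_type is None:
            self.create_resources()
        return False  # do not swallow the exception

    def create_resources(self):
        self.create_resources_call_count += 1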
Example No. 2
 def test_create_glue_pythonshell_successfully(self):
     djs = DataJobStack(scope=self.app, id="some-stack", stage="stg")
     glue_job = GlueJob(djs, "some-task", "some/path/task.py")
     self.assertEqual(glue_job.job_type, GlueJobType.PYTHONSHELL.value)
     self.assertEqual(glue_job.glue_version, "1.0")
     self.assertEqual(glue_job.job_path, "some/path/task.py")
     self.assertEqual(glue_job.python_version, "3")
 def test_datajob_stack_with_stage_passed_to_datajob_stack(self):
     stage_value = "some-value"
     with DataJobStack(scope=self.app,
                       id="datajob-stack-no-error",
                       stage=stage_value) as djs:
         pass
     self.assertEqual(djs.stage, stage_value)
 def test_datajob_context_initiates_without_error(self):
     exception_ = None
     try:
         app = core.App()
         djs = DataJobStack(scope=app, id="some-stack-name")
         DataJobContext(djs, unique_stack_name="some-unique-name")
     except Exception as e:
         exception_ = e
     self.assertIsNone(exception_)
 def test_datajob_stack_initiates_without_error(self):
     exception_ = None
     try:
         with DataJobStack(scope=self.app,
                           id="datajob-stack-no-error") as djs:
             pass
     except Exception as e:
         exception_ = e
     self.assertIsNone(exception_)
Example No. 6
 def test_datajob_context_initiates_without_stage(self):
     exception_ = None
     try:
         app = core.App()
         djs = DataJobStack(scope=app, id="some-stack-name")
         djc = DataJobContext(djs)
     except Exception as e:
         exception_ = e
     self.assertIsNone(exception_)
     self.assertIsNone(djc.stage)
     # a short random suffix (4 characters) is appended to the bucket names
     self.assertEqual(len(djc.data_bucket_name.split("-")[-1]), 4)
     self.assertEqual(len(djc.deployment_bucket_name.split("-")[-1]), 4)
Example No. 7
 def test_datajob_context_with_stage(self):
     exception_ = None
     try:
         stack_name = "some-stack"
         stage = "some-stage"
         app = core.App()
         djs = DataJobStack(scope=app, id=stack_name, stage=stage)
         djc = DataJobContext(djs)
         self.assertIsNotNone(djc.stage)
         self.assertEqual(djc.data_bucket_name, djs.unique_stack_name)
         self.assertEqual(djc.deployment_bucket_name,
                          f"{djs.unique_stack_name}-deployment-bucket")
     except Exception as e:
         exception_ = e
     self.assertIsNone(exception_)
Example No. 8
    def test_sagemaker_transform_step_successfully(self, m_default_bucket):

        m_default_bucket.return_value = "sagemaker-bucket-name"

        with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:
            transformer = Transformer(
                model_name="some-model",
                instance_count=1,
                instance_type="ml.t2.medium",
                sagemaker_session=self.sagemaker_session,
            )

            transform_step = TransformStep(
                datajob_stack=djs,
                name="transform-job",
                transformer=transformer,
                data="s3://some-bucket/some-data.csv",
            )

            estimator = SKLearn(
                entry_point=str(
                    pathlib.Path(current_dir, "resources", "train.py")),
                train_instance_type="ml.m5.xlarge",
                role=self.role,
                framework_version="0.20.0",
                py_version="py3",
                sagemaker_session=self.sagemaker_session,
            )

            tuner = HyperparameterTuner(
                estimator=estimator,
                hyperparameter_ranges={
                    "alpha": ContinuousParameter(0.0001, 0.05)
                },
                objective_metric_name="rmse",
            )

            tuner_step = TuningStep(
                datajob_stack=djs,
                name="tuning-step",
                tuner=tuner,
                data="s3://some-bucket/some-data.csv",
            )

            with StepfunctionsWorkflow(djs, "sequential") as sfn_workflow:
                transform_step >> tuner_step
Example No. 9
 def test_datajob_stack_with_no_stage(self):
     with DataJobStack(scope=self.app, id="datajob-stack-no-stage") as djs:
         pass
     self.assertIsNone(djs.stage)
 def test_datajob_stack_with_stage_passed_via_cli(self):
     stage_value = "some-value"
     scope = core.App(context={"stage": stage_value})
     with DataJobStack(scope=scope, id="datajob-stack-no-error") as djs:
         pass
     self.assertEqual(djs.stage, stage_value)
 def test_datajob_stack_with_no_stage(self):
     with DataJobStack(scope=self.app, id="datajob-stack-no-stage") as djs:
         pass
     self.assertEqual(djs.stage, DEFAULT_STACK_STAGE)
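
The two no-stage tests above assert different outcomes (stage is None vs. stage equals DEFAULT_STACK_STAGE), which suggests excerpts from different library versions. Together with the tests that pass a stage explicitly or via the CDK context, they imply a precedence order: explicit argument, then CDK context, then a default. A rough sketch of that resolution logic, with assumed names (not datajob's actual code):

def resolve_stage(scope, stage=None, default=None):
    """Resolve the deployment stage: explicit argument > CDK context > default."""
    if stage is not None:
        return stage
    # core.App(context={"stage": ...}) makes the value retrievable here.
    context_stage = scope.node.try_get_context("stage")
    if context_stage is not None:
        return context_stage
    # Depending on the library version, the default is None or DEFAULT_STACK_STAGE.
    return default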
"""
import pathlib
from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

current_dir = pathlib.Path(__file__).parent.absolute()

datajob_stack = DataJobStack(scope=app,
                             id="data-pipeline-pkg",
                             project_root=current_dir)
datajob_stack.init_datajob_context()

task1 = GlueJob(datajob_stack=datajob_stack,
                name="task1",
                job_path="glue_jobs/task1.py")
task2 = GlueJob(datajob_stack=datajob_stack,
                name="task2",
                job_path="glue_jobs/task2.py")

with StepfunctionsWorkflow(datajob_stack=datajob_stack,
                           name="workflow") as sfn:
    task1 >> task2

datajob_stack.create_resources()
Example No. 13
import sagemaker
from aws_cdk import core
from sagemaker import image_uris

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.sagemaker import get_default_sagemaker_role
from datajob.sagemaker.sagemaker_job import EndpointConfigStep
from datajob.sagemaker.sagemaker_job import EndpointStep
from datajob.sagemaker.sagemaker_job import ModelStep
from datajob.sagemaker.sagemaker_job import TrainingStep
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

with DataJobStack(scope=app, id="datajob-ml-pipeline-abalone") as djs:

    sagemaker_default_role = get_default_sagemaker_role(datajob_stack=djs)

    train_path = f"s3://{djs.context.data_bucket_name}/train/abalone.train"
    validation_path = (
        f"s3://{djs.context.data_bucket_name}/validation/abalone.validation")
    test_path = f"s3://{djs.context.data_bucket_name}/test/abalone.test"

    prepare_dataset_step = GlueJob(
        datajob_stack=djs,
        name="prepare-dataset",
        job_path="jobs/prepare_dataset.py",
        job_type="pythonshell",
        max_capacity=1,
        arguments={
Example No. 14
import pathlib
from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

current_dir = pathlib.Path(__file__).parent.absolute()

app = core.App()

# the datajob_stack is the instance that will result in a cloudformation stack.
# we inject the datajob_stack object through all the resources that we want to add.
with DataJobStack(scope=app, id="data-pipeline-pkg",
                  project_root=current_dir) as datajob_stack:

    # here we define 2 glue jobs with the path to the source code.
    task1 = GlueJob(datajob_stack=datajob_stack,
                    name="task1",
                    job_path="glue_jobs/task1.py")

    task2 = GlueJob(datajob_stack=datajob_stack,
                    name="task2",
                    job_path="glue_jobs/task2.py")

    # we instantiate a step functions workflow
    # and orchestrate the glue jobs.
    with StepfunctionsWorkflow(datajob_stack=datajob_stack,
                               name="workflow") as step_functions_workflow:
        task1 >> task2
Example No. 15
from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()


with DataJobStack(scope=app, id="data-pipeline-parallel") as datajob_stack:

    task1 = GlueJob(
        datajob_stack=datajob_stack, name="task1", job_path="glue_jobs/task.py"
    )
    task2 = GlueJob(
        datajob_stack=datajob_stack, name="task2", job_path="glue_jobs/task.py"
    )
    task3 = GlueJob(
        datajob_stack=datajob_stack, name="task3", job_path="glue_jobs/task.py"
    )
    task4 = GlueJob(
        datajob_stack=datajob_stack, name="task4", job_path="glue_jobs/task.py"
    )
    task5 = GlueJob(
        datajob_stack=datajob_stack, name="task5", job_path="glue_jobs/task.py"
    )

    # Task2 comes after task1. Task4 comes after task3.
    # Task5 depends on both task2 and task4 to be finished.
    # Therefore the chain task1 >> task2 can run in parallel
    # with the chain task3 >> task4.
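    # The workflow definition itself is missing from this excerpt. A minimal
    # sketch of how the dependencies described above could be expressed with
    # the >> operator, following the pattern of the other examples (an
    # assumption, not the original file's code):
    with StepfunctionsWorkflow(datajob_stack=datajob_stack,
                               name="workflow") as sfn:
        task1 >> task2
        task3 >> task4
        task2 >> task5
        task4 >> task5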
Example No. 16
import pathlib

from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

current_dir = str(pathlib.Path(__file__).parent.absolute())

app = core.App()

with DataJobStack(scope=app,
                  id="datajob-python-pyspark",
                  project_root=current_dir) as datajob_stack:

    pyspark_job = GlueJob(
        datajob_stack=datajob_stack,
        name="pyspark-job",
        job_path="glue_job/glue_pyspark_example.py",
        job_type="glueetl",
        glue_version="2.0",  # we only support glue 2.0
        python_version="3",
        worker_type="Standard",  # options are Standard / G.1X / G.2X
        number_of_workers=1,
        arguments={
            "--source":
            f"s3://{datajob_stack.context.data_bucket_name}/raw/iris_dataset.csv",
            "--destination":
            f"s3://{datajob_stack.context.data_bucket_name}/target/pyspark_job/iris_dataset.parquet",
        },
"""
same as ./datajob_stack.py but more explicit
"""
from aws_cdk import core

from datajob.datajob_stack import DataJobStack
from datajob.glue.glue_job import GlueJob
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

datajob_stack = DataJobStack(scope=app, id="data-pipeline-simple")
datajob_stack.init_datajob_context()

task1 = GlueJob(datajob_stack=datajob_stack,
                name="task1",
                job_path="glue_jobs/task1.py")
task2 = GlueJob(datajob_stack=datajob_stack,
                name="task2",
                job_path="glue_jobs/task2.py")

with StepfunctionsWorkflow(datajob_stack=datajob_stack,
                           name="workflow") as sfn:
    task1 >> task2

datajob_stack.create_resources()
app.synth()
Example No. 18
    def test_sagemaker_services_successfully(self, m_default_bucket):

        m_default_bucket.return_value = "sagemaker-bucket-name"

        with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:

            processor = SKLearnProcessor(
                framework_version="0.23-1",
                role=self.role,
                instance_type="local",
                instance_count=1,
                sagemaker_session=self.sagemaker_session,
            )

            processing_step = ProcessingStep(
                datajob_stack=djs,
                name="processing-job",
                processor=processor,
            )

            estimator = SKLearn(
                entry_point=str(
                    pathlib.Path(current_dir, "resources", "train.py")),
                train_instance_type="ml.m5.xlarge",
                role=self.role,
                framework_version="0.20.0",
                py_version="py3",
                sagemaker_session=self.sagemaker_session,
            )

            training_step = TrainingStep(
                datajob_stack=djs,
                name="training-job",
                estimator=estimator,
            )

            model_step = ModelStep(
                datajob_stack=djs,
                name="model-step",
                model=training_step.sfn_task.get_expected_model(),
            )

            endpoint_config_step = EndpointConfigStep(
                datajob_stack=djs,
                name="endpoint-config-step",
                model_name=model_step.model_name,
            )

            endpoint_step = EndpointStep(
                datajob_stack=djs,
                name="endpoint-step",
                endpoint_config_name=endpoint_config_step.name,
            )

            with StepfunctionsWorkflow(
                    djs, "sequential") as sfn_workflow_sequential:
                (processing_step >> training_step >> model_step >>
                 endpoint_config_step >> endpoint_step)

            with StepfunctionsWorkflow(djs,
                                       "parallel") as sfn_workflow_parallel:
                processing_step >> processing_step
                training_step >> training_step

        # check if we have the expected value for the execution input
        self.assertDictEqual(
            djs.execution_input.execution_input_schema,
            {
                "some-stack-stg-processing-job": str,
                "some-stack-stg-training-job": str,
                "some-stack-stg-model-step": str,
                "some-stack-stg-endpoint-config-step": str,
                "some-stack-stg-endpoint-step": str,
            },
        )
        # execution input is added to cloudformation output
        self.assertDictEqual(
            djs.outputs,
            {
                "DatajobExecutionInput":
                json.dumps([
                    "some-stack-stg-processing-job",
                    "some-stack-stg-training-job",
                    "some-stack-stg-model-step",
                    "some-stack-stg-endpoint-config-step",
                    "some-stack-stg-endpoint-step",
                ])
            },
        )
Example No. 19
 def setUp(self) -> None:
     self.app = core.App()
     self.djs = DataJobStack(scope=self.app, id="datajob-stack-no-error")
Example No. 20
import boto3
import sagemaker
from aws_cdk import core
from sagemaker.processing import ProcessingInput
from sagemaker.processing import ProcessingOutput
from sagemaker.sklearn import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn

from datajob.datajob_stack import DataJobStack
from datajob.sagemaker import get_default_sagemaker_role
from datajob.sagemaker.sagemaker_job import ModelStep
from datajob.sagemaker.sagemaker_job import ProcessingStep
from datajob.sagemaker.sagemaker_job import TrainingStep
from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow

app = core.App()

with DataJobStack(scope=app, id="datajob-ml-pipeline-scikitlearn") as djs:

    role = get_default_sagemaker_role(datajob_stack=djs)

    sagemaker_session = sagemaker.Session(boto_session=boto3.session.Session(
        region_name=djs.env.region))
    s3_bucket_base_uri = "{}{}".format("s3://",
                                       sagemaker_session.default_bucket())
    output_data = "{}/{}".format(s3_bucket_base_uri,
                                 "data/sklearn_processing/output")

    input_data = f"s3://sagemaker-sample-data-{djs.env.region}/processing/census/census-income.csv"

    input_code = sagemaker_session.upload_data(
        "resources/preprocessing.py",
        bucket=sagemaker_session.default_bucket(),