Code example #1
File: emr.py Project: naturalett/incubator-liminal
    def apply_task_to_dag(self, **kwargs):
        task = kwargs['task']
        parent = kwargs.get('parent', task.parent)

        self._validate_task_type(task)

        # assuming emr already exists
        add_step = EmrAddStepsOperator(
            task_id=f'{task.task_id}_add_step',
            job_flow_id=self.job_flow_id,
            job_flow_name=self.job_flow_name,
            aws_conn_id=self.aws_conn_id,
            steps=self.__generate_emr_step(
                task.task_id, [str(x) for x in task.get_runnable_command()]),
            cluster_states=self.cluster_states,
            dag=task.dag)

        if parent:
            parent.set_downstream(add_step)

        emr_sensor_step = EmrStepSensor(
            task_id=f'{task.task_id}_watch_step',
            job_flow_id="{{ task_instance.xcom_pull('" + add_step.task_id +
            "', key='job_flow_id') }}",
            step_id="{{ task_instance.xcom_pull('" + add_step.task_id +
            "', key='return_value')[0] }}",
            aws_conn_id=self.aws_conn_id,
            dag=task.dag)

        add_step.set_downstream(emr_sensor_step)

        return emr_sensor_step
Code example #2
def add_step_to_emr(cluster_create_task, task_identifier, step_params,
                    cluster_remover, task_create_cluster, aws_connection, dag):
    """
    In case we need to add multiple steps to the cluster
    cluster_create_task: ID of task that creates a cluster
    task_identifier: ID of step
    step_params: parameters to pass to the step
    cluster_remover: task that terminates the cluster
    task_create_cluster: task that creates the cluster
    aws_connection: Connection to AWS for account credentials
    dag: DAG that is created by the user
    """
    step_adder = EmrAddStepsOperator(
        task_id=task_identifier,
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
        "', key='return_value') }}",
        aws_conn_id=aws_connection,
        steps=step_params,
        dag=dag)

    step_checker = EmrStepSensor(
        task_id=task_identifier + '_watch_step',
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
        "', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull('" + task_identifier +
        "', key='return_value')[0] }}",
        aws_conn_id=aws_connection,
        dag=dag)

    cluster_create_task.set_downstream(step_adder)
    step_adder.set_downstream(step_checker)
    step_checker.set_downstream(cluster_remover)
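
A minimal usage sketch for the helper above. The dag object and the JOB_FLOW_OVERRIDES and SPARK_STEPS names are assumptions for illustration, not part of the original example:

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,  # assumed cluster config
    aws_conn_id='aws_default',
    dag=dag)

remove_cluster = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    trigger_rule='all_done',  # assumed: terminate even if a step fails
    dag=dag)

add_step_to_emr(cluster_create_task=create_cluster,
                task_identifier='my_step',
                step_params=SPARK_STEPS,  # assumed step definitions
                cluster_remover=remove_cluster,
                task_create_cluster='create_cluster',
                aws_connection='aws_default',
                dag=dag)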
Code example #3
    def test_execute_adds_steps_to_the_job_flow_and_returns_step_ids(self):
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrAddStepsOperator(task_id='test_task',
                                           job_flow_id='j-8989898989',
                                           aws_conn_id='aws_default')

            self.assertEqual(operator.execute(None), ['s-2LH3R5GW3A53T'])
Code example #4
    def test_execute_adds_steps_to_the_job_flow_and_returns_step_ids(self):
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_id='j-8989898989',
                aws_conn_id='aws_default'
            )

            self.assertEqual(operator.execute(None), ['s-2LH3R5GW3A53T'])
Code example #5
    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrAddStepsOperator(task_id='test_task',
                                            job_flow_id='j-8989898989',
                                            aws_conn_id='aws_default',
                                            steps=self._config,
                                            dag=DAG('test_dag_id',
                                                    default_args=args))
Code example #6
    def emr_operator(self, dag):
        spark_submit_command = [
            "spark-submit",
            "--master",
            "yarn",
            "--name",
            "{{task.task_id}}",
            "script.py",
            "input.csv",
            "output.csv",
        ]

        step_command = dict(
            Name="{{task.task_id}}",
            ActionOnFailure="CONTINUE",
            HadoopJarStep=dict(Jar="command-runner.jar",
                               Args=spark_submit_command),
        )

        operator = EmrAddStepsOperator(task_id="emr_task",
                                       job_flow_id=1,
                                       steps=[step_command],
                                       dag=dag)
        track_dag(dag)
        return operator
Code example #7
File: test_core.py Project: angadsingh/airflow-ditto
    def _get_test_dag(self):
        with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
            op1 = SparkSubmitOperator(task_id='op1')
            op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
            op3 = S3ListOperator(task_id='op3', bucket='foo')
            op4 = EmrCreateJobFlowOperator(task_id='op4')
            op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
            op6 = FileToWasbOperator(task_id='op6',
                                     container_name='foo',
                                     blob_name='foo',
                                     file_path='foo')
            op7 = EmailOperator(task_id='op7',
                                subject='foo',
                                to='foo',
                                html_content='foo')
            op8 = S3CopyObjectOperator(task_id='op8',
                                       dest_bucket_key='foo',
                                       source_bucket_key='foo')
            op9 = BranchPythonOperator(task_id='op9', python_callable=print)
            op10 = PythonOperator(task_id='op10', python_callable=range)

            op1 >> [op2, op3, op4]
            op2 >> [op5, op6]
            op6 >> [op7, op8, op9]
            op3 >> [op7, op8]
            op8 >> [op9, op10]

        return dag
Code example #8
def poke():
    hook = hooks.S3_hook.S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)
    # step_adder and step_checker are assumed to be module-level lists.
    for i in range(len(loop)):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['hive', '-e', loop[i]]
            }
        }]
        step_addr = EmrAddStepsOperator(task_id='add_steps' + str(i),
                                        job_flow_id=job_flow_id,
                                        aws_conn_id=aws_conn_id,
                                        steps=steps,
                                        dag=dag)
        step_adder.append(step_addr)
        # Pull the step id from the matching add_steps<i> task, not a
        # fixed 'add_steps' task id.
        step_checkr = EmrStepSensor(
            task_id='watch_step' + str(i),
            job_flow_id=job_flow_id,
            step_id="{{ task_instance.xcom_pull('add_steps" + str(i) +
            "', key='return_value')[0] }}",
            aws_conn_id=aws_conn_id,
            dag=dag)
        step_checker.append(step_checkr)
Code example #9
    def emr_operator(self, dag):
        spark_submit_command = [
            "spark-submit",
            "--master",
            "yarn",
            "--name",
            "{{task.task_id}}",
            "script.py",
            "input.csv",
            "output.csv",
        ]

        step_command = dict(
            Name="{{task.task_id}}",
            ActionOnFailure="CONTINUE",
            HadoopJarStep=dict(Jar="command-runner.jar", Args=spark_submit_command),
        )

        operator = EmrAddStepsOperator(
            task_id="emr_task", job_flow_id=1, steps=[step_command], dag=dag
        )
        env = {
            "AIRFLOW_CTX_DAG_ID": "test_dag",
            "AIRFLOW_CTX_TASK_ID": "emr_task",
            "AIRFLOW_CTX_EXECUTION_DATE": "1970-01-01T0000.000",
            "AIRFLOW_CTX_TRY_NUMBER": "1",
        }

        add_tracking_to_submit_task(env, operator)
        return operator
Code example #10
    def test_init_with_nonexistent_cluster_name(self):
        cluster_name = 'test_cluster'

        with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
                as mock_get_cluster_id_by_name:
            mock_get_cluster_id_by_name.return_value = None

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_name=cluster_name,
                cluster_states=['RUNNING', 'WAITING'],
                aws_conn_id='aws_default',
                dag=DAG('test_dag_id', default_args=self.args))

            with self.assertRaises(AirflowException) as error:
                operator.execute(self.mock_context)
            self.assertEqual(str(error.exception),
                             f'No cluster found for name: {cluster_name}')
Code example #11
    def test_init_with_cluster_name(self):
        expected_job_flow_id = 'j-1231231234'

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            # Mock the cluster-name lookup so it resolves to the expected id.
            with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
                    as mock_get_cluster_id_by_name:
                mock_get_cluster_id_by_name.return_value = expected_job_flow_id

                operator = EmrAddStepsOperator(
                    task_id='test_task',
                    job_flow_name='test_cluster',
                    cluster_states=['RUNNING', 'WAITING'],
                    aws_conn_id='aws_default',
                    dag=DAG('test_dag_id', default_args=self.args)
                )

                operator.execute(self.mock_context)

            ti = self.mock_context['ti']

            ti.xcom_push.assert_any_call(key='job_flow_id', value=expected_job_flow_id)
Code example #12
    def setUp(self):
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        self.mock_context = MagicMock()

        self.operator = EmrAddStepsOperator(task_id='test_task',
                                            job_flow_id='j-8989898989',
                                            aws_conn_id='aws_default',
                                            steps=self._config,
                                            dag=DAG('test_dag_id',
                                                    default_args=self.args))
Code example #13
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps=self._config,
            dag=DAG('test_dag_id', default_args=args)
        )
Code example #14
def poke(run_this, t2):
    hook = S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)
    X = 0 if loop is None else len(loop)
    for i in range(X):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    'hive', '-e',
                    'msck repair table dataplatform.task_fact_daily_agg_entity'
                ]
            }
        }]
        step_addr = EmrAddStepsOperator(task_id='add_steps_' + str(i),
                                        job_flow_id=job_flow_id,
                                        aws_conn_id=aws_conn_id,
                                        steps=steps,
                                        dag=dag)

        # Pull the step id from the matching add_steps_<i> task, not a
        # fixed 'add_steps' task id.
        step_checkr = EmrStepSensor(
            task_id='watch_step_' + str(i),
            job_flow_id=job_flow_id,
            step_id="{{ task_instance.xcom_pull('add_steps_" + str(i) +
            "', key='return_value')[0] }}",
            aws_conn_id=aws_conn_id,
            dag=dag)
        run_this >> step_addr >> step_checkr >> t2
Code example #15
def add_import_raw_dataset_step_job(**kwargs):
    steps = [{
        "Name": kwargs.get("step_name"),
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
            "Jar":
            "command-runner.jar",
            "Args": [
                "spark-submit",
                "--deploy-mode",
                "cluster",
                "--num-executors",
                kwargs.get("num_executors", "1"),
                "--executor-cores",
                kwargs.get("executor_cores", "4"),
                "--driver-memory",
                kwargs.get("driver_memory", "4g"),
                "--executor-memory",
                kwargs.get("executor_memory", "8g"),
                "--class",
                kwargs.get("main_class"),
                spark_app_jar_location,
                kwargs.get("source_path"),
                kwargs.get("source_format"),
                kwargs.get("target_path"),
                kwargs.get("target_format"),
            ],
        },
    }]
    job_flow_id = kwargs["ti"].xcom_pull(task_ids="create_cluster_emr_job",
                                         key="job_flow_id")

    step_ids = EmrAddStepsOperator(
        task_id=kwargs.get("task_id"),
        aws_conn_id="aws_default",
        job_flow_id=job_flow_id,
        steps=steps,
        dag=dag,
    ).execute(kwargs)

    kwargs["ti"].xcom_push(key=kwargs.get("step_id_key"), value=step_ids[0])
Code example #16
File: test_core.py Project: angadsingh/airflow-ditto
    def transform(self, subdag: nx.DiGraph,
                  parent_fragment: DAGFragment) -> DAGFragment:
        subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
        first_root = subdag_roots[0].task_id

        task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

        TestSubDagTransformer1.op1 = SparkSubmitOperator(
            task_id=f"t{task_id_prefix}p1", dag=self.dag)
        TestSubDagTransformer1.op2 = EmrAddStepsOperator(
            task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
        TestSubDagTransformer1.op3 = S3ListOperator(
            task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
        TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
            task_id=f"t{task_id_prefix}p4", dag=self.dag)
        TestSubDagTransformer1.op5 = DummyOperator(
            task_id=f"t{task_id_prefix}p5", dag=self.dag)

        TestSubDagTransformer1.op1 >> [
            TestSubDagTransformer1.op2, TestSubDagTransformer1.op3
        ] >> TestSubDagTransformer1.op4

        return DAGFragment(
            [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
Code example #17
create_EMR_instance = EmrCreateJobFlowOperator(
    task_id="create_EMR_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_default",
    emr_conn_id="emr_default",
    dag=dag
)

# Add your steps to the EMR cluster
EMR_step_adder = EmrAddStepsOperator(
    task_id="EMR_step_adder",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_EMR_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEPS,
    params={  # these params are used to provide the parameters for the steps JSON above
        "bucket_name": BUCKET_NAME,
        "s3_data": S3_DATA_BUCKET,
        "s3_script_bucket": S3_SCRIPT_BUCKET,
        "s3_output": S3_ANALYTICS_BUCKET,
    },
    dag=dag,
)


# get the number of the final step
final_EMR_step = len(SPARK_STEPS) - 1
# wait for the steps to complete - concatenation is used here because
# str.format() collapses the doubled {{ }} in the Jinja template string
EMR_step_checker = EmrStepSensor(
    task_id="EMR_step_checker",
    job_flow_id="{{ task_instance.xcom_pull('create_EMR_cluster', key='return_value') }}",
Code example #18
)

skip_emr_database_creation = DummyOperator(
    task_id="skip_emr_database_creation",
    trigger_rule=TriggerRule.NONE_FAILED,
    dag=dag,
)

create_emr_database_cluster = EmrCreateJobFlowOperator(
    task_id='create_emr_database_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)
create_emr_database_step = EmrAddStepsOperator(
    task_id='create_emr_database_step',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    steps=CREATE_DATABASE,
)
create_emr_database_sensor = EmrStepSensor(
    task_id='create_emr_database_sensor',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_database_step', key='return_value')[0] }}",
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    aws_conn_id='aws_default',
)

terminate_emr_cluster = EmrTerminateJobFlowOperator(
    task_id='terminate_emr_cluster',
Code example #19
    task_id="create_emr_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_default",
    emr_conn_id="emr_default",
    dag=dag,
)

# Add your steps to the EMR cluster
step_one = EmrAddStepsOperator(
    task_id="step_one",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEPS,
    params={  # these params are used to fill the parameterized values in SPARK_STEPS json
        "bucket_name": bucket_name,
        "scripts_path_key": scripts_path_key + "/",
        "data_path_key": data_path_key,
        "processed_tables_key": processed_tables_key
    },
    dag=dag,
)

last_step = len(SPARK_STEPS) - 1  # this value will let the sensor know the last step to watch
# wait for the steps to complete
step_checker = EmrStepSensor(
    task_id="watch_step",
    job_flow_id=
Code example #20
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *'
)

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
Code example #21
with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(minutes=15),
    start_date=days_ago(1),
    schedule_interval='@once',
    tags=['emr'],
) as dag:

    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=JOB_FLOW_OVERRIDES
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )

    cluster_creator >> step_adder >> step_checker
Code example #22
        'key': 'scripts/random_text_classification.py',
    },
)

with open(movie_clean_emr_steps) as json_file:
    emr_steps = json.load(json_file)

# adding our EMR steps to an existing EMR cluster
add_emr_steps = EmrAddStepsOperator(
    dag=dag,
    task_id='add_emr_steps',
    job_flow_id=EMR_ID,
    aws_conn_id='aws_default',
    steps=emr_steps,
    params={
        'BUCKET_NAME': BUCKET_NAME,
        'movie_review_load': movie_review_load_folder,
        'text_classifier_script': text_classifier_script,
        'movie_review_stage': movie_review_stage
    },
    depends_on_past=True)

last_step = len(emr_steps) - 1

# sensing if the last step is complete
clean_movie_review_data = EmrStepSensor(
    dag=dag,
    task_id='clean_movie_review_data',
    job_flow_id=EMR_ID,
Code example #23
class TestEmrAddStepsOperator(unittest.TestCase):
    # When
    _config = [{
        'Name': 'test_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                '/usr/lib/spark/bin/run-example',
                '{{ macros.ds_add(ds, -1) }}',
                '{{ ds }}'
            ]
        }
    }]

    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps=self._config,
            dag=DAG('test_dag_id', default_args=args)
        )

    def test_init(self):
        self.assertEqual(self.operator.job_flow_id, 'j-8989898989')
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                    DEFAULT_DATE.strftime("%Y-%m-%d"),
                ]
            }
        }]

        self.assertListEqual(self.operator.steps, expected_args)

    def test_execute_returns_step_id(self):
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(None), ['s-2LH3R5GW3A53T'])
Code example #24
        dag_id='emr_job_movies_dag_parallel',
        default_args=DEFAULT_ARGS,
        concurrency=3,
        dagrun_timeout=timedelta(hours=2),
        schedule_interval=None
) as dag:
    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default'
    )

    add_step_1 = EmrAddStepsOperator(
        task_id='movie_analytics_job_1',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEP_1_PROPS
    )

    wait_for_step_1 = EmrStepSensor(
        task_id='wait_for_analytics_completion_1',
        job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='movie_analytics_job_1', key='return_value')[0] }}",
        aws_conn_id='aws_default'
    )

    add_step_2 = EmrAddStepsOperator(
        task_id='movie_analytics_job_2',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEP_2_PROPS
Code example #25
    add_steps_to_cluster_op = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        aws_conn_id=get_config('emr')['aws_conn_id'],
        steps=[
            {
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': '{{ params.hadoop_jar_path }}',
                    'Args': [
                        '10'
                    ],
                    'MainClass': 'org.apache.spark.examples.SparkPi'
                }
            },
            {
                'Name': 'Copy Publisher Output from EMR HDFS to S3',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Args': [
                        's3-dist-cp',
                        '--src={{ params.dist_cp_src }}',
                        '--dest={{ params.dist_cp_target }}'
                    ],
                    'Jar': 'command-runner.jar'
                }
            }
        ]
    )
Code example #27
class TestEmrAddStepsOperator(unittest.TestCase):
    # When
    _config = [{
        'Name': 'test_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                '/usr/lib/spark/bin/run-example',
                '{{ macros.ds_add(ds, -1) }}', '{{ ds }}'
            ]
        }
    }]

    def setUp(self):
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        self.mock_context = MagicMock()

        self.operator = EmrAddStepsOperator(task_id='test_task',
                                            job_flow_id='j-8989898989',
                                            aws_conn_id='aws_default',
                                            steps=self._config,
                                            dag=DAG('test_dag_id',
                                                    default_args=self.args))

    def test_init(self):
        self.assertEqual(self.operator.job_flow_id, 'j-8989898989')
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                    DEFAULT_DATE.strftime("%Y-%m-%d"),
                ]
            }
        }]

        self.assertListEqual(self.operator.steps, expected_args)

    def test_execute_returns_step_id(self):
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(self.mock_context),
                             ['s-2LH3R5GW3A53T'])

    def test_init_with_cluster_name(self):
        expected_job_flow_id = 'j-1231231234'

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
                    as mock_get_cluster_id_by_name:
                mock_get_cluster_id_by_name.return_value = expected_job_flow_id

                operator = EmrAddStepsOperator(
                    task_id='test_task',
                    job_flow_name='test_cluster',
                    cluster_states=['RUNNING', 'WAITING'],
                    aws_conn_id='aws_default',
                    dag=DAG('test_dag_id', default_args=self.args))

                operator.execute(self.mock_context)

        ti = self.mock_context['ti']

        ti.xcom_push.assert_called_once_with(key='job_flow_id',
                                             value=expected_job_flow_id)

    def test_init_with_nonexistent_cluster_name(self):
        cluster_name = 'test_cluster'

        with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
                as mock_get_cluster_id_by_name:
            mock_get_cluster_id_by_name.return_value = None

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_name=cluster_name,
                cluster_states=['RUNNING', 'WAITING'],
                aws_conn_id='aws_default',
                dag=DAG('test_dag_id', default_args=self.args))

            with self.assertRaises(AirflowException) as error:
                operator.execute(self.mock_context)
            self.assertEqual(str(error.exception),
                             f'No cluster found for name: {cluster_name}')
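
The last two tests above pin down the contract that code example #1 relies on: when EmrAddStepsOperator resolves a cluster by job_flow_name, it pushes the resolved id to XCom under key='job_flow_id', so a downstream sensor can be templated against either that key or the returned step ids. A short sketch of such a sensor (task ids assumed for illustration):

watch = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('test_task', key='job_flow_id') }}",
    step_id="{{ task_instance.xcom_pull('test_task', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)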
Code example #28
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        start_date=days_ago(1),
        schedule_interval=None,
        tags=["emr", "spark", "pyspark"],
) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow",
        job_flow_overrides=get_object(
            "job_flow_overrides/job_flow_overrides.json", work_bucket),
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=get_object("emr_steps/emr_steps.json", work_bucket),
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    cluster_creator >> step_adder >> step_checker
Code example #29
    file_sensor = S3KeySensor(task_id='file_sensor',
                              poke_interval=600,
                              timeout=1000,
                              soft_fail=False,
                              bucket_name='ds-afarrell',
                              bucket_key='manybla.txt')

    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_benchmarks_connection')

    run_some_pyspark = EmrAddStepsOperator(
        task_id='run_some_pyspark',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=EMR_STEP_1)

    output_file_sensor = S3KeySensor(
        task_id='output_file_sensor',
        poke_interval=600,
        timeout=1000,
        soft_fail=False,
        bucket_name='ds-afarrell',
        bucket_key='hello_world_was_written/_SUCCESS')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id="cluster_remover",
        job_flow_id=
        "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
Code example #30
check_data_exists_task = PythonOperator(task_id='check_data_exists',
                                        python_callable=check_data_exists,
                                        provide_context=False,
                                        dag=dag)

create_job_flow_task = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    job_flow_overrides=default_emr_settings,
    dag=dag)

copy_python_script = EmrAddStepsOperator(
    task_id='copy_script',
    # XComs let tasks exchange messages
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=copy_script_step,
    dag=dag)

watch_prev_step_task1 = EmrStepSensor(
    task_id='watch_prev_step1',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull('copy_script', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

run_spark_job = EmrAddStepsOperator(
    task_id='run_spark_job',
Code example #31
    # Create an EMR cluster
    create_emr_cluster = EmrCreateJobFlowOperator(
        task_id="create_emr_cluster",
        job_flow_overrides=get_job_flow_overrides(job_flow_overrides),
        aws_conn_id="aws_default",
        emr_conn_id="emr_default")

    # Add steps to the EMR cluster
    # Step 1 = ETL Pipeline
    # Step 2 = Data Quality Test
    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=SPARK_STEPS,
        params={
            "bucket_name": bucket_name,
            "s3_etl_script": s3_etl_script,
            "s3_dq_script": s3_dq_script
        })

    # wait for the steps to complete
    last_step = len(SPARK_STEPS) - 1
    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id=
        "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')["
        + str(last_step) + "] }}",
Code example #32
dag = DAG('emr_job_flow_manual_steps_dag',
          default_args=DEFAULT_ARGS,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=
Code example #33
JOB_FLOW_OVERRIDES = {"Name": "MoviesAnalytics"}

with DAG(dag_id='emr_job_movies_dag',
         default_args=DEFAULT_ARGS,
         dagrun_timeout=timedelta(hours=2),
         schedule_interval=None) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='movie_analytics_job',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='wait_for_analytics_completion',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='movie_analytics_job', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
Code example #34
# Create an EMR cluster
create_emr_cluster = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_default",
    emr_conn_id="emr_default",
    dag=dag,
)

# Add your steps to the EMR cluster
load_us_states_step = EmrAddStepsOperator(
    task_id="load_us_states_steps",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=LOAD_US_STATES_STEP,
    params={  # these params are used to fill the parameterized values in SPARK_STEPS json
        "BUCKET_NAME": BUCKET_NAME,
    },
    dag=dag,
)

# wait for the steps to complete
us_states_step_checker = EmrStepSensor(
    task_id="us_states_step_checker",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='load_us_states_steps', key='return_value')[0] }}",
    aws_conn_id="aws_default",
    dag=dag,
)