Example #1
def add_step_to_emr(cluster_create_task, task_identifier, step_params,
                    cluster_remover, task_create_cluster, aws_connection, dag):
    """
    In case we need to add multiple steps to the cluster
    cluster_create_task: ID of task that creates a cluster
    task_identifier: ID of step
    step_params: parameters to pass to the step
    cluster_remover: task that terminates the cluster
    task_create_cluster: task that creates the cluster
    aws_connection: Connection to AWS for account credentials
    dag: DAG that is created by the user
    """
    step_adder = EmrAddStepsOperator(
        task_id=task_identifier,
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
        "', key='return_value') }}",
        aws_conn_id=aws_connection,
        steps=step_params,
        dag=dag)

    step_checker = EmrStepSensor(
        task_id=task_identifier + '_watch_step',
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
        "', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull('" + task_identifier +
        "', key='return_value')[0] }}",
        aws_conn_id=aws_connection,
        dag=dag)

    cluster_create_task.set_downstream(step_adder)
    step_adder.set_downstream(step_checker)
    step_checker.set_downstream(cluster_remover)
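
A minimal usage sketch for the helper above, assuming a cluster-creating task, a cluster-terminating task, a DAG object and a step definition already exist; all names below are illustrative placeholders, not taken from the original example.

# Hypothetical usage of add_step_to_emr; cluster_creator, cluster_terminator and dag
# are assumed to be defined elsewhere in the DAG file
example_step = [{
    'Name': 'example_step',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['spark-submit', 's3://my-bucket/jobs/example_job.py']  # placeholder job
    }
}]

add_step_to_emr(cluster_create_task=cluster_creator,
                task_identifier='add_example_step',
                step_params=example_step,
                cluster_remover=cluster_terminator,
                task_create_cluster='create_job_flow',
                aws_connection='aws_default',
                dag=dag)
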
class TestEmrStepSensor(unittest.TestCase):
    def setUp(self):
        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=0,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)

    def test_step_completed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_COMPLETED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.sensor.execute(None)

            self.assertEqual(self.emr_client_mock.describe_step.call_count, 2)
            calls = [
                unittest.mock.call(ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N'),
                unittest.mock.call(ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N')
            ]
            self.emr_client_mock.describe_step.assert_has_calls(calls)

    def test_step_cancelled(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_CANCELLED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_failed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_FAILED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_interrupted(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_INTERRUPTED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)
class TestEmrStepSensor(unittest.TestCase):
    def setUp(self):
        configuration.load_test_config()

        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=1,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)

    def test_step_completed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_COMPLETED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.sensor.execute(None)

            self.assertEqual(self.emr_client_mock.describe_step.call_count, 2)
            self.emr_client_mock.describe_step.assert_called_with(
                ClusterId='j-8989898989',
                StepId='s-VK57YR1Z9Z5N'
            )

    def test_step_cancelled(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_CANCELLED_RETURN
        ]

        self.boto3_client_mock = MagicMock(return_value=self.emr_client_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_interrupted(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_INTERRUPTED_RETURN
        ]

        self.boto3_client_mock = MagicMock(return_value=self.emr_client_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)
    def setUp(self):
        configuration.load_test_config()

        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=1,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )
def main_summary_subdag_factory(parent_dag, task_id, day):
    ds = "{{{{ macros.ds_format(macros.ds_add(ds, {0}), '%Y-%m-%d', '%Y%m%d') }}}}".format(day)
    subdag = DAG("{}.{}".format(parent_dag.dag_id, task_id),
                 schedule_interval=SCHEDULE_INTERVAL,
                 start_date=START_DATE,
                 default_args=default_args)

    parent_job_flow_id = ("{{{{ task_instance.xcom_pull('setup_backfill_cluster', "
                          "key='return_value', dag_id={}) }}}}".format(parent_dag.dag_id))

    # Try to alleviate throttling issues by introducing some slight jitter on each of the days
    timedelta_task = TimeDeltaSensor(
        task_id="day_start_jitter",
        delta=timedelta(seconds=day),
        dag=subdag
    )

    add_step_task = EmrAddStepsOperator(
        task_id='submit_main_summary_day',
        job_flow_id=parent_job_flow_id,
        execution_timeout=timedelta(minutes=10),
        aws_conn_id='aws_default',
        steps=EmrAddStepsOperator.get_step_args(
            job_name="main_summary {}".format(ds),
            owner="*****@*****.**",
            action_on_failure='CONTINUE',
            uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
            env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
                "from": ds,
                "to": ds,
                "bucket": "telemetry-backfill"
            }, {
                "DO_ASSEMBLY": "False"
            }),
        ),
        dag=subdag
    )

    step_sensor_task = EmrStepSensor(
        task_id="main_summary_step_sensor",
        timeout=timedelta(hours=10).total_seconds(),
        job_flow_id=parent_job_flow_id,
        step_id="{{ task_instance.xcom_pull('submit_main_summary_day', key='return_value') }}",
        poke_interval=timedelta(minutes=5).total_seconds(),
        dag=subdag
    )

    step_sensor_task.set_upstream(add_step_task)
    add_step_task.set_upstream(timedelta_task)

    return subdag
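
A sketch of how a factory like this is typically attached to the parent DAG via SubDagOperator; the parent DAG object (main_dag) and the range of day offsets are assumptions for illustration, not taken from the original example.

# Hypothetical wiring of the subdag factory into a parent DAG
from airflow.operators.subdag_operator import SubDagOperator

for day_offset in range(30):  # illustrative number of backfill days
    subdag_task_id = "main_summary_day_{}".format(day_offset)
    SubDagOperator(
        subdag=main_summary_subdag_factory(main_dag, subdag_task_id, day_offset),
        task_id=subdag_task_id,
        dag=main_dag)
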
    def setUp(self):
        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=0,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)
    def setUp(self):
        configuration.load_test_config()

        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(task_id='test_task',
                                    poke_interval=1,
                                    job_flow_id='j-8989898989',
                                    step_id='s-VK57YR1Z9Z5N',
                                    aws_conn_id='aws_default',
                                    region_name='ap-southeast-1')

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)
Example #8
def poke():
    hook = hooks.S3_hook.S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)
    for i in range(0, len(loop)):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['hive', '-e', loop[i]]
            }
        }]
        step_addr = EmrAddStepsOperator(task_id='add_steps' + str(i),
                                        job_flow_id="j-2ASQREUMPJ0Y7",
                                        aws_conn_id='aws_emr',
                                        steps=steps,
                                        dag=dag)
        step_adder.append(step_addr)
        step_checkr = EmrStepSensor(
            task_id='watch_step' + str(i),
            job_flow_id="j-2ASQREUMPJ0Y7",
            step_id="{{ task_instance.xcom_pull('add_steps" + str(i) +
            "', key='return_value')[0] }}",
            aws_conn_id='aws_emr',
            dag=dag)
        step_checker.append(step_checkr)
Example #9
    def apply_task_to_dag(self, **kwargs):
        task = kwargs['task']
        parent = kwargs.get('parent', task.parent)

        self._validate_task_type(task)

        # assuming the EMR cluster already exists
        add_step = EmrAddStepsOperator(
            task_id=f'{task.task_id}_add_step',
            job_flow_id=self.job_flow_id,
            job_flow_name=self.job_flow_name,
            aws_conn_id=self.aws_conn_id,
            steps=self.__generate_emr_step(
                task.task_id, [str(x) for x in task.get_runnable_command()]),
            cluster_states=self.cluster_states,
            dag=task.dag)

        if task.parent:
            parent.set_downstream(add_step)

        emr_sensor_step = EmrStepSensor(
            task_id=f'{task.task_id}_watch_step',
            job_flow_id="{{ task_instance.xcom_pull('" + add_step.task_id +
            "', key='job_flow_id') }}",
            step_id="{{ task_instance.xcom_pull('" + add_step.task_id +
            "', key='return_value')[0] }}",
            aws_conn_id=self.aws_conn_id,
            dag=task.dag)

        add_step.set_downstream(emr_sensor_step)

        return emr_sensor_step
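
For context, the step definitions that EmrAddStepsOperator consumes (and that the private __generate_emr_step helper above presumably builds) follow the standard EMR step shape seen throughout these examples; a minimal illustrative sketch:

# Illustrative EMR step definition in the shape EmrAddStepsOperator expects;
# the name and command below are placeholders, not the helper's actual output
example_steps = [{
    'Name': 'my_task_run_job',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['spark-submit', 's3://my-bucket/jobs/my_job.py']
    }
}]
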
Example #10
def create_dag():
    with DAG(dag_id='emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:

        create_cluster_op = EmrCreateJobFlowOperator(
            task_id='create_cluster',
            job_flow_overrides={'Name': CLUSTER_NAME},
            aws_conn_id=AWS_CONN_ID,
            emr_conn_id=EMR_CONN_ID)

        add_steps_to_cluster_op = TemplatedEmrAddStepsOperator(
            task_id='add_steps',
            job_flow_id=
            "{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID,
            steps=[{
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 's3://psm-poc-dmp-temp/spark-examples.jar',
                    'Args': ['10'],
                    'MainClass': 'org.apache.spark.examples.SparkPi'
                }
            }])

        monitor_cluster_op = EmrJobFlowSensor(
            task_id='monitor_cluster',
            retries=0,
            aws_conn_id=AWS_CONN_ID,
            job_flow_id=
            '{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
            timeout=1800)

        monitor_step_op = EmrStepSensor(
            task_id='watch_step',
            job_flow_id=
            "{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
            step_id=
            "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
            aws_conn_id=AWS_CONN_ID)

        terminate_cluster_op = EmrTerminateJobFlowOperator(
            task_id='remove_cluster',
            job_flow_id=
            "{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID)

        handle_failure_op = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

        create_cluster_op >> monitor_cluster_op >> handle_failure_op
        create_cluster_op >> add_steps_to_cluster_op >> monitor_step_op >> terminate_cluster_op

    return dag
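
Airflow only registers DAG objects that end up in the module's global namespace, so a factory like this is normally invoked at module level; a minimal sketch (the variable name is arbitrary):

# Hypothetical module-level invocation so the scheduler can discover the DAG
dag = create_dag()
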
    def test_execute_calls_with_the_job_flow_id_and_step_id_until_it_reaches_a_terminal_state(self):
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrStepSensor(
                task_id='test_task',
                poke_interval=1,
                job_flow_id='j-8989898989',
                step_id='s-VK57YR1Z9Z5N',
                aws_conn_id='aws_default',
            )

            operator.execute(None)

            # make sure describe_step was called twice
            self.assertEqual(self.mock_emr_client.describe_step.call_count, 2)

            # make sure it was called with the job_flow_id and step_id
            self.mock_emr_client.describe_step.assert_called_with(ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N')
Example #12
    def test_execute_calls_with_the_job_flow_id_and_step_id_until_it_reaches_a_terminal_state(
            self):
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrStepSensor(
                task_id='test_task',
                poke_interval=1,
                job_flow_id='j-8989898989',
                step_id='s-VK57YR1Z9Z5N',
                aws_conn_id='aws_default',
            )

            operator.execute(None)

            # make sure describe_step was called twice
            self.assertEqual(self.mock_emr_client.describe_step.call_count, 2)

            # make sure it was called with the job_flow_id and step_id
            self.mock_emr_client.describe_step.assert_called_with(
                ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N')
class TestEmrStepSensor(unittest.TestCase):
    def setUp(self):
        configuration.load_test_config()

        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=1,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

    def test_step_completed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_COMPLETED_RETURN
        ]

        self.boto3_client_mock = MagicMock(return_value=self.emr_client_mock)

        with patch('boto3.client', self.boto3_client_mock):
            self.sensor.execute(None)

            self.assertEqual(self.emr_client_mock.describe_step.call_count, 2)
            self.emr_client_mock.describe_step.assert_called_with(
                ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N')

    def test_step_cancelled(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_CANCELLED_RETURN
        ]

        self.boto3_client_mock = MagicMock(return_value=self.emr_client_mock)

        with patch('boto3.client', self.boto3_client_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)
    def setUp(self):
        configuration.load_test_config()

        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=1,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)
Example #15
def poke(run_this, t2):
    hook = S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)
    # X = 5 if not loop is None else len(loop)
    X = 0 if loop is None else len(loop)
    for i in range(0, X):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar':
                'command-runner.jar',
                'Args': [
                    'hive', '-e',
                    'msck repair table dataplatform.task_fact_daily_agg_entity'
                ]
            }
        }]
        # t3= BashOperator(
        # task_id='ag' + str(i),
        # bash_command='echo "success"',
        # dag=dag)
        # run_this>>t3>>t2
        step_addr = EmrAddStepsOperator(task_id='add_steps_' + str(i),
                                        job_flow_id="j-2ASQREUMPJ0Y7",
                                        aws_conn_id='aws_emr',
                                        steps=steps,
                                        dag=dag)

        step_checkr = EmrStepSensor(
            task_id='watch_step_' + str(i),
            job_flow_id="j-2ASQREUMPJ0Y7",
            step_id="{{ task_instance.xcom_pull('add_steps_" + str(i) +
            "', key='return_value')[0] }}",
            aws_conn_id='aws_emr',
            dag=dag)
        run_this >> step_addr >> step_checkr >> t2
                                        'movie_review_load':
                                        movie_review_load_folder,
                                        'text_classifier_script':
                                        text_classifier_script,
                                        'movie_review_stage':
                                        movie_review_stage
                                    },
                                    depends_on_past=True)

last_step = len(emr_steps) - 1

# sensing if the last step is complete
clean_movie_review_data = EmrStepSensor(
    dag=dag,
    task_id='clean_movie_review_data',
    job_flow_id=EMR_ID,
    step_id='{{ task_instance.xcom_pull("add_emr_steps", key="return_value")['
    + str(last_step) + '] }}',
    depends_on_past=True,
)

user_purchase_to_rs_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_rs_stage',
    python_callable=run_redshift_external_query,
    op_kwargs={
        'qry':
        "alter table spectrum.user_purchase_staging add partition(insert_date='{{ ds }}') \
           location 's3://data-eng-bucket/user_purchase/stage/{{ ds }}'",
    },
)
Example #17
        "s3_script_bucket": S3_SCRIPT_BUCKET,
        "s3_output": S3_ANALYTICS_BUCKET,
    },
    dag=dag,
)


# get the index of the final step
final_EMR_step = len(SPARK_STEPS) - 1
# wait for the steps to complete - string concatenation is used here because
# {}.format() appears to fail on the literal double braces in the template string
EMR_step_checker = EmrStepSensor(
    task_id="EMR_step_checker",
    job_flow_id="{{ task_instance.xcom_pull('create_EMR_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='EMR_step_adder', key='return_value')["
    + str(final_EMR_step)
    + "] }}",
    aws_conn_id="aws_default",
    dag=dag,
)
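
For reference, str.format() can usually still be used on such templates by doubling the literal Jinja braces (the subdag example earlier in this listing uses the same {{{{ ... }}}} trick); a hedged sketch assuming the same task ID and step index as above:

# Equivalent step_id template built with str.format(); literal Jinja braces are
# escaped by doubling them in the source string
step_id_template = (
    "{{{{ task_instance.xcom_pull(task_ids='EMR_step_adder', "
    "key='return_value')[{}] }}}}".format(final_EMR_step)
)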

# Shutdown EMR cluster
shutdown_EMR_cluster = EmrTerminateJobFlowOperator(
    task_id="shutdown_EMR_cluster",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_EMR_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    dag=dag,
)
# Now create dimension table
create_dimension_table = PostgresOperator(
    task_id="create_dimension_table",
Example #18
    task_id='create_emr_database_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)
create_emr_database_step = EmrAddStepsOperator(
    task_id='create_emr_database_step',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    steps=CREATE_DATABASE,
)
create_emr_database_sensor = EmrStepSensor(
    task_id='create_emr_database_sensor',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_database_step', key='return_value')[0] }}",
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    aws_conn_id='aws_default',
)

terminate_emr_cluster = EmrTerminateJobFlowOperator(
    task_id='terminate_emr_cluster',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
)

emr_database_checks_done = DummyOperator(
    task_id="emr_database_checks_done",
    trigger_rule=TriggerRule.NONE_FAILED,
Example #19
        "main_class": import_raw_order_status_main_class,
        "source_path": f"{source_datasets_location}/status.json.gz",
        "source_format": "json",
        "target_path": target_raw_datasets_location,
        "target_format": "parquet",
    },
    provide_context=True,
    dag=dag,
)

sensor_import_raw_consumer_step = EmrStepSensor(
    task_id="sensor_import_raw_consumer_step",
    job_flow_id=(
        """{{ task_instance.xcom_pull(task_ids='create_cluster_emr_job',
                                      key='job_flow_id') }}"""),
    step_id=
    ("""{{ task_instance.xcom_pull(task_ids='create_import_raw_consumer_dataset_step_job',
                                      key='import_raw_consumer_dataset_step') }}"""
     ),
    dag=dag,
)

sensor_import_raw_order_step = EmrStepSensor(
    task_id="sensor_import_raw_order_step",
    job_flow_id=
    ("{{ task_instance.xcom_pull(task_ids='create_cluster_emr_job', key='job_flow_id') }}"
     ),
    step_id=
    ("""{{ task_instance.xcom_pull(task_ids='create_import_raw_order_dataset_step_job',
                                      key='import_raw_order_dataset_step') }}"""
     ),
Example #20
)

add_all_task = MyEmrAddStepsOperator(
    task_id='add_allstep',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=ALL_CSV,
    retries=3,
    retry_delay=timedelta(minutes=5),
    dag=dag
)

watch_allstep_task = EmrStepSensor(
    task_id='watch_allstep',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_allstep', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

add_ca_task = MyEmrAddStepsOperator(
    task_id='add_castep',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=CA_CSV,
    retries=3,
    retry_delay=timedelta(minutes=5),
    dag=dag
)

watch_castep_task = EmrStepSensor(
        "scripts_path_key": scripts_path_key + "/",
        "data_path_key": data_path_key,
        "processed_tables_key": processed_tables_key
    },
    dag=dag,
)

last_step = len(SPARK_STEPS) - 1  # this value will let the sensor know the last step to watch
# wait for the steps to complete
step_checker = EmrStepSensor(
    task_id="watch_step",
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='step_one', key='return_value')[" +
    str(last_step) + "] }}",
    aws_conn_id="aws_default",
    dag=dag,
)

# Terminate the EMR cluster
terminate_emr_cluster = EmrTerminateJobFlowOperator(
    task_id="terminate_emr_cluster",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    dag=dag,
)

end_data_pipeline = DummyOperator(task_id="end_data_pipeline", dag=dag)
Example #22
                    extract_script,
                    "--source_bucket",
                    econet_engineering_source_bucket,  #"s3://rheemconnectrawdata/history/",
                    "--destination",
                    econet_engineering_destination_bucket,  #"s3://weiyutest/",
                    "--input_date",
                    execution_date
                ]
            }
        }],
    )

    watch_extract_step_task = EmrStepSensor(
        task_id='watch_extract_step',  #'watch_prev_step',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull('extract_step', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )

    connect_step_task = EmrAddStepsOperator(
        task_id='connect_step',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=[{
            "Name": "Step2 Merge",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar":
                "command-runner.jar",  # todo https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
Example #23
            print('nocluster:', nocluster)
            kwargs['ti'].xcom_push(key='clusterid', value=nocluster)

    parse_id = PythonOperator(task_id='parse_id',
                              provide_context=True,
                              python_callable=parse_cluster_id,
                              trigger_rule='all_done')

    step_adder = EmrAddStepsOperator(task_id='step_adder',
                                     job_flow_id=CLUSTER_ID,
                                     aws_conn_id='aws_default',
                                     steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='step_checker',
        job_flow_id=CLUSTER_ID,
        step_id=
        "{{ task_instance.xcom_pull('step_adder', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_terminator = EmrTerminateJobFlowOperator(
        task_id='cluster_terminator',
        job_flow_id=CLUSTER_ID,
        aws_conn_id='aws_default')

    end = DummyOperator(task_id='end')

    parse_request >> cluster_checker
    cluster_checker >> cluster_creator >> parse_id
    cluster_checker >> parse_id
    parse_id >> step_adder >> step_checker >> cluster_terminator >> end
Example #24
    job_flow_overrides=default_emr_settings,
    dag=dag)

copy_python_script = EmrAddStepsOperator(
    task_id='copy_script',
    # XComs let tasks exchange messages
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=copy_script_step,
    dag=dag)

watch_prev_step_task1 = EmrStepSensor(
    task_id='watch_prev_step1',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull('copy_script', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

run_spark_job = EmrAddStepsOperator(
    task_id='run_spark_job',
    # XComs let tasks exchange messages
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=run_job_step,
    dag=dag)

watch_prev_step_task2 = EmrStepSensor(
    task_id='watch_prev_step2',
                                           dag=dag)

add_transform_step_task = EmrAddStepsOperatorV2(
    task_id='add_transform_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=TRANSFORM_IMMIGRATION_SAS_DATA,
    region_name=PARAMS['REGION'],
    dag=dag)

watch_immigration_transform_task = EmrStepSensor(
    task_id='watch_immigration_transform',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull('add_transform_step', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    region_name=PARAMS['REGION'],
    dag=dag)

add_data_quality_check_task = EmrAddStepsOperatorV2(
    task_id='data_quality_check',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=DATA_QUALITY_SAS_DATA,
    region_name=PARAMS['REGION'],
    dag=dag)

watch_prev_data_check_task = EmrStepSensor(
Example #26
    }
}]

add_step_emr = EmrAddStepsOperator(
    task_id='add_step_emr',
    dag=dag,
    job_flow_id=
    '{{ task_instance.xcom_pull("create_emr", key="return_value") }}',
    aws_conn_id='conn_aws_id',
    steps=json_step)

check_step_emr = EmrStepSensor(
    task_id='watch_step_emr',
    dag=dag,
    job_flow_id=
    '{{ task_instance.xcom_pull("create_emr", key="return_value") }}',
    step_id=
    '{{ task_instance.xcom_pull("add_step_emr", key="return_value")[0] }}',
    aws_conn_id='conn_aws_id',
)

terminate_emr = EmrTerminateJobFlowOperator(
    task_id='terminate_emr',
    dag=dag,
    job_flow_id=
    '{{ task_instance.xcom_pull("create_emr", key="return_value") }}',
    aws_conn_id='conn_aws_id')

complete_emr = DummyOperator(task_id='complete_emr', dag=dag)

file_sensor >> success_bucket >> create_emr >> add_step_emr >> check_step_emr >> terminate_emr >> complete_emr
    task_id="load_us_states_steps",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=LOAD_US_STATES_STEP,
    params={  # these params are used to fill the parameterized values in the step JSON
        "BUCKET_NAME": BUCKET_NAME,
    },
    dag=dag,
)

# wait for the steps to complete
us_states_step_checker = EmrStepSensor(
    task_id="us_states_step_checker",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='load_us_states_steps', key='return_value')[0] }}",
    aws_conn_id="aws_default",
    dag=dag,
)

LOAD_TEMPERATURE_STEP = [
    {
        "Name": "Load Temperature dimension",
        "ActionOnFailure": "CANCEL_AND_WAIT",
        "HadoopJarStep": {
            "Jar":
            "command-runner.jar",
            "Args": [
                "spark-submit",
                "--deploy-mode",
Example #28
    dag=dag)

add_jobflow_steps = EmrAddStepsOperator(
    task_id='Add_jobflow_steps',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='Create_EMR_cluster', key='return_value') }}",
    aws_conn_id='aws_credentials',
    region_name='us-west-2',
    steps=SPARK_ETL_STEPS,
    dag=dag)

trip_processing = EmrStepSensor(
    task_id='trip_processing_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('Create_EMR_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='Add_jobflow_steps', key='return_value')[2] }}",
    aws_conn_id='aws_credentials',
    region_name='us-west-2',
    dag=dag)

station_processing = EmrStepSensor(
    task_id='station_processing_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('Create_EMR_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='Add_jobflow_steps', key='return_value')[3] }}",
    aws_conn_id='aws_credentials',
    region_name='us-west-2',
    dag=dag)
# Add steps to an existing EMR JobFlow
add_pipeline_to_emr_cluster_task = EmrAddStepsOperator(
    task_id='add_pipeline_to_emr_cluster',
    job_flow_id="{{task_instance.xcom_pull('spin_up_emr_cluster', " \
               +"  key='return_value')}}",
    steps=covid19_pipeline,
    dag=dag
)

# Wait for the step to complete
watch_pipeline_step_task = EmrStepSensor(
    task_id='watch_pipeline_step',
    job_flow_id="{{task_instance.xcom_pull(" \
                "      'spin_up_emr_cluster'," \
                "      key='return_value')}}",
    step_id="{{task_instance.xcom_pull(" \
            "  'add_pipeline_to_emr_cluster'," \
            "  key='return_value')[0]}}",
    dag=dag)

# Terminate EMR JobFlows
spin_down_emr_cluster_task = EmrTerminateJobFlowOperator(
    task_id='spin_down_emr_cluster',
    job_flow_id="{{task_instance.xcom_pull('spin_up_emr_cluster', " \
               +"  key='return_value')}}",
    trigger_rule="all_done",
    dag=dag
)

stop_airflow_containers_task = PythonOperator(
Example #30
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_creator.set_downstream(step_adder)
step_adder.set_downstream(step_checker)
step_checker.set_downstream(cluster_remover)
Example #31
with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(minutes=15),
    start_date=days_ago(1),
    schedule_interval='@once',
    tags=['emr'],
) as dag:

    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        emr_conn_id='aws_default',
        job_flow_overrides=JOB_FLOW_OVERRIDES
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )

    cluster_creator >> step_adder >> step_checker
                    'Jar': 'command-runner.jar'
                }
            }
        ]
    )

    monitor_cluster_op = EmrJobFlowSensor(
        task_id='monitor_cluster',
        retries=0,
        aws_conn_id=get_config('emr')['aws_conn_id'],
        job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
        timeout=1800)

    monitor_step_op_1 = EmrStepSensor(
        task_id='watch_step_pi',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    monitor_step_op_2 = EmrStepSensor(
        task_id='watch_step_distcp',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    validate_path_exists = S3KeySensor(
        task_id='validate_pii_exist',
        bucket_name='{{ params.bucket_name }}',
        bucket_key='{{ params.bucket_key }}',
        wildcard_match=True)
         dagrun_timeout=timedelta(hours=2),
         schedule_interval=None) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='movie_analytics_job',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='wait_for_analytics_completion',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='movie_analytics_job', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default')

    cluster_creator >> step_adder >> step_checker >> cluster_remover
    emr_conn_id='emr_default',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag
)

cluster_creator.set_downstream(step_adder)
step_adder.set_downstream(step_checker)
step_checker.set_downstream(cluster_remover)