Example #1

import unittest
from unittest.mock import MagicMock, patch

from airflow.exceptions import AirflowException
# Import path for the Amazon provider in Airflow 2.x; older releases shipped
# this sensor as airflow.contrib.sensors.emr_step_sensor.EmrStepSensor.
from airflow.providers.amazon.aws.sensors.emr import EmrStepSensor


class TestEmrStepSensor(unittest.TestCase):
    def setUp(self):
        self.emr_client_mock = MagicMock()
        self.sensor = EmrStepSensor(
            task_id='test_task',
            poke_interval=0,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        mock_emr_session = MagicMock()
        mock_emr_session.client.return_value = self.emr_client_mock

        # Mock out the emr_client creator
        self.boto3_session_mock = MagicMock(return_value=mock_emr_session)

    def test_step_completed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_COMPLETED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.sensor.execute(None)

            self.assertEqual(self.emr_client_mock.describe_step.call_count, 2)
            calls = [
                unittest.mock.call(ClusterId='j-8989898989',
                                   StepId='s-VK57YR1Z9Z5N'),
                unittest.mock.call(ClusterId='j-8989898989',
                                   StepId='s-VK57YR1Z9Z5N')
            ]
            self.emr_client_mock.describe_step.assert_has_calls(calls)

    def test_step_cancelled(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_CANCELLED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_failed(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN, DESCRIBE_JOB_STEP_FAILED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)

    def test_step_interrupted(self):
        self.emr_client_mock.describe_step.side_effect = [
            DESCRIBE_JOB_STEP_RUNNING_RETURN,
            DESCRIBE_JOB_STEP_INTERRUPTED_RETURN
        ]

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertRaises(AirflowException, self.sensor.execute, None)
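The DESCRIBE_JOB_STEP_* constants referenced above are fixtures defined alongside the test; a minimal sketch of their assumed shape (the sensor only inspects Step.Status.State of the describe_step response):

def _describe_step(state):
    # Mimics the envelope of boto3's EMR describe_step response.
    return {'Step': {'Id': 's-VK57YR1Z9Z5N', 'Status': {'State': state}}}

DESCRIBE_JOB_STEP_RUNNING_RETURN = _describe_step('RUNNING')
DESCRIBE_JOB_STEP_COMPLETED_RETURN = _describe_step('COMPLETED')
DESCRIBE_JOB_STEP_CANCELLED_RETURN = _describe_step('CANCELLED')
DESCRIBE_JOB_STEP_FAILED_RETURN = _describe_step('FAILED')
DESCRIBE_JOB_STEP_INTERRUPTED_RETURN = _describe_step('INTERRUPTED')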
Example #2
def emr_step_task_group(script_name, cluster_id, aws_conn_id, dag):
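    # NOTE: file_schema and bucket_name are module-level settings in the source
    # project (assumed values, e.g. file_schema = 's3://', bucket_name = 'my-bucket/').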
    step = [{
        'Name': f'Run {script_name}.py',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit', '--deploy-mode', 'client',
                '--py-files', f'{file_schema}{bucket_name}scripts/utils.zip',
                f'{file_schema}{bucket_name}scripts/{script_name}.py',
                '--config-path', f'{file_schema}{bucket_name}scripts/config.yaml',
            ],
        },
    }]
    add_step = EmrAddStepsOperator(task_id='add_step',
                                   job_flow_id=cluster_id,
                                   aws_conn_id=aws_conn_id,
                                   steps=step,
                                   dag=dag)

    wait_step_completion = EmrStepSensor(
        task_id='wait_step_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        # add_step pushes the new step ids to XCom; the 'run_<script_name>.'
        # prefix assumes these tasks live inside a TaskGroup of that name.
        step_id=f"{{{{ ti.xcom_pull(task_ids='run_{script_name}.add_step')[0] }}}}",
        dag=dag)
    add_step.set_downstream(wait_step_completion)
    return add_step, wait_step_completion
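A minimal usage sketch (wiring assumed, not from the source): the template above only resolves when the helper is called inside a TaskGroup whose group_id is run_<script_name>:

from airflow.utils.task_group import TaskGroup

with TaskGroup(group_id='run_my_script', dag=dag):
    # Tasks created here get the 'run_my_script.' task_id prefix that the
    # sensor's xcom_pull template expects (script_name='my_script' here).
    emr_step_task_group('my_script', cluster_id, 'aws_default', dag)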
Example #4
def Workflow_0(config):
    if config.fabric == "emr":
        workflow_id = "9"
        workflow_version = "latest"
        workflow_jar = f"s3://{config.s3Bucket}/prophecy/jars/9/latest/workflow.jar"
        prophecy_libs_jar = f"{config.prophecyLibsJar}"
        executor_memory = "1g"
        executor_cores = "4"
        num_executors = "6"
        driver_memory = "1g"
        driver_cores = "2"
        job_flow_id = config.cluster_id
        spark_steps = [{
            "Name": "Compute_Step",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": [
                    "spark-submit",
                    "--executor-memory", executor_memory,
                    "--executor-cores", executor_cores,
                    "--num-executors", num_executors,
                    "--driver-memory", driver_memory,
                    "--driver-cores", driver_cores,
                    "--conf", "spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--conf", "spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4",
                    "--deploy-mode", "cluster",
                    "--class", "Main",
                    "--jars", workflow_jar, prophecy_libs_jar,
                    "-C", "A=B",
                    "-C", "fabricName=" + config.fabric,
                ],
            },
        }]
        step_adder = EmrAddStepsOperator(
            task_id="Workflow_0",
            job_flow_id=job_flow_id,
            aws_conn_id="aws_default_pankaj",
            steps=spark_steps,
            trigger_rule="all_success",
        )
        step_checker = EmrStepSensor(
            task_id="Workflow_0WatchSteps",
            job_flow_id=job_flow_id,
            step_id="{{ task_instance.xcom_pull(task_ids='Workflow_0', key='return_value')[0] }}",
            aws_conn_id="aws_default_pankaj",
        )
        step_adder >> step_checker

        return step_adder, step_checker
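A minimal sketch of the config object this generated workflow expects (attribute names inferred from the code above; all values are placeholders):

from dataclasses import dataclass

@dataclass
class Config:
    fabric: str = "emr"
    s3Bucket: str = "my-bucket"                                # assumed bucket name
    prophecyLibsJar: str = "s3://my-bucket/prophecy-libs.jar"  # assumed jar path
    cluster_id: str = "j-XXXXXXXXXXXXX"                        # assumed EMR cluster id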
Example #5
    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS)

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default')

    cluster_creator >> step_adder >> step_checker >> cluster_remover
    # [END howto_operator_emr_manual_steps_tasks]
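JOB_FLOW_OVERRIDES and SPARK_STEPS come from the example DAG's module scope; a minimal sketch of SPARK_STEPS, matching the trivial SparkPi run the upstream Airflow example uses:

SPARK_STEPS = [{
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10'],
    },
}]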
Example #6
    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id=cluster_creator.output,
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=cluster_creator.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    )

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster', job_flow_id=cluster_creator.output)

    step_adder >> step_checker >> cluster_remover
    # [END howto_operator_emr_manual_steps_tasks]

    # Task dependencies created via `XComArgs`:
    #   cluster_creator >> step_adder
    #   cluster_creator >> step_checker
    #   cluster_creator >> cluster_remover
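Note, for comparison with Example #5: cluster_creator.output is an XComArg over the task's return_value, so job_flow_id=cluster_creator.output is equivalent to the explicit Jinja pull:

    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}"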
Example #7

            # ... (snippet starts mid-loop: render_spark_step, defined above,
            # renders the Spark step definition and pushes it to XCom)
            entity_task_list.append(render_spark_step)

            # add spark step to emr
            add_step = EmrAddStepsOperator(
                task_id=f'add_step_{key}',
                job_flow_id=job_flow_id,
                aws_conn_id='aws_default',
                steps=f"{{{{ task_instance.xcom_pull(task_ids='render_spark_step_{key}', key='return_value') }}}}",
            )
            entity_task_list.append(add_step)

            # wait for the step to complete
            watch_step = EmrStepSensor(
                task_id=f'watch_step_{key}',
                job_flow_id=job_flow_id,
                step_id=f"{{{{ task_instance.xcom_pull(task_ids='add_step_{key}', key='return_value')[0] }}}}",
                aws_conn_id='aws_default',
            )
            entity_task_list.append(watch_step)

        # if file has sql quality steps, then create additional tasks in DAG
        if file['sql_step_args']:
            # must drop duplicates in table as spark jdbc does not support upsert
            # https://issues.apache.org/jira/browse/SPARK-19335
            drop_table_duplicates = PostgresOperator(
                task_id='drop_table_duplicates_{table}'.format(table=file['sql_step_args']['db_table']),
                postgres_conn_id='postgres_default',
                sql="""
                        DELETE FROM {table} t1 
                        USING {table} t2 
                        WHERE 
Example #8

    create_job_flow_task = EmrCreateJobFlowOperator(  # name inferred from the dependency wiring below
        task_id='create_job_flow',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=cluster_conf)

    add_step_task = EmrAddStepsOperator(
        task_id='My_first_job',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='my_aws_conn',
        steps=my_first_job)

    watch_prev_step_task = EmrStepSensor(
        task_id='watch_prev_step',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        # EmrAddStepsOperator returns a *list* of step ids, so index the first one.
        step_id="{{ task_instance.xcom_pull(task_ids='My_first_job', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    terminate_job_flow_task = EmrTerminateJobFlowOperator(
        task_id='terminate_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        trigger_rule="all_done")

# dependencies
check_data_exists_task >> create_job_flow_task
create_job_flow_task >> add_step_task
# sensor and terminator wiring implied by trigger_rule="all_done" above
add_step_task >> watch_prev_step_task >> terminate_job_flow_task
Example #9
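The add_scala_steps helper is truncated out of this snippet; a minimal sketch of its assumed shape, inferred from the call sites below:

def add_scala_steps(jar_location, class_name, action_on_failure, *job_args):
    # Build a single EMR step that spark-submits a Scala job (assumed shape).
    return [{
        'Name': f'Run {class_name}',
        'ActionOnFailure': action_on_failure,
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['spark-submit', '--class', class_name, jar_location, *job_args],
        },
    }]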


step_1 = EmrAddStepsOperator(
    task_id='step_1',
    # job_flow_id is required; assumed to come from dag_run.conf like the sensor below.
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    aws_conn_id='aws_default',
    steps=add_scala_steps('jar-location',
                          'class-name', 'CONTINUE',
                          "{{ dag_run.conf['date'] }}"),
    dag=dag
)

sensor_1 = EmrStepSensor(
    task_id='sensor_1',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_1', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

step_2 = EmrAddStepsOperator(
    task_id='step_2',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",  # assumed, mirroring step_1
    aws_conn_id='aws_default',
    steps=add_scala_steps('jar-location',
                          'class-name', 'CONTINUE',
                          "{{ dag_run.conf['date'] }}", "{{ dag_run.conf['date'] }}"),
    dag=dag
)

sensor_2 = EmrStepSensor(
    task_id='sensor_daily_parse_message',
    # remaining fields assumed, mirroring sensor_1
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_2', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)
Example #10

    create_cluster = EmrCreateJobFlowOperator(  # name inferred from create_cluster.output below
        task_id="create_cluster",
        job_flow_overrides=EMRClusterConfig.JOB_FLOW_OVERRIDES,
        aws_conn_id="aws_default",
        emr_conn_id="emr_default",
    )

    add_step_load_raw_data = EmrAddStepsOperator(
        task_id="add_step_load_raw_data",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.LOAD_RAW_DATA,
    )

    wait_for_step_load_raw_data = EmrStepSensor(
        task_id="wait_for_step_load_raw_data",
        job_flow_id=create_cluster.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step_load_raw_data', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    add_step_transform = EmrAddStepsOperator(
        task_id="add_step_transform",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.TRANSFORM,
    )

    wait_for_step_transform = EmrStepSensor(
        task_id="wait_for_step_transform",
        job_flow_id=create_cluster.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step_transform', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )
Example #11
}]

stage1_adder = EmrAddStepsOperator(
    task_id="add_stage_1",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEP_1,
    dag=dag,
)

stage1_checker = EmrStepSensor(
    task_id="watch_stage1",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_stage_1', key='return_value')[0] }}",
    aws_conn_id="aws_default",
    dag=dag,
)

sensor_stage2_key = S3KeySensor(
    task_id="s3_sensor_stage2_key",
    bucket_name="{{ dag_run.conf['metaBucketName'] }}",
    bucket_key="{{ dag_run.conf['instanceId'] }}/step_2_adv_unmatched_enc_kc_kp/_SUCCESS",
    dag=dag,  # dag kwarg added to match the sibling tasks above
)

SPARK_STEP_2 = [{
    "Name": "adv-mr-pid-stage2",
    "ActionOnFailure": "TERMINATE_JOB_FLOW",
Example #12

emr_step_1 = EmrAddStepsOperator(  # name inferred from the dependency wiring below
    task_id='emr_step1',           # assumed, matching the emr_step2 pattern
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step1,
    dag=dag)

emr_step_2 = EmrAddStepsOperator(
    task_id='emr_step2',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step2,
    dag=dag)

emr_step_sensor = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='emr_step2', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

stop_emr_cluster = EmrTerminateJobFlowOperator(
    task_id='stop_emr1',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

create_emr_cluster >> emr_step_1 >> emr_step_2 >> emr_step_sensor >> stop_emr_cluster