    def test_execute_calls_with_the_job_flow_id_until_it_reaches_a_terminal_state(self):
        with patch('boto3.session.Session', self.boto3_session_mock):
            operator = EmrJobFlowSensor(
                task_id='test_task',
                poke_interval=2,
                job_flow_id='j-8989898989',
                aws_conn_id='aws_default'
            )

            operator.execute(None)

            # make sure we called twice
            self.assertEqual(self.mock_emr_client.describe_cluster.call_count, 2)

            # make sure it was called with the job_flow_id
            self.mock_emr_client.describe_cluster.assert_called_with(ClusterId='j-8989898989')
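This test relies on fixtures built in the test case's setUp, which this listing does not show. A minimal sketch of that wiring, with the class name assumed and the describe_cluster payloads trimmed down to the field the sensor actually inspects (Cluster.Status.State), might look like:

import unittest
from unittest.mock import MagicMock

# Trimmed-down describe_cluster responses; the real test module defines richer
# payloads, but the sensor only needs Cluster.Status.State to decide whether to
# keep poking.
DESCRIBE_CLUSTER_RUNNING_RETURN = {
    'Cluster': {'Id': 'j-8989898989', 'Status': {'State': 'RUNNING'}}
}
DESCRIBE_CLUSTER_TERMINATED_RETURN = {
    'Cluster': {'Id': 'j-8989898989', 'Status': {'State': 'TERMINATED'}}
}


class TestEmrJobFlowSensor(unittest.TestCase):
    def setUp(self):
        # EMR client mock the sensor polls: two queued responses, so two pokes.
        self.mock_emr_client = MagicMock()
        self.mock_emr_client.describe_cluster.side_effect = [
            DESCRIBE_CLUSTER_RUNNING_RETURN,
            DESCRIBE_CLUSTER_TERMINATED_RETURN,
        ]
        # boto3.session.Session replacement that hands back the mocked client.
        mock_session = MagicMock()
        mock_session.client.return_value = self.mock_emr_client
        self.boto3_session_mock = MagicMock(return_value=mock_session)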
Example 3
def create_dag():
    with DAG(dag_id='emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:

        create_cluster_op = EmrCreateJobFlowOperator(
            task_id='create_cluster',
            job_flow_overrides={'Name': CLUSTER_NAME},
            aws_conn_id=AWS_CONN_ID,
            emr_conn_id=EMR_CONN_ID)

        add_steps_to_cluster_op = TemplatedEmrAddStepsOperator(
            task_id='add_steps',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID,
            steps=[{
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 's3://psm-poc-dmp-temp/spark-examples.jar',
                    'Args': ['10'],
                    'MainClass': 'org.apache.spark.examples.SparkPi'
                }
            }])

        monitor_cluster_op = EmrJobFlowSensor(
            task_id='monitor_cluster',
            retries=0,
            aws_conn_id=AWS_CONN_ID,
            job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
            timeout=1800)

        monitor_step_op = EmrStepSensor(
            task_id='watch_step',
            job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
            step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
            aws_conn_id=AWS_CONN_ID)

        terminate_cluster_op = EmrTerminateJobFlowOperator(
            task_id='remove_cluster',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID)

        handle_failure_op = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

        create_cluster_op >> monitor_cluster_op >> handle_failure_op
        create_cluster_op >> add_steps_to_cluster_op >> monitor_step_op >> terminate_cluster_op

    return dag
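Airflow only registers DAG objects that end up at module level, so a module using this factory would typically finish with:

dag = create_dag()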
Example 4
def get_job_sensor(timeout, job_flow_name, aws_conn_id):
    creator_task_id = _get_job_flow_creator_task_id(job_flow_name)
    return EmrJobFlowSensor(
        timeout=timeout,
        mode="reschedule",
        task_id=f"check_{job_flow_name}",
        retries=0,
        job_flow_id=_get_task_return_value_template(creator_task_id),
        aws_conn_id=aws_conn_id,
    )
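The two helpers used above are not shown in this snippet. Hypothetical implementations consistent with how they are called might look like the following; the real project may name or build these differently:

def _get_job_flow_creator_task_id(job_flow_name):
    # Assumed convention: the EmrCreateJobFlowOperator task is named after the job flow.
    return f"create_{job_flow_name}"


def _get_task_return_value_template(task_id):
    # Jinja template pulling the job flow id the creator task pushed to XCom.
    return ("{{ task_instance.xcom_pull(task_ids='" + task_id +
            "', key='return_value') }}")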
Example 5
    def test_execute_calls_with_the_job_flow_id_until_it_reaches_failed_state_with_exception(self):
        self.mock_emr_client.describe_cluster.side_effect = [
            DESCRIBE_CLUSTER_RUNNING_RETURN,
            DESCRIBE_CLUSTER_TERMINATED_WITH_ERRORS_RETURN
        ]
        with patch('boto3.session.Session', self.boto3_session_mock):
            operator = EmrJobFlowSensor(
                task_id='test_task',
                poke_interval=2,
                job_flow_id='j-8989898989',
                aws_conn_id='aws_default'
            )

            with self.assertRaises(AirflowException):
                operator.execute(None)

            # make sure we called twice
            self.assertEqual(self.mock_emr_client.describe_cluster.call_count, 2)

            # make sure it was called with the job_flow_id
            self.mock_emr_client.describe_cluster.assert_called_with(ClusterId='j-8989898989')

Example 6

    def test_execute_calls_with_the_job_flow_id_until_it_reaches_a_terminal_state(self):
        self.mock_emr_client.describe_cluster.side_effect = [
            DESCRIBE_CLUSTER_RUNNING_RETURN, DESCRIBE_CLUSTER_TERMINATED_RETURN
        ]
        with patch('boto3.session.Session', self.boto3_session_mock):
            operator = EmrJobFlowSensor(task_id='test_task',
                                        poke_interval=2,
                                        job_flow_id='j-8989898989',
                                        aws_conn_id='aws_default')

            operator.execute(None)

            # make sure we called twice
            self.assertEqual(self.mock_emr_client.describe_cluster.call_count,
                             2)

            # make sure it was called with the job_flow_id
            calls = [
                unittest.mock.call(ClusterId='j-8989898989'),
                unittest.mock.call(ClusterId='j-8989898989')
            ]
            self.mock_emr_client.describe_cluster.assert_has_calls(calls)
Example 7
blp_dag = DAG('mango_log_processing_adi',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

blp_logs = EmrCreateJobFlowOperator(task_id='blp_create_job_flow',
                                    job_flow_overrides={'Steps': BLP_STEPS},
                                    aws_conn_id='aws_data_iam',
                                    emr_conn_id='emr_data_iam_mango',
                                    dag=blp_dag)

blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=blp_dag,
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

gcstj_object_conditions = {'includePrefixes': 'blpadi/{{ ds }}'}

gcstj_transfer_options = {'deleteObjectsUniqueInSink': True}

bq_args = [
    'bq',
    '--location=US',

Example 8

SPARK_TEST_STEPS = [{
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10']
    }
}]

JOB_FLOW_OVERRIDES = {'Name': 'PiCalc', 'Steps': SPARK_TEST_STEPS}

dag = DAG('emr_job_flow_automatic_steps_dag',
          default_args=DEFAULT_ARGS,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

job_flow_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

job_flow_creator.set_downstream(job_sensor)
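set_downstream is equivalent to the bitshift dependency syntax used in the other examples, so the last line could also be written as:

job_flow_creator >> job_sensor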
Example 9
    'email_on_retry': True
}

with DAG(
    dag_id='flight_delays_emr',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=1),
    schedule_interval='@once',
) as dag:

    start_operator = DummyOperator(task_id='begin_execution', dag=dag)
    end_operator = DummyOperator(task_id='stop_execution', dag=dag)

    with open('emr_job_flow.json', 'r') as fp:
        job_flow = json.load(fp)
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=job_flow,
        aws_conn_id='aws_credentials',
        emr_conn_id='emr_default'
    )

    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_credentials'
    )

    # define the DAG structure, in terms of the created operators
    start_operator >> cluster_creator >> job_sensor >> end_operator
Example 10
from airflow.utils.dates import days_ago

from emr_job_flow_with_sensor import EmrJobFlowWithSensor
from emr_step_with_sensor import EmrStepWithSensor

# the job flow step configuration as described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html#EMR.Client.run_job_flow
step_conf = {}
job_conf = {}

dag = DAG(
    dag_id='spark_job',
    default_args={
        'owner': 'airflow',
        'start_date': days_ago(1)
    }
)

job = EmrJobFlowWithSensor(
    task_id='job_and_retry',
    job_flow=EmrCreateJobFlowOperator(
        task_id='job',
        job_flow_overrides=job_conf
    ),
    sensor=EmrJobFlowSensor(
        task_id='sensor',
        job_flow_id=''
    ),
    dag=dag
)
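step_conf and job_conf are left empty in this example. Purely illustrative values following the run_job_flow request structure linked in the comment above (these are assumptions, not the project's real configuration) could look like:

# Illustrative only -- every value below is an assumption, not the project's config.
job_conf = {
    'Name': 'spark-job',
    'ReleaseLabel': 'emr-5.29.0',
    'Instances': {
        'MasterInstanceType': 'm5.xlarge',
        'SlaveInstanceType': 'm5.xlarge',
        'InstanceCount': 3,
        'KeepJobFlowAliveWhenNoSteps': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}

step_conf = {
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10'],
    },
}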

Example 11

    schedule_interval='0 3 * * *'
)

blp_logs = EmrCreateJobFlowOperator(
    task_id='blp_create_job_flow',
    job_flow_overrides={'Steps': BLP_STEPS},
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    dag=blp_dag
)

blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=blp_dag,
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

gcstj_object_conditions = {
    'includePrefixes':  'blpadi/{{ ds }}'
}

gcstj_transfer_options = {
    'deleteObjectsUniqueInSink': True
}

Example 12

                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Args': [
                        's3-dist-cp',
                        '--src={{ params.dist_cp_src }}',
                        '--dest={{ params.dist_cp_target }}'
                    ],
                    'Jar': 'command-runner.jar'
                }
            }
        ]
    )

    monitor_cluster_op = EmrJobFlowSensor(
        task_id='monitor_cluster',
        retries=0,
        aws_conn_id=get_config('emr')['aws_conn_id'],
        job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
        timeout=1800)

    monitor_step_op_1 = EmrStepSensor(
        task_id='watch_step_pi',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
    )

    monitor_step_op_2 = EmrStepSensor(
        task_id='watch_step_distcp',
        job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}",
        aws_conn_id=get_config('emr')['aws_conn_id']
Example 13
            PATH_METEO_SPARK],
            'bucket_name': BUCKET_NAME,
        },
        dag=my_dag)

    # launch an ephemeral EMR cluster to process the uploaded data
    precompute_data_meteo_emr_task = EmrCreateJobFlowOperator(
        task_id='precompute_data_meteo_emr',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
    )
    # wait for the EMR job flow to finish
    emr_job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='precompute_data_meteo_emr', key='return_value') }}",
        aws_conn_id='aws_default',
    )

    # retrieve the processed data
    download_files_from_S3_task = PythonOperator(
        task_id='download_from_S3_meteo',
        python_callable=download_files_from_S3,
        op_kwargs={
            'filenames': [PATH_DF_METEO_FR],
            'bucket_name': BUCKET_NAME,
        },
        dag=my_dag)

    prepare_features_task = PythonOperator(
        task_id='prepare_features',
Example 14
    #     },
    # ],
}
run_emr_create_job_flow_task = EmrCreateJobFlowOperator(
    task_id=emr_create_cluster_task_id,
    aws_conn_id=AWS_CREDENTIALS_EMR_ID,
    emr_conn_id="emr_default",
    region_name="us-west-2",  # deprecated, to be removed
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    do_xcom_push=True,  # deprecated, to be removed
    dag=dag,
)
emr_job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id=f"{{{{ task_instance.xcom_pull(task_ids='{emr_create_cluster_task_id}', key='return_value') }}}}",
    # step_id="{{ task_instance.xcom_pull(task_ids='TASK_TO_WATCH', key='return_value')[0] }}",  # here goes an EmrAddStepsOperator's task id
    aws_conn_id=AWS_CREDENTIALS_EMR_ID,
    dag=dag,
)

# SPARK_STEPS = [{
#     'Name': 'test step',
#     'ActionOnFailure': 'CONTINUE',
#     'HadoopJarStep': {
#         'Jar': 'command-runner.jar',
#         'Args': [
#             'spark-submit', '--deploy-mode', 'cluster', '--class', 'com.naturalint.data.spark.api.scala.NiSparkAppMain', 's3://ni-data-infra/jars/feeder-factorization-etl-1.0-SNAPSHOT.jar', '--ni-main-class', 'com.naturalint.data.etl.feeder.FactorizationEtl', '--job-id', '133', '--config-file', 's3://galactic-feeder-staging/factorization_input/133.json', '--raw-conversions-table', 'galactic_feeder_staging.conversions_raw', '--previous-runs-table', 'galactic_feeder_staging.factorization_output_partitions', '--parquet-output-location', 's3://galactic-feeder-staging/factorization_output_partitions', '--csv-output-location', 's3://galactic-feeder-staging/output'
#         ]
#     }
# }]
# add_emr_step_task = EmrAddStepsOperator(
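The commented-out block above is cut off at the EmrAddStepsOperator. A sketch of how that wiring is typically completed with an EmrStepSensor, reusing the ids from this example (the task names and dependency chain below are assumptions, not the original code):

# Sketch only -- mirrors the commented-out intent above, not the original source.
add_emr_step_task = EmrAddStepsOperator(
    task_id='add_emr_step',  # assumed task id
    job_flow_id=f"{{{{ task_instance.xcom_pull(task_ids='{emr_create_cluster_task_id}', key='return_value') }}}}",
    aws_conn_id=AWS_CREDENTIALS_EMR_ID,
    steps=SPARK_STEPS,  # the commented-out step list above, once uncommented
    dag=dag,
)

emr_step_sensor = EmrStepSensor(
    task_id='watch_emr_step',
    job_flow_id=f"{{{{ task_instance.xcom_pull(task_ids='{emr_create_cluster_task_id}', key='return_value') }}}}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_emr_step', key='return_value')[0] }}",
    aws_conn_id=AWS_CREDENTIALS_EMR_ID,
    dag=dag,
)

run_emr_create_job_flow_task >> add_emr_step_task >> emr_step_sensor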
Example 15
    schedule_interval='@daily',
)

start_pinkman = EmrCreateJobFlowOperator(
    task_id='start_pinkman',
    job_flow_overrides=PINKMAN_JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    region_name='eu-central-1',
    dag=dag,
)

check_pinkman_result = EmrJobFlowSensor(
    task_id='check_pinkman_result',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='start_pinkman', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag,
)

run_walter_white = AWSBatchOperator(
    task_id='run_walter-white',
    job_name='walter-white',
    job_queue=os.getenv('COMPUTE_ENVIRONMENT_JOB_QUEUE'),
    job_definition=os.getenv('WALTER_WHITE_JOB_DEFINITION'),
    aws_conn_id='aws_default',
    region_name='eu-central-1',
    overrides={
        'environment': [
            {
                'name': 'MLFLOW_TRACKING_URI',