def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            job_flow_overrides=self._config,
            region_name='ap-southeast-2',
            dag=DAG('test_dag_id', default_args=args))
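Example #2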
    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            region_name='ap-southeast-2',
            dag=DAG('test_dag_id',
                    default_args=args,
                    template_searchpath=TEMPLATE_SEARCHPATH,
                    template_undefined=StrictUndefined))
Example #3
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}

with DAG(
        dag_id='emr_job_flow_manual_steps_dag',
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        schedule_interval='0 3 * * *',
        tags=['example'],
) as dag:

    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS)

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default')
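Example #4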
with DAG(
    dag_id=DAG_ID,
    description="Analyze Bakery Sales with Amazon EMR",
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(1),
    schedule_interval=None,
    tags=["emr demo", "spark", "pyspark"],
) as dag:
    begin = DummyOperator(task_id="begin")

    end = DummyOperator(task_id="end")

    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow", job_flow_overrides=JOB_FLOW_OVERRIDES
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )
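The excerpt ends here; in a complete DAG these tasks would typically be chained and the cluster terminated once the watched step finishes. A minimal sketch, assuming EmrTerminateJobFlowOperator is imported (the terminate task and its task_id below are not part of the original excerpt):

    # Hypothetical completion: terminate the cluster once the step is done
    cluster_remover = EmrTerminateJobFlowOperator(
        task_id="terminate_job_flow",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        trigger_rule="all_done",
    )

    begin >> cluster_creator >> step_adder >> step_checker >> cluster_remover >> end

Example #5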
class TestEmrCreateJobFlowOperator(unittest.TestCase):
    # When
    _config = {
        'Name': 'test_job_flow',
        'ReleaseLabel': '5.11.0',
        'Steps': [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    '{{ macros.ds_add(ds, -1) }}',
                    '{{ ds }}'
                ]
            }
        }]
    }

    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrCreateJobFlowOperator(
            task_id='test_task',
            aws_conn_id='aws_default',
            emr_conn_id='emr_default',
            region_name='ap-southeast-2',
            dag=DAG('test_dag_id',
                    default_args=args,
                    template_searchpath=TEMPLATE_SEARCHPATH,
                    template_undefined=StrictUndefined))

    def test_init(self):
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')
        self.assertEqual(self.operator.emr_conn_id, 'emr_default')
        self.assertEqual(self.operator.region_name, 'ap-southeast-2')

    def test_render_template(self):
        self.operator.job_flow_overrides = self._config
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = {
            'Name': 'test_job_flow',
            'ReleaseLabel': '5.11.0',
            'Steps': [{
                'Name': 'test_step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example',
                        (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                        DEFAULT_DATE.strftime("%Y-%m-%d"),
                    ]
                }
            }]
        }

        self.assertDictEqual(self.operator.job_flow_overrides, expected_args)

    def test_render_template_from_file(self):
        self.operator.job_flow_overrides = 'job.j2.json'
        self.operator.params = {'releaseLabel': '5.11.0'}

        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        self.emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', boto3_session_mock):
            self.operator.execute(None)

        expected_args = {
            'Name': 'test_job_flow',
            'ReleaseLabel': '5.11.0',
            'Steps': [{
                'Name': 'test_step',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        '/usr/lib/spark/bin/run-example',
                        '2016-12-31',
                        '2017-01-01',
                    ]
                }
            }]
        }

        self.assertDictEqual(self.operator.job_flow_overrides, expected_args)

    def test_execute_returns_job_id(self):
        self.emr_client_mock.run_job_flow.return_value = RUN_JOB_FLOW_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', boto3_session_mock):
            self.assertEqual(self.operator.execute(None), 'j-8989898989')
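RUN_JOB_FLOW_SUCCESS_RETURN is referenced throughout this test but is not shown in the excerpt. Given the 'j-8989898989' assertion above, it is presumably a canned boto3-style response along these lines (a sketch, not the original definition):

    RUN_JOB_FLOW_SUCCESS_RETURN = {
        'ResponseMetadata': {'HTTPStatusCode': 200},
        'JobFlowId': 'j-8989898989',
    }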
Example #6
    return [spark_step]


with DAG('transform_raw_patents',
    default_args=default_args,
    schedule_interval='@quarterly',
    catchup=False) as dag:

    # create non entity specific tasks
    start_dag = DummyOperator(
        task_id='start_dag',
    )

    create_emr_cluster = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default'
    )
    job_flow_id = "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}"

    terminate_emr_cluster = EmrTerminateJobFlowOperator(
        task_id='terminate_emr_cluster',
        job_flow_id=job_flow_id,
        aws_conn_id='aws_default'
    )

    # create_emr_cluster = DummyOperator(
    #     task_id='create_emr_cluster',
    # )
    # job_flow_id = 'j-2NFK2ZRYYVS71'
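The excerpt shows only the cluster bookends; the step factory whose tail is visible at the top (return [spark_step]) would presumably feed one EmrAddStepsOperator per entity between them. A rough sketch, with the factory name and task ids assumed for illustration:

    # Hypothetical entity-specific step task (names are assumptions)
    add_patent_steps = EmrAddStepsOperator(
        task_id='add_patent_steps',
        job_flow_id=job_flow_id,
        aws_conn_id='aws_default',
        steps=create_spark_step('patents'),  # the factory returning [spark_step] above
    )

    start_dag >> create_emr_cluster >> add_patent_steps >> terminate_emr_cluster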
Example #7
with DAG(
        'etl',
        default_args=default_args,
        description='ETL',
        catchup=False,
        start_date=datetime.datetime(2020, 1, 1),
        schedule_interval=None,
        tags=['udacity', 'etl'],
        default_view="graph",
) as dag:
    # generate dag documentation
    dag.doc_md = __doc__

    create_cluster = EmrCreateJobFlowOperator(
        dag=dag,
        task_id="create_cluster",
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_conn_id)
    wait_cluster_completion = EmrJobFlowSensor(
        task_id='wait_cluster_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        target_states=["RUNNING", "WAITING"],
        dag=dag)
    terminate_cluster = EmrTerminateJobFlowOperator(
        task_id="terminate_cluster",
        trigger_rule="all_done",
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        dag=dag)
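Note that cluster_id, job_flow_overrides and aws_conn_id are defined above this excerpt; cluster_id is presumably the usual XCom pull from the create_cluster task, e.g. (an assumption, not shown in the original):

    cluster_id = "{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}"

Example #8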

with DAG('emr_dag',
	default_args=default_args,
	description='ETL for ESG analytics',
	catchup=False,
	schedule_interval=timedelta(days=1)) as dag:

	join_before_emr = DummyOperator(task_id='start_emr')
	end_data_pipeline = DummyOperator(task_id='ETL_DONE')

	## [START EMR Spark ETL]
	# Create an EMR cluster
	create_emr_cluster = EmrCreateJobFlowOperator(
		task_id="create_emr_cluster",
		job_flow_overrides=JOB_FLOW_OVERRIDES,
		aws_conn_id="aws_credentials",
		emr_conn_id="emr_default")

	# Add steps to the EMR cluster
	step_adder = EmrAddStepsOperator(
	    task_id="add_steps",
	    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
	    aws_conn_id="aws_credentials",
	    steps=SPARK_STEPS,
	    params={  # these params are used to fill the parameterized values in SPARK_STEPS json
	        "BUCKET_NAME": config['bucket_name'],
	        "spark_script": "/scripts/etl_spark_gtrends.py",
	        "s3_processed": "processed",
	    })
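SPARK_STEPS itself is not shown in this excerpt; since the operator's steps field is templated, the params above are presumably referenced from it via Jinja, roughly like this (a sketch with assumed step names and arguments):

    SPARK_STEPS = [{
        "Name": "etl_spark_gtrends",
        "ActionOnFailure": "CANCEL_AND_WAIT",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": [
                "spark-submit",
                "--deploy-mode", "cluster",
                "s3://{{ params.BUCKET_NAME }}{{ params.spark_script }}",
                "s3://{{ params.BUCKET_NAME }}/{{ params.s3_processed }}",
            ],
        },
    }]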
Example #9
                                     "local_file": "some_file_path"
                                 })

    check_data_exists_task = PythonOperator(
        task_id='check_data_exists',
        python_callable=utils_airflow.check_data_exists,
        op_kwargs={
            "aws_conn": "my_aws_conn",
            "bucket": "my_bucket",
            "prefix": "file_name"
        },
        provide_context=False)

    create_job_flow_task = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=cluster_conf)

    add_step_task = EmrAddStepsOperator(
        task_id='My_first_job',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='my_aws_conn',
        steps=my_first_job)

    watch_prev_step_task = EmrStepSensor(
        task_id='watch_prev_step',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        step_id=
Example #10
)

sensor_2 = EmrStepSensor(
    task_id='sensor_daily_parse_message',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_2', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

step_1 >> sensor_1 >> step_2 >> sensor_2

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_cluster',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    email_on_failure=True,
    dag=dag
)


def get_spot_instance_fleet_id(cluster_id):
    emr_client = boto3.client('emr', region_name=Variable.get('aws_default_region'))
    response = emr_client.list_instance_fleets(ClusterId=cluster_id)

    for instance_fleet in response['InstanceFleets']:
        if instance_fleet["InstanceFleetType"] == "TASK":
            return instance_fleet["Id"]


def dagrun_trigger(context, dag_run_obj):
Example #11
with DAG(
    dag_id="test_emr_transient_cluster",
    default_args={
        "owner": "Guido Kosloff",
        "depends_on_past": False,
        "email_on_failure": False,
        "email_on_retry": False,
    },
    start_date=datetime(2021, 1, 1, 0, 0),
    schedule_interval="@daily",
    catchup=False,
) as dag:

    create_cluster = EmrCreateJobFlowOperator(
        task_id="create_cluster",
        job_flow_overrides=EMRClusterConfig.JOB_FLOW_OVERRIDES,
        aws_conn_id="aws_default",
        emr_conn_id="emr_default",
    )

    add_step_load_raw_data = EmrAddStepsOperator(
        task_id="add_step_load_raw_data",
        job_flow_id=create_cluster.output,
        aws_conn_id="aws_default",
        steps=SparkSteps.LOAD_RAW_DATA,
    )

    wait_for_step_load_raw_data = EmrStepSensor(
        task_id="wait_for_step_load_raw_data",
        job_flow_id=create_cluster.output,
        step_id="{{ task_instance.xcom_pull(task_ids='add_step_load_raw_data', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )
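Example #12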
with DAG(
        dag_id=DAG_ID,
        description="Run multiple Spark jobs with Amazon EMR",
        default_args=DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=2),
        start_date=days_ago(1),
        schedule_interval=None,
        tags=["emr demo", "spark", "pyspark"],
) as dag:
    begin = DummyOperator(task_id="begin")

    end = DummyOperator(task_id="end")

    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow",
        job_flow_overrides=get_object(
            "job_flow_overrides/job_flow_overrides.json", work_bucket),
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=get_object("emr_steps/emr_steps.json", work_bucket),
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
Example #13
        'TerminationProtected': False,
    },
    'Steps': SPARK_STEPS,
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
# [END howto_operator_emr_automatic_steps_config]

with DAG(
        dag_id='emr_job_flow_automatic_steps_dag',
        dagrun_timeout=timedelta(hours=2),
        start_date=datetime(2021, 1, 1),
        schedule_interval='0 3 * * *',
        catchup=False,
        tags=['example'],
) as dag:

    # [START howto_operator_emr_automatic_steps_tasks]
    job_flow_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
    )

    job_sensor = EmrJobFlowSensor(task_id='check_job_flow',
                                  job_flow_id=job_flow_creator.output)
    # [END howto_operator_emr_automatic_steps_tasks]

    # Task dependency created via `XComArgs`:
    #   job_flow_creator >> job_sensor
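Passing job_flow_creator.output (an XComArg) as job_flow_id is what wires the sensor after the create task automatically; it is equivalent to the explicit template used in the other examples, roughly:

    # Equivalent explicit form (for comparison only, not part of the original)
    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    )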
Example #14
        "Jar": "command-runner.jar",
        "Args": ["bash", "/home/hadoop/emr_setup.sh"]
    }
}]

dag = DAG(
    'sec-pipeline',
    default_args=default_args,
    description='DAG test',
    schedule_interval=timedelta(days=1),
)

create_emr_cluster = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    region_name='us-east-2',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)

emr_step_1 = EmrAddStepsOperator(
    task_id='emr_step1',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step1,
    dag=dag)

emr_step_2 = EmrAddStepsOperator(
    task_id='emr_step2',
    job_flow_id=