def apply_task_to_dag(self, **kwargs):
    task = kwargs['task']
    parent = kwargs.get('parent', task.parent)

    self._validate_task_type(task)

    # assuming emr already exists
    add_step = EmrAddStepsOperator(
        task_id=f'{task.task_id}_add_step',
        job_flow_id=self.job_flow_id,
        job_flow_name=self.job_flow_name,
        aws_conn_id=self.aws_conn_id,
        steps=self.__generate_emr_step(
            task.task_id,
            [str(x) for x in task.get_runnable_command()]),
        cluster_states=self.cluster_states,
        dag=task.dag)

    if task.parent:
        parent.set_downstream(add_step)

    emr_sensor_step = EmrStepSensor(
        task_id=f'{task.task_id}_watch_step',
        job_flow_id="{{ task_instance.xcom_pull('" + add_step.task_id +
                    "', key='job_flow_id') }}",
        step_id="{{ task_instance.xcom_pull('" + add_step.task_id +
                "', key='return_value')[0] }}",
        aws_conn_id=self.aws_conn_id,
        dag=task.dag)

    add_step.set_downstream(emr_sensor_step)

    return emr_sensor_step
def add_step_to_emr(cluster_create_task,
                    task_identifier,
                    step_params,
                    cluster_remover,
                    task_create_cluster,
                    aws_connection,
                    dag):
    """
    In case we need to add multiple steps to the cluster

    cluster_create_task: task that creates the cluster
    task_identifier: ID of the step
    step_params: parameters to pass to the step
    cluster_remover: task that terminates the cluster
    task_create_cluster: task_id of the cluster-creation task (used for XCom pulls)
    aws_connection: connection to AWS for account credentials
    dag: DAG that is created by the user
    """
    step_adder = EmrAddStepsOperator(
        task_id=task_identifier,
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
                    "', key='return_value') }}",
        aws_conn_id=aws_connection,
        steps=step_params,
        dag=dag)

    step_checker = EmrStepSensor(
        task_id=task_identifier + '_watch_step',
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
                    "', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull('" + task_identifier +
                "', key='return_value')[0] }}",
        aws_conn_id=aws_connection,
        dag=dag)

    cluster_create_task.set_downstream(step_adder)
    step_adder.set_downstream(step_checker)
    step_checker.set_downstream(cluster_remover)
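A hypothetical call site for the helper above might look like the sketch below. JOB_FLOW_OVERRIDES, SPARK_STEPS and default_args are assumed to be defined elsewhere, and the provider import path may differ on older Airflow releases (airflow.contrib.operators on 1.10).

from airflow import DAG
from airflow.providers.amazon.aws.operators.emr import (
    EmrCreateJobFlowOperator,
    EmrTerminateJobFlowOperator,
)

dag = DAG('emr_multiple_steps_dag', default_args=default_args, schedule_interval=None)

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

remove_cluster = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    trigger_rule='all_done',
    dag=dag)

# each call inserts one add-step/watch-step pair between cluster creation and teardown
add_step_to_emr(
    cluster_create_task=create_cluster,
    task_identifier='step_one',
    step_params=SPARK_STEPS,
    cluster_remover=remove_cluster,
    task_create_cluster='create_job_flow',
    aws_connection='aws_default',
    dag=dag)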
def test_execute_adds_steps_to_the_job_flow_and_returns_step_ids(self):
    with patch('boto3.client', self.boto3_client_mock):
        operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default'
        )

        self.assertEqual(operator.execute(None), ['s-2LH3R5GW3A53T'])
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()
    self.operator = EmrAddStepsOperator(
        task_id='test_task',
        job_flow_id='j-8989898989',
        aws_conn_id='aws_default',
        steps=self._config,
        dag=DAG('test_dag_id', default_args=args))
def emr_operator(self, dag):
    spark_submit_command = [
        "spark-submit",
        "--master",
        "yarn",
        "--name",
        "{{task.task_id}}",
        "script.py",
        "input.csv",
        "output.csv",
    ]

    step_command = dict(
        Name="{{task.task_id}}",
        ActionOnFailure="CONTINUE",
        HadoopJarStep=dict(Jar="command-runner.jar", Args=spark_submit_command),
    )

    operator = EmrAddStepsOperator(
        task_id="emr_task", job_flow_id=1, steps=[step_command], dag=dag
    )
    track_dag(dag)
    return operator
def _get_test_dag(self):
    with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
        op1 = SparkSubmitOperator(task_id='op1')
        op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
        op3 = S3ListOperator(task_id='op3', bucket='foo')
        op4 = EmrCreateJobFlowOperator(task_id='op4')
        op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
        op6 = FileToWasbOperator(task_id='op6', container_name='foo',
                                 blob_name='foo', file_path='foo')
        op7 = EmailOperator(task_id='op7', subject='foo', to='foo',
                            html_content='foo')
        op8 = S3CopyObjectOperator(task_id='op8', dest_bucket_key='foo',
                                   source_bucket_key='foo')
        op9 = BranchPythonOperator(task_id='op9', python_callable=print)
        op10 = PythonOperator(task_id='op10', python_callable=range)

        op1 >> [op2, op3, op4]
        op2 >> [op5, op6]
        op6 >> [op7, op8, op9]
        op3 >> [op7, op8]
        op8 >> [op9, op10]

    return dag
def poke():
    hook = hooks.S3_hook.S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)

    for i in range(0, len(loop)):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['hive', '-e', loop[i]]
            }
        }]

        step_addr = EmrAddStepsOperator(
            task_id='add_steps' + str(i),
            job_flow_id="j-2ASQREUMPJ0Y7",
            aws_conn_id='aws_emr',
            steps=steps,
            dag=dag)
        step_adder.append(step_addr)

        step_checkr = EmrStepSensor(
            task_id='watch_step' + str(i),
            job_flow_id="j-2ASQREUMPJ0Y7",
            # pull the step id from the matching add_steps<i> task, not a fixed 'add_steps' id
            step_id="{{ task_instance.xcom_pull('add_steps" + str(i) +
                    "', key='return_value')[0] }}",
            aws_conn_id='aws_emr',
            dag=dag)
        step_checker.append(step_checkr)
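The snippet above only collects the operators into the step_adder / step_checker lists. A minimal sketch of how those pairs might then be chained, assuming the lists are module-level and a strictly sequential order is wanted:

step_adder = []
step_checker = []
poke()

# chain each add-step with its sensor, and each sensor with the next add-step
previous = None
for add_op, watch_op in zip(step_adder, step_checker):
    add_op >> watch_op
    if previous is not None:
        previous >> add_op
    previous = watch_op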
def emr_operator(self, dag):
    spark_submit_command = [
        "spark-submit",
        "--master",
        "yarn",
        "--name",
        "{{task.task_id}}",
        "script.py",
        "input.csv",
        "output.csv",
    ]

    step_command = dict(
        Name="{{task.task_id}}",
        ActionOnFailure="CONTINUE",
        HadoopJarStep=dict(Jar="command-runner.jar", Args=spark_submit_command),
    )

    operator = EmrAddStepsOperator(
        task_id="emr_task", job_flow_id=1, steps=[step_command], dag=dag
    )

    env = {
        "AIRFLOW_CTX_DAG_ID": "test_dag",
        "AIRFLOW_CTX_EXECUTION_DATE": "1970-01-01T0000.000",
        "AIRFLOW_CTX_TASK_ID": "emr_task",
        "AIRFLOW_CTX_TRY_NUMBER": "1",
    }
    add_tracking_to_submit_task(env, operator)
    return operator
def test_init_with_nonexistent_cluster_name(self):
    cluster_name = 'test_cluster'

    with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
            as mock_get_cluster_id_by_name:
        mock_get_cluster_id_by_name.return_value = None

        operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_name=cluster_name,
            cluster_states=['RUNNING', 'WAITING'],
            aws_conn_id='aws_default',
            dag=DAG('test_dag_id', default_args=self.args))

        with self.assertRaises(AirflowException) as error:
            operator.execute(self.mock_context)
        self.assertEqual(str(error.exception),
                         f'No cluster found for name: {cluster_name}')
def test_init_with_cluster_name(self):
    expected_job_flow_id = 'j-1231231234'

    self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

    with patch('boto3.session.Session', self.boto3_session_mock), \
            patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
            as mock_get_cluster_id_by_name:
        # the hook resolves the cluster name to an id, which the operator pushes to XCom
        mock_get_cluster_id_by_name.return_value = expected_job_flow_id

        operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_name='test_cluster',
            cluster_states=['RUNNING', 'WAITING'],
            aws_conn_id='aws_default',
            dag=DAG('test_dag_id', default_args=self.args)
        )

        operator.execute(self.mock_context)

    ti = self.mock_context['ti']
    ti.xcom_push.assert_any_call(key='job_flow_id', value=expected_job_flow_id)
def setUp(self):
    self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()

    # Mock out the emr_client creator
    emr_session_mock = MagicMock()
    emr_session_mock.client.return_value = self.emr_client_mock
    self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

    self.mock_context = MagicMock()

    self.operator = EmrAddStepsOperator(
        task_id='test_task',
        job_flow_id='j-8989898989',
        aws_conn_id='aws_default',
        steps=self._config,
        dag=DAG('test_dag_id', default_args=self.args))
def setUp(self):
    configuration.load_test_config()
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE
    }

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()
    self.operator = EmrAddStepsOperator(
        task_id='test_task',
        job_flow_id='j-8989898989',
        aws_conn_id='aws_default',
        steps=self._config,
        dag=DAG('test_dag_id', default_args=args)
    )
def poke(run_this, t2):
    hook = S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)

    # X = 5 if not loop is None else len(loop)
    X = 0 if loop is None else len(loop)
    for i in range(0, X):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    'hive', '-e',
                    'msck repair table dataplatform.task_fact_daily_agg_entity'
                ]
            }
        }]

        # t3 = BashOperator(
        #     task_id='ag' + str(i),
        #     bash_command='echo "success"',
        #     dag=dag)
        # run_this >> t3 >> t2

        step_addr = EmrAddStepsOperator(
            task_id='add_steps_' + str(i),
            job_flow_id="j-2ASQREUMPJ0Y7",
            aws_conn_id='aws_emr',
            steps=steps,
            dag=dag)

        step_checkr = EmrStepSensor(
            task_id='watch_step_' + str(i),
            job_flow_id="j-2ASQREUMPJ0Y7",
            # pull from the matching add_steps_<i> task rather than a fixed 'add_steps' id
            step_id="{{ task_instance.xcom_pull('add_steps_" + str(i) +
                    "', key='return_value')[0] }}",
            aws_conn_id='aws_emr',
            dag=dag)

        run_this >> step_addr >> step_checkr >> t2
def add_import_raw_dataset_step_job(**kwargs):
    steps = [{
        "Name": kwargs.get("step_name"),
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": [
                "spark-submit",
                "--deploy-mode", "cluster",
                "--num-executors", kwargs.get("num_executors", "1"),
                "--executor-cores", kwargs.get("executor_cores", "4"),
                "--driver-memory", kwargs.get("driver_memory", "4g"),
                "--executor-memory", kwargs.get("executor_memory", "8g"),
                "--class", kwargs.get("main_class"),
                spark_app_jar_location,
                kwargs.get("source_path"),
                kwargs.get("source_format"),
                kwargs.get("target_path"),
                kwargs.get("target_format"),
            ],
        },
    }]

    job_flow_id = kwargs["ti"].xcom_pull(task_ids="create_cluster_emr_job",
                                         key="job_flow_id")

    step_ids = EmrAddStepsOperator(
        task_id=kwargs.get("task_id"),
        aws_conn_id="aws_default",
        job_flow_id=job_flow_id,
        steps=steps,
        dag=dag,
    ).execute(kwargs)

    kwargs["ti"].xcom_push(key=kwargs.get("step_id_key"), value=step_ids[0])
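A sketch of how this callable might be attached to the DAG via a PythonOperator. Every value in op_kwargs below is a hypothetical placeholder, and provide_context=True is only needed on Airflow 1.10 to get kwargs["ti"].

from airflow.operators.python_operator import PythonOperator

import_raw_dataset = PythonOperator(
    task_id="import_raw_dataset",
    python_callable=add_import_raw_dataset_step_job,
    provide_context=True,  # Airflow 1.10; the context is passed automatically on 2.x
    op_kwargs={
        "task_id": "import_raw_dataset_step",
        "step_name": "import-raw-dataset",
        "main_class": "com.example.ImportRawDataset",   # hypothetical
        "source_path": "s3://example-bucket/raw/",       # hypothetical
        "source_format": "csv",
        "target_path": "s3://example-bucket/curated/",   # hypothetical
        "target_format": "parquet",
        "step_id_key": "import_raw_dataset_step_id",
    },
    dag=dag,
)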
def transform(self, subdag: nx.DiGraph,
              parent_fragment: DAGFragment) -> DAGFragment:
    subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
    first_root = subdag_roots[0].task_id
    task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

    TestSubDagTransformer1.op1 = SparkSubmitOperator(
        task_id=f"t{task_id_prefix}p1", dag=self.dag)
    TestSubDagTransformer1.op2 = EmrAddStepsOperator(
        task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
    TestSubDagTransformer1.op3 = S3ListOperator(
        task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
    TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
        task_id=f"t{task_id_prefix}p4", dag=self.dag)
    TestSubDagTransformer1.op5 = DummyOperator(
        task_id=f"t{task_id_prefix}p5", dag=self.dag)

    TestSubDagTransformer1.op1 >> [
        TestSubDagTransformer1.op2,
        TestSubDagTransformer1.op3
    ] >> TestSubDagTransformer1.op4

    return DAGFragment(
        [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
create_EMR_instance = EmrCreateJobFlowOperator(
    task_id="create_EMR_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_default",
    emr_conn_id="emr_default",
    dag=dag
)

# Add your steps to the EMR cluster
EMR_step_adder = EmrAddStepsOperator(
    task_id="EMR_step_adder",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_EMR_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEPS,
    params={
        # these params are used to provide the parameters for the steps JSON above
        "bucket_name": BUCKET_NAME,
        "s3_data": S3_DATA_BUCKET,
        "s3_script_bucket": S3_SCRIPT_BUCKET,
        "s3_output": S3_ANALYTICS_BUCKET,
    },
    dag=dag,
)

# get the index of the final step
final_EMR_step = len(SPARK_STEPS) - 1

# wait for the steps to complete - the step_id template is built by string
# concatenation because str.format() trips over the literal {{ }} braces in the string
EMR_step_checker = EmrStepSensor(
    task_id="EMR_step_checker",
    job_flow_id="{{ task_instance.xcom_pull('create_EMR_cluster', key='return_value') }}",
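This excerpt is cut off before the sensor's step_id argument; based on the comment about concatenation, the full sensor would presumably look roughly like the sketch below (not the original code).

EMR_step_checker = EmrStepSensor(
    task_id="EMR_step_checker",
    job_flow_id="{{ task_instance.xcom_pull('create_EMR_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='EMR_step_adder', key='return_value')[" +
            str(final_EMR_step) + "] }}",
    aws_conn_id="aws_default",
    dag=dag,
)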
)

skip_emr_database_creation = DummyOperator(
    task_id="skip_emr_database_creation",
    trigger_rule=TriggerRule.NONE_FAILED,
    dag=dag,
)

create_emr_database_cluster = EmrCreateJobFlowOperator(
    task_id='create_emr_database_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)

create_emr_database_step = EmrAddStepsOperator(
    task_id='create_emr_database_step',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    steps=CREATE_DATABASE,
)

create_emr_database_sensor = EmrStepSensor(
    task_id='create_emr_database_sensor',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='create_emr_database_step', key='return_value')[0] }}",
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    aws_conn_id='aws_default',
)

terminate_emr_cluster = EmrTerminateJobFlowOperator(
    task_id='terminate_emr_cluster',
task_id="create_emr_cluster", job_flow_overrides=JOB_FLOW_OVERRIDES, aws_conn_id="aws_default", emr_conn_id="emr_default", dag=dag, ) # Add your steps to the EMR cluster step_one = EmrAddStepsOperator( task_id="step_one", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_default", steps=SPARK_STEPS, params= { # these params are used to fill the paramterized values in SPARK_STEPS json "bucket_name": bucket_name, "scripts_path_key": scripts_path_key + "/", "data_path_key": data_path_key, "processed_tables_key": processed_tables_key }, dag=dag, ) last_step = len( SPARK_STEPS ) - 1 # this value will let the sensor know the last step to watch # wait for the steps to complete step_checker = EmrStepSensor( task_id="watch_step", job_flow_id=
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *'
)

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(minutes=15),
    start_date=days_ago(1),
    schedule_interval='@once',
    tags=['emr'],
) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        emr_conn_id='aws_default',
        job_flow_overrides=JOB_FLOW_OVERRIDES
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )

    cluster_creator >> step_adder >> step_checker
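If the cluster should not be left running after the step completes, a terminate task could be appended inside the same with DAG(...) block; this is a sketch in the spirit of the other examples in this collection, not part of this one.

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        trigger_rule='all_done',  # tear the cluster down even if the step fails
    )

    step_checker >> cluster_remover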
        'key': 'scripts/random_text_classification.py',
    },
)

with open(movie_clean_emr_steps) as json_file:
    emr_steps = json.load(json_file)

# adding our EMR steps to an existing EMR cluster
add_emr_steps = EmrAddStepsOperator(
    dag=dag,
    task_id='add_emr_steps',
    job_flow_id=EMR_ID,
    aws_conn_id='aws_default',
    steps=emr_steps,
    params={
        'BUCKET_NAME': BUCKET_NAME,
        'movie_review_load': movie_review_load_folder,
        'text_classifier_script': text_classifier_script,
        'movie_review_stage': movie_review_stage
    },
    depends_on_past=True)

last_step = len(emr_steps) - 1

# sensing if the last step is complete
clean_movie_review_data = EmrStepSensor(
    dag=dag,
    task_id='clean_movie_review_data',
    job_flow_id=EMR_ID,
class TestEmrAddStepsOperator(unittest.TestCase):
    # When
    _config = [{
        'Name': 'test_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                '/usr/lib/spark/bin/run-example',
                '{{ macros.ds_add(ds, -1) }}',
                '{{ ds }}'
            ]
        }
    }]

    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()
        self.operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps=self._config,
            dag=DAG('test_dag_id', default_args=args)
        )

    def test_init(self):
        self.assertEqual(self.operator.job_flow_id, 'j-8989898989')
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                    DEFAULT_DATE.strftime("%Y-%m-%d"),
                ]
            }
        }]

        self.assertListEqual(self.operator.steps, expected_args)

    def test_execute_returns_step_id(self):
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(None), ['s-2LH3R5GW3A53T'])
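The ADD_STEPS_SUCCESS_RETURN fixture referenced by these tests is not shown in the excerpts; it mocks the shape of boto3's add_job_flow_steps response and would look roughly like this:

ADD_STEPS_SUCCESS_RETURN = {
    'ResponseMetadata': {
        'HTTPStatusCode': 200
    },
    'StepIds': ['s-2LH3R5GW3A53T']
}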
    dag_id='emr_job_movies_dag_parallel',
    default_args=DEFAULT_ARGS,
    concurrency=3,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval=None
) as dag:

    create_cluster = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default'
    )

    add_step_1 = EmrAddStepsOperator(
        task_id='movie_analytics_job_1',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEP_1_PROPS
    )

    wait_for_step_1 = EmrStepSensor(
        task_id='wait_for_analytics_completion_1',
        job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='movie_analytics_job_1', key='return_value')[0] }}",
        aws_conn_id='aws_default'
    )

    add_step_2 = EmrAddStepsOperator(
        task_id='movie_analytics_job_2',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEP_2_PROPS
add_steps_to_cluster_op = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
    aws_conn_id=get_config('emr')['aws_conn_id'],
    steps=[
        {
            'Name': 'calculate_pi',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': '{{ params.hadoop_jar_path }}',
                'Args': ['10'],
                'MainClass': 'org.apache.spark.examples.SparkPi'
            }
        },
        {
            'Name': 'Copy Publisher Output from EMR HDFS to S3',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Args': [
                    's3-dist-cp',
                    '--src={{ params.dist_cp_src }}',
                    '--dest={{ params.dist_cp_target }}'
                ],
                'Jar': 'command-runner.jar'
            }
        }
    ]
)
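The {{ params.hadoop_jar_path }}, {{ params.dist_cp_src }} and {{ params.dist_cp_target }} templates above are resolved from a params dict supplied to the operator (or its DAG), which this excerpt does not show. A purely hypothetical example:

add_steps_params = {
    'hadoop_jar_path': 's3://example-bucket/jars/spark-examples.jar',  # hypothetical
    'dist_cp_src': 'hdfs:///publisher-output/',                        # hypothetical
    'dist_cp_target': 's3://example-bucket/publisher-output/',         # hypothetical
}
# e.g. EmrAddStepsOperator(..., params=add_steps_params)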
class TestEmrAddStepsOperator(unittest.TestCase):
    # When
    _config = [{
        'Name': 'test_step',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                '/usr/lib/spark/bin/run-example',
                '{{ macros.ds_add(ds, -1) }}',
                '{{ ds }}'
            ]
        }
    }]

    def setUp(self):
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

        # Mock out the emr_client (moto has incorrect response)
        self.emr_client_mock = MagicMock()

        # Mock out the emr_client creator
        emr_session_mock = MagicMock()
        emr_session_mock.client.return_value = self.emr_client_mock
        self.boto3_session_mock = MagicMock(return_value=emr_session_mock)

        self.mock_context = MagicMock()

        self.operator = EmrAddStepsOperator(
            task_id='test_task',
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
            steps=self._config,
            dag=DAG('test_dag_id', default_args=self.args))

    def test_init(self):
        self.assertEqual(self.operator.job_flow_id, 'j-8989898989')
        self.assertEqual(self.operator.aws_conn_id, 'aws_default')

    def test_render_template(self):
        ti = TaskInstance(self.operator, DEFAULT_DATE)
        ti.render_templates()

        expected_args = [{
            'Name': 'test_step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    '/usr/lib/spark/bin/run-example',
                    (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
                    DEFAULT_DATE.strftime("%Y-%m-%d"),
                ]
            }
        }]

        self.assertListEqual(self.operator.steps, expected_args)

    def test_execute_returns_step_id(self):
        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            self.assertEqual(self.operator.execute(self.mock_context),
                             ['s-2LH3R5GW3A53T'])

    def test_init_with_cluster_name(self):
        expected_job_flow_id = 'j-1231231234'

        self.emr_client_mock.add_job_flow_steps.return_value = ADD_STEPS_SUCCESS_RETURN

        with patch('boto3.session.Session', self.boto3_session_mock):
            with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
                    as mock_get_cluster_id_by_name:
                mock_get_cluster_id_by_name.return_value = expected_job_flow_id

                operator = EmrAddStepsOperator(
                    task_id='test_task',
                    job_flow_name='test_cluster',
                    cluster_states=['RUNNING', 'WAITING'],
                    aws_conn_id='aws_default',
                    dag=DAG('test_dag_id', default_args=self.args))

                operator.execute(self.mock_context)

        ti = self.mock_context['ti']
        ti.xcom_push.assert_called_once_with(key='job_flow_id',
                                             value=expected_job_flow_id)

    def test_init_with_nonexistent_cluster_name(self):
        cluster_name = 'test_cluster'

        with patch('airflow.contrib.hooks.emr_hook.EmrHook.get_cluster_id_by_name') \
                as mock_get_cluster_id_by_name:
            mock_get_cluster_id_by_name.return_value = None

            operator = EmrAddStepsOperator(
                task_id='test_task',
                job_flow_name=cluster_name,
                cluster_states=['RUNNING', 'WAITING'],
                aws_conn_id='aws_default',
                dag=DAG('test_dag_id', default_args=self.args))

            with self.assertRaises(AirflowException) as error:
                operator.execute(self.mock_context)
            self.assertEqual(str(error.exception),
                             f'No cluster found for name: {cluster_name}')
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(1),
    schedule_interval=None,
    tags=["emr", "spark", "pyspark"],
) as dag:

    cluster_creator = EmrCreateJobFlowOperator(
        task_id="create_job_flow",
        job_flow_overrides=get_object(
            "job_flow_overrides/job_flow_overrides.json", work_bucket),
    )

    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id="aws_default",
        steps=get_object("emr_steps/emr_steps.json", work_bucket),
    )

    step_checker = EmrStepSensor(
        task_id="watch_step",
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    cluster_creator >> step_adder >> step_checker
file_sensor = S3KeySensor(
    task_id='file_sensor',
    poke_interval=600,
    timeout=1000,
    soft_fail=False,
    bucket_name='ds-afarrell',
    bucket_key='manybla.txt')

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_benchmarks_connection')

run_some_pyspark = EmrAddStepsOperator(
    task_id='run_some_pyspark',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=EMR_STEP_1)

output_file_sensor = S3KeySensor(
    task_id='output_file_sensor',
    poke_interval=600,
    timeout=1000,
    soft_fail=False,
    bucket_name='ds-afarrell',
    bucket_key='hello_world_was_written/_SUCCESS')

cluster_remover = EmrTerminateJobFlowOperator(
    task_id="cluster_remover",
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
check_data_exists_task = PythonOperator(
    task_id='check_data_exists',
    python_callable=check_data_exists,
    provide_context=False,
    dag=dag)

create_job_flow_task = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    job_flow_overrides=default_emr_settings,
    dag=dag)

copy_python_script = EmrAddStepsOperator(
    task_id='copy_script',
    # XComs let tasks exchange messages
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=copy_script_step,
    dag=dag)

watch_prev_step_task1 = EmrStepSensor(
    task_id='watch_prev_step1',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('copy_script', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

run_spark_job = EmrAddStepsOperator(
    task_id='run_spark_job',
# Create an EMR cluster
create_emr_cluster = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=get_job_flow_overrides(job_flow_overrides),
    aws_conn_id="aws_default",
    emr_conn_id="emr_default")

# Add steps to the EMR cluster
# Step 1 = ETL Pipeline
# Step 2 = Data Quality Test
step_adder = EmrAddStepsOperator(
    task_id="add_steps",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=SPARK_STEPS,
    params={
        "bucket_name": bucket_name,
        "s3_etl_script": s3_etl_script,
        "s3_dq_script": s3_dq_script
    })

# wait for the steps to complete
last_step = len(SPARK_STEPS) - 1
step_checker = EmrStepSensor(
    task_id="watch_step",
    job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[" +
            str(last_step) + "] }}",
dag = DAG(
    'emr_job_flow_manual_steps_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *')

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_TEST_STEPS,
    dag=dag)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=
JOB_FLOW_OVERRIDES = {"Name": "MoviesAnalytics"} with DAG(dag_id='emr_job_movies_dag', default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=2), schedule_interval=None) as dag: cluster_creator = EmrCreateJobFlowOperator( task_id='create_emr_cluster', job_flow_overrides=JOB_FLOW_OVERRIDES, aws_conn_id='aws_default', emr_conn_id='emr_default') step_adder = EmrAddStepsOperator( task_id='movie_analytics_job', job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id='aws_default', steps=SPARK_TEST_STEPS) step_checker = EmrStepSensor( task_id='wait_for_analytics_completion', job_flow_id= "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='movie_analytics_job', key='return_value')[0] }}", aws_conn_id='aws_default') cluster_remover = EmrTerminateJobFlowOperator( task_id='remove_cluster', job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
# Create an EMR cluster
create_emr_cluster = EmrCreateJobFlowOperator(
    task_id="create_emr_cluster",
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id="aws_default",
    emr_conn_id="emr_default",
    dag=dag,
)

# Add your steps to the EMR cluster
load_us_states_step = EmrAddStepsOperator(
    task_id="load_us_states_steps",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    steps=LOAD_US_STATES_STEP,
    params={
        # these params are used to fill the parameterized values in the SPARK_STEPS json
        "BUCKET_NAME": BUCKET_NAME,
    },
    dag=dag,
)

# wait for the steps to complete
us_states_step_checker = EmrStepSensor(
    task_id="us_states_step_checker",
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='load_us_states_steps', key='return_value')[0] }}",
    aws_conn_id="aws_default",
    dag=dag,
)