def apply_task_to_dag(self, **kwargs):
    task = kwargs['task']
    parent = kwargs.get('parent', task.parent)

    self._validate_task_type(task)

    # assuming the EMR cluster already exists
    add_step = EmrAddStepsOperator(
        task_id=f'{task.task_id}_add_step',
        job_flow_id=self.job_flow_id,
        job_flow_name=self.job_flow_name,
        aws_conn_id=self.aws_conn_id,
        steps=self.__generate_emr_step(
            task.task_id,
            [str(x) for x in task.get_runnable_command()]),
        cluster_states=self.cluster_states,
        dag=task.dag)

    # guard on the resolved parent (which may come from kwargs), not task.parent
    if parent:
        parent.set_downstream(add_step)

    emr_sensor_step = EmrStepSensor(
        task_id=f'{task.task_id}_watch_step',
        job_flow_id="{{ task_instance.xcom_pull('" + add_step.task_id +
                    "', key='job_flow_id') }}",
        step_id="{{ task_instance.xcom_pull('" + add_step.task_id +
                "', key='return_value')[0] }}",
        aws_conn_id=self.aws_conn_id,
        dag=task.dag)

    add_step.set_downstream(emr_sensor_step)

    return emr_sensor_step
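The private `__generate_emr_step` helper is not part of this snippet. A minimal sketch of what it plausibly returns, assuming the single-step, command-runner.jar convention used throughout these examples (the helper name is the only thing taken from the code above; the body is a reconstruction):

# Hypothetical reconstruction -- the real helper is not shown in this example.
def __generate_emr_step(self, task_id, args):
    """Build the one-element step list that EmrAddStepsOperator expects."""
    return [{
        'Name': task_id,
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',  # runs an arbitrary command on the cluster
            'Args': args,
        },
    }]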
def add_step_to_emr(cluster_create_task, task_identifier, step_params,
                    cluster_remover, task_create_cluster, aws_connection, dag):
    """Add a step and its watcher to an existing cluster, for the case where
    multiple steps need to be added to the same cluster.

    cluster_create_task: task object that creates the cluster
    task_identifier: ID of the step
    step_params: parameters to pass to the step
    cluster_remover: task that terminates the cluster
    task_create_cluster: task_id of the cluster-creating task (used in XCom pulls)
    aws_connection: connection to AWS for account credentials
    dag: DAG that is created by the user
    """
    step_adder = EmrAddStepsOperator(
        task_id=task_identifier,
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
                    "', key='return_value') }}",
        aws_conn_id=aws_connection,
        steps=step_params,
        dag=dag)

    step_checker = EmrStepSensor(
        task_id=task_identifier + '_watch_step',
        job_flow_id="{{ task_instance.xcom_pull('" + task_create_cluster +
                    "', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull('" + task_identifier +
                "', key='return_value')[0] }}",
        aws_conn_id=aws_connection,
        dag=dag)

    cluster_create_task.set_downstream(step_adder)
    step_adder.set_downstream(step_checker)
    step_checker.set_downstream(cluster_remover)
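A hedged usage sketch for the helper above. `create_cluster`, `remove_cluster`, and the step definition are hypothetical names for this illustration; the step dict follows the command-runner.jar shape used in the other examples:

# Hypothetical wiring; assumes create_cluster / remove_cluster tasks already exist.
my_step = [{
    'Name': 'example step',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['hive', '-e', 'show tables'],
    },
}]
add_step_to_emr(cluster_create_task=create_cluster,
                task_identifier='example_step',
                step_params=my_step,
                cluster_remover=remove_cluster,
                task_create_cluster='create_cluster',  # task_id used in the XCom template
                aws_connection='aws_default',
                dag=dag)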
def poke():
    # Read a comma-separated list of Hive statements from S3 and create one
    # EMR add-step task (plus a watching sensor) per statement.  Note: this
    # must run at DAG-parse time for the generated tasks to be registered.
    # step_adder / step_checker are module-level lists defined elsewhere.
    hook = S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)
    for i, statement in enumerate(loop):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['hive', '-e', statement]
            }
        }]
        step_addr = EmrAddStepsOperator(task_id='add_steps' + str(i),
                                        job_flow_id=job_flow_id,
                                        aws_conn_id=aws_conn_id,
                                        steps=steps,
                                        dag=dag)
        step_adder.append(step_addr)
        step_checkr = EmrStepSensor(
            task_id='watch_step' + str(i),
            job_flow_id=job_flow_id,
            # pull from the matching add_steps<i> task, not a fixed 'add_steps'
            step_id="{{ task_instance.xcom_pull('add_steps" + str(i) +
                    "', key='return_value')[0] }}",
            aws_conn_id=aws_conn_id,
            dag=dag)
        step_checker.append(step_checkr)
def create_dag():
    with DAG(dag_id='emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:
        create_cluster_op = EmrCreateJobFlowOperator(
            task_id='create_cluster',
            job_flow_overrides={'Name': CLUSTER_NAME},
            aws_conn_id=AWS_CONN_ID,
            emr_conn_id=EMR_CONN_ID)

        add_steps_to_cluster_op = TemplatedEmrAddStepsOperator(
            task_id='add_steps',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID,
            steps=[{
                'Name': 'calculate_pi',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 's3://psm-poc-dmp-temp/spark-examples.jar',
                    'Args': ['10'],
                    'MainClass': 'org.apache.spark.examples.SparkPi'
                }
            }])

        monitor_cluster_op = EmrJobFlowSensor(
            task_id='monitor_cluster',
            retries=0,
            aws_conn_id=AWS_CONN_ID,
            job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
            timeout=1800)

        monitor_step_op = EmrStepSensor(
            task_id='watch_step',
            job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
            step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
            aws_conn_id=AWS_CONN_ID)

        terminate_cluster_op = EmrTerminateJobFlowOperator(
            task_id='remove_cluster',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID)

        handle_failure_op = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

        create_cluster_op >> monitor_cluster_op >> handle_failure_op
        create_cluster_op >> add_steps_to_cluster_op >> monitor_step_op >> terminate_cluster_op

    return dag
def setUp(self):
    configuration.load_test_config()

    self.emr_client_mock = MagicMock()
    self.sensor = EmrStepSensor(
        task_id='test_task',
        poke_interval=1,
        job_flow_id='j-8989898989',
        step_id='s-VK57YR1Z9Z5N',
        aws_conn_id='aws_default',
    )
def main_summary_subdag_factory(parent_dag, task_id, day):
    ds = "{{{{ macros.ds_format(macros.ds_add(ds, {0}), '%Y-%m-%d', '%Y%m%d') }}}}".format(day)

    subdag = DAG("{}.{}".format(parent_dag.dag_id, task_id),
                 schedule_interval=SCHEDULE_INTERVAL,
                 start_date=START_DATE,
                 default_args=default_args)

    # the dag_id must be quoted inside the Jinja template, otherwise it is
    # treated as an undefined template variable
    parent_job_flow_id = ("{{{{ task_instance.xcom_pull('setup_backfill_cluster', "
                          "key='return_value', dag_id='{}') }}}}".format(parent_dag.dag_id))

    # Try to alleviate throttling issues by introducing some slight jitter on each of the days
    timedelta_task = TimeDeltaSensor(
        task_id="day_start_jitter",
        delta=timedelta(seconds=day),
        dag=subdag
    )

    add_step_task = EmrAddStepsOperator(
        task_id='submit_main_summary_day',
        job_flow_id=parent_job_flow_id,
        execution_timeout=timedelta(minutes=10),
        aws_conn_id='aws_default',
        steps=EmrAddStepsOperator.get_step_args(
            job_name="main_summary {}".format(ds),
            owner="*****@*****.**",
            action_on_failure='CONTINUE',
            uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
            env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView",
                           {"from": ds, "to": ds, "bucket": "telemetry-backfill"},
                           {"DO_ASSEMBLY": "False"}),
        ),
        dag=subdag
    )

    step_sensor_task = EmrStepSensor(
        task_id="main_summary_step_sensor",
        timeout=timedelta(hours=10).total_seconds(),
        job_flow_id=parent_job_flow_id,
        step_id="{{ task_instance.xcom_pull('submit_main_summary_day', key='return_value') }}",
        poke_interval=timedelta(minutes=5).total_seconds(),
        dag=subdag
    )

    step_sensor_task.set_upstream(add_step_task)
    add_step_task.set_upstream(timedelta_task)

    return subdag
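For context, a factory like this is typically attached to the parent DAG through SubDagOperator, whose task_id must match the suffix used in the subdag's dag_id. A hedged sketch; `main_dag` and the 30-day range are hypothetical, not taken from the example above:

# Hypothetical registration of the factory (Airflow 1.x SubDagOperator path):
from airflow.operators.subdag_operator import SubDagOperator

for day_offset in range(30):
    task_id = "main_summary_day_{}".format(day_offset)
    SubDagOperator(
        task_id=task_id,  # must match the subdag's "{parent}.{task_id}" dag_id
        subdag=main_summary_subdag_factory(main_dag, task_id, day_offset),
        dag=main_dag,
    )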
def setUp(self):
    self.emr_client_mock = MagicMock()
    self.sensor = EmrStepSensor(
        task_id='test_task',
        poke_interval=0,
        job_flow_id='j-8989898989',
        step_id='s-VK57YR1Z9Z5N',
        aws_conn_id='aws_default',
    )

    mock_emr_session = MagicMock()
    mock_emr_session.client.return_value = self.emr_client_mock

    # Mock out the emr_client creator
    self.boto3_session_mock = MagicMock(return_value=mock_emr_session)
def setUp(self):
    configuration.load_test_config()

    self.emr_client_mock = MagicMock()
    self.sensor = EmrStepSensor(task_id='test_task',
                                poke_interval=1,
                                job_flow_id='j-8989898989',
                                step_id='s-VK57YR1Z9Z5N',
                                aws_conn_id='aws_default',
                                region_name='ap-southeast-1')

    mock_emr_session = MagicMock()
    mock_emr_session.client.return_value = self.emr_client_mock

    # Mock out the emr_client creator
    self.boto3_session_mock = MagicMock(return_value=mock_emr_session)
def poke(run_this, t2):
    # Build one msck-repair step per table listed in the S3 config file and
    # chain each add-step/sensor pair between run_this and t2.  This must run
    # at DAG-parse time so the generated tasks are registered with the DAG.
    hook = S3Hook(aws_conn_id='aws_s3')
    job_flow_id = "j-2ASQREUMPJ0Y7"
    aws_conn_id = 'aws_emr'
    st = hook.read_key(key='prod_deployment/conf/athena_all_tables',
                       bucket_name='bounce-data-platform')
    loop = st.split(",")
    print(loop)
    X = 0 if loop is None else len(loop)
    for i in range(X):
        steps = [{
            'Name': 'test step',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    'hive', '-e',
                    'msck repair table dataplatform.task_fact_daily_agg_entity'
                ]
            }
        }]
        step_addr = EmrAddStepsOperator(task_id='add_steps_' + str(i),
                                        job_flow_id=job_flow_id,
                                        aws_conn_id=aws_conn_id,
                                        steps=steps,
                                        dag=dag)
        step_checkr = EmrStepSensor(
            task_id='watch_step_' + str(i),
            job_flow_id=job_flow_id,
            # pull from the matching add_steps_<i> task, not a fixed 'add_steps'
            step_id="{{ task_instance.xcom_pull('add_steps_" + str(i) +
                    "', key='return_value')[0] }}",
            aws_conn_id=aws_conn_id,
            dag=dag)
        run_this >> step_addr >> step_checkr >> t2
def test_execute_calls_with_the_job_flow_id_and_step_id_until_it_reaches_a_terminal_state(self):
    with patch('boto3.client', self.boto3_client_mock):
        operator = EmrStepSensor(
            task_id='test_task',
            poke_interval=1,
            job_flow_id='j-8989898989',
            step_id='s-VK57YR1Z9Z5N',
            aws_conn_id='aws_default',
        )

        operator.execute(None)

        # make sure we called twice
        self.assertEqual(self.mock_emr_client.describe_step.call_count, 2)

        # make sure it was called with the job_flow_id and step_id
        self.mock_emr_client.describe_step.assert_called_with(
            ClusterId='j-8989898989', StepId='s-VK57YR1Z9Z5N')
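The two-call assertion only holds if the mocked describe_step first reports a non-terminal state and then a terminal one. A plausible companion setUp, sketched here under the assumption that the fixtures mirror the Airflow test suite's response shapes (the constant names are hypothetical):

# Hypothetical fixtures: first poke sees RUNNING, second sees COMPLETED,
# so execute() calls describe_step exactly twice before returning.
DESCRIBE_JOB_STEP_RUNNING_RETURN = {
    'ResponseMetadata': {'HTTPStatusCode': 200},
    'Step': {'Status': {'State': 'RUNNING'}},
}
DESCRIBE_JOB_STEP_COMPLETED_RETURN = {
    'ResponseMetadata': {'HTTPStatusCode': 200},
    'Step': {'Status': {'State': 'COMPLETED'}},
}

def setUp(self):
    self.mock_emr_client = MagicMock()
    self.mock_emr_client.describe_step.side_effect = [
        DESCRIBE_JOB_STEP_RUNNING_RETURN,
        DESCRIBE_JOB_STEP_COMPLETED_RETURN,
    ]
    # boto3.client('emr', ...) is patched to hand back the mock above
    self.boto3_client_mock = MagicMock(return_value=self.mock_emr_client)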
        'movie_review_load': movie_review_load_folder,
        'text_classifier_script': text_classifier_script,
        'movie_review_stage': movie_review_stage
    },
    depends_on_past=True)

last_step = len(emr_steps) - 1

# sense whether the last step is complete
clean_movie_review_data = EmrStepSensor(
    dag=dag,
    task_id='clean_movie_review_data',
    job_flow_id=EMR_ID,
    step_id='{{ task_instance.xcom_pull("add_emr_steps", key="return_value")[' +
            str(last_step) + '] }}',
    depends_on_past=True,
)

user_purchase_to_rs_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_rs_stage',
    python_callable=run_redshift_external_query,
    op_kwargs={
        'qry': "alter table spectrum.user_purchase_staging add partition(insert_date='{{ ds }}') "
               "location 's3://data-eng-bucket/user_purchase/stage/{{ ds }}'",
    },
)
    task_id='create_emr_database_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    dag=dag)

create_emr_database_step = EmrAddStepsOperator(
    task_id='create_emr_database_step',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    steps=CREATE_DATABASE,
)

create_emr_database_sensor = EmrStepSensor(
    task_id='create_emr_database_sensor',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='create_emr_database_step', key='return_value')[0] }}",
    on_failure_callback=cleanup_emr_cluster_if_steps_fail,
    aws_conn_id='aws_default',
)

terminate_emr_cluster = EmrTerminateJobFlowOperator(
    task_id='terminate_emr_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_database_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
)

emr_database_checks_done = DummyOperator(
    task_id="emr_database_checks_done",
    trigger_rule=TriggerRule.NONE_FAILED,
"s3_script_bucket": S3_SCRIPT_BUCKET, "s3_output": S3_ANALYTICS_BUCKET, }, dag=dag, ) # get the number of the final step final_EMR_step = len(SPARK_STEPS) - 1 # wait for the steps to complete - seem to have to use concatenation here as {}.format() # seems to fail because of double {} in source string EMR_step_checker = EmrStepSensor( task_id="EMR_step_checker", job_flow_id="{{ task_instance.xcom_pull('create_EMR_cluster', key='return_value') }}", step_id="{{ task_instance.xcom_pull(task_ids='EMR_step_adder', key='return_value')[" + str(final_EMR_step) + "] }}", aws_conn_id="aws_default", dag=dag, ) # Shutdown EMR cluster shutdown_EMR_cluster = EmrTerminateJobFlowOperator( task_id="shutdown_EMR_cluster", job_flow_id="{{ task_instance.xcom_pull(task_ids='create_EMR_cluster', key='return_value') }}", aws_conn_id="aws_default", dag=dag, ) # Now create dimension table create_dimension_table = PostgresOperator( task_id="create_dimension_table",
)

add_all_task = MyEmrAddStepsOperator(
    task_id='add_allstep',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=ALL_CSV,
    retries=3,
    retry_delay=timedelta(minutes=5),
    dag=dag
)

watch_allstep_task = EmrStepSensor(
    task_id='watch_allstep',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_allstep', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

add_ca_task = MyEmrAddStepsOperator(
    task_id='add_castep',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=CA_CSV,
    retries=3,
    retry_delay=timedelta(minutes=5),
    dag=dag
)

watch_castep_task = EmrStepSensor(
"main_class": import_raw_order_status_main_class, "source_path": f"{source_datasets_location}/status.json.gz", "source_format": "json", "target_path": target_raw_datasets_location, "target_format": "parquet", }, provide_context=True, dag=dag, ) sensor_import_raw_consumer_step = EmrStepSensor( task_id="sensor_import_raw_consumer_step", job_flow_id=( """{{ task_instance.xcom_pull(task_ids='create_cluster_emr_job', key='job_flow_id') }}"""), step_id= ("""{{ task_instance.xcom_pull(task_ids='create_import_raw_consumer_dataset_step_job', key='import_raw_consumer_dataset_step') }}""" ), dag=dag, ) sensor_import_raw_order_step = EmrStepSensor( task_id="sensor_import_raw_order_step", job_flow_id= ("{{ task_instance.xcom_pull(task_ids='create_cluster_emr_job', key='job_flow_id') }}" ), step_id= ("""{{ task_instance.xcom_pull(task_ids='create_import_raw_order_dataset_step_job', key='import_raw_order_dataset_step') }}""" ),
emr_conn_id="emr_default", region_name="eu-central-1", dag=dag) submit_jobflow_steps = EmrAddStepsOperator( task_id="submit_jobflow_steps", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id='aws_credentials', steps=JOB_FLOW_STEPS, dag=dag) check_city_processing_status = EmrStepSensor( task_id="check_city_demographic_processing_step", job_flow_id= "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='submit_jobflow_steps', key='return_value')[2] }}", aws_conn_id="aws_credentials", dag=dag) check_accidents_processing_status = EmrStepSensor( task_id="check_accidents_processing_step", job_flow_id= "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='submit_jobflow_steps', key='return_value')[3] }}", aws_conn_id="aws_credentials", dag=dag) check_city_quality_processing_status = EmrStepSensor( task_id="check_city_quality_processing_status",
# Add steps to an existing EMR JobFlow
add_pipeline_to_emr_cluster_task = EmrAddStepsOperator(
    task_id='add_pipeline_to_emr_cluster',
    job_flow_id="{{ task_instance.xcom_pull('spin_up_emr_cluster', key='return_value') }}",
    steps=covid19_pipeline,
    dag=dag
)

# Wait for the pipeline step to complete
watch_pipeline_step_task = EmrStepSensor(
    task_id='watch_pipeline_step',
    job_flow_id="{{ task_instance.xcom_pull('spin_up_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_pipeline_to_emr_cluster', key='return_value')[0] }}",
    dag=dag)

# Terminate the EMR JobFlow
spin_down_emr_cluster_task = EmrTerminateJobFlowOperator(
    task_id='spin_down_emr_cluster',
    job_flow_id="{{ task_instance.xcom_pull('spin_up_emr_cluster', key='return_value') }}",
    trigger_rule="all_done",
    dag=dag
)

stop_airflow_containers_task = PythonOperator(
    dag=dag)

add_transform_step_task = EmrAddStepsOperatorV2(
    task_id='add_transform_step',
    job_flow_id="{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=TRANSFORM_IMMIGRATION_SAS_DATA,
    region_name=PARAMS['REGION'],
    dag=dag)

watch_immigration_transform_task = EmrStepSensor(
    task_id='watch_immigration_transform',
    job_flow_id="{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_transform_step', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    region_name=PARAMS['REGION'],
    dag=dag)

add_data_quality_check_task = EmrAddStepsOperatorV2(
    task_id='data_quality_check',
    job_flow_id="{{ task_instance.xcom_pull('create_immigration_job', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=DATA_QUALITY_SAS_DATA,
    region_name=PARAMS['REGION'],
    dag=dag)

watch_prev_data_check_task = EmrStepSensor(
    job_flow_overrides=default_emr_settings,
    dag=dag)

copy_python_script = EmrAddStepsOperator(
    task_id='copy_script',
    # XComs let tasks exchange messages
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=copy_script_step,
    dag=dag)

watch_prev_step_task1 = EmrStepSensor(
    task_id='watch_prev_step1',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('copy_script', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

run_spark_job = EmrAddStepsOperator(
    task_id='run_spark_job',
    # XComs let tasks exchange messages
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=run_job_step,
    dag=dag)

watch_prev_step_task2 = EmrStepSensor(
    task_id='watch_prev_step2',
job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_credentials", steps=SPARK_STEPS, params={ # these params are used to fill the paramterized values in SPARK_STEPS json "BUCKET_NAME": config['bucket_name'], "spark_script": "/scripts/etl_spark_gtrends.py", "s3_processed": "processed", }) last_step = len(SPARK_STEPS) - 1 # this value will let the sensor know the last step to watch # wait for the steps to complete step_checker = EmrStepSensor( task_id="watch_step", job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}", step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[" + str(last_step) + "] }}", aws_conn_id="aws_credentials") # Terminate the EMR cluster terminate_emr_cluster = EmrTerminateJobFlowOperator( task_id="terminate_emr_cluster", job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_credentials") ## [END EMR Spark ETL] join_before_emr>> create_emr_cluster >> step_adder
task_id="load_us_states_steps", job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_default", steps=LOAD_US_STATES_STEP, params={ # these params are used to fill the paramterized values in SPARK_STEPS json "BUCKET_NAME": BUCKET_NAME, }, dag=dag, ) # wait for the steps to complete us_states_step_checker = EmrStepSensor( task_id="us_states_step_checker", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='load_us_states_steps', key='return_value')[0] }}", aws_conn_id="aws_default", dag=dag, ) LOAD_TEMPERATURE_STEP = [ { "Name": "Load Temperature dimension", "ActionOnFailure": "CANCEL_AND_WAIT", "HadoopJarStep": { "Jar": "command-runner.jar", "Args": [ "spark-submit", "--deploy-mode",
                'Jar': 'command-runner.jar'
            }
        }
    ]
)

monitor_cluster_op = EmrJobFlowSensor(
    task_id='monitor_cluster',
    retries=0,
    aws_conn_id=get_config('emr')['aws_conn_id'],
    job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
    timeout=1800)

monitor_step_op_1 = EmrStepSensor(
    task_id='watch_step_pi',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    aws_conn_id=get_config('emr')['aws_conn_id']
)

monitor_step_op_2 = EmrStepSensor(
    task_id='watch_step_distcp',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}",
    aws_conn_id=get_config('emr')['aws_conn_id']
)

validate_path_exists = S3KeySensor(
    task_id='validate_pii_exist',
    bucket_name='{{ params.bucket_name }}',
    bucket_key='{{ params.bucket_key }}',
    wildcard_match=True)
            extract_script,
            "--source_bucket",
            econet_engineering_source_bucket,  # e.g. "s3://rheemconnectrawdata/history/"
            "--destination",
            econet_engineering_destination_bucket,  # e.g. "s3://weiyutest/"
            "--input_date",
            execution_date
        ]
    }
}],
)

watch_extract_step_task = EmrStepSensor(
    task_id='watch_extract_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('extract_step', key='return_value')[0] }}",
    aws_conn_id='aws_default',
)

connect_step_task = EmrAddStepsOperator(
    task_id='connect_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=[{
        "Name": "Step2 Merge",
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            # TODO: see https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html
    }
}]

add_step_emr = EmrAddStepsOperator(
    task_id='add_step_emr',
    dag=dag,
    job_flow_id='{{ task_instance.xcom_pull("create_emr", key="return_value") }}',
    aws_conn_id='conn_aws_id',
    steps=json_step)

check_step_emr = EmrStepSensor(
    task_id='watch_step_emr',
    dag=dag,
    job_flow_id='{{ task_instance.xcom_pull("create_emr", key="return_value") }}',
    step_id='{{ task_instance.xcom_pull("add_step_emr", key="return_value")[0] }}',
    aws_conn_id='conn_aws_id',
)

terminate_emr = EmrTerminateJobFlowOperator(
    task_id='terminate_emr',
    dag=dag,
    job_flow_id='{{ task_instance.xcom_pull("create_emr", key="return_value") }}',
    aws_conn_id='conn_aws_id')

complete_emr = DummyOperator(task_id='complete_emr', dag=dag)

file_sensor >> success_bucket >> create_emr >> add_step_emr >> check_step_emr >> terminate_emr >> complete_emr
    }
}]

dag = DAG('dag_name',
          default_args=DEFAULT_ARGS,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

parse_request = PythonOperator(task_id='parse_request',
                               provide_context=True,
                               python_callable=retrieve_s3_file,
                               dag=dag)

# Step instructions for the EMR data processing
step_adder = EmrAddStepsOperator(task_id='add_steps',
                                 job_flow_id=CLUSTER_ID,
                                 aws_conn_id='aws_default',
                                 steps=SPARK_TEST_STEPS,
                                 dag=dag)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id=CLUSTER_ID,
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

# Workflow order for the Celery workers
step_adder.set_upstream(parse_request)
step_checker.set_upstream(step_adder)
    print('nocluster:', nocluster)
    kwargs['ti'].xcom_push(key='clusterid', value=nocluster)

parse_id = PythonOperator(task_id='parse_id',
                          provide_context=True,
                          python_callable=parse_cluster_id,
                          trigger_rule='all_done')

step_adder = EmrAddStepsOperator(task_id='step_adder',
                                 job_flow_id=CLUSTER_ID,
                                 aws_conn_id='aws_default',
                                 steps=SPARK_TEST_STEPS)

step_checker = EmrStepSensor(
    task_id='step_checker',
    job_flow_id=CLUSTER_ID,
    step_id="{{ task_instance.xcom_pull('step_adder', key='return_value')[0] }}",
    aws_conn_id='aws_default')

cluster_terminator = EmrTerminateJobFlowOperator(
    task_id='cluster_terminator',
    job_flow_id=CLUSTER_ID,
    aws_conn_id='aws_default')

end = DummyOperator(task_id='end')

parse_request >> cluster_checker
cluster_checker >> cluster_creator >> parse_id
cluster_checker >> parse_id
parse_id >> step_adder >> step_checker >> cluster_terminator >> end
with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(minutes=15),
    start_date=days_ago(1),
    schedule_interval='@once',
    tags=['emr'],
) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        emr_conn_id='aws_default',
        job_flow_overrides=JOB_FLOW_OVERRIDES
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )

    cluster_creator >> step_adder >> step_checker
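The examples in this collection assume the EMR operator and sensor imports are already in scope. A sketch of where they live, assuming Airflow 2.x with the apache-airflow-providers-amazon package installed; the commented lines show the deprecated Airflow 1.x contrib paths used by the older snippets:

# Airflow 2.x with the Amazon provider installed:
from airflow.providers.amazon.aws.operators.emr import (
    EmrAddStepsOperator,
    EmrCreateJobFlowOperator,
    EmrTerminateJobFlowOperator,
)
from airflow.providers.amazon.aws.sensors.emr import EmrJobFlowSensor, EmrStepSensor

# Airflow 1.x equivalents (deprecated contrib paths):
# from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
# from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor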
"scripts_path_key": scripts_path_key + "/", "data_path_key": data_path_key, "processed_tables_key": processed_tables_key }, dag=dag, ) last_step = len( SPARK_STEPS ) - 1 # this value will let the sensor know the last step to watch # wait for the steps to complete step_checker = EmrStepSensor( task_id="watch_step", job_flow_id= "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}", step_id= "{{ task_instance.xcom_pull(task_ids='step_one', key='return_value')[" + str(last_step) + "] }}", aws_conn_id="aws_default", dag=dag, ) #Terminate the EMR cluster terminate_emr_cluster = EmrTerminateJobFlowOperator( task_id="terminate_emr_cluster", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}", aws_conn_id="aws_default", dag=dag, ) end_data_pipeline = DummyOperator(task_id="end_data_pipeline", dag=dag)
         dagrun_timeout=timedelta(hours=2),
         schedule_interval=None) as dag:
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default')

    step_adder = EmrAddStepsOperator(
        task_id='movie_analytics_job',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='wait_for_analytics_completion',
        job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='movie_analytics_job', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id='aws_default')

    cluster_creator >> step_adder >> step_checker >> cluster_remover
    dag=dag)

add_jobflow_steps = EmrAddStepsOperator(
    task_id='Add_jobflow_steps',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='Create_EMR_cluster', key='return_value') }}",
    aws_conn_id='aws_credentials',
    region_name='us-west-2',
    steps=SPARK_ETL_STEPS,
    dag=dag)

trip_processing = EmrStepSensor(
    task_id='trip_processing_step',
    job_flow_id="{{ task_instance.xcom_pull('Create_EMR_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='Add_jobflow_steps', key='return_value')[2] }}",
    aws_conn_id='aws_credentials',
    region_name='us-west-2',
    dag=dag)

station_processing = EmrStepSensor(
    task_id='station_processing_step',
    job_flow_id="{{ task_instance.xcom_pull('Create_EMR_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='Add_jobflow_steps', key='return_value')[3] }}",
    aws_conn_id='aws_credentials',
    region_name='us-west-2',
    dag=dag)