def test_execute_calls_with_the_job_flow_id_until_it_reaches_a_terminal_state(self):
    """The sensor keeps polling describe_cluster for the configured job
    flow id and stops once the cluster reports a terminal state.
    """
    with patch('boto3.session.Session', self.boto3_session_mock):
        sensor = EmrJobFlowSensor(
            task_id='test_task',
            poke_interval=2,
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default',
        )
        sensor.execute(None)

        # Exactly two polls: one non-terminal response, then the terminal one.
        self.assertEqual(self.mock_emr_client.describe_cluster.call_count, 2)
        # The poll targeted the cluster the sensor was configured with.
        self.mock_emr_client.describe_cluster.assert_called_with(
            ClusterId='j-8989898989')
def create_dag():
    """Build the manual-steps EMR DAG.

    Pipeline: create a cluster, add a SparkPi step, watch the step until it
    finishes, then terminate the cluster. A parallel branch watches the
    cluster itself and triggers a failure handler if anything fails.

    Returns:
        The configured ``DAG`` instance.
    """
    # Single SparkPi step; the cluster is torn down if the step fails.
    calculate_pi_step = {
        'Name': 'calculate_pi',
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': 's3://psm-poc-dmp-temp/spark-examples.jar',
            'Args': ['10'],
            'MainClass': 'org.apache.spark.examples.SparkPi'
        }
    }

    with DAG(dag_id='emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:
        create_cluster = EmrCreateJobFlowOperator(
            task_id='create_cluster',
            job_flow_overrides={'Name': CLUSTER_NAME},
            aws_conn_id=AWS_CONN_ID,
            emr_conn_id=EMR_CONN_ID)

        # The cluster id is published to XCom by the create task.
        add_steps = TemplatedEmrAddStepsOperator(
            task_id='add_steps',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID,
            steps=[calculate_pi_step])

        monitor_cluster = EmrJobFlowSensor(
            task_id='monitor_cluster',
            retries=0,
            aws_conn_id=AWS_CONN_ID,
            job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
            timeout=1800)

        watch_step = EmrStepSensor(
            task_id='watch_step',
            job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
            step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
            aws_conn_id=AWS_CONN_ID)

        remove_cluster = EmrTerminateJobFlowOperator(
            task_id='remove_cluster',
            job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
            aws_conn_id=AWS_CONN_ID)

        # Runs when any upstream task in its branch fails.
        on_failure = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=trigger_rule.TriggerRule.ONE_FAILED)

        # Failure branch watches the cluster; the happy path runs the step
        # and then tears the cluster down.
        create_cluster >> monitor_cluster >> on_failure
        create_cluster >> add_steps >> watch_step >> remove_cluster

    return dag
def get_job_sensor(timeout, job_flow_name, aws_conn_id):
    """Return an ``EmrJobFlowSensor`` watching the flow created for *job_flow_name*.

    The sensor pulls the job flow id from the creator task's XCom and runs
    in ``reschedule`` mode so it does not occupy a worker slot while waiting.
    """
    creator = _get_job_flow_creator_task_id(job_flow_name)
    return EmrJobFlowSensor(
        task_id=f"check_{job_flow_name}",
        timeout=timeout,
        mode="reschedule",
        retries=0,
        job_flow_id=_get_task_return_value_template(creator),
        aws_conn_id=aws_conn_id,
    )
def test_execute_calls_with_the_job_flow_id_until_it_reaches_failed_state_with_exception(self):
    """The sensor raises AirflowException when the cluster terminates with
    errors, after polling describe_cluster until that terminal state.
    """
    self.mock_emr_client.describe_cluster.side_effect = [
        DESCRIBE_CLUSTER_RUNNING_RETURN,
        DESCRIBE_CLUSTER_TERMINATED_WITH_ERRORS_RETURN
    ]
    with patch('boto3.session.Session', self.boto3_session_mock):
        operator = EmrJobFlowSensor(
            task_id='test_task',
            poke_interval=2,
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default'
        )

        with self.assertRaises(AirflowException):
            operator.execute(None)

            # make sure we called twice
            self.assertEqual(self.mock_emr_client.describe_cluster.call_count, 2)

            # make sure it was called with the job_flow_id.
            # BUG FIX: the original used assert_called_once_with, which
            # always fails here because the mock is called twice (as the
            # call_count assertion above itself verifies).
            self.mock_emr_client.describe_cluster.assert_called_with(
                ClusterId='j-8989898989')
def test_execute_calls_with_the_job_flow_id_until_it_reaches_a_terminal_state(self):
    """The sensor polls describe_cluster with the configured cluster id on
    every poke and stops once a terminal state is returned.
    """
    # First poll sees a running cluster, second sees it terminated.
    self.mock_emr_client.describe_cluster.side_effect = [
        DESCRIBE_CLUSTER_RUNNING_RETURN,
        DESCRIBE_CLUSTER_TERMINATED_RETURN
    ]
    with patch('boto3.session.Session', self.boto3_session_mock):
        sensor = EmrJobFlowSensor(
            task_id='test_task',
            poke_interval=2,
            job_flow_id='j-8989898989',
            aws_conn_id='aws_default')
        sensor.execute(None)

        # Exactly two polls were needed to reach the terminal state.
        self.assertEqual(self.mock_emr_client.describe_cluster.call_count, 2)

        # Both polls used the configured job flow id.
        expected_call = unittest.mock.call(ClusterId='j-8989898989')
        self.mock_emr_client.describe_cluster.assert_has_calls(
            [expected_call, expected_call])
# Daily (03:00) DAG that runs the BLP steps on an ephemeral EMR cluster.
blp_dag = DAG('mango_log_processing_adi',
              default_args=DEFAULT_ARGS,
              dagrun_timeout=timedelta(hours=6),
              schedule_interval='0 3 * * *')

# Create the EMR job flow carrying the BLP steps.
blp_logs = EmrCreateJobFlowOperator(task_id='blp_create_job_flow',
                                    job_flow_overrides={'Steps': BLP_STEPS},
                                    aws_conn_id='aws_data_iam',
                                    emr_conn_id='emr_data_iam_mango',
                                    dag=blp_dag)

# Wait for the flow to finish. On retry, clear this DAG run (same
# execution date for start and end) so the job flow is re-created from
# scratch instead of re-polling a dead cluster.
blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=blp_dag,
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

# GCS transfer job settings: only objects under blpadi/<ds>; objects that
# exist only in the sink are deleted.
gcstj_object_conditions = {'includePrefixes': 'blpadi/{{ ds }}'}
gcstj_transfer_options = {'deleteObjectsUniqueInSink': True}

# NOTE(review): this list continues beyond the visible chunk.
bq_args = [
    'bq',
    '--location=US',
# Tail of the SPARK_TEST_STEPS list (the list opens earlier in the file,
# outside this chunk): a single SparkPi example step.
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10']
    }
}]

# Cluster definition that carries the steps above.
JOB_FLOW_OVERRIDES = {'Name': 'PiCalc', 'Steps': SPARK_TEST_STEPS}

# Daily (03:00) DAG: create the job flow, then wait for it to finish.
dag = DAG('emr_job_flow_automatic_steps_dag',
          default_args=DEFAULT_ARGS,
          dagrun_timeout=timedelta(hours=2),
          schedule_interval='0 3 * * *')

job_flow_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)

# The creator pushes the job flow id to XCom; the sensor pulls it here.
job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

job_flow_creator.set_downstream(job_sensor)
    # Tail of the DEFAULT_ARGS dict (opens earlier in the file).
    'email_on_retry': True
}

# Run-once DAG: dummy start -> create EMR cluster -> wait for it -> dummy end.
with DAG(
    dag_id='flight_delays_emr',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=1),
    schedule_interval='@once',
) as dag:
    # Markers for the start and end of the pipeline.
    start_operator = DummyOperator(task_id='begin_execution', dag=dag)
    end_operator = DummyOperator(task_id='stop_execution', dag=dag)

    # The cluster definition lives in a JSON file next to the DAG.
    with open('emr_job_flow.json', 'r') as fp:
        job_flow = json.load(fp)

    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=job_flow,
        aws_conn_id='aws_credentials',
        emr_conn_id='emr_default'
    )

    # Blocks until the created job flow reaches a terminal state; the flow
    # id comes from the creator task's XCom.
    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_credentials'
    )

    # define the DAG structure, in terms of the created operators
    start_operator >> cluster_creator >> job_sensor >> end_operator
from airflow.utils.dates import days_ago
from emr_job_flow_with_sensor import EmrJobFlowWithSensor
from emr_step_with_sensor import EmrStepWithSensor

# Job flow / step configuration, as described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html#EMR.Client.run_job_flow
step_conf = {}
job_conf = {}

dag = DAG(
    dag_id='spark_job',
    default_args={'owner': 'airflow', 'start_date': days_ago(1)}
)

# Bundle the create-job-flow operator with a sensor so that the pair is
# retried together as a single task.
job = EmrJobFlowWithSensor(
    task_id='job_and_retry',
    job_flow=EmrCreateJobFlowOperator(task_id='job',
                                      job_flow_overrides=job_conf),
    sensor=EmrJobFlowSensor(task_id='sensor', job_flow_id=''),
    dag=dag
)
    # Tail of the DAG(...) constructor call (opens earlier in the file).
    schedule_interval='0 3 * * *'
)

# Create the EMR job flow carrying the BLP steps.
blp_logs = EmrCreateJobFlowOperator(
    task_id='blp_create_job_flow',
    job_flow_overrides={'Steps': BLP_STEPS},
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    dag=blp_dag
)

# Wait for the flow to finish. On retry, clear this DAG run (same
# execution date for start and end) so the flow is re-created from scratch.
blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
    dag=blp_dag,
    on_retry_callback=lambda context: blp_dag.clear(
        start_date=context['execution_date'],
        end_date=context['execution_date']),
)

gcp_conn_id = "google_cloud_derived_datasets"
connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

# GCS transfer job settings: only objects under blpadi/<ds>.
gcstj_object_conditions = {
    'includePrefixes': 'blpadi/{{ ds }}'
}
# Objects that exist only in the sink are deleted by the transfer.
gcstj_transfer_options = {
    'deleteObjectsUniqueInSink': True
}
            # Tail of a steps list passed to an add-steps call that opens
            # earlier in the file: an s3-dist-cp copy step whose source and
            # target are templated params.
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Args': [
                    's3-dist-cp',
                    '--src={{ params.dist_cp_src }}',
                    '--dest={{ params.dist_cp_target }}'
                ],
                'Jar': 'command-runner.jar'
            }
        }
    ]
)

# Wait (up to 30 minutes) for the whole job flow to finish.
monitor_cluster_op = EmrJobFlowSensor(
    task_id='monitor_cluster',
    retries=0,
    aws_conn_id=get_config('emr')['aws_conn_id'],
    job_flow_id='{{ task_instance.xcom_pull("create_cluster", key="return_value") }}',
    timeout=1800)

# Watch the first step added by 'add_steps' (step ids come from its XCom).
monitor_step_op_1 = EmrStepSensor(
    task_id='watch_step_pi',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    aws_conn_id=get_config('emr')['aws_conn_id']
)

# Watch the second step (the s3-dist-cp copy).
monitor_step_op_2 = EmrStepSensor(
    task_id='watch_step_distcp',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[1] }}",
    aws_conn_id=get_config('emr')['aws_conn_id']
    # NOTE(review): this call continues past the visible chunk (closing
    # parenthesis not in view).
        # Tail of an op_kwargs dict for an upload task whose call opens
        # earlier in the file.
        PATH_METEO_SPARK],
        'bucket_name': BUCKET_NAME,
    },
    dag=my_dag)

# launch ephemere EMR task to process data uploaded
precompute_data_meteo_emr_task = EmrCreateJobFlowOperator(
    task_id='precompute_data_meteo_emr',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
)

# wait for EMR end (job flow id pulled from the creator task's XCom)
emr_job_sensor = EmrJobFlowSensor(
    task_id='check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull(" + \
        "task_ids='precompute_data_meteo_emr', key='return_value') }}",
    aws_conn_id='aws_default',
)

# retrieve data processed
download_files_from_S3_task = PythonOperator(
    task_id='download_from_S3_meteo',
    python_callable=download_files_from_S3,
    op_kwargs={
        'filenames': [PATH_DF_METEO_FR],
        'bucket_name': BUCKET_NAME,
    },
    dag=my_dag)

prepare_features_task = PythonOperator(
    task_id='prepare_features',
    # NOTE(review): this call continues past the visible chunk.
# }, # ], } run_emr_create_job_flow_task = EmrCreateJobFlowOperator( task_id=emr_create_cluster_task_id, aws_conn_id=AWS_CREDENTIALS_EMR_ID, emr_conn_id="emr_default", region_name="us-west-2", # Remove deprecated job_flow_overrides=JOB_FLOW_OVERRIDES, do_xcom_push=True, # Remove deprecated dag=dag, ) emr_job_sensor = EmrJobFlowSensor( task_id='check_job_flow', job_flow_id=f"{{{{ task_instance.xcom_pull(task_ids='{emr_create_cluster_task_id}', key='return_value') }}}}", # step_id="{{ task_instance.xcom_pull(task_ids='TASK_TO_WATCH', key='return_value')[0] }}", # Here gos an EmrAddStepsOperator's id aws_conn_id=AWS_CREDENTIALS_EMR_ID, dag=dag, ) # SPARK_STEPS = [{ # 'Name': 'test step', # 'ActionOnFailure': 'CONTINUE', # 'HadoopJarStep': { # 'Jar': 'command-runner.jar', # 'Args': [ # 'spark-submit', '--deploy-mode', 'cluster', '--class', 'com.naturalint.data.spark.api.scala.NiSparkAppMain', 's3://ni-data-infra/jars/feeder-factorization-etl-1.0-SNAPSHOT.jar', '--ni-main-class', 'com.naturalint.data.etl.feeder.FactorizationEtl', '--job-id', '133', '--config-file', 's3://galactic-feeder-staging/factorization_input/133.json', '--raw-conversions-table', 'galactic_feeder_staging.conversions_raw', '--previous-runs-table', 'galactic_feeder_staging.factorization_output_partitions', '--parquet-output-location', 's3://galactic-feeder-staging/factorization_output_partitions', '--csv-output-location', 's3://galactic-feeder-staging/output' # ] # } # }] # add_emr_step_task = EmrAddStepsOperator(
    # Tail of the DAG(...) constructor call (opens earlier in the file).
    schedule_interval='@daily',
)

# Create the EMR job flow for the "pinkman" job.
start_pinkman = EmrCreateJobFlowOperator(
    task_id='start_pinkman',
    job_flow_overrides=PINKMAN_JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    region_name='eu-central-1',
    dag=dag,
)

# Wait for the pinkman job flow; its id is pulled from the creator's XCom.
check_pinkman_result = EmrJobFlowSensor(
    task_id='check_pinkman_result',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='start_pinkman', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag,
)

# AWS Batch job; queue and job definition come from the environment.
run_walter_white = AWSBatchOperator(
    task_id='run_walter-white',
    job_name='walter-white',
    job_queue=os.getenv('COMPUTE_ENVIRONMENT_JOB_QUEUE'),
    job_definition=os.getenv('WALTER_WHITE_JOB_DEFINITION'),
    aws_conn_id='aws_default',
    region_name='eu-central-1',
    overrides={
        'environment': [
            {
                'name': 'MLFLOW_TRACKING_URI',
                # NOTE(review): this literal continues past the visible chunk.