def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()

    self.operator = EmrCreateJobFlowOperator(
        task_id='test_task',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        job_flow_overrides=self._config,
        region_name='ap-southeast-2',
        dag=DAG('test_dag_id', default_args=args),
    )
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}

    # Mock out the emr_client (moto has incorrect response)
    self.emr_client_mock = MagicMock()

    self.operator = EmrCreateJobFlowOperator(
        task_id='test_task',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
        region_name='ap-southeast-2',
        dag=DAG(
            'test_dag_id',
            default_args=args,
            template_searchpath=TEMPLATE_SEARCHPATH,
            template_undefined=StrictUndefined,
        ),
    )
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}

with DAG(
    dag_id='emr_job_flow_manual_steps_dag',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval='0 3 * * *',
    tags=['example'],
) as dag:
    # [START howto_operator_emr_manual_steps_tasks]
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
    )
    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
        steps=SPARK_STEPS,
    )
    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default',
    )
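In the full version of this manual-steps example, the pipeline typically ends by tearing the cluster back down and chaining the tasks. A minimal sketch of that tail, assuming EmrTerminateJobFlowOperator is imported (the task id is illustrative):

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',  # illustrative task id
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default',
    )

    cluster_creator >> step_adder >> step_checker >> cluster_remover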
with DAG( dag_id=DAG_ID, description="Analyze Bakery Sales with Amazon EMR", default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=2), start_date=days_ago(1), schedule_interval=None, tags=["emr demo", "spark", "pyspark"], ) as dag: begin = DummyOperator(task_id="begin") end = DummyOperator(task_id="end") cluster_creator = EmrCreateJobFlowOperator( task_id="create_job_flow", job_flow_overrides=JOB_FLOW_OVERRIDES ) step_adder = EmrAddStepsOperator( task_id="add_steps", job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", aws_conn_id="aws_default", steps=SPARK_STEPS, ) step_checker = EmrStepSensor( task_id="watch_step", job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}", step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", aws_conn_id="aws_default", )
    return [spark_step]

with DAG(
    'transform_raw_patents',
    default_args=default_args,
    schedule_interval='@quarterly',
    catchup=False,
) as dag:
    # create non-entity-specific tasks
    start_dag = DummyOperator(
        task_id='start_dag',
    )

    create_emr_cluster = EmrCreateJobFlowOperator(
        task_id='create_emr_cluster',
        aws_conn_id='aws_default',
        emr_conn_id='emr_default',
    )
    job_flow_id = "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}"

    terminate_emr_cluster = EmrTerminateJobFlowOperator(
        task_id='terminate_emr_cluster',
        job_flow_id=job_flow_id,
        aws_conn_id='aws_default',
    )

    # create_emr_cluster = DummyOperator(
    #     task_id='create_emr_cluster',
    # )
    # job_flow_id = 'j-2NFK2ZRYYVS71'
with DAG(
    'etl',
    default_args=default_args,
    description='ETL',
    catchup=False,
    start_date=datetime.datetime(2020, 1, 1),
    schedule_interval=None,
    tags=['udacity', 'etl'],
    default_view="graph",
) as dag:
    # generate dag documentation
    dag.doc_md = __doc__

    create_cluster = EmrCreateJobFlowOperator(
        dag=dag,
        task_id="create_cluster",
        job_flow_overrides=job_flow_overrides,
        aws_conn_id=aws_conn_id,
    )
    wait_cluster_completion = EmrJobFlowSensor(
        task_id='wait_cluster_completion',
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        target_states=["RUNNING", "WAITING"],
        dag=dag,
    )
    terminate_cluster = EmrTerminateJobFlowOperator(
        task_id="terminate_cluster",
        trigger_rule="all_done",
        job_flow_id=cluster_id,
        aws_conn_id=aws_conn_id,
        dag=dag,
    )
with DAG(
    'emr_dag',
    default_args=default_args,
    description='ETL for ESG analytics',
    catchup=False,
    schedule_interval=timedelta(days=1),
) as dag:
    join_before_emr = DummyOperator(task_id='start_emr')
    end_data_pipeline = DummyOperator(task_id='ETL_DONE')

    ## [START EMR Spark ETL]
    # Create an EMR cluster
    create_emr_cluster = EmrCreateJobFlowOperator(
        task_id="create_emr_cluster",
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id="aws_credentials",
        emr_conn_id="emr_default",
    )
    # Add steps to the EMR cluster
    step_adder = EmrAddStepsOperator(
        task_id="add_steps",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        aws_conn_id="aws_credentials",
        steps=SPARK_STEPS,
        params={
            # these params are used to fill the parameterized values in the SPARK_STEPS json
            "BUCKET_NAME": config['bucket_name'],
            "spark_script": "/scripts/etl_spark_gtrends.py",
            "s3_processed": "processed",
        },
    )
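The steps argument of EmrAddStepsOperator is a templated field, so the params dict above is substituted into Jinja placeholders inside the step definitions at render time. A minimal sketch of what such a parameterized SPARK_STEPS could look like; the step name and spark-submit arguments are assumptions for illustration, not taken from this DAG:

SPARK_STEPS = [
    {
        "Name": "Run Spark ETL",  # illustrative step name
        "ActionOnFailure": "CANCEL_AND_WAIT",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": [
                "spark-submit",
                # rendered from the params dict passed to EmrAddStepsOperator
                "s3://{{ params.BUCKET_NAME }}{{ params.spark_script }}",
                "--output", "s3://{{ params.BUCKET_NAME }}/{{ params.s3_processed }}",
            ],
        },
    },
]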
"local_file": "some_file_path" }) check_data_exists_task = PythonOperator( task_id='check_data_exists', python_callable=utils_airflow.check_data_exists, op_kwargs={ "aws_conn": "my_aws_conn", "bucket": "my_bucket", "prefix": "file_name" }, provide_context=False) create_job_flow_task = EmrCreateJobFlowOperator( task_id='create_job_flow', aws_conn_id='aws_default', emr_conn_id='emr_default', job_flow_overrides=cluster_conf) add_step_task = EmrAddStepsOperator( task_id='My_first_job', job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", aws_conn_id='my_aws_conn', steps=my_first_job) watch_prev_step_task = EmrStepSensor( task_id='watch_prev_step', job_flow_id= "{{task_instance.xcom_pull(task_ids='create_job_flow', key='return_value')}}", step_id=
)

sensor_2 = EmrStepSensor(
    task_id='sensor_daily_parse_message',
    job_flow_id="{{ dag_run.conf['cluster_id'] }}",
    step_id="{{ task_instance.xcom_pull('step_2', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag,
)

step_1 >> sensor_1 >> step_2 >> sensor_2

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_cluster',
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    email_on_failure=True,
    dag=dag,
)

def get_spot_instance_fleet_id(cluster_id):
    # Look up the TASK instance fleet on the given EMR cluster
    emr_client = boto3.client('emr', region_name=Variable.get('aws_default_region'))
    response = emr_client.list_instance_fleets(ClusterId=cluster_id)
    for instance_fleet in response['InstanceFleets']:
        if instance_fleet["InstanceFleetType"] == "TASK":
            return instance_fleet["Id"]

def dagrun_trigger(context, dag_run_obj):
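A sketch of how get_spot_instance_fleet_id might be invoked from this DAG via a PythonOperator; the task id and the op_args value are assumptions, mirroring how the sensors above receive the cluster id through dag_run.conf:

get_task_fleet_id = PythonOperator(
    task_id='get_spot_instance_fleet_id',  # hypothetical task id
    python_callable=get_spot_instance_fleet_id,
    # op_args is templated, so the cluster id can come from dag_run.conf
    op_args=["{{ dag_run.conf['cluster_id'] }}"],
    dag=dag,
)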
with DAG( dag_id="test_emr_transient_cluster", default_args={ "owner": "Guido Kosloff", "depends_on_past": False, "email_on_failure": False, "email_on_retry": False, }, start_date=datetime(2021, 1, 1, 0, 0), schedule_interval="@daily", catchup=False, ) as dag: create_cluster = EmrCreateJobFlowOperator( task_id="create_cluster", job_flow_overrides=EMRClusterConfig.JOB_FLOW_OVERRIDES, aws_conn_id="aws_default", emr_conn_id="emr_default", ) add_step_load_raw_data = EmrAddStepsOperator( task_id="add_step_load_raw_data", job_flow_id=create_cluster.output, aws_conn_id="aws_default", steps=SparkSteps.LOAD_RAW_DATA, ) wait_for_step_load_raw_data = EmrStepSensor( task_id="wait_for_step_load_raw_data", job_flow_id=create_cluster.output, step_id="{{ task_instance.xcom_pull(task_ids='add_step_load_raw_data', key='return_value')[0] }}", aws_conn_id="aws_default",
with DAG( dag_id=DAG_ID, description="Run multiple Spark jobs with Amazon EMR", default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=2), start_date=days_ago(1), schedule_interval=None, tags=["emr demo", "spark", "pyspark"], ) as dag: begin = DummyOperator(task_id="begin") end = DummyOperator(task_id="end") cluster_creator = EmrCreateJobFlowOperator( task_id="create_job_flow", job_flow_overrides=get_object( "job_flow_overrides/job_flow_overrides.json", work_bucket), ) step_adder = EmrAddStepsOperator( task_id="add_steps", job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", aws_conn_id="aws_default", steps=get_object("emr_steps/emr_steps.json", work_bucket), ) step_checker = EmrStepSensor( task_id="watch_step", job_flow_id= "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        'TerminationProtected': False,
    },
    'Steps': SPARK_STEPS,
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
# [END howto_operator_emr_automatic_steps_config]

with DAG(
    dag_id='emr_job_flow_automatic_steps_dag',
    dagrun_timeout=timedelta(hours=2),
    start_date=datetime(2021, 1, 1),
    schedule_interval='0 3 * * *',
    catchup=False,
    tags=['example'],
) as dag:
    # [START howto_operator_emr_automatic_steps_tasks]
    job_flow_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
    )
    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        job_flow_id=job_flow_creator.output,
    )
    # [END howto_operator_emr_automatic_steps_tasks]

    # Task dependency created via `XComArgs`:
    #   job_flow_creator >> job_sensor
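The job_flow_creator.output XComArg above resolves to the operator's return-value XCom and wires the task dependency automatically. A sketch of the equivalent explicit form it replaces, where the dependency must be declared by hand:

    job_sensor = EmrJobFlowSensor(
        task_id='check_job_flow',
        # equivalent explicit pull of the create_job_flow return value
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    )

    job_flow_creator >> job_sensor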
"Jar": "command-runner.jar", "Args": ["bash", "/home/hadoop/emr_setup.sh"] } }] dag = DAG( 'sec-pipeline', default_args=default_args, description='DAG test', schedule_interval=timedelta(days=1), ) create_emr_cluster = EmrCreateJobFlowOperator( task_id='create_job_flow', aws_conn_id='aws_default', emr_conn_id='emr_default', region_name='us-east-2', job_flow_overrides=JOB_FLOW_OVERRIDES, dag=dag) emr_step_1 = EmrAddStepsOperator( task_id='emr_step1', job_flow_id= "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", aws_conn_id='aws_default', steps=step1, dag=dag) emr_step_2 = EmrAddStepsOperator( task_id='emr_step2', job_flow_id=