    def test_execute_terminates_the_job_flow_and_does_not_error(self):
        # Patch boto3 so no real AWS call is made; the client mock comes from the test fixture.
        with patch('boto3.client', self.boto3_client_mock):

            operator = EmrTerminateJobFlowOperator(
                task_id='test_task',
                job_flow_id='j-8989898989',
                aws_conn_id='aws_default'
            )

            # execute() should terminate the mocked job flow without raising.
            operator.execute(None)
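The method above references a self.boto3_client_mock fixture that this fragment does not define. A minimal sketch of the surrounding unittest scaffolding, assuming a stubbed successful TerminateJobFlows response (the names and return-value shape here are assumptions, not copied from the original test), might look like this:

import unittest
from unittest.mock import MagicMock, patch

from airflow.contrib.operators.emr_terminate_job_flow_operator import EmrTerminateJobFlowOperator

# Assumed stub: a successful TerminateJobFlows call reports HTTP 200.
TERMINATE_SUCCESS_RETURN = {'ResponseMetadata': {'HTTPStatusCode': 200}}


class TestEmrTerminateJobFlowOperator(unittest.TestCase):
    def setUp(self):
        # EMR client mock whose terminate_job_flows call succeeds.
        emr_client_mock = MagicMock()
        emr_client_mock.terminate_job_flows.return_value = TERMINATE_SUCCESS_RETURN
        # boto3.client(...) is patched to hand back this mock in the test method above.
        self.boto3_client_mock = MagicMock(return_value=emr_client_mock)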
        "s3_clean": s3_clean,
    },
    dag=dag,
)

last_step = len(SPARK_STEPS) - 1
step_checker = EmrStepSensor(
    task_id="watch_step",
    job_flow_id=
    "{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[" +
    str(last_step) + "] }}",
    aws_conn_id="aws_default",
    dag=dag,
)

terminate_emr_cluster = EmrTerminateJobFlowOperator(
    task_id="terminate_emr_cluster",
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
    aws_conn_id="aws_default",
    dag=dag,
)

end_data_pipeline = DummyOperator(task_id="end_data_pipeline", dag=dag)

start_data_pipeline >> create_emr_cluster
create_emr_cluster >> step_adder >> step_checker >> terminate_emr_cluster
terminate_emr_cluster >> end_data_pipeline
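The SPARK_STEPS list referenced above (and indexed by last_step) is not shown in this snippet; steps passed to EmrAddStepsOperator follow the EMR AddJobFlowSteps request shape. A hypothetical single-step value, with a placeholder script location rather than the original's, could look like:

SPARK_STEPS = [
    {
        'Name': 'run_spark_job',
        'ActionOnFailure': 'CANCEL_AND_WAIT',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': [
                'spark-submit',
                '--deploy-mode', 'cluster',
                's3://example-bucket/scripts/spark_job.py',  # placeholder script path
            ],
        },
    },
]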
    Variable.get(DAG_NAME + "." + API_PARAMS_OVERRIDE_VAR_KEY,
                 deserialize_json=True,
                 default_var={}))

# Create a cluster, wait until it reaches the 'Waiting' state, then save a new connection to the Livy service
create_cluster = ExtendedEmrCreateJobFlowOperator(
    task_id='create_cluster',
    aws_conn_id='aws_default',
    api_params=api_params,
    wait_for_status='WAITING',
    save_livy_connection_name='Daily-Livy-Spark',
    dag=dag)

# Run the LivySparkOperator ( https://github.com/rssanders3/airflow-spark-operator-plugin )
spark_job = LivySparkOperator(task_id='spark_job',
                              spark_script='example_pyspark_script.py',
                              http_conn_id='Daily-Livy-Spark',
                              session_kind='pyspark',
                              dag=dag)

# Terminate the cluster using the built-in operator
terminate_cluster = EmrTerminateJobFlowOperator(
    task_id='terminate_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_cluster') }}",
    aws_conn_id='aws_default',
    dag=dag)

# Configure dependencies
create_cluster >> spark_job
spark_job >> terminate_cluster
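ExtendedEmrCreateJobFlowOperator is a custom operator whose source is not included here. A minimal sketch of how such an operator could wait for a cluster state and register a Livy connection, using the Airflow 1.x contrib imports this example already relies on (the polling loop, the api_params mapping, and the connection details are assumptions, not the original implementation):

import time

from airflow import settings
from airflow.contrib.hooks.emr_hook import EmrHook
from airflow.contrib.operators.emr_create_job_flow_operator import EmrCreateJobFlowOperator
from airflow.models import Connection


class ExtendedEmrCreateJobFlowOperator(EmrCreateJobFlowOperator):
    """Create a job flow, wait for a target state, then save an HTTP connection to Livy."""

    def __init__(self, api_params=None, wait_for_status='WAITING',
                 save_livy_connection_name=None, *args, **kwargs):
        # Assumed mapping: api_params plays the role of the contrib operator's job_flow_overrides.
        kwargs['job_flow_overrides'] = api_params
        super(ExtendedEmrCreateJobFlowOperator, self).__init__(*args, **kwargs)
        self.wait_for_status = wait_for_status
        self.save_livy_connection_name = save_livy_connection_name

    def execute(self, context):
        job_flow_id = super(ExtendedEmrCreateJobFlowOperator, self).execute(context)
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        # Poll the cluster until it reaches the requested state (e.g. 'WAITING').
        while True:
            cluster = emr.describe_cluster(ClusterId=job_flow_id)['Cluster']
            if cluster['Status']['State'] == self.wait_for_status:
                break
            time.sleep(60)

        # Register an HTTP connection pointing at Livy (port 8998) on the master node.
        if self.save_livy_connection_name:
            session = settings.Session()
            session.query(Connection).filter(
                Connection.conn_id == self.save_livy_connection_name).delete()
            session.add(Connection(conn_id=self.save_livy_connection_name,
                                   conn_type='http',
                                   host=cluster['MasterPublicDnsName'],
                                   port=8998))
            session.commit()
            session.close()

        return job_flow_id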
Example #4
                           options={},
                           branch="backfill",
                           other={"DO_SUBMIT": "False"})),
    },
    dag=dag)

cluster_start_sensor_task = MozEmrClusterStartSensor(
    task_id="wait_for_cluster",
    timeout=timedelta(hours=1).total_seconds(),
    job_flow_id=job_flow_id_template,
    dag=dag)

terminate_job_flow_task = EmrTerminateJobFlowOperator(
    task_id="terminate_backfill_cluster",
    aws_conn_id='aws_default',
    execution_timeout=timedelta(minutes=10),
    job_flow_id=job_flow_id_template,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

job_flow_termination_sensor_task = MozEmrClusterEndSensor(
    task_id="cluster_termination_sensor",
    timeout=timedelta(hours=1).total_seconds(),
    job_flow_id=job_flow_id_template,
    dag=dag)

cluster_start_sensor_task.set_upstream(create_job_flow_task)

upstream = cluster_start_sensor_task
for day in range(7):
    task_id = "main_summary_day_{}".format(day)
    dag=dag)

watch_checkstep_task = EmrStepSensor(
    task_id='watch_checkstep',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull('add_checkstep', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id=
    "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    trigger_rule='all_done',  # Shut the cluster down regardless of upstream success
    retries=3,
    retry_delay=timedelta(minutes=5),
    dag=dag)


def pause(minutes):
    '''Sleep for the given number of minutes'''
    time.sleep(minutes * 60)


pause_task = PythonOperator(
    # Catching up on past dates can exceed EC2 quotas, so pause to allow clusters to terminate
    task_id='pause_for_termination',
    python_callable=pause,
Example #6
    step_id="{{ task_instance.xcom_pull(task_ids='Configuration_of_EMR', key='return_value')[5] }}",
    aws_conn_id='aws_credentials',
    dag=dag
)

step_check = EmrStepSensor(
    task_id='Check_Steps',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='Create_EMR_Instance', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='Configuration_of_EMR', key='return_value')[-1] }}",
    aws_conn_id='aws_credentials',
    dag=dag
)

end_emr = EmrTerminateJobFlowOperator(
    task_id='Shutdown_EMR',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='Create_EMR_Instance', key='return_value') }}",
    aws_conn_id='aws_credentials',
    dag=dag
)

check_upload = DataLakeUpload(
    task_id='Checking_Upload',
    aws_credentials_id='aws_credentials',
    s3_bucket=bucketname.bucket,
    path='processed/',
    keys=['weather.csv/', 'airport.csv/', 'demographics.csv/', 'cleaned_immigration/'],
    dag=dag
)


end_operator = DummyOperator(task_id='Stop_execution', dag=dag)
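DataLakeUpload is a custom operator that is not defined in this snippet. A minimal sketch of an upload check along these lines, built on S3Hook, could look like the following; only the constructor arguments mirror the call above, the body is an assumption:

from airflow.hooks.S3_hook import S3Hook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class DataLakeUpload(BaseOperator):
    """Verify that each expected key prefix exists under the processed/ path in S3."""

    @apply_defaults
    def __init__(self, aws_credentials_id, s3_bucket, path, keys, *args, **kwargs):
        super(DataLakeUpload, self).__init__(*args, **kwargs)
        self.aws_credentials_id = aws_credentials_id
        self.s3_bucket = s3_bucket
        self.path = path
        self.keys = keys

    def execute(self, context):
        hook = S3Hook(aws_conn_id=self.aws_credentials_id)
        for key in self.keys:
            prefix = self.path + key
            # list_keys returns the matching object keys, or None if nothing matches.
            found = hook.list_keys(bucket_name=self.s3_bucket, prefix=prefix)
            if not found:
                raise ValueError('No objects found under s3://{}/{}'.format(
                    self.s3_bucket, prefix))
            self.log.info('Found %d objects under %s', len(found), prefix)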
Example #7
         schedule_interval='0 3 * * *') as dag:

    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',  # Done
        job_flow_overrides=JOB_FLOW_OVERRIDES,  # Done
        aws_conn_id='aws_default',  # Done
        emr_conn_id='emr_default'  # Done
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',  # Done
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",  # Done
        aws_conn_id='aws_default',  # Done
        steps=SPARK_TEST_STEPS)

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=
        "{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id=
        "{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='aws_default')

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id=
        "{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='aws_default')

    cluster_creator >> step_adder >> step_checker >> cluster_remover
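JOB_FLOW_OVERRIDES is defined elsewhere in this DAG file; it follows the EMR RunJobFlow request shape and is merged over the 'emr_default' connection's extra config by the create operator. A hypothetical minimal value, with placeholder name, release, and instance settings rather than the original's:

JOB_FLOW_OVERRIDES = {
    'Name': 'airflow-spark-test-cluster',  # placeholder name
    'ReleaseLabel': 'emr-5.29.0',          # placeholder release
    'Applications': [{'Name': 'Spark'}],
    'Instances': {
        'InstanceGroups': [
            {'Name': 'Master nodes', 'Market': 'ON_DEMAND', 'InstanceRole': 'MASTER',
             'InstanceType': 'm5.xlarge', 'InstanceCount': 1},
            {'Name': 'Core nodes', 'Market': 'ON_DEMAND', 'InstanceRole': 'CORE',
             'InstanceType': 'm5.xlarge', 'InstanceCount': 2},
        ],
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}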
finished_write_copy_task = DummyOperator(task_id='finished_write_copy',
                                         dag=dag)

s3_write_copy_steps = build_emr_s3_write_copy_steps(get_uuid())
for table in WRITE_PATHS_TO_COPY_MAP:
    write_copy_step_adder_task_id = 's3_write_copy_add_steps_' + table
    s3_write_copy_step_adder_task = EmrAddStepsOperator(
        task_id=write_copy_step_adder_task_id,
        job_flow_id=get_cluster_id(),
        steps=s3_write_copy_steps[table],
        aws_conn_id=AWS_CONN,
        dag=dag)
    s3_write_copy_step_checker_task = EmrStepSensor(
        task_id='s3_write_copy_watch_step_' + table,  # unique per table; a fixed id would collide across loop iterations
        job_flow_id=get_cluster_id(),
        step_id="{{ ti.xcom_pull(task_ids='" + write_copy_step_adder_task_id +
        "', key='return_value')[0] }}",
        aws_conn_id=AWS_CONN,
        dag=dag)
    s3_write_deletes_task >> s3_write_copy_step_adder_task >> s3_write_copy_step_checker_task >> finished_write_copy_task

# Terminate EMR Cluster
terminate_cluster_task = EmrTerminateJobFlowOperator(
    task_id='terminate_emr_cluster',
    job_flow_id=get_cluster_id(),
    aws_conn_id=AWS_CONN,
    dag=dag)

finished_write_copy_task >> terminate_cluster_task
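build_emr_s3_write_copy_steps and WRITE_PATHS_TO_COPY_MAP are project helpers not shown here. A hypothetical sketch of a helper that emits one s3-dist-cp copy step per table (the map's (source, destination) layout and the uuid's role in the step name are assumptions):

def build_emr_s3_write_copy_steps(run_uuid):
    """Return {table: [EMR step dict]} copying each table's written output between S3 paths."""
    steps = {}
    for table, (src_path, dest_path) in WRITE_PATHS_TO_COPY_MAP.items():
        steps[table] = [{
            'Name': 's3_write_copy_{}_{}'.format(table, run_uuid),
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['s3-dist-cp', '--src', src_path, '--dest', dest_path],
            },
        }]
    return steps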
Example #9
    cluster_creator = EmrCreateJobFlowOperator(
        task_id='create_job_flow',
        job_flow_overrides=JOB_FLOW_OVERRIDES,
        aws_conn_id='my_aws',
        emr_conn_id='emr_default',
    )

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='my_aws',
        steps=SPARK_STEPS,
    )

    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
        aws_conn_id='my_aws',
    )

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
        aws_conn_id='my_aws',
    )

    cluster_remover.trigger_rule = trigger_rule.TriggerRule.ALL_DONE


    dag_init >> cluster_creator >> step_adder >> step_checker >> cluster_remover
Example #10
    job_flow_id=
    "{{ task_instance.xcom_pull('Create_EMR_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='Add_jobflow_steps', key='return_value')[3] }}",
    aws_conn_id='aws_credentials',
    dag=dag)

check_data_quality_check = EmrStepSensor(
    task_id='Watch_data_quality_check_step',
    job_flow_id=
    "{{ task_instance.xcom_pull('Create_EMR_cluster', key='return_value') }}",
    step_id=
    "{{ task_instance.xcom_pull(task_ids='Add_jobflow_steps', key='return_value')[4] }}",
    aws_conn_id='aws_credentials',
    dag=dag)

delete_cluster = EmrTerminateJobFlowOperator(
    task_id='Delete_EMR_cluster',
    job_flow_id=
    "{{ task_instance.xcom_pull(task_ids='Create_EMR_cluster', key='return_value') }}",
    aws_conn_id='aws_credentials',
    dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

start_operator >> create_datalake_bucket >> create_cluster
start_operator >> create_code_bucket >> upload_etl_code >> create_cluster
create_cluster >> add_jobflow_steps
add_jobflow_steps >> check_covid_table_processing >> check_data_quality_check
add_jobflow_steps >> check_county_table_processing >> check_data_quality_check
check_data_quality_check >> delete_cluster >> end_operator
Example #11
# Wait until the Spark step completes
watch_spark_step_one_task = EmrStepSensor(
    task_id='watch_spark_step_one',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_spark_step_one', key='return_value')[0] }}",
    dag=dag)

# Spin down an AWS EMR cluster
terminate_emr_cluster_task = EmrTerminateJobFlowOperator(
    task_id='terminate_emr_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_emr_cluster', key='return_value') }}",
    trigger_rule="all_done",
    dag=dag)

# Stop any container running on covid19-ecs-cluster
stop_airflow_containers_task = PythonOperator(
    task_id='stop_airflow_containers',
    python_callable=stop_airflow_containers,
    op_kwargs={'cluster': 'covid19-ecs-cluster'},
    provide_context=False,
    dag=dag)

# Setting up dependencies
starting_point >> [check_world_data_exists_task,
                   copy_us_data_file_task,