def main_summary_subdag_factory(parent_dag, task_id, day):
    ds = "{{{{ macros.ds_format(macros.ds_add(ds, {0}), '%Y-%m-%d', '%Y%m%d') }}}}".format(day)

    subdag = DAG("{}.{}".format(parent_dag.dag_id, task_id),
                 schedule_interval=SCHEDULE_INTERVAL,
                 start_date=START_DATE,
                 default_args=default_args)

    parent_job_flow_id = ("{{{{ task_instance.xcom_pull('setup_backfill_cluster', "
                          "key='return_value', dag_id={}) }}}}".format(parent_dag.dag_id))

    # Try to alleviate throttling issues by introducing some slight jitter on each of the days
    timedelta_task = TimeDeltaSensor(
        task_id="day_start_jitter",
        delta=timedelta(seconds=day),
        dag=subdag
    )

    add_step_task = EmrAddStepsOperator(
        task_id='submit_main_summary_day',
        job_flow_id=parent_job_flow_id,
        execution_timeout=timedelta(minutes=10),
        aws_conn_id='aws_default',
        steps=EmrAddStepsOperator.get_step_args(
            job_name="main_summary {}".format(ds),
            owner="*****@*****.**",
            action_on_failure='CONTINUE',
            uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
            env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
                "from": ds,
                "to": ds,
                "bucket": "telemetry-backfill"
            }, {
                "DO_ASSEMBLY": "False"
            }),
        ),
        dag=subdag
    )

    step_sensor_task = EmrStepSensor(
        task_id="main_summary_step_sensor",
        timeout=timedelta(hours=10).total_seconds(),
        job_flow_id=parent_job_flow_id,
        step_id="{{ task_instance.xcom_pull('submit_main_summary_day', key='return_value') }}",
        poke_interval=timedelta(minutes=5).total_seconds(),
        dag=subdag
    )

    step_sensor_task.set_upstream(add_step_task)
    add_step_task.set_upstream(timedelta_task)

    return subdag
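This factory returns a SubDAG, so it is meant to be attached to its parent backfill DAG through a SubDagOperator. A minimal sketch of that wiring, assuming a parent `backfill_dag` and a `NUM_DAYS` loop bound that are not part of the excerpt above:

from airflow.operators.subdag_operator import SubDagOperator

# Hypothetical parent wiring: one subdag per day of the backfill window.
for day in range(NUM_DAYS):  # NUM_DAYS is assumed, not shown in the excerpt
    task_id = "main_summary_day_{}".format(day)
    SubDagOperator(
        task_id=task_id,
        subdag=main_summary_subdag_factory(backfill_dag, task_id, day),
        dag=backfill_dag,
    )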
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import PostgresOperator
from airflow.operators.sensors import TimeDeltaSensor

from helpers import SqlQueries

default_args = {
    'owner': 'gabriel',
    'start_date': datetime(2009, 12, 31, 23, 59, 59),
    'end_date': datetime(2012, 12, 31, 23, 59, 59),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=300),
    'catchup': True
}

dag = DAG('recreate_bi_tables',
          default_args=default_args,
          description='Recreate the business intelligence tables',
          schedule_interval=None,
          max_active_runs=1)

dummy_wait = TimeDeltaSensor(task_id='dummy_wait',
                             dag=dag,
                             delta=timedelta(seconds=1))

recreate_bi_tables = PostgresOperator(task_id="recreate_bi_tables_task",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=SqlQueries.recreate_bi_tables)

dummy_wait >> recreate_bi_tables
default_args = {
    # ... owner, start_date, and other entries omitted in this excerpt
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'pool': 'default',
}

dag = DAG(dag_id='anatomy_of_a_dag',
          description="This describes my DAG",
          default_args=default_args,
          schedule_interval=timedelta(days=1))  # This is a daily DAG.

# t0, t1, t2 and t3 are examples of tasks created by instantiating operators
t0 = TimeDeltaSensor(task_id='wait_a_second', delta=timedelta(seconds=1), dag=dag)

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)


def my_cool_function(ds=None, **kwargs):
    print("{}".format(ds))


t2 = PythonOperator(task_id='show_ds', python_callable=my_cool_function,
                    retries=3, provide_context=True, dag=dag)
"""
A DAG definition file in Airflow, written in Python.
"""
from datetime import datetime, timedelta

from airflow.models import DAG  # Import the DAG class
from airflow.operators.bash_operator import BashOperator
from airflow.operators.sensors import TimeDeltaSensor

default_args = {
    'owner': 'you',
    'depends_on_past': False,
    'inparam': 'xxxyyyzzz',
    'start_date': datetime(2018, 1, 8),
}

dag = DAG(
    dag_id='anatomy_of_a_dag3',
    description="This describes my DAG",
    default_args=default_args,
    # schedule_interval=timedelta(days=1)
    schedule_interval='57 9 * * *',
)
# This is a daily DAG.

# t0 and t1 are examples of tasks created by instantiating operators
t0 = TimeDeltaSensor(task_id='wait_60_seconds', delta=timedelta(seconds=60), dag=dag)

t1 = BashOperator(task_id='print_date_in_bash', bash_command='date', dag=dag)

t1.set_upstream(t0)
from airflow.operators.sensors import TimeDeltaSensor


def delta_dependency_operator(task_id, dep):
    return TimeDeltaSensor(
        task_id=task_id,
        delta=dep.delta,
    )
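A minimal usage sketch of this factory, assuming `dep` is any object that exposes a `delta` attribute; the `Dependency` namedtuple, DAG id, and task id below are hypothetical and only illustrate the call:

from collections import namedtuple
from datetime import datetime, timedelta

from airflow import DAG

# Hypothetical dependency descriptor; the factory only needs a `.delta` attribute.
Dependency = namedtuple('Dependency', ['name', 'delta'])

with DAG(dag_id='delta_dependency_example',
         start_date=datetime(2020, 1, 1),
         schedule_interval='@daily') as dag:
    # The sensor picks up the enclosing DAG from the context manager.
    wait_for_upstream = delta_dependency_operator(
        task_id='wait_for_upstream',
        dep=Dependency(name='upstream', delta=timedelta(hours=2)),
    )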
import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.sensors import TimeDeltaSensor

from printer import print_execution_date

dag = DAG(
    dag_id='new_delta_sensor',
    schedule_interval='@hourly',
    start_date=datetime.datetime(2020, 4, 9, 0, 0, 0)
)

with dag:
    guard_sensor = TimeDeltaSensor(
        task_id='guard_sensor',
        # allow 3 more hours for late data to arrive
        # (2h delta + 1h, since the DAG executes at the end of its schedule interval)
        delta=datetime.timedelta(hours=2),
        poke_interval=60 * 10,  # 10 minutes
        mode='reschedule'
    )

    printer = PythonOperator(
        task_id='printer',
        python_callable=print_execution_date,
        provide_context=True
    )

    guard_sensor >> printer
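The `printer` module is not shown in this example; a minimal sketch of what `print_execution_date` could look like (this implementation is an assumption, not the original module):

# printer.py -- hypothetical companion module for the DAG above
def print_execution_date(execution_date=None, **kwargs):
    # With provide_context=True, Airflow passes execution_date (among other
    # template variables) into the callable as a keyword argument.
    print("Execution date: {}".format(execution_date))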
import datetime as dt

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.sensors import TimeDeltaSensor


def print_world():
    print('world')


default_args = {
    'owner': 'me',
    'start_date': dt.datetime(2018, 5, 1),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

with DAG(
    'airflow_cerouno_v03',
    default_args=default_args,
    schedule_interval='0 * * * *',
) as dag:
    print_hello = BashOperator(task_id='print_hello',
                               bash_command='echo "hello"')
    # sleep = BashOperator(task_id='sleep',
    #                      bash_command='sleep 5')
    sensor_dormilon = TimeDeltaSensor(task_id='sensor_dormilon',
                                      delta=dt.timedelta(minutes=5))
    # Note: this rebinds the name print_world, but the callable was already
    # captured from the function defined above, so the task still works.
    print_world = PythonOperator(task_id='print_world',
                                 python_callable=print_world)

    print_hello >> sensor_dormilon >> print_world