def test_infer_dag(self):
    dag = DAG('dag', start_date=DEFAULT_DATE)
    dag2 = DAG('dag2', start_date=DEFAULT_DATE)

    op1 = DummyOperator(task_id='test_op_1', owner='test')
    op2 = DummyOperator(task_id='test_op_2', owner='test')
    op3 = DummyOperator(task_id='test_op_3', owner='test', dag=dag)
    op4 = DummyOperator(task_id='test_op_4', owner='test', dag=dag2)

    # double check dags
    self.assertEqual(
        [i.has_dag() for i in [op1, op2, op3, op4]],
        [False, False, True, True])

    # can't combine operators with no dags
    self.assertRaises(AirflowException, op1.set_downstream, op2)

    # op2 should infer dag from op1
    op1.dag = dag
    op1.set_downstream(op2)
    self.assertIs(op2.dag, dag)

    # can't assign across multiple DAGs
    self.assertRaises(AirflowException, op1.set_downstream, op4)
    self.assertRaises(AirflowException, op1.set_downstream, [op3, op4])
def test_set_dag(self):
    """
    Test assigning Operators to Dags, including deferred assignment
    """
    dag = DAG('dag', start_date=DEFAULT_DATE)
    dag2 = DAG('dag2', start_date=DEFAULT_DATE)
    op = DummyOperator(task_id='op_1', owner='test')

    # no dag assigned
    self.assertFalse(op.has_dag())
    self.assertRaises(AirflowException, getattr, op, 'dag')

    # no improper assignment
    with self.assertRaises(TypeError):
        op.dag = 1

    op.dag = dag

    # no reassignment
    with self.assertRaises(AirflowException):
        op.dag = dag2

    # but assigning the same dag is ok
    op.dag = dag
    self.assertIs(op.dag, dag)
    self.assertIn(op, dag.tasks)
def test_check_task_dependencies(self, trigger_rule, successes, skipped,
                                 failed, upstream_failed, done,
                                 flag_upstream_failed,
                                 expect_state, expect_completed):
    start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
    dag = models.DAG('test-dag', start_date=start_date)
    downstream = DummyOperator(task_id='downstream', dag=dag,
                               owner='airflow', trigger_rule=trigger_rule)
    for i in range(5):
        task = DummyOperator(task_id='runme_{}'.format(i),
                             dag=dag, owner='airflow')
        task.set_downstream(downstream)

    run_date = task.start_date + datetime.timedelta(days=5)
    ti = TI(downstream, run_date)
    completed = ti.evaluate_trigger_rule(
        successes=successes, skipped=skipped, failed=failed,
        upstream_failed=upstream_failed, done=done,
        flag_upstream_failed=flag_upstream_failed)

    self.assertEqual(completed, expect_completed)
    self.assertEqual(ti.state, expect_state)
    catchup=False)

# stage_sql_path = adlogs/load_raw_logs
stage_int_sql_path = os.path.join(JOB_ARGS["stage_sql_path"], "int")
stage_int_hourly_query = SqlUtils.load_query(stage_int_sql_path).split("---")

stage_int_hourly_job = SnowflakeOperator(
    task_id="stage_adlogs_int_hourly",
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=stage_int_hourly_query,
    params={"env": ENV},
    autocommit=True,
    dag=DAG)

stage_int_tables = DummyOperator(task_id="finish_int_rl_staging")

stage_onetag_sql_path = os.path.join(JOB_ARGS["stage_sql_path"], "onetag")
stage_onetag_hourly_query = SqlUtils.load_query(stage_onetag_sql_path).split("---")

stage_onetag_hourly_job = SnowflakeOperator(
    task_id="stage_adlogs_onetag_hourly",
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=stage_onetag_hourly_query,
    params={"env": ENV},
    autocommit=True,
    dag=DAG)

stage_onetag_table = DummyOperator(task_id="finish_onetag_rl_staging")
with DAG(
    dag_id="external_task_marker_parent",
    start_date=start_date,
    schedule_interval='10 11 * * *',
) as parent_dag:
    parent_task = ExternalTaskMarker(
        task_id="parent_task",
        external_dag_id="external_task_marker_child",
        external_task_id="child_task1",
    )

with DAG(
    dag_id="external_task_marker_child",
    start_date=start_date,
    schedule_interval='20 11 * * *',
) as child_dag:
    child_task1 = ExternalTaskSensor(
        task_id="child_task1",
        external_dag_id=parent_dag.dag_id,
        external_task_id=parent_task.task_id,
        # timeout=600,
        # allowed_states=['success'],
        # failed_states=['failed', 'skipped'],
        # mode="reschedule",
    )
    # [END howto_operator_external_task_sensor]

    child_task2 = DummyOperator(task_id="child_task2")
    child_task1 >> child_task2
from airflow.operators import ShortCircuitOperator, DummyOperator
from airflow.models import DAG
import airflow.utils.helpers
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_short_circuit_operator', default_args=args)

cond_true = ShortCircuitOperator(task_id='condition_is_True',
                                 python_callable=lambda: True,
                                 dag=dag)
cond_false = ShortCircuitOperator(task_id='condition_is_False',
                                  python_callable=lambda: False,
                                  dag=dag)

ds_true = [DummyOperator(task_id='true_' + str(i), dag=dag) for i in [1, 2]]
ds_false = [DummyOperator(task_id='false_' + str(i), dag=dag) for i in [1, 2]]

airflow.utils.helpers.chain(cond_true, *ds_true)
airflow.utils.helpers.chain(cond_false, *ds_false)
"transform", #goal of DAG is to transform new log data default_args=DEFAULTS, start_date=datetime(2018, 1, 1), schedule_interval=JOB_ARGS["schedule_interval"], catchup=False) # sensor that waits on the completion of stage_ad_logs_to_snowflake adlogs_sensor = ExternalTaskSensor( task_id="wait_for_stage", # this sensor's name external_dag_id="stage_ad_logs_to_snowflake", # DAG to reference external_task_id="adlogs_snowflake_staging_finish", # task to wait on execution_delta=timedelta(minutes=5), dag=DAG) #dummy op for finish task transform_finish = DummyOperator(task_id="finish_transforms") #loop through .yaml list and create sql transform tasks for relevant tables for table in JOB_ARGS["tables"]: #set path to .sql file query_log = [] #create empty list to hold sql queries for a given table for process in JOB_ARGS["tables"][ table]: #loop through processses in the .yaml for a given table sql_path = os.path.join( JOB_ARGS[ "stage_sql_path"], #stage_sql_path = adlogs/log_process/filename.sql process) sql_query = SqlUtils.load_query(sql_path).split( "---" ) # sql_query is a list of all the queries in a given .sql file, seperated by '---' query_log += sql_query
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DAG designed to test what happens when a DAG with pooled tasks is run
by a BackfillJob.
Addresses issue #1225.
"""
from datetime import datetime

from airflow.models import DAG
from airflow.operators import DummyOperator

dag = DAG(dag_id='test_backfill_pooled_task_dag')
task = DummyOperator(
    task_id='test_backfill_pooled_task',
    dag=dag,
    pool='test_backfill_pooled_task_pool',
    owner='airflow',
    start_date=datetime(2016, 2, 1))
from airflow import DAG
from airflow.operators import DummyOperator, EmailOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(seconds=10),
    'retries': 0
}

dag = DAG('Sales_Nov', default_args=default_args,
          start_date=datetime.now() - timedelta(seconds=10))

op1 = DummyOperator(task_id='File1_landing', dag=dag)
t1 = EmailOperator(task_id='Processing_File_1',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 1 started",
                   dag=dag)
op2 = DummyOperator(task_id='File2_landing', dag=dag)
t2 = EmailOperator(task_id='Processing_File_2',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 2 started",
                   dag=dag)
op3 = DummyOperator(task_id='Aggregating', dag=dag)
op4 = DummyOperator(task_id='Final_Table_Push', dag=dag)

t1.set_upstream(op1)
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
    script_location="s3://paid-qubole/PigAPIDemo/scripts/script1-hadoop-s3-small.pig",
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division, unicode_literals

from airflow import DAG
from airflow.operators import DummyOperator
from datetime import datetime, timedelta

yesterday = datetime.combine(datetime.today() - timedelta(7),
                             datetime.min.time())

default_args = {
    'owner': 'airflow',
    'start_date': yesterday,
}

dag = DAG('schedule', default_args=default_args,
          schedule_interval=timedelta(seconds=10))

t1 = DummyOperator(task_id='task1', dag=dag)
import airflow
from airflow import DAG
from airflow.operators import DummyOperator
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.sensors.http_sensor import HttpSensor

DAG_NAME = 'HTTP_OPERATOR_TEST'

args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id=DAG_NAME,
    catchup=False,
    default_args=args,
    schedule_interval='3 12 * * *',
)

start_task = DummyOperator(task_id='starting_task', dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='https_default',
                              method='GET',
                              endpoint='dog.ceo/api/breed/hound/images',
                              headers={"Content-Type": "application/json"},
                              xcom_push=True,
                              dag=dag)

t1 = SimpleHttpOperator(task_id='get_labrador',
                        method='GET',
                        http_conn_id='https_default',
                        endpoint='dog.ceo/api/breed/hound/images',
                        headers={"Content-Type": "application/json"},
                        xcom_push=True,
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):
    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)
    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### processing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""
    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for spanish accents
                date = '/'.join(
                    list(
                        re.compile(r"([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b']).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No se recibio datos')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )
    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = "https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])
        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )
    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """

    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag
check_to_remove_op = BranchPythonOperator(
    task_id='check_to_remove',
    python_callable=check_to_remove,
    provide_context=True,
    dag=dag
)

check_to_update_op = BranchPythonOperator(
    task_id='check_to_update',
    python_callable=check_to_update,
    provide_context=True,
    dag=dag
)

update_scores_branch_op = DummyOperator(
    task_id='update_scores_branch',
    dag=dag
)

nothing_to_remove_op = DummyOperator(
    task_id='nothing_to_remove',
    dag=dag
)

nothing_to_update_op = DummyOperator(
    task_id='nothing_to_update',
    dag=dag
)

check_job_posting_to_be_updated_op.set_downstream(check_to_remove_op)
check_job_posting_to_be_updated_op.set_downstream(check_to_update_op)
check_work_experience_to_be_updated_op.set_downstream(check_to_remove_op)
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime

from airflow.models import DAG
from airflow.operators import DummyOperator

DEFAULT_DATE = datetime(2100, 1, 1)

# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(dag_id='test_start_date_scheduling', start_date=DEFAULT_DATE)
dag1_task1 = DummyOperator(task_id='dummy', dag=dag1, owner='airflow')
def create_test_pipeline(suffix, trigger_rule, dag):
    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag)
    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag)

    join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule)
    join.set_upstream(skip_operator)
    join.set_upstream(always_true)

    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
remove_scores_op = PostgresOperator(
    task_id='remove_scores',
    postgres_conn_id='db1_etl',
    sql='scripts/postgres/remove_work_experience_job_post_scores.sql',
    dag=dag
)

update_scores_op = PostgresOperator(
    task_id='update_scores',
    postgres_conn_id='db1_etl',
    sql='scripts/postgres/update_work_experience_job_post_scores.sql',
    dag=dag
)

dummy_op = DummyOperator(task_id='compute_similarity_branching', dag=dag)

copy_scores_to_temp_table_op = BashOperator(
    task_id='copy_scores_to_temp_table',
    bash_command='scripts/bash/copy_scores_to_temp_table.sh',
    params={"partnum": 4},
    provide_context=True,
    dag=dag)

for option in np.arange(4):
    t = PythonOperator(
        task_id='compute_similarity_branch_%d' % option,
        python_callable=compute_similarity_score,
        params={'partnum': 4, 'partindex': option},
        provide_context=True,
        pool='high_memory_usage',
import sys

from qfl.etl.data_ingest import daily_equity_price_ingest

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)
run_this_last.set_upstream(t2)
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import DummyOperator, PythonOperator
import airflow.hooks.S3_hook

default_args = {
    'owner': 'anitha',
    'start_date': datetime(2019, 1, 1),
    'retry_delay': timedelta(minutes=5)
}


def upload(filename, key, bucket_name):
    hook = airflow.hooks.S3_hook.S3Hook('anitha_s3')
    hook.load_file(filename, key, bucket_name)


# Using the context manager allows you to avoid duplicating the dag argument
# in every operator.
with DAG('S3', default_args=default_args, schedule_interval='@once') as dag:

    start_task = DummyOperator(task_id='dummy_start')

    upload_to_S3_task = PythonOperator(
        task_id='upload_file_to_S3',
        python_callable=upload,
        op_kwargs={
            'filename': '/home/ec2-user/airflow/dags/email.py',
            'key': 'email.py',
            'bucket_name': 'saksbucket',
        },
    )

    start_task >> upload_to_S3_task
# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):
    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(
    task_id='condition',
    provide_context=True,
    python_callable=should_run,
    dag=dag)

oper_1 = DummyOperator(
    task_id='oper_1',
    dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(
    task_id='oper_2',
    dag=dag)
oper_2.set_upstream(cond)
dir_extract_applist = os.path.join(project_dir, "extract_applist")
script_extract_applist = "extract-applist.sh"

dir_media_stats = os.path.join(project_dir, "media_stats")
script_media_stats = "media-daily-stats.sh"

# interval in days between now and the execution date
day_interval_between_execution_and_now = "{% set now_s = macros.time.mktime(macros.datetime.now().timetuple()) %} \
{% set exe_s = macros.time.mktime(execution_date.timetuple()) %} \
{% set interval_day = (now_s - exe_s)/(3600*24) %} \
{{ interval_day|int }}"

dag = DAG(
    dag_id="dsp-report-daily",
    default_args=args,
    start_date=start_date_daily_rounded,
    schedule_interval="0 0 * * *"
)

start_task = DummyOperator(task_id="start_now", dag=dag)
end_task = DummyOperator(task_id="end_here", dag=dag, trigger_rule="all_done")


def gen_hourly_job_sensor(
        report_name_value="your_report_name",
        task_id_value=None,
        report_time_type_value="hourly",
        report_time_day_value="1970-01-01",
        mysql_connid_value=mysql_conn_Id,
        table_value=success_job_table,
        parent_dag=dag,
):
    sql_template = "select case when count(*) >= 24 \
        then 1 else 0 end from {table} \
        where report_name = '{report_name}' \
default_args = {
    'owner': 'arnaud',
    'start_date': datetime(2020, 5, 27),
    'depends_on_past': False,
    'catchup': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# set the DAG to run once every day
dag = DAG('stock_current_etl_dag',
          default_args=default_args,
          schedule_interval='@daily')

start_task = DummyOperator(task_id='dummy_start', dag=dag)

upload_current_news_to_S3_task = PythonOperator(
    task_id='upload_current_news_to_S3',
    python_callable=current_news_etl,
    dag=dag)

upload_current_stock_to_S3_task = PythonOperator(
    task_id='upload_current_stock_to_S3',
    python_callable=current_stocks_etl,
    op_kwargs={
        'list_of_stocks': [
            'AAPL', 'INTC', 'TSLA', 'GILD', 'BA', 'AMZN', 'CBB', 'DAL',
            'MMM', 'MSFT'
        ],
        'ndays':
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division, unicode_literals

from airflow import DAG
from airflow.operators import DummyOperator, ShortCircuitOperator
from airflow.utils import chain
from datetime import datetime, timedelta

yesterday = datetime.combine(datetime.today() - timedelta(7),
                             datetime.min.time())

default_args = {
    'owner': 'airflow',
    'start_date': yesterday,
}

dag = DAG('skip', default_args=default_args)

t1 = DummyOperator(task_id='task1', dag=dag)
t2 = DummyOperator(task_id='task2', dag=dag)
t3 = DummyOperator(task_id='task3', dag=dag)

cond_true = ShortCircuitOperator(task_id='cond_t',
                                 python_callable=lambda: True,
                                 dag=dag)
cond_false = ShortCircuitOperator(task_id='cond_f',
                                  python_callable=lambda: False,
                                  dag=dag)

chain(t1, cond_true, t2)
chain(t1, cond_false, t3)
# Defining SubDag structure
#############################################################################
default_args = {
    'owner': 'dale',
    'start_date': datetime(2019, 9, 1),
    'retry_delay': timedelta(minutes=.25)
    # ,'concurrency': 22
}

one_dag = DAG(
    parent_dag_name,
    default_args=default_args,
    schedule_interval='@once'
)
# in production, this needs to be updated to run once daily
# (add various dags and set variables in Airflow?)

# end dummy dag
start_task = DummyOperator(task_id='start_task', dag=one_dag)


# Dynamically creates a task that randomly selects which audit table to insert
# data into, with the goal of distributing inserts to overcome Snowflake
# table-locking issues
def create_dynamic_task_dist_audit(database_name, table):
    return PythonOperator(
        # provide_context=True,
        task_id='select_audit_table_' + database_name + '_' + table,
        pool='Pool_max_parallel_5',
        python_callable=distribute_audit_inserts,
        dag=one_dag)


# Creates the tasks dynamically. Each one will elaborate one chunk of data.
def create_dynamic_task_tos3(database_name, table):
    return PythonOperator(
        True,
    "approved": True,
    "inProg": False,
    "done": False,
    "approvedBy": "karakuri",
    "workflow": workflow_id
})

print("TASKS: ", tasks)

dag = DAG('sfsc_review_new_airflow_process_tasks',
          default_args=default_args,
          schedule_interval=None)

start = DummyOperator(task_id='start', default_args=default_args, dag=dag)

process = SubDagOperator(
    task_id='process',
    subdag=subdag_tasks('sfsc_review_new_airflow_process_tasks', 'process',
                        tasks, default_args),
    default_args=default_args,
    dag=dag,
)

start >> process
s3 = boto3.resource('s3')
s3_filename = 'train.csv'
local_filename = '/home/jennie/workspace/titanic/train.csv'
bucket_name = 'airflow-demo-09092019'

default_args = {
    'owner': 'Jennie',
    'start_date': datetime(2019, 1, 1),
    # 'retry_delay': timedelta(minutes=5)
}

dag = DAG('abbbbbbbbb_titanic_analysis',
          default_args=default_args,
          schedule_interval='@once')

# task 1: dummy =====================
dummy_task = DummyOperator(
    task_id='dummy_start',
    dag=dag
)

# task 2: Upload file =====================
upload_task = PythonOperator(
    task_id='upload_file_to_s3',
    python_callable=upload_file_to_s3,
    op_kwargs={
        's3': s3,
        'filename': local_filename,
        'key': s3_filename,
        'bucket_name': bucket_name
    },
    dag=dag
)
DAG_NAME = 'example_subdag_operator'

args = {
    'owner': 'airflow',
    'start_date': datetime(2016, 1, 1),
}

dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2017, 12, 19)
}

# Schedule this DAG to run once.
dag = DAG('ah_ftp_operator',
          description='FTPs with FTPOperator',
          schedule_interval='@once',
          start_date=datetime(2017, 12, 18),
          default_args=default_args)

with dag:
    # Dummy start DAG.
    kick_off_dag = DummyOperator(task_id='kick_off_dag')

    # Call the functions
    ftp_to_s3 = FTPToS3Operator(
        task_id='download_file',
        ftp_conn_id='astro_ftp',
        ftp_directory='/files/test_ah/sample_data.csv',
        local_path='test_data.csv',
        s3_conn_id='astronomer-s3',
        s3_bucket='astronomer-worflows-dev',
        s3_key='test_data.csv',
    )

    # A task won't start until the one before it finishes.
    # e.g. the upload won't start until the download task finishes.
from builtins import range

from airflow.operators import BashOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator', default_args=args)

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(
    task_id='run_after_loop', bash_command='echo 1', dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
def generate_dag(area, download_dir, default_args):
    """Generate Landsat8 ingestion DAGs.

    Parameters
    ----------
    area: Landsat8Area
        Configuration parameters for the Landsat8 area to be downloaded
    default_args: dict
        Default arguments for all tasks in the DAG.

    """
    dag = DAG(
        LANDSAT8.id + "_{}".format(area.name),
        description="DAG for downloading, processing and ingesting {} AOI in Landsat8 data "
                    "from scene_list".format(area.name),
        default_args=default_args,
        dagrun_timeout=LANDSAT8.dagrun_timeout,
        schedule_interval=LANDSAT8.dag_schedule_interval,
        catchup=LANDSAT8.catchup,
        max_active_runs=LANDSAT8.max_active_runs,
        params={
            "area": area,
        }
    )
    search_task = Landsat8SearchOperator(
        task_id='search_{}'.format(area.name),
        area=area,
        cloud_coverage=LANDSAT8.cloud_coverage,
        startdate=LANDSAT8.startdate,
        enddate=LANDSAT8.enddate,
        filter_max=LANDSAT8.filter_max,
        order_by=LANDSAT8.order_by,
        order_type=LANDSAT8.order_type,
        db_credentials=CFG.landsat8_postgresql_credentials,
        dag=dag
    )
    generate_html_description = Landsat8ProductDescriptionOperator(
        task_id='generate_html_description',
        description_template=os.path.join(
            CFG.templates_base_dir, "product_abstract.html"),
        download_dir=download_dir,
        dag=dag
    )
    download_thumbnail = Landsat8DownloadOperator(
        task_id="download_thumbnail",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="thumb_small.jpg",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    generate_thumbnail = Landsat8ThumbnailOperator(
        task_id='generate_thumbnail',
        get_inputs_from=download_thumbnail.task_id,
        thumb_size_x="64",
        thumb_size_y="64",
        dag=dag
    )
    download_metadata = Landsat8DownloadOperator(
        task_id="download_metadata",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="MTL.txt",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )

    join_task = DummyOperator(
        task_id='landsat8_join',
        dag=dag
    )

    download_tasks = []
    translate_tasks = []
    addo_tasks = []
    upload_tasks = []
    gdalinfo_tasks = []

    for band in area.bands:
        download_band = Landsat8DownloadOperator(
            task_id="download_band{}".format(band),
            download_dir=download_dir,
            get_inputs_from=search_task.task_id,
            url_fragment="B{}.TIF".format(band),
            download_max=LANDSAT8.download_max,
            geoserver_rest_url=CFG.geoserver_rest_url,
            geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
            geoserver_username=CFG.geoserver_username,
            geoserver_password=CFG.geoserver_password,
            dag=dag
        )
        download_tasks.append(download_band)

        translate = GDALTranslateOperator(
            task_id="translate_band{}".format(band),
            get_inputs_from=download_band.task_id,
            dag=dag
        )
        translate_tasks.append(translate)

        addo = GDALAddoOperator(
            task_id="add_overviews_band{}".format(band),
            get_inputs_from=translate.task_id,
            resampling_method="average",
            max_overview_level=128,
            compress_overview="PACKBITS",
            dag=dag
        )
        addo_tasks.append(addo)

        gdalinfo = GDALInfoOperator(
            task_id='landsat8_gdalinfo_band_{}'.format(band),
            get_inputs_from=addo.task_id,
            dag=dag
        )
        gdalinfo_tasks.append(gdalinfo)

        upload = RSYNCOperator(
            task_id="upload_band{}".format(band),
            host=CFG.rsync_hostname,
            remote_usr=CFG.rsync_username,
            ssh_key_file=CFG.rsync_ssh_key,
            remote_dir=LANDSAT8.repository_dir,
            get_inputs_from=addo.task_id,
            dag=dag)
        upload_tasks.append(upload)

        download_band.set_upstream(search_task)
        translate.set_upstream(download_band)
        addo.set_upstream(translate)
        gdalinfo.set_upstream(addo)
        upload.set_upstream(addo)
        join_task.set_upstream(upload)
        join_task.set_upstream(gdalinfo)

    download_task_ids = (task.task_id for task in download_tasks)
    create_original_package_task = PythonOperator(
        task_id="create_original_package",
        python_callable=create_original_package,
        op_kwargs={
            'get_inputs_from': {
                "search_task_id": search_task.task_id,
                "download_task_ids": download_task_ids,
            },
            'out_dir': LANDSAT8.process_dir
        },
        dag=dag)

    upload_original_package_task = RSYNCOperator(
        task_id="upload_original_package",
        host=CFG.rsync_hostname,
        remote_usr=CFG.rsync_username,
        ssh_key_file=CFG.rsync_ssh_key,
        remote_dir=LANDSAT8.original_package_upload_dir,
        get_inputs_from=create_original_package_task.task_id,
        dag=dag)

    # we only need gdalinfo output on one of the granules
    gdalinfo_task = gdalinfo_tasks[0]
    gdalinfo_task_id = gdalinfo_task.task_id

    upload_task_ids = (task.task_id for task in upload_tasks)
    generate_metadata = Landsat8MTLReaderOperator(
        task_id='generate_metadata',
        original_package_download_base_url=LANDSAT8.original_package_download_base_url,
        gs_workspace=LANDSAT8.geoserver_workspace,
        gs_wms_layer=LANDSAT8.geoserver_layer,
        gs_wms_width=LANDSAT8.geoserver_oseo_wms_width,
        gs_wms_height=LANDSAT8.geoserver_oseo_wms_height,
        gs_wms_format=LANDSAT8.geoserver_oseo_wms_format,
        gs_wms_version=LANDSAT8.geoserver_oseo_wms_version,
        gs_wfs_featuretype=LANDSAT8.geoserver_featuretype,
        gs_wfs_format=LANDSAT8.geoserver_oseo_wfs_format,
        gs_wfs_version=LANDSAT8.geoserver_oseo_wfs_version,
        gs_wcs_scale_i=LANDSAT8.geoserver_oseo_wcs_scale_i,
        gs_wcs_scale_j=LANDSAT8.geoserver_oseo_wcs_scale_j,
        gs_wcs_format=LANDSAT8.geoserver_oseo_wcs_format,
        gs_wcs_version=LANDSAT8.geoserver_oseo_wcs_version,
        gs_wcs_coverage_id=LANDSAT8.geoserver_layer,
        get_inputs_from={
            "search_task_id": search_task.task_id,
            "metadata_task_id": download_metadata.task_id,
            "upload_task_ids": upload_task_ids,
            "gdalinfo_task_id": gdalinfo_task_id,
            "upload_original_package_task_id": upload_original_package_task.task_id,
        },
        metadata_xml_path=os.path.join(CFG.templates_base_dir, "metadata.xml"),
        dag=dag
    )

    product_zip_task = Landsat8ProductZipFileOperator(
        task_id='landsat8_product_zip',
        get_inputs_from=[
            generate_html_description.task_id,
            generate_metadata.task_id,
            generate_thumbnail.task_id
        ],
        output_dir=LANDSAT8.process_dir,
        dag=dag
    )

    # curl -vvv -u evoadmin:\!
-XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products" publish_task = PythonOperator(task_id="publish_product_task", python_callable=publish_product, op_kwargs={ 'geoserver_username': CFG.geoserver_username, 'geoserver_password': CFG.geoserver_password, 'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format( CFG.geoserver_rest_url, LANDSAT8.geoserver_oseo_collection), 'get_inputs_from': product_zip_task.task_id, }, dag=dag) if CFG.eoxserver_rest_url: publish_eox_task = PythonOperator(task_id="publish_product_eox_task", python_callable=publish_product, op_kwargs={ 'geoserver_username': CFG.eoxserver_username, 'geoserver_password': CFG.eoxserver_password, 'geoserver_rest_endpoint': CFG.eoxserver_rest_url, 'get_inputs_from': product_zip_task.task_id, }, dag = dag) download_thumbnail.set_upstream(search_task) download_metadata.set_upstream(search_task) for tid in download_tasks: create_original_package_task.set_upstream(tid) upload_original_package_task.set_upstream(create_original_package_task) generate_metadata.set_upstream(join_task) generate_metadata.set_upstream(download_metadata) generate_metadata.set_upstream(upload_original_package_task) generate_thumbnail.set_upstream(download_thumbnail) generate_html_description.set_upstream(search_task) product_zip_task.set_upstream(generate_html_description) product_zip_task.set_upstream(generate_metadata) product_zip_task.set_upstream(generate_thumbnail) publish_task.set_upstream(upload_original_package_task) publish_task.set_upstream(product_zip_task) if CFG.eoxserver_rest_url: publish_eox_task.set_upstream(publish_task) return dag
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('simple', default_args=default_args)

# BashOperator (not DummyOperator) is needed to actually execute bash_command,
# and task_ids must be unique within a DAG.
t1 = BashOperator(
    task_id='testairflow_1',
    bash_command='python C:\\Users\\Lansrod\\Desktop\\truata project\\pyspark_introduction\\src\\task1_1.py',
    dag=dag)

t2 = BashOperator(
    task_id='testairflow_2',
    bash_command='python C:\\Users\\Lansrod\\Desktop\\truata project\\pyspark_introduction\\src\\task1_2.py;'
                 'python C:\\Users\\Lansrod\\Desktop\\truata project\\pyspark_introduction\\src\\task1_3.py;',
    dag=dag)

t3 = BashOperator(
    task_id='testairflow_3',
    bash_command='python C:\\Users\\Lansrod\\Desktop\\truata project\\pyspark_introduction\\src\\task2_1.py;'
                 'python C:\\Users\\Lansrod\\Desktop\\truata project\\pyspark_introduction\\src\\task2_2.py;'
                 'python C:\\Users\\Lansrod\\Desktop\\truata project\\pyspark_introduction\\src\\task2_3.py;',
    dag=dag)
# DAG tests depends_on_past dependencies
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,)

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag3,
    python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,)
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', default_args=default_args)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    python_callable=fail,
)
dag4_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag4,
    trigger_rule=TriggerRule.ALL_FAILED
)
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)
run_this_last.set_upstream(t2)
}

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):
    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(task_id='condition',
                            provide_context=True,
                            python_callable=should_run,
                            dag=dag)

oper_1 = DummyOperator(task_id='oper_1', dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(task_id='oper_2', dag=dag)
oper_2.set_upstream(cond)
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(
    dag_id='dag2',
    default_args=args,
    schedule_interval="30 17 * * *"  # a crontab-style schedule can be used here
)

task0 = DummyOperator(task_id='task0', dag=dag)

cmd = 'ls -l'
task1 = BashOperator(task_id='task1', bash_command=cmd, dag=dag)
task0.set_downstream(task1)

task2 = DummyOperator(trigger_rule='all_done',
                      task_id='task2',
                      dag=dag,
                      depends_on_past=True)
task2.set_upstream(task1)

task3 = DummyOperator(trigger_rule='all_done',
                      depends_on_past=True,
from airflow.operators import PythonOperator, DummyOperator

# task 1: dummy =====================
# 3 types of building blocks:
#   sensor: keeps running until a certain criteria is met
#     e.g. HdfsSensor: waits for a file or folder to land in HDFS
#   operator: triggers a certain action (e.g. call a function)
#     e.g. PythonOperator
#   transfer: moves data from one location to another
#     e.g. S3ToRedshiftTransfer: load files from S3 to Redshift
dummy_task = DummyOperator(
    task_id='dummy_start',
    # dag container
    dag=dag)


# the callable must be defined before the operator below references it
def mua_may_quay_cuong(bai_hat):
    print('Dang quay bai', bai_hat)


# task 2: Upload file =====================
di_bar = PythonOperator(
    task_id='di_bar',
    # function that is invoked
    python_callable=mua_may_quay_cuong,
    # function arguments
    op_kwargs={'bai_hat': 'Vinahouse'},
    dag=dag)
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
dag2 = DAG(dag_id='test_depends_on_past', start_date=DEFAULT_DATE)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
    owner='airflow')

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', start_date=DEFAULT_DATE)
dag3_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag3,
    owner='airflow',
    python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,
    owner='airflow')
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', start_date=DEFAULT_DATE)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    owner='airflow',
    python_callable=fail,
)
dag4_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag4,
    owner='airflow',