    bash_command='date',
    executor_config={"KubernetesExecutor": {"image": "ubuntu:latest"}},
    dag=dag,
)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown), `doc` (plain text),
`doc_rst`, `doc_json`, `doc_yaml` which get rendered in the UI's Task Instance Details page.
![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
"""

dag.doc_md = __doc__

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
    bash_command='sleep 5',
    executor_config={"KubernetesExecutor": {"image": "ubuntu:latest"}},
    dag=dag,
)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
{% endfor %}
"""
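# The task documentation above notes that, besides `doc_md`, tasks also accept
# `doc` (plain text), `doc_rst`, `doc_json` and `doc_yaml`. A minimal sketch of
# setting two of them on the existing task; the content strings are
# illustrative, not from the original:
t2.doc = "Plain-text note: this task sleeps for 5 seconds."
t2.doc_rst = """Sleep task
==========
Rendered as reStructuredText on the Task Instance Details page."""
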
from airflow.sensors.http_sensor import HttpSensor

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('example_http_operator', default_args=default_args)
dag.doc_md = __doc__

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = SimpleHttpOperator(
    task_id='post_op',
    endpoint='api/v1.0/nodes',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    response_check=lambda response: len(response.json()) == 0,
    dag=dag)

t5 = SimpleHttpOperator(
    task_id='post_op_formenc',
    endpoint='nodes/url',
    data="name=Joe",
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    task_id='get_listings',
    python_callable=get_listings,
    dag=dag,
)

t2 = PythonOperator(
    task_id='send_email',
    provide_context=True,
    python_callable=send_email,
    dag=dag,
)

# noinspection PyStatementEffect
t1 >> t2

# Documentation
dag.doc_md = f"""
#### DAG Documentation
{dag.description}
"""

t1.doc_md = """
#### Task Documentation
Retrieves and stores Zoopla data
"""

t2.doc_md = """
#### Task Documentation
Sends email notification when new data is available
"""
    'retry_delay': timedelta(minutes=2),
    'catchup': False,
    'email_on_retry': False
}

# DAG object creation
# Scheduler: https://airflow.apache.org/docs/stable/scheduler.html
dag = DAG(
    'arXiv_Redshift_dag',
    default_args=default_args,
    description='Load and transform data from S3 in Redshift with Airflow',
    schedule_interval='@once',
    catchup=True,
    max_active_runs=1)

dag.doc_md = """
### DAG Summary
This DAG describes the ETL process for ArXiv data from S3 to Redshift

### Points of Contact
Email: [email protected]
"""

############################
# Task Operators
############################

start_operator = DummyOperator(task_id='begin_execution', dag=dag)
start_operator.doc_md = """
# Dummy operator: Start of DAG
"""
    dag=dag_main,
)

wrap_up = BashOperator(
    task_id='wrap_up',
    bash_command=f'python3 {work_dir}/src/update_run_time.py',
    dag=dag_main,
)

# ======================End of main pipeline=====================

# Ending Task
t_end = BashOperator(
    task_id='running_end',
    bash_command='echo Running End! Time: $(date +"%T")',
    trigger_rule='one_success',  # run as soon as one branch is done
    dag=dag_main,
)

# Assemble the main running pipeline
t_start >> monitor >> t1 >> t2 >> t3 >> t4 >> t5 >> wrap_up >> t_end

# Branch if nothing to execute (no updates)
monitor >> check >> t_end

# DAG docs:
dag_main.doc_md = __doc__
t1.doc_md = """\
#### Task Documentation
Data ingestion of files. It ingests two parts.
"""
from att_service_instance_funcs import *

args = {
    'start_date': datetime.utcnow(),
    'owner': 'ATT',
}

dag_att = DAG(
    dag_id='att_workflow_onu',
    default_args=args,
    # this dag will be triggered by external systems
    schedule_interval=None,
)
dag_att.doc_md = __doc__


def ONU_event(model_accessor, message, **kwargs):
    # context = kwargs
    # run_id = context['dag_run'].run_id

    logging.info('onu.events: received event', message=message)

    si = find_or_create_att_si(model_accessor, logging, message)
    if message['status'] == 'activated':
        logging.info('onu.events: activated onu', message=message)
        si.no_sync = False
        si.uni_port_id = long(message['portNumber'])
        si.of_dpid = message['deviceId']
        si.oper_onu_status = 'ENABLED'
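        # For illustration, the event payload this handler appears to expect,
        # based only on the keys accessed above; the concrete values are
        # hypothetical, not taken from the original project:
        #   message = {'status': 'activated',
        #              'portNumber': 16,
        #              'deviceId': 'of:0000000000000001'}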
        ext_task = dag.get_task('wait_for_{}_{}'.format(
            ext_dag_id, ext_task_id))
        dummy = dag.get_task('{}_{}_finish'.format(
            ext_dag_id, ext_task_id))
    except:
        ext_task = ExternalTaskSensor(
            task_id='wait_for_{}_{}'.format(
                ext_dag_id, ext_task_id),
            external_dag_id=ext_dag_id,
            external_task_id=ext_task_id,
            execution_delta=datetime.timedelta(
                minutes=int(execution_delta)),
            dag=dag)
        dummy = DummyOperator(task_id='{}_{}_finish'.format(
            ext_dag_id, ext_task_id), dag=dag)

    ext_task >> dummy >> dag.get_task(
        job_dict.get(str(row.job_num)))
else:
    dag.get_task(job_dict.get(str(dep_job_id))) >> dag.get_task(
        job_dict.get(str(row.job_num)))

dag.doc_md = md

for task in filter(lambda x: x.task_id not in ('start', 'finish'), dag.tasks):
    if not task.upstream_list:
        start >> task
    if not task.downstream_list:
        task >> finish
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
from unicorn.airflow.util.unicorn_airflow_util import load_yaml

dag_id = "unicorn_get_ip_dag"
dir_path = os.path.dirname(os.path.realpath(__file__))
dag_config = load_yaml(os.path.join(dir_path, dag_id + ".yml"))

default_args = dag_config['default_args']
default_args['start_date'] = datetime.now()

dag = DAG(dag_id,
          default_args=dag_config["default_args"],
          schedule_interval=dag_config["schedule_interval"])
dag.doc_md = dag_config['doc_md']

task1 = BashOperator(task_id='TaskStart',
                     bash_command="echo {{params}}",
                     params={'cmd': dag_config["task1_cmd"]},
                     dag=dag)

task2 = BashOperator(task_id='UnicornGetIp',
                     depends_on_past=False,
                     bash_command=dag_config["task1_cmd"],
                     dag=dag)

task3 = DummyOperator(
    task_id='TaskFinish',
    dag=dag
)
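# For reference, a minimal sketch of the mapping that load_yaml() is assumed to
# return for unicorn_get_ip_dag.yml, based only on the keys referenced above;
# all values are hypothetical placeholders:
#
# {
#     'default_args': {'owner': 'unicorn', 'retries': 1},
#     'schedule_interval': '@hourly',
#     'doc_md': '### unicorn_get_ip_dag\nFetches the current IP.',
#     'task1_cmd': 'echo hello',
# }
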
Add a Markdown description to a DAG or a task.

The description is shown in “Graph View” for DAGs, “Task Details” for tasks.

Doc: https://airflow.readthedocs.io/en/latest/concepts.html#documentation-notes
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

default_args = {
    'start_date': datetime.now()
}

dag = DAG(
    'description_markdown',
    default_args=default_args)

dag.doc_md = """
# Markdown hi

## Subheader

Here's a [url](www.airbnb.com)

My numbered list:

1. one
1. two

My bulleted list:

- first
- second
"""
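# The module docstring above also mentions per-task descriptions, shown under
# "Task Details". A minimal sketch using the BashOperator imported above; the
# task id, command and text are illustrative, not from the original:
t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
t1.doc_md = """
#### Task Documentation
This *markdown* is rendered on the task's Task Details page.
"""
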
                           task_id='pod2',
                           is_delete_operator_pod=True,
                           hostnetwork=False,
                           )

t3 = KubernetesPodOperator(namespace='airflow',
                           image="ubuntu:16.04",
                           cmds=["bash", "-cx"],
                           arguments=["echo", "hello world"],
                           labels={'runner': 'airflow'},
                           name="pod3",
                           task_id='pod3',
                           is_delete_operator_pod=True,
                           hostnetwork=False,
                           )

t4 = KubernetesPodOperator(namespace='airflow',
                           image="ubuntu:16.04",
                           cmds=["bash", "-cx"],
                           arguments=["echo", "hello world"],
                           labels={'runner': 'airflow'},
                           name="pod4",
                           task_id='pod4',
                           is_delete_operator_pod=True,
                           hostnetwork=False,
                           )

company_onboarding.doc_md = __doc__

t1 >> [t2, t3] >> t4
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):
    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)
    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### processing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""
    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for Spanish accents
                date = '/'.join(
                    list(
                        re.compile(r"([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b'
                                                      ]).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )
    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])

        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )
    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """

    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag
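# A minimal sketch of how create_dag() might be invoked and registered with the
# scheduler. The airport codes, schedule and sensor delay below are hypothetical
# and only illustrate the shapes implied above (airpots_codes is indexed as
# airpots_codes[0][0], airpots_codes[0][1] and airpots_codes[1]); they are not
# values from the original project.
example_airpots_codes = (('EZE', 'AEP'), 'MIA')  # ((origin_1, origin_2), destination)
globals()['smiles_example'] = create_dag(
    dag_id='smiles_example',
    schedule='@daily',
    start_date=datetime(2020, 1, 1),
    delta_sensor=30,
    airpots_codes=example_airpots_codes,
    default_args={'owner': 'airflow', 'retries': 1},
)
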
def create_basiskaart_dag(is_first: bool, table_name: str, select_statement: str) -> DAG:
    """Generates a DAG for each table.

    The table_name is the target table in the masterDB where the data will be inserted.
    The select_statement is one of the imported SQL query selects (see above)
    that will be executed on the source DB.
    """
    # start time first DAG
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time between
    # 3 and 5 a.m. Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={"owner": owner, **default_args},
        # the first DAG will have the is_first boolean set to True
        # the other DAGs will be triggered to start when the previous DAG is finished
        # (estafette run / relay run)
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        description="""
        Basisregistratie grootschalige topografie (BGT) and kleinschalige basiskaart (KBK10 and 50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:
        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in temp table to merge in target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename COLUMNS based on Provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=dag_id,
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )
        # 9. Grant database permissions
        grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)

    # Flow
    (slack_at_start >> create_tables >> copy_data >> change_data_capture >>
     create_mviews >> provenance_translation >> clean_up >> trigger_next_dag >>
     grant_db_permissions)

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie),
    KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data

    #### Mission Critical
    Classified as 2 (beschikbaarheid [range: 1,2,3])

    #### On Failure Actions
    Fix issues and rerun dag on working days

    #### Point of Contact
    Inform the business owner at [businessowner]@amsterdam.nl

    #### Business Use Case / process / origin
    NA

    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore and processed in the
    basiskaart DB => which is the source for this DAG.
    """

    return dag
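# A sketch of how create_basiskaart_dag() might be called to build the estafette
# (relay) chain described above: only the first DAG gets the cron schedule, the
# others are started by TriggerDynamicDagRunOperator. The table names and
# select-statement names below are hypothetical placeholders, not taken from the
# original project:
for i, (table, select) in enumerate([
        ("tablename_one", "SELECT_STATEMENT_ONE"),
        ("tablename_two", "SELECT_STATEMENT_TWO"),
]):
    globals()[f"{dag_id}_{table}"] = create_basiskaart_dag(i == 0, table, select)
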
from airflow.sensors.cord_workflow_plugin import CORDEventSensor, CORDModelSensor
from airflow.operators.cord_workflow_plugin import CORDModelOperator

log = logging.getLogger(__name__)

args = {
    # hard coded date
    'start_date': datetime(2019, 1, 1),
    'owner': 'iychoi'
}

dag_parallel_cord = DAG(
    dag_id='parallel_cord_workflow',
    default_args=args,
    # this dag will be triggered by external systems
    schedule_interval=None)
dag_parallel_cord.doc_md = __doc__

dag_parallel_cord_admin = DAG(
    dag_id='parallel_cord_workflow_admin',
    default_args=args,
    # this dag will be triggered by external systems
    schedule_interval=None)
dag_parallel_cord_admin.doc_md = __doc__


def on_onu_event(model_accessor, message, **kwargs):
    log.info('onu.events: received an event - %s' % message)


def on_auth_event(model_accessor, message, **kwargs):
    log.info('authentication.events: received an event - %s' % message)