def create_basiskaart_dag(is_first, table_name, select_statement):
    """
    DAG generator: generates a DAG for each table.
    The table_name is the target table in the masterDB where the data will be inserted.
    The select_statement is one of the imported SQL query selects (see above)
    that will be executed on the source DB.
    """
    # Start time of the first DAG.
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant
    # time between 3 and 5 a.m. Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={"owner": owner, **default_args},
        # The first DAG has the is_first boolean set to True.
        # The other DAGs are triggered to start when the previous DAG has finished
        # (estafette run / relay run).
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        description="""basisregistratie grootschalige topografie (BGT) and
        kleinschalige basiskaart (KBK10 and KBK50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:
        # 1. Post info message on Slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in the temp table to merge into the target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for the T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename columns based on provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=f"{dag_id}",
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

        # Flow
        (
            slack_at_start
            >> create_tables
            >> copy_data
            >> change_data_capture
            >> create_mviews
            >> provenance_translation
            >> clean_up
            >> trigger_next_dag
        )

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie),
    KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data.
    #### Mission Critical
    Classified as 2 (availability [range: 1, 2, 3]).
    #### On Failure Actions
    Fix issues and rerun the DAG on working days.
    #### Point of Contact
    Inform the business owner at [businessowner]@amsterdam.nl.
    #### Business Use Case / process / origin
    NA
    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: the basiskaart data is collected from the GOB objectstore and processed
    in the basiskaart DB, which is the source for this DAG.
    """

    return dag
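# A minimal sketch (not part of the original snippet) of how the factory above
# could be registered at module level so Airflow discovers each generated DAG.
# The `table_to_select` mapping and its entries are hypothetical; the real module
# presumably maps each target table to the name of an imported SQL select.
table_to_select = {
    "gebouwvlak": "SELECT_GEBOUWVLAK_SQL",    # hypothetical example entries
    "wegdeelvlak": "SELECT_WEGDEELVLAK_SQL",
}

for index, (table_name, select_statement) in enumerate(table_to_select.items()):
    # Only the first generated DAG gets the 7 a.m. schedule; the others are kicked
    # off by the TriggerDynamicDagRunOperator of the preceding DAG (estafette run).
    globals()[f"{dag_id}_{table_name}"] = create_basiskaart_dag(
        is_first=(index == 0),
        table_name=table_name,
        select_statement=select_statement,
    )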
# if table not exists yet
create_tables = SqlAlchemyCreateObjectOperator(
    task_id="create_table",
    data_schema_name=dag_id,
    data_table_name=f"{dag_id}_{tables['dwh_stadsdelen']}",
    ind_table=True,  # when set to false, it doesn't create indexes; only tables
    ind_extra_index=True,
)

# 10. DWH STADSDELEN SOURCE
# Check for changes to merge in target table by using CDC
change_data_capture = PgComparatorCDCOperator(
    task_id="change_data_capture",
    source_table=f"{dag_id}_{to_snake_case(tables['dwh_stadsdelen'])}_new",
    target_table=f"{dag_id}_{to_snake_case(tables['dwh_stadsdelen'])}",
    use_pg_copy=True,
    key_column="id",
    use_key=True,
)

# 11. DWH STADSDELEN SOURCE
# Clean up; delete temp table
clean_up = PostgresOperator(
    task_id="clean_up",
    sql=SQL_DROP_TMP_TABLE,
    params=dict(
        tablename=f"{dag_id}_{to_snake_case(tables['dwh_stadsdelen'])}_new"
    ),
)

# 12. Grant database permissions
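# The clean-up task above references SQL_DROP_TMP_TABLE, which is not defined in
# this snippet. A minimal sketch of what such a Jinja-templated statement could
# look like, assuming the `tablename` param passed above; the real template may
# differ.
SQL_DROP_TMP_TABLE = """
    DROP TABLE IF EXISTS {{ params.tablename }} CASCADE;
"""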
# 9. Create the DB target table (as specified in the JSON data schema)
# if table not exists yet
create_target_table = SqlAlchemyCreateObjectOperator(
    task_id="create_target_table_based_upon_schema",
    data_schema_name=schema_name,
    data_table_name=f"{schema_name}_{table_name}",
    ind_table=True,  # when set to false, it doesn't create indexes; only tables
    ind_extra_index=True,
)

# 10. Check for changes to merge in target table
change_data_capture = PgComparatorCDCOperator(
    task_id="change_data_capture",
    source_table=f"{schema_name}_{table_name}_new",
    target_table=f"{schema_name}_{table_name}",
)

# 11. Clean up (remove temp table _new)
clean_up = PostgresOperator(
    task_id="clean_up",
    sql=SQL_DROP_TMP_TABLE,
    params=dict(tablename=f"{schema_name}_{table_name}_new"),
)

# 12. Grant database permissions
grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)

# FLOW (
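# A plausible flow for the tasks above, assuming they are chained in numbered
# order; the original snippet is cut off at the flow definition, and any tasks
# from earlier steps are not shown here.
(
    create_target_table
    >> change_data_capture
    >> clean_up
    >> grant_db_permissions
)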
        params=dict(tablename=f"{dag_id}_{table_name}_new"),
    )
    for table_name, _ in files_to_download.items()
    if table_name == "historischeonderzoeken"
]

# 10. Dummy operator acts as an interface between parallel tasks
# and other parallel tasks (i.e. lists or tuples) with a different
# number of lanes (without this intermediary, Airflow will give an error)
Interface2 = DummyOperator(task_id="interface2")

# 11. Check for changes to merge in target table
change_data_capture = [
    PgComparatorCDCOperator(
        task_id=f"change_data_capture_{table_name}",
        source_table=f"{dag_id}_{table_name}_new",
        target_table=f"{dag_id}_{table_name}",
    )
    for table_name, _ in files_to_download.items()
]

# 12. Clean up
clean_up = [
    PostgresOperator(
        # task_id must be unique per table, otherwise Airflow rejects the DAG
        task_id=f"clean_up_{table_name}",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(tablename=f"{dag_id}_{table_name}_new"),
    )
    for table_name, _ in files_to_download.items()
]
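# A minimal sketch (not from the original snippet) of how Interface2 could be
# wired between task lists of different widths. `previous_tasks` is a hypothetical
# name for the filtered list built in step 9 above (its real name is cut off).
# A list of tasks can point at a single task and vice versa, but not directly at
# another list of a different length, hence the dummy task in between.
previous_tasks >> Interface2 >> change_data_capture
for cdc, cleanup in zip(change_data_capture, clean_up):
    cdc >> cleanup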
# 11. Set GEO type
set_geom = [
    PostgresOperator(
        task_id=f"set_geom_{resource}",
        sql=SQL_SET_GEOM,
        params=dict(tablename=f"{dag_id}_{resource}_new"),
    )
    for resource in variables
]

# 12. Check for changes to merge in target table by using CDC
change_data_capture = [
    PgComparatorCDCOperator(
        task_id=f"change_data_capture_{resource}",
        source_table=f"{dag_id}_{resource}_new",
        target_table=f"{dag_id}_{resource}",
        use_pg_copy=True,
        key_column="id",
        use_key=True,
        no_deletes=True,
    )
    for resource in variables
]

# 13. Clean up; delete temp table
clean_up = [
    PostgresOperator(
        task_id=f"clean_up_{resource}",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(tablename=f"{dag_id}_{resource}_new"),
    )
    for resource in variables
]
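# Hypothetical pairwise wiring of the per-resource task lists above (the flow is
# not shown in this snippet): each resource's geometry fix runs before its CDC
# step, which in turn runs before its own clean-up task.
for set_geom_task, cdc_task, clean_up_task in zip(set_geom, change_data_capture, clean_up):
    set_geom_task >> cdc_task >> clean_up_task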
# Create the DB target table (as specified in the JSON data schema)
# if table not exists yet
create_tables = SqlAlchemyCreateObjectOperator(
    task_id="create_table",
    data_schema_name=data_schema_id,
    data_table_name=dag_id,
    ind_table=True,  # when set to false, it doesn't create indexes; only tables
    ind_extra_index=True,
)

# 6. Check for changes to merge in target table by using CDC
change_data_capture = PgComparatorCDCOperator(
    task_id="change_data_capture",
    source_table=f"{dag_id}_new",
    target_table=dag_id,
    use_pg_copy=True,
    key_column="id",
    use_key=True,
)

# 7. Clean up; delete temp table
clean_up = PostgresOperator(
    task_id="clean_up",
    sql=SQL_DROP_TMP_TABLE,
    params=dict(tablename=f"{dag_id}_new"),
)

# 8. Grant database permissions
grant_db_permissions = PostgresPermissionsOperator(
    task_id="grants", dag_name=data_schema_id
)
    total_checks = count_checks + geo_checks
    check_name[table_name] = total_checks

# 18. Execute bundled checks on database
multi_checks = [
    PostgresMultiCheckOperator(
        task_id=f"multi_check_{table_name}", checks=check_name[table_name]
    )
    for table_name in files_to_download.keys()
]

# 19. Check for changes to merge in target table by using CDC
change_data_capture = [
    PgComparatorCDCOperator(
        task_id=f"change_data_capture_{table_name}",
        source_table=f"{dag_id}_{table_name}_new",
        target_table=f"{dag_id}_{table_name}",
    )
    for table_name in files_to_download.keys()
]

# 20. Clean up; delete temp table
clean_up = [
    PostgresOperator(
        task_id=f"clean_up_{table_name}",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(tablename=f"{dag_id}_{table_name}_new"),
    )
    for table_name in files_to_download.keys()
]

# 21. Grant database permissions
grant_db_permissions = PostgresPermissionsOperator(
    task_id="grants",
# 13. Create the DB target table (as specified in the JSON data schema)
# if table not exists yet
create_table = SqlAlchemyCreateObjectOperator(
    task_id="create_table_based_upon_schema",
    data_schema_name=dag_id,
    data_table_name=f"{dag_id}_{dag_id}",
    ind_table=True,  # when set to false, it doesn't create indexes; only tables
    ind_extra_index=True,
)

# 14. Check for changes to merge in target table
change_data_capture = PgComparatorCDCOperator(
    task_id="change_data_capture",
    source_table=f"{dag_id}_{dag_id}_new",
    target_table=f"{dag_id}_{dag_id}",
)

# 15. Clean up (remove temp table _new)
clean_up = PostgresOperator(
    task_id="clean_up",
    sql=SQL_DROP_TMP_TABLE,
    params=dict(tablename=f"{dag_id}_{dag_id}_new"),
)

# 16. Grant database permissions
grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)

(
    slack_at_start
)

data = tuple(
    FileTable(file=Path(f"{file_stem}.csv"), table=f"{TMP_TABLE_PREFIX}{table}")
    for file_stem, table in table_mappings.items()
)

postgres_insert_csv = PostgresInsertCsvOperator(
    task_id="postgres_insert_csv", data=data, base_dir_task_id="mk_tmp_dir"
)

change_data_capture = [
    PgComparatorCDCOperator(
        task_id=f"change_data_capture_{table}",
        source_table=f"{TMP_TABLE_PREFIX}{table}",
        target_table=table,
        use_pg_copy=True,
        key_column="cdc_id",
        use_key=True,
    )
    for table in table_mappings.values()
]

rm_tmp_tables_post = rm_tmp_tables("_post")

def _rm_tmp_dir(**kwargs: Any) -> None:
    tmp_dir = kwargs["task_instance"].xcom_pull(task_ids="mk_tmp_dir")
    shutil.rmtree(tmp_dir)

rm_tmp_dir = PythonOperator(
    task_id="rm_tmp_dir",
    python_callable=_rm_tmp_dir,
    provide_context=
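# Hypothetical sketch of the rm_tmp_tables helper called above; its definition is
# not part of this snippet and would normally sit before its call site. The
# assumption here is a small factory returning a PostgresOperator that drops every
# TMP_TABLE_PREFIX table, with the suffix keeping the task_id unique per call site
# ("_pre", "_post", ...).
def rm_tmp_tables(task_id_suffix: str) -> PostgresOperator:
    return PostgresOperator(
        task_id=f"rm_tmp_tables{task_id_suffix}",
        sql="; ".join(
            f"DROP TABLE IF EXISTS {TMP_TABLE_PREFIX}{table} CASCADE"
            for table in table_mappings.values()
        ),
    )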