) as dag:

    tmp_dir = f"/tmp/{dag_id}"

    colnames = [
        "ogc_fid",
        "wkb_geometry",
        "oov_naam",
        "type",
        "url",
    ]

    fetch_shp_files = []

    slack_at_start = MessageOperator(
        task_id="slack_at_start",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
        username="******",
    )

    for ext in ("dbf", "prj", "shp", "shx"):
        file_name = f"OOV_gebieden_totaal.{ext}"
        fetch_shp_files.append(
            SwiftOperator(
                task_id=f"fetch_shp_{ext}",
                container=dag_id,
                object_id=file_name,
                output_path=f"/tmp/{dag_id}/{file_name}",
            )
        )

    extract_shp = BashOperator(
def create_gob_dag(is_first, gob_dataset_name, gob_table_name):

    gob_db_table_name = f"{gob_dataset_name}_{gob_table_name}"
    graphql_dir_path = graphql_path / f"{gob_dataset_name}-{gob_table_name}"
    graphql_params_path = graphql_dir_path / "args.json"
    extra_kwargs = {}
    schedule_start_hour = 6
    if graphql_params_path.exists():
        with graphql_params_path.open() as json_file:
            args_from_file = json.load(json_file)
            extra_kwargs = args_from_file.get("extra_kwargs", {})
            protected = extra_kwargs.get("protected", False)
            if protected:
                extra_kwargs["endpoint"] = GOB_SECURE_ENDPOINT

    dag = DAG(
        f"{dag_id}_{gob_db_table_name}",
        default_args={"owner": owner, **default_args},
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        tags=["gob"],
    )

    kwargs = dict(
        task_id=f"load_{gob_db_table_name}",
        endpoint=GOB_PUBLIC_ENDPOINT,
        dataset=gob_dataset_name,
        schema=gob_table_name,
        retries=3,
        graphql_query_path=graphql_dir_path / "query.graphql",
        max_records=MAX_RECORDS,
        http_conn_id="gob_graphql",
        token_expires_margin=OAUTH_TOKEN_EXPIRES_MARGIN,
    )

    with dag:
        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id=f"slack_at_start_{gob_db_table_name}",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. drop temp table if exists
        init_table = PostgresTableInitOperator(
            task_id=f"init_{gob_db_table_name}",
            table_name=f"{gob_db_table_name}{TMP_TABLE_POSTFIX}",
            drop_table=True,
        )

        # 3. load data into temp table
        load_data = HttpGobOperator(**{**kwargs, **extra_kwargs})

        # 4. truncate target table and insert data from temp table
        copy_table = PostgresTableCopyOperator(
            task_id=f"copy_{gob_db_table_name}",
            source_table_name=f"{gob_db_table_name}{TMP_TABLE_POSTFIX}",
            target_table_name=gob_db_table_name,
        )

        # 5. create an index on the identifier fields (as specified in the JSON data schema)
        create_extra_index = SqlAlchemyCreateObjectOperator(
            task_id=f"create_extra_index_{gob_db_table_name}",
            data_schema_name=kwargs.get("dataset", None),
            data_table_name=f"{gob_db_table_name}",
            # when set to false, it doesn't create the tables; only the index
            ind_table=False,
            ind_extra_index=True,
        )

        # 6. trigger next DAG (estafette / relay run)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix="gob_",
            trigger_rule="all_done",
        )

        # FLOW
        (
            slack_at_start
            >> init_table
            >> load_data
            >> copy_table
            >> create_extra_index
            >> trigger_next_dag
        )

    return dag
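
# Illustrative sketch (not part of the original module): a factory like create_gob_dag
# is usually invoked once per dataset/table pair at import time, and the resulting DAGs
# are registered in globals() so Airflow can discover them. Only the first DAG gets the
# cron schedule; the others are kicked off by the TriggerDynamicDagRunOperator relay
# ("estafette"). The pair list below is an invented example, not taken from the source.
gob_tables = [
    ("gebieden", "buurten"),
    ("gebieden", "wijken"),
]

for i, (gob_dataset_name, gob_table_name) in enumerate(gob_tables):
    generated_dag = create_gob_dag(i == 0, gob_dataset_name, gob_table_name)
    # Airflow only picks up DAG objects that are reachable as module globals.
    globals()[generated_dag.dag_id] = generated_dag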
def create_gob_dag(is_first: bool, gob_dataset_id: str, gob_table_id: str) -> DAG:

    dataset_table_id = f"{gob_dataset_id}_{gob_table_id}"
    graphql_dir_path = graphql_path / f"{gob_dataset_id}-{gob_table_id}"
    graphql_params_path = graphql_dir_path / "args.json"
    extra_kwargs = {}
    schedule_start_hour = 6
    if graphql_params_path.exists():
        with graphql_params_path.open() as json_file:
            args_from_file = json.load(json_file)
            extra_kwargs = args_from_file.get("extra_kwargs", {})
            protected = extra_kwargs.get("protected", False)
            if protected:
                extra_kwargs["endpoint"] = GOB_SECURE_ENDPOINT

    dag = DAG(
        f"{dag_id}_{dataset_table_id}",
        default_args={"owner": owner, **default_args},
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        tags=["gob"],
    )

    kwargs = dict(
        task_id=f"load_{dataset_table_id}",
        endpoint=GOB_PUBLIC_ENDPOINT,
        retries=3,
        graphql_query_path=graphql_dir_path / "query.graphql",
        max_records=MAX_RECORDS,
        http_conn_id="gob_graphql",
        token_expires_margin=OAUTH_TOKEN_EXPIRES_MARGIN,
        xcom_table_info_task_ids=f"mkinfo_{dataset_table_id}",
    )

    with dag:
        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id=f"slack_at_start_{dataset_table_id}",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        def _create_dataset_info(dataset_id: str, table_id: str) -> DatasetInfo:
            dataset = schema_def_from_url(SCHEMA_URL, dataset_id, prefetch_related=True)
            # Fetch the db_name for this dataset and table
            db_table_name = dataset.get_table_by_id(table_id).db_name()

            # We do not pass the dataset through xcom, but only the id.
            # The methodtools.lru_cache decorator is not pickleable
            # (Airflow uses pickle for (de)serialization).
            # Provide the dataset_table_id as fully qualified name, for convenience.
            dataset_table_id = f"{dataset_id}_{table_id}"

            return DatasetInfo(SCHEMA_URL, dataset_id, table_id, dataset_table_id, db_table_name)

        # 2. Create Dataset info to put on the xcom channel for later use
        # by operators
        create_dataset_info = PythonOperator(
            task_id=f"mkinfo_{dataset_table_id}",
            python_callable=_create_dataset_info,
            op_args=(gob_dataset_id, gob_table_id),
        )

        def init_assigner(o: Any, x: Any) -> None:
            o.table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}"

        # 3. drop temp table if exists
        init_table = PostgresTableInitOperator(
            task_id=f"init_{dataset_table_id}",
            table_name=None,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=init_assigner,
            drop_table=True,
        )

        # 4. load data into temp table
        load_data = HttpGobOperator(**{**kwargs, **extra_kwargs})

        def copy_assigner(o: Any, x: Any) -> None:
            o.source_table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}"
            o.target_table_name = x.db_table_name

        # 5. truncate target table and insert data from temp table
        copy_table = PostgresTableCopyOperator(
            task_id=f"copy_{dataset_table_id}",
            source_table_name=None,
            target_table_name=None,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=copy_assigner,
        )

        def index_assigner(o: Any, x: Any) -> None:
            o.data_table_name = x.db_table_name

        # 6. create an index on the identifier fields (as specified in the JSON data schema)
        create_extra_index = SqlAlchemyCreateObjectOperator(
            task_id=f"create_extra_index_{dataset_table_id}",
            data_schema_name=gob_dataset_id,
            data_table_name=None,
            # when set to false, it doesn't create the tables; only the index
            ind_table=False,
            ind_extra_index=True,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=index_assigner,
        )

        # 7. trigger next DAG (estafette / relay run)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix="gob_",
            trigger_rule="all_done",
        )

        # 8. Grant database permissions
        grant_db_permissions = PostgresPermissionsOperator(
            task_id="grants",
            dag_name=f"{dag_id}_{dataset_table_id}",
        )

        # FLOW
        (
            slack_at_start
            >> create_dataset_info
            >> init_table
            >> load_data
            >> copy_table
            >> create_extra_index
            >> trigger_next_dag
            >> grant_db_permissions
        )

    return dag
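
# Illustrative sketch (assumption; the real behaviour lives inside the custom operators):
# the xcom_task_ids / xcom_attr_assigner pair lets an operator defer table names until
# runtime. At execute time the operator pulls the DatasetInfo pushed by its "mkinfo_*"
# task and lets the assigner copy the resolved names onto itself. The helper name below
# is hypothetical and only shows the mechanism, roughly:
def _apply_xcom_info(operator, context, xcom_task_ids, xcom_attr_assigner):
    # Pull the DatasetInfo object produced by the create_dataset_info task.
    dataset_info = context["ti"].xcom_pull(task_ids=xcom_task_ids)
    # e.g. copy_assigner sets source_table_name / target_table_name on the operator.
    xcom_attr_assigner(operator, dataset_info)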
def create_basiskaart_dag(is_first, table_name, select_statement):
    """DAG generator: generates a DAG for each table.

    The table_name is the target table in the masterDB where the data will be inserted.
    The select_statement is one of the imported SQL query selects (see above) that will
    be executed on the source DB.
    """
    # Start time of the first DAG.
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time
    # between 3 and 5 a.m. Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={"owner": owner, **default_args},
        # The first DAG has the is_first boolean set to True; the other DAGs are
        # triggered to start when the previous DAG is finished (estafette run / relay run).
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        description="""basisregistratie grootschalige topografie (BGT) and kleinschalige
        basiskaart (KBK10 and 50). The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:
        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in temp table to merge into target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename columns based on provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=f"{dag_id}",
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

        # FLOW
        (
            slack_at_start
            >> create_tables
            >> copy_data
            >> change_data_capture
            >> create_mviews
            >> provenance_translation
            >> clean_up
            >> trigger_next_dag
        )

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie),
    KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data.

    #### Mission Critical
    Classified as 2 (availability [range: 1,2,3]).

    #### On Failure Actions
    Fix issues and rerun dag on working days.

    #### Point of Contact
    Inform the business owner at [businessowner]@amsterdam.nl.

    #### Business Use Case / process / origin
    NA

    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore and processed in the
    basiskaart DB, which is the source for this DAG.
    """

    return dag
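
# Illustrative sketch (assumption, not from the source module): select_statement arrives
# as a *string* and is resolved via globals()[select_statement] inside the copy_data task,
# so it must match the name of an imported SQL select. The constant, table name, and
# mapping below are invented examples showing how the factory would typically be driven.
SELECT_GEBOUW_SQL = "SELECT * FROM gebouw"  # normally imported from the SQL module

basiskaart_tables = {"gebouw": "SELECT_GEBOUW_SQL"}

for i, (table_name, select_statement) in enumerate(basiskaart_tables.items()):
    # Only the first DAG is scheduled; the rest run in the estafette relay.
    globals()[f"{dag_id}_{table_name}"] = create_basiskaart_dag(
        i == 0, table_name, select_statement
    )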