Code example #1
def create_basiskaart_dag(is_first, table_name, select_statement):
    """ 
    DAG generator: Generates a DAG for each table.
    The table_name is the target table in de masterDB where the data will be inserted.
    The select_statement is one of the imported SQL query selects (see above) that will be executed on the source DB.    
    """
    # Start time of the first DAG
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time between 3 and 5 a.m.
    # Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={
            "owner": owner,
            **default_args
        },
        # the first DAG will have the is_first boolean set to True
        # the other DAGs will be triggered to start when the previous DAG has finished (estafette run / relay run)
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        description="""Basisregistratie grootschalige topografie (BGT) and kleinschalige basiskaart (KBK10 and 50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in temp table to merge in target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename COLUMNS based on Provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=f"{dag_id}",
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

    # Flow
    slack_at_start >> create_tables >> copy_data >> change_data_capture >> create_mviews >> provenance_translation >> clean_up >> trigger_next_dag

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie), KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data
    #### Mission Critical
    Classified as 2 (availability [range: 1,2,3])
    #### On Failure Actions
    Fix issues and rerun the DAG on working days
    #### Point of Contact
    Inform the business owner at [businessowner]@amsterdam.nl
    #### Business Use Case / process / origin
    NA
    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore and processed in the basiskaart DB, which is the source for this DAG.
    """

    return dag
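
For context, create_basiskaart_dag is typically called once per table so that the scheduler picks up one generated DAG per table; only the first DAG gets the cron schedule, the rest are started by the trigger_next_dag task of the preceding DAG (the relay run). The loop below is a minimal registration sketch; the tables mapping and the select-statement names are hypothetical, not the repository's actual configuration.

# Registration sketch (assumption): one generated DAG per table, registered in
# globals() so the Airflow scheduler discovers it. Table names and select-statement
# names below are hypothetical.
tables = {
    "bgt": "SELECT_BGT",
    "kbk10": "SELECT_KBK10",
    "kbk50": "SELECT_KBK50",
}

for index, (table_name, select_statement) in enumerate(tables.items()):
    globals()[f"{dag_id}_{table_name}"] = create_basiskaart_dag(
        is_first=index == 0,
        table_name=table_name,
        select_statement=select_statement,
    )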
Code example #2
    # if the table does not exist yet
    create_tables = SqlAlchemyCreateObjectOperator(
        task_id="create_table",
        data_schema_name=dag_id,
        data_table_name=f"{dag_id}_{tables['dwh_stadsdelen']}",
        ind_table=True,
        # when set to False, no indexes are created, only tables
        ind_extra_index=True,
    )

    # 10. DWH STADSDELEN SOURCE
    # Check for changes to merge in target table by using CDC
    change_data_capture = PgComparatorCDCOperator(
        task_id="change_data_capture",
        source_table=f"{dag_id}_{to_snake_case(tables['dwh_stadsdelen'])}_new",
        target_table=f"{dag_id}_{to_snake_case(tables['dwh_stadsdelen'])}",
        use_pg_copy=True,
        key_column="id",
        use_key=True,
    )

    # 11. DWH STADSDELEN SOURCE
    # Clean up; delete temp table
    clean_up = PostgresOperator(
        task_id="clean_up",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(
            tablename=f"{dag_id}_{to_snake_case(tables['dwh_stadsdelen'])}_new"
        ),
    )

    # 12. Grant database permissions
Code example #3
    # 9. Create the DB target table (as specified in the JSON data schema)
    # if the table does not exist yet
    create_target_table = SqlAlchemyCreateObjectOperator(
        task_id="create_target_table_based_upon_schema",
        data_schema_name=schema_name,
        data_table_name=f"{schema_name}_{table_name}",
        ind_table=True,
        # when set to False, no indexes are created, only tables
        ind_extra_index=True,
    )

    # 10. Check for changes to merge in target table
    change_data_capture = PgComparatorCDCOperator(
        task_id="change_data_capture",
        source_table=f"{schema_name}_{table_name}_new",
        target_table=f"{schema_name}_{table_name}",
    )

    # 11. Clean up (remove temp table _new)
    clean_up = PostgresOperator(
        task_id="clean_up",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(tablename=f"{schema_name}_{table_name}_new"),
    )

    # 12. Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)

# FLOW
(
Code example #4
            params=dict(tablename=f"{dag_id}_{table_name}_new"),
        )
        for table_name, _ in files_to_download.items()
        if table_name == "historischeonderzoeken"
    ]

    # 10. Dummy operator acts as an interface between one set of parallel tasks
    # and another set of parallel tasks (i.e. lists or tuples) with a different
    # number of lanes (without this intermediary, Airflow will raise an error;
    # see the wiring sketch after this example)
    Interface2 = DummyOperator(task_id="interface2")

    # 11. Check for changes to merge in target table
    change_data_capture = [
        PgComparatorCDCOperator(
            task_id=f"change_data_capture_{table_name}",
            source_table=f"{dag_id}_{table_name}_new",
            target_table=f"{dag_id}_{table_name}",
        )
        for table_name, _ in files_to_download.items()
    ]

    # 12. Clean up
    clean_up = [
        PostgresOperator(
            task_id="clean_up",
            sql=SQL_DROP_TMP_TABLE,
            params=dict(tablename=f"{dag_id}_{table_name}_new"),
        )
        for table_name, _ in files_to_download.items()
    ]
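
The Interface2 dummy operator in step 10 exists only to bridge two task lists with a different number of lanes: Airflow's bitshift API does not accept a direct list >> list dependency, so both lists are attached to the single dummy task instead. A minimal wiring sketch, where previous_parallel_tasks stands in for the hypothetical upstream task list:

# Wiring sketch (assumption): fan the previous task list into the dummy task,
# then fan out to the next list. previous_parallel_tasks is a hypothetical name.
previous_parallel_tasks >> Interface2 >> change_data_capture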
Code example #5
    # 11. Set GEO type
    set_geom = [
        PostgresOperator(
            task_id=f"set_geom_{resource}",
            sql=SQL_SET_GEOM,
            params=dict(tablename=f"{dag_id}_{resource}_new"),
        ) for resource in variables
    ]

    # 12. Check for changes to merge in target table by using CDC
    change_data_capture = [
        PgComparatorCDCOperator(
            task_id=f"change_data_capture_{resource}",
            source_table=f"{dag_id}_{resource}_new",
            target_table=f"{dag_id}_{resource}",
            use_pg_copy=True,
            key_column="id",
            use_key=True,
            no_deletes=True,
        ) for resource in variables
    ]

    # 13. Clean up; delete temp table
    clean_up = [
        PostgresOperator(
            task_id=f"clean_up_{resource}",
            sql=SQL_DROP_TMP_TABLE,
            params=dict(tablename=f"{dag_id}_{resource}_new"),
        ) for resource in variables
    ]
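
Since set_geom, change_data_capture and clean_up each hold exactly one task per resource, one possible wiring is pairwise, giving every resource its own lane. This is a sketch under that assumption, not necessarily how the source DAG chains them:

# Pairwise wiring sketch (assumption): one set_geom >> cdc >> clean_up lane per resource.
for geom, cdc, drop_tmp in zip(set_geom, change_data_capture, clean_up):
    geom >> cdc >> drop_tmp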
Code example #6
    # Create the DB target table (as specified in the JSON data schema)
    # if the table does not exist yet
    create_tables = SqlAlchemyCreateObjectOperator(
        task_id="create_table",
        data_schema_name=data_schema_id,
        data_table_name=dag_id,
        ind_table=True,
        # when set to False, no indexes are created, only tables
        ind_extra_index=True,
    )

    # 6. Check for changes to merge in target table by using CDC
    change_data_capture = PgComparatorCDCOperator(
        task_id="change_data_capture",
        source_table=f"{dag_id}_new",
        target_table=dag_id,
        use_pg_copy=True,
        key_column="id",
        use_key=True,
    )

    # 7. Clean up; delete temp table
    clean_up = PostgresOperator(
        task_id="clean_up",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(tablename=f"{dag_id}_new"),
    )

    # 8. Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=data_schema_id)
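
Because every task in this example is a single operator, the flow reduces to a plain chain. A sketch of the likely ordering, mirroring the flow sections in the other examples:

# Flow sketch (assumption): chain the tasks above in declaration order.
create_tables >> change_data_capture >> clean_up >> grant_db_permissions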

Code example #7
        total_checks = count_checks + geo_checks
        check_name[table_name] = total_checks

    # 18. Execute bundled checks on database
    multi_checks = [
        PostgresMultiCheckOperator(task_id=f"multi_check_{table_name}",
                                   checks=check_name[table_name])
        for table_name in files_to_download.keys()
    ]

    # 19. Check for changes to merge in target table by using CDC
    change_data_capture = [
        PgComparatorCDCOperator(
            task_id=f"change_data_capture_{table_name}",
            source_table=f"{dag_id}_{table_name}_new",
            target_table=f"{dag_id}_{table_name}",
        ) for table_name in files_to_download.keys()
    ]

    # 20. Clean up; delete temp table
    clean_up = [
        PostgresOperator(
            task_id=f"clean_up_{table_name}",
            sql=SQL_DROP_TMP_TABLE,
            params=dict(tablename=f"{dag_id}_{table_name}_new"),
        ) for table_name in files_to_download.keys()
    ]

    # 21. Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants",
Code example #8
    # 13. Create the DB target table (as specified in the JSON data schema)
    # if the table does not exist yet
    create_table = SqlAlchemyCreateObjectOperator(
        task_id="create_table_based_upon_schema",
        data_schema_name=dag_id,
        data_table_name=f"{dag_id}_{dag_id}",
        ind_table=True,
        # when set to False, no indexes are created, only tables
        ind_extra_index=True,
    )

    # 14. Check for changes to merge in target table
    change_data_capture = PgComparatorCDCOperator(
        task_id="change_data_capture",
        source_table=f"{dag_id}_{dag_id}_new",
        target_table=f"{dag_id}_{dag_id}",
    )

    # 15. Clean up (remove temp table _new)
    clean_up = PostgresOperator(
        task_id="clean_up",
        sql=SQL_DROP_TMP_TABLE,
        params=dict(tablename=f"{dag_id}_{dag_id}_new"),
    )

    # 16. Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)

(
    slack_at_start
Code example #9
    )

    data = tuple(
        FileTable(file=Path(f"{file_stem}.csv"),
                  table=f"{TMP_TABLE_PREFIX}{table}")
        for file_stem, table in table_mappings.items())
    postgres_insert_csv = PostgresInsertCsvOperator(
        task_id="postgres_insert_csv",
        data=data,
        base_dir_task_id="mk_tmp_dir")

    change_data_capture = [
        PgComparatorCDCOperator(
            task_id=f"change_data_capture_{table}",
            source_table=f"{TMP_TABLE_PREFIX}{table}",
            target_table=table,
            use_pg_copy=True,
            key_column="cdc_id",
            use_key=True,
        ) for table in table_mappings.values()
    ]

    rm_tmp_tables_post = rm_tmp_tables("_post")

    def _rm_tmp_dir(**kwargs: Any) -> None:
        tmp_dir = kwargs["task_instance"].xcom_pull(task_ids="mk_tmp_dir")
        shutil.rmtree(tmp_dir)

    rm_tmp_dir = PythonOperator(
        task_id="rm_tmp_dir",
        python_callable=_rm_tmp_dir,
        provide_context=