Example #1
    # 9. RENAME columns based on PROVENANCE
    provenance_translation = ProvenanceRenameOperator(
        task_id="provenance_rename",
        dataset_name=dag_id,
        prefix_table_name=f"{dag_id}_",
        postfix_table_name="_new",
        rename_indexes=False,
        pg_schema="public",
    )

    # 10. Rename temp named tables to final names
    rename_tables = PostgresOperator(task_id="rename_tables", sql=SQL_TABLE_RENAMES)

    # 11. Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)


(
    slack_at_start
    >> mkdir
    >> download_geojson
    >> fetch_files
    >> proces_cmsa
    >> create_tables
    >> import_data
    >> fill_markering
    >> provenance_translation
    >> rename_tables
    >> grant_db_permissions
)
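
The flow above loads data into tables suffixed "_new" (see postfix_table_name) and then swaps them into place via SQL_TABLE_RENAMES. That constant is imported elsewhere in the repository; a minimal sketch of what such a rename statement could look like, with an illustrative table name, is:

# Hypothetical sketch of SQL_TABLE_RENAMES -- the real constant is defined
# elsewhere in the repository and may differ. The pattern implied by
# postfix_table_name="_new" is swapping freshly loaded tables into place.
SQL_TABLE_RENAMES = """
    BEGIN;
    DROP TABLE IF EXISTS example_table CASCADE;                       -- illustrative name
    ALTER TABLE IF EXISTS example_table_new RENAME TO example_table;
    COMMIT;
"""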
Example #2
dag_id = "airflow_db_permissions"
logger = logging.getLogger(__name__)

with DAG(
        dag_id,
        default_args=default_args,
        schedule_interval="*/15 * * * *",
        catchup=False,
        description="Set grants on database objects to database roles based on the "
        "schema auth definition, for DAGs that executed successfully within the "
        "specified time window.",
) as dag:

    # 1. Post info message on slack
    slack_at_start = MessageOperator(
        task_id="slack_at_start",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
        username="******",
    )

    # 2. Add grants (in batch mode)
    # Looks for DAGs that executed successfully within the time window.
    # The default time window is 30 minutes.
    # Beware: currently the logic assumes dag_id == dataset name.
    # TODO: look into using something other than dag_id as the dataset name.
    grants = PostgresPermissionsOperator(task_id="grants", batch_ind=True)

# FLOW
slack_at_start >> grants
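
The batch mode described in the comments grants permissions only for DAG runs that finished successfully inside the time window. A minimal conceptual sketch of that filtering idea (this is not the PostgresPermissionsOperator's actual code; the run data shape is an assumption):

# Conceptual sketch only -- not the operator's implementation.
from datetime import datetime, timedelta, timezone

def dag_ids_finished_within_window(dag_runs, window_minutes=30):
    """Return dag_ids of runs that completed successfully within the window."""
    cutoff = datetime.now(timezone.utc) - timedelta(minutes=window_minutes)
    return {
        run["dag_id"]
        for run in dag_runs
        if run["state"] == "success" and run["end_date"] >= cutoff
    }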
Example #3
def create_gob_dag(is_first: bool, gob_dataset_id: str,
                   gob_table_id: str) -> DAG:

    dataset_table_id = f"{gob_dataset_id}_{gob_table_id}"
    graphql_dir_path = graphql_path / f"{gob_dataset_id}-{gob_table_id}"
    graphql_params_path = graphql_dir_path / "args.json"
    extra_kwargs = {}
    schedule_start_hour = 6
    if graphql_params_path.exists():
        with graphql_params_path.open() as json_file:
            args_from_file = json.load(json_file)
            extra_kwargs = args_from_file.get("extra_kwargs", {})
            protected = extra_kwargs.get("protected", False)
            if protected:
                extra_kwargs["endpoint"] = GOB_SECURE_ENDPOINT

    dag = DAG(
        f"{dag_id}_{dataset_table_id}",
        default_args={
            "owner": owner,
            **default_args
        },
        schedule_interval=f"0 {schedule_start_hour} * * *"
        if is_first else None,
        tags=["gob"],
    )

    kwargs = dict(
        task_id=f"load_{dataset_table_id}",
        endpoint=GOB_PUBLIC_ENDPOINT,
        retries=3,
        graphql_query_path=graphql_dir_path / "query.graphql",
        max_records=MAX_RECORDS,
        http_conn_id="gob_graphql",
        token_expires_margin=OAUTH_TOKEN_EXPIRES_MARGIN,
        xcom_table_info_task_ids=f"mkinfo_{dataset_table_id}",
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id=f"slack_at_start_{dataset_table_id}",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        def _create_dataset_info(dataset_id: str,
                                 table_id: str) -> DatasetInfo:
            dataset = schema_def_from_url(SCHEMA_URL,
                                          dataset_id,
                                          prefetch_related=True)
            # Fetch the db_name for this dataset and table
            db_table_name = dataset.get_table_by_id(table_id).db_name()

            # We do not pass the dataset through xcom, but only the id.
            # The methodtools.lru_cache decorator is not pickleable
            # (Airflow uses pickle for (de)serialization).
            # provide the dataset_table_id as fully qualified name, for convenience
            dataset_table_id = f"{dataset_id}_{table_id}"
            return DatasetInfo(SCHEMA_URL, dataset_id, table_id,
                               dataset_table_id, db_table_name)

        # 2. Create Dataset info to put on the xcom channel for later use
        # by operators
        create_dataset_info = PythonOperator(
            task_id=f"mkinfo_{dataset_table_id}",
            python_callable=_create_dataset_info,
            op_args=(gob_dataset_id, gob_table_id),
        )

        def init_assigner(o: Any, x: Any) -> None:
            o.table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}"

        # 3. drop temp table if exists
        init_table = PostgresTableInitOperator(
            task_id=f"init_{dataset_table_id}",
            table_name=None,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=init_assigner,
            drop_table=True,
        )

        # 4. load data into temp table
        load_data = HttpGobOperator(**{**kwargs, **extra_kwargs})

        def copy_assigner(o: Any, x: Any) -> None:
            o.source_table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}"
            o.target_table_name = x.db_table_name

        # 5. truncate target table and insert data from temp table
        copy_table = PostgresTableCopyOperator(
            task_id=f"copy_{dataset_table_id}",
            source_table_name=None,
            target_table_name=None,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=copy_assigner,
        )

        def index_assigner(o: Any, x: Any) -> None:
            o.data_table_name = x.db_table_name

        # 6. create an index on the identifier fields (as specified in the JSON data schema)
        create_extra_index = SqlAlchemyCreateObjectOperator(
            task_id=f"create_extra_index_{dataset_table_id}",
            data_schema_name=gob_dataset_id,
            data_table_name=None,
            # when set to false, it doesn't create the tables; only the index
            ind_table=False,
            ind_extra_index=True,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=index_assigner,
        )

        # 7. trigger next DAG (estafette / relay run)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix="gob_",
            trigger_rule="all_done",
        )

        # 8. Grant database permissions
        grant_db_permissions = PostgresPermissionsOperator(
            task_id="grants", dag_name=f"{dag_id}_{dataset_table_id}")

        # FLOW
        (slack_at_start >> create_dataset_info >> init_table >> load_data >>
         copy_table >> create_extra_index >> trigger_next_dag >>
         grant_db_permissions)

    return dag
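
The factory above is meant to be called once per GOB dataset/table combination, registering each generated DAG in the module namespace so the Airflow scheduler discovers it. A usage sketch (the directory scan is an assumption based on the graphql_dir_path naming convention used above):

# Hypothetical usage sketch: the actual list of dataset/table combinations is
# built elsewhere in the repository. Directory names follow the
# "<dataset>-<table>" convention used for graphql_dir_path above.
gob_gql_dirs = sorted(p.name for p in graphql_path.iterdir() if p.is_dir())

for i, dir_name in enumerate(gob_gql_dirs):
    gob_dataset_id, gob_table_id = dir_name.split("-", 1)
    globals()[f"gob_{gob_dataset_id}_{gob_table_id}"] = create_gob_dag(
        is_first=(i == 0),
        gob_dataset_id=gob_dataset_id,
        gob_table_id=gob_table_id,
    )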
Example #4
def create_basiskaart_dag(is_first: bool, table_name: str,
                          select_statement: str) -> DAG:
    """Generates a DAG for each table.

    The table_name is the target table in the masterDB where the data will be inserted. The
    select_statement is one of the imported SQL query selects (see above) that will be executed
    on the source DB.
    """
    # start time first DAG
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time between
    # 3 and 5 a.m. Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={
            "owner": owner,
            **default_args
        },
        # the first DAG will have the is_first boolean set to True
        # the other DAG's will be triggered to start when the previous DAG is finished
        # (estafette run / relay run)
        schedule_interval=f"0 {schedule_start_hour} * * *"
        if is_first else None,
        description="""
        basisregistratie grootschalige topografie (BGT) and kleinschalige basiskaart (KBK10 and 50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in temp table to merge in target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename COLUMNS based on Provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=dag_id,
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

        # 9. Grant database permissions
        grant_db_permissions = PostgresPermissionsOperator(task_id="grants",
                                                           dag_name=dag_id)

    # Flow
    (slack_at_start >> create_tables >> copy_data >> change_data_capture >>
     create_mviews >> provenance_translation >> clean_up >> trigger_next_dag >>
     grant_db_permissions)

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie),
    KBK10 (kleinschalige basiskaart 10)
    and KBK50 (kleinschalige basiskaart 50) data.
    #### Mission Critical
    Classified as 2 (beschikbaarheid [range: 1,2,3])
    #### On Failure Actions
    Fix issues
    and rerun dag on working days
    #### Point of Contact
    Inform the businessowner at [businessowner]@amsterdam.nl
    #### Business Use Case / process / origin
    NA
    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore
    and processed in the basiskaart DB,
    which is the source for this DAG.
    """

    return dag
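
PostgresOperator renders the params dict into the SQL through Jinja, so CREATE_TABLES and CREATE_MVIEWS can be written against {{ params.dag_id }} and {{ params.base_table }}. A sketch of what CREATE_TABLES could look like (the real statement is imported elsewhere in the repository; the columns are illustrative):

# Hypothetical sketch of the imported CREATE_TABLES; the real SQL may differ.
# PostgresOperator renders {{ params.* }} placeholders via Jinja before execution.
CREATE_TABLES = """
    CREATE TABLE IF NOT EXISTS {{ params.dag_id }}_{{ params.base_table }}_temp (
        id serial PRIMARY KEY,
        geometrie geometry                               -- illustrative columns
    );
    CREATE TABLE IF NOT EXISTS {{ params.dag_id }}_{{ params.base_table }} (
        LIKE {{ params.dag_id }}_{{ params.base_table }}_temp INCLUDING ALL
    );
"""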