Example #1
0
    # Terminal no-op marker for the CPV pipeline.
    stop_cpv = DummyOperator(task_id='stop_cpv')

    # Push the locally prepared CPV attribute files to the S3 staging area.
    # NOTE(review): task_id normalized from 'Upload_cpv_to_s3s' (typo +
    # capitalized) to the lowercase convention used by every other task here.
    upload_cpv_to_s3 = S3UploadFromLocal(task_id='upload_cpv_to_s3',
                                         s3_folder='staging/cpv_attributes/')

    # DDL: (re)create the staging/datalake tables for CPV data.
    create_redshift = RedshiftOperator(task_id='create_redshift',
                                       sql='schema_cpv_staging_datalake.sql')

    # COPY the staged pipe-delimited CSV (with header) into staging.cpv_attributes.
    copy_from_s3 = RedshiftCopyFromS3(task_id='copy_from_s3',
                                      s3_folder='staging/cpv_attributes',
                                      schema='staging',
                                      table='cpv_attributes',
                                      format='csv',
                                      header=True,
                                      delimiter='|')

    # Merge the staging rows into datalake.cpv_attributes, keyed on codecpv.
    upsert_datalake = RedshiftUpsert(
        task_id='upsert_datalake',
        schema="datalake",
        table="cpv_attributes",
        pkey="codecpv",
        sql="SELECT * FROM staging.cpv_attributes")

    # Post-load sanity check on the upserted datalake table.
    q_check = RedshiftQualityCheck(task_id='quality_check',
                                   schema="datalake",
                                   table="cpv_attributes",
                                   pkey="codecpv")

    # Linear pipeline: upload -> DDL -> COPY -> upsert -> quality check.
    start_cpv >> upload_cpv_to_s3 >> create_redshift >> copy_from_s3 >> upsert_datalake >> q_check >> stop_cpv
Example #2
0
    tags=['dend', 'infogreffe', 'dwh']
) as dag:
    # Render the sibling Readme as the DAG's in-UI documentation.
    # Use a context manager so the file handle is closed deterministically
    # (the original open(...).read() leaked the handle).
    _docs_md_fp = os.path.join(default_args['working_dir'], 'Readme.md')
    with open(_docs_md_fp, 'r') as _docs_md:
        dag.doc_md = _docs_md.read()

    # No-op boundary marker at the start of the refresh pipeline.
    start_refresh = DummyOperator(
        task_id='start_refresh'
    )

    # DDL: (re)create the infogreffe DWH schema objects.
    create_dwh = RedshiftOperator(
        task_id='create_infogreffe_dwh',
        sql='schema_infogreffe_dwh.sql'
    )
    # DML: refresh the infogreffe DWH content.
    refresh_dwh = RedshiftOperator(
        task_id='refresh_infogreffe_dwh',
        sql='refresh_infogreffe.sql'
    )

    # Post-refresh sanity check.
    # NOTE(review): schema is "datalake" while the SQL above targets the DWH —
    # confirm this is the intended table to validate.
    q_check = RedshiftQualityCheck(
        task_id='quality_check',
        schema="datalake",
        table="infogreffe_attributes",
        pkey="infogreffe_uid"
    )

    # No-op boundary marker at the end of the refresh pipeline.
    stop_refresh = DummyOperator(
        task_id='stop_refresh'
    )

    # Linear pipeline: DDL -> refresh -> quality check.
    start_refresh >> create_dwh >> refresh_dwh >> q_check >> stop_refresh
    # Merge de-duplicated titulaires rows from staging into the datalake,
    # keyed on decp_bridge_uid.
    upsert_titulaires = RedshiftUpsert(
        task_id='upsert_titulaires',
        sql="SELECT * FROM staging.decp_titulaires_unique;",
        schema='datalake',
        table='decp_titulaires',
        pkey='decp_bridge_uid')

    # Merge de-duplicated marches rows from staging into the datalake,
    # keyed on decp_uid.
    upsert_marches = RedshiftUpsert(
        task_id='upsert_marches',
        sql="SELECT * FROM staging.decp_marches_unique;",
        schema='datalake',
        table='decp_marches',
        pkey='decp_uid')

    # Post-upsert sanity checks, one per datalake table.
    q_check_marches = RedshiftQualityCheck(task_id='quality_check_marches',
                                           schema="datalake",
                                           table="decp_marches",
                                           pkey="decp_uid")

    q_check_titulaires = RedshiftQualityCheck(
        task_id='quality_check_titulaires',
        schema="datalake",
        table="decp_titulaires",
        pkey="decp_bridge_uid")

    # Terminal no-op marker.
    # NOTE(review): task_id normalized from 'Stop_decp' to the lowercase
    # convention every other task in this file follows.
    stop_decp = DummyOperator(task_id='stop_decp')

# Wire the DECP pipeline: fan out from schema creation to both COPY tasks,
# run each branch's upsert + quality check, then join on the terminal marker.
create_schema >> copy_titulaires_from_s3
create_schema >> copy_marches_from_s3
copy_marches_from_s3 >> upsert_marches >> q_check_marches
copy_titulaires_from_s3 >> upsert_titulaires >> q_check_titulaires
q_check_marches >> stop_decp
q_check_titulaires >> stop_decp
Example #4
0
    'retries': 0,  # fail fast: no automatic retries
    'redshift_conn_id': 'aa_redshift',  # presumably the Airflow connection id used by the Redshift operators — confirm
    'autocommit': True,
    'execution_timeout': timedelta(seconds=300),  # kill any task running longer than 5 minutes
    'working_dir': os.path.dirname(os.path.abspath(__file__))  # directory containing this DAG file
}

with DAG('siren_dwh_transformations',
         default_args=default_args,
         description='Refresh siren data in the Data Warehouse layer (DWH)',
         schedule_interval=None,
         tags=['dend', 'siren', 'dwh']) as dag:
    # Render the sibling Readme as the DAG's in-UI documentation.
    # Use a context manager so the file handle is closed deterministically
    # (the original open(...).read() leaked the handle).
    _docs_md_fp = os.path.join(default_args['working_dir'], 'Readme.md')
    with open(_docs_md_fp, 'r') as _docs_md:
        dag.doc_md = _docs_md.read()

    # No-op boundary marker at the start of the refresh pipeline.
    start_refresh = DummyOperator(task_id='start_refresh')

    # DDL: (re)create the siren DWH schema objects, then refresh the content.
    create_dwh = RedshiftOperator(task_id='create_siren_dwh',
                                  sql='schema_siren_dwh.sql')
    refresh_dwh = RedshiftOperator(task_id='refresh_siren_dwh',
                                   sql='refresh_siren.sql')

    # Post-refresh sanity check on dwh.siren_attributes, keyed on siren.
    q_check = RedshiftQualityCheck(task_id='quality_check',
                                   schema="dwh",
                                   table="siren_attributes",
                                   pkey="siren")

    # No-op boundary marker at the end of the refresh pipeline.
    stop_refresh = DummyOperator(task_id='stop_refresh')

    # Linear pipeline: DDL -> refresh -> quality check.
    start_refresh >> create_dwh >> refresh_dwh >> q_check >> stop_refresh