Example 1
# Imports needed by this snippet; Airflow 2.x provider package paths are assumed here
import csv
from io import StringIO

import pandas as pd
from pandas.util import hash_pandas_object

from airflow.exceptions import AirflowSkipException
from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.providers.vertica.hooks.vertica import VerticaHook


def workflow(src_conn_id, src_schema, dt, target_conn_id, target_table):
    # TODO Split code into functions
    etl_conn = PostgresHook(postgres_conn_id='etl').get_conn()

    # Session is assumed to be a custom ETL-session helper (not a standard Airflow class);
    # dag refers to the module-level DAG object defined alongside this function
    with Session(etl_conn, f'{dag.dag_id}__{src_schema}') as session:
        # Load data
        source_conn = MsSqlHook(mssql_conn_id=src_conn_id,
                                schema=src_schema).get_conn()

        query = f"""
            SELECT 
                id, start_time, end_time, type, data
            FROM dbo.Orders
            WHERE
                CONVERT(DATE, start_time) = '{dt}'
            """

        df = pd.read_sql_query(query, source_conn)

        # Skip if no rows
        if df.empty:
            raise AirflowSkipException('No rows to load')

        # Add service fields
        df['etl_source'] = src_schema
        df['etl_id'] = session.id
        df['hash_id'] = hash_pandas_object(df[['etl_source', 'id']])

        # Export data to CSV buffer
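        # (the to_csv options mirror the COPY options below: '|' delimiter, 'NUL' as the
        # NULL marker, backslash as the escape character, no header row)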
        buffer = StringIO()
        df.to_csv(buffer,
                  index=False,
                  sep='|',
                  na_rep='NUL',
                  quoting=csv.QUOTE_MINIMAL,
                  header=False,
                  float_format='%.8f',
                  doublequote=False,
                  escapechar='\\')
        buffer.seek(0)

        # Push CSV
        target_conn = VerticaHook(vertica_conn_id=target_conn_id).get_conn()

        copy_stmt = f"""
            COPY {target_table} ({', '.join(df.columns)})
            FROM STDIN 
            DELIMITER '|' 
            ENCLOSED '"' 
            ABORT ON ERROR 
            NULL 'NUL'
            """

        cursor = target_conn.cursor()
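        # vertica-python's cursor.copy() streams the CSV buffer through COPY ... FROM STDIN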
        cursor.copy(copy_stmt, buffer)

        # Record the load outcome on the ETL session
        session.loaded_rows = cursor.rowcount
        session.successful = True
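
For context, below is a minimal sketch of how workflow might be wired into a DAG via a PythonOperator. The DAG id, schedule, connection ids and target table are illustrative placeholders, not taken from the original code; the module-level dag object is what the function's dag.dag_id reference resolves to.

# Hypothetical DAG wiring for workflow(); every id and name below is a placeholder
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id='orders_to_dwh',
         start_date=datetime(2024, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:

    load_orders = PythonOperator(
        task_id='load_orders__shop_a',
        python_callable=workflow,
        op_kwargs={
            'src_conn_id': 'mssql_shop_a',    # placeholder connection id
            'src_schema': 'shop_a',           # placeholder source schema
            'dt': '{{ ds }}',                 # execution date, templated by Airflow
            'target_conn_id': 'vertica_dwh',  # placeholder connection id
            'target_table': 'stage.orders',   # placeholder target table
        },
    )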