Python PostgresHook.cursorの例、airflow.hooks.postgres_hook.PostgresHook.cursor Pythonの例

コード例 #1

0

ファイルを表示

ファイル: init.py プロジェクト: mvsousa/mac499

def move_clean_to_conform(**kwargs):
    ti = kwargs['ti']
    file = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                        key='currently_processing')
    lote_id = ti.xcom_pull(task_ids='generate_unique_lote_id', key='lote_id')
    source = ti.xcom_pull(task_ids='move_csv_to_stagging_area', key='source')
    file_size = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                             key='file_size')
    table = ti.xcom_pull(task_ids='set_base_table', key='table')
    query = """SELECT * FROM {} WHERE loteId = '{}';""".format(table, lote_id)

    src_conn = PostgresHook(postgres_conn_id='etl_stage_clean').get_conn()
    dest_conn = PostgresHook(postgres_conn_id='etl_stage_conform').get_conn()

    src_cursor = src_conn.cursor()
    dest_cursor = dest_conn.cursor()

    src_cursor.execute(query)
    count = 0
    set_lote_id_begin(dest_cursor, lote_id, source, file_size)
    while True:
        records = src_cursor.fetchmany(size=1)
        if not records:
            break
        query_insert = """INSERT INTO {} VALUES %s""".format(table)
        execute_values(dest_cursor, query_insert, records)
        dest_conn.commit()
        count += 1
    set_lote_id_end(dest_cursor, lote_id, count, source, file_size)
    dest_conn.commit()
    src_cursor.close()
    dest_cursor.close()
    src_conn.close()
    dest_conn.close()
    return True

コード例 #2

0

ファイルを表示

def etl(ds, **kwargs):
    execution_date = kwargs['execution_date'].strftime('%Y-%m-%d')
    query = """
SELECT *
FROM users
WHERE created_at::date = date '{}'
    """.format(execution_date)

    src_conn = PostgresHook(postgres_conn_id='source',
                            schema='source_schema').get_conn()
    dest_conn = PostgresHook(postgres_conn_id='dest',
                             schema='dest_schema').get_conn()

    # notice this time we are naming the cursor for the origin table
    # that's going to force the library to create a server cursor
    src_cursor = src_conn.cursor("serverCursor")
    src_cursor.execute(query)
    dest_cursor = dest_conn.cursor()

    # now we need to iterate over the cursor to get the records in batches
    while True:
        records = src_cursor.fetchmany(size=2000)
        if not records:
            break
        execute_values(dest_cursor, "INSERT INTO users VALUES %s", records)
        dest_conn.commit()

    src_cursor.close()
    dest_cursor.close()
    src_conn.close()
    dest_conn.close()

コード例 #3

0

ファイルを表示

ファイル: backfill_uuids_for_onboarding.py プロジェクト: AgentIQ/aiq-airflow

def get_analytics_stats_conn_cursors():
    analytics_conn = PostgresHook(postgres_conn_id='ANALYTICS_DB').get_conn()
    stats_conn = PostgresHook(postgres_conn_id='STATS_DB').get_conn()

    analytics_server_cursor = analytics_conn.cursor("analytics_server_cursor")
    # providing an argument makes this a server cursor, which wouldn't hold
    # all records in the memory
    # https://www.psycopg.org/docs/usage.html#server-side-cursors
    stats_client_cursor = stats_conn.cursor()
    return (analytics_conn, analytics_server_cursor, stats_conn, stats_client_cursor)

コード例 #4

0

ファイルを表示

ファイル: extract_and_load.py プロジェクト: alexzlue/RB-Airflow

def bq_to_gcs(**kwargs):
    date_stamp = kwargs['ts']

    # get the last current date from Postgres
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()

    cursor.execute('SELECT MAX(last_update_date) FROM airflow.austin_service_reports;')
    
    recent_ds = cursor.fetchone()[0]
    if recent_ds is not None:
        recent_ds+=timedelta(seconds=1)
        last = recent_ds
    else:
        last = kwargs['start_date']-timedelta(days=1)
    
    cursor.close()
    conn.close()

    # open connection to BigQuery
    hook = BigQueryHook(
        bigquery_conn_id='my_gcp_connection',
        use_legacy_sql=False
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    with open(SQL_PATH + 'query_bq_dataset.sql', 'r') as f:
        query = f.read()
    query = query.format(last, date_stamp)

    cursor.execute(query)

    # write to gcs bucket
    # Each returned row of the result gives:
    # result = [unique_key, complaint_type, complaint_description, owning_department, source,
    #           status, created_date, last_update_date, close_date, city]
    with BUCKET.open('bq_bucket/bq_dataset.txt', 'w') as f:
        while True:
            result = cursor.fetchone()
            if result is None:
                break
            
            if result[8] is None:
                result[8] = ''
            else:
                result[8] = datetime.utcfromtimestamp(result[8])
            if result[9] is None:
                result[9] = ''
            result[7] = datetime.utcfromtimestamp(result[7])
            result[6] = datetime.utcfromtimestamp(result[6])
            f.write('|'.join([str(val) for val in result]) + '\n')

    cursor.close()
    conn.close()

コード例 #5

0

ファイルを表示

ファイル: extract_and_load.py プロジェクト: clairetaylor352/RB-Airflow

def bq_to_gcs(**kwargs):
    ds = kwargs['ds']
    previous = datetime.strptime(kwargs['prev_ds'], '%Y-%m-%d').date()

    # get the last current date from Postgres
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()

    cursor.execute('SELECT MAX(CAST(created_date AS DATE)) FROM airflow.austin_service_reports;')
    
    recent_ds = cursor.fetchone()[0]
    if recent_ds is not None:
        recent_ds+=timedelta(days=1)
        if recent_ds < previous:
            prev_ds = datetime.strftime(recent_ds, '%Y-%m-%d')
        else:
            prev_ds = kwargs['prev_ds']
    else:
        prev_ds = datetime.strftime(kwargs['start_date']-timedelta(days=1), '%Y-%m-%d')
    
    cursor.close()
    conn.close()

    # open connection to BigQuery
    hook = BigQueryHook(
        bigquery_conn_id='my_gcp_connection',
        use_legacy_sql=False
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    with open(SQL_PATH + 'query_bq_dataset.sql', 'r') as f:
        query = f.read()
        query = query.format(prev_ds,ds)

    cursor.execute(query)

    # write to gcs bucket
    with BUCKET.open('bq_bucket/bq_dataset.csv', 'w') as f:
        while True:
            result = cursor.fetchone()
            if result is None:
                break
            
            if result[6] is None:
                result[6]= ''
            else:
                result[6] = datetime.utcfromtimestamp(result[6])
            result[5] = datetime.utcfromtimestamp(result[5])
            f.write(','.join([str(val) for val in result]) + '\n')

    cursor.close()
    conn.close()

コード例 #6

0

ファイルを表示

def load_transfer(**kwargs):
    # Collect days using function that accesses postgres
    days_open = average_days_open()

    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()

    # Find most recent pull
    cursor.execute('SELECT MAX(last_update_date) FROM airflow.transfer_table')
    start_date = cursor.fetchone()[0]
    if not start_date:
        cursor.execute(
            'SELECT MIN(last_update_date) FROM airflow.austin_service_reports')
        start_date = cursor.fetchone()[0] + timedelta(seconds=1)

    end_date = datetime.strptime(kwargs['ds'],
                                 '%Y-%m-%d').date() - timedelta(days=days_open)

    # Remove previous data in transfer table
    cursor.execute('DELETE FROM airflow.transfer_table')

    with open(SQL_PATH + 'insert_into_transfer_table.sql') as f:
        insert = f.read().format(start_date, end_date)

    cursor.execute(insert)
    conn.commit()

    cursor.close()
    conn.close()

コード例 #7

0

ファイルを表示

ファイル: init.py プロジェクト: mvsousa/mac499

def set_base_table(**kwargs):
    import random
    ti = kwargs['ti']
    file = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                        key='currently_processing')

    header = tuple(pd.read_csv(file, sep=';', nrows=1).columns)
    header = header + ('loteid', )
    strfy_header = '(' + ','.join(str(s) for s in header) + ')'
    strfy_header = strfy_header.lower()
    query_tables = """ SELECT table_name FROM information_schema.tables WHERE table_schema='public';"""
    dest_conn = PostgresHook(postgres_conn_id='etl_stage_extract').get_conn()
    dest_cursor = dest_conn.cursor()
    dest_cursor.execute(query_tables)
    tables = dest_cursor.fetchmany(size=100)
    for table_tuple in tables:
        table_name = table_tuple[0]
        query_header = """ select column_name from information_schema.columns where table_name = '{}';""".format(
            table_name)
        dest_cursor.execute(query_header)
        headers = dest_cursor.fetchmany(size=1000)
        hd = '(' + ','.join(str(s[0]).lower() for s in headers) + ')'
        if hd == strfy_header:
            ti.xcom_push(key='table', value=table_name)
            break
    return tables

コード例 #8

0

ファイルを表示

ファイル: db_tables.py プロジェクト: akumuthan-dev/data-flow

def query_database(query: str,
                   target_db: str,
                   table_name,
                   batch_size: int = 100000,
                   **kwargs):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    total_records = 0
    next_batch = 1
    connection = PostgresHook(postgres_conn_id=target_db).get_conn()

    try:
        # create connection with named cursor to fetch data in batches
        cursor = connection.cursor(name='query_database')
        cursor.execute(query)

        rows = cursor.fetchmany(batch_size)
        fields = [d[0] for d in cursor.description]
        while rows:
            records = []
            for row in rows:
                record = {fields[col]: row[col] for col in range(len(row))}
                records.append(record)
            s3.write_key(f'{next_batch:010}.json', records)
            next_batch += 1
            total_records += len(records)
            rows = cursor.fetchmany(batch_size)
    finally:
        if connection:
            cursor.close()
            connection.close()

コード例 #9

0

ファイルを表示

ファイル: init.py プロジェクト: mvsousa/mac499

def load_csv_extract_db(**kwargs):
    ti = kwargs['ti']
    file = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                        key='currently_processing')
    lote_id = ti.xcom_pull(task_ids='generate_unique_lote_id', key='lote_id')
    source = ti.xcom_pull(task_ids='move_csv_to_stagging_area', key='source')
    file_size = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                             key='file_size')
    table = ti.xcom_pull(task_ids='set_base_table', key='table')

    dest_conn = PostgresHook(postgres_conn_id='etl_stage_extract').get_conn()
    dest_cursor = dest_conn.cursor()

    header = tuple(pd.read_csv(file, sep=';', nrows=1).columns)
    header = header + ('loteid', )
    strfy_header = '(' + ','.join(str(s) for s in header) + ')'
    count = 0
    set_lote_id_begin(dest_cursor, lote_id, source, file_size)
    for df_row in pd.read_csv(file, sep=';', chunksize=1):
        row = [tuple(x) for x in df_row.values]
        row = row[0]
        row = row + (lote_id, )
        strfy_row = '(' + ','.join('\'' + str(s) + '\'' for s in row) + ')'

        query = """INSERT INTO {} {} VALUES {};""".format(
            table, strfy_header, strfy_row)
        dest_cursor.execute(query)
        count += 1
        if count >= 100:
            break
    set_lote_id_end(dest_cursor, lote_id, count, source, file_size)
    dest_conn.commit()
    dest_cursor.close()
    dest_conn.close()
    return True

コード例 #10

0

ファイルを表示

ファイル: get_data.py プロジェクト: CurtLH/airflow_dags

def get_team_data():

    conn = PostgresHook(postgres_conn_id="postgres_workboard").get_conn()
    conn.autocommit = True
    cur = conn.cursor()

    params = {"token": Variable.get("workboard_token")}
    base_url = "https://www.myworkboard.com/wb/apis"

    r = requests.get(base_url + "/team?include=org_teams", params=params)
    if r.status_code != 200:
        raise ValueError("Request not successful. Status code {}".format(
            r.status_code))
    else:
        teams = r.json()["data"]["team"]
        logging.info("There are {} teams".format(len(teams)))
        for t in teams:
            try:
                cur.execute(
                    """INSERT INTO teams_raw (team_id, team_data)
                        VALUES (%s, %s) ON CONFLICT (team_id)
                        DO UPDATE SET team_data = EXCLUDED.team_data
                    """,
                    [t["team_id"], json.dumps(t)],
                )
                logging.info("Team id #{} inserted into the table".format(
                    t["team_id"]))
            except BaseException:
                logging.warning("Record skipped")
                pass

    cur.close()
    conn.close()

コード例 #11

0

ファイルを表示

    def execute(self, context):
        # self.log.info('LoadDimensionOperator not implemented yet')
        try:
            conn = PostgresHook(self.redshift_conn_id).get_conn()

            logging.info(f"Ensuring Valid Records::{self.table_name}")

            cursor = conn.cursor()

            cursor.execute(f'select count(*) from {self.table_name}')

            total_count = cursor.fetchone()

            logging.info(f"Total Count::{total_count}")

            if total_count == 0:
                raise Exception(
                    f"There are no records for table::{self.table_name}")

            conn.commit()

            cursor.close()

            conn.close()

        except Exception as e:
            raise e

コード例 #12

0

ファイルを表示

    def start_etl_totals(self, table_name):
        """Gets the data from postgreSQL(countries2020_2021),
            transform data(world sum vaccined people), load to table(totals2020_2021)

        Parameters:
            table_name(string): The name of the table that will be used for loading data
        Returns:
            bool: True if everything successful
        """
        # 1. extract/SELECT  data from postgreSQL
        path = __file__.split('.')[-2] + '.xml'
        create_select_query = ET.parse(path).getroot().find(
            'select_query').text.format(table_name=table_name)
        df = pd.read_sql_query(
            create_select_query,
            con=PostgresHook(postgres_conn_id='covid_id').get_conn())

        # 2. transform data with pandas for loading to postgreSQL
        df = df.drop('country', 1)
        dfs = df.sum(axis=0)

        # 3. load to posgreSQL
        connection = PostgresHook(postgres_conn_id='covid_id').get_conn()
        cursor = connection.cursor()
        cursor.execute(
            """INSERT INTO totals2020_2021 
                    (fields, total_2020, total_2021) 
                    VALUES ('vaccinated', %s, %s)""",
            (int(dfs[0]), int(dfs[1])))
        connection.commit()
        return True

コード例 #13

0

ファイルを表示

ファイル: dags.py プロジェクト: Kamparia/airflow-datapipeline-samples

def filter_questions() -> str:
    """ 
    Read all questions from the database and filter them.
    Returns a JSON string that looks like:
    
    [
        {
        "title": "Question Title",
        "is_answered": false,
        "link": "https://stackoverflow.com/questions/0000001/...",
        "tags": ["tag_a","tag_b"],
        "question_id": 0000001
        },
    ]
    
    """
    columns = ("title", "is_answered", "link", "tags", "question_id")
    filtering_query = """
        SELECT title, is_answered, link, tags, question_id
        FROM public.questions
        WHERE score >= 1 AND owner_reputation > 1000;
        """
    pg_hook = PostgresHook(postgres_conn_id="postgres_connection").get_conn()

    with pg_hook.cursor("serverCursor") as pg_cursor:
        pg_cursor.execute(filtering_query)
        rows = pg_cursor.fetchall()
        results = [dict(zip(columns, row)) for row in rows]
        return json.dumps(results, indent=2)

コード例 #14

0

ファイルを表示

def etl():

    conn = PostgresHook(postgres_conn_id="postgres_apple").get_conn()
    src_cur = conn.cursor()
    dest_cur = conn.cursor()

    query = """
    SELECT *
    FROM raw
    """

    src_cur.execute(query)

    while True:
        records = src_cur.fetchone()
        if not records:
            break

        row = []
        url = records[2]
        soup = bs(records[3], "html.parser")
        row.append(
            [
                url,
                apple.get_id_num(url),
                apple.get_price(soup),
                apple.get_date(soup),
                apple.get_screen(soup),
                apple.get_color(url),
            ]
        )

        try:
            dest_cur.execute(
                """INSERT INTO ads (url, id_num, price, date, screen, color)
                                VALUES (%s, %s, %s, %s, %s, %s)""",
                [i for i in row[0]],
            )
            conn.commit()
            logging.info("New record inserted")
        except BaseException:
            pass
            logging.info("Duplicate records")

    src_cur.close()
    dest_cur.close()
    conn.close()

コード例 #15

0

ファイルを表示

ファイル: xcom_demo.py プロジェクト: jthomas123/examples

def run_and_push(**kwargs):
    conn = PostgresHook('postgres_default').get_conn()
    PostgresHook.set_autocommit(PostgresHook(), conn, True)
    cur = conn.cursor()
    cur.execute(kwargs['templates_dict']['script'])
    result = cur.fetchall()
    row = result[0]
    kwargs['ti'].xcom_push(key=row[0], value=row[1])

コード例 #16

0

ファイルを表示

ファイル: second_domashka_dag.py プロジェクト: katerinekhh/airflow_tutorial

def export_postgres_data_to_csv(filepath, query, **kwargs):
    connection = PostgresHook(postgres_conn_id='tutorial_general').get_conn()
    cursor = connection.cursor()
    cursor.execute(query)
    data = cursor.fetchall()
    with open(filepath, 'w') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter=',')
        csvwriter.writerow(header[0] for header in cursor.description)
        for row in data:
            csvwriter.writerow(row)

コード例 #17

0

ファイルを表示

ファイル: example.py プロジェクト: onurtashan/airflow-on-docker-example

def csvToPostgres():
    #Open Postgres Connection
    pg_hook = PostgresHook(postgres_conn_id='airflow_db')
    get_postgres_conn = PostgresHook(postgres_conn_id='airflow_db').get_conn()
    curr = get_postgres_conn.cursor("cursor")
    # CSV loading to table.
    with open('/usr/local/airflow/dags/example.csv', 'r') as f:
        next(f)
        curr.copy_from(f, 'example_table', sep=',')
        get_postgres_conn.commit()

コード例 #18

0

ファイルを表示

def average_days_open():
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()
    cursor.execute(
        'SELECT AVG(average_num_days_open) FROM airflow.metrics_by_day')
    days_open = cursor.fetchone()

    cursor.close()
    conn.close()
    return int(days_open[0])

コード例 #19

0

ファイルを表示

ファイル: postgresxcom_operator.py プロジェクト: clearclaw/airflow_jcl

 def execute (self, context): # pylint: disable=unused-argument
   LOG.info ("Executing: %s", str (self.sql))
   db = PostgresHook (postgres_conn_id = self.postgres_conn_id).get_conn ()
   rc = []
   with db.cursor (cursor_factory = psycopg2.extras.DictCursor) as cur:
     LOG.info ("Executing SQL: %s", self.sql)
     cur.execute (self.sql)
     for rec in cur:
       rc.append (dict (rec))
   return rc

コード例 #20

0

ファイルを表示

def insert_from_copy_table_if_needed(target_db,
                                     table_name=None,
                                     task_instance=None,
                                     run_fetch_task_id=None,
                                     **kwargs):
    """
    Inserts from copy table to target table when all tasks succeed, if target table
    already exists. The rational behind is not doing any modification on target table
    before we make sure fetching from source and insertation is successful.
    """
    insert_from_copy_sql = """
        DELETE FROM {table_name};
        INSERT INTO {table_name}
        SELECT * FROM {copy_table_name};
        DROP TABLE {copy_table_name};
    """
    table_exists = task_instance.xcom_pull(
        task_ids='check-if-table-exists')[0][0]
    fetcher_state = task_instance.xcom_pull(key='state',
                                            task_ids=run_fetch_task_id)
    inserter_state = True
    for index in range(constants.INGEST_TASK_CONCURRENCY):
        inserter_state = (inserter_state and task_instance.xcom_pull(
            key='state',
            task_ids=f'execute-insert-into-{index}',
        ))
    if ((table_exists and table_exists != 'None') and fetcher_state is True
            and inserter_state is True):
        logging.info(f'Inserting from {table_name}_copy table to {table_name}')
        try:
            target_db_conn = PostgresHook(
                postgres_conn_id=target_db).get_conn()
            target_db_cursor = target_db_conn.cursor()
            target_db_cursor.execute(
                insert_from_copy_sql.format(
                    table_name=sql.Identifier(table_name).as_string(
                        target_db_conn),
                    copy_table_name=sql.Identifier(
                        f'{table_name}_copy').as_string(target_db_conn)))
            target_db_conn.commit()

        # TODO: Gotta Catch'm all
        except Exception as e:
            logging.error(f'Exception: {e}')
            target_db_conn.rollback()
            raise

        finally:
            if target_db_conn:
                target_db_cursor.close()
                target_db_conn.close()

    else:
        logging.info('Target table newly created. No need for copy table')

コード例 #21

0

ファイルを表示

ファイル: data_quality.py プロジェクト: meetparekh09/udacity-data-engineering-projects

 def execute(self, context):
     conn = PostgresHook(self.conn_id).get_conn()
     cur = conn.cursor()
     for query, exp_result in self.queries_with_exp_result.items:
         result = cur.execute(query)
         if (exp_result == result[0]):
             print(f'Tested {query} and Sucessfully completed')
         else:
             raise AirflowRescheduleException
             self.log.info(
                 f'DataQualityOperator not for {query} was not successfull')
     conn.commit()

コード例 #22

0

ファイルを表示

ファイル: init.py プロジェクト: mvsousa/mac499

def move_conform_to_final(**kwargs):
    ti = kwargs['ti']
    file = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                        key='currently_processing')
    lote_id = ti.xcom_pull(task_ids='generate_unique_lote_id', key='lote_id')
    source = ti.xcom_pull(task_ids='move_csv_to_stagging_area', key='source')
    file_size = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                             key='file_size')
    table = ti.xcom_pull(task_ids='set_base_table', key='table')
    query = """SELECT * FROM {} WHERE loteId = '{}';""".format(table, lote_id)

    src_conn = PostgresHook(postgres_conn_id='etl_stage_conform').get_conn()
    dest_conn = PostgresHook(postgres_conn_id='etl_stage_final').get_conn()

    src_cursor = src_conn.cursor()
    dest_cursor = dest_conn.cursor()

    query_header = """ select column_name from information_schema.columns where table_name = '{}';""".format(
        table)
    src_cursor.execute(query_header)
    headers_result = src_cursor.fetchmany(size=1000)
    headers = [str(s[0]).lower() for s in headers_result]

    src_cursor.execute(query)
    count = 0
    set_lote_id_begin(dest_cursor, lote_id, source, file_size)
    while True:
        records = src_cursor.fetchmany(size=1)
        if not records:
            break
        insert_final_table(dest_cursor, table, records, headers)
        count += 1
    set_lote_id_end(dest_cursor, lote_id, count, source, file_size)
    dest_conn.commit()
    src_cursor.close()
    dest_cursor.close()
    src_conn.close()
    dest_conn.close()
    return True

コード例 #23

0

ファイルを表示

def count_users():
    conn = PostgresHook(postgres_conn_id='pgConn_pg').get_conn()
    cur = conn.cursor()
    cur.execute(
        'delete from dar_group.fm_count_users where date_ = current_date')

    cur.execute('insert into dar_group.fm_count_users \
                select brand,id_space,current_date as date_,count(distinct identifier_id) as users \
                from ( select upper(brand) as brand,id_space,state,identifier_id,created::date \
                from dar_group.dar_account where lower(state) like \'%active%\' and lower(brand) = \'forte_market\' \
                and lower(id_space) like \'%mobile%\') as a group by brand, id_space'
                )
    conn.commit()

コード例 #24

0

ファイルを表示

def create_target_table(
    target_db,
    table_name=None,
    field_mapping=None,
    task_instance=None,
    **kwargs,
):
    """Create target database table or delete all from it.

    If target table exists, create a copy table as a back up to be used in case of failure.
    If target table doesn't exits, create one.
    """
    create_table_sql = """
        CREATE TABLE {{ table_name }} (
        {% for _, tt_field_name, tt_field_constraints in field_mapping %}
            {{ tt_field_name }} {{ tt_field_constraints }}{{ "," if not loop.last }}
        {% endfor %}
        );
    """

    table_exists = task_instance.xcom_pull(
        task_ids='check-if-table-exists')[0][0]
    try:
        target_db_conn = PostgresHook(postgres_conn_id=target_db).get_conn()
        target_db_cursor = target_db_conn.cursor()

        # If table already exists in the target database, create a copy table to be used
        # for rollback in case of failiure or create one.
        # Until there will be possibility for incremental load
        if table_exists and table_exists != 'None':
            table_name = f'{table_name}_copy'

        logging.info(f'Creating a target table {table_name}')
        rendered_create_table_sql = Template(create_table_sql).render(
            table_name=sql.Identifier(table_name).as_string(target_db_conn),
            field_mapping=field_mapping,
        )
        target_db_cursor.execute(rendered_create_table_sql)
        target_db_conn.commit()

    # TODO: Gotta Catch'm all
    except Exception as e:
        logging.error(f'Exception: {e}')
        target_db_conn.rollback()
        raise

    finally:
        if target_db_conn:
            target_db_cursor.close()
            target_db_conn.close()

コード例 #25

0

ファイルを表示

 def execute(self, context):
     copy_query = f"""
     copy {self.table}
     from {self.s3_src_path}
     iam_role {self.iam_role}
     format as JSON '{self.json_type}'
     region '{self.region}'
     {self.extra_params};
     """
     conn = PostgresHook(self.conn_id).get_conn()
     cur = conn.cursor()
     cur.execute(copy_query)
     conn.commit()
     self.log.info('StageToRedshiftOperator Ran Sucessfully')

コード例 #26

0

ファイルを表示

    def start_etl(self, table_name):
        """Download the data from S3 bucket,  transform data, load to table(countries2020_2021)

        Parameters:
            table_name(string): The name of the table that will be used for loading data
        Returns:
            bool: True if everything successful
        """

        # 1. extract  data from s3 to dataframe
        client = boto3.client('s3')
        path = 's3://covid-exercise/covid-data.csv'
        df = pd.read_csv(path)

        # 2. transform data(dataframe) with pandas for loading to postgreSQL
        q2021 = """SELECT location, SUM (new_vaccinations) FROM df
                WHERE new_vaccinations IS NOT NULL AND date LIKE '%2021%' GROUP BY location"""
        q2020 = """SELECT location, SUM (new_vaccinations) FROM df WHERE
                new_vaccinations IS NOT NULL AND date LIKE '%2020%' GROUP BY location"""
        df2021 = ps.sqldf(q2021, locals())
        df2020 = ps.sqldf(q2020, locals())
        df2021 = df2021.rename(columns={
            'location': 'Country',
            'SUM (new_vaccinations)': 'Total_2021'
        },
                               inplace=False)
        df2020 = df2020.rename(columns={
            'location': 'Country',
            'SUM (new_vaccinations)': 'Total_2020'
        },
                               inplace=False)
        dict1 = df2020.set_index('Country')['Total_2020'].to_dict()
        df2021['Total_2020'] = df2021['Country'].map(dict1)
        df2021['Total_2021'] = df2021['Total_2021'].fillna(0).astype(int)
        df2021['Total_2020'] = df2021['Total_2020'].fillna(0).astype(int)
        df2021 = df2021.reindex(
            columns=['Country', 'Total_2020', 'Total_2021'])

        # 3. load to posgreSQL
        connection = PostgresHook(postgres_conn_id='covid_id').get_conn()
        cursor = connection.cursor()

        for index, row in df2021.iterrows():
            cursor.execute(
                """INSERT INTO countries2020_2021 
                    (Country, Total_2020, Total_2021) 
                    VALUES (%s, %s, %s)""",
                (row["Country"], row["Total_2020"], row["Total_2021"]))
        connection.commit()
        return True

コード例 #27

0

ファイルを表示

 def execute(self, context):
     try:
         logging.info("Dimension Operator::Creating Dimension Tables")
         conn = PostgresHook(self.redshift_conn_id).get_conn()
         logging.info(
             f"Dimension Operator::Executing Dimension Query::{self.sql_statement}"
         )
         cursor = conn.cursor()
         cursor.execute(self.sql_statement)
         conn.commit()
         cursor.close()
         conn.close()
     except Exception as e:
         raise e

コード例 #28

0

ファイルを表示

ファイル: af_fm_express_check.py プロジェクト: 1minimalist/dags

def read_data(**kwargs):
    cluster = cl(['10.103.5.51', '10.103.5.52', '10.103.5.53'])
    session = cluster.connect('darbiz')

    conn = PostgresHook(postgres_conn_id='pgConn_pg').get_conn()
    cur = conn.cursor()

    rows = session.execute("SELECT * from darbiz.forte_express_loan_requests where created_on>='2020-09-20' allow filtering")
    cur.execute ("select distinct owner profile_id, uid order_id, pay_title from dar_group.bazar_orders1 where created_on>=now()-interval '24' hour")
    res = cur.fetchall()
    for user_row in rows:
        d = json.loads(user_row.loan_request)
        id0 = user_row.profile_id
        id1 = user_row.order_id
        id2 = user_row.created_on
        pp = d['person_info']['financing_info']['period'] if 'period' in d['person_info']['financing_info'] else None
        lh = datetime.now() - timedelta(hours = 12)
        if id2>=lh:
            for a,b,c in res:
                ll=c.split()
                if id1==b:
                    if pp!=int(ll[2]):
                        email = EmailOperator(\
                            task_id='send_email',\
                            to=['*****@*****.**','*****@*****.**'],\
                            subject='Ошибка в Fortemarket',\
                            html_content='Error in order_id: {} created at: {}, profile_id: {}, months in request: {}, months in orders: {}\n' \
                                .format(a, id2, b, pp, ll[2])\
                            )
                        email.execute(context=kwargs)

                        t3 = SlackWebhookOperator(
                            task_id='send_slack_notification',
                            http_conn_id='slack_connection',
                            message='Error in order_id: {} created at: {}, profile_id: {}, months in request: {}, months in orders: {}\n' \
                                .format(a, id2, b, pp, ll[2]),\
                            # files = '/tmp/BPM_report.xlsx',
                            channel='#reports',\
                            dag=dag
                        )
                        t3.execute(context=kwargs)
                else:
                    continue
        else:
            continue
        # lt = d['person_info']['financing_info']['loan_type'] if 'loan_type' in d['person_info']['financing_info'] else None

    cur.close()
    conn.close()

コード例 #29

0

ファイルを表示

ファイル: crafttask.py プロジェクト: felexkemboi/Airflow

def postgres(ds, **kwargs):

    #this should be a query to insert the data to the postgress
    query = """ SELECT *FROM marks """

    conn = PostgresHook(postgres_conn_id='source', schema='marks').get_conn()

    # that's going to force the library to create a server cursor
    cursor = conn.cursor("serverCursor")
    cursor.execute(query)

    # now we need to close our connection
    cursor.close()

    return "Saved to the database"

コード例 #30

0

ファイルを表示

ファイル: elasticsearch_operator.py プロジェクト: GabrielSGoncalves/AirFlow_HCV

    def execute(self, context):
        postgres = PostgresHook(
            postgres_conn_id=self.postgres_conn_id).get_conn()
        es = ElasticsearchHook(
            elasticsearch_conn_id=self.elasticsearch_conn_id)

        self.log.info("Extracting data from PostgreSQL: %s", self.sql)

        with postgres.cursor(name="serverCursor",
                             cursor_factory=RealDictCursor) as postgres_cursor:
            postgres_cursor.itersize = 2000
            postgres_cursor.execute(self.sql)
            for row in postgres_cursor:
                doc = json.dumps(row, indent=2)
                es.add_doc(index=self.index, doc_type='external', doc=doc)
        postgres.close()