Example 1
def load_local_data_to_intermediate_table(postgres_conn_id,
                                          tsv_file_name,
                                          identifier,
                                          max_rows_to_skip=10):
    load_table = _get_load_table_name(identifier)
    logger.info(f'Loading {tsv_file_name} into {load_table}')

    postgres = PostgresHook(postgres_conn_id=postgres_conn_id)
    load_successful = False

    while not load_successful and max_rows_to_skip >= 0:
        try:
            postgres.bulk_load(load_table, tsv_file_name)
            load_successful = True

        except InvalidTextRepresentation as e:
            line_number = _get_malformed_row_in_file(str(e))
            _delete_malformed_row_in_file(tsv_file_name, line_number)

        finally:
            max_rows_to_skip = max_rows_to_skip - 1

    if not load_successful:
        raise InvalidTextRepresentation(
            'Exceeded the maximum number of allowed defective rows')

    _clean_intermediate_table_data(postgres, load_table)
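
The two row-repair helpers above are project-specific and not shown in the source; a hypothetical sketch of what they could look like, assuming the psycopg2 InvalidTextRepresentation message carries a "COPY <table>, line <N>" context for the rejected row:

import re


def _get_malformed_row_in_file(error_message):
    # Pull the 1-based line number out of the COPY error context
    # (an assumption about the psycopg2 message format).
    match = re.search(r'line (\d+)', error_message)
    if match is None:
        raise ValueError(f'No line number found in error: {error_message}')
    return int(match.group(1))


def _delete_malformed_row_in_file(tsv_file_name, line_number):
    # Rewrite the TSV without the offending row so the load can be retried.
    with open(tsv_file_name) as tsv:
        rows = tsv.readlines()
    with open(tsv_file_name, 'w') as tsv:
        for current_line, row in enumerate(rows, start=1):
            if current_line != line_number:
                tsv.write(row)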
Example 2
    def copy_rows(self, file_name, table_name, postgres_conn_id):

        print("### fetching records from CSV file ###")
        print("### inserting records into Postgres table ###")
        postgresserver = PostgresHook(postgres_conn_id)
        postgresserver.bulk_load(table_name, file_name)
        return True
Example 3
    def gcs_to_psql_import(**kwargs):
        fd, tmp_filename = tempfile.mkstemp(text=True)

        # download file locally
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=kwargs['gcp_conn_id'])
        gcs_hook.download(bucket=kwargs['bucket'],
                          object=kwargs['object'],
                          filename=tmp_filename)
        del gcs_hook

        # load the file into postgres
        pg_hook = PostgresHook(postgres_conn_id=kwargs['postgres_conn_id'],
                               schema=kwargs['database'])
        pg_hook.bulk_load(
            '{schema}.{table}'.format(schema=kwargs['schema'],
                                      table=kwargs['table']), tmp_filename)

        # output errors
        for output in pg_hook.conn.notices:
            print(output)

        # remove temp file
        os.close(fd)
        os.unlink(tmp_filename)
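
One possible way, not shown in the source, to wire the callable above into a DAG is to pass the connection IDs and object coordinates through op_kwargs so they arrive in kwargs; every value below is a placeholder, and the import path differs between Airflow 1.x and 2.x:

from airflow.operators.python_operator import PythonOperator  # Airflow 2.x: airflow.operators.python

load_gcs_file = PythonOperator(
    task_id='gcs_to_psql_import',
    python_callable=gcs_to_psql_import,
    op_kwargs={
        'gcp_conn_id': 'google_cloud_default',   # placeholder connection ids
        'postgres_conn_id': 'postgres_default',
        'bucket': 'my-bucket',                   # placeholder GCS location
        'object': 'exports/data.tsv',
        'database': 'analytics',                 # placeholder target database
        'schema': 'public',
        'table': 'staging_table',
    },
    dag=dag,  # assumes a surrounding DAG object named `dag`
)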
Example 4
def load_spark_csv_to_postgres(spark_csv_dir,
                               postgres_conn_id,
                               postgres_table_name):
    """
    Load Spark CSV to Postgres.
    
    Keyword arguments:
    spark_csv_dir -- directory where CSV written from Spark is located
    postgres_conn_id -- ID of Airflow connection
    postgres_table_name -- name of target table in Postgres database
    """
    
    # Get Spark CSV file path
    spark_csv_file_name = list(filter(
        re.compile("part-.*csv$").match, os.listdir(spark_csv_dir)
    ))[0]
    spark_csv_file_path = f"{spark_csv_dir}/{spark_csv_file_name}"
    
    # Get Postgres hook
    postgres_hook = PostgresHook(postgres_conn_id)
    
    # Clear Postgres table
    print(f"Clearing Postgres table {postgres_table_name}")
    postgres_hook.run(f"truncate {postgres_table_name}")
    
    # Copy data to Postgres table
    print(
        f"""
        Loading Spark CSV {spark_csv_file_path} into Postgres table {postgres_table_name}
        """
    )
    postgres_hook.bulk_load(postgres_table_name, spark_csv_file_path)
Example 5
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        self.log.info(
            f"Loading file s3://{self.s3_bucket}/{self.s3_key} into table {self.table}."
        )
        with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                                 self.s3_key) as f:
            postgres.bulk_load(self.table, f)
        self.log.info(
            f"s3://{self.s3_bucket}/{self.s3_key} loaded into table {self.table} sucesfully."
        )
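
DataHelper.buffer_s3_object_as_file is project-specific and not shown here; a minimal sketch of what it might do, written as a standalone context manager and assuming bulk_load is handed the temporary file's path:

import tempfile
from contextlib import contextmanager


@contextmanager
def buffer_s3_object_as_file(s3_client, bucket, key):
    # Stream the S3 object into a local temporary file and yield its path,
    # since PostgresHook.bulk_load expects a file name rather than a file object.
    with tempfile.NamedTemporaryFile() as tmp:
        s3_client.download_fileobj(bucket, key, tmp)
        tmp.flush()
        yield tmp.name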
Example 6
def import_data_to_intermediate_table(postgres_conn_id, tsv_file_name,
                                      identifier):
    load_table = _get_load_table_name(identifier)
    logger.info(f'Loading {tsv_file_name} into {load_table}')

    postgres = PostgresHook(postgres_conn_id=postgres_conn_id)
    postgres.bulk_load(load_table, tsv_file_name)
    postgres.run(f'DELETE FROM {load_table} WHERE url IS NULL;')
    postgres.run(f'DELETE FROM {load_table} WHERE license IS NULL;')
    postgres.run(
        f'DELETE FROM {load_table} WHERE foreign_landing_url IS NULL;')
    postgres.run(f'DELETE FROM {load_table} WHERE foreign_identifier IS NULL;')
    postgres.run(f'DELETE FROM {load_table} p1'
                 f' USING {load_table} p2'
                 f' WHERE p1.ctid < p2.ctid'
                 f' AND p1.provider = p2.provider'
                 f' AND p1.foreign_identifier = p2.foreign_identifier;')
Example 7
    def test_bulk_load(self):
        hook = PostgresHook()
        input_data = ["foo", "bar", "baz"]

        with hook.get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
                conn.commit()

                with NamedTemporaryFile() as f:
                    f.write("\n".join(input_data).encode("utf-8"))
                    f.flush()
                    hook.bulk_load(self.table, f.name)

                cur.execute("SELECT * FROM {}".format(self.table))
                results = [row[0] for row in cur.fetchall()]

        self.assertEqual(sorted(input_data), sorted(results))
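
The test above relies on a fixture that provides self.table and a reachable Postgres connection; a hedged sketch of that scaffolding, with placeholder names rather than the original fixture:

import unittest

from airflow.hooks.postgres_hook import PostgresHook  # path varies by Airflow version


class TestPostgresHookBulkLoad(unittest.TestCase):
    def setUp(self):
        self.table = 'test_bulk_load_table'  # placeholder scratch table name

    def tearDown(self):
        # Drop the scratch table so repeated runs start from a clean state.
        PostgresHook().run(f'DROP TABLE IF EXISTS {self.table}')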
Example 9
def load_athena_to_postgres(**kwargs):
    s3_hook = S3Hook("aws_default")
    # get folder in s3 bucket
    bucket_prefix = kwargs["p_buckpref"]
    the_folder = s3_hook.list_keys(bucket_name=BUCKET_NAME,
                                   prefix=bucket_prefix)
    select_statement = "SELECT * FROM S3Object s"
    select_result = s3_hook.select_key(
        key=the_folder[0],
        bucket_name=BUCKET_NAME,
        expression=select_statement,
        input_serialization={"CSV": {
            "FileHeaderInfo": "USE"
        }},
        output_serialization={
            "CSV": {
                "RecordDelimiter": "\n",
                "FieldDelimiter": "\t"
            }
        },
    )
    fname = kwargs["p_filename"]
    with tempfile.NamedTemporaryFile(mode="w+b",
                                     delete=False,
                                     suffix=".csv",
                                     prefix=fname,
                                     dir="/tmp") as temp:
        with open(temp.name, "w") as f:
            for line in select_result:
                f.write(line)
    print("The CSV File of Routes is: ", temp.name)
    pg_staging_table = kwargs["p_staging_table"]
    pg_target_table = kwargs["p_target_table"]
    pg_target_sql = kwargs["p_target_sql"]
    pg_hook = PostgresHook(postgres_conn_id="postgres_sales",
                           supports_autocommit=True)
    pg_hook.run(sql=f"truncate table {pg_staging_table};", autocommit=True)
    pg_hook.run(sql=f"truncate table {pg_target_table};", autocommit=True)
    pg_hook.bulk_load(table=pg_staging_table, tmp_file=temp.name)
    print("Postgres sales.transportation_zones_staging loaded")
    pg_hook.run(sql=pg_target_sql, autocommit=True)
Example 10
def importData(_filename):
    print('Processing: {}'.format(_filename))

    pgHook  = PostgresHook(postgres_conn_id=DB_NAME)

    #LOAD the data INTO the intermediary table
    pgHook.bulk_load('provider_image_data', _filename)


    #DELETE invalid records
    pgHook.run('DELETE FROM provider_image_data WHERE url IS NULL;')
    pgHook.run('DELETE FROM provider_image_data WHERE license IS NULL;')
    pgHook.run('DELETE FROM provider_image_data WHERE foreign_landing_url IS NULL;')
    pgHook.run('DELETE FROM provider_image_data WHERE foreign_identifier IS NULL;')


    #DELETE duplicate records
    pgHook.run('DELETE FROM provider_image_data p1 USING provider_image_data p2 WHERE p1.ctid < p2.ctid and p1.provider = p2.provider and p1.foreign_identifier = p2.foreign_identifier;')


    #UPSERT the records
    query = """INSERT INTO image (created_on, updated_on, provider, source, foreign_identifier, foreign_landing_url, url, thumbnail, width, height, license, license_version, creator, creator_url, title, last_synced_with_source, removed_from_source, meta_data, tags, watermarked)
        SELECT NOW(), NOW(), provider, source, foreign_identifier, foreign_landing_url, url, thumbnail, width, height, license, license_version, creator, creator_url, title, NOW(), 'f', meta_data, tags, watermarked
        FROM provider_image_data
        ON CONFLICT (provider, md5((foreign_identifier)::text), md5((url)::text))
        DO UPDATE SET updated_on = now(), foreign_landing_url = EXCLUDED.foreign_landing_url, url = EXCLUDED.url, thumbnail = EXCLUDED.thumbnail, width = EXCLUDED.width, height = EXCLUDED.height, license = EXCLUDED.license, license_version = EXCLUDED.license_version, creator = EXCLUDED.creator, creator_url = EXCLUDED.creator_url, title = EXCLUDED.title, last_synced_with_source = NOW(), removed_from_source = 'f', meta_data = EXCLUDED.meta_data, watermarked = EXCLUDED.watermarked
        WHERE image.foreign_identifier = EXCLUDED.foreign_identifier and image.provider = EXCLUDED.provider;"""

    pgHook.run(query)


    #delete rows from the intermediary table
    #If the preceding task was NOT successful, its state switches to 'up_for_retry'
    #and the steps below are not performed (i.e. they are queued until the
    #preceding task succeeds)
    pgHook.run('DELETE FROM provider_image_data;')

    #REMOVE the file that was loaded
    if os.path.exists(_filename):
        os.remove(_filename)
Example 11
class Postgres:
    """
        Postgres
    """
    def __init__(self, conn_id):
        self.hook = PostgresHook(postgres_conn_id=conn_id)
        self.conn = self.hook.get_conn()

    def exec(self, query: str, query_name: str):
        """
            Executa query e printa log com:
                - Time elapsed tempo de execução
                - mensagens de retorno do postgres
                - notificações do postgres
        """
        start_time = datetime.now()
        self.conn.autocommit = False
        self.conn.notices = []
        cursor = self.conn.cursor()
        cursor.execute(query)
        self.conn.commit()

        time_elapsed = datetime.now() - start_time

        str_notice = ''
        if cursor.statusmessage != 'DO':
            str_notice = cursor.statusmessage

        for notice in self.conn.notices:
            str_notice += notice.strip().rstrip().replace('NOTICE:  ', '')

        msg = f'Time elapsed {format(time_elapsed)}   {query_name}'
        msg = msg + ((120 - len(msg)) * ' ') + str_notice
        print(msg)

    def exec_no_transaction(self, query: str, query_name: str):
        """
            Executa query sem transação
        """
        start_time = datetime.now()
        self.conn.notices = []
        self.conn.autocommit = True
        cursor = self.conn.cursor()
        cursor.execute(query)
        self.conn.autocommit = False

        time_elapsed = datetime.now() - start_time

        str_notice = ''

        if cursor.statusmessage != 'DO':
            str_notice = cursor.statusmessage

        for notice in self.conn.notices:
            str_notice += notice.strip().rstrip().replace('NOTICE:  ', '')

        msg = f'Time elapsed {format(time_elapsed)}   {query_name}'
        msg = msg + ((120 - len(msg)) * ' ') + str_notice
        print(msg)

    def exec_return(self, query: str, query_name: str):
        """
            Executa query e printa log com:
                - Time elapsed tempo de execução
                - mensagens de retorno do postgres
                - notificações do postgres
        """
        start_time = datetime.now()

        self.conn.autocommit = False
        self.conn.notices = []
        cursor = self.conn.cursor()
        cursor.execute(query)
        tables = cursor.fetchall()
        self.conn.commit()

        time_elapsed = datetime.now() - start_time

        str_notice = ''
        if cursor.statusmessage != 'DO':
            str_notice = cursor.statusmessage

        for notice in self.conn.notices:
            str_notice += notice.strip().rstrip().replace('NOTICE:  ', '')

        msg = f'Time elapsed {format(time_elapsed)}   {query_name}'
        msg = msg + ((120 - len(msg)) * ' ') + str_notice
        print(msg)
        return tables

    def exec_insert(self, data: list, table: str, truncate: bool = False):
        """
            Insere lista em tabela
        """

        # Keep JSON values as Postgres json instead of parsing them into
        # Python dictionaries
        register_adapter(dict, Json)
        register_json(oid=3802, array_oid=3807, globally=True)

        print(f'Inserting in: {table}')

        self.conn.autocommit = False
        dest_cursor = self.conn.cursor()

        if truncate:
            dest_cursor.execute(f'TRUNCATE TABLE {table};')

        inserted = 0
        while True:
            lines = data[0:1000]
            del data[0:1000]
            inserted += len(lines)
            if not lines:
                break
            try:
                execute_values(
                    dest_cursor,
                    f'INSERT INTO {table} VALUES %s;',
                    lines,
                )
            except Exception as error:
                print(f'Line - {lines}')
                raise Exception(error) from error

            print(f'Inserted: {inserted}')
        self.conn.commit()

    def dump_to_dest(self, src_conn_id: str, t_schema: str, t_name: str):
        """
            Carrega dump da origem para o destino
        """
        # avoid parse to python dictionary (keeps postgres json)
        register_adapter(dict, Json)
        register_json(oid=3802, array_oid=3807, globally=True)

        src_hook = PostgresHook(postgres_conn_id=src_conn_id)
        src_conn = src_hook.get_conn()
        src_cursor = src_conn.cursor()
        src_cursor.execute(f'select count(0) from {t_name};')
        qtd = src_cursor.fetchone()[0]

        dest_cursor = self.conn.cursor()
        dest_cursor.execute(f'TRUNCATE TABLE {t_schema}.{t_name};')
        self.conn.commit()

        if qtd > 0:
            with tempfile.NamedTemporaryFile() as temp_file:
                print('Generating dump of table:', t_name, 'rows:', qtd)
                src_hook.bulk_dump(t_name, temp_file.name)
                print('Loading dump into table:', f'{t_schema}.{t_name}',
                      'rows:', qtd)
                self.hook.bulk_load(f'{t_schema}.{t_name}', temp_file.name)
        else:
            print('No dump generated for table:', t_name,
                  'because it has 0 rows')
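
A minimal usage sketch for the wrapper above, for example from a PythonOperator callable; the connection IDs, schema, and table names are placeholders:

def refresh_staging_table(**_):
    pg = Postgres(conn_id='postgres_default')        # placeholder conn id
    # VACUUM cannot run inside a transaction block, hence exec_no_transaction.
    pg.exec_no_transaction('VACUUM ANALYZE staging.my_table;',
                           'vacuum staging.my_table')
    rows = pg.exec_return(
        "SELECT tablename FROM pg_tables WHERE schemaname = 'staging';",
        'list staging tables')
    print(f'{len(rows)} tables in staging')
    # Copy a whole table from another Postgres connection into this one.
    pg.dump_to_dest(src_conn_id='postgres_source',   # placeholder source conn
                    t_schema='staging', t_name='my_table')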
Example 12
    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                            schema=self.database)
        hook.bulk_load(table=self.target_table, tmp_file=self.source_file)
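
For context, a hedged sketch of the kind of custom operator this execute method could belong to; the class name and constructor arguments are assumptions, and only the execute body mirrors the fragment above:

from airflow.models import BaseOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook  # path varies by Airflow version


class FileToPostgresOperator(BaseOperator):
    def __init__(self, source_file, target_table, postgres_conn_id,
                 database=None, **kwargs):
        super().__init__(**kwargs)
        self.source_file = source_file
        self.target_table = target_table
        self.postgres_conn_id = postgres_conn_id
        self.database = database

    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                            schema=self.database)
        hook.bulk_load(table=self.target_table, tmp_file=self.source_file)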