def load_local_data_to_intermediate_table(
        postgres_conn_id, tsv_file_name, identifier, max_rows_to_skip=10):
    load_table = _get_load_table_name(identifier)
    logger.info(f'Loading {tsv_file_name} into {load_table}')
    postgres = PostgresHook(postgres_conn_id=postgres_conn_id)
    load_successful = False

    while not load_successful and max_rows_to_skip >= 0:
        try:
            postgres.bulk_load(f'{load_table}', tsv_file_name)
            load_successful = True
        except InvalidTextRepresentation as e:
            line_number = _get_malformed_row_in_file(str(e))
            _delete_malformed_row_in_file(tsv_file_name, line_number)
        finally:
            max_rows_to_skip = max_rows_to_skip - 1

    if not load_successful:
        raise InvalidTextRepresentation(
            'Exceeded the maximum number of allowed defective rows')

    _clean_intermediate_table_data(postgres, load_table)

def copy_rows(self, file_name, table_name, postgres_conn_id):
    print("### fetching records from CSV file ###")
    print("### inserting records into Postgres table ###")
    postgresserver = PostgresHook(postgres_conn_id)
    postgresserver.bulk_load(table_name, file_name)
    return True

def gcs_to_psql_import(**kwargs):
    fd, tmp_filename = tempfile.mkstemp(text=True)

    # download the file locally
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=kwargs['gcp_conn_id'])
    gcs_hook.download(bucket=kwargs['bucket'],
                      object=kwargs['object'],
                      filename=tmp_filename)
    del gcs_hook

    # load the file into postgres
    pg_hook = PostgresHook(postgres_conn_id=kwargs['postgres_conn_id'],
                           schema=kwargs['database'])
    pg_hook.bulk_load(
        '{schema}.{table}'.format(schema=kwargs['schema'],
                                  table=kwargs['table']),
        tmp_filename)

    # output errors
    for output in pg_hook.conn.notices:
        print(output)

    # remove the temp file
    os.close(fd)
    os.unlink(tmp_filename)

def load_spark_csv_to_postgres(spark_csv_dir, postgres_conn_id,
                               postgres_table_name):
    """
    Load Spark CSV to Postgres.

    Keyword arguments:
    spark_csv_dir -- directory where the CSV written from Spark is located
    postgres_conn_id -- ID of the Airflow connection
    postgres_table_name -- name of the target table in the Postgres database
    """
    # Get the Spark CSV file path
    spark_csv_file_name = list(filter(
        re.compile("part-.*csv$").match,
        os.listdir(spark_csv_dir)
    ))[0]
    spark_csv_file_path = f"{spark_csv_dir}/{spark_csv_file_name}"

    # Get the Postgres hook
    postgres_hook = PostgresHook(postgres_conn_id)

    # Clear the Postgres table
    print(f"Clearing Postgres table {postgres_table_name}")
    postgres_hook.run(f"truncate {postgres_table_name}")

    # Copy data to the Postgres table
    print(
        f"""
        Loading Spark CSV {spark_csv_file_path}
        into Postgres table {postgres_table_name}
        """
    )
    postgres_hook.bulk_load(postgres_table_name, spark_csv_file_path)

def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

    self.log.info(
        f"Loading file s3://{self.s3_bucket}/{self.s3_key} into table {self.table}."
    )
    with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                             self.s3_key) as f:
        postgres.bulk_load(self.table, f)
    self.log.info(
        f"s3://{self.s3_bucket}/{self.s3_key} loaded into table {self.table} successfully."
    )

def import_data_to_intermediate_table(postgres_conn_id, tsv_file_name,
                                      identifier):
    load_table = _get_load_table_name(identifier)
    logger.info(f'Loading {tsv_file_name} into {load_table}')
    postgres = PostgresHook(postgres_conn_id=postgres_conn_id)

    postgres.bulk_load(f'{load_table}', tsv_file_name)
    postgres.run(f'DELETE FROM {load_table} WHERE url IS NULL;')
    postgres.run(f'DELETE FROM {load_table} WHERE license IS NULL;')
    postgres.run(
        f'DELETE FROM {load_table} WHERE foreign_landing_url IS NULL;')
    postgres.run(f'DELETE FROM {load_table} WHERE foreign_identifier IS NULL;')
    postgres.run(
        f'DELETE FROM {load_table} p1'
        f' USING {load_table} p2'
        f' WHERE p1.ctid < p2.ctid'
        f' AND p1.provider = p2.provider'
        f' AND p1.foreign_identifier = p2.foreign_identifier;')

def test_bulk_load(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
            conn.commit()

            with NamedTemporaryFile() as f:
                f.write("\n".join(input_data).encode("utf-8"))
                f.flush()
                hook.bulk_load(self.table, f.name)

            cur.execute("SELECT * FROM {}".format(self.table))
            results = [row[0] for row in cur.fetchall()]

    self.assertEqual(sorted(input_data), sorted(results))

def load_athena_to_postgres(**kwargs):
    s3_hook = S3Hook("aws_default")

    # get the folder in the s3 bucket
    bucket_prefix = kwargs["p_buckpref"]
    the_folder = s3_hook.list_keys(bucket_name=BUCKET_NAME,
                                   prefix=bucket_prefix)

    select_statement = "SELECT * FROM S3Object s"
    select_result = s3_hook.select_key(
        key=the_folder[0],
        bucket_name=BUCKET_NAME,
        expression=select_statement,
        input_serialization={"CSV": {"FileHeaderInfo": "USE"}},
        output_serialization={
            "CSV": {
                "RecordDelimiter": "\n",
                "FieldDelimiter": "\t"
            }
        },
    )

    fname = kwargs["p_filename"]
    with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix=".csv",
                                     prefix=fname, dir="/tmp") as temp:
        with open(temp.name, "w") as f:
            for line in select_result:
                f.write(line)
        print("The CSV File of Routes is: ", temp.name)

    pg_staging_table = kwargs["p_staging_table"]
    pg_target_table = kwargs["p_target_table"]
    pg_target_sql = kwargs["p_target_sql"]

    pg_hook = PostgresHook(postgres_conn_id="postgres_sales",
                           supports_autocommit=True)
    pg_hook.run(sql=f"truncate table {pg_staging_table};", autocommit=True)
    pg_hook.run(sql=f"truncate table {pg_target_table};", autocommit=True)
    pg_hook.bulk_load(table=f"{pg_staging_table}", tmp_file=temp.name)
    print("Postgres sales.transportation_zones_staging loaded")
    pg_hook.run(sql=pg_target_sql, autocommit=True)

def importData(_filename):
    print('Processing: {}'.format(_filename))
    pgHook = PostgresHook(postgres_conn_id=DB_NAME)

    # LOAD the data INTO the intermediary table
    pgHook.bulk_load('provider_image_data', _filename)

    # DELETE invalid records
    pgHook.run('DELETE FROM provider_image_data WHERE url IS NULL;')
    pgHook.run('DELETE FROM provider_image_data WHERE license IS NULL;')
    pgHook.run('DELETE FROM provider_image_data WHERE foreign_landing_url IS NULL;')
    pgHook.run('DELETE FROM provider_image_data WHERE foreign_identifier IS NULL;')

    # DELETE duplicate records
    pgHook.run(
        'DELETE FROM provider_image_data p1 USING provider_image_data p2 '
        'WHERE p1.ctid < p2.ctid and p1.provider = p2.provider '
        'and p1.foreign_identifier = p2.foreign_identifier;')

    # UPSERT the records
    query = """INSERT INTO image (created_on, updated_on, provider, source, foreign_identifier,
        foreign_landing_url, url, thumbnail, width, height, license, license_version, creator,
        creator_url, title, last_synced_with_source, removed_from_source, meta_data, tags, watermarked)
    SELECT NOW(), NOW(), provider, source, foreign_identifier, foreign_landing_url, url, thumbnail,
        width, height, license, license_version, creator, creator_url, title, NOW(), 'f', meta_data,
        tags, watermarked
    FROM provider_image_data
    ON CONFLICT (provider, md5((foreign_identifier)::text), md5((url)::text))
    DO UPDATE SET
        updated_on = now(),
        foreign_landing_url = EXCLUDED.foreign_landing_url,
        url = EXCLUDED.url,
        thumbnail = EXCLUDED.thumbnail,
        width = EXCLUDED.width,
        height = EXCLUDED.height,
        license = EXCLUDED.license,
        license_version = EXCLUDED.license_version,
        creator = EXCLUDED.creator,
        creator_url = EXCLUDED.creator_url,
        title = EXCLUDED.title,
        last_synced_with_source = NOW(),
        removed_from_source = 'f',
        meta_data = EXCLUDED.meta_data,
        watermarked = EXCLUDED.watermarked
    WHERE image.foreign_identifier = EXCLUDED.foreign_identifier
        and image.provider = EXCLUDED.provider;"""
    pgHook.run(query)

    # DELETE rows from the intermediary table.
    # If the upsert above was not successful, the task state switches to
    # 'up for retry' and the statements below are not executed (i.e. they are
    # queued until the preceding step succeeds).
    pgHook.run('DELETE FROM provider_image_data;')

    # REMOVE the file that was loaded
    os.remove(_filename) if os.path.exists(_filename) else None

class Postgres:
    """
    Postgres
    """

    def __init__(self, conn_id):
        self.hook = PostgresHook(postgres_conn_id=conn_id)
        self.conn = self.hook.get_conn()

    def exec(self, query: str, query_name: str):
        """
        Executes the query and prints a log line with:
        - time elapsed during execution
        - Postgres status messages
        - Postgres notices
        """
        start_time = datetime.now()
        self.conn.autocommit = False
        self.conn.notices = []
        cursor = self.conn.cursor()
        cursor.execute(query)
        self.conn.commit()
        time_elapsed = datetime.now() - start_time

        str_notice = ''
        if cursor.statusmessage != 'DO':
            str_notice = cursor.statusmessage
        for notice in self.conn.notices:
            str_notice += notice.strip().rstrip().replace('NOTICE: ', '')

        msg = f'Time elapsed {format(time_elapsed)} {query_name}'
        msg = msg + ((120 - len(msg)) * ' ') + str_notice
        print(msg)

    def exec_no_transaction(self, query: str, query_name: str):
        """
        Executes the query without a transaction (autocommit).
        """
        start_time = datetime.now()
        self.conn.notices = []
        self.conn.autocommit = True
        cursor = self.conn.cursor()
        cursor.execute(query)
        self.conn.autocommit = False
        time_elapsed = datetime.now() - start_time

        str_notice = ''
        if cursor.statusmessage != 'DO':
            str_notice = cursor.statusmessage
        for notice in self.conn.notices:
            str_notice += notice.strip().rstrip().replace('NOTICE: ', '')

        msg = f'Time elapsed {format(time_elapsed)} {query_name}'
        msg = msg + ((120 - len(msg)) * ' ') + str_notice
        print(msg)

    def exec_return(self, query: str, query_name: str):
        """
        Executes the query, prints the same log line as exec(),
        and returns the fetched rows.
        """
        start_time = datetime.now()
        self.conn.autocommit = False
        self.conn.notices = []
        cursor = self.conn.cursor()
        cursor.execute(query)
        tables = cursor.fetchall()
        self.conn.commit()
        time_elapsed = datetime.now() - start_time

        str_notice = ''
        if cursor.statusmessage != 'DO':
            str_notice = cursor.statusmessage
        for notice in self.conn.notices:
            str_notice += notice.strip().rstrip().replace('NOTICE: ', '')

        msg = f'Time elapsed {format(time_elapsed)} {query_name}'
        msg = msg + ((120 - len(msg)) * ' ') + str_notice
        print(msg)
        return tables

    def exec_insert(self, data: list, table: str, truncate: bool = False):
        """
        Inserts a list of rows into a table.
        """
        register_adapter(
            dict, Json)  # avoid parsing to a Python dictionary (keeps Postgres json)
        register_json(
            oid=3802, array_oid=3807, globally=True
        )  # avoid parsing to a Python dictionary (keeps Postgres json)

        print(f'Inserting in: {table}')
        self.conn.autocommit = False
        dest_cursor = self.conn.cursor()
        if truncate:
            dest_cursor.execute(f'TRUNCATE TABLE {table};')

        inserted = 0
        while True:
            lines = data[0:1000]
            del data[0:1000]
            inserted += len(lines)
            if not lines:
                break
            try:
                execute_values(
                    dest_cursor,
                    f'INSERT INTO {table} VALUES %s;'.format(
                        table=sql.Identifier(table)),
                    lines,
                )
            except Exception as error:
                print(f'Line - {lines}')
                raise Exception(error) from error
        print(f'Inserted: {inserted}')
        self.conn.commit()

    def dump_to_dest(self, src_conn_id: str, t_schema: str, t_name: str):
        """
        Loads a dump from the source connection into the destination.
        """
        # avoid parsing to a Python dictionary (keeps Postgres json)
        register_adapter(dict, Json)
        register_json(oid=3802, array_oid=3807, globally=True)

        src_hook = PostgresHook(postgres_conn_id=src_conn_id)
        src_conn = src_hook.get_conn()
        src_cursor = src_conn.cursor()
        src_cursor.execute(f'select count(0) from {t_name};')
        qtd = src_cursor.fetchone()[0]

        dest_cursor = self.conn.cursor()
        dest_cursor.execute(f'TRUNCATE TABLE {t_schema}.{t_name};')
        self.conn.commit()

        if qtd > 0:
            with tempfile.NamedTemporaryFile() as temp_file:
                print('Generating dump for table:', t_name, 'rows:', qtd)
                src_hook.bulk_dump(t_name, temp_file.name)
                print('Loading dump into table:', f'{t_schema}.{t_name}',
                      'rows:', qtd)
                self.hook.bulk_load(f'{t_schema}.{t_name}', temp_file.name)
        else:
            print('No dump generated for table:', t_name,
                  'because it has 0 rows')

def execute(self, context):
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                        schema=self.database)
    hook.bulk_load(table=self.target_table, tmp_file=self.source_file)
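
For reference, here is a minimal sketch of wiring a bulk_load call into a DAG as a PythonOperator task. The connection ID, table name, file path, and Airflow 2.x import paths are illustrative assumptions, not taken from the examples above.

from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook  # Airflow 2.x path (assumption)


def _load_tsv():
    # Hypothetical connection ID, target table, and file path; replace with your own.
    hook = PostgresHook(postgres_conn_id="postgres_default")
    # bulk_load issues COPY ... FROM STDIN, so the file should be
    # tab-separated text with no header row.
    hook.bulk_load("staging_table", "/tmp/data.tsv")


with DAG(
    dag_id="bulk_load_example",
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    load_task = PythonOperator(task_id="load_tsv", python_callable=_load_tsv)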