def test_get_conn_from_connection_with_schema(self, mock_connect):
    conn = Connection(login='******', password='******', host='host', schema='schema')
    hook = PostgresHook(connection=conn, schema='schema-override')
    hook.get_conn()
    mock_connect.assert_called_once_with(
        user='******', password='******', host='host', dbname='schema-override', port=None
    )

def execute(self, context, testing=False):
    """
    Query data from the PostgreSQL master database and write it into a JSON file.

    The default JSON file name is the table name plus the UTC time.
    """
    self.log.info('LoadToJsonOperator Starting...')
    try:
        self.log.info("Initializing Postgres Master DB Connection...")
        psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
        psql_conn = psql_hook.get_conn()
        psql_cursor = psql_conn.cursor(cursor_factory=RealDictCursor)
        execution_date = context.get('execution_date')
        execution_date = execution_date.strftime(self._date_format)
        for table in self._tables:
            ports_select_all = self._query.format(table=table)
            psql_cursor.execute(ports_select_all)
            # generate a default filename per table if none was supplied
            if table not in self._filenames:
                self._filenames[table] = f"{table}_{execution_date}.json"
            self.save_to_json(cursor=psql_cursor,
                              key=table,
                              file_name=self._filenames[table])
    except (UndefinedTable, OperationalError):
        self.log.error("LoadToJsonOperator FAILED.")
        raise Exception("LoadToJsonOperator FAILED. OperationalError")
    except Exception:
        self.log.error(traceback.format_exc())
        raise Exception("LoadToJsonOperator FAILED.")
    finally:
        if not testing:
            self.log.info('Closing database connections...')
            psql_conn.close()
    self.log.info('LoadToJsonOperator SUCCESS!')

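# save_to_json is called above but not shown here. A minimal sketch of what it
# could look like, assuming RealDictCursor rows and one top-level key per table
# (the signature matches the call site; the body is an assumption):
import json

def save_to_json(self, cursor, key, file_name):
    rows = [dict(row) for row in cursor]
    with open(file_name, 'w') as fp:
        # default=str covers datetimes/UUIDs that json can't serialize natively
        json.dump({key: rows}, fp, default=str)
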
def query(self):
    """Queries Postgres and returns a cursor to the results."""
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql, self.parameters)
    return cursor

def process_song_file(filepath):
    """
    Reads from song datasource and loads data into song and artist tables

    :param filepath: Source data location
    """
    postgres_hook = PostgresHook(postgres_conn_id='local_postgres', schema='postgres')
    conn = postgres_hook.get_conn()
    cur = conn.cursor()

    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    song_data = df[['song_id', 'title', 'artist_id', 'year', 'duration']].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    artist_data = df[[
        'artist_id', 'artist_name', 'artist_location',
        'artist_latitude', 'artist_longitude'
    ]].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)

    cur.close()
    conn.commit()

def grower_export_wrap(ds, **kwargs):
    from lib.utils import print_time
    db = PostgresHook(postgres_conn_id=postgresConnId)
    conn = db.get_conn()
    try:
        date = datetime.now().strftime("%Y-%m-%d")
        CKAN_DOMAIN = Variable.get("CKAN_DOMAIN")
        # check if CKAN_DOMAIN exists
        assert CKAN_DOMAIN
        CKAN_DATASET_NAME_GROWER_DATA = Variable.get(
            "CKAN_DATASET_NAME_GROWER_DATA")
        assert CKAN_DATASET_NAME_GROWER_DATA
        CKAN_API_KEY = Variable.get("CKAN_API_KEY")
        assert CKAN_API_KEY
        ckan_config = {
            "CKAN_DOMAIN": CKAN_DOMAIN,
            "CKAN_DATASET_NAME_GROWER_DATA": CKAN_DATASET_NAME_GROWER_DATA,
            "CKAN_API_KEY": CKAN_API_KEY,
        }
        print("ckan_config:", ckan_config)
        grower_export(conn, date, 178, ckan_config)
        return 0
    except Exception as e:
        print("got error during export:", e)
        raise ValueError('Error executing query')

def tearDownClass(cls):
    postgres = PostgresHook()
    with postgres.get_conn() as conn:
        with conn.cursor() as cur:
            for table in TABLES:
                cur.execute(
                    "DROP TABLE IF EXISTS {} CASCADE;".format(table))

def execute(self, context):
    hook = PostgresHook()
    connection = hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(f"SELECT COUNT(*) FROM {self.table_name.lower()};")
    count_r = cursor.fetchall()
    context["ti"].xcom_push(
        key=f"{self.table_name}_rows_count",
        value=count_r,
    )

def insert_row(sql_query, table_name, custom_id, dt_now, **kwargs):
    """
    Postgres hook to insert a new row: | id | user | timestamp |
    """
    hook = PostgresHook()
    connection = hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(
        sql_query,
        (custom_id, kwargs["ti"].xcom_pull(task_ids="getting_current_user"), dt_now)
    )
    connection.commit()

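# The sql_query argument is passed in from elsewhere; a hypothetical statement
# matching the three-value tuple bound above (table and column names assumed):
EXAMPLE_INSERT_ROW = """
    INSERT INTO user_actions (id, username, created_at)
    VALUES (%s, %s, %s);
"""
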
def create_new_person_records(ds, **kwargs):
    db = PostgresHook(postgres_conn_id=postgresConnId)
    conn = db.get_conn()
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cursor.execute("""
            SELECT * FROM planter
            JOIN (
                SELECT regexp_replace((trim(lower(first_name)) || trim(lower(last_name))), '[ .-]', '', 'g') as name_key, count(*)
                FROM planter
                WHERE planter.organization_id IN (
                    select entity_id from getEntityRelationshipChildren(178)
                )
                GROUP BY name_key
                HAVING count(*) = 1
                ORDER BY name_key
            ) eligible_records
            ON regexp_replace((trim(lower(first_name)) || trim(lower(last_name))), '[ .-]', '', 'g') = eligible_records.name_key
            WHERE planter.organization_id IN (
                select entity_id from getEntityRelationshipChildren(178)
            )
            AND person_id IS NULL
        """)
        print("SQL result:", cursor.query)
        for row in cursor:
            # create an entity record for each eligible planter row,
            # then link the planter back to it
            print(row)
            updateCursor = conn.cursor()
            updateCursor.execute("""
                INSERT INTO entity (type, first_name, last_name, email, phone)
                VALUES ('p', %s, %s, %s, %s)
                RETURNING *
            """, (row['first_name'], row['last_name'], row['email'], row['phone']))
            personId = updateCursor.fetchone()[0]
            print(personId)
            updateCursor.execute("""
                UPDATE planter
                SET person_id = %s
                WHERE id = %s
            """, (personId, row['id']))
            conn.commit()
        return 0
    except Exception as e:
        print("got error when executing SQL:", e)
        print("SQL result:", updateCursor.query)
        raise ValueError('Error executing query')

def query(self):
    """Queries Postgres and returns a cursor to the results."""
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    conn = hook.get_conn()
    cursor = conn.cursor(name=self._unique_name())
    cursor.execute(self.sql, self.parameters)
    if self.use_server_side_cursor:
        cursor.itersize = self.cursor_itersize
        return _PostgresServerSideCursorDecorator(cursor)
    return cursor

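# _PostgresServerSideCursorDecorator is referenced above but not defined in
# this snippet. A minimal sketch of such a wrapper, as an illustration of the
# idea only, not the actual Airflow implementation:
class _PostgresServerSideCursorDecorator:
    """Wraps a named (server-side) cursor so callers iterate rows lazily."""

    def __init__(self, cursor):
        self.cursor = cursor

    def __iter__(self):
        # psycopg2 pulls rows from a named cursor in itersize-sized batches
        return iter(self.cursor)

    @property
    def description(self):
        return self.cursor.description
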
def execute(self, context):
    es = ElasticHook(conn_id=self.elastic_conn_id)
    pg = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    with closing(pg.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.itersize = 1000
            cursor.execute(self.sql)
            for row in cursor:
                doc = json.dumps(row, indent=2)
                es.add_doc(index=self.index, doc_type='external', doc=doc)

def test_rowcount(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"CREATE TABLE {self.table} (c VARCHAR)")
            values = ",".join(f"('{data}')" for data in input_data)
            cur.execute(f"INSERT INTO {self.table} VALUES {values}")
            conn.commit()
            assert cur.rowcount == len(input_data)

def test_rowcount(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
            values = ",".join("('{}')".format(data) for data in input_data)
            cur.execute("INSERT INTO {} VALUES {}".format(
                self.table, values))
            conn.commit()
            self.assertEqual(cur.rowcount, len(input_data))

def ingest_file_into_db(self, request_id):
    pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                           schema=self.database)
    with closing(pg_hook.get_conn()) as pg_conn:
        with closing(pg_conn.cursor()) as pg_cursor:
            csv_file = self.unzip_csv(
                self.get_file_data(request_id, pg_cursor))
            rowcount = self.upload_local_file_into_table(
                csv_file, pg_cursor, table.weather_daily)
            csv_file.close()
            pg_conn.commit()
    return rowcount

def process_log_file(filepath):
    """
    Reads from log datasource and loads data into time, user and songplays tables

    :param filepath: Source data location
    """
    postgres_hook = PostgresHook(postgres_conn_id='local_postgres', schema='postgres')
    conn = postgres_hook.get_conn()
    cur = conn.cursor()

    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df['page'] == 'NextSong']

    # convert timestamp column to datetime
    t = pd.to_datetime(df['ts'], unit='ms')

    # insert time data records
    time_data = [t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday]
    column_labels = ['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    time_df = pd.DataFrame.from_dict(dict(zip(column_labels, time_data)))

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (pd.to_datetime(row['ts'], unit='ms'), row['userId'],
                         row['level'], songid, artistid, row['sessionId'],
                         row['location'], row['userAgent'])
        cur.execute(songplay_table_insert, songplay_data)

    cur.close()
    conn.commit()

def get_data():
    pg_hook = PostgresHook(postgres_conn_id='my_postgres_conn', schema='breakfast')
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    sql = """
        SELECT * FROM product
    """
    cursor.execute(sql)
    rows = cursor.fetchall()
    for each in rows:
        print(each)

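# get_data is a plain callable, so it would typically be scheduled with a
# PythonOperator; the dag object and task_id below are assumptions.
from airflow.operators.python import PythonOperator

get_data_task = PythonOperator(
    task_id='print_product_rows',  # hypothetical task id
    python_callable=get_data,
    dag=dag,                       # assumes a `dag` defined elsewhere
)
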
def dump_transaction_data_each_week(yesterday_ds, **kwargs):
    week_end_date = macros.ds_format(yesterday_ds, '%Y-%m-%d', '%d-%b-%y')
    pg_hook = PostgresHook(postgres_conn_id='my_postgres_conn', schema='breakfast')
    connection = pg_hook.get_conn()
    cursor = connection.cursor()
    sql = f"""
        SELECT * FROM transaction WHERE week_end_date = '{week_end_date}'
    """
    cursor.execute(sql)
    rows = cursor.fetchall()
    for each in rows:
        print(each)

def test_data_quality_operator_handles_db_error(
    self, mocker, postgresql, ports_collection, test_dag, tmp_path: Path
):
    """Test if data_quality_operator correctly handles db errors."""
    # Create PSQL mock object
    mocker.patch.object(
        PostgresHook,
        "get_conn",
        return_value=postgresql
    )

    # Make sure that the source table has an item in it
    pg_hook = PostgresHook()
    conn = pg_hook.get_conn()
    cursor = conn.cursor(cursor_factory=RealDictCursor)
    query = TransformAndLoadOperator._format_query(
        SqlQueries.ports_table_insert,
        {"updated_at": datetime.datetime.utcnow()}
    )
    data = {
        'id': 1,
        'countryName': 'Philippines',
        'portName': 'Aleran/Ozamis',
        'unlocode': 'PH ALE',
        'coordinates': '4234N 00135E',
        'staging_id': str(uuid.uuid4())
    }
    cursor.execute(query, data)

    # Define the data quality checker
    postgres_master_config = PostgresConfig('postgres_default')
    check_data_quality = DataQualityCheckOperator(
        tables=['ports'],
        postgres_config=postgres_master_config,
        task_id='test',
        queries={
            "ports_row_count": SqlQueries.ports_row_count,
            "ports_updated_count": SqlQueries.ports_updated_count
        },
        dag=test_dag
    )

    with raises(Exception) as exc:
        # Close the connection to implicitly raise an error
        conn.close()
        check_data_quality.execute(context={}, testing=True)
    assert exc.type is Exception

def test_bulk_dump(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
            values = ",".join("('{}')".format(data) for data in input_data)
            cur.execute("INSERT INTO {} VALUES {}".format(self.table, values))
            conn.commit()

    with NamedTemporaryFile() as f:
        hook.bulk_dump(self.table, f.name)
        f.seek(0)
        results = [line.rstrip().decode("utf-8") for line in f.readlines()]

    self.assertEqual(sorted(input_data), sorted(results))

def setUpClass(cls):
    postgres = PostgresHook()
    with postgres.get_conn() as conn:
        with conn.cursor() as cur:
            for table in TABLES:
                cur.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table))
                cur.execute("CREATE TABLE {}(some_str varchar, some_num integer);".format(table))

            cur.execute(
                "INSERT INTO postgres_to_gcs_operator VALUES(%s, %s);",
                ('mock_row_content_1', 42)
            )
            cur.execute(
                "INSERT INTO postgres_to_gcs_operator VALUES(%s, %s);",
                ('mock_row_content_2', 43)
            )
            cur.execute(
                "INSERT INTO postgres_to_gcs_operator VALUES(%s, %s);",
                ('mock_row_content_3', 44)
            )

def test_bulk_dump(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"CREATE TABLE {self.table} (c VARCHAR)")
            values = ",".join(f"('{data}')" for data in input_data)
            cur.execute(f"INSERT INTO {self.table} VALUES {values}")
            conn.commit()

    with NamedTemporaryFile() as f:
        hook.bulk_dump(self.table, f.name)
        f.seek(0)
        results = [line.rstrip().decode("utf-8") for line in f.readlines()]

    assert sorted(input_data) == sorted(results)

def earing_export_wrap(ds, **kwargs):
    from lib.utils import print_time
    db = PostgresHook(postgres_conn_id=postgresConnId)
    conn = db.get_conn()
    try:
        # # generate last month
        # # get the month value of the last month
        # now = datetime.now()
        # last_month = now.month - 1
        # # get the year value of the last month
        # last_year = now.year
        # if last_month == 0:
        #     last_month = 12
        #     last_year = now.year - 1
        # print("last_month:", last_month)
        # print("last_year:", last_year)
        # # get the last month
        # year_month = str(last_year) + "-" + str(last_month)
        # print("year_month:", year_month)

        date = datetime.now().strftime("%Y-%m-%d")
        print("date:", date)
        CKAN_DOMAIN = Variable.get("CKAN_DOMAIN")
        # check if CKAN_DOMAIN exists
        assert CKAN_DOMAIN
        CKAN_DATASET_NAME_EARNING_DATA = Variable.get(
            "CKAN_DATASET_NAME_EARNING_DATA")
        assert CKAN_DATASET_NAME_EARNING_DATA
        CKAN_API_KEY = Variable.get("CKAN_API_KEY")
        assert CKAN_API_KEY
        ckan_config = {
            "CKAN_DOMAIN": CKAN_DOMAIN,
            "CKAN_DATASET_NAME_EARNING_DATA": CKAN_DATASET_NAME_EARNING_DATA,
            "CKAN_API_KEY": CKAN_API_KEY,
        }
        print("ckan_config:", ckan_config)
        start_end = lib.utils.get_start_and_end_date_of_previous_month(date)
        lib.earning_export.earning_export(conn, start_end[0], start_end[1], ckan_config)
        return 0
    except Exception as e:
        print("got error when executing SQL:", e)
        raise ValueError('Error executing query')

def test_bulk_load(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
            conn.commit()

            with NamedTemporaryFile() as f:
                f.write("\n".join(input_data).encode("utf-8"))
                f.flush()
                hook.bulk_load(self.table, f.name)

            cur.execute("SELECT * FROM {}".format(self.table))
            results = [row[0] for row in cur.fetchall()]

    self.assertEqual(sorted(input_data), sorted(results))

def create_request(self, uri):
    query_exists = f"""SELECT EXISTS (
        SELECT A.id FROM {table.request} AS A
        WHERE A.uri = '{uri}'
        AND A.status IN ('COMPLETE', 'PENDING', 'ACTIVE'));"""
    query_insert = f"""INSERT INTO {table.request} (uri, operator)
        VALUES ('{uri}', 'SynopDailyOperator');"""

    request_created = False
    pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                           schema=self.database)
    with closing(pg_hook.get_conn()) as pg_conn:
        with closing(pg_conn.cursor()) as pg_cursor:
            pg_cursor.execute(query_exists)
            if not pg_cursor.fetchone()[0]:
                pg_cursor.execute(query_insert)
                request_created = True
        if request_created:
            pg_conn.commit()
    return request_created

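# Note: the f-strings above interpolate uri directly into the SQL string. A
# parameterized sketch of the existence check (the table identifier still
# cannot be a bound parameter, but the uri value can move server-side):
def request_exists(pg_cursor, uri):
    query = f"""SELECT EXISTS (
        SELECT A.id FROM {table.request} AS A
        WHERE A.uri = %s
        AND A.status IN ('COMPLETE', 'PENDING', 'ACTIVE'));"""
    pg_cursor.execute(query, (uri,))
    return pg_cursor.fetchone()[0]
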
def create_temporary_dbfile(self, request_id, buffer):
    pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                           schema=self.database)
    with closing(pg_hook.get_conn()) as pg_conn:
        with closing(pg_conn.cursor()) as pg_cursor:
            pg_cursor.execute(
                """
                INSERT INTO imgw.temporary_file (request_id, data)
                VALUES (%s, %s)
                ON CONFLICT (request_id) DO UPDATE SET data = EXCLUDED.data
                RETURNING id
                """,
                (request_id, psycopg2.Binary(buffer)))
            temporary_file_id = pg_cursor.fetchone()
            pg_conn.commit()
    return temporary_file_id[0]

def execute(self, context, testing=False):
    """Runs data quality checks for each table in the table list.

    Asserts each table against business-defined SQL metrics.
    """
    self.log.info('DataQualityCheckOperator Starting...')
    self.log.info("Initializing Postgres Master DB Connection...")
    psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
    try:
        conn = psql_hook.get_conn()
        cursor = conn.cursor(cursor_factory=RealDictCursor)
        for table in self._tables:
            data_quality = dict()
            for name, query in self._queries.items():
                self.log.info(f"Running query: {query}")
                cursor.execute(query)
                result = cursor.fetchone()
                result = result.get('count')
                if not result:
                    error = ("Data quality check FAILED. "
                             f"{table} returned no results "
                             f"for query: {name}")
                    self.log.error(error)
                    raise ValueError(error)
                data_quality[name] = result
            self.log.info(
                f"Data quality check on table '{table}' PASSED\n"
                "Results Summary:\n"
                f"{json.dumps(data_quality, indent=4, sort_keys=True)}")
    except (InterfaceError, OperationalError):
        self.log.error("DataQualityCheckOperator FAILED.")
        self.log.error(traceback.format_exc())
        raise Exception("DataQualityCheckOperator FAILED.")
    except Exception:
        self.log.error("DataQualityCheckOperator FAILED.")
        raise Exception("DataQualityCheckOperator FAILED.")
    finally:
        if not testing:
            conn.close()
    self.log.info('DataQualityCheckOperator SUCCESS!')
    return data_quality

def execute(self, context, testing=False):
    """
    Read all data from the Mongo staging DB, process it, and write it to the
    PostgreSQL DB. Uses an UPSERT SQL query to write the data.
    """
    self.log.info('LoadToMasterdbOperator Starting...')
    self.log.info("Initializing Mongo Staging DB Connection...")
    mongo_hook = MongoHook(conn_id=self._mongo_conn_id)
    ports_collection = mongo_hook.get_collection(self._mongo_collection)
    self.log.info("Initializing Postgres Master DB Connection...")
    psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
    psql_conn = psql_hook.get_conn()
    psql_cursor = psql_conn.cursor()
    self.log.info("Loading Staging data to Master Database...")
    try:
        for idx, document in enumerate(ports_collection.find({})):
            document = self._processor.process_item(document)
            staging_id = str(document.get('_id'))
            document['staging_id'] = staging_id
            document.pop('_id')
            psql_cursor.execute(self._sql_query, document)
            psql_conn.commit()
    except (OperationalError, UndefinedTable, OperationFailure):
        self.log.error("Writing to database FAILED.")
        self.log.error(traceback.format_exc())
        raise Exception("LoadToMasterdbOperator FAILED.")
    except Exception:
        self.log.error(traceback.format_exc())
        raise Exception("LoadToMasterdbOperator FAILED.")
    finally:
        if not testing:
            self.log.info('Closing database connections...')
            psql_conn.close()
            mongo_hook.close_conn()

    self.log.info(f'UPSERTED {idx + 1} records into Postgres Database.')
    self.log.info('LoadToMasterdbOperator SUCCESS!')

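# The cursor above binds a dict, so self._sql_query would use named
# placeholders. A hypothetical UPSERT in that shape (table and columns are
# assumptions; only the placeholder style is implied by the code):
EXAMPLE_UPSERT = """
    INSERT INTO ports (staging_id, port_name, country_name)
    VALUES (%(staging_id)s, %(portName)s, %(countryName)s)
    ON CONFLICT (staging_id)
    DO UPDATE SET port_name = EXCLUDED.port_name,
                  country_name = EXCLUDED.country_name;
"""
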
def execute(self, context=None):
    """
    Format the SQL statements with the params_sql mapping, then execute the
    statements one by one.

    Args:
        context: Airflow task context (unused).

    Returns:
        None
    """
    if self.params_sql is not None:
        commands_formatted = [
            S.SQL(q).format(**self.params_sql)
            for q in self.commands_stripped
        ]
    else:
        commands_formatted = [S.SQL(q) for q in self.commands_stripped]

    hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    for qf in commands_formatted:
        self.log.info("Executing Query: {}".format(
            qf.as_string(hook.get_conn())))
        hook.run((qf,))

def earnings_report(ds, **kwargs):
    db = PostgresHook(postgres_conn_id=postgresConnId)
    conn = db.get_conn()
    print("db:", conn)
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        # these hard coded values are placeholders for the upcoming contracts system
        freetown_stakeholder_uuid = "2a34fa81-0683-4d25-94b9-24843ceec3c4"
        freetown_base_contract_uuid = "483a1f4e-0c52-4b53-b917-5ff4311ded26"
        freetown_base_contract_consolidation_uuid = "a2dc79ec-4556-4cc5-bff1-2dbb5fd35b51"
        cursor.execute("""
            SELECT COUNT(tree_id) capture_count,
                person_id,
                stakeholder_uuid,
                MIN(time_created) consolidation_start_date,
                MAX(time_created) consolidation_end_date,
                ARRAY_AGG(tree_id) tree_ids
            FROM (
                SELECT trees.id tree_id, person_id, time_created, stakeholder_uuid,
                rank() OVER (
                    PARTITION BY person_id
                    ORDER BY time_created ASC
                )
                FROM trees
                JOIN planter
                ON trees.planter_id = planter.id
                JOIN entity
                ON entity.id = planter.person_id
                AND earnings_id IS NULL
                AND planter.organization_id IN (
                    select entity_id from getEntityRelationshipChildren(178)
                )
                AND time_created > TO_TIMESTAMP(
                    '2021-09-01 00:00:00', 'YYYY-MM-DD HH24:MI:SS'
                )
                AND time_created < TO_TIMESTAMP(
                    '2021-11-12 00:00:00', 'YYYY-MM-DD HH24:MI:SS'
                )
                AND trees.approved = true
                AND trees.active = true
            ) rank
            GROUP BY person_id, stakeholder_uuid
            ORDER BY person_id;
        """)
        print("SQL result:", cursor.query)
        for row in cursor:
            print(row)

            # calculate the earnings based on FCC logic
            multiplier = (row['capture_count'] - row['capture_count'] % 100) / 10 / 100
            if multiplier > 1:
                multiplier = 1
            print("multiplier " + str(multiplier))

            maxPayout = 1200000
            earningsCurrency = 'SLL'
            earnings = multiplier * maxPayout

            updateCursor = conn.cursor()
            updateCursor.execute("""
                INSERT INTO earnings.earnings(
                    worker_id,
                    contract_id,
                    funder_id,
                    currency,
                    amount,
                    calculated_at,
                    consolidation_rule_id,
                    consolidation_period_start,
                    consolidation_period_end,
                    status
                )
                VALUES (%s, %s, %s, %s, %s, NOW(), %s, %s, %s, 'calculated')
                RETURNING *
            """, (
                row['stakeholder_uuid'],
                freetown_base_contract_uuid,
                freetown_stakeholder_uuid,
                earningsCurrency,
                earnings,
                freetown_base_contract_consolidation_uuid,
                row['consolidation_start_date'],
                row['consolidation_end_date']))
            print("SQL result:", updateCursor.query)
            earningsId = updateCursor.fetchone()[0]
            print(earningsId)
            updateCursor.execute("""
                UPDATE trees
                SET earnings_id = %s
                WHERE id = ANY(%s)
            """, (earningsId, row['tree_ids']))
            conn.commit()
        return 0
    except Exception as e:
        print("got error when executing SQL:", e)
        print("SQL result:", updateCursor.query)
        raise ValueError('Error executing query')

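# A quick sanity check of the multiplier rule above: only full hundreds of
# captures are credited, and the payout share caps at 1.0 (the helper name
# here is illustrative, not part of the DAG):
def payout_multiplier(capture_count: int) -> float:
    multiplier = (capture_count - capture_count % 100) / 10 / 100
    return min(multiplier, 1.0)

assert payout_multiplier(250) == 0.2    # 200 credited captures -> 0.2 share
assert payout_multiplier(99) == 0.0     # below the first hundred, no payout
assert payout_multiplier(2500) == 1.0   # capped at the maximum payout share
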
def get_data(self):
    pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    with closing(pg_hook.get_conn()) as conn:
        df = pd.read_sql(self.postgres_sql, conn)
    return df