def process_song_file(cur: psycopg2.connect, filepath: str) -> None:
    """Extract and load data for song and artist from log files.

    Parameters
    ----------
    cur: psycopg2.connect
        Psycopg2 database cursor for inserting data.
    filepath: str
        Path for log file (JSON, one record per line).
    """
    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    # (fix: original had a duplicated `song_data = song_data = ...` assignment)
    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
    song_data = df[song_columns].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    artist_columns = [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    ]
    artist_data = df[artist_columns].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)
def upsert_rows(conn: psycopg2.connect, df: pd.DataFrame, table: str, pkeys: list) -> None:
    """Bulk upsert a dataframe into *table*.

    Builds one INSERT ... ON CONFLICT ... DO UPDATE statement covering every
    row of *df*, using ``cursor.mogrify()`` to quote/escape each value group
    (the original concatenated ``str(tuple)``, which breaks on quotes and is
    injectable through the data).

    Parameters
    ----------
    conn : psycopg2.connect
        Open database connection.
    df : pd.DataFrame
        Rows to upsert; column names must match the table's columns.
    pkeys : list
        Column names forming the conflict target (primary key).

    Returns 1 on database error (kept for backward compatibility), else None.
    """
    cursor = conn.cursor()
    # One "(%s, %s, ...)" template per row, rendered safely by mogrify.
    row_template = '(' + ', '.join(['%s'] * len(df.columns)) + ')'
    rows = [tuple(x) for x in df.to_numpy()]
    values_sql = ', '.join(
        cursor.mogrify(row_template, row).decode() for row in rows
    )

    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    insert_statement = "INSERT INTO %s(%s) VALUES %s" % (table, cols, values_sql)
    on_conflict_statement = 'ON CONFLICT (' + ', '.join(map(str, pkeys)) + ')'
    do_update_statement = _create_update_set_statement(list(df.columns))

    # SQL query to execute
    query = insert_statement + ' ' + on_conflict_statement + ' ' + do_update_statement
    try:
        cursor.execute(query)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1  # legacy error signal; callers may test for it
    cursor.close()
def insert_mood(conn: psycopg2.connect, update: Update, user_data: Dict[str, Union[str, List[str]]]) -> None:
    """Persist one mood-tracker entry.

    Inserts one row per selected reason; when no reasons were given a single
    row is written with reason NULL. All rows of an entry share a message_id
    derived from the chat id and a microsecond timestamp.
    """
    insert_sql = """
    INSERT INTO moodtracker (message_id, chat_id, mood, reason, note, date)
    VALUES(%s, %s, %s, %s, %s, %s)
    """
    chat_id = update.message.chat_id
    mood = user_data['mood']
    note = user_data['note']
    reasons = user_data['reasons']
    date = datetime.now()
    message_id = str(chat_id) + date.strftime("%Y%m%d%H%M%S%f")

    cur = conn.cursor()
    if reasons:
        for reason in reasons:
            cur.execute(insert_sql, (message_id, chat_id, mood, reason, note, date))
    else:
        cur.execute(insert_sql, (message_id, chat_id, mood, None, note, date))
    conn.commit()
    cur.close()
def insert_data(conn: psycopg2.connect, df: pd.DataFrame) -> None:
    '''
    Bulk insert dataframe into advertisementdata table.

    This function was inspired by Naysan Saran's article "Pandas to PostgreSQL
    using Psycopg2: Bulk Insert Performance Benchmark", in which the author
    chose a variety of bulk insert methods and compared their execution time.
    Saving the dataframe to a StringIO object and then copying this to the
    database proved to be the most efficient when dealing with millions of
    records.

    Source: https://naysan.ca/2020/05/09/pandas-to-postgresql-using-psycopg2-bulk-insert-performance-benchmark/
    '''
    set_index(conn, df)

    # Serialize the dataframe as CSV into an in-memory buffer for COPY.
    buffer = StringIO()
    df.to_csv(buffer, index_label='id', header=False)
    buffer.seek(0)

    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, 'advertisementdata', sep=",")
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        logging.error(f"Error inserting data: {error}")
        conn.rollback()
    finally:
        # fix: the original closed the cursor twice on the error path;
        # a single close in `finally` covers both outcomes.
        cursor.close()
def drop_tables(cur: psycopg2.connect) -> None:
    """Drop every application table.

    fix: uses DROP TABLE IF EXISTS so teardown is idempotent — the original
    raised if any table was already absent, aborting the rest of the loop.
    """
    tables = [
        "Employees",
        "OrderDetails",
        "Categories",
        "Customers",
        "Orders",
        "Products",
        "Shippers",
        "Suppliers"
    ]
    for table in tables:
        cur.execute(f"DROP TABLE IF EXISTS {table.lower()};")
def create_table_teams(conn: psycopg2.connect, overwrite: bool=False) -> None:
    '''
    Creates the columns and relationships of the TEAMS table.
    When overwrite is True, any existing TEAMS table is dropped first.
    '''
    table_name = 'TEAMS'
    drop_clause = f'''DROP TABLE IF EXISTS {table_name};''' if overwrite else ''

    ddl = f'''
    {drop_clause}
    CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT ,
        SEASON_ID SMALLINT ,
        TEAM_ID SMALLINT ,
        MANAGER_ID VARCHAR(50) ,
        TEAM_NAME VARCHAR(50) ,
        MANAGER_NAME VARCHAR(50) ,
        ESPN_NAME VARCHAR(50) ,
        CONSTRAINT TEAMS_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID, TEAM_ID)
    );
    '''
    cursor = conn.cursor()
    cursor.execute(ddl)
    conn.commit()
    cursor.close()
def init_tables(cur: psycopg2.connect) -> None:
    """Execute every SQL fixture found in the init fixtures directory."""
    init_dir = os.path.join(FIXTURES_PATH, "init")
    for name in listdir(init_dir):
        fixture_path = os.path.join(init_dir, name)
        with open(fixture_path, 'r') as handle:
            cur.execute(handle.read().strip())
def create_table_weeks(conn: psycopg2.connect, overwrite: bool=False) -> None:
    '''
    Creates the columns and relationships of the WEEKS table.
    When overwrite is True, any existing WEEKS table is dropped first.
    '''
    table_name = 'WEEKS'
    drop_clause = f'''DROP TABLE IF EXISTS {table_name};''' if overwrite else ''

    ddl = f'''
    {drop_clause}
    CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT ,
        SEASON_ID SMALLINT ,
        WEEK_NUMBER SMALLINT ,
        MATCHUP_PERIOD SMALLINT ,
        REG_SEASON_FLAG SMALLINT ,
        CONSTRAINT WEEKS_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID, WEEK_NUMBER)
    );
    '''
    cursor = conn.cursor()
    cursor.execute(ddl)
    conn.commit()
    cursor.close()
def table_columns(conn: psycopg2.connect, table: str) -> list:
    """Pull all column names of a public-schema table.

    fixes: the return annotation said ``tuple`` but a list is returned, and
    the table name was interpolated into the SQL — now bound as a query
    parameter instead.

    Returns the list of column names, or 1 on database error (legacy
    behavior kept for backward compatibility).
    """
    query = '''
    SELECT COLUMN_NAME
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_SCHEMA = 'public'
    AND TABLE_NAME = %s
    '''
    cursor = conn.cursor()
    try:
        cursor.execute(query, (table.lower(),))
        cols = [col[0] for col in cursor.fetchall()]
        cursor.close()
        return cols
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
def bootstrap( self ): """ Initialize our database connection after our fork() has occured, this is due to the nature of the psycopg2 library when used with psycopg2.connect( async = True ) http://initd.org/psycopg/docs/usage.html#thread-and-process-safety @params None @return None """ # Initialize database connection sqlCon = Database( database = "SpfAudit", async = True ) # Obtain our database descriptor once were ready to process self.block( sqlCon ) sql = sqlCon.cursor() # Ensure schema intact before processing sql.execute( self.query[ 'ctable' ] ) self.block( sql.connection ) # Propagate our master connection and cursor objects into # state structure. self.state.update( { 'sqlCon' : sqlCon, 'sql' : sql, } )
def clear_tables(cur: psycopg2.connect) -> None:
    """Delete every row from each application table (tables themselves kept)."""
    for table in (
        "Employees",
        "OrderDetails",
        "Categories",
        "Customers",
        "Orders",
        "Products",
        "Shippers",
        "Suppliers",
    ):
        cur.execute(f"DELETE FROM {table.lower()} WHERE TRUE ;")
def add_timer_to_db(conn: psycopg2.connect, cursor, user_id: int, chat_id: int, time: datetime):
    """Add a timer/alert row to the database.

    The stored time is shifted back 5 hours (hard-coded server-to-local
    offset — presumably UTC+5 local time; confirm against deployment).
    """
    print('add_timer_to_db')
    alert_time = time - timedelta(hours=5)
    # fix: parameterized query — the original interpolated values into an
    # f-string, which is SQL-injectable and quoting-fragile.
    cursor.execute(
        "INSERT INTO alerts (user_id, chat_id, time) VALUES (%s, %s, %s)",
        (user_id, chat_id, alert_time),
    )
    conn.commit()
def create_table_scores(conn: psycopg2.connect, overwrite: bool=False) -> None:
    '''
    Creates the columns and relationships of the WEEKLY_SCORES table.
    When overwrite is True, any existing SCORES table is dropped first.
    '''
    table_name = 'SCORES'
    drop_clause = f'''DROP TABLE IF EXISTS {table_name};''' if overwrite else ''

    ddl = f'''
    {drop_clause}
    CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT ,
        SEASON_ID SMALLINT ,
        WEEK_NUMBER BIGINT ,
        TEAM_ID SMALLINT ,
        TEAM_ID_OPP SMALLINT ,
        SCORE NUMERIC(5, 2) ,
        SCORE_OPP NUMERIC(5, 2) ,
        WLT_POINTS NUMERIC(2, 1) ,
        WIN_IND SMALLINT ,
        LOSS_IND SMALLINT ,
        TIE_IND SMALLINT ,
        ALL_PLAY_WLT_POINTS NUMERIC(3, 1) ,
        ALL_PLAY_WINS SMALLINT ,
        ALL_PLAY_LOSSES SMALLINT ,
        ALL_PLAY_TIES SMALLINT ,
        CUM_SCORE NUMERIC(6, 2) ,
        CUM_SCORE_OPP NUMERIC(6, 2) ,
        CUM_WLT_POINTS NUMERIC(3, 1) ,
        CUM_WINS SMALLINT ,
        CUM_LOSSES SMALLINT ,
        CUM_TIES SMALLINT ,
        CUM_ALL_PLAY_WLT_POINTS NUMERIC(4, 1) ,
        CUM_ALL_PLAY_WINS SMALLINT ,
        CUM_ALL_PLAY_LOSSES SMALLINT ,
        CUM_ALL_PLAY_TIES SMALLINT ,
        CUM_SCORE_PER_WEEK NUMERIC(5, 2) ,
        CUM_SCORE_OPP_PER_WEEK NUMERIC(5, 2) ,
        CUM_ALL_PLAY_WLT_POINTS_PER_WEEK NUMERIC(3, 1) ,
        RECORD VARCHAR(10) ,
        ALL_PLAY_RECORD VARCHAR(10) ,
        STANDINGS SMALLINT ,
        HOME_OR_AWAY VARCHAR(10) ,
        CONSTRAINT WEEKLY_SCORES_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID, WEEK_NUMBER, TEAM_ID)
    );
    '''
    cursor = conn.cursor()
    cursor.execute(ddl)
    conn.commit()
    cursor.close()
def execute_sql(cnx: connect, sql: str, values: tuple) -> None:
    """Execute one or more SQL statements and collect per-statement results.

    If *sql* contains '.sql' it is treated as a file path and the file's
    contents are executed instead. The text is split on ';' and each piece is
    run with *values* bound. SELECTs collect column names and rows; DELETEs
    and other statements record a timing message.

    NOTE(review): the connection is closed in the finally block — *cnx* is
    unusable after this call. Also, any statement containing '--' anywhere
    (not just a leading comment) is skipped; confirm that is intended.
    """
    start = time.time()
    cursor = cnx.cursor()
    # Treat the argument as a script path when it mentions '.sql'.
    if '.sql' in sql:
        file = open(sql, 'r')
        sql = file.read()
        file.close()
    result = []
    # Naive split — will break statements containing literal ';' characters.
    sql_command = sql.split(';')
    try:
        for sql in sql_command:
            # Skip empty fragments and anything containing '--' (see NOTE above).
            if sql == '' or '--' in sql:
                continue
            try:
                cursor.execute(sql, values)
                if 'SELECT' in sql:
                    # Capture column names from the cursor description.
                    column_names = tuple(desc[0] for desc in cursor.description)
                    result.append({
                        "query": sql,
                        'column_names': column_names,
                        "data": cursor.fetchall()
                    })
                elif 'DELETE' in sql:
                    stop = time.time() - start
                    result.append({
                        "query": sql,
                        "data": 'query executed in {:06.3f}s. {} rows affected'.format(
                            stop, cursor.rowcount)
                    })
                else:
                    stop = time.time() - start
                    result.append({
                        "query": sql,
                        "data": 'query executed in {:06.3f}s'.format(stop)
                    })
            except ProgrammingError as error:
                # Record the failure but continue with remaining statements.
                print(error)
                print("query unsuccessful: {}".format(sql))
                result.append({
                    "query": sql,
                    "data": 'query unsuccessful: {}'.format(error)
                })
    finally:
        cursor.close()
        cnx.close()
    return result
def run(conn: connect):
    # NOTE(review): "insert into fetched_records" has no column/VALUES clause —
    # PostgreSQL will reject it with a syntax error. Looks like placeholder or
    # smoke-test code; confirm intent before relying on it.
    with conn:
        with conn.cursor() as curs:
            curs.execute("insert into fetched_records")
    with conn:
        with conn.cursor() as curs:
            # NOTE(review): psycopg2's cursor.execute() returns None, so this
            # prints None — fetching rows would require curs.fetchall().
            a = curs.execute("select * from fetched_records")
            print(a)
def add_chat_to_db(conn: psycopg2.connect, cursor, chat_name: str, chat_id: int):
    """Add a chat to the chats table unless a row with this chat_id exists.

    (The original docstring claimed False is returned for an existing chat,
    but nothing is returned in either case — behavior kept as-is.)
    """
    # fix: parameterized queries — the original built SQL with f-strings,
    # which is injectable through chat_name and breaks on quotes.
    cursor.execute("SELECT * FROM chats WHERE chat_id = %s", (int(chat_id),))
    chat = cursor.fetchone()
    if not chat:
        cursor.execute(
            "INSERT INTO chats (chat_name, chat_id) VALUES (%s, %s)",
            (chat_name, int(chat_id)),
        )
        conn.commit()
def copy_s3_to_staging(cur: psycopg2, conn: psycopg2.connect) -> None:
    """
    Copy the contents of the S3 buckets to the staging tables in the
    database, committing after each COPY statement.

    :param cur: PostgreSQL cursor
    :param conn: PostgreSQL connection object
    :return: None
    """
    for copy_query in copy_table_queries:
        logger.debug("Copying data to staging table as\n{}".format(copy_query))
        cur.execute(copy_query)
        conn.commit()
def upload_comment(self, comment_data: Comment, conn: psycopg2.connect):
    """Insert a single comment row into the comment table and commit."""
    row = (
        comment_data.message,
        comment_data.author_name,
        comment_data.thumbnails,
        comment_data.timestamp_msec,
        comment_data.timestamp_text,
        comment_data.purchase_amount,
        comment_data.movie_id,
    )
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO comment(message,author_name,thumbnails,timestamp_msec,timestamp_text,purchase_amount,movie_id) VALUES (%s,%s,%s,%s,%s,%s,%s);",
        row)
    conn.commit()
    cursor.close()
def create_staging_tables(cur: psycopg2, conn: psycopg2.connect) -> None:
    """
    Create the temporary tables used for staging, committing after each
    statement.

    :param cur: PostgreSQL cursor
    :param conn: PostgreSQL connection object
    :return: None
    """
    for ddl_query in insert_temp_table_queries:
        logger.debug("Creating staging table as \n{}".format(ddl_query))
        cur.execute(ddl_query)
        conn.commit()
def execute_query(conn: psycopg2.connect, query: str) -> None:
    """Execute a single query: commit on success, roll back and log on error.

    Parameters
    ----------
    conn : psycopg2.connect
        Open database connection.
    query : str
        SQL statement to execute.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        logging.error(f"Unable to execute query. Error: {error}")
        conn.rollback()
    finally:
        # fix: the original closed the cursor twice on the error path;
        # one close in `finally` covers success and failure alike.
        cursor.close()
def upload_comments(self, comment_datas: List[Comment], conn: psycopg2.connect):
    """Bulk-insert comment rows into the comment table and commit."""
    rows = [
        (
            c.message,
            c.author_name,
            c.thumbnails,
            c.timestamp_msec,
            c.timestamp_text,
            c.purchase_amount,
            c.movie_id,
        )
        for c in comment_datas
    ]
    cursor = conn.cursor()
    cursor.executemany(
        "INSERT INTO comment(message,author_name,thumbnails,timestamp_msec,timestamp_text,purchase_amount,movie_id) VALUES (%s,%s,%s,%s,%s,%s,%s);",
        rows)
    conn.commit()
    cursor.close()
def access_control_db(con: psycopg2.connect) -> None:
    # Stated intent (from the original comment): create two users — an admin
    # to populate the db, and a query-only user for front-end authentication.
    # NOTE(review): as written, this only checks whether the 'music_man' role
    # exists (COUNT over pg_roles) and prints the count — no users are created
    # and nothing is returned, so the original `-> psycopg2.connect` annotation
    # was wrong; corrected to None. Looks like an unfinished stub — confirm.
    con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cur = con.cursor()
    try:
        # Safely compose identifiers with psycopg2.sql; the role name itself
        # is passed as a bound query parameter.
        query = sql.SQL(
            "select COUNT(*) from {table} where {pkey} = %s").format(
                table=sql.Identifier('pg_roles'),
                pkey=sql.Identifier('rolname'))
        cur.execute(query, ('music_man', ))
        print(cur.fetchone()[0])
    except psycopg2.DatabaseError as e:
        print('Error in DB access control: {}'.format(e))
def create_table(conn: psycopg2.connect) -> None:
    """Create the moodtracker table if it does not already exist."""
    ddl = """
    CREATE TABLE IF NOT EXISTS moodtracker (
        message_id varchar(40) NOT NULL,
        chat_id varchar(10) NOT NULL,
        mood varchar(10),
        reason varchar(25),
        note varchar(255),
        date timestamp
    )
    """
    cursor = conn.cursor()
    cursor.execute(ddl)
    cursor.close()
    conn.commit()
def get_score(conn: psycopg2.connect) -> float:
    """Return the average mood score (1–5) over distinct entries, rounded to
    one decimal.

    Raises
    ------
    ValueError
        When the moodtracker table has no entries (AVG returns NULL; the
        original crashed with ``TypeError: round(None)`` in that case).
    """
    sql = """
    SELECT AVG(value)
    FROM (
        SELECT DISTINCT message_id, mood, date,
        CASE
            WHEN mood = 'Awesome' THEN 5
            WHEN mood = 'Good' THEN 4
            WHEN mood = 'Okay' THEN 3
            WHEN mood = 'Bad' THEN 2
            WHEN mood = 'Terrible' THEN 1
        END AS value
        FROM moodtracker
        ORDER BY date ASC
    ) AS mean
    """
    cur = conn.cursor()
    cur.execute(sql)
    row = cur.fetchone()
    cur.close()
    if row is None or row[0] is None:
        raise ValueError("no mood entries to average")
    return round(row[0], 1)
def drop_tables(cur: psycopg2.connect, conn: psycopg2.connect) -> None:
    """
    Drops each table using the queries in `drop_table_queries` list,
    committing after each drop.

    Parameters
    ----------
    cur : psycopg2.cursor
        Cursor for accessing database with psycopg.
    conn : psycopg2.connect
        Database connection instance.
    """
    for drop_query in drop_table_queries:
        cur.execute(drop_query)
        conn.commit()
def record_exists(record: Record, conn: connect):
    """Return True when fetched_records holds a row for this record's id."""
    lookup = (
        "SELECT record_id_updated_at FROM fetched_records WHERE record_id = %(record_id)s"
    )
    with conn:
        with conn.cursor() as curs:
            curs.execute(lookup, {'record_id': record.meta.record_id})
            row = curs.fetchone()
    return row is not None and len(row) != 0
def drop_rows(conn: psycopg2.connect, table: str, where_condition: str) -> None:
    '''
    Drops rows from a table based on a set of conditions.

    NOTE: *table* and *where_condition* are interpolated directly into the
    SQL text by design — never pass untrusted input to this function.

    Returns 1 on database error (legacy behavior), else None.
    '''
    delete_sql = f'''
    DELETE FROM {table}
    WHERE {where_condition}
    '''
    cursor = conn.cursor()
    try:
        cursor.execute(delete_sql)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()
def run_sql_etl(sql: str, conn: pg.connect, table_name: str):
    """Run the given SQL ETL script on the provided PostgreSQL connection.

    Args:
        sql (str): SQL script to run
        conn (psycopg2.connect): PostgreSQL database connection
        table_name (str): Table name for logging purposes
    """
    started_at = time()
    log.debug('Running SQL ETL for "%s" table', table_name)
    with conn.cursor() as cur:
        cur.execute(sql)
        conn.commit()
    log.info(
        'SQL ETL for table "%s" completed in: %s seconds',
        table_name,
        round(time() - started_at, 3),
    )
def fetch_local_s3_url(record: Record, query: Search, conn: connect):
    """Look up the local S3 location for a record.

    Returns ``(True, (s3_bucket, s3_location))`` when a row exists for the
    record id, otherwise the one-element tuple ``(False,)``.
    """
    lookup = (
        "SELECT s3_bucket, s3_location FROM fetched_records where record_id = %(record_id)s limit 1"
    )
    with conn:
        with conn.cursor() as curs:
            curs.execute(lookup, {'record_id': record.meta.record_id})
            row = curs.fetchone()
    if row is None:
        return False,
    return True, row
def process_data(
    cur: psycopg2.connect,
    conn: psycopg2.connect,
    filepath: str,
    func: Callable,
) -> None:
    """
    Perform data processing for specific raw files.

    Walks *filepath* recursively, collects every ``*.json`` file, then runs
    *func* on each one, committing after every file.

    Parameters
    ----------
    cur : psycopg2.cursor
        Cursor for accessing database with psycopg.
    conn : psycopg2.connect
        Database connection instance.
    filepath : str
        Path for target file.
    func : Callable
        Function to use to process file.
    """
    # collect all *.json files under filepath (recursively)
    all_files = []
    for root, _dirs, _files in os.walk(filepath):
        matched = glob.glob(os.path.join(root, '*.json'))
        all_files.extend(os.path.abspath(m) for m in matched)

    total = len(all_files)
    print('{} files found in {}'.format(total, filepath))

    # process each file, committing per file
    for index, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(index, total))
def bulk_load_df(data: pd.DataFrame, table_name: str, conn: pg.connect):
    """Bulk insert a pandas dataframe into a PostgreSQL table via COPY.

    Args:
        data (pandas.Dataframe): Data for insertion
        table_name (str): Target table name
        conn (psycopg2.connect): PostgreSQL database connection
    """
    # Render the dataframe as header-less CSV into an in-memory buffer.
    csv_buffer = StringIO(data.to_csv(index=None, header=None, na_rep=''))
    with conn.cursor() as cur:
        cur.copy_from(
            csv_buffer,
            table_name,
            columns=data.columns,
            sep=',',
            null='',
        )
        conn.commit()