def fetch_by_one(db_cursor: cursor) -> Iterable[tuple]:
    """Yield every remaining row of the cursor's result set, one at a time.

    :param db_cursor: Database cursor with a pending result set.
    :return: Generator over the fetched database rows.
    """
    while True:
        record = db_cursor.fetchone()
        if not record:
            return
        yield record
def manga_has_author(self, manga_id: int, *, cur: Cursor = NotImplemented) -> bool:
    """Return True when at least one author row exists for the given manga."""
    cur.execute(
        'SELECT EXISTS(SELECT 1 FROM manga_authors WHERE manga_id=%s) as "exists"',
        (manga_id, ),
    )
    row = cur.fetchone()
    return row['exists']
def default_value(pg_cur: cursor, table_schema: str, table_name: str, column: str) -> str:
    """
    Returns the default value of the column as SQL text ('NULL' when none).

    Parameters
    ----------
    pg_cur
        the psycopg cursor
    table_schema
        the table schema
    table_name
        the table name
    column
        the column name
    """
    # see https://stackoverflow.com/a/8148177/1548052
    # Identifiers were previously spliced in with str.format (SQL injection
    # risk); bind them as parameters instead — the regclass cast accepts a
    # bound text value.
    sql = ("SELECT pg_get_expr(d.adbin, d.adrelid) AS default_value\n"
           "FROM pg_catalog.pg_attribute a\n"
           "LEFT JOIN pg_catalog.pg_attrdef d ON (a.attrelid, a.attnum) = (d.adrelid, d.adnum)\n"
           "WHERE NOT a.attisdropped -- no dropped (dead) columns\n"
           "AND a.attnum > 0 -- no system columns\n"
           "AND a.attrelid = %s::regclass\n"
           "AND a.attname = %s;")
    pg_cur.execute(sql, ('{}.{}'.format(table_schema, table_name), column))
    # pg_get_expr yields NULL (Python None) when the column has no default.
    return pg_cur.fetchone()[0] or 'NULL'
def primary_key(pg_cur: cursor, schema_name: str, table_name: str) -> str:
    """
    Returns the primary key column of a table.

    Parameters
    ----------
    pg_cur
        psycopg cursor
    schema_name
        the schema name
    table_name
        the table name

    Raises
    ------
    TableHasNoPrimaryKey
        if the table has no primary key constraint
    """
    # Identifiers were previously interpolated with str.format (SQL injection
    # risk); bind them as query parameters instead.
    sql = ("SELECT c.column_name"
           " FROM information_schema.key_column_usage AS c"
           " LEFT JOIN information_schema.table_constraints AS t"
           " ON t.constraint_name = c.constraint_name"
           " WHERE t.table_name = %s"
           " AND t.table_schema = %s"
           " AND t.constraint_type = 'PRIMARY KEY'")
    pg_cur.execute(sql, (table_name, schema_name))
    row = pg_cur.fetchone()
    # Previously a broad `except Exception` around the subscript hid any real
    # error; test the "no row" case explicitly instead.
    if row is None:
        raise TableHasNoPrimaryKey(sql)
    return row[0]
def process_log_file(cur: extensions.cursor, filepath: str):
    """
    Load one newline-delimited-JSON log file into the database.

    Reads the file line by line, keeps only "NextSong" page events, breaks
    the millisecond timestamps down into calendar units for the time table,
    then inserts user rows and one songplay row per event.

    :param cur: connection cursor
    :param filepath: filepath to the file with songs data
    """
    # Each line of the file is one JSON object (NDJSON).
    data: List = []
    with open(filepath, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break
            else:
                data.append(json.loads(line))
    df = pd.DataFrame(data)
    # Only "NextSong" events represent actual song plays.
    df = df.loc[df['page'] == 'NextSong']
    # convert timestamp column to datetime ('ts' is milliseconds since epoch)
    t = df['ts']
    t = pd.to_datetime(t, unit='ms')
    time_data = (
        t.dt.tz_localize('UTC').values,
        t.dt.hour.tolist(),
        t.dt.day.tolist(),
        # NOTE(review): Series.dt.week is deprecated/removed in newer pandas;
        # the commented alternative is the modern spelling — confirm the
        # pinned pandas version before switching.
        t.dt.week.tolist(),  # t.dt.isocalendar().week.tolist(),
        t.dt.month.tolist(),
        t.dt.year.tolist(),
        t.dt.weekday.tolist()
    )
    column_labels = ('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday')
    time_df = pd.DataFrame(dict(zip(column_labels, time_data)))
    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))
    # NOTE(review): user rows are inserted once per log event, not
    # de-duplicated — presumably users_table_insert upserts; verify its SQL.
    user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
    for i, row in user_df.iterrows():
        cur.execute(users_table_insert, row)
    for index, row in df.iterrows():
        # Look up song/artist ids by (title, artist name, duration).
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()
        if results:
            song_id, artist_id = results
        else:
            # No match in the song/artist tables: store NULL foreign keys.
            song_id, artist_id = None, None
        d = pd.to_datetime(row.ts, unit='ms')
        start_time, user_id, level, session_id, user_agent = d.tz_localize('UTC'), row.userId, row.level, row.sessionId, row.userAgent
        songplay_data: Tuple = (start_time, user_id, level, song_id, artist_id, session_id, user_agent)
        cur.execute(songplays_table_insert, songplay_data)
def find_parent(cursor: extensions.cursor, named_location_id: int, parents: Dict[str, str]):
    """
    Walk up the named-location tree recursively, recording the names of the
    'site' and 'domain' ancestors into the given collection.

    :param cursor: A database cursor object.
    :param named_location_id: The named location ID.
    :param parents: Collection to append to.
    """
    sql = '''
        select prnt_nam_locn_id, nam_locn.nam_locn_name, type.type_name
        from nam_locn_tree
        join nam_locn on nam_locn.nam_locn_id = nam_locn_tree.prnt_nam_locn_id
        join type on type.type_id = nam_locn.type_id
        where chld_nam_locn_id = %s
    '''
    cursor.execute(sql, [named_location_id])
    row = cursor.fetchone()
    if row is None:
        # Reached the root of the tree: nothing more to record.
        return
    parent_id, name, type_name = row[0], row[1], row[2]
    kind = type_name.lower()
    if kind == 'site':
        parents['site'] = name
    elif kind == 'domain':
        parents['domain'] = name
    # Continue climbing toward the root.
    find_parent(cursor, parent_id, parents)
def test_tables(cur: psycopg2Ext.cursor, conn: psycopg2Ext.connection) -> None:
    """
    Description: Test table status to make sure tables exists.

    Arguments:
        cur (psycopg2Ext.cursor): cursor object
        conn (psycopg2Ext.connection): connection object

    Returns:
        None
    """
    print("\n==================== TEST -- table status ====================")
    for ddl in create_table_queries:
        # Pull the table name out of "CREATE TABLE IF NOT EXISTS <name> (...".
        start = ddl.find("EXISTS") + len("EXISTS")
        tbl_name = ddl[start:ddl.find("(")].strip()
        exists_query = f"""select exists(select * from information_schema.tables where table_name='{tbl_name}')"""
        try:
            cur.execute(exists_query)
        except psycopg2.Error as e:
            logger.warning(f"ERROR: Could not retrieve table info with query: {exists_query}", e)
            return
        conn.commit()
        try:
            tbl_status = cur.fetchone()[0]
        except psycopg2.Error as e:
            logger.warning(f"ERROR: Could not fetch table status for table: {tbl_name}", e)
            return
        print(f"Table '{tbl_name}' exists status: {tbl_status}.")
def geometry_type(pg_cur: cursor, table_schema: str, table_name: str, column: str = 'geometry') -> (str, int):
    """
    Returns the geometry type of a column as a tuple (type, srid), or None
    when the column is not registered in geometry_columns.

    Parameters
    ----------
    pg_cur
        the psycopg cursor
    table_schema
        the table schema
    table_name
        the table name
    column:
        the geometry column name, defaults to "geometry"
    """
    # Identifiers were previously spliced in with str.format (SQL injection
    # risk); bind them as query parameters instead.
    sql = ("SELECT type, srid "
           "FROM geometry_columns "
           "WHERE f_table_schema = %s "
           "AND f_table_name = %s "
           "AND f_geometry_column = %s;")
    pg_cur.execute(sql, (table_schema, table_name, column))
    res = pg_cur.fetchone()
    if res:
        return res[0], res[1]
    else:
        return None
def find_service_manga(self, service_id: int, title_id: str, *, cur: Cursor = NotImplemented) -> DictRow:
    """Fetch the manga_service row matching the given service and title ids."""
    query = 'SELECT * from manga_service WHERE service_id=%s AND title_id=%s'
    cur.execute(query, (service_id, title_id))
    return cur.fetchone()
def find_manga_by_title(self, title: str, *, cur: Cursor = NotImplemented) -> Optional[Manga]:
    """Look up a single manga by its exact title, or None when absent."""
    cur.execute('SELECT * FROM manga WHERE title=%s LIMIT 1', (title, ))
    row = cur.fetchone()
    if not row:
        return None
    return Manga(**row)
def check_table(db: Cursor, table_name: str) -> bool:
    """Return True when a table with the given name exists, via information_schema."""
    sql = """
        SELECT EXISTS (
            SELECT *
            FROM information_schema.tables
            WHERE table_name = %s
        );
    """
    db.execute(sql, (table_name, ))
    row = db.fetchone()
    return row[0]
def get_role(cursor: Cursor, name: str) -> Role:
    """
    Look up a single role by name; raise KeyError when no such role exists.
    """
    query(cursor, "{} WHERE rolname = %s".format(_ROLE_SELECT), name)
    if not cursor.rowcount:
        raise KeyError(name)
    return cursor.fetchone()
def get_service_whole(
        self, service_id: int, *, cur: Cursor = NotImplemented) -> Optional[ServiceWhole]:
    """Fetch the service_whole row of a service as a ServiceWhole model, or None."""
    cur.execute('SELECT * FROM service_whole WHERE service_id=%s', [service_id])
    row = cur.fetchone()
    if not row:
        return None
    return ServiceWhole(**row)
def get_newest_chapter(self, manga_id: int, service_id: Optional[int] = None, *, cur: Cursor = NotImplemented):
    """Return the most recently released chapter of a manga, optionally
    restricted to a single service."""
    if service_id is None:
        sql = 'SELECT * FROM chapters WHERE manga_id=%s ORDER BY release_date DESC LIMIT 1'
        args = (manga_id, )
    else:
        sql = 'SELECT * FROM chapters WHERE manga_id=%s AND service_id=%s ORDER BY release_date DESC LIMIT 1'
        args = (manga_id, service_id)
    cur.execute(sql, args)
    return cur.fetchone()
def get_manga(self, manga_id: int, *, cur: Cursor = NotImplemented) -> Optional[Manga]:
    """Fetch a manga by id and wrap it in a Manga model, or None when absent."""
    cur.execute('SELECT * FROM manga WHERE manga_id=%s', (manga_id, ))
    row = cur.fetchone()
    if not row:
        return None
    return Manga(**row)
def get_author_by_name(self, name: str, *, cur: Cursor = NotImplemented) -> Optional[Author]:
    """Fetch a single author by exact name, or None when no author matches."""
    cur.execute('SELECT * FROM authors WHERE name=%s LIMIT 1', (name, ))
    row = cur.fetchone()
    return None if row is None else Author(**row)
def get_manga_service(self, service_id: int, title_id: str, *, cur: Cursor = NotImplemented
                      ) -> Optional[MangaService]:
    """Fetch the manga_service row joined with its manga row, as a
    MangaService model, or None when no match exists."""
    sql = ('SELECT * FROM manga_service ms '
           'INNER JOIN manga m ON ms.manga_id = m.manga_id '
           'WHERE service_id=%s AND ms.title_id=%s')
    cur.execute(sql, (service_id, title_id))
    row = cur.fetchone()
    if not row:
        return None
    return MangaService(**row)
def process_log_file(cur: cursor, filepath: str) -> None: """ process a given log file and load to database """ # open log file df = pd.read_json(filepath, lines=True, convert_dates=['ts']) # filter by NextSong action df = df[df['page'] == 'NextSong'] # convert timestamp column to datetime t = pd.to_datetime(df['ts'], unit='ms') # insert time data records time_data = [[x, x.hour, x.day, x.week, x.month, x.year, x.dayofweek] for x in t] column_labels = [ 'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday' ] time_df = pd.DataFrame(time_data, columns=column_labels) for i, row in time_df.iterrows(): cur.execute(time_table_insert, list(row)) # load user table user_df = df.filter(['userId', 'firstName', 'lastName', 'gender', 'level']).drop_duplicates() # insert user records for i, row in user_df.iterrows(): cur.execute(user_table_insert, row) # insert songplay records for index, row in df.iterrows(): # get songid and artistid from song and artist tables cur.execute(song_select, (row.song, row.artist, row.length)) results = cur.fetchone() if results: songid, artistid = results else: songid, artistid = None, None # insert songplay record songplay_data = [ row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent ] cur.execute(songplay_table_insert, songplay_data)
def get_row_count(schema_table: SchemaTable, cursor: extensions.cursor) -> int:
    """
    Run SELECT COUNT(*) against the given object and return the result as an int.

    :param schema_table: the SchemaTable object that we want to compute the row count
    :param cursor: a cursor for where to execute this query
    :return: the number of rows in the schema table object after querying the database with the cursor
    """
    count_query = sql.SQL("""
        SELECT COUNT(*)
        FROM {}
    """).format(schema_table)
    cursor.execute(count_query)
    # COUNT(*) always yields exactly one row with one column.
    row = cursor.fetchone()
    count = row[0]
    validate_is_int(count)
    return count
def _db_exists(self, cur: cursor) -> bool: '''Checks if a database already exists. Args: cur: Cursor for SQL queries. db_name: Database name. Returns: True, if the database exists, False otherwise. ''' sql_query = 'SELECT datname FROM pg_catalog.pg_database WHERE datname = \'auto_scheduler\';' cur.execute(sql_query) return cur.fetchone() is not None
def reference_columns(pg_cur: cursor, table_schema: str, table_name: str,
                      foreign_table_schema: str, foreign_table_name: str) -> (str, str):
    """
    Returns the columns used in a reference constraint as
    (column_name, foreign_column_name).

    Parameters
    ----------
    pg_cur
        the psycopg cursor
    table_schema
        the table schema
    table_name
        the table name
    foreign_table_schema
        the schema of the foreign table
    foreign_table_name
        the name of the foreign table

    Raises
    ------
    NoReferenceFound
        if no foreign key between the two tables exists
    """
    # see https://stackoverflow.com/a/1152321/1548052
    # Identifiers were previously interpolated with str.format (SQL injection
    # risk); bind them as query parameters instead.
    sql = ("SELECT kcu.column_name, ccu.column_name AS foreign_column_name "
           "FROM information_schema.table_constraints AS tc "
           "JOIN information_schema.key_column_usage AS kcu "
           "ON tc.constraint_name = kcu.constraint_name "
           "AND tc.table_schema = kcu.table_schema "
           "JOIN information_schema.constraint_column_usage AS ccu "
           "ON ccu.constraint_name = tc.constraint_name "
           "AND ccu.table_schema = tc.table_schema "
           "WHERE tc.constraint_type = 'FOREIGN KEY' "
           "AND tc.table_name = %s "
           "AND tc.table_schema = %s "
           "AND ccu.table_name = %s "
           "AND ccu.table_schema = %s;")
    pg_cur.execute(sql, (table_name, table_schema, foreign_table_name, foreign_table_schema))
    cols = pg_cur.fetchone()
    if not cols:
        raise NoReferenceFound(
            '{ts}.{tn} has no reference to {fts}.{ftn}'.format(
                tn=table_name, ts=table_schema,
                ftn=foreign_table_name, fts=foreign_table_schema))
    return cols
def get_service(self, service: Union[int, str], *, cur: Cursor = NotImplemented) -> Optional[Service]:
    """
    Get service by url or by id

    Args:
        service: The id or url of the service
        cur: Optional cursor

    Returns:
        Service object
    """
    # An int selects by primary key, anything else is treated as a url.
    by_id = isinstance(service, int)
    sql = 'SELECT * FROM services WHERE service_id=%s' if by_id else 'SELECT * FROM services WHERE url=%s'
    cur.execute(sql, (service, ))
    row = cur.fetchone()
    if not row:
        return None
    return Service(**row)
def get_column_count(schema_table: SchemaTable, cursor: extensions.cursor) -> int:
    """
    Query the information schema for the number of columns in the given object.

    Works equally well when schema_table refers to a view, but not for a
    materialized view, since those are not part of the SQL standard and
    therefore absent from the information schema.

    :param schema_table: the SchemaTable object that we want to compute the row count
    :param cursor: a cursor for where to execute this query
    :return: the number of rows in the schema table object after querying the database with the cursor
    """
    cursor.execute(
        sql.SQL("""
        SELECT COUNT(*)
        FROM information_schema.columns
        WHERE table_schema = %s
        AND table_name = %s
        """),
        (schema_table.schema.string, schema_table.table.string),
    )
    # COUNT(*) always yields exactly one row with one column.
    count = cursor.fetchone()[0]
    validate_is_int(count)
    return count
def get_citus_version(cur: cursor) -> str:
    """Return the Citus extension version string.

    The previous implementation returned the whole result row (a tuple),
    contradicting the declared ``-> str`` return type; unpack the single
    column instead.
    """
    cur.execute("SELECT citus_version()")
    result = cur.fetchone()
    return result[0]
def _get_latest_id_from_table(table: Table, cur: cursor) -> int: cur.execute(f"select id from {table} order by id desc limit 1") row = cur.fetchone() return row[0] if row else None
def process_log_file(cur: cursor, filepath: str): """Process a log file Args: cur (cursor): Conected cursor filepath (str): Path for a json file """ # open log file df = pd.read_json(filepath, lines=True) # filter by NextSong action df = df.loc[df["page"] == "NextSong"] # convert timestamp column to datetime df.ts = df.ts.astype("datetime64[ms]") df["hour"] = df.ts.dt.hour df["day"] = df.ts.dt.day df["week"] = df.ts.dt.week df["month"] = df.ts.dt.month df["year"] = df.ts.dt.year df["weekday"] = df.ts.dt.weekday for row in df.itertuples(index=False): time_vals = ( f"{row.ts:%Y-%m-%d %H:%M:%S}", row.hour, row.day, row.week, row.month, row.year, row.weekday, ) time_vals = [str(col) for col in time_vals] cur.execute(sql.time_table_insert, time_vals) user_vals = (row.userId, row.firstName, row.lastName, row.gender, row.level) cur.execute(sql.user_table_insert, user_vals) # get songid and artistid from song and artist tables cur.execute(sql.song_select, (row.song, row.artist, row.length)) results = cur.fetchone() if results: songid, artistid = results else: songid, artistid = None, None # insert songplay record cur.execute( sql.songplay_table_insert, ( f"{row.ts:%Y-%m-%d %H:%M:%S}", str(row.userId), str(row.level), str(songid), str(artistid), str(row.sessionId), str(row.location), str(row.userAgent), ), )