def init_tables(cur: psycopg2.connect, fixtures_dir=None) -> None:
    """Create tables by executing every SQL fixture in the init directory.

    Parameters
    ----------
    cur: psycopg2.connect
        Database cursor used to execute each fixture's SQL.
    fixtures_dir: str, optional
        Directory containing the SQL fixtures. Defaults to the ``init``
        sub-directory of ``FIXTURES_PATH`` (preserves the original behavior).
    """
    if fixtures_dir is None:
        fixtures_dir = os.path.join(FIXTURES_PATH, "init")
    # sorted(): os.listdir() returns entries in arbitrary order, which made
    # fixture execution nondeterministic — a problem when one fixture depends
    # on tables created by another.
    fixtures = sorted(
        os.path.join(fixtures_dir, f) for f in listdir(fixtures_dir)
    )
    for fixture in fixtures:
        with open(fixture, 'r') as f:
            sql = f.read().strip()
            cur.execute(sql)
def drop_tables(cur: psycopg2.connect) -> None:
    """Drop every table created for the sample dataset.

    Parameters
    ----------
    cur: psycopg2.connect
        Database cursor used to issue the DROP statements.
    """
    # Table names are stored pre-lowered; the emitted SQL is identical to
    # lowering CamelCase names at loop time.
    for name in (
        "employees",
        "orderdetails",
        "categories",
        "customers",
        "orders",
        "products",
        "shippers",
        "suppliers",
    ):
        cur.execute(f"DROP TABLE {name};")
def process_song_file(cur: psycopg2.connect, filepath: str) -> None:
    """Extract and load song and artist records from a song file.

    Parameters
    ----------
    cur: psycopg2.connect
        Psycopg2 database cursor for inserting data.
    filepath: str
        Path to a single line-delimited JSON song file.
    """
    # open song file (one JSON record per file, hence values[0] below)
    df = pd.read_json(filepath, lines=True)

    # insert song record
    # (fixed: the original had a duplicated `song_data = song_data = ...`
    # assignment — harmless at runtime but clearly a typo)
    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
    song_data = df[song_columns].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    artist_columns = [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    ]
    artist_data = df[artist_columns].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)
def clear_tables(cur: psycopg2.connect) -> None:
    """Delete all rows from every sample-dataset table, keeping the schema.

    Parameters
    ----------
    cur: psycopg2.connect
        Database cursor used to issue the DELETE statements.
    """
    # Names pre-lowered; the DELETE text (including the trailing
    # " WHERE TRUE ;") matches the original output exactly.
    statements = [
        f"DELETE FROM {name} WHERE TRUE ;"
        for name in (
            "employees",
            "orderdetails",
            "categories",
            "customers",
            "orders",
            "products",
            "shippers",
            "suppliers",
        )
    ]
    for statement in statements:
        cur.execute(statement)
def drop_tables(cur: psycopg2.connect, conn: psycopg2.connect) -> None:
    """Drop each table by running the queries in `drop_table_queries`.

    Parameters
    ----------
    cur : psycopg2.cursor
        Cursor for accessing database with psycopg.
    conn : psycopg2.connect
        Database connection instance, committed after each drop.
    """
    # Commit per statement so each drop is durable even if a later one fails.
    for statement in drop_table_queries:
        cur.execute(statement)
        conn.commit()
def clear_tables(cur: psycopg2.connect) -> None:
    """Delete all rows from the task-tracker tables, keeping the schema.

    Parameters
    ----------
    cur: psycopg2.connect
        Database cursor used to issue the DELETE statements.
    """
    # Names pre-lowered; the DELETE text (including the trailing
    # " WHERE TRUE ;") matches the original output exactly.
    for name in ("tasks", "dates"):
        cur.execute(f"DELETE FROM {name} WHERE TRUE ;")
def drop_tables(cur: psycopg2.connect) -> None:
    """Drop the task-tracker tables.

    Parameters
    ----------
    cur: psycopg2.connect
        Database cursor used to issue the DROP statements.
    """
    # Names pre-lowered; the emitted SQL matches the original exactly.
    for name in ("tasks", "dates"):
        cur.execute(f"DROP TABLE {name};")
def process_log_file(cur: psycopg2.connect, filepath: str) -> None: """Extract and load data for song and artist from log files. Parameters ---------- cur: psycopg2.connect Psycopg2 database cursor for inserting data. filepath: str Path for log file. """ # open log file df = pd.read_json(filepath, lines=True, convert_dates=['ts']) # filter by NextSong action df = df[df['page'] == 'NextSong'] # convert timestamp column to datetime t = pd.to_datetime(df['ts'], unit='ms') # insert time data records time_data = [[x, x.hour, x.day, x.week, x.month, x.year, x.dayofweek] for x in t] column_labels = [ 'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday', ] time_df = pd.DataFrame(time_data, columns=column_labels) for i, row in time_df.iterrows(): cur.execute(time_table_insert, list(row)) # load user table user_df = df.filter(['userId', 'firstName', 'lastName', 'gender', 'level']).drop_duplicates() # insert user records for i, row in user_df.iterrows(): cur.execute(user_table_insert, row) # insert songplay records for index, row in df.iterrows(): # get songid and artistid from song and artist tables cur.execute(song_select, (row.song, row.artist, row.length)) results = cur.fetchone() if results: songid, artistid = results else: songid, artistid = None, None # insert songplay record songplay_data = [ row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent, ] cur.execute(songplay_table_insert, songplay_data)