def main(): """Launches all the steps required to create the data tables in redshift.""" # Launches the creation of an IAM role and redshift clusters if they don't already exists: client, cluster_name = create_rc.main() # Check redshift cluster's availability. If available, create the tables: cluster_status = check_rc.check_cluster_status(client, cluster_name) if cluster_status == 'creating': print( f"Cluster '{cluster_name}' is being created.\n" + "This can take several minutes. Do you want to Wait (W) or Exit (Q)?\n" + "Exiting won't interrupt cluster creation. You can resume later by re-launching main.py" ) valid_choices = ['q', 'Q', 'w', 'W'] waiting = advanced_input(valid_choices) if waiting.lower() == 'q': sys.exit(0) elif waiting.lower() == 'w': print("Waiting...") while cluster_status == 'creating': time.sleep(20) cluster_status = check_rc.check_cluster_status( client, cluster_name) print(f"Waiting... cluster status: {cluster_status}.") if cluster_status == 'available': cluster = client.describe_clusters( ClusterIdentifier=cluster_name)['Clusters'][0] # TODO: remove local reference to 'dwh.cfg'. create_rc.update_section_key('dwh.cfg', 'CLUSTER', 'cl_endpoint', cluster['Endpoint']['Address']) # When cluster available ask the user if she wants to launch the etl process: print( f"Cluster '{cluster_name}' available.\n" + "Do you want to create tables and launch the ETL process? Yes (Y), No (n)?\n" + "This will drop existing tables, re-create them and load data.") valid_choices = ['y', 'Y', 'n', 'N'] launch_etl = advanced_input(valid_choices) if launch_etl.lower() == 'y': create_tables.main() etl.main() else: sys.exit(0) else: print( f"Cluster '{cluster_name}' current status: '{cluster_status}'.\n" "Please activate or repair the cluster and relaunch the program.\n" "Exiting.") sys.exit(1) print("The End.")
def main(): """ - Creates the sparkify keyspace and relative tables - process data - query result """ IPCluster, keyspace = "127.0.0.1", "sparkifydb" create_tables.main(IPCluster, keyspace) process_data(IPCluster, keyspace, "./event_data/") query_result(IPCluster, keyspace)
def execute(): """ Execute the complete ETL process. Create database tables. Insert json files into created tables. """ create_tables.main() print('All tables are created!') print('-' * 70) etl.main() print('-' * 70) print('OMG is done!')
def main():
    # Drop the sparkifydb database if it exists, then re-create it.
    create_tables.main()

    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    cur = conn.cursor()

    process_data(cur, conn, filepath="data/song_data", func=process_song_file)
    process_data(cur, conn, filepath="data/log_data", func=process_log_file)

    conn.close()
def main(): """ - Drops (if exists) and Creates the sparkify database and relative tables - process song data - process log data """ hostname, dbname = "127.0.0.1", "sparkifydb" create_tables.main(hostname, dbname) process_song_data(hostname, dbname, "./data/song_data/") process_log_data(hostname, dbname, "./data/log_data/")
def main():
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    conn = psycopg2.connect(
        "host={} dbname={} user={} password={} port={}".format(
            *config['CLUSTER'].values()))
    cur = conn.cursor()

    create_tables.main()
    load_staging_tables(cur, conn)
    insert_tables(cur, conn)

    conn.close()
def main(): """ This is the main function etl.py INPUT : None OUTPUT : None """ create_tables.main() conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") cur = conn.cursor() process_data(cur, conn, filepath='data/song_data', func=process_song_file) process_data(cur, conn, filepath='data/log_data', func=process_log_file) conn.close()
def main():
    assert not (settings.ACTIVATE_FILTER_TABLES and settings.ACTIVATE_SELECT_TABLES), \
        "ACTIVATE_SELECT_TABLES and ACTIVATE_FILTER_TABLES cannot be True at the same time"

    # Prepare the destination folder and the database.
    init_process.main()
    start_time = datetime.datetime.now()

    # Split and select/filter the SQL files; convert INSERT into INSERT IGNORE.
    splitter.main()
    print_info(start_time, "splitter")
    next_time = datetime.datetime.now()

    # Create tables in the db and add primary keys on issue_comments, issue_events, pull_request_comment.
    create_tables.main()
    print_info(next_time, "create tables")
    next_time = datetime.datetime.now()

    # Create triggers in the db.
    if settings.ACTIVATE_CREATE_TRIGGERS:
        create_triggers.main()
        print_info(next_time, "create triggers")
        next_time = datetime.datetime.now()

    if not settings.ACTIVATE_ONLY_CREATE_TABLE:
        # Insert data into the db (the triggers will filter out the data from deleted
        # projects; other filtering operations could also be performed here).
        insert_data.main()
        print_info(next_time, "insert data")
        next_time = datetime.datetime.now()

        # Create indexes.
        create_indexes.main()
        print_info(next_time, "create indexes")
        next_time = datetime.datetime.now()

        # Delete the rows used to benefit from INSERT IGNORE.
        if settings.ACTIVATE_CREATE_TRIGGERS:
            delete_modified_rows.main()
            print_info(next_time, "delete modified rows")

    print_info(start_time, "import process")
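print_info is not defined in this snippet; from the calls above it appears to report how long each step took. A minimal sketch under that assumption (name and signature inferred, not from the source):

import datetime

def print_info(start, step_name):
    """Print the elapsed time for a step (assumed helper)."""
    elapsed = datetime.datetime.now() - start
    print(f"{step_name} completed in {elapsed}")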
def main():
    create_tables.main()

    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    try:
        process_data(cur, filepath='data/song_data', func=stage_song_data)
    except psycopg2.errors.InsufficientPrivilege as e:
        print(str(e).split('HINT')[0])
        print(
            "HINT: You must GRANT student pg_read_server_files permissions in order to read the log files."
        )
        print("Try `GRANT pg_read_server_files TO STUDENT`")

    process_data(cur, filepath='data/log_data', func=stage_log_data)
    load_tables(cur)

    conn.close()
def main(): """ Main program for etl.py INPUT: None RETURN: None """ # create event data create_event_data() print("Created event_datafile_new.csv") # create tables create_tables.main() print("Created tables") # connect to db session, cluster = create_tables.connect_db() # event data filepath file = 'event_datafile_new.csv' # insert data to session table insert_session_data(session, file) print("Inserted data to session_history table") # insert data to user table insert_user_data(session, file) print("Inserted data to user_history table") # insert data to song table insert_song_data(session, file) print("Inserted data to song_history table") # close session and disconnect cluster create_tables.disconnect_db(session, cluster) print("ETL Completed")
def main(): """ - Runs create_tables.py(DDLs) script to reset database state - Manages a connection with the database - Processes data files """ setup.main() conn = psycopg2.connect( "host=127.0.0.1 dbname=sparkifydb user=student password=student") cur = conn.cursor() process_data(cur, conn, filepath='data/song_data', func=process_song_file, filepath_pattern="*.json") process_data(cur, conn, filepath='data/log_data', func=process_log_file, filepath_pattern="*.json") conn.close()
import sys
sys.path.insert(0, './')

import configparser
import time

import pandas as pd
import psycopg2

from sql_queries import *
from create_tables import main
from etl import etl_main
from infra_control import createCluster, checkCluster, deleteCluster

if __name__ == "__main__":
    print('Checking if cluster exists. If not, then create it.')
    if checkCluster() == -1:
        createCluster()

    while checkCluster() == -2:
        time.sleep(5)
        print('Waiting for cluster to be ready ...', end='\r')

    print('Cluster ready, getting endpoint.')
    end_point = checkCluster()

    print('\n\nAll good, commencing ETL.')
    main(create_table_queries, drop_table_queries)
    etl_main(copy_table_queries, insert_table_queries)
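checkCluster comes from infra_control, which is not shown. From the way it is used above it seems to return -1 when the cluster does not exist, -2 while it is still creating, and the endpoint address once it is available. A hypothetical sketch under that assumption, using boto3 (the cluster identifier and region are placeholders):

import boto3

def checkCluster(cluster_id='sparkify-cluster', region='us-west-2'):
    """Return -1 if the cluster does not exist, -2 while it is creating,
    and its endpoint address once it is available (assumed behaviour)."""
    redshift = boto3.client('redshift', region_name=region)
    try:
        cluster = redshift.describe_clusters(ClusterIdentifier=cluster_id)['Clusters'][0]
    except redshift.exceptions.ClusterNotFoundFault:
        return -1
    if cluster['ClusterStatus'] != 'available':
        return -2
    return cluster['Endpoint']['Address']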
def main():
    IPCluster, keyspace = "127.0.0.1", "sparkifydb"
    create_tables.main(IPCluster, keyspace)
    process_data(IPCluster, keyspace, "./event_data/")
import configparser
import time

import pandas as pd
import psycopg2

from sql_queries import *
from sql_queries import copy_table_queries, insert_table_queries
from create_tables import main
# from etl import etl_main
from create_cluster import createCluster


def load_staging_tables(cur, conn):
    for query in copy_table_queries:
        cur.execute(query)
        conn.commit()


def insert_tables(cur, conn):
    for query in insert_table_queries:
        cur.execute(query)
        conn.commit()


if __name__ == "__main__":
    print('First, check if cluster exists then create it')
    createCluster()
    print('Now, it\'s ready for ETL')
    main()
def main(): """ Run script creating and loading Postgres database from data directory. Steps: 1. Run create_tables.py to create Postgress database 'sparkifydb' and star schema with tables: songplays, users, time, artists, songs. 2. Connect to sparkifydb. 3. Create staging tables song_staging and log_staging. 4. Create profilers. 5. In the body of `try`: - copy data to staging tables using profilers. - insert data into star schema - set foreign keys and indices - execute a simple query to check if at least one entry in `songplays` has a non-null artist_id 6. In the body of `finally`: drop the staging tables and close the connection. 7. Print the profiling statistics, showing the top 10% of operations sorted by time. """ create_tables.main() dsn = "host=127.0.0.1 dbname=sparkifydb user=student password=student" conn = psycopg2.connect(dsn) cur = conn.cursor() stagers = [song_stager, log_stager] for s in stagers: print('* Creating table', s.get_table_name()) s.create_table(cur) conn.commit() profilers = [Profile(), Profile()] try: for p, s in zip(profilers, stagers): print('* Copying to table', s.get_table_name()) p.runcall(lambda: s.copy(cur, stream=True)) conn.commit() print('* Inserting into star schema') for query in sql_queries.insert_queries: cur.execute(query) conn.commit() print('* Setting foreign keys') for query in sql_queries.fk_queries: cur.execute(query) conn.commit() print('* Setting indices') for query in sql_queries.idx_queries: cur.execute(query) conn.commit() cur.execute("SELECT COUNT(*) FROM songplays WHERE artist_id IS NOT NULL;") print('* Number of songplays with artist_id not NULL:', cur.fetchone()[0]) finally: for s in stagers: print('* Dropping table', s.get_table_name()) s.drop_table(cur) conn.commit() cur.close() conn.close() for profiler in profilers: stats = Stats(profiler) stats.strip_dirs() stats.sort_stats('time') stats.print_stats(.1)
import drop_tables
import create_tables
import inserting_literacy1951_2011
import inserting_population01
import inserting_population11
import inserting_police01
import inserting_police11
import inserting_crimepart1
import inserting_crimepart2
import inserting_totalcrime
# import inserting_totalcrime13

drop_tables.main()
create_tables.main()
inserting_literacy1951_2011.main()
inserting_population01.main()
inserting_population11.main()
inserting_police01.main()
inserting_police11.main()
inserting_crimepart1.main()
inserting_crimepart2.main()
inserting_totalcrime.main()
# inserting_totalcrime13.main()

print('*' * 60)
print('\t\tDATABASE CONFIGURED.')
print('*' * 60)
def process_data(cur, conn, filepath, func):
    # Collect all data files under filepath (the original snippet begins
    # mid-function; a '*.json' walk of the directory tree is assumed here).
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root, '*.json'))
        for f in files:
            all_files.append(os.path.abspath(f))

    # get total number of files found
    num_files = len(all_files)
    print('{} files found in {}'.format(num_files, filepath))

    # iterate over files and process
    for i, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(i, num_files))


def main():
    conn = mysql.connector.connect(host="localhost", user="******",
                                   password="******", database="SparkifyDB")
    cur = conn.cursor()

    process_data(cur, conn, filepath='data/song_data', func=process_song_file)
    process_data(cur, conn, filepath='data/log_data', func=process_log_file)

    conn.close()


if __name__ == "__main__":
    create_tables.main()
    main()