Example #1
def main():
    """Launches all the steps required to create the data tables in redshift."""

    # Launches the creation of an IAM role and a redshift cluster if they don't already exist:
    client, cluster_name = create_rc.main()

    # Check redshift cluster's availability. If available, create the tables:
    cluster_status = check_rc.check_cluster_status(client, cluster_name)

    if cluster_status == 'creating':
        print(
            f"Cluster '{cluster_name}' is being created.\n" +
            "This can take several minutes. Do you want to Wait (W) or Exit (Q)?\n"
            +
            "Exiting won't interrupt cluster creation. You can resume later by re-launching main.py"
        )
        valid_choices = ['q', 'Q', 'w', 'W']
        waiting = advanced_input(valid_choices)

        if waiting.lower() == 'q':
            sys.exit(0)
        elif waiting.lower() == 'w':
            print("Waiting...")
            while cluster_status == 'creating':
                time.sleep(20)
                cluster_status = check_rc.check_cluster_status(
                    client, cluster_name)
                print(f"Waiting... cluster status: {cluster_status}.")

    if cluster_status == 'available':
        cluster = client.describe_clusters(
            ClusterIdentifier=cluster_name)['Clusters'][0]
        # TODO: remove local reference to 'dwh.cfg'.
        create_rc.update_section_key('dwh.cfg', 'CLUSTER', 'cl_endpoint',
                                     cluster['Endpoint']['Address'])

        # When the cluster is available, ask the user whether to launch the ETL process:
        print(
            f"Cluster '{cluster_name}' available.\n"
            "Do you want to create tables and launch the ETL process? Yes (Y) or No (N)?\n"
            "This will drop existing tables, re-create them and load data.")
        valid_choices = ['y', 'Y', 'n', 'N']
        launch_etl = advanced_input(valid_choices)
        if launch_etl.lower() == 'y':
            create_tables.main()
            etl.main()
        else:
            sys.exit(0)

    else:
        print(
            f"Cluster '{cluster_name}' current status: '{cluster_status}'.\n"
            "Please activate or repair the cluster and relaunch the program.\n"
            "Exiting.")
        sys.exit(1)

    print("The End.")
Example #2
def main():
    """
    - Creates the sparkify keyspace and relative tables
    - process data
    - query result
    """
    IPCluster, keyspace = "127.0.0.1", "sparkifydb"
    create_tables.main(IPCluster, keyspace)

    process_data(IPCluster, keyspace, "./event_data/")
    query_result(IPCluster, keyspace)
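create_tables.main(IPCluster, keyspace) is not shown here; a minimal sketch of what such a function might do, assuming the DataStax cassandra-driver and a single local node (the replication settings are placeholders):

from cassandra.cluster import Cluster

def main(ip, keyspace):
    # Hypothetical sketch: connect to the node and (re)create the keyspace.
    cluster = Cluster([ip])
    session = cluster.connect()
    session.execute(
        f"CREATE KEYSPACE IF NOT EXISTS {keyspace} "
        "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}"
    )
    session.set_keyspace(keyspace)
    # The real script would create its tables here before shutting down.
    session.shutdown()
    cluster.shutdown()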
Example #3
def execute():
    """
    Execute the complete ETL process.
    Create database tables.
    Insert the JSON files into the created tables.
    """
    create_tables.main()
    print('All tables are created!')
    print('-' * 70)
    etl.main()
    print('-' * 70)
    print('OMG is done!')
Example #4
def main():
    # Drop the sparkifydb database if it exists, then re-create it.
    create_tables.main()

    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    cur = conn.cursor()

    process_data(cur, conn, filepath="data/song_data", func=process_song_file)
    process_data(cur, conn, filepath="data/log_data", func=process_log_file)

    conn.close()
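Neither process_song_file nor process_log_file is shown above. As an illustration only, a sketch of what a per-file song handler typically looks like in this kind of pipeline; the query names, column list and sql_queries layout are assumptions, not taken from the source:

import pandas as pd
from sql_queries import song_table_insert, artist_table_insert  # assumed names

def process_song_file(cur, filepath):
    # Read one song record (JSON lines) and insert the song and artist rows.
    df = pd.read_json(filepath, lines=True)
    song_data = df[['song_id', 'title', 'artist_id', 'year', 'duration']].values[0].tolist()
    cur.execute(song_table_insert, song_data)
    artist_data = df[['artist_id', 'artist_name', 'artist_location',
                      'artist_latitude', 'artist_longitude']].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)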
Example #5
def main():
    """
    - Drops (if it exists) and creates the sparkify database and its tables

    - process song data

    - process log data
    """
    hostname, dbname = "127.0.0.1", "sparkifydb"
    create_tables.main(hostname, dbname)

    process_song_data(hostname, dbname, "./data/song_data/")
    process_log_data(hostname, dbname, "./data/log_data/")
Example #6
def main():
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    conn = psycopg2.connect(
        "host={} dbname={} user={} password={} port={}".format(
            *config['CLUSTER'].values()))
    cur = conn.cursor()

    create_tables.main()
    load_staging_tables(cur, conn)
    insert_tables(cur, conn)

    conn.close()
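Note that this DSN is built by unpacking config['CLUSTER'].values() positionally, so it only works if the [CLUSTER] section of dwh.cfg holds exactly host, dbname, user, password and port, in that order. A sketch of such a section with placeholder values (the key names are illustrative; only their order matters here):

[CLUSTER]
host = <cluster-endpoint>
dbname = <database-name>
user = <database-user>
password = <database-password>
port = 5439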
Example #7
def main():
    """
    Main function of etl.py.
    INPUT: None
    OUTPUT: None
    """
    
    create_tables.main()
    
    conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
    cur = conn.cursor()
       
    process_data(cur, conn, filepath='data/song_data', func=process_song_file)
    process_data(cur, conn, filepath='data/log_data', func=process_log_file)

    conn.close()
Example #8
def main():

    assert not (settings.ACTIVATE_FILTER_TABLES and settings.ACTIVATE_SELECT_TABLES), \
        "ACTIVATE_SELECT_TABLES and ACTIVATE_FILTER_TABLES cannot be True at the same time"

    #prepare the destination folder and the database
    init_process.main()

    start_time = datetime.datetime.now()

    #split and select/filter the SQL files
    #convert INSERT into INSERT IGNORE
    splitter.main()
    print_info(start_time, "splitter")
    next_time = datetime.datetime.now()

    #create tables in the db and add primary keys on issue_comments, issue_events, pull_request_comment
    create_tables.main()
    print_info(next_time, "create tables")
    next_time = datetime.datetime.now()

    #create triggers in the db
    if settings.ACTIVATE_CREATE_TRIGGERS:
        create_triggers.main()
        print_info(next_time, "create triggers")
        next_time = datetime.datetime.now()

    if not settings.ACTIVATE_ONLY_CREATE_TABLE:
        # insert data into the db (the triggers will filter out the data from deleted projects;
        # other filtering operations could also be performed here)
        insert_data.main()
        print_info(next_time, "insert data")
        next_time = datetime.datetime.now()

    #create indexes
    create_indexes.main()
    print_info(next_time, "create indexes")
    next_time = datetime.datetime.now()

    #delete the rows used to take benefit from the insert ignore
    if settings.ACTIVATE_CREATE_TRIGGERS:
        delete_modified_rows.main()
        print_info(next_time, "delete modified rows")

    print_info(start_time, "import process")
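print_info is not part of the snippet; judging by how it is called, it reports how long each step took. A minimal sketch under that assumption:

import datetime

def print_info(start_time, step_name):
    # Hypothetical helper: print the elapsed wall-clock time of a step.
    elapsed = datetime.datetime.now() - start_time
    print(f"{step_name} completed in {elapsed}")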
Example #9
def main():
    create_tables.main()
    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    try:
        process_data(cur, filepath='data/song_data', func=stage_song_data)
    except psycopg2.errors.InsufficientPrivilege as e:
        print(str(e).split('HINT')[0])
        print(
            "HINT: You must grant the pg_read_server_files role to student in order to read the data files."
        )
        print("Try `GRANT pg_read_server_files TO student;`")

    process_data(cur, filepath='data/log_data', func=stage_log_data)
    load_tables(cur)

    conn.close()
Example #10
def main():
    """
    Main program for etl.py
    INPUT: None
    RETURN: None
    """

    # create event data
    create_event_data()
    print("Created event_datafile_new.csv")

    # create tables
    create_tables.main()
    print("Created tables")

    # connect to db
    session, cluster = create_tables.connect_db()

    # event data filepath
    file = 'event_datafile_new.csv'

    # insert data to session table
    insert_session_data(session, file)
    print("Inserted data to session_history table")

    # insert data to user table
    insert_user_data(session, file)
    print("Inserted data to user_history table")

    # insert data to song table
    insert_song_data(session, file)
    print("Inserted data to song_history table")

    # close session and disconnect cluster
    create_tables.disconnect_db(session, cluster)
    print("ETL Completed")
Example #11
def main():
    """
    - Runs the create_tables.py (DDL) script to reset the database state
    - Manages a connection with the database
    - Processes data files
    """
    setup.main()

    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    cur = conn.cursor()

    process_data(cur,
                 conn,
                 filepath='data/song_data',
                 func=process_song_file,
                 filepath_pattern="*.json")
    process_data(cur,
                 conn,
                 filepath='data/log_data',
                 func=process_log_file,
                 filepath_pattern="*.json")

    conn.close()
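The process_data used here takes an extra filepath_pattern argument; a minimal sketch of such a variant, assuming it simply globs for matching files and applies func to each (compare the plain version in Example #17):

import glob
import os

def process_data(cur, conn, filepath, func, filepath_pattern):
    # Hypothetical variant: collect every file matching filepath_pattern under filepath.
    all_files = []
    for root, _, _ in os.walk(filepath):
        all_files.extend(glob.glob(os.path.join(root, filepath_pattern)))

    for i, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print(f"{i}/{len(all_files)} files processed.")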
Example #12
import sys
sys.path.insert(0, './')

import configparser
import psycopg2
import pandas as pd
from sql_queries import *
import time
from create_tables import main
from etl import etl_main
from infra_control import createCluster, checkCluster, deleteCluster

if __name__ == "__main__":

    print('Checking if cluster exists. If not, then create it.')
    if checkCluster() == -1:
        createCluster()

    while checkCluster() == -2:
        time.sleep(5)
        print('Waiting for cluster to be ready ...', end='\r')

    print('Cluster ready, getting endpoint.')
    end_point = checkCluster()

    print('\n\nAll good, commencing ETL.')
    main(create_table_queries, drop_table_queries)
    etl_main(copy_table_queries, insert_table_queries)
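checkCluster is imported from infra_control, which is not shown. Judging from how its return value is used (-1 when the cluster does not exist, -2 while it is still being created, the endpoint once it is ready), a sketch using boto3 could look like this; the cluster identifier is a placeholder:

import boto3

def checkCluster():
    # Hypothetical sketch: -1 = no cluster, -2 = still creating, else the endpoint address.
    client = boto3.client('redshift')
    try:
        cluster = client.describe_clusters(
            ClusterIdentifier='sparkify-cluster')['Clusters'][0]
    except client.exceptions.ClusterNotFoundFault:
        return -1
    if cluster['ClusterStatus'] != 'available':
        return -2
    return cluster['Endpoint']['Address']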
Example #13
def main():
    IPCluster, keyspace = "127.0.0.1", "sparkifydb"
    create_tables.main(IPCluster, keyspace)

    process_data(IPCluster, keyspace, "./event_data/")
Example #14
import configparser
import psycopg2
import pandas as pd
from sql_queries import *
import time
from create_tables import main
from sql_queries import copy_table_queries, insert_table_queries

# from etl import etl_main
from create_cluster import createCluster

def load_staging_tables(cur, conn):
    for query in copy_table_queries:
        cur.execute(query)
        conn.commit()


def insert_tables(cur, conn):
    for query in insert_table_queries:
        cur.execute(query)
        conn.commit()


if __name__ == "__main__":
    
    print('First, check if the cluster exists; if not, create it')
    createCluster()

    print('Now, it\'s ready for ETL')
    main()
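    # As written, this script never opens a connection or calls
    # load_staging_tables / insert_tables. A possible completion, assuming the
    # same dwh.cfg layout that Example #6 reads (an assumption, not the
    # original author's code):
    config = configparser.ConfigParser()
    config.read('dwh.cfg')
    conn = psycopg2.connect(
        "host={} dbname={} user={} password={} port={}".format(
            *config['CLUSTER'].values()))
    cur = conn.cursor()

    load_staging_tables(cur, conn)
    insert_tables(cur, conn)
    conn.close()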
 
Example #15
def main():
    """
    Run script creating and loading Postgres database from data directory.

    Steps:
    1. Run create_tables.py to create Postgres database 'sparkifydb' and
    star schema with tables: songplays, users, time, artists, songs.

    2. Connect to sparkifydb.

    3. Create staging tables song_staging and log_staging.

    4. Create profilers.

    5. In the body of `try`:
    - copy data to staging tables using profilers.
    - insert data into star schema
    - set foreign keys and indices
    - execute a simple query to check if at least one entry in
    `songplays` has a non-null artist_id

    6. In the body of `finally`: drop the staging tables and 
    close the connection.

    7. Print the profiling statistics, showing the top 10% of
    operations sorted by time.
    """

    create_tables.main()
    
    dsn = "host=127.0.0.1 dbname=sparkifydb user=student password=student"
    conn = psycopg2.connect(dsn)
    cur = conn.cursor()

    stagers = [song_stager, log_stager]
    
    for s in stagers:
        print('* Creating table', s.get_table_name())
        s.create_table(cur)
        conn.commit()
        

    profilers = [Profile(), Profile()]
    try:
        for p, s in zip(profilers, stagers):
            print('* Copying to table', s.get_table_name())
            p.runcall(lambda: s.copy(cur, stream=True))
            conn.commit()

        print('* Inserting into star schema')
        for query in sql_queries.insert_queries:
            cur.execute(query)
            conn.commit()
        print('* Setting foreign keys')
        for query in sql_queries.fk_queries:
            cur.execute(query)
            conn.commit()
        print('* Setting indices')
        for query in sql_queries.idx_queries:
            cur.execute(query)
            conn.commit()

        cur.execute("SELECT COUNT(*) FROM songplays WHERE artist_id IS NOT NULL;")
        print('* Number of songplays with artist_id not NULL:', cur.fetchone()[0])
    finally:
        for s in stagers:
            print('* Dropping table', s.get_table_name())
            s.drop_table(cur)
            conn.commit()
        cur.close()
        conn.close()
        
    for profiler in profilers:
        stats = Stats(profiler)
        stats.strip_dirs()
        stats.sort_stats('time')
        stats.print_stats(.1)
Example #16
import drop_tables
import create_tables
import inserting_literacy1951_2011
import inserting_population01
import inserting_population11
import inserting_police01
import inserting_police11
import inserting_crimepart1
import inserting_crimepart2
import inserting_totalcrime
# import inserting_totalcrime13


drop_tables.main()
create_tables.main()
inserting_literacy1951_2011.main()
inserting_population01.main()
inserting_population11.main()
inserting_police01.main()
inserting_police11.main()
inserting_crimepart1.main()
inserting_crimepart2.main()
inserting_totalcrime.main()
# inserting_totalcrime13.main()

print('*' * 60)
print('\t\tDATABASE CONFIGURED.')
print('*' * 60)
Example #17
def process_data(cur, conn, filepath, func):
    # collect every matching data file under filepath ('*.json' assumed)
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root, '*.json'))
        for f in files:
            all_files.append(os.path.abspath(f))

    # get total number of files found
    num_files = len(all_files)
    print('{} files found in {}'.format(num_files, filepath))

    # iterate over files and process
    for i, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(i, num_files))


def main():
    conn = mysql.connector.connect(host="localhost",
                                   user="******",
                                   password="******",
                                   database="SparkifyDB")
    cur = conn.cursor()

    process_data(cur, conn, filepath='data/song_data', func=process_song_file)
    process_data(cur, conn, filepath='data/log_data', func=process_log_file)

    conn.close()


if __name__ == "__main__":
    create_tables.main()
    main()