def insert_into_date():
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.date
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_date dim_date
                                ON dim_date.date = etl_1.date
                            WHERE etl_1.in_etl_2=False
                              AND dim_date.date IS NULL""")
        dates = cursor.fetchall()
        for row in dates:
            date = row[0]
            date_str = date.isoformat()  # e.g. '2009-10-24'
            year, month, day = date_str.split('-')
            if month in ['01', '02', '03']:
                qtr = '1'
            elif month in ['04', '05', '06']:
                qtr = '2'
            elif month in ['07', '08', '09']:
                qtr = '3'
            else:
                qtr = '4'
            date_id = str(uuid.uuid4())
            print('-------- Executing INSERT DATE Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_date(
                     id, date, day, month, quarter, year)
                     VALUES(%s, %s, %s, %s, %s, %s)""",
                (date_id, date, day, month, qtr, year))
            print('-------- Insert Statement Complete --------')
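
# Every function in this project acquires a cursor through the UseRedshift
# context manager, which is defined elsewhere in the repo. For reference, a
# minimal sketch of what such a context manager might look like, assuming a
# psycopg2-compatible driver and a config dict of standard connection
# keyword arguments -- an assumption for illustration, not the project's
# actual implementation:

import psycopg2

class UseRedshift:
    def __init__(self, db_config):
        self.db_config = db_config  # e.g. host, port, dbname, user, password

    def __enter__(self):
        self.conn = psycopg2.connect(**self.db_config)
        self.cursor = self.conn.cursor()
        return self.cursor

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            self.conn.commit()   # commit only on a clean exit
        else:
            self.conn.rollback()
        self.cursor.close()
        self.conn.close()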
def create_table():
    with UseRedshift(redshift_db_config) as cursor:
        SQL_CREATE = rf"""CREATE TABLE IF NOT EXISTS public.s3_load (
            date VARCHAR(20),
            time VARCHAR(20),
            server_ip VARCHAR(20),
            method VARCHAR(10),
            uri_stem VARCHAR(80),
            uri_query VARCHAR(1000),
            server_port VARCHAR(10),
            username VARCHAR(60),
            client_ip VARCHAR(20),
            client_browser VARCHAR(1000),
            client_cookie VARCHAR(1000),
            client_referrer VARCHAR(1000),
            status VARCHAR(10),
            substatus VARCHAR(10),
            win32_status VARCHAR(100),
            bytes_sent VARCHAR(100),
            bytes_received VARCHAR(100),
            duration VARCHAR(100),
            in_etl_1 BOOLEAN)"""
        try:
            print('-------- Executing CREATE TABLE --------')
            cursor.execute(SQL_CREATE)
            print('-------- SQL CREATE Complete --------')
        except Exception as err:
            print('Error executing SQL: ', err)
def create_table():
    """Creates a table in AWS Redshift if it doesn't already exist."""
    try:
        with UseRedshift(redshift_db_config) as cursor:
            SQL_CREATE = rf"""CREATE TABLE IF NOT EXISTS public.sparktest (
                date DATE,
                time TIMESTAMP,
                server_ip VARCHAR(20),
                method VARCHAR(10),
                uri_stem VARCHAR(80),
                uri_query VARCHAR(1000),
                server_port INT,
                username VARCHAR(60),
                client_ip VARCHAR(20),
                client_browser VARCHAR(1000),
                client_cookie VARCHAR(1000),
                client_referrer VARCHAR(1000),
                status INT,
                substatus INT,
                win32_status INT,
                bytes_sent INT,
                bytes_received INT,
                duration INT)"""
            cursor.execute(SQL_CREATE)
    except Exception as err:
        print('Error: ', err)
def insert_into_file():
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.uri_stem, etl_1.bytes_sent
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_file dim_file
                                ON dim_file.uri_stem = etl_1.uri_stem
                                AND dim_file.bytes_sent = etl_1.bytes_sent
                            WHERE etl_1.in_etl_2=False
                              AND dim_file.uri_stem IS NULL
                              AND dim_file.bytes_sent IS NULL""")
        file_contents = cursor.fetchall()
        for row in file_contents:
            uri_stem = row[0]
            bytes_sent = row[1]
            # Extract the trailing file extension (e.g. '.gif') from the URI stem
            file_ext = re.search(r'\.[A-Za-z0-9]+$', uri_stem)
            file_type = file_ext.group(0) if file_ext is not None else None
            # Requests for /robots.txt are a strong signal of crawler traffic
            is_crawler = (uri_stem == '/robots.txt')
            file_id = str(uuid.uuid4())
            print('-------- Executing INSERT FILE Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_file(
                     id, uri_stem, bytes_sent, file_type, is_crawler)
                     VALUES(%s, %s, %s, %s, %s)""",
                (file_id, uri_stem, bytes_sent, file_type, is_crawler))
            print('-------- Insert Statement Complete --------')
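
# The regex above pulls a trailing ".ext" off the uri_stem. The standard
# library's os.path.splitext does much the same job and may be easier to
# maintain -- a small sketch (the helper name is an illustration, and the
# behaviour differs slightly: splitext accepts extension characters the
# regex rejects):

import os.path

def extract_file_type(uri_stem):
    """Return the file extension including the dot, or None if there is none."""
    ext = os.path.splitext(uri_stem)[1]
    return ext if ext else None

# e.g. extract_file_type('/images/logo.png') -> '.png'
#      extract_file_type('/home/')           -> None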
def insert_into_location():
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.client_ip
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_location dim_location
                                ON dim_location.client_ip = etl_1.client_ip
                            WHERE etl_1.in_etl_2=False
                              AND dim_location.client_ip IS NULL""")
        client_ips = cursor.fetchall()
        for ip in client_ips:
            results = requests.get(f'https://ipinfo.io/{ip[0]}')
            results_dict = results.json()
            # Not every field is present for every IP, so fall back to None
            postcode = results_dict.get('postal')
            city = results_dict.get('city')
            region = results_dict.get('region')
            country = results_dict.get('country')
            location_id = str(uuid.uuid4())
            print('-------- Executing INSERT LOCATION Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_location(
                     id, client_ip, postcode, city, region, country)
                     VALUES(%s, %s, %s, %s, %s, %s)""",
                (location_id, ip[0], postcode, city, region, country))
            print('-------- Insert Statement Complete --------')
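
# ipinfo.io rate-limits unauthenticated clients, and a hung request here
# would stall the whole ETL run. A hedged sketch of a lookup helper with a
# timeout and basic error handling -- the function name and the empty-dict
# fallback are assumptions for illustration, not part of the pipeline:

import requests

def lookup_ip(ip_address, timeout_seconds=5):
    """Fetch geolocation data for one IP; return {} if the lookup fails."""
    try:
        response = requests.get(f'https://ipinfo.io/{ip_address}/json',
                                timeout=timeout_seconds)
        response.raise_for_status()  # surface 4xx/5xx (e.g. rate limiting)
        return response.json()
    except requests.RequestException as err:
        print('IP lookup failed: ', err)
        return {}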
def insert_into_time():
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.time
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_time dim_time
                                ON dim_time.time = etl_1.time
                            WHERE etl_1.in_etl_2=False
                              AND dim_time.time IS NULL""")
        times = cursor.fetchall()
        for row in times:
            time = row[0]
            time_str = time.isoformat()
            split_time = time_str.split('T')  # e.g. ['2009-10-24', '01:40:40']
            hr_min_sec = split_time[1].split(':')  # e.g. ['01', '40', '40']
            hour = hr_min_sec[0]
            minute = hr_min_sec[1]
            second = hr_min_sec[2]
            time_id = str(uuid.uuid4())
            print('-------- Executing INSERT TIME Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_time(
                     id, time, hour, minute, second)
                     VALUES(%s, %s, %s, %s, %s)""",
                (time_id, time, hour, minute, second))
            print('-------- Insert Statement Complete --------')
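
# Since the driver returns datetime objects for TIMESTAMP columns, the
# string splitting above can be skipped entirely by reading the attributes
# directly. A minimal sketch, assuming `time` is the datetime.datetime
# fetched above:
#
#     hour = time.hour      # int, e.g. 1
#     minute = time.minute  # int, e.g. 40
#     second = time.second  # int, e.g. 40
#
# This also matches the INTEGER columns of public.dim_time without relying
# on implicit string-to-int conversion during the INSERT.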
def insert_ids_to_fact():
    with UseRedshift(redshift_db_config) as cursor:
        INSERT_DATE_ID = rf"""UPDATE public.etl_1
            SET date_id = public.dim_date.id
            FROM public.dim_date
            WHERE public.etl_1.date = public.dim_date.date
              AND public.etl_1.date_id IS NULL;"""
        INSERT_TIME_ID = rf"""UPDATE public.etl_1
            SET time_id = public.dim_time.id
            FROM public.dim_time
            WHERE public.etl_1.time = public.dim_time.time
              AND public.etl_1.time_id IS NULL;"""
        INSERT_LOCATION_ID = rf"""UPDATE public.etl_1
            SET location_id = public.dim_location.id
            FROM public.dim_location
            WHERE public.etl_1.client_ip = public.dim_location.client_ip
              AND public.etl_1.location_id IS NULL;"""
        # Fixed by accepting NULLs for client_referrer
        INSERT_REQUEST_ID = rf"""UPDATE public.etl_1
            SET request_id = public.dim_request.id
            FROM public.dim_request
            WHERE public.etl_1.method = public.dim_request.method
              AND public.etl_1.client_browser = public.dim_request.client_browser
              AND (public.etl_1.client_referrer = public.dim_request.client_referrer
                   OR (public.etl_1.client_referrer IS NULL
                       AND public.dim_request.client_referrer IS NULL))
              AND public.etl_1.status = public.dim_request.status
              AND public.etl_1.duration = public.dim_request.duration
              AND public.etl_1.request_id IS NULL;"""
        # Fixed by accepting NULLs for bytes_sent
        INSERT_FILE_ID = rf"""UPDATE public.etl_1
            SET file_id = public.dim_file.id
            FROM public.dim_file
            WHERE public.etl_1.uri_stem = public.dim_file.uri_stem
              AND (public.etl_1.bytes_sent = public.dim_file.bytes_sent
                   OR (public.etl_1.bytes_sent IS NULL
                       AND public.dim_file.bytes_sent IS NULL))
              AND public.etl_1.file_id IS NULL;"""
        INSERT_VISIT_ID = rf"""UPDATE public.etl_1
            SET visit_id = public.dim_visit.id
            FROM public.dim_visit
            WHERE public.etl_1.client_cookie = public.dim_visit.client_cookie
              AND public.etl_1.visit_id IS NULL;"""
        try:
            cursor.execute(INSERT_DATE_ID)
            cursor.execute(INSERT_TIME_ID)
            cursor.execute(INSERT_LOCATION_ID)
            cursor.execute(INSERT_REQUEST_ID)
            cursor.execute(INSERT_FILE_ID)
            cursor.execute(INSERT_VISIT_ID)
        except Exception as err:
            print('Error: ', err)
        # Mark all pending fact rows as processed
        cursor.execute(rf"""UPDATE public.etl_1 SET in_etl_2=True
                            WHERE in_etl_2=False""")
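
# The NULL-safe match used for client_referrer and bytes_sent generalizes to
# any nullable column pair: plain `a = b` is never true when either side is
# NULL, so the join has to spell the NULL case out explicitly:
#
#     (etl.col = dim.col OR (etl.col IS NULL AND dim.col IS NULL))
#
# Some SQL dialects abbreviate this as `a IS NOT DISTINCT FROM b`; whether
# the target Redshift cluster accepts that form would need checking, so the
# explicit OR is the safe spelling here.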
def copy_to_redshift():
    """Executes an SQL command to copy an S3 folder to a Redshift table."""
    try:
        with UseRedshift(redshift_db_config) as cursor:
            SQL_COPY = rf"""COPY public.sparktest
                FROM 's3://la-ticket-bucket-eu/spark-etl5'
                IAM_ROLE 'arn:aws:iam::900056063831:role/RedshiftCopyUnload'
                FORMAT AS PARQUET;"""
            cursor.execute(SQL_COPY)
    except Exception as err:
        print('Error: ', err)
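
# When a COPY fails, Redshift records the reason per file and line in the
# stl_load_errors system table. A short debugging query that could be run
# through the same cursor pattern as above to see the most recent failures:
#
#     SELECT starttime, filename, line_number, colname, err_reason
#     FROM stl_load_errors
#     ORDER BY starttime DESC
#     LIMIT 10;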
def create_table():
    with UseRedshift(redshift_db_config) as cursor:
        SQL_CREATE_DATE = rf"""CREATE TABLE IF NOT EXISTS public.dim_date (
            id VARCHAR(50),
            date DATE,
            day INTEGER,
            week INTEGER,
            month INTEGER,
            quarter INTEGER,
            year INTEGER)"""
        SQL_CREATE_TIME = rf"""CREATE TABLE IF NOT EXISTS public.dim_time (
            id VARCHAR(50),
            time TIMESTAMP,
            hour INTEGER,
            minute INTEGER,
            second INTEGER)"""
        SQL_CREATE_LOCATION = rf"""CREATE TABLE IF NOT EXISTS public.dim_location (
            id VARCHAR(50),
            client_ip VARCHAR(200),
            postcode VARCHAR(10),
            city VARCHAR(300),
            region VARCHAR(200),
            country VARCHAR(200))"""
        SQL_CREATE_REQUEST = rf"""CREATE TABLE IF NOT EXISTS public.dim_request (
            id VARCHAR(50),
            method VARCHAR(10),
            client_browser VARCHAR(1000),
            client_referrer VARCHAR(1000),
            status INTEGER,
            duration INTEGER)"""
        SQL_CREATE_FILE = rf"""CREATE TABLE IF NOT EXISTS public.dim_file (
            id VARCHAR(50),
            uri_stem VARCHAR(80),
            bytes_sent INTEGER,
            file_type VARCHAR(10),
            is_crawler BOOLEAN)"""
        SQL_CREATE_VISIT = rf"""CREATE TABLE IF NOT EXISTS public.dim_visit (
            id VARCHAR(50),
            client_cookie VARCHAR(1000))"""
        try:
            print('-------- Executing CREATE TABLE --------')
            cursor.execute(SQL_CREATE_DATE)
            cursor.execute(SQL_CREATE_TIME)
            cursor.execute(SQL_CREATE_LOCATION)
            cursor.execute(SQL_CREATE_REQUEST)
            cursor.execute(SQL_CREATE_FILE)
            cursor.execute(SQL_CREATE_VISIT)
            print('-------- SQL CREATE Complete --------')
        except Exception as err:
            print('Error executing SQL: ', err)
def insert_into_visit():
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.client_cookie
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_visit dim_visit
                                ON dim_visit.client_cookie = etl_1.client_cookie
                            WHERE etl_1.in_etl_2=False
                              AND dim_visit.client_cookie IS NULL""")
        cookies = cursor.fetchall()
        for row in cookies:
            # Store missing cookies as empty strings so the equality join in
            # insert_ids_to_fact() can still match them
            client_cookie = row[0] if row[0] is not None else ''
            visit_id = str(uuid.uuid4())
            print('-------- Executing INSERT VISIT Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_visit(
                     id, client_cookie)
                     VALUES(%s, %s)""",
                (visit_id, client_cookie))
            print('-------- Insert Statement Complete --------')
def insert_into_request():
    with UseRedshift(redshift_db_config) as cursor:
        cursor.execute(rf"""SELECT DISTINCT etl_1.method, etl_1.client_browser,
                                   etl_1.client_referrer, etl_1.status,
                                   etl_1.duration
                            FROM public.etl_1 etl_1
                            LEFT JOIN public.dim_request dim_request
                                ON dim_request.method = etl_1.method
                                AND dim_request.client_browser = etl_1.client_browser
                                AND dim_request.client_referrer = etl_1.client_referrer
                                AND dim_request.status = etl_1.status
                                AND dim_request.duration = etl_1.duration
                            WHERE etl_1.in_etl_2=False
                              AND dim_request.method IS NULL
                              AND dim_request.client_browser IS NULL
                              AND dim_request.client_referrer IS NULL
                              AND dim_request.status IS NULL
                              AND dim_request.duration IS NULL""")
        request_results = cursor.fetchall()
        for row in request_results:
            method, client_browser, client_referrer, status, duration = row
            request_id = str(uuid.uuid4())
            print('-------- Executing INSERT REQUEST Statement --------')
            cursor.execute(
                rf"""INSERT INTO public.dim_request(
                     id, method, client_browser, client_referrer, status, duration)
                     VALUES(%s, %s, %s, %s, %s, %s)""",
                (request_id, method, client_browser, client_referrer,
                 status, duration))
            print('-------- Insert Statement Complete --------')
def create_table():
    with UseRedshift(redshift_db_config) as cursor:
        SQL_CREATE = rf"""CREATE TABLE IF NOT EXISTS public.etl_1 (
            id VARCHAR(50),
            date_id VARCHAR(50),
            time_id VARCHAR(50),
            location_id VARCHAR(50),
            request_id VARCHAR(50),
            file_id VARCHAR(50),
            visit_id VARCHAR(50),
            date DATE,
            time TIMESTAMP,
            server_ip VARCHAR(20),
            method VARCHAR(10),
            uri_stem VARCHAR(80),
            uri_query VARCHAR(1000),
            server_port INT,
            username VARCHAR(60),
            client_ip VARCHAR(20),
            client_browser VARCHAR(1000),
            client_cookie VARCHAR(1000),
            client_referrer VARCHAR(1000),
            status INT,
            substatus INT,
            win32_status INT,
            bytes_sent INT,
            bytes_received INT,
            duration INT,
            in_etl_2 BOOLEAN)"""
        try:
            print('-------- Executing CREATE TABLE --------')
            cursor.execute(SQL_CREATE)
            print('-------- SQL CREATE Complete --------')
        except Exception as err:
            print('Error executing SQL: ', err)
def insert_into_table():
    with UseRedshift(redshift_db_config) as cursor:
        s3_client = boto3.client('s3',
                                 aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_key)

        ######## CODE TO ITERATE THROUGH ALL FILES IN S3 BUCKET ########
        # bucket_contents = s3_client.list_objects(Bucket='la-ticket-bucket-eu',
        #                                          Prefix='BI_logs/')
        # file_names = []
        # for i in bucket_contents['Contents']:
        #     file_names.append(i['Key'])
        # file_names.pop(0)  # drop the 'BI_logs/' folder key, which is not a file
        # for log in file_names:
        ######## With too many files in the bucket, the script freezes after ########
        ######## a few hours, so instead of iterating through the whole bucket ######
        ######## it processes one file at a time ########

        file_object = s3_client.get_object(Bucket='la-ticket-bucket-eu',
                                           Key='BI_logs/u_ex110407.log')
        file_contents = file_object['Body'].read().decode().split('\n')
        in_etl_1_flag = False
        split_logs = [line.split(' ') for line in file_contents]
        for log in split_logs:
            if log[0].startswith('#'):
                continue  # skip IIS header/comment lines
            if len(log) == 18:
                print('-------- Executing INSERT Statement --------')
                cursor.execute(
                    rf"""INSERT INTO public.s3_load(
                         date, time, server_ip, method, uri_stem, uri_query,
                         server_port, username, client_ip, client_browser,
                         client_cookie, client_referrer, status, substatus,
                         win32_status, bytes_sent, bytes_received, duration,
                         in_etl_1)
                         VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                                 %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (*log[0:18], in_etl_1_flag))
                print('-------- Insert Statement Complete --------')
            elif len(log) == 14:
                print('-------- Executing INSERT Statement --------')
                cursor.execute(
                    rf"""INSERT INTO public.s3_load(
                         date, time, server_ip, method, uri_stem, uri_query,
                         server_port, username, client_ip, client_browser,
                         status, substatus, win32_status, duration, in_etl_1)
                         VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                                 %s, %s, %s, %s, %s)""",
                    (*log[0:14], in_etl_1_flag))
                print('-------- Insert Statement Complete --------')
            else:
                print('Unknown log length.')
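
# The commented-out block above pulls the entire bucket listing into memory
# in one call (list_objects also caps out at 1000 keys per response), which
# is one plausible contributor to the long-run stalls -- though the
# row-by-row INSERTs are another candidate. boto3's paginator streams the
# listing page by page instead. A minimal sketch, assuming the same bucket,
# prefix, and credential variables as above; the generator name is an
# illustration:

def iter_log_keys(bucket='la-ticket-bucket-eu', prefix='BI_logs/'):
    """Yield object keys under the prefix, one listing page at a time."""
    s3_client = boto3.client('s3',
                             aws_access_key_id=aws_access_key,
                             aws_secret_access_key=aws_secret_key)
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            if not obj['Key'].endswith('/'):  # skip the folder placeholder
                yield obj['Key']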
def insert_into_table():
    with UseRedshift(redshift_db_config) as cursor:
        # Select all rows not already present in the public.etl_1 table
        cursor.execute(rf"""SELECT * FROM public.s3_load WHERE in_etl_1 = False""")
        in_etl_2_flag = False
        logs = cursor.fetchall()
        for single_log in logs:
            if len(single_log) == 19:
                new_row = []
                new_row.append(single_log[0])  # date
                # Combine the date and time strings into one timestamp string
                new_timestamp = " ".join(single_log[0:2])
                new_row.append(new_timestamp)  # timestamp
                new_row.append(single_log[2])  # server_ip
                new_row.append(single_log[3])  # method
                new_row.append(single_log[4])  # uri_stem
                new_row.append(single_log[5])  # uri_query
                new_row.append(int(single_log[6]))  # server_port
                new_row.append(single_log[7])  # username
                new_row.append(single_log[8])  # client_ip
                new_row.append(single_log[9])  # client_browser
                # Replace a missing cookie with an empty string
                if single_log[10] is None:
                    new_row.append('')
                else:
                    new_row.append(single_log[10])  # client_cookie
                new_row.append(single_log[11])  # client_referrer
                new_row.append(int(single_log[12]))  # status
                new_row.append(int(single_log[13]))  # substatus
                new_row.append(int(single_log[14]))  # win32_status
                # bytes_sent and bytes_received may be NULL; only cast real values
                if single_log[15] is None:
                    new_row.append(single_log[15])  # bytes_sent
                else:
                    new_row.append(int(single_log[15]))
                if single_log[16] is None:
                    new_row.append(single_log[16])  # bytes_received
                else:
                    new_row.append(int(single_log[16]))
                new_row.append(int(single_log[17]))  # duration
                log_id = str(uuid.uuid4())
                print('.')
                cursor.execute(rf"""INSERT INTO public.etl_1(
                    id, date, time, server_ip, method, uri_stem, uri_query,
                    server_port, username, client_ip, client_browser,
                    client_cookie, client_referrer, status, substatus,
                    win32_status, bytes_sent, bytes_received, duration, in_etl_2)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (log_id, new_row[0], new_row[1], new_row[2], new_row[3],
                     new_row[4], new_row[5], new_row[6], new_row[7], new_row[8],
                     new_row[9], new_row[10], new_row[11], new_row[12],
                     new_row[13], new_row[14], new_row[15], new_row[16],
                     new_row[17], in_etl_2_flag))
                print('.')
            elif len(single_log) == 15:
                # Note: SELECT * returns all 19 columns for every row (missing
                # fields come back as NULL), so this branch is unlikely to be
                # reached; it is kept from the original flow.
                new_row = []
                new_row.append(single_log[0])  # date
                new_timestamp = " ".join(single_log[0:2])
                new_row.append(new_timestamp)  # timestamp
                new_row.append(single_log[2])  # server_ip
                new_row.append(single_log[3])  # method
                new_row.append(single_log[4])  # uri_stem
                new_row.append(single_log[5])  # uri_query
                new_row.append(int(single_log[6]))  # server_port
                new_row.append(single_log[7])  # username
                new_row.append(single_log[8])  # client_ip
                new_row.append(single_log[9])  # client_browser
                new_row.append(int(single_log[10]))  # status
                new_row.append(int(single_log[11]))  # substatus
                new_row.append(int(single_log[12]))  # win32_status
                new_row.append(int(single_log[13]))  # duration
                new_row.append('')  # replace the client_cookie NULL with ''
                log_id = str(uuid.uuid4())
                print('.')
                cursor.execute(rf"""INSERT INTO public.etl_1(
                    id, date, time, server_ip, method, uri_stem, uri_query,
                    server_port, username, client_ip, client_browser,
                    status, substatus, win32_status, duration, client_cookie,
                    in_etl_2)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s,
                            %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (log_id, new_row[0], new_row[1], new_row[2], new_row[3],
                     new_row[4], new_row[5], new_row[6], new_row[7], new_row[8],
                     new_row[9], new_row[10], new_row[11], new_row[12],
                     new_row[13], new_row[14], in_etl_2_flag))
                print('.')
            else:
                print('Unknown row length: ', len(single_log))
        cursor.execute(rf"""UPDATE public.s3_load SET in_etl_1=True
                            WHERE in_etl_1=False""")
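
# Row-by-row INSERTs are the slowest way to load Redshift and another
# plausible cause of multi-hour runs. If the driver is psycopg2, the rows
# could be accumulated in a list and written in one round trip with
# psycopg2.extras.execute_values -- a hedged sketch (the batching and the
# helper name are assumptions; the project inserts one row at a time):

from psycopg2.extras import execute_values

def insert_rows(cursor, rows):
    """Insert many pre-built etl_1 rows in a single statement."""
    execute_values(
        cursor,
        """INSERT INTO public.etl_1(
           id, date, time, server_ip, method, uri_stem, uri_query,
           server_port, username, client_ip, client_browser, client_cookie,
           client_referrer, status, substatus, win32_status, bytes_sent,
           bytes_received, duration, in_etl_2)
           VALUES %s""",
        rows)  # rows: list of 20-tuples matching the column order above

# For bulk volumes, the canonical fast path is still COPY from S3, as
# copy_to_redshift() above demonstrates.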