def drop_duplicates_from_db(db_table=None, groupby_cols=None, keep_last=True, *args, **kwargs): """ :Description: Drop duplicate records from specified database table :Params: db_table: Name of the database table type: str default: None groupby_cols: list of columns to check for duplicates type: list default: None keep_last: keep the last duplicated entry? type: bool default: True returns: Nothing :Dependencies: Python3 psycopg2 :Example: drop_duplicates_from_db(db_table='Waze', groupby_cols=['ID', 'Date_Time'], keep_last=True) """ import psycopg2 params = get_params(*args, **kwargs) # Make sure the type is list if type(groupby_cols) == list: cols = '", "'.join(groupby_cols) else: cols = '", "'.join([groupby_cols]) if keep_last: keep = 'MAX' else: keep = 'MIN' # Build command command = '''DELETE FROM ''' + '{0}."{1}" WHERE ctid NOT IN (SELECT {2}(ctid) FROM {0}."{1}" ' \ 'GROUP BY "{3}");'.format(params['schema'], db_table, keep, cols) # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format(params['host'], params['database'], params['port'], params['username'], params['password']) # Connect to database conn = psycopg2.connect(conn_str) cursor = conn.cursor() # Execute command cursor.execute(command) conn.commit()
def create_new_column(db_table, col_name, col_type, *args, **kwargs): """ :Description: Add a column to a database table :Params: db_table: Database table name type: str col_name: Name of the new column type: str col_type: Datatype of the new column type: str :Returns: Nothing, creates new column in database table if it doesn't exist :Dependencies: Python3 psycopg2 :Example: create_new_column('some_table', 'new_col', 'integer') """ import psycopg2 params = get_params(*args, **kwargs) # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format( params['host'], params['database'], params['port'], params['username'], params['password']) base_command = '''SELECT * FROM ''' + params['schema'] # Create the database query command = base_command + '."{0}" ADD COLUMN IF NOT EXISTS "{1}" {2}'.format( db_table, col_name, col_type) # Connect to the database conn = psycopg2.connect(conn_str) cursor = conn.cursor() # Execute the command cursor.execute(command) conn.commit()
def update_db_cell(db_table=None, new_value=None, new_column=None, column_values=None, update_all=False, indent='', *args, **kwargs): """ :Description: Pushes new status to the internal status dashboard database table Removes old status from the table :Params: db_table: Name of the database table type: str default: None new_value: New value for the cell type: str default: None new_column: Column of the new value type: str default: None column_values: Column names and values to match type: dict format: {column_name_0: value_0, column_name_1: value_1, ...} default: None update_all: Update every cell matching column_values? type: bool default: False text: type: bool default: False returns: Nothing, updates the specified cell of a database table :Dependencies: Python3 pandas psycopg2 :Notes: If new_value is set to None, the row will be deleted :Example: update_internal_status_db('automation/waze/some_script', 'VM1', True, 'Something went wrong...script failed') """ from psycopg2 import connect params = get_params(*args, **kwargs) wheres = [] for i in column_values: wheres.append("\"{0}\" = '{1}'".format(i, column_values[i])) wheres = ' AND '.join(wheres) command = '''SELECT COUNT(*) FROM ''' + '{0}."{1}" WHERE {2}'.format( params['schema'], db_table, wheres) # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format( params['host'], params['database'], params['port'], params['username'], params['password']) # Connect to database conn = connect(conn_str) cursor = conn.cursor() cursor.execute(command) conn.commit() records = cursor.fetchall() num = records[0][0] if num == 1: if new_value is not None: command = '''UPDATE ''' + '{0}."{1}" SET "{2}" = \'{3}\' WHERE {4}'.format( params['schema'], db_table, new_column, new_value, wheres) msg = 'Value updated' else: command = '''DELETE FROM ''' + '{0}."{1}" WHERE {2}'.format( params['schema'], db_table, wheres) msg = 'Row deleted' # Execute command cursor.execute(command) conn.commit() print(indent + msg) elif num == 0: print(indent + 'Location not found') elif (num > 1) & update_all: if new_value is not None: command = '''UPDATE ''' + '{0}."{1}" SET "{2}" = \'{3}\' WHERE {4}'.format( params['schema'], db_table, new_column, new_value, wheres) msg = 'All values updated' else: command = '''DELETE FROM ''' + '{0}."{1}" WHERE {2}'.format( params['schema'], db_table, wheres) msg = 'All rows deleted' # Execute command cursor.execute(command) conn.commit() print(indent + msg) else: print( indent + 'Multiple locations found. Be more specific or set "update_all=True" to update value in all locations.' ) conn.close()
def to_db_new(df, db_table, d_types=None, index=False, chunk_size=None, *args, **kwargs): """ :Description: Pushes data to the selected database table :Params: df: Data type: pandas DataFrame db_table: The name of the database table type: str d_types: The datatypes of each column type: dict default: None index: Whether or not to use index type: bool default: False chunk_size: Number of rows to send at a time type: int default: None returns: Nothing, pushes data to database table :Dependencies: Python3 sqlalchemy :Example: to_db_new(df, 'Waze') """ import psycopg2 import traceback from pandas.api.types import is_numeric_dtype from generic.get_params import get_params params = get_params(*args, **kwargs) dd = df.copy() if index: df.reset_index(inplace=True) cols = dd.columns for c in cols: if is_numeric_dtype(dd[c]): dd[c] = dd[c].astype(str) else: dd[c] = dd[c].astype(str) dd[c] = "'" + dd[c] + "'" dd = dd.to_numpy() # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format(params['host'], params['database'], params['port'], params['username'], params['password']) try: # Connect to database conn = psycopg2.connect(conn_str) # Create new cursor cursor = conn.cursor() n = 0 L = len(dd) cols = ', '.join(['"{0}"'.format(i) for i in cols]) if chunk_size is None: chunk_size = L exists = check_if_table_exists(cursor, params['schema'], db_table) if not exists: create_table(df.head(0), db_table, params, d_types) command = '''INSERT INTO ''' + '{0}."{1}" ({2}) VALUES ('.format(params['schema'], db_table, cols) while n < L: # Chunk data and convert to string data = "), (".join([", ".join(i) for i in dd[n: n + chunk_size]]) + ')' # Execute command cursor.execute(command + data) n += chunk_size # Commit changes to database conn.commit() # Close connection to database cursor.close() except: print(traceback.format_exc()) finally: # Make sure connection to database is closed if conn is not None: conn.close()
def load_data(db_table=None, start=None, end=None, date_col='date_time', chunk_size=100000, time_zone='America/New_York', *args, **kwargs): """ :Description: Load data from database :Params: db_table: Database table name type: str default: None start: Start date[time] type: str default: None format: YYYY-MM-DD [HH:MM:SS] end: End date[time] type: str default: None format: YYYY-MM-DD [HH:MM:SS] date_col: Datetime column type: str default: date_time chunk_size: Number of rows to load at a time type: int default: 100000 :Returns: df: Data from given table for specified time frame type: pandas DataFrame :Dependencies: Python3 psycopg2 :Notes: If start is not given, all data before end will be loaded If end is not given, all data after start will be loaded If neither start nor end are given, all data will be loaded :Example: load_data(db_table='some_table', start='2019-01-01') """ import psycopg2 import pandas as pd params = get_params(*args, **kwargs) base_command = '''SELECT * FROM ''' + '{0}."{1}"'.format( params['schema'], db_table) # Create the database query if start is None and end is None: pass elif start is None: base_command = base_command + ' WHERE "{1}" < \'{2} {3}\''.format( db_table, date_col, end, time_zone) elif end is None: base_command = base_command + ' WHERE "{1}" >= \'{2} {3}\''.format( db_table, date_col, start, time_zone) else: base_command = base_command + ' WHERE "{1}" >= \'{2} {4}\' AND "{1}" < \'{3} {4}\''.format( db_table, date_col, start, end, time_zone) # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format( params['host'], params['database'], params['port'], params['username'], params['password']) # Connect to the database conn = psycopg2.connect(conn_str) cursor = conn.cursor() # Grab the first row from the database table # Needed to get the column names cursor.execute(base_command + ' LIMIT 1'.format(db_table)) col_names = [desc[0] for desc in cursor.description] df = pd.DataFrame() # Iterate through the database rows # Needed if the query selection is too large to fit in the RAM n = 0 while True: # Add LIMIT and OFFSET to the command command = base_command + " LIMIT " + str( chunk_size) + " OFFSET " + str(n * chunk_size) # Execute the command cursor.execute(command) records = cursor.fetchall() temp = pd.DataFrame(records) # Breaks when finished try: temp.columns = col_names except ValueError: break df = df.append(temp) n += 1 del temp # Convert date column to datetime format try: df[date_col] = pd.to_datetime(df[date_col], infer_datetime_format=True) except: pass if df.empty: df = pd.DataFrame(columns=col_names) return df
def drop_from_db(db_table=None, date_col='date_time', pre_date=None, post_date=None, *args, **kwargs): """ :Description: Drops all entries between the start/end dates from the specified database table :Params: db_table: The name of the database table type: str default: None date_col: The name of the date column type: str default: None pre_date: The starting date type: str format: YYYY-MM-DD default: None post_date: The ending date type: str format: YYYY-MM-DD default: None returns: Nothing :Dependencies: Python3 psycopg2 :Notes: Will grab all data if start/end dates are not specified :Example: drop_from_db(db_table='Waze', date_col='Date_Time', start='2018-01-01', end='2018-02-01') """ import psycopg2 params = get_params(*args, **kwargs) print('{0}\n{1}\t-\t{2}\n'.format(db_table, pre_date, post_date)) # Build command if (pre_date is None) and (post_date is None): command = '''DELETE FROM ''' + '{0}."{1}"'.format(params['schema'], db_table) elif pre_date is None: command = '''DELETE FROM ''' + '{0}."{1}" WHERE "{2}" < \'{3}\''.format(params['schema'], db_table, date_col, post_date) elif post_date is None: command = '''DELETE FROM ''' + '{0}."{1}" WHERE "{2}" >= \'{3}\''.format(params['schema'], db_table, date_col, pre_date) else: command = '''DELETE FROM ''' + '{0}."{1}" WHERE "{2}" >= \'{3}\' AND "{2}" < \'{4}\''.format(params['schema'], db_table, date_col, pre_date, post_date) # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format(params['host'], params['database'], params['port'], params['username'], params['password']) # Connect to database conn = psycopg2.connect(conn_str) cursor = conn.cursor() # Execute command cursor.execute(command) conn.commit()
def update_internal_status_db(script, location, flag, description, date=None, db_table='internal_status_dashboard', *args, **kwargs): """ :Description: Pushes new status to the internal status dashboard database table Removes old status from the table :Params: script: Name of the script type: str location: Location of the script type: str flag: Did the script trip a flag? type: bool description: Description of the flag type: str date: Date/time that the script was run type: str default: None db_table: Name of the database table type: str default: internal_status_dashboard date_col: Name of the date column type: str default: date_time returns: Nothing, updates the internal status dashboard database table :Dependencies: Python3 pandas :Example: update_internal_status_db('automation/waze/some_script', 'VM1', True, 'Something went wrong...script failed') """ from time import gmtime, strftime from datetime import datetime from psycopg2 import connect params = get_params(*args, **kwargs) # Grab the current UTC date if date is None: date = gmtime() date = strftime('%Y-%m-%d %H:%M:%S', date) date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') print(date) # Convert the flag to a string if flag: flag = 'FLAG' else: flag = '' # Assemble the database metadata conn_str = "host='{0}' dbname='{1}' port='{2}' user='******' password='******'".format( params['host'], params['database'], params['port'], params['username'], params['password']) # Connect to database conn = connect(conn_str) cursor = conn.cursor() command = '''SELECT COUNT(*) FROM ''' + \ "{0}.\"{1}\" WHERE \"location\" LIKE '{2}' AND script LIKE '{3}'".format(params['schema'], db_table, location, script) cursor.execute(command) conn.commit() records = cursor.fetchall() num = records[0][0] # Will fail if script doesn't exist in table yet if num == 1: # Update flag command = '''UPDATE ''' + '{0}."{1}" SET "flag"'.format(params['schema'], db_table) + \ " = '{0}' WHERE \"location\" LIKE '{1}' AND script LIKE '{2}'".format(flag, location, script) cursor.execute(command) conn.commit() # Update description command = '''UPDATE ''' + '{0}."{1}" SET "description"'.format(params['schema'], db_table) + \ " = '{0}' WHERE \"location\" LIKE '{1}' AND script LIKE '{2}'".format(description, location, script) cursor.execute(command) conn.commit() # Update date_time command = '''UPDATE ''' + '{0}."{1}" SET "date_time"'.format(params['schema'], db_table) + \ " = '{0}' WHERE \"location\" LIKE '{1}' AND script LIKE '{2}'".format(date, location, script) # Execute command cursor.execute(command) conn.commit() elif num == 0: print('script doesnt exist yet') from generic.to_db import to_db import pandas as pd df = pd.DataFrame( { 'script': script, 'date_time': date, 'location': location, 'flag': flag, 'description': description }, index=[0]) # Push new status to database to_db(df, db_table) conn.close()