def filter_none_from_frequency_count(frequency_count):
    """Drop the rows of a frequency count whose property is None.

    A frequency count is a list of tuples or a list of lists, in which the
    first element is the property, and the second element is the count.

    Parameters
    ----------
    frequency_count: list of lists or tuples with length 2

    Returns
    -------
    new_frequency_count: list of lists or tuples with length 2
        New list with the rows of the input, in their original order,
        except the rows whose property is None.

    Example
    -------
    >> a = [ [None, 4], [1, 2], [2, 3], [10, 1] ]
    >> UtilsContainer.filter_none_from_frequency_count(a)
    [ [1, 2], [2, 3], [10, 1] ]

    >> a = [ [None, 4], ['None', 2], ['c', 3], ['d', 1] ]
    >> UtilsContainer.filter_none_from_frequency_count(a)
    [ ['None', 2], ['c', 3], ['d', 1] ]
    """
    Assert.frequency_count(frequency_count)
    kept_rows = []
    for row in frequency_count:
        # Identity check on purpose: the string 'None' is a regular property.
        if row[0] is None:
            continue
        kept_rows.append(row)
    return kept_rows
def create_database(self, db_name, force_create=False):
    """Create an empty database.

    Parameters
    ----------
    db_name: string
        Name of the to be created database
    force_create: bool
        Flag indicating if a possible existing database with the same name
        shall be overwritten or not. If force_create = False and there is
        already a database with the name db_name, nothing happens.

    Returns
    -------
    None

    Postconditions
    --------------
    Database with the given name exists.
    If force_create=True: database with the given name is empty.
    """
    Assert.py_type(db_name, str, 'db_name')
    Assert.py_type(force_create, bool, 'force_create')

    if force_create:
        # Start from scratch: remove any existing database first.
        self.drop_database(db_name, confirm=True)
        create = "CREATE DATABASE {0} DEFAULT CHARACTER SET 'utf8'"
    else:
        # IF NOT EXISTS makes the statement a no-op for an existing
        # database, as the docstring promises, instead of a MySQL error.
        create = "CREATE DATABASE IF NOT EXISTS {0} DEFAULT CHARACTER SET 'utf8'"
    self.cursor.execute(create.format(db_name))

    if self.info:
        # NOTE(review): also printed when the database already existed and
        # was left untouched; the server response does not tell these apart.
        print ('Database {0} created'.format(db_name))
def index_column(self, table, column):
    """Add an ascending index named `<column>_idx` on a column of a Table.

    Nothing is executed when `self._index_exists(table, column)` reports
    that the index is already there.

    Parameters
    ----------
    table: string
        Name of the Table to index.
    column: string
        Name of the column to index.

    Returns
    -------
    None

    Prerequisites
    -------------
    Table exists (in current database if no database is specified).
    Column exists in Table
    """
    Assert.py_type(table, str, 'table')
    Assert.py_type(column, str, 'column')
    assert self.table_exists(table)

    if self._index_exists(table, column):
        return

    add_index = ''' ALTER TABLE {table} ADD INDEX `{col}_idx` (`{col}` ASC) '''.format(table=table, col=column)
    self.execute(add_index)
def filter_array(lst):
    """Remove NaN values from a list or np.array.

    When np.isnan cannot be applied to the input (it raises a TypeError),
    every element is considered valid and the input content is returned
    unaltered. The result has the same type as the input.

    Example
    -------
    >> L = [1, 2, 3, np.nan]
    >> UtilsContainer.filter_array(L)
    [1, 2, 3]

    >> L = np.array(['a', 'b', 'c'])
    >> UtilsContainer.filter_array(L)
    ['a' 'b' 'c']
    """
    Assert.seq(lst)

    if not isinstance(lst, (list, np.ndarray)):
        # Same failure as before for unsupported sequence types; np.isnan is
        # side-effect free, so checking first does not change the outcome
        # for supported inputs.
        try:
            np.isnan(lst)
        except TypeError:
            np.zeros_like(lst, dtype=bool)
        msg = 'Input shall be either list or numpy array, is now a {}'.format(type(lst))
        raise AssertionError(msg)

    try:
        nan_mask = np.isnan(lst)
    except TypeError:
        # np.isnan is not defined for this content: nothing is invalid.
        nan_mask = np.zeros_like(lst, dtype=bool)
    keep_mask = np.logical_not(nan_mask)

    if isinstance(lst, np.ndarray):
        result = lst[keep_mask]
    else:
        result = [item for item, keep in zip(lst, keep_mask) if keep]

    assert type(lst) == type(result)
    return result
def column_exists(self, table, column):
    """Tell whether a Table has a column with the given name.

    Parameters
    ----------
    table: string
        Name of the table to look in.
    column: string
        Name of the column to look for.

    Returns
    -------
    column_exists: bool
        True when `column` is among the columns reported by
        `self.show_columns(table)`.

    Prerequisites
    -------------
    The Table `table` shall exist.
    """
    Assert.py_type(table, str, 'table')
    Assert.py_type(column, str, 'column')

    missing_table_msg = 'The Table `{0}` shall exist.'.format(table)
    assert self.table_exists(table), missing_table_msg

    return column in self.show_columns(table)
def checksum_table(self, table):
    """Run CHECKSUM TABLE on the given Table and return the checksum.

    Parameters
    ----------
    table: string
        Name of the Table to calculate the checksum of.

    Returns
    -------
    checksum: long int
        Second field of the row returned by the CHECKSUM TABLE statement.

    Prerequisites
    -------------
    Table shall exist (in current database if no database is specified).

    Notes
    -----
    During the checksum operation, the table is locked with a read lock
    for InnoDB and MyISAM.
    """
    Assert.py_type(table, str, 'table')
    assert self.table_exists(table), \
        'Table {0} shall exist (in current database if no database is specified).'.format(table)

    self.execute(''' CHECKSUM TABLE {0} '''.format(table))
    checksum_row = self.fetchone()
    # The row holds the table name first, the checksum second.
    return checksum_row[1]
def truncate_table(self, table, disable_foreign_key_checks=False):
    """Truncate a table from the database.

    Raises an error if the table does not exist.

    Parameters
    ----------
    table: string
        Name of the table to truncate.
    disable_foreign_key_checks: boolean
        If True, foreign key checks are switched off for the duration of
        the TRUNCATE statement and switched on again afterwards, also when
        the TRUNCATE statement fails.

    Returns
    -------
    None
    """
    Assert.py_type(table, str, 'table')
    Assert.table_exists(table, self)

    truncate = 'TRUNCATE {}'.format(table)
    if not disable_foreign_key_checks:
        self.execute(truncate)
        return

    # Temporarily disable foreign key checks
    self.execute('''SET FOREIGN_KEY_CHECKS = 0''')
    try:
        self.execute(truncate)
    finally:
        # Reenable foreign key checks, even if the truncate raised, so the
        # connection is never left with the checks switched off.
        self.execute('''SET FOREIGN_KEY_CHECKS = 1''')
def __init__(self, config=None, database=None, info_mode=True, debug_mode=False):
    """Open a MySQL connection and optionally select a database.

    Parameters
    ----------
    config: dict or None
        Connection settings; the keys 'user', 'passwd', 'host' and 'port'
        are required. The optional key 'default_db' names the database to
        connect to when `database` is not given. If None, a built-in
        configuration is used.
    database: string or None
        Name of the database to connect to. Takes precedence over
        config['default_db'].
    info_mode: bool
        Stored as `self.info`.
    debug_mode: bool
        Stored as `self.debug`.
    """
    if config is None:
        # NOTE(review): fallback credentials are embedded in the source;
        # consider loading them from configuration outside the repository.
        config = dict(user='******', passwd='Nyon6966', host='83.161.215.203', port=8889)
    Assert.py_type(config, dict, 'MySQL configuration')

    required_keys = ('user', 'passwd', 'host', 'port')
    for key in required_keys:
        assert key in config
    self.config = config

    # Only the connection settings are passed on; 'default_db' is handled below.
    self.cnx = self._connect_to_mysql({key: config[key] for key in required_keys})
    self.cursor = self.cnx.cursor()
    self.debug = debug_mode
    self.info = info_mode

    if database is not None:
        self.connect_to_database(database)
    elif config.get('default_db') is not None:
        self.connect_to_database(config['default_db'])

    self.cnx.autocommit(True)
def frequency_count2occurrence_list(frequency_count):
    """Transform a frequency count into an occurrence list.

    A frequency count is a list of tuples or a list of lists, in which the
    first element is the property, and the second element is the count.
    An occurrence list is a list with each property <count> times in it.
    Rows whose property is None are skipped.

    Parameters
    ----------
    frequency_count: list of lists or tuples with length 2

    Returns
    -------
    occurrence_list: np.array, dtype = <type(most general key in frequency_count)>
        Integer keys stay integers and string keys keep their full length.
        When there are no rows with a property other than None, an empty
        array (np.empty(0)) is returned.

    Example
    -------
    >> a = [ [0, 4], [1, 2], [2, 3], [10, 1] ]
    >> UtilsContainer.frequency_count2occurrence_list(a)
    [ 0  0  0  0  1  1  2  2  2 10]

    >> a = [ ['a', 4], ['b', 2], ['c', 3], ['d', 1] ]
    >> UtilsContainer.frequency_count2occurrence_list(a)
    ['a' 'a' 'a' 'a' 'b' 'b' 'c' 'c' 'c' 'd']
    """
    Assert.frequency_count(frequency_count)

    rows = [(key, count) for key, count in frequency_count if key is not None]
    if not rows:
        return np.empty(0)

    keys, counts = zip(*rows)
    # np.array picks the most general dtype for all keys at once, and
    # np.repeat builds the result in a single pass. Seeding with a float
    # array and appending per key (the previous approach) turned integer
    # keys into floats and cut string keys down to their first character.
    return np.repeat(np.array(keys), counts)
def increment_smallest(lst, dx):
    """Add dx, in place, to the smallest element of the np.array lst.

    If the minimum occurs multiple times, the first occurrence is the one
    that is incremented. Nothing is returned.
    """
    Assert.py_type(lst, np.ndarray, 'L')
    smallest_at = np.argmin(lst)
    lst[smallest_at] = lst[smallest_at] + dx
def sneak_preview(self, table):
    """Print at most the first ten rows of the Table to the console.

    The Table shall exist; otherwise an AssertionError is raised.
    """
    Assert.py_type(table, str, 'Input table')
    table_is_known = self.table_exists(table)
    assert table_is_known, 'Table {0} shall exist in the database'.format(table)

    self.execute(''' SELECT * FROM {0} LIMIT 10 '''.format(table))
    first_rows = self.fetchall()
    print (first_rows)
def list2dict(lst):
    """Convert a list to a dictionary that maps each element to its index.

    The keys are the list elements and the values are the indices 0..N-1,
    following the ordering of the list.

    Raises
    ------
    AssertionError
        If the list contains duplicate elements, because then not every
        position of the list can be represented in the dictionary.
    """
    Assert.seq(lst)

    result = {element: index for index, element in enumerate(lst)}

    # The sizes are reported in the order the message names them: the input
    # list first, then the dictionary (they used to be swapped).
    msg = 'All elements of input list ({}) shall be in dictionary ({})'.format(len(lst), len(result))
    assert len(result) == len(lst), msg
    return result
def drop_table(self, table):
    """Drop a table from the database.

    The statement uses IF EXISTS, so nothing happens when the table does
    not exist in the database.

    Parameters
    ----------
    table: string
        Name of the table to drop.

    Returns
    -------
    None
    """
    Assert.py_type(table, str, 'table')
    statement_template = 'DROP TABLE IF EXISTS {0}'
    self.execute(statement_template.format(table))
def table_exists(self, table, db_name=None):
    """Return whether the given Table exists in a database.

    Parameters
    ----------
    table: string
        Name of the table to look up.
    db_name: string or None
        Name of the database to look in. If None, the current database
        is used.

    Returns
    -------
    table_exists: bool
        Flag indicating whether the Table `table` exists in the database.

    Prerequisites
    -------------
    Connection to a database must have been made.
    """
    # TODO: This pattern with temporarily disabling debug information happens more frequently.
    #       Is there a way to write this in some sort of with statement?

    # Temporarily disable debug information
    saved_debug_mode = self.debug
    self.debug = False
    try:
        # Make the assertions
        Assert.py_type(table, str, 'table')
        current_db = self.current_database()
        assert current_db is not None, 'Connection to a database shall have been made.'
        if db_name is None:
            db_name = current_db
    finally:
        # Reenable debug information, also when an assertion failed;
        # otherwise the instance would stay silenced after the error.
        self.debug = saved_debug_mode

    q = ''' SELECT COUNT(`TABLE_NAME`) FROM `INFORMATION_SCHEMA`.TABLES WHERE `TABLE_SCHEMA` = '{}' AND `TABLE_NAME` = '{}' '''.format(db_name, table)

    # Return the outcome
    return bool(self.fetch_as_value(q))
def create_table_like(self, table_ref, table_out, force_create=False):
    """Create an empty Table with columns like another Table.

    Create the Table `table_out` with the column and index definitions of
    Table `table_ref`, but with no contents. This is done with the MySQL
    statement CREATE TABLE ... LIKE ...

    Parameters
    ----------
    table_ref: string
        Name of the reference Table to mimic. May be prefixed with a
        database name (`db.table`).
    table_out: string
        Name of the Table to create.
    force_create: bool
        Flag indicating whether `table_out` shall be overwritten if it already exists.
        If it shall not be overwritten and does exist yet, nothing happens.

    Returns
    -------
    table_created: bool
        Outcome of `self.create_table_from_create`, which reports whether
        `table_out` is indeed written to the DB.

    Prerequisites
    -------------
    Connection to a database must have been made.
    Table 'table_ref' exists (in current database if no database is specified).
    """
    Assert.py_type(table_ref, str, 'table_ref')
    Assert.py_type(table_out, str, 'table_out')
    Assert.py_type(force_create, bool, 'force_create')
    assert self.current_database() is not None, \
        'Connection to a database must have been made.'

    # Split an optional database prefix off the reference table name.
    db_name, dot, table_name = table_ref.partition('.')
    if not dot:
        db_name, table_name = None, table_ref
    assert self.table_exists(table_name, db_name), \
        'Table `table_ref` shall exist (in current database if no database is specified).'

    # Let MySQL copy the definition. Textually replacing the table name in
    # the SHOW CREATE TABLE output also rewrote column and key names that
    # contain the table name, and it never matched a `db.table` reference.
    create_table_query = 'CREATE TABLE {0} LIKE {1}'.format(table_out, table_ref)
    table_created = self.create_table_from_create(create_table_query, force_create)
    return table_created
def create_table_from_select(self, table_out, select_query, force_create=False):
    """Create a table that holds the outcome of a select query.

    The statement `CREATE TABLE <table_out> AS (<select_query>)` is built
    and handed to `self.create_table_from_create`.

    Parameters
    ----------
    table_out: string
        Name of the Table to create.
    select_query: string
        Valid SQL query that returns a Table.
        Example: SELECT <fields> FROM <table> WHERE <condition>
    force_create: bool
        Passed on to `self.create_table_from_create`; indicates whether
        `table_out` shall be overwritten if it already exists.

    Returns
    -------
    table_created: bool
        Outcome of `self.create_table_from_create`.

    Prerequisites
    -------------
    Connection to a database must have been made.
    """
    checked_arguments = (
        (table_out, str, 'table_out'),
        (select_query, str, 'select_query'),
        (force_create, bool, 'force_create'),
    )
    for value, expected_type, label in checked_arguments:
        Assert.py_type(value, expected_type, label)

    connected = self.current_database() is not None
    assert connected, 'Connection to a database must have been made.'

    return self.create_table_from_create(
        'CREATE TABLE {0} AS ({1})'.format(table_out, select_query),
        force_create)
def get_config(predefined_config):
    """Retrieve a predefined config dictionary for DBManager

    Input:
    ------
    predefined_config: string
        Name of the predefined configuration settings. Recognized names:
        'localhost', 'puzzle', 'nyon_office', 'orderwriter_test',
        'orderwriter_live', 'aws_au' and 'aws_ow'. For any other name an
        AssertionError is raised.

    Returns:
    --------
    config: dict
        Dictionary with DB connection configuration ('user', 'passwd',
        'host', 'port' and, for some names, 'default_db'), extended with
        the key 'config' that holds the requested name.
    """
    Assert.py_type(predefined_config, str, 'predefined config')

    # NOTE(review): credentials are embedded in the source; consider moving
    # them to configuration that is kept outside the repository.
    # The table is rebuilt on every call, so callers get a fresh dict.
    presets = {
        'localhost': {
            'user': '******',
            'passwd': '',
            'host': '127.0.0.1',
            'port': 3306,
            'default_db': 'orderwriter',
        },
        'puzzle': {
            'user': '******',
            'passwd': '',
            'host': '127.0.0.1',
            'port': 3306,
            'default_db': 'puzzle_words',
        },
        'nyon_office': {
            'user': '******',
            'passwd': 'Nyon6966',
            'host': '83.161.215.203',
            'port': 8889,
        },
        'orderwriter_test': {
            'user': '******',
            'passwd': 'AbLsLKnq6NJjd8a6',
            'host': '213.206.228.254',
            'port': 3306,
            'default_db': 'orderwriter_20150721',
        },
        'orderwriter_live': {
            'user': '******',
            'passwd': 'BjaNWs29sUeNWyZ4',
            'host': '213.206.228.254',
            'port': 3306,
            'default_db': 'orderwriter',
        },
        'aws_au': {
            'user': '******',
            'passwd': 'Nyon6966',
            'host': 'aurora.ck67ii8uyuzl.eu-west-1.rds.amazonaws.com',
            'port': 3306,
        },
        'aws_ow': {
            'user': '******',
            'passwd': 'Nyon6966',
            'host': 'orderwriter.crii9g2k1ak3.eu-central-1.rds.amazonaws.com',
            'port': 3306,
        },
    }

    if predefined_config not in presets:
        msg = 'Predefined config "{0}" not recognized.'.format(predefined_config)
        raise AssertionError(msg)

    config = presets[predefined_config]
    config['config'] = predefined_config
    return config
def create_table_from_create(self, create_table_query, force_create=False, temporary=False):
    """Create a table from a create query.

    Execute the create_table_query to create a table. The name of the
    Table to create is taken from the query by
    `self._table_name_from_create_query`.

    Parameters
    ----------
    create_table_query: string
        Valid SQL query to create a Table.
    force_create: bool
        Flag indicating whether `table_out` shall be overwritten if it already exists.
        If it shall not be overwritten and does exist yet, nothing happens.
    temporary: bool
        Flag indicating whether `table_out` will be a temporary Table.
        For a temporary Table the query is always executed when
        force_create is False.

    Returns
    -------
    :rtype: bool
        Flag indicating whether `table_out` is indeed written to the DB.
        If False, the table already existed and nothing had happened.

    Raises
    ------
    pymysql.err.InternalError
        Re-raised after printing the failing query.

    Prerequisites
    -------------
    Connection to a database must have been made.

    Example
    -------
    Syntax of create_table_query shall look like:
        CREATE TABLE `name` AS ( <select query> );
    or:
        CREATE TABLE `name` (
            `col1` int(11) NOT NULL,
            `col2` date DEFAULT NULL,
        ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
    or:
        CREATE TABLE `name` LIKE `ref_table`;
    """
    Assert.py_type(create_table_query, str, 'create_table_query')
    Assert.py_type(force_create, bool, 'force_create')
    assert self.current_database() is not None, \
        'Connection to a database shall have been made.'

    table_name = self._table_name_from_create_query(create_table_query, temporary)

    if self.table_exists(table_name):
        if force_create:
            self.drop_table(table_name)
        elif not temporary:
            # Documented contract: leave an existing table alone and report
            # that nothing was created, instead of failing on the CREATE.
            return False

    try:
        self.execute(create_table_query)
    except pymysql.err.InternalError:
        # Show the offending query before handing the error to the caller.
        print('Last query:\n{0}'.format(create_table_query))
        raise

    if self.info:
        print ('Table {} created'.format(table_name))
    return True
def fetch_as_np_matrix(self, select_query, glob2loc_x, glob2loc_y):
    """Execute a select query and return the outcome as an np matrix.

    Parameters
    ----------
    select_query: string
        Valid SQL query that returns three columns from a Table.
        Example: SELECT person_id, article_number, SUM(purchase_weight)
                 FROM orders GROUP BY person_id, article_number
    glob2loc_x: dict
        Keys: All possible global values for the first column
        Values: Row index for that global value, in the range 0,...,N-1
                where N is the number of items in the dict
    glob2loc_y: dict
        Keys: All possible global values for the second column
        Values: Column index for that global value, in the range 0,...,N-1
                where N is the number of items in the dict

    Returns
    -------
    result: np matrix
        Array of zeros with len(glob2loc_x) rows and len(glob2loc_y)
        columns, in which for every fetched record (x, y, val) the element
        [glob2loc_x[x], glob2loc_y[y]] is set to val.

    Raises
    ------
    AssertionError
        If one of the prerequisites below is not met.

    Prerequisites
    -------------
    Argument select_query shall select exactly three columns
    Argument glob2loc_x keys shall contain all encountered values in the first column
    Argument glob2loc_y keys shall contain all encountered values in the second column
    Argument glob2loc_x values shall be smaller than the number of items in glob2loc_x
    Argument glob2loc_y values shall be smaller than the number of items in glob2loc_y
    The third column in the select query shall be a number
    """
    Assert.py_type(glob2loc_x, dict, 'glob2loc_x')
    Assert.py_type(glob2loc_y, dict, 'glob2loc_y')

    # An empty mapping has no largest index; -1 keeps the checks below valid.
    nr_rows = len(glob2loc_x)
    max_loc_x = max(glob2loc_x.values()) if glob2loc_x else -1
    assert max_loc_x < nr_rows, 'glob2loc_x values shall be a continuous range from 0,...,N-1'

    nr_cols = len(glob2loc_y)
    max_loc_y = max(glob2loc_y.values()) if glob2loc_y else -1
    assert max_loc_y < nr_cols, 'glob2loc_y values shall be a continuous range from 0,...,N-1'

    self.execute(select_query)
    data = self.fetchall()

    result = np.zeros(shape=[nr_rows, nr_cols])
    for (x, y, val) in data:
        try:
            row = glob2loc_x[x]
        except KeyError:
            msg = 'Global value {0} not found in glob2loc_x'.format(x)
            raise AssertionError(msg)
        try:
            col = glob2loc_y[y]
        except KeyError:
            # Only a missing key means "not found"; other errors propagate.
            msg = 'Global value {0} not found in glob2loc_y'.format(y)
            raise AssertionError(msg)
        try:
            result[row, col] = val
        except (TypeError, ValueError):
            # TypeError covers values such as None (SQL NULL).
            raise AssertionError('The third column in the select query shall be a number')
    return result
def insert_into_table(self, table_name, cols, values, update_on_duplicate=True, db_name=None):
    """Insert the values in the columns of the given Table.

    Parameters
    ----------
    table_name: string
        Name of the table to add the records to
    cols: (numpy) array of strings
        List of the columns to fill
    values: (numpy) array of tuples
        List of the values to fill.
    update_on_duplicate: boolean
        Determines what happens if a duplicate key is encountered
        If True: the record is updated (default)
        If False: the insertion is ignored
    db_name: string
        Name of the database in which the table is defined.
        If no database name is given, the current database is used.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If a field has a type that cannot be rendered as an SQL literal.

    Prerequisites
    -------------
    Connection to a database shall have been made
    Table `table_name` shall exists
    Table `table_name` shall contain all columns in `cols`

    Postconditions
    --------------
    If possible, all records are added to the Table

    Notes
    -----
    The insertion is split up in statements of at most 1000 records each.
    String values are placed between single quotes as they are; they are
    not escaped by this method.
    """
    Assert.py_type(table_name, str, 'table_name')
    if db_name is not None:
        Assert.py_type(db_name, str, 'db_name')
        table_name = db_name + '.' + table_name
    Assert.nonemptylist(cols)
    for col in cols:
        Assert.py_type(col, str, 'Column {0}'.format(col))
    nr_cols = len(cols)
    Assert.nonemptylist(values)
    for val in values:
        Assert.nonemptylist(val, nr_cols)
    assert self.current_database() is not None, \
        'Connection to a database must have been made.'

    def _sql_literal(field):
        """Render a single Python value as an SQL literal."""
        if field is None:
            return "NULL"
        if isinstance(field, str):
            if field.upper() == "NULL":
                return "NULL"
            # NOTE(review): the string is quoted but not escaped; a value
            # containing a single quote breaks the statement.
            return "'{0}'".format(field)
        if isinstance(field, (date, time)):
            return "'{0}'".format(field)
        if isinstance(field, Decimal):
            # TODO: Note! Decimal is converted here to float!
            # In principle, this should not matter, if you insert it
            # in the database again, the desired precision is maintained.
            return "{}".format(float(field))
        unrecognized = 'Type {0} not recognized'.format(type(field))
        try:
            is_nan = bool(np.isnan(field))
        except (TypeError, ValueError):
            # np.isnan cannot judge this value: report it as an unsupported
            # type instead of leaking the numpy error.
            raise AssertionError(unrecognized)
        if is_nan:
            return "NULL"
        if np.isscalar(field):
            return "{0}".format(field)
        raise AssertionError(unrecognized)

    # The parts of the statement that are the same for every chunk
    ignore = '' if update_on_duplicate else 'IGNORE '
    head = "INSERT {0}INTO {1}\n".format(ignore, table_name)
    head += "(" + ", ".join(cols) + ")\n"
    head += "VALUES\n"
    if update_on_duplicate:
        tail = "ON DUPLICATE KEY UPDATE\n"
        tail += ",\n".join("{0}=VALUES({0})".format(col) for col in cols)
    else:
        tail = ""

    # Split up the values in chunks of 1,000 by plain slicing. This keeps
    # every field in its original Python type; np.array_split converted the
    # rows to one homogeneous array, which turned numbers and NaN into
    # strings as soon as a row also contained a string.
    max_inserts = 1000
    for start in range(0, len(values), max_inserts):
        chunk = values[start:start + max_inserts]
        rows = ",\n".join(
            "(" + ", ".join(_sql_literal(field) for field in val) + ")"
            for val in chunk)

        # Execute the query
        self.execute(head + rows + "\n" + tail)