Example #1
0
def filter_none_from_frequency_count(frequency_count):
    """
    Drop every row whose property is None from a frequency count.

    A frequency count is a list of 2-element lists or tuples: element 0 is
    the property and element 1 is its count. Only the identity test
    ``is None`` is applied, so a property such as the string 'None' is kept.

    Parameters
    ----------
    frequency_count: list of lists or tuples with length 2
        Checked with Assert.frequency_count before filtering.

    Returns
    -------
    new_frequency_count: list of lists or tuples with length 2
        A new list holding the retained rows in their original order. The
        input list itself is not modified.

    Example
    -------
    >> a = [ [None, 4], [1, 2], [2, 3], [10, 1] ]
    >> UtilsContainer.filter_none_from_frequency_count(a)
    [ [1, 2], [2, 3], [10, 1] ]
    >> a = [ [None, 4], ['None', 2], ['c', 3], ['d', 1] ]
    >> UtilsContainer.filter_none_from_frequency_count(a)
    [ ['None', 2], ['c', 3], ['d', 1] ]
    """

    Assert.frequency_count(frequency_count)

    kept_rows = []
    for row in frequency_count:
        if row[0] is None:
            continue
        kept_rows.append(row)
    return kept_rows
    def create_database(self, db_name, force_create=False):
        """Create an empty database.

        Parameters
        ----------
        db_name: string
            Name of the to be created database
        force_create: bool
            Flag indicating if a possible existing database with the same
            name shall be overwritten or not. If force_create = False and
            there is already a database with the name db_name, nothing happens.

        Returns
        -------
        None

        Postconditions
        --------------
        Database with the given name exists.
        If force_create=True: database with the given name is empty.

        Notes
        -----
        The info message is printed whenever the statement succeeds, also when
        the database was already present and therefore left untouched.
        """

        Assert.py_type(db_name, str, 'db_name')
        Assert.py_type(force_create, bool, 'force_create')

        if force_create:
            self.drop_database(db_name, confirm=True)
        # IF NOT EXISTS turns the statement into a no-op for an existing
        # database, which is the documented behaviour for force_create=False.
        # Without it MySQL rejects the statement with an error.
        create = "CREATE DATABASE IF NOT EXISTS {0} DEFAULT CHARACTER SET 'utf8'".format(db_name)
        self.cursor.execute(create)
        if self.info:
            print ('Database {0} created'.format(db_name))
    def index_column(self, table, column):
        """Create an ascending index on one column of a Table, unless present.

        The index is named `<column>_idx`. When `_index_exists` already reports
        an index for the table/column pair, no statement is executed.

        Parameters
        ----------
        table: string
            Name of the Table to index.
        column: string
            Name of the column to index.

        Returns
        -------
        None

        Prerequisites
        -------------
        Table exists (in current database if no database is specified); this
        is asserted.
        Column exists in Table; this is not checked here.
        """

        Assert.py_type(table, str, 'table')
        Assert.py_type(column, str, 'column')
        assert self.table_exists(table)

        if not self._index_exists(table, column):
            add_index_query = '''
                ALTER TABLE {table}
                ADD INDEX `{col}_idx` (`{col}` ASC)
            '''.format(table=table, col=column)
            self.execute(add_index_query)
Example #4
0
def filter_array(lst):
    """
    Remove NaN entries from a list or np.array.

    np.isnan decides which entries are NaN. When np.isnan raises a TypeError
    for the input (for example an array of strings), nothing is treated as
    NaN and the content is returned unaltered.

    Returns an object of the same type as the input (list or np.ndarray).
    Any other input type ends in an AssertionError.

    Example
    -------
    >> L = [1, 2, 3, np.nan]
    >> UtilsContainer.filter_array(L)
    [1, 2, 3]
    >> L = np.array(['a', 'b', 'c', np.nan])
    >> UtilsContainer.filter_array(L)
    ['a' 'b' 'c' 'nan']
    """

    Assert.seq(lst)

    try:
        nan_mask = np.isnan(lst)
    except TypeError:
        # np.isnan is not defined for this element type: keep everything.
        nan_mask = np.zeros_like(lst, dtype=bool)
    keep_mask = np.logical_not(nan_mask)

    if isinstance(lst, list):
        result = [item for item, keep in zip(lst, keep_mask) if keep]
    elif isinstance(lst, np.ndarray):
        result = lst[keep_mask]
    else:
        raise AssertionError(
            'Input shall be either list or numpy array, is now a {}'.format(type(lst)))

    assert type(result) == type(lst)
    return result
    def column_exists(self, table, column):
        """Tell whether a Table has a column with the given name.

        Parameters
        ----------
        table: string
            Name of the table to look in.
        column: string
            Name of the column to look for.

        Returns
        -------
        column_exists: bool
            True when `column` is among the result of `show_columns(table)`.

        Prerequisites
        -------------
        The Table `table` shall exist (asserted).
        """

        Assert.py_type(table, str, 'table')
        Assert.py_type(column, str, 'column')
        table_missing_msg = 'The Table `{0}` shall exist.'.format(table)
        assert self.table_exists(table), table_missing_msg

        return column in self.show_columns(table)
    def checksum_table(self, table):
        """Compute the checksum of a Table with CHECKSUM TABLE.

        Parameters
        ----------
        table: string
            Name of the Table to calculate the checksum of.

        Returns
        -------
        checksum: long int
            Second field of the row returned by the CHECKSUM TABLE statement.

        Prerequisites
        -------------
        Table shall exist (in current database if no database is specified).

        Notes
        -----
        During the checksum operation, the table is locked with a read lock
        for InnoDB and MyISAM.
        """

        Assert.py_type(table, str, 'table')
        assert self.table_exists(table), \
            'Table {0} shall exist (in current database if no database is specified).'.format(table)

        checksum_query = '''
            CHECKSUM TABLE {0}
        '''.format(table)
        self.execute(checksum_query)
        return self.fetchone()[1]
    def truncate_table(self, table, disable_foreign_key_checks=False):
        """Truncate a table from the database

        The existence of the table is verified with Assert.table_exists
        before anything is executed.

        Parameters
        ----------
        table: string
            Name of the table to truncate.
        disable_foreign_key_checks: boolean
            When True, FOREIGN_KEY_CHECKS is set to 0 before the TRUNCATE and
            set back to 1 afterwards, also when the TRUNCATE itself fails.

        Returns
        -------
        None
        """

        Assert.py_type(table, str, 'table')
        Assert.table_exists(table, self)

        truncate = 'TRUNCATE {}'.format(table)

        if not disable_foreign_key_checks:
            self.execute(truncate)
            return

        # Temporarily disable foreign key checks
        self.execute('''SET FOREIGN_KEY_CHECKS = 0''')
        try:
            self.execute(truncate)
        finally:
            # Reenable foreign key checks even if the TRUNCATE raised, so a
            # failure cannot leave the session with the checks switched off.
            self.execute('''SET FOREIGN_KEY_CHECKS = 1''')
    def __init__(self, config=None, database=None, info_mode=True, debug_mode=False):
        """Open a MySQL connection and optionally select a database.

        Parameters
        ----------
        config: dict or None
            Connection settings; the keys 'user', 'passwd', 'host' and 'port'
            are required (asserted). An optional 'default_db' names the
            database to select when `database` is not given. When None, a
            built-in fallback configuration is used.
        database: string or None
            Database to connect to; takes precedence over config['default_db'].
        info_mode: bool
            Stored as `self.info`.
        debug_mode: bool
            Stored as `self.debug`.
        """
        # NOTE(review): the fallback below hard-codes a host and password in
        # source control; consider loading it from outside the code base.
        if config is None:
            config = {
                'user': '******',
                'passwd': 'Nyon6966',
                'host': '83.161.215.203',
                'port': 8889
            }
        Assert.py_type(config, dict, 'MySQL configuration')
        required_keys = ('user', 'passwd', 'host', 'port')
        for key in required_keys:
            assert key in config
        self.config = config

        # Only the four connection keys are handed to the connect helper.
        self.cnx = self._connect_to_mysql({key: config[key] for key in required_keys})
        self.cursor = self.cnx.cursor()
        self.debug = debug_mode
        self.info = info_mode

        # An explicit `database` argument wins over the config's 'default_db'.
        selected_db = database if database is not None else config.get('default_db')
        if selected_db is not None:
            self.connect_to_database(selected_db)
        self.cnx.autocommit(True)
Example #9
0
def frequency_count2occurrence_list(frequency_count):
    """Transform a frequency count into an occurrence list.

    A frequency count is a list of tuples or a list of lists, in which the first
    element is the property, and the second element is the count. An occurrence
    list is a list with each property <count> times in it. Rows whose property
    is None are skipped.

    Parameters
    ----------
    frequency_count: list of lists or tuples with length 2

    Returns
    -------
    occurrence_list: np.array
        The dtype is the one np.array infers for the collected properties
        (e.g. an integer dtype for int keys, a unicode dtype wide enough for
        the longest string key). When no row remains, an empty float array
        is returned.

    Example
    -------
    >> a = [ [0, 4], [1, 2], [2, 3], [10, 1] ]
    >> UtilsContainer.frequency_count2occurrence_list(a)
    [ 0  0  0  0  1  1  2  2  2 10]
    >> a = [ ['a', 4], ['b', 2], ['c', 3], ['d', 1] ]
    >> UtilsContainer.frequency_count2occurrence_list(a)
    ['a' 'a' 'a' 'a' 'b' 'b' 'c' 'c' 'c' 'd']
    """

    Assert.frequency_count(frequency_count)

    rows = [(k, v) for k, v in frequency_count if k is not None]
    if not rows:
        return np.empty(0)

    keys, counts = zip(*rows)
    # Build the key array in one go so numpy picks a dtype that fits all keys.
    # Allocating per key with dtype=str gave one-character strings (longer
    # keys were truncated), and seeding with a float array turned integer
    # keys into floats.
    return np.repeat(np.array(keys), counts)
Example #10
0
def increment_smallest(lst, dx):
    """Add dx, in place, to the smallest element of the np.ndarray lst.

    The position comes from np.argmin, so when the minimum occurs multiple
    times the first occurrence is the one that is incremented. The array is
    modified in place and nothing is returned.
    """

    Assert.py_type(lst, np.ndarray, 'L')

    lst[np.argmin(lst)] += dx
    def sneak_preview(self, table):
        """Print up to ten rows of the Table to the console.

        The rows are selected with `SELECT * ... LIMIT 10` and the fetched
        result is printed as-is. The Table must exist (asserted).
        """

        Assert.py_type(table, str, 'Input table')
        missing_msg = 'Table {0} shall exist in the database'.format(table)
        assert self.table_exists(table), missing_msg

        preview_query = '''
            SELECT * 
            FROM {0}
            LIMIT 10
        '''.format(table)
        self.execute(preview_query)
        rows = self.fetchall()
        print(rows)
Example #12
0
def list2dict(lst):
    """
    Convert a list to a dictionary, where the key of each entry are the list
    elements, and the values are indices 0..N-1, where the ordering is the
    ordering used in the list.

    The elements must be hashable and unique: a duplicate element would
    collapse into a single key, which is reported through an AssertionError.
    """

    Assert.seq(lst)

    result = {element: index for index, element in enumerate(lst)}

    # The counts are passed in the order the message names them: list first,
    # dictionary second.
    msg = 'All elements of input list ({}) shall be in dictionary ({})'.format(len(lst), len(result))
    assert len(result) == len(lst), msg
    return result
    def drop_table(self, table):
        """Remove a table from the database with DROP TABLE IF EXISTS.

        Because of the IF EXISTS clause, nothing happens when the table is
        not present.

        Parameters
        ----------
        table: string
            Name of the table to drop.

        Returns
        -------
        None
        """

        Assert.py_type(table, str, 'table')

        self.execute('DROP TABLE IF EXISTS {0}'.format(table))
    def table_exists(self, table, db_name=None):
        """Return whether the given Table exists in a database.

        The lookup counts matching rows in INFORMATION_SCHEMA.TABLES. Debug
        output is switched off for the duration of the whole lookup and the
        previous debug setting is restored afterwards, also when an assertion
        or the query fails.

        Parameters
        ----------
        table: string
            Name of the table to look up.
        db_name: string or None
            Name of the database (schema) to look in. When None, the current
            database is used.

        Returns
        -------
        table_exists: bool
            Flag indicating whether the Table `table` exists in that database.

        Prerequisites
        -------------
        Connection to a database must have been made.
        """

        # TODO: This pattern with temporarily disabling debug information happens more frequently.
        # Is there a way to write this in some sort of with statement?

        query_template = '''
            SELECT COUNT(`TABLE_NAME`)
            FROM `INFORMATION_SCHEMA`.TABLES 
            WHERE `TABLE_SCHEMA` = '{}' 
            AND `TABLE_NAME` = '{}'
        '''

        # Temporarily disable debug information
        saved_debug_mode = self.debug
        self.debug = False
        try:
            Assert.py_type(table, str, 'table')
            current_db = self.current_database()
            assert current_db is not None, 'Connection to a database shall have been made.'

            if db_name is None:
                db_name = current_db

            # The query runs while debug is still off. Previously debug was
            # switched back on just before the query was executed.
            return bool(self.fetch_as_value(query_template.format(db_name, table)))
        finally:
            # Reenable debug information, whatever happened above
            self.debug = saved_debug_mode
    def create_table_like(self, table_ref, table_out, force_create=False):
        """Create an empty Table whose definition mimics another Table.

        The create statement of `table_ref` is fetched with `_show_create`,
        the text `table_ref` in it is replaced by `table_out`, and the result
        is handed to `create_table_from_create`.

        Parameters
        ----------
        table_ref: string
            Name of the reference Table to mimic. May be prefixed with a
            database name and a dot ("<database>.<table>").
        table_out: string
            Name of the Table to create.
        force_create: bool
            Passed on to `create_table_from_create`; flag indicating whether
            `table_out` shall be overwritten if it already exists.

        Returns
        -------
        table_created: bool
            The value returned by `create_table_from_create`.

        Prerequisites
        -------------
        Connection to a database must have been made.
        Table 'table_ref' exists (in current database if no database is specified).
        """

        Assert.py_type(table_ref, str, 'table_ref')
        Assert.py_type(table_out, str, 'table_out')
        Assert.py_type(force_create, bool, 'force_create')
        assert self.current_database() is not None, \
            'Connection to a database must have been made.'

        # Split an optional "<database>." prefix off the reference name.
        db_name, dot, table_name = table_ref.partition('.')
        if not dot:
            db_name, table_name = None, table_ref
        assert self.table_exists(table_name, db_name), \
            'Table `table_ref` shall exist (in current database if no database is specified).'

        # NOTE(review): str.replace substitutes every occurrence of the text
        # `table_ref` in the statement, so column or constraint names that
        # contain the table name are renamed as well. Confirm this is wanted.
        create_table_query = self._show_create(table_ref).replace(table_ref, table_out)
        return self.create_table_from_create(create_table_query, force_create)
    def create_table_from_select(self, table_out, select_query, force_create=False):
        """Create a Table holding the result of a select query.

        The select query is wrapped as `CREATE TABLE <table_out> AS (<query>)`
        and handed to `create_table_from_create`.

        Parameters
        ----------
        table_out: string
            Name of the Table to create.
        select_query: string
            Valid SQL query that returns a Table. Example:
            SELECT <fields> FROM <table> WHERE <condition>
        force_create: bool
            Passed on to `create_table_from_create`; flag indicating whether
            `table_out` shall be overwritten if it already exists.

        Returns
        -------
        table_created: bool
            The value returned by `create_table_from_create`.

        Prerequisites
        -------------
        Connection to a database must have been made.

        Postconditions
        --------------
        Table 'table_out' exists in current database.
        """

        Assert.py_type(table_out, str, 'table_out')
        Assert.py_type(select_query, str, 'select_query')
        Assert.py_type(force_create, bool, 'force_create')
        connected = self.current_database() is not None
        assert connected, 'Connection to a database must have been made.'

        wrapped_query = 'CREATE TABLE {0} AS ({1})'.format(table_out, select_query)
        return self.create_table_from_create(wrapped_query, force_create)
    def get_config(predefined_config):
        """Retrieve a predefined config dictionary for DBManager

        Input:
        ------
        predefined_config: string
            Name of the predefined configuration settings. Recognized names:
            'localhost', 'puzzle', 'nyon_office', 'orderwriter_test',
            'orderwriter_live', 'aws_au' and 'aws_ow'. Any other name raises
            an AssertionError.

        Returns:
        --------
        config: dict
            Dictionary with DB connection configuration ('user', 'passwd',
            'host', 'port' and, for some presets, 'default_db'), extended with
            the key 'config' holding the requested preset name.
        """

        Assert.py_type(predefined_config, str, 'predefined config')

        # NOTE(review): hosts and passwords are hard-coded in this table;
        # consider loading them from outside the code base.
        presets = {
            'localhost': {
                'user': '******',
                'passwd': '',
                'host': '127.0.0.1',
                'port': 3306,
                'default_db': 'orderwriter'
            },
            'puzzle': {
                'user': '******',
                'passwd': '',
                'host': '127.0.0.1',
                'port': 3306,
                'default_db': 'puzzle_words'
            },
            'nyon_office': {
                'user': '******',
                'passwd': 'Nyon6966',
                'host': '83.161.215.203',
                'port': 8889
            },
            'orderwriter_test': {
                'user': '******',
                'passwd': 'AbLsLKnq6NJjd8a6',
                'host': '213.206.228.254',
                'port': 3306,
                'default_db': 'orderwriter_20150721',
            },
            'orderwriter_live': {
                'user': '******',
                'passwd': 'BjaNWs29sUeNWyZ4',
                'host': '213.206.228.254',
                'port': 3306,
                'default_db': 'orderwriter'
            },
            'aws_au': {
                'user': '******',
                'passwd': 'Nyon6966',
                'host': 'aurora.ck67ii8uyuzl.eu-west-1.rds.amazonaws.com',
                'port': 3306
            },
            'aws_ow': {
                'user': '******',
                'passwd': 'Nyon6966',
                'host': 'orderwriter.crii9g2k1ak3.eu-central-1.rds.amazonaws.com',
                'port': 3306
            },
        }

        if predefined_config not in presets:
            raise AssertionError(
                'Predefined config "{0}" not recognized.'.format(predefined_config))

        # The table is rebuilt on every call, so the caller gets its own dict.
        config = presets[predefined_config]
        config['config'] = predefined_config
        return config
    def create_table_from_create(self, create_table_query, force_create=False,
                                 temporary=False):
        """Create a table from a create query.

        Execute the create_table_query to create a table. The name of the
        Table to create is extracted from the query by
        `_table_name_from_create_query`.

        Parameters
        ----------
        create_table_query: string
            Valid SQL query to create a Table.
        force_create: bool
            Flag indicating whether `table_out` shall be overwritten if it
            already exists. If it shall not be overwritten and does exist yet,
            nothing happens.
        temporary: bool
            Flag indicating whether `table_out` will be a temporary Table.
            For temporary Tables the "already exists" shortcut is not applied;
            the create query is always executed.

        Returns
        -------
        :rtype: bool
            Flag indicating whether `table_out` is indeed written to the DB.
            If False, the table already existed and nothing had happened.

        Raises
        ------
        pymysql.err.InternalError
            Re-raised after printing the failing query.

        Prerequisites
        -------------
        Connection to a database must have been made.

        Postconditions
        --------------
        Table exists in current database.

        Example
        -------
        Syntax of create_table_query shall look like:
            CREATE TABLE `name` AS ( <select query> );
        or:
            CREATE TABLE `name` (
              `col1` int(11) NOT NULL,
              `col2` date DEFAULT NULL,
            ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
        or:
            CREATE TABLE `name` LIKE `ref_table`;
        """

        Assert.py_type(create_table_query, str, 'create_table_query')
        Assert.py_type(force_create, bool, 'force_create')
        assert self.current_database() is not None, \
            'Connection to a database shall have been made.'

        table_name = self._table_name_from_create_query(create_table_query, temporary)

        if force_create:
            if self.table_exists(table_name):
                self.drop_table(table_name)
        elif not temporary and self.table_exists(table_name):
            # Documented contract: an existing table is left alone and False
            # is returned, instead of letting the CREATE statement fail.
            return False

        try:
            self.execute(create_table_query)
        except pymysql.err.InternalError:
            print('Last query:\n{0}'.format(create_table_query))
            raise

        if self.info:
            print ('Table {} created'.format(table_name))
        return True
    def fetch_as_np_matrix(self, select_query, glob2loc_x, glob2loc_y):
        """Execute a select query and return the outcome as an np matrix.

        Parameters
        ----------
        select_query: string
            Valid SQL query that returns three columns from a Table. Example:
                SELECT person_id, article_number, SUM(purchase_weight)
                FROM orders
                GROUP BY person_id, article_number
        glob2loc_x: dict
            Keys: All possible global values for the first column
            Values: Row indices 0,...,N-1 where N is the number of items in the dict
        glob2loc_y: dict
            Keys: All possible global values for the second column
            Values: Column indices 0,...,N-1 where N is the number of items in the dict

        Returns
        -------
        result: np.ndarray of shape (len(glob2loc_x), len(glob2loc_y))
            Zero-initialised matrix in which, for every fetched row
            (x, y, val), the element at [glob2loc_x[x], glob2loc_y[y]] is set
            to val. A later row for the same position overwrites an earlier
            one.

        Raises
        ------
        AssertionError
            When one of the prerequisites below is violated.

        Prerequisites
        -------------
        Argument select_query shall select exactly three columns
        Argument glob2loc_x keys shall contain all encountered values in the first column
        Argument glob2loc_y keys shall contain all encountered values in the second column
        Argument glob2loc_x values shall be smaller than len(glob2loc_x)
        Argument glob2loc_y values shall be smaller than len(glob2loc_y)
        The third column in the select query shall be a number
        """

        Assert.py_type(glob2loc_x, dict, 'glob2loc_x')
        Assert.py_type(glob2loc_y, dict, 'glob2loc_y')
        max_loc_x = max(glob2loc_x.values())
        nr_rows = len(glob2loc_x)
        assert max_loc_x < nr_rows, 'glob2loc_x values shall be a continuous range from 0,...,N-1'
        max_loc_y = max(glob2loc_y.values())
        nr_cols = len(glob2loc_y)
        assert max_loc_y < nr_cols, 'glob2loc_y values shall be a continuous range from 0,...,N-1'

        self.execute(select_query)
        data = self.fetchall()
        result = np.zeros(shape=[nr_rows, nr_cols])
        for (x, y, val) in data:
            try:
                row = glob2loc_x[x]
            except KeyError:
                msg = 'Global value {0} not found in glob2loc_x'.format(x)
                raise AssertionError(msg)
            try:
                col = glob2loc_y[y]
            except KeyError:
                # Only a missing key means "not found"; a bare except would
                # also swallow unrelated errors such as KeyboardInterrupt.
                msg = 'Global value {0} not found in glob2loc_y'.format(y)
                raise AssertionError(msg)
            try:
                result[row, col] = val
            except (TypeError, ValueError):
                # A NULL (None) value raises TypeError, a non-numeric string
                # raises ValueError; both violate the same prerequisite.
                raise AssertionError('The third column in the select query shall be a number')

        return result
    def insert_into_table(self, table_name, cols, values, update_on_duplicate=True, db_name=None):
        """Insert the values in the columns of the given Table.

        Parameters
        ----------
        table_name: string
            Name of the table to add the records to
        cols: (numpy) array of strings
            List of the columns to fill
        values: (numpy) array of tuples
            List of the values to fill. Every record shall have one field per
            column. None, the string "NULL" (any case) and NaN are written as
            SQL NULL; strings, dates and times are quoted; Decimals are
            written as floats; other scalars are written unquoted.
        update_on_duplicate: boolean
            Determines what happens if a duplicate key is encountered
            If True: the record is updated (default)
            If False: the insertion is ignored
        db_name: string
            Name of the database in which the table is defined. If no database name is given,
            the current database is used.

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            When a field has a type that cannot be written as an SQL literal.

        Prerequisites
        -------------
        Connection to a database shall have been made
        Table `table_name` shall exists
        Table `table_name` shall contain all columns in `cols`

        Postconditions
        --------------
        If possible, all records are added to the Table

        Notes
        -----
        The records are inserted in batches of at most 1000 records per
        INSERT statement.
        """
        Assert.py_type(table_name, str, 'table_name')
        if db_name is not None:
            Assert.py_type(db_name, str, 'db_name')
            table_name = db_name + '.' + table_name
        Assert.nonemptylist(cols)
        for col in cols:
            Assert.py_type(col, str, 'Column {0}'.format(col))
        Assert.nonemptylist(values)
        for val in values:
            Assert.nonemptylist(val, len(cols))
        assert self.current_database() is not None, \
            'Connection to a database must have been made.'

        def sql_literal(field):
            """Render one field as the text of an SQL literal."""
            if field is None:
                return "NULL"
            if isinstance(field, str):
                if field.upper() == "NULL":
                    return "NULL"
                # NOTE(review): the text is not escaped, so a value holding a
                # single quote breaks the statement. Confirm whether callers
                # pre-escape before adding escaping here.
                return "'{0}'".format(field)
            if isinstance(field, (date, time)):
                return "'{0}'".format(field)
            if isinstance(field, Decimal):
                # TODO: Note! Decimal is converted here to float!
                # In principle, this should not matter, if you insert it
                # in the database again, the desired precision is maintained.
                return "{}".format(float(field))
            try:
                is_nan = bool(np.isnan(field))
            except (TypeError, ValueError):
                # np.isnan cannot judge this type, so it is not NaN. It is
                # either an ordinary scalar or an unsupported type below.
                is_nan = False
            if is_nan:
                return "NULL"
            if np.isscalar(field):
                return "{0}".format(field)
            raise AssertionError('Type {0} not recognized'.format(type(field)))

        # The parts of the statement that are identical for every batch.
        ignore = '' if update_on_duplicate else 'IGNORE '
        header = "INSERT {0}INTO {1}\n".format(ignore, table_name)
        header += "(" + ", ".join(cols) + ")\n"
        header += "VALUES\n"
        footer = ""
        if update_on_duplicate:
            footer = "ON DUPLICATE KEY UPDATE\n"
            footer += ",\n".join("{0}=VALUES({0})".format(col) for col in cols)

        # Plain slicing keeps every field's own Python type. np.array_split
        # first converts the records to one numpy array, which turns the
        # fields of mixed-type records into strings (so numbers get quoted
        # and NaN becomes the text 'nan').
        max_inserts = 1000
        for start in range(0, len(values), max_inserts):
            batch = values[start:start + max_inserts]
            records = [
                "(" + ", ".join(sql_literal(field) for field in record) + ")"
                for record in batch
            ]
            self.execute(header + ",\n".join(records) + "\n" + footer)