Ejemplo n.º 1
0
def list_all_tables(db_uri=None):
    '''
    Print out a list of all tables in anon.db

    Parameters
    ----------
    db_uri : string or None
        SQLite URI to connect to. When None, the URI is built from
        package_dir("db", "anon.db") in read-write mode. Added so that
        we can test the function using SQLite in-memory DB.

    Returns
    -------
    None; prints out a simple list of table names to console
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        #single-quoted literal: a double-quoted "table" is an identifier in
        #SQL and only works through SQLite's legacy string-literal fallback
        c.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        #each fetched row is a 1-tuple holding the table name
        print([tbl[0] for tbl in c.fetchall()])
Ejemplo n.º 2
0
def number_of_table_columns(table_name, db_uri=None):
    '''
    Count the columns of a given table

    Parameters
    ----------
    table_name : str
        table in anon.db to query
    db_uri : str or None
        optional. SQLite URI to connect to; when None the URI is
        built from package_dir("db", "anon.db") in read-write mode

    Returns
    -------
    Count of columns
    '''

    target_uri = db_uri
    if target_uri is None:
        target_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    with closing(sqlite3.connect(target_uri, uri=True)) as connection:
        #the pragma yields one row per column, so counting rows counts columns
        cursor = connection.cursor()
        column_rows = cursor.execute(
            f"PRAGMA TABLE_INFO({table_name})").fetchall()

    return len(column_rows)
Ejemplo n.º 3
0
    def test_overlapping_hierarchical_and_predefined_linked_columns(self):
        '''
        A user defined list of linked columns wins over hierarchical linkage:
        the columns in the user defined list are left out of the discovery
        phase of the hierarchical linkage. hb_code has to be part of the user
        list too, otherwise it would end up linked to loc_name (correctly,
        but unexpectedly).
        '''

        expected_cols = ["hb_name", "hb_code", "age"]

        exhibit = tm.newExhibit(
            command="fromdata",
            source=Path(package_dir("sample", "_data", "inpatients.csv")),
            verbose=True,
            inline_limit=30,
            equal_weights=True,
            skip_columns=[],
            linked_columns=expected_cols)
        exhibit.read_data()
        exhibit.generate_spec()

        # remember the spec ID so its temp tables can be deleted afterwards
        self._temp_tables.append(exhibit.spec_dict["metadata"]["id"])

        linked = exhibit.spec_dict["linked_columns"]

        # a single linked group is expected and it holds the user's columns
        self.assertEqual(len(linked), 1)
        self.assertListEqual(linked[0][1], expected_cols)
Ejemplo n.º 4
0
def number_of_table_rows(table_name, column=None, db_uri=None):
    '''
    Count the rows of a table, or the distinct values of one of its columns

    Parameters
    ----------
    table_name : str
        table in anon.db to query; a "table.column" value is split
        into the table name and the column name
    column : str
        optional. column name in the given table; when given, distinct
        values of that column are counted instead of rows
    db_uri : str or None
        optional. SQLite URI to connect to; when None the URI is
        built from package_dir("db", "anon.db") in read-write mode

    Returns
    -------
    Count of rows
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #the dotted form overrides whatever was passed in as column
    if "." in table_name:
        table_name, column = table_name.split(".")

    counted = f"COUNT(DISTINCT {column})" if column else "COUNT()"
    query = f"SELECT {counted} FROM {table_name}"

    with closing(sqlite3.connect(db_uri, uri=True)) as connection:
        #an aggregate without GROUP BY yields exactly one row: (result, )
        first_row = connection.cursor().execute(query).fetchone()

    return first_row[0]
Ejemplo n.º 5
0
    def test_spec_generation_with_predefined_linked_columns(self):
        '''
        The user defined linked columns must be stored as the 0-th element
        of the linked columns list in the generated YAML specification.
        '''

        expected_cols = ["sex", "age"]

        exhibit = tm.newExhibit(
            command="fromdata",
            source=Path(package_dir("sample", "_data", "inpatients.csv")),
            verbose=True,
            inline_limit=30,
            equal_weights=True,
            skip_columns=[],
            linked_columns=expected_cols)
        exhibit.read_data()
        exhibit.generate_spec()

        # remember the spec ID so its temp tables can be deleted afterwards
        self._temp_tables.append(exhibit.spec_dict["metadata"]["id"])

        first_group = exhibit.spec_dict["linked_columns"][0]

        self.assertListEqual(first_group[1], expected_cols)
Ejemplo n.º 6
0
    def test_connection_to_sqlite(self):
        '''
        Connecting to anon.db through its read-write file URI
        yields a sqlite3.Connection object
        '''

        anon_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

        # closing() hands back the connection and closes it on exit
        with closing(sqlite3.connect(anon_uri, uri=True)) as connection:
            assert isinstance(connection, sqlite3.Connection)
Ejemplo n.º 7
0
    def test_read_data_func_reads_csv_from_source_path(self):
        '''
        Build an exhibit from "mock" command line arguments pointing
        at the sample inpatients.csv and check that read_data leaves
        a Pandas DataFrame in its df attribute.
        '''

        exhibit = tm.newExhibit(
            command="fromdata",
            source=Path(package_dir("sample", "_data", "inpatients.csv")),
            verbose=True,
            skip_columns=[])
        exhibit.read_data()

        assert isinstance(exhibit.df, pd.DataFrame)
Ejemplo n.º 8
0
def insert_table(file_path, table_name=None, db_uri=None):
    '''
    Parse a .csv file and insert it into anon.db under its stem name

    Parameters
    ----------
    file_path : string
        Any format that Pandas can read is potentially suitable, but
        only .csv is currently implemented
    table_name : string
        Optional parameter if you don't want to use filename's stem
        part as the table name
    db_uri : string or None
        Optional. SQLite URI to connect to; when None the URI is
        built from package_dir("db", "anon.db") in read-write mode

    Returns
    -------
    No return; prints out confirmation if insertion is successful.
    An existing table with the same name is replaced.

    Raises
    ------
    FileNotFoundError
        if path_checker returns a falsy value for file_path
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #validate once and reuse the result for the stem lookup
    checked_path = path_checker(file_path)

    #fail early with a clear message instead of reaching to_sql
    #with an unbound dataframe
    if not checked_path:
        raise FileNotFoundError(f"{file_path} is not a valid file path")

    if table_name is None:
        table_name = checked_path.stem

    #when creating a .csv from piping it from console on Windows,
    #encoding is changed from UTF-8 to ANSI
    #NOTE(review): the "ANSI" codec name looks Windows-only - confirm
    #whether the fallback is expected to work on other platforms
    try:
        table_df = pd.read_csv(file_path)
    except UnicodeDecodeError:
        table_df = pd.read_csv(file_path, encoding="ANSI")

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):

        table_df.to_sql(
            name=table_name,
            con=conn,
            if_exists="replace",
            index=False,
        )

        print(f"Successfully inserted a new table {table_name}")
Ejemplo n.º 9
0
    def test_less_than_two_predefined_linked_columns_raiser_error(self):
        '''
        Linking only makes sense for two or more columns, so building
        an exhibit with a single user defined linked column must raise
        '''

        lone_column = ["hb_name"]

        exhibit_kwargs = {
            "command": "fromdata",
            "source": Path(package_dir("sample", "_data", "inpatients.csv")),
            "verbose": True,
            "inline_limit": 30,
            "equal_weights": True,
            "skip_columns": [],
            "linked_columns": lone_column,
        }

        with self.assertRaises(Exception):
            tm.newExhibit(**exhibit_kwargs)
Ejemplo n.º 10
0
def table_info(table_name, db_uri=None):
    '''
    Print out basic information about a given table

    Parameters
    ----------
    table_name : string
        the name of a single table in the database
    db_uri : string or None
        SQLite URI to connect to; when None the URI is built from
        package_dir("db", "anon.db") in read-write mode. Added so
        that we can test the function using SQLite in-memory DB.

    Returns
    -------
    None. Prints out the headers + all rows in the table. Values are
    comma separated to allow piping directly into a new
    .csv file. If the table doesn't exist, prints a "not in schema"
    message instead.
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        #single-quoted literal: a double-quoted "table" is an identifier in
        #SQL and only works through SQLite's legacy string-literal fallback
        c.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        known_tables = {tbl[0] for tbl in c.fetchall()}

        if table_name not in known_tables:
            print(f"{table_name} not in schema")
            return

        #quote the identifier so that existing tables named with reserved
        #words, spaces or other special characters can still be queried
        quoted = '"' + table_name.replace('"', '""') + '"'

        c.execute(f"SELECT * FROM {quoted}")
        result = c.fetchall()
        #second field of every table_info row is the column name
        c.execute(f"PRAGMA table_info({quoted})")
        headers = ",".join([x[1] for x in c.fetchall()])

        print(headers)
        print(*[",".join([str(y) for y in x]) for x in result], sep="\n")
Ejemplo n.º 11
0
def drop_tables(table_names, db_uri=None):
    '''
    Drop named table(s) from anon.db

    Parameters
    ----------
    table_names : list of table names or regex strings
        a single string is also accepted and treated as a one-item list.
        Every existing table whose name matches any of the patterns
        (re.search semantics) is dropped.
    db_uri : string or None
        optional. SQLite URI to connect to; when None the URI is
        built from package_dir("db", "anon.db") in read-write mode

    Returns
    -------
    None. Prints outcome (if successful) to console, one line per table

    Note that in CLI, multiple table names must be separated with a space
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    conn = sqlite3.connect(db_uri, uri=True)

    if not isinstance(table_names, list):
        table_names = [table_names]

    with closing(conn):
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        remaining_tables = [tbl[0] for tbl in c.fetchall()]
        dropped_any = False

        for table_name in table_names:

            #snapshot the matches first because remaining_tables is
            #modified while the tables are being dropped
            matches = [tbl for tbl in remaining_tables
                       if re.search(table_name, tbl)]

            for source_table in matches:

                #quoted so that names with special characters can be dropped
                quoted = '"' + source_table.replace('"', '""') + '"'
                c.execute(f"DROP TABLE {quoted}")

                #forget the table so that a later, overlapping pattern
                #doesn't try to drop it a second time
                remaining_tables.remove(source_table)
                dropped_any = True

                print(f"Successfully deleted table {source_table}")

        #reclaim the space once, after all the drops, rather than per table
        if dropped_any:
            conn.execute("VACUUM")
            conn.commit()
Ejemplo n.º 12
0
def purge_temp_tables(db_uri=None):
    '''
    Delete all tables with "temp_" prefix from anon.db

    Parameters
    ----------
    db_uri : string or None
        SQLite URI to connect to; when None the URI is built from
        package_dir("db", "anon.db") in read-write mode. Added so
        that we can test the function using SQLite in-memory DB.

    Returns
    -------
    None. Prints out confirmation with the number of tables dropped
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        table_names = [tbl[0] for tbl in c.fetchall()]

        count = 0

        for table_name in table_names:
            #match the documented prefix only; a plain substring test would
            #also drop unrelated tables that merely contain "temp"
            if table_name.startswith("temp_"):
                quoted = '"' + table_name.replace('"', '""') + '"'
                c.execute(f"DROP TABLE {quoted}")
                count += 1

        #reclaim the space freed by the dropped tables
        conn.execute("VACUUM")
        conn.commit()

    print(f"Successfully deleted {count} tables")
Ejemplo n.º 13
0
    def test_output_spec_respectes_equal_weights_argument(self):
        '''
        With equal_weights=True, the generated spec line for the 10-19
        value of the age column carries 0.100 in each numeric field.
        '''

        exhibit = tm.newExhibit(
            command="fromdata",
            source=Path(package_dir("sample", "_data", "inpatients.csv")),
            verbose=True,
            inline_limit=30,
            equal_weights=True,
            skip_columns=[])
        exhibit.read_data()
        exhibit.generate_spec()

        age_values = exhibit.spec_dict["columns"]["age"]["original_values"]
        actual_line = age_values[2]
        expected_line = (
            "10-19        | 0.100              | 0.100 | 0.100 | 0.100")

        # remember the spec ID so its temp tables can be deleted afterwards
        self._temp_tables.append(exhibit.spec_dict["metadata"]["id"])

        self.assertEqual(expected_line, actual_line)
Ejemplo n.º 14
0
'''
Module referencing sample data for export

inpatients.csv is sourced from ISD Scotland Open Data page:
http://www.isdscotland.org/Health-Topics/Hospital-Care/Inpatient-and-Day-Case-Activity/

prescribing.csv is sourced from NHS Scotland Open Data page:
https://www.opendata.nhs.scot/dataset/prescriptions-in-the-community
'''

# External imports
import pandas as pd
import yaml

# Exhibit imports
from exhibit.core.utils import package_dir

#Load data
#Everything below runs at import time; file locations are resolved
#through package_dir
inpatients_data = pd.read_csv(package_dir("sample", "_data", "inpatients.csv"))
#quarter_date is parsed as dates while reading
inpatients_anon = pd.read_csv(package_dir("sample", "_data",
                                          "inpatients_anon.csv"),
                              parse_dates=["quarter_date"])

#PaidDateMonth is parsed as dates while reading
prescribing_data = pd.read_csv(package_dir("sample", "_data",
                                           "prescribing.csv"),
                               parse_dates=["PaidDateMonth"])

#Load specs
#The demo specification is parsed with yaml.safe_load; the file is opened
#with the platform's default text encoding
with open(package_dir("sample", "_spec", "inpatients_demo.yml")) as f:
    inpatients_spec = yaml.safe_load(f)
Ejemplo n.º 15
0
def create_temp_table(table_name,
                      col_names,
                      data,
                      strip_whitespace=True,
                      db_uri=None,
                      return_table=False):
    '''
    Create a lookup table in the anon.db SQLite3 database

    Parameters
    ----------
    table_name : str
        make sure there are no spaces in the table_name as they are not allowed
    col_names: list or any other iterable
        column names also can't contain spaces
    data: list of tuples
        each tuple containing a row's worth of data
    strip_whitespace : bool
        if the table is for user defined linked column, don't try to strip
        whitespace. When True, rows containing a NaN value are dropped and
        every remaining value is stripped.
    db_uri : str
        optional. During testing can pass an in-memory uri
    return_table : bool
        optional. Sometimes useful to return all values from the newly created table

    Returns
    -------
    True on success, or all records of the new table (list of tuples)
    if return_table is set to True.

    Occasionally it's useful to create a temporary table
    for linked columns that user doesn't want to anonymise,
    like Specialty and Specialty Group. To ensure that each
    Specialty has the correct Specialty Group, we can store this
    information in a temporary table in the anon.db

    The "1" in the "1-to-many" should always be the first column.

    Make sure you add "temp_" prefix to your table if you
    want it to be discovered by the automatic clean-up.

    Normally, you'd pass a list as col_names and data
    would be a list of tuples with length equal to the
    number of columns.

    An existing table with the same name is dropped and re-created.
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #make sure data is stripped from extra whitespace to match the spec
    #as an extra precaution we're dropping any rows with NaN values in them.
    #NaN is the only float that is not equal to itself; testing the values
    #directly avoids the np.NaN alias (removed in NumPy 2.0) and also catches
    #NaN objects other than the np.nan singleton
    if strip_whitespace:
        data = [
            tuple(y.strip() for y in x)
            for x in data
            if not any(isinstance(y, float) and y != y for y in x)
        ]

    if len(col_names) == 1:
        col_list = col_names[0]
    else:
        col_list = ", ".join(col_names)

    #one placeholder per column for the parameterised insert
    params = ", ".join(["?" for _ in col_names])

    drop_sql = f"DROP TABLE IF EXISTS {table_name}"
    create_sql = f"CREATE TABLE {table_name} ({col_list})"
    insert_sql = f"INSERT INTO {table_name} VALUES ({params})"

    conn = sqlite3.connect(db_uri, uri=True)

    with closing(conn):
        c = conn.cursor()
        c.execute(drop_sql)
        c.execute(create_sql)
        c.executemany(insert_sql, data)
        conn.commit()

        if return_table:
            c.execute(f"SELECT * FROM {table_name}")
            return c.fetchall()
    return True
Ejemplo n.º 16
0
def query_anon_database(table_name,
                        column=None,
                        size=None,
                        order="rowid",
                        db_uri=None,
                        exclude_missing=False):
    '''
    Run a SELECT DISTINCT against a table in anon.db and return a dataframe

    Parameters
    ----------
    table_name : str
        table_name comes in a fixed format with temp_ prefix followed
        by the spec id and then either the linked group number or the
        column name in case of non-linked, many-valued columns
    column : str
        optional. Single column to be extracted from the given table;
        a list is also accepted, in which case its first item is used.
        All columns are selected when omitted or empty.
    size : int
        optional. The parameter to go into LIMIT statement
    order : str
        optional. The column to order the results by; defaults to rowid
    db_uri : str
        optional. For testing.
    exclude_missing : bool
        optional. Set to True to exclude the missing data placeholder
        value from the column, if SQL is for the single column only

    Returns
    -------
    A dataframe with original column names ("$" in the stored
    column names is replaced with a space)
    '''

    if db_uri is None:
        db_uri = "file:" + package_dir("db", "anon.db") + "?mode=rw"

    #column can come in as a string or as an empty list or as ["string"]
    if column and isinstance(column, list):
        column = column[0]

    #optional clauses collapse to empty strings when they don't apply
    if column and exclude_missing:
        where_clause = f"WHERE {column} != '{MISSING_DATA_STR}'"
    else:
        where_clause = ""

    limit_clause = f"LIMIT {size}" if size else ""
    order_clause = f"ORDER BY {order}"

    query = f"""
    SELECT DISTINCT {str(column or '*')}
    FROM {table_name}
    {where_clause}
    {order_clause}
    {limit_clause}
    """

    with closing(sqlite3.connect(db_uri, uri=True)) as connection:
        cursor = connection.cursor()
        cursor.execute(query)
        #first item of every description entry is the column name
        headers = [entry[0] for entry in cursor.description]
        records = cursor.fetchall()

    if len(headers) == 1:
        #single column: unwrap the 1-tuples into a flat list of values
        frame = pd.DataFrame(
            data={headers[0]: [record[0] for record in records]})
    else:
        frame = pd.DataFrame(data=records, columns=headers)

    return frame.rename(columns=lambda name: name.replace("$", " "))