def read_json_file(file_path: str) -> Any:
    """Retrieve contents of JSON file.

    Parameters
    ----------
    file_path : string
        File path to read as utf-8 encoded JSON file

    Returns
    -------
    contents : Any
        Any JSON supported data structure, such as any combination of lists
        or dictionaries.

    Notes
    -----
    * Since this function ONLY supports reading JSON supported data structures
      from disk, Pandas DataFrames are NOT supported
    * Contrary to most other functions starting with 'read' in io_lib,
      NO extra manipulation such as parsing or format inference is done
      to any of the contents

    See Also
    --------
    * See Python online documentation of the more generalized function used
      here for the raw IO, `json.load` for more details on errors and
      limitations.
    """
    file_path_ = os_lib.normalize_path(file_path)
    _prepare_file_for_reading(file_path_, extension='.json', encoding='utf-8')
    if os_lib.is_empty_file(file_path_):
        return {}
    with open(file_path_, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
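

# A minimal sketch (not part of the original module) demonstrating the
# documented empty-file behavior of `read_json_file`, assuming the os_lib
# helpers accept ordinary absolute paths:
def _demo_read_empty_json() -> None:
    import tempfile
    with tempfile.TemporaryDirectory() as temp_dir:
        demo_path = os_lib.join_path(temp_dir, 'empty.json')
        open(demo_path, mode='x', encoding='utf-8').close()  # empty file
        assert read_json_file(demo_path) == {}  # empty file reads as {}
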
def create_json_file(file_path: str,
                     contents: Any,
                     replace_if_exists: bool = True) -> None:
    """Create JSON file containing given data.

    Parameters
    ----------
    file_path : string
        File path to create or replace as a utf-8 encoded JSON file
    contents : Any
        Contents to write to JSON in the form of any combination of lists
        or dictionaries.
    replace_if_exists : boolean, default True
        * If True, remove the file if present, creating a new one either way
        * If False, create the file only if not present, otherwise raise OSError

    Notes
    -----
    * Since this function ONLY supports writing JSON supported data structures
      to disk, Pandas DataFrames are NOT supported

    See Also
    --------
    * See Python online documentation of the more generalized function used
      here for the raw IO, `json.dumps` for more details on errors and
      limitations.
    """
    file_path_ = os_lib.normalize_path(file_path)
    _prepare_file_for_creation(file_path_, '.json', replace_if_exists)
    with open(file_path_, mode='x', encoding='utf-8') as json_file:
        json_file.write(json.dumps(contents))
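

# A hedged round-trip sketch for the two JSON helpers above; the temporary
# directory keeps it self-contained (demo-only names, not from the source):
def _demo_json_round_trip() -> None:
    import tempfile
    with tempfile.TemporaryDirectory() as temp_dir:
        demo_path = os_lib.join_path(temp_dir, 'demo.json')
        contents = {'quarters': ['Fall 2015', 'Winter 2016'], 'capacity': 40}
        create_json_file(demo_path, contents, replace_if_exists=True)
        assert read_json_file(demo_path) == contents
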
def create_sqlite_file(file_path: str,
                       replace_if_exists: bool = False) -> None:
    """Create SQL file if path is available and connection is successful.

    Parameters
    ----------
    file_path : string
        File path to create or replace as a utf-8 encoded SQLite file.
        The file is created as an empty SQLite database with no tables.
    replace_if_exists : boolean, default False
        * If True, remove the file if present, creating a new one either way
        * If False, create the file only if not present, otherwise raise OSError

    Notes
    -----
    * The fact that the function `sqlite3.connect` indiscriminately creates a
      SQLite file is relied upon here to establish the database
    * If the file creation is successful, then the file path and the status of
      'replaced' or 'created' is logged via `print`

    Raises
    ------
    ValueError
        * If file creation fails, or a connection to the newly created
          database cannot be established; in either case the file is
          completely removed

    See Also
    --------
    * See docstring of `connect_to_sqlite_database` for more information
      regarding what is considered a valid SQLite file.
    """
    file_path_ = os_lib.normalize_path(file_path)
    os_lib.ensure_path_is_absolute(file_path_)
    file_exists_before_creation = os_lib.is_existent_file(file_path_)

    _prepare_file_for_creation(file_path_, '.sql', replace_if_exists)
    try:
        sqlite3.connect(file_path_).close()  # creates the file on disk
        connect_to_sqlite_database(file_path_).close()
    except Exception:
        os_lib.remove_file(file_path_)  # remove faultily auto-created file
        raise ValueError(
            f'Cannot create database - '
            f'a valid connection to \'{file_path_}\' cannot be established.')

    # generate a report for the database update
    action_taken = 'replaced' if file_exists_before_creation else 'created'
    print(f'Database \'{os_lib.get_basename(file_path_)}\' '
          f'was successfully {action_taken}.')
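

# Sketch of typical usage (hypothetical path): `replace_if_exists=True` makes
# the creation idempotent, and the resulting empty database is immediately
# connectable via `connect_to_sqlite_database`.
def _demo_create_sqlite_file() -> None:
    import tempfile
    with tempfile.TemporaryDirectory() as temp_dir:
        db_path = os_lib.join_path(temp_dir, 'demo_db.sql')
        create_sqlite_file(db_path, replace_if_exists=True)
        connect_to_sqlite_database(db_path).close()
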
def main():
    df = warehouse.get_tutor_request_data()
    root_output_dir = os_lib.normalize_path(path=os_lib.join_path(
        PROJECT_DIR, 'external_datasets', 'pre_generated_data'))
    os_lib.remove_directory(root_output_dir,
                            ignore_errors=True)  # clear the dir if exists
    os_lib.create_directory(root_output_dir)
    for quarter_name in df['quarter'].unique():
        output_sub_dir = os_lib.join_path(root_output_dir,
                                          quarter_name.replace(' ', '_'))
        os_lib.create_directory(output_sub_dir)
        requests_in_quarter = df[df['quarter'] == quarter_name]
        generate_demo_quarter_data(requests_in_quarter, output_sub_dir)


def connect_to_sqlite_database(file_path: str) -> sqlite3.Connection:
    """Return sqlite3 connection if file_path leads to a valid SQLite file.

    Parameters
    ----------
    file_path : str
        File path to sqlite database - considered valid if it's an absolute,
        existing, UTF-8 encoded, SQLite file

    Returns
    -------
    `sqlite3.Connection`
        sqlite connection to database located at given path

    Raises
    ------
    ValueError
        * If file path is not an absolute path (e.g. only a file name is given)
        * If file name does not end with a single .sql extension
    FileNotFoundError
        * If file does not exist
    OSError
        * If file at path is corrupt or cannot be recognized as a SQLite file,
          as determined by its header string
    UnicodeDecodeError
        * If file is not encoded with UTF-8
    ConnectionRefusedError
        * If `sqlite3.Connection` object instantiation fails

    See Also
    --------
    * See the section 'magic header string' under the web page 'fileformat'
      at the online documentation for SQLite, for more information on the
      method used to determine what constitutes a valid SQLite file.
    """
    file_path_ = os_lib.normalize_path(file_path)
    _prepare_file_for_reading(file_path_, extension='.sql', encoding='utf-8')
    # ensure the first 16 bytes match SQLite's magic header string; an empty
    # file is permitted, since SQLite defers writing the header until the
    # database is first written to
    with open(file_path_, mode='rb') as file:
        header = file.read(16)
    if header and header != b'SQLite format 3\x00':
        raise OSError(
            f'File \'{file_path_}\' is either corrupted or not a '
            f'recognizable SQLite file.')

    try:
        return sqlite3.connect(file_path_)
    except Exception:
        raise ConnectionRefusedError(
            errno.ECONNREFUSED,
            f'SQLite file \'{os_lib.get_basename(file_path_)}\' '
            f'cannot be reached') from None
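

# Standalone sketch of the magic-header test used above: per the SQLite
# file-format docs, every non-empty database file begins with the 16 bytes
# of 'SQLite format 3' followed by a NUL byte.
def _is_sqlite_header(file_path: str) -> bool:
    with open(file_path, mode='rb') as file:
        return file.read(16) == b'SQLite format 3\x00'
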
def get_database_paths(con: sqlite3.Connection) -> List[str]:
    """Return absolute filepath to database from sqlite3 connection object.

    Notes
    -----
    * From "PRAGMA database_list" command in the sqlite pragma docs,
      The third column is the name of the database file.
    * A returned 'path' may be empty, in which case the database was not
      associated with the file.
    """
    cursor = con.cursor()
    cursor.execute("PRAGMA database_list")
    meta_rows = cursor.fetchall()
    return [os_lib.normalize_path(row[2]) for row in meta_rows]
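

# Sketch: 'PRAGMA database_list' yields one row per attached database with
# columns (seq, name, file); in-memory databases report an empty file path,
# which is why `get_database_paths` may return empty entries.
def _demo_database_list() -> None:
    con = sqlite3.connect(':memory:')
    print(con.execute('PRAGMA database_list').fetchall())  # [(0, 'main', '')]
    con.close()
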
def create_csv_file(file_path: str,
                    data: Union[np.ndarray, pd.Series, pd.DataFrame],
                    replace_if_exists: bool = False) -> None:
    """Create CSV file containing given data.

    Parameters
    ----------
    file_path : string
        File path to create or replace as a utf-8 encoded CSV file
    data : NumPy array, Pandas Series, or Pandas DataFrame
        Contents to write to CSV, with date formats inferred per
        ISO-8601 standards
    replace_if_exists : boolean, default False
        * If True, remove the file if present, creating a new one either way
        * If False, create the file only if not present, otherwise raise OSError

    Notes
    -----
    * Internally data is converted to DataFrame format before converting to csv
    * Unlike `pandas.DataFrame.to_csv`, empty DataFrames create an empty file

    See Also
    --------
    * See Pandas online documentation of the more generalized function used
      here for the raw IO, `pandas.DataFrame.to_csv`, for more details
      on errors and limitations.
    """
    file_path_ = os_lib.normalize_path(file_path)
    # convert to DataFrame
    if isinstance(data, pd.DataFrame):
        df = data
    elif isinstance(data, np.ndarray):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.Series):
        df = pd.DataFrame(data).T  # write the Series as a single row
    else:
        raise ValueError('Given data is invalid - only Pandas Series, '
                         'Pandas DataFrame, and NumPy ndarray are supported.')

    _prepare_file_for_creation(file_path_, '.csv', replace_if_exists)
    # if completely empty dataframe, create completely empty file
    if df.empty:
        with open(file_path_, mode='x', encoding='utf-8'):
            pass
    else:
        # to_csv returns None when given a path, so there is nothing to return
        df.to_csv(path_or_buf=file_path_, mode='x', encoding='utf-8')
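

# Hedged usage sketch (demo-only names): the first csv column holds the
# DataFrame index, matching the `index_col=0` convention of `read_csv_file`.
def _demo_create_csv_file() -> None:
    import tempfile
    with tempfile.TemporaryDirectory() as temp_dir:
        csv_path = os_lib.join_path(temp_dir, 'demo.csv')
        df = pd.DataFrame({'requests': [12, 7]}, index=['Mon', 'Tue'])
        create_csv_file(csv_path, df, replace_if_exists=True)
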
def read_csv_file(file_path: str,
                  num_rows: int = None,
                  date_columns: Sequence[Union[str, int]] = ()
                  ) -> Union[np.ndarray, pd.DataFrame]:
    """Retrieve contents of a csv file.

    Parameters
    ----------
    file_path : string
        File path to read as utf-8 encoded CSV
    num_rows : int, default None
        Number of rows to read from csv; all rows are read if None
    date_columns : array-like of strings or ints, default ()
        Columns to parse to datetime, as per ISO-8601 datetime standards

    Returns
    -------
    NumPy array
        If exactly one column is present, squeezed to its underlying values
    Pandas DataFrame
        If no columns are present, as an empty DataFrame
    Pandas DataFrame
        If more than one column is retrieved from csv, with the first column
        taken as index

    See Also
    --------
    * See Pandas online documentation of the more generalized function used
      here for the raw IO, `pandas.read_csv` for more details on errors
      and limitations.
    """
    date_columns_ = date_columns if date_columns else None
    file_path_ = os_lib.normalize_path(file_path)
    _prepare_file_for_reading(file_path_, extension='.csv', encoding='utf-8')
    # if completely empty file, return completely empty DataFrame
    if os_lib.is_empty_file(file_path_):
        return pd.DataFrame()

    data = pd.read_csv(filepath_or_buffer=file_path_,
                       index_col=0,
                       squeeze=True,
                       nrows=num_rows,
                       parse_dates=date_columns_,
                       encoding='utf-8',
                       infer_datetime_format=True)
    return data.values if isinstance(data, pd.Series) else data
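

# Round-trip sketch for the csv helpers above: a single-column frame comes
# back squeezed to a NumPy array, per the documented return types.
def _demo_csv_round_trip() -> None:
    import tempfile
    with tempfile.TemporaryDirectory() as temp_dir:
        csv_path = os_lib.join_path(temp_dir, 'demo.csv')
        create_csv_file(csv_path, pd.DataFrame({'count': [1, 2, 3]}),
                        replace_if_exists=True)
        assert isinstance(read_csv_file(csv_path), np.ndarray)
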
Example #9
"""
* This allows all code within the source to assume 'perfect' data, with the
  following assumptions:
  * ??
  * ??
"""
import sqlite3
from collections import namedtuple
from typing import Sequence, Tuple, Union, Dict, List, Set

import numpy as np
import pandas as pd

from stem_center_analytics.utils import io_lib, os_lib

# create a struct-like mapping for the three main data-source file paths
WAREHOUSE_DIR = os_lib.normalize_path(os_lib.get_parent_dir(__file__))
DATA_FILE_PATHS = namedtuple(
    'FilePaths', 'COURSE_RECORDS,QUARTER_DATES,DATABASE')(
        COURSE_RECORDS=os_lib.join_path(WAREHOUSE_DIR, 'course_records.json'),
        QUARTER_DATES=os_lib.join_path(WAREHOUSE_DIR, 'quarter_dates.csv'),
        DATABASE=os_lib.join_path(WAREHOUSE_DIR, 'stem_center_db.sql'),
    )
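

# Quick sketch: fields of the struct-like constant above are read by
# attribute; these endswith checks just illustrate the expected layout.
def _demo_data_file_paths() -> None:
    assert DATA_FILE_PATHS.DATABASE.endswith('stem_center_db.sql')
    assert DATA_FILE_PATHS.QUARTER_DATES.endswith('quarter_dates.csv')
    assert DATA_FILE_PATHS.COURSE_RECORDS.endswith('course_records.json')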


def connect_to_stem_center_db() -> sqlite3.Connection:
    """Context manager for connection to database containing cleaned/training data."""
    return io_lib.connect_to_sqlite_database(DATA_FILE_PATHS.DATABASE)
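

# Usage sketch: sqlite3 connections support the context-manager protocol
# (commit or rollback on exit, though not close), so callers may write:
def _demo_connect_to_stem_center_db() -> None:
    with connect_to_stem_center_db() as con:
        con.execute('SELECT 1').fetchall()
    con.close()  # the with-block does not close the connection itself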


def get_quarter_dates() -> pd.DataFrame:
    """Return DataFrame of all (manually entered) quarter start, end dates."""
    # hedged reconstruction: presumably reads the quarter-dates csv via the
    # io_lib helper; exact date-column arguments are unknown
    return io_lib.read_csv_file(DATA_FILE_PATHS.QUARTER_DATES)