def read_json_file(file_path: str) -> Any:
    """Read and return the contents of a utf-8 encoded JSON file.

    Parameters
    ----------
    file_path : string
        File path to read as utf-8 encoded JSON file

    Returns
    -------
    contents : Any
        Whatever JSON-supported structure the file holds - any
        combination of lists or dictionaries. An empty dict is returned
        for a completely empty file.

    Notes
    -----
    * Since only JSON supported data structures can be round-tripped
      through this function, Pandas DataFrames are NOT supported
    * Contrary to most other functions starting with 'read' in io_lib,
      the contents are returned verbatim - no extra parsing or format
      inference is applied

    See Also
    --------
    * See Python online documentation of the more generalized function
      used here for the raw IO, `json.load`, for more details on errors
      and limitations.
    """
    normalized_path = os_lib.normalize_path(file_path)
    _prepare_file_for_reading(normalized_path, extension='.json', encoding='utf-8')
    # a completely empty file holds no JSON to parse - treat as empty dict
    if os_lib.is_empty_file(normalized_path):
        return {}
    with open(normalized_path, mode='r', encoding='utf-8') as source:
        return json.load(source)
def create_json_file(file_path: str, contents: Any,
                     replace_if_exists: bool = True) -> None:
    """Write the given data out as a utf-8 encoded JSON file.

    Parameters
    ----------
    file_path : string
        File path to create or replace as a utf-8 encoded JSON file at
    contents : Any
        Contents to write to JSON in the form of any combination of
        lists or dictionaries.
    replace_if_exists : boolean, default True
        * If True remove file if present, creating a new one either way
        * If False create only if a file is not present otherwise raise
          OSError

    Notes
    -----
    * Since this function ONLY supports writing JSON supported data
      structures to disk, Pandas DataFrames are NOT supported

    See Also
    --------
    * See Python online documentation of the more generalized function
      used here for the raw IO, `json.dumps`, for more details on errors
      and limitations.
    """
    normalized_path = os_lib.normalize_path(file_path)
    _prepare_file_for_creation(normalized_path, '.json', replace_if_exists)
    serialized = json.dumps(contents)
    # mode='x' guarantees exclusive creation; _prepare_file_for_creation
    # has already removed any pre-existing file when replacement is allowed
    with open(normalized_path, mode='x', encoding='utf-8') as destination:
        destination.write(serialized)
def create_sqlite_file(file_path: str, replace_if_exists: bool = False) -> None:
    """Create SQLite file if path is available and connection is successful.

    Parameters
    ----------
    file_path : string
        File path to create or replace as a utf-8 encoded SQLite file.
        The file is created as an empty SQLite database with no tables.
    replace_if_exists : boolean, default False
        * If True remove file if present, creating a new one either way
        * If False create only if a file is not present otherwise raise
          OSError

    Notes
    -----
    * The fact that the function `sqlite3.connect` indiscriminately
      creates a SQLite file is relied upon here to establish the database
    * If the file creation is successful, then the file path and the
      status of 'replaced' or 'created' is logged via `print`

    Raises
    ------
    ValueError
        * If error during file creation or a connection to the newly
          created database fails to establish a connection, in which
          case the file is completely removed

    See Also
    --------
    * See docstring of `connect_to_sqlite_database` for more information
      regarding what is considered a valid SQLite file.
    """
    file_path_ = os_lib.normalize_path(file_path)
    os_lib.ensure_path_is_absolute(file_path_)
    file_exists_before_creation = os_lib.is_existent_file(file_path_)
    _prepare_file_for_creation(file_path_, '.sql', replace_if_exists)
    try:
        # `sqlite3.connect` creates the file as a side effect; close both
        # connections immediately so no file handles are leaked
        sqlite3.connect(file_path_).close()
        connect_to_sqlite_database(file_path_).close()
    except Exception as error:
        os_lib.remove_file(file_path_)  # remove faultily auto-created file if exists
        raise ValueError(
            f'Cannot create database - '
            f'a valid connection to \'{file_path_}\' cannot be established.'
        ) from error

    # generate a report for the database update
    action_taken = 'replaced' if file_exists_before_creation else 'created'
    print(
        f'Database \'{os_lib.get_basename(file_path)}\' was successfully {action_taken}.'
    )
def main():
    """Regenerate the pre-generated demo data directory, one sub-dir per quarter."""
    requests = warehouse.get_tutor_request_data()
    output_root = os_lib.normalize_path(path=os_lib.join_path(
        PROJECT_DIR, 'external_datasets', 'pre_generated_data'))
    # start from a clean slate: wipe any previous output, then recreate it
    os_lib.remove_directory(output_root, ignore_errors=True)
    os_lib.create_directory(output_root)
    for quarter in requests['quarter'].unique():
        quarter_dir = os_lib.join_path(output_root, quarter.replace(' ', '_'))
        os_lib.create_directory(quarter_dir)
        subset = requests[requests['quarter'] == quarter]
        generate_demo_quarter_data(subset, quarter_dir)
def connect_to_sqlite_database(file_path: str) -> sqlite3.Connection:
    """Return sqlite3 connection if file_path leads to a valid SQLite file.

    Parameters
    ----------
    file_path : str
        File path to sqlite database - considered valid if it's an
        absolute, existing SQLite file with a single .sql extension

    Returns
    -------
    `sqlite3.Connection`
        sqlite connection to database located at given path

    Raises
    ------
    ValueError
        * If file path is not an absolute path (eg only file name is given)
        * If file name does not end with a single .sql extension
    FileNotFoundError
        * If file does not exist
    OSError
        * If file at path is corrupt or cannot be recognized as a SQLite
          file, as determined by its 16 byte magic header string
    ConnectionRefusedError
        * If `sqlite3.Connection` object instantiation fails

    See Also
    --------
    * See the section 'magic header string' under the web page
      'fileformat' at the online documentation for SQLite, for more
      information on the method used to determine what constitutes a
      valid SQLite file.
    """
    file_path_ = os_lib.normalize_path(file_path)
    _prepare_file_for_reading(file_path_, extension='.sql', encoding='utf-8')

    # every non-empty SQLite file begins with the exact 16 byte magic
    # header b'SQLite format 3\x00' (hex 53514c6974652066...); read the
    # header as raw bytes since a database file is binary, not utf-8 text
    with open(file_path_, mode='rb') as file:
        header = file.read(16)
    # a freshly created database is a zero byte file until its first
    # write, so an empty header is also accepted as valid
    if header and header != b'SQLite format 3\x00':
        raise OSError(
            f'File \'{file_path_}\' is either corrupted or not a '
            f'recognizable SQLite file.')
    try:
        return sqlite3.connect(file_path_)
    except Exception as error:
        raise ConnectionRefusedError(
            errno.ECONNREFUSED,
            f'SQLite file \'{os_lib.get_basename(file_path_)}\' cannot be reached'
        ) from error
def get_database_paths(con: sqlite3.Connection) -> List[str]:
    """Return absolute filepath to database from sqlite3 connection object.

    Notes
    -----
    * From "PRAGMA database_list" command in the sqlite pragma docs,
      the third column is the name of the database file.
    * A returned 'path' may be empty, in which case the database was
      not associated with the file.
    """
    rows = con.cursor().execute("PRAGMA database_list").fetchall()
    paths = []
    for row in rows:
        paths.append(os_lib.normalize_path(row[2]))  # column 2 = file name
    return paths
def create_csv_file(file_path: str,
                    data: Union[np.ndarray, pd.Series, pd.DataFrame],
                    replace_if_exists: bool = False) -> None:
    """Create CSV file containing given data.

    Parameters
    ----------
    file_path : string
        File path to create or replace as a utf-8 encoded CSV file
    data : NumPy array, Pandas Series, or Pandas DataFrame
        Contents to write to CSV, with date formats inferred per
        ISO-8601 standards
    replace_if_exists : boolean, default False
        * If True remove file if present, creating a new one either way
        * If False create only if a file is not present otherwise raise
          OSError

    Raises
    ------
    ValueError
        * If `data` is not a NumPy array, Pandas Series, or Pandas
          DataFrame

    Notes
    -----
    * Internally data is converted to DataFrame format before converting
      to csv
    * Unlike `pandas.DataFrame.to_csv`, an empty DataFrame creates an
      empty file

    See Also
    --------
    * See Pandas online documentation of the more generalized function
      used here for the raw IO, `pandas.DataFrame.to_csv`, for more
      details on errors and limitations.
    """
    file_path_ = os_lib.normalize_path(file_path)

    # normalize the supported input types to a DataFrame
    if isinstance(data, pd.DataFrame):
        df = data
    elif isinstance(data, np.ndarray):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.Series):
        df = pd.DataFrame(data).T  # a Series becomes a single row
    else:
        raise ValueError('Given data is invalid - only Pandas Series, '
                         'Pandas DataFrame, and NumPy ndarray are supported.')

    _prepare_file_for_creation(file_path_, '.csv', replace_if_exists)
    if df.empty:
        # if completely empty dataframe, create completely empty file
        # (`to_csv` would still emit an index/header line)
        with open(file_path_, mode='x', encoding='utf-8'):
            pass
    else:
        df.to_csv(path_or_buf=file_path_, mode='x', encoding='utf-8')
def read_csv_file(file_path: str, num_rows: int = None,
                  date_columns: Sequence[Union[str, int]] = ()) \
        -> Union[np.ndarray, pd.DataFrame]:
    """Retrieve contents of a csv file.

    Parameters
    ----------
    file_path : string
        File path to read as utf-8 encoded CSV
    num_rows : int, default None
        Number of rows to read from csv, all rows read if None
    date_columns : array-like of strings, default ()
        * Columns to parse to datetime, as per ISO-8601 datetime
          standards

    Returns
    -------
    NumPy array
        If only a single column is present
    Pandas DataFrame
        If no columns are present, as an empty DataFrame
    Pandas DataFrame
        If more than one column is retrieved from csv, with the first
        column taken as index

    See Also
    --------
    * See Pandas online documentation of the more generalized function
      used here for the raw IO, `pandas.read_csv`, for more details on
      errors and limitations.
    """
    date_columns_ = date_columns if date_columns else None
    file_path_ = os_lib.normalize_path(file_path)
    _prepare_file_for_reading(file_path_, extension='.csv', encoding='utf-8')

    # if completely empty file, return completely empty DataFrame
    # (use the normalized path, consistent with the rest of this module)
    if os_lib.is_empty_file(file_path_):
        return pd.DataFrame()

    data = pd.read_csv(filepath_or_buffer=file_path_,
                       index_col=0,
                       squeeze=True,
                       nrows=num_rows,
                       parse_dates=date_columns_,
                       encoding='utf-8',
                       infer_datetime_format=True)
    # `squeeze=True` collapses a single-column csv to a Series; expose
    # that case as a plain NumPy array per the documented contract
    return data.values if isinstance(data, pd.Series) else data
* This allows all code within the source to assume 'perfect' data, with the following assumptions: * ?? * ?? """ import sqlite3 from collections import namedtuple from typing import Sequence, Tuple, Union, Dict, List, Set import numpy as np import pandas as pd from stem_center_analytics.utils import io_lib, os_lib # create a struct-like mapping for the three main data-source file paths WAREHOUSE_DIR = os_lib.normalize_path(os_lib.get_parent_dir(__file__)) DATA_FILE_PATHS = namedtuple( 'FilePaths', 'COURSE_RECORDS,QUARTER_DATES,DATABASE')( COURSE_RECORDS=os_lib.join_path(WAREHOUSE_DIR, 'course_records.json'), QUARTER_DATES=os_lib.join_path(WAREHOUSE_DIR, 'quarter_dates.csv'), DATABASE=os_lib.join_path(WAREHOUSE_DIR, 'stem_center_db.sql'), ) def connect_to_stem_center_db() -> sqlite3.Connection: """Context manager for connection to database containing cleaned/training data.""" return io_lib.connect_to_sqlite_database(DATA_FILE_PATHS.DATABASE) def get_quarter_dates() -> pd.DataFrame: """Return DataFrame of all (manually entered) quarter start, end dates."""