Example #1
def __init__(self):
    self.sql = MSSQL()
    self.ftp = FTP()
    self.localdir = "files"
    self.remotedir = "seis"
    self.table_prefix = "SEIS"
    self.schools = self.ftp.get_directory_names(self.remotedir)
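This fragment (like several examples below) constructs MSSQL() and FTP() with no arguments, so connection settings come from the environment. A hedged sketch of supplying them in-process; the MS_-prefixed names follow the sqlsorcery README, so verify them against the version you use:

import os

# Assumed variable names (MS_ prefix per the sqlsorcery README) -- confirm
# against your installed version before relying on them.
os.environ["MS_SERVER"] = "localhost"
os.environ["MS_DB"] = "master"
os.environ["MS_SCHEMA"] = "dbo"
os.environ["MS_USER"] = "sa"
os.environ["MS_PWD"] = "..."

from sqlsorcery import MSSQL

sql = MSSQL()  # reads the values above at construction time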
Example #2
import pandas as pd
from sqlsorcery import MSSQL


def test_this_works():
    config = {
        "server": "db",
        "db": "master",
        "user": "******",
        "pwd": "Just4TestingThis",
        "schema": "dbo",
    }
    sql = MSSQL(**config)
    data = [{"id": 1, "value": "test1"}, {"id": 2, "value": "test2"}]
    df = pd.DataFrame(data)
    sql.insert_into("test_table", df)
    results = pd.read_sql_table("test_table",
                                con=sql.engine,
                                schema=sql.schema)
    assert len(results) == 2
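The test leaves test_table behind between runs. A minimal cleanup sketch using the same SQLAlchemy engine the read_sql_table call relies on (it would sit at the end of the test, where sql is in scope):

from sqlalchemy import text

with sql.engine.begin() as conn:  # begin() commits the DROP on exit
    conn.execute(text(f"DROP TABLE {sql.schema}.test_table"))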
Example #3
def db_generator(config):
    db_type = config.DB_TYPE
    if db_type == "mssql":
        return MSSQL()
    elif db_type == "postgres":
        return PostgreSQL()
    elif db_type == "sqlite":
        return SQLite(path=config.SQLITE_FILE)
    else:
        raise ValueError(f"Unsupported DB_TYPE: {db_type}")
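A factory like this is typically fed by a settings object; a minimal sketch of driving it (the Config class here is invented for illustration):

class Config:
    DB_TYPE = "sqlite"
    SQLITE_FILE = "local.db"

db = db_generator(Config)  # returns a SQLite connection wrapper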
Example #4
def db_generator(config):
    db_type = config.DB_TYPE
    default_config = {
        "schema": config.DB_SCHEMA,
        "server": config.DB_SERVER,
        "port": None,
        "db": config.DB,
        "user": config.DB_USER,
        "pwd": config.DB_PWD,
    }
    if db_type == "mssql":
        return MSSQL(**default_config)
    elif db_type == "postgres":
        return PostgreSQL(**default_config)
    elif db_type == "sqlite":
        return SQLite(path=config.DB)
    else:
        raise ValueError(f"Unsupported DB_TYPE: {db_type}")
Example #5
def main():
    try:
        school_year = os.getenv("CURRENT_SCHOOL_YEAR")
        conn = MSSQL()
        ftp = FTP()

        ftp.archive_remote_files(SOURCEDIR)
        ftp.delete_old_archive_files(SOURCEDIR)

        api_suffixes = os.getenv("API_SUFFIXES").split(",")
        API(api_suffixes).request_reports()
        if int(os.getenv("DELETE_LOCAL_FILES")):
            delete_data_files(LOCALDIR)
        files = download_from_ftp(ftp)

        process_application_data(conn, files, school_year)

        process_change_tracking(conn)

        if args.targets:
            sync_enrollment_targets(conn, school_year)
            conn.exec_sproc("sproc_SchoolMint_LoadTargetsWide")
            conn.exec_sproc("sproc_Schoolmint_create_intercepts")
            conn.exec_sproc("sproc_Schoolmint_load_Fact_PM")

        process_fact_daily_status(conn)

        success_message = read_logs("app.log")
        mailer = Mailer()
        mailer.notify(results=success_message)

    except Exception as e:
        logging.exception(e)
        stack_trace = traceback.format_exc()
        mailer = Mailer()
        mailer.notify(success=False, error_message=stack_trace)
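The snippet relies on module-level names (SOURCEDIR, LOCALDIR, args, and several helper functions) defined elsewhere in the source. The usual entry point for a script like this:

if __name__ == "__main__":
    main()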
Example #6
def migrate_mssql():
    try:
        sql = MSSQL()

        # Tables
        tables = [
            "schoolmint_ApplicationData_changehistory",
            "schoolmint_ApplicationData_raw_backup",
            "schoolmint_ApplicationData_raw",
            "schoolmint_ApplicationDataIndex_raw_backup",
            "schoolmint_ApplicationDataIndex_raw",
            "schoolmint_ApplicationStatuses",
            "schoolmint_FactDailyStatus",
            "schoolmint_lk_Enrollment",
            "schoolmint_ProgressMonitoring",
            "schoolmint_SchoolCodes",
        ]
        for table in tables:
            sql.exec_cmd_from_file(f"sql/mssql/tables/{table}.sql")

        # Load lookup tables
        enrollments = pd.read_csv("sql/data/lk_enrollment.csv")
        sql.insert_into("schoolmint_lk_Enrollment", enrollments)

        application_statuses = pd.read_csv("sql/data/application_statuses.csv")
        sql.insert_into("schoolmint_ApplicationStatuses", application_statuses)

        # Views
        views = [
            "vw_schoolmint_AppStatusList",
            "vw_schoolmint_FactDailyStatus_InterimTargets",
            "vw_schoolmint_FactDailyStatus",
            "vw_schoolmint_ProgressMonitoring",
            "vw_schoolmint_FactProgressMonitoring",
            "vw_schoolmint_Index_Demographics",
        ]
        for view in views:
            sql.exec_cmd_from_file(f"sql/mssql/views/{view}.sql")

        # Stored Procedures
        sprocs = [
            "sproc_schoolmint_Create_ChangeTracking_Entries",
            "sproc_schoolmint_Create_FactDailyStatus",
            "sproc_schoolmint_Index_PostProcess",
            "sproc_schoolmint_Index_PrepareTables",
            "sproc_schoolmint_Raw_PostProcess",
            "sproc_schoolmint_Raw_PrepareTables",
        ]
        for sproc in sprocs:
            sql.exec_cmd_from_file(f"sql/mssql/sprocs/{sproc}.sql")

    except ProgrammingError as e:
        if "Cannot open database" in str(e):
            print("ERROR: First create your database and schema manually")
        else:
            raise
Example #7
import urllib.request
import zipfile
import pandas as pd
from sqlsorcery import MSSQL

# Download 2019 SBAC Research Files
url = "http://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/sb_ca2019_1_csv_v2.zip"
urllib.request.urlretrieve(url, "sb_ca2019_1_csv_v2.zip")

# Extract flat files from zip folder
with zipfile.ZipFile("sb_ca2019_1_csv_v2.zip", "r") as z:
    z.extractall()

# Read datasets into dataframes
entities = pd.read_csv("sb_ca2019entities_csv.txt", encoding="ISO-8859-1")
scores = pd.read_csv("sb_ca2019_1_csv_v2.txt", encoding="ISO-8859-1")

# Establish database connection
sql = MSSQL()

# Insert SBAC files into database tables
sql.insert_into("sbac_entities_2019", entities, chunksize=1000)
sql.insert_into("sbac_scores_2019", scores, chunksize=1000)

# Query district scores for Oakland and San Francisco
df = sql.query_from_file("query.sql")
print(df)
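sqlsorcery also accepts inline SQL via query() (as later examples show), so the last step could skip the separate file. The statement below is illustrative only, since the real query.sql is not shown; depending on your default schema, the table name may need schema-qualifying:

# Illustrative only: the actual contents of query.sql are not shown above.
df = sql.query("SELECT TOP(10) * FROM sbac_entities_2019")
print(df)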
Example #8
def __init__(self):
    self.data_dir = "data"
    self.sql = MSSQL()
    self.ftp = FTP(self.data_dir)
Example #9
class Connector:
    """ETL connector class"""
    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse."""
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(
                f"Clever_{table_name} is up to date. No records inserted.")
            return
        file_names = self._generate_file_names(start_date, yesterday,
                                               report_name)
        df = self._read_and_concat_files(file_names)
        self.sql.insert_into(f"Clever_{table_name}", df, if_exists="append")
        logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table."""
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        file_names = []
        while start_date <= yesterday:  # one file name per day, up to and including yesterday
            formatted_date = start_date.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            start_date += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        data = pd.concat(dfs)
        return data

    def sync_student_google_accounts(self):
        """Get student emails from Google Accounts Manager app."""
        browser = Browser(self.data_dir)
        browser.export_student_google_accounts()
        # Transform and load csv data into database table
        df = self._get_data_from_csv_by_name("Student_export")
        df.rename(columns={"ID": "SIS_ID"}, inplace=True)
        self.sql.insert_into("Clever_StudentGoogleAccounts",
                             df,
                             if_exists="replace")
        logging.info(
            f"Inserted {len(df)} new records into Clever_StudentGoogleAccounts."
        )

    def _get_data_from_csv_by_name(self, string_to_match):
        """Get the downloaded csv by name and store it in a dataframe."""
        for filename in os.listdir(self.data_dir):
            if fnmatch(filename, f"*{string_to_match}*"):
                file_path = f"{self.data_dir}/{filename}"
                break
        else:
            # No break: nothing matched, so fail loudly instead of hitting
            # a NameError on file_path below.
            raise FileNotFoundError(
                f"No file matching '*{string_to_match}*' in {self.data_dir}")
        df = pd.read_csv(file_path)
        logging.info(f"Loaded {len(df)} records from downloaded file.")
        return df
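A minimal driver for this class, assuming the module-level data_reports mapping it iterates over is defined in the same module:

if __name__ == "__main__":
    connector = Connector()
    connector.sync_all_ftp_data()
    connector.sync_student_google_accounts()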
Example #10
class Connector:
    """
    Data connector for Extracting data, Transforming into dataframes,
    and Loading into a database.
    """
    def __init__(self):
        self.sql = MSSQL()
        self.ftp = FTP()
        self.localdir = "files"
        self.remotedir = "seis"
        self.table_prefix = "SEIS"
        self.schools = self.ftp.get_directory_names(self.remotedir)

    def remove_local_files(self):
        """Remove any leftover files from local project directory."""
        filelist = os.listdir(self.localdir)
        for filename in filelist:
            if "gitkeep" not in filename:
                os.remove(os.path.join(self.localdir, filename))

    def get_files_from_ftp(self):
        """Loop through all sub-folders and download files from FTP."""
        self.remove_local_files()
        self.ftp.download_all(self.remotedir, self.localdir)
        self.filenames = [
            f for f in os.listdir(self.localdir) if f.endswith(".csv")
        ]
        logging.info(f"{len(self.filenames)} files downloaded. ")

    def read_files_into_df(self, file_name):
        """
        Given the file name (e.g. Student or Service), read the files and concat into one DataFrame.

        Params:
            file_name (str): name of the file that you are trying to combine.

        Return:
            DataFrame: combined data from all files with the same name (i.e. the same type of data)
        """
        dfs = []
        for school in self.schools:
            path = os.path.join(self.localdir, f"{school}_{file_name}.csv")
            df = pd.read_csv(path,
                             sep=",",
                             quotechar='"',
                             doublequote=True,
                             dtype=str,
                             header=0)
            dfs.append(df)
        merged = pd.concat(dfs)
        merged.replace(np.nan, "", regex=True, inplace=True)
        return merged

    def insert_df_into_db(self, df, table_name):
        """
        Insert DataFrame into database with given table name.

        Params:
            df (DataFrame): data to insert into the database.
            table_name (str): name of the database table that you want to update.

        Return:
            None
        """
        table = f"{self.table_prefix}_{table_name}"
        self.sql.insert_into(table, df, if_exists="replace")
        logging.info(f"Inserted {len(df)} records into {table}.")
Example #11
from flask import Flask, jsonify, render_template
from sqlsorcery import MSSQL
import pandas as pd

app = Flask(__name__)
db = MSSQL()


@app.route("/api")
def index():
    tables = db.engine.table_names(schema=db.schema)
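    # Engine.table_names() works on SQLAlchemy 1.x but was removed in 2.0; on
    # newer versions use sqlalchemy.inspect(db.engine).get_table_names(schema=db.schema).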
    return render_template("index.html", tables=tables)


@app.route("/api/<table>", methods=["GET"])
def endpoint(table):
    data = pd.read_sql_table(table, con=db.engine, schema=db.schema)
    data = data.to_dict(orient="records")
    return jsonify(data)
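A quick way to exercise these routes without starting a server is Flask's built-in test client (the table name below is illustrative):

with app.test_client() as client:
    client.get("/api")                                # rendered index.html
    rows = client.get("/api/some_table").get_json()   # list of row dicts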
Example #12
def insert_table(df, tablename):
    """Drop and replace database table with each run"""
    df_len = len(df.index)
    sql = MSSQL()
    sql.insert_into(tablename, df, if_exists="replace")
    logging.info(f"Loaded {df_len} students into {tablename} table")
Example #13
class Connector:
    """ETL connector class"""
    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse."""
        if report_name == "idm-reports":
            # this folder contains the student-emails file, which has no datestamp in its name
            self._process_files_without_datestamp(table_name, report_name)
        else:
            self._process_files_with_datestamp(table_name, report_name)

    def _process_files_without_datestamp(self, table_name, report_name):
        # Student Emails file doesn't contain a datestamp in the file name
        # This table should be truncated and replaced.
        df = self._read_file(f"{self.data_dir}/google-student-emails.csv")
        self.sql.insert_into(f"Clever_{table_name}", df, if_exists="replace")
        logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _process_files_with_datestamp(self, table_name, report_name):
        # Generate names for files with datestamps in the file name and process those files
        # These tables should be appended to, not truncated.
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(
                f"Clever_{table_name} is up to date. No records inserted.")
            return
        file_names = self._generate_file_names(start_date, yesterday,
                                               report_name)
        df = self._read_and_concat_files(file_names)
        self.sql.insert_into(f"Clever_{table_name}", df, if_exists="append")
        logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table."""
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        file_names = []
        while start_date <= yesterday:  # one file name per day, up to and including yesterday
            formatted_date = start_date.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            start_date += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        data = pd.concat(dfs)
        return data

    def _read_file(self, file_name):
        df = pd.read_csv(file_name)
        logging.info(f"Read {len(df)} records from '{file_name}'.")
        return df
Example #14
def main():
    # `file` and `table` are module-level names defined elsewhere in the source.
    df = pd.read_csv(file, sep=",")
    sql = MSSQL()
    sql.insert_into(table, df)