def CreateTable_to_ColList(sqlcols):
    '''
    @description: Given the column section of a CREATE TABLE statement,
                  returns the list of column names.
    @param sqlcols: String containing the column stub of a CREATE TABLE statement.
    '''
    # Drop any leftover parser table first so repeated calls don't fail
    sqlcmd_createtemptable = '''
        DROP TABLE IF EXISTS public.temp_colparser;
        CREATE TABLE public.temp_colparser (
            {cols}
        );
    '''.format(cols=sqlcols)
    sqlcmd_fetchcols = '''
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = 'temp_colparser'
            AND table_schema = 'public';'''
    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:  # use context-managed db connection
        with conn.cursor() as cur:
            cur.execute(sqlcmd_createtemptable)
            cur.execute(sqlcmd_fetchcols)
            queried_columns = cur.fetchall()
    return [c[0] for c in queried_columns]
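# A minimal usage sketch (not from the original source); the column stub
# below is hypothetical, and the call assumes the util.cred credentials
# are valid:
example_stub = '''
    studentid   int,
    collegeid   int,
    start_date  date
'''
print CreateTable_to_ColList(example_stub)  # expected: ['studentid', 'collegeid', 'start_date']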
def gen_persistence_dummies_df():
    '''
    Fetches the enrollments table from the SQL database and returns a
    dataframe of enrollment persistence dummies.
    '''
    # Load data
    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as connection:
        enrollments = pd.read_sql('select enrollid, studentid, collegeid, start_date, '
                                  'end_date, date_last_verified, status '
                                  'from common.enrollments;', connection)

    # Convert date vars to datetime
    datevars = ['start_date', 'end_date', 'date_last_verified']
    for dtcol in datevars:
        enrollments[dtcol] = pd.to_datetime(enrollments[dtcol])

    # Drop rows missing necessary vars
    mandatoryvars = ['studentid', 'collegeid', 'start_date', 'status']
    irows = enrollments.shape[0]
    for mcol in mandatoryvars:
        enrollments = enrollments[enrollments[mcol].notnull()]
    frows = enrollments.shape[0]
    print 'Dropped enrollment rows with missing ' + ', '.join(mandatoryvars)
    print '--> Row count went from %d to %d.' % (irows, frows)

    # Change studentid & collegeid from float to int
    enrollments['studentid'] = enrollments['studentid'].astype(int)
    enrollments['collegeid'] = enrollments['collegeid'].astype(int)

    # Create persistence dummies for half-years 1 through 8
    for n in range(1, 9):
        enrollments['persist_' + str(n) + '_halfyear'] = enrollments.apply(
            lambda row: persist_n_halfyear(row, n), axis=1)

    # Return dataframe without the raw date/status columns
    return enrollments.drop(['start_date', 'end_date', 'date_last_verified', 'status'], axis=1)
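# persist_n_halfyear is called above but not defined in this section. A
# minimal sketch of what such a helper might look like, assuming the dummy
# is 1 when an enrollment spans at least n half-years (~n * 182 days) past
# start_date, falling back to date_last_verified when end_date is missing;
# the actual business rule in the original codebase may well differ.
def persist_n_halfyear(row, n):
    end = row['end_date'] if pd.notnull(row['end_date']) else row['date_last_verified']
    if pd.isnull(end):
        return 0  # cannot verify persistence without an end or verification date
    return int((end - row['start_date']).days >= n * 182)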
def load_rows(self, split=('test', 'train')):  # immutable default avoids the mutable-default-argument pitfall
    # Check that only legitimate splits are being passed
    if not all(s in ['train', 'test'] for s in split):
        raise ValueError("split must be in ['test','train']")
    if len(split) > 2:
        raise ValueError("load_rows() should only take two splits ('train', 'test', or both) maximum")
    if len(split) == 0:
        raise ValueError("load_rows() should get at least one split ('train' or 'test' or both)!")

    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as connection:
        # First load non-pandas features:
        # set the search path to specify which schema to search
        search_path_string = 'set search_path to %s' % (self.schema)
        connection.cursor().execute(search_path_string)

        if 'train' in split:
            self.train_megaquery = self.generate_megaquery('train')
            self.train_rows = pd.read_sql(self.train_megaquery, connection)
            self.train_rows.set_index(self.target[0]['train'].index_col, inplace=True)
        if 'test' in split:
            self.test_megaquery = self.generate_megaquery('test')
            self.test_rows = pd.read_sql(self.test_megaquery, connection)
            self.test_rows.set_index(self.target[0]['test'].index_col, inplace=True)

        # Then handle pandas features
        if 'train' in split:
            # Get feature objects, then load their data
            self.pandas_trainfeatures = [get_feature_object(x, 'train') for x in self.pandas_features]
            self.get_pandas_rows(connection, self.pandas_trainfeatures, 'train')
        if 'test' in split:
            self.pandas_testfeatures = [get_feature_object(x, 'test') for x in self.pandas_features]
            self.get_pandas_rows(connection, self.pandas_testfeatures, 'test')
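# A hypothetical usage sketch (not from the original source), assuming an
# experiment/pipeline object that exposes this load_rows method with
# schema, target, and pandas_features already configured:
#
#   exp.load_rows(split=['train'])   # load only the training rows
#   exp.load_rows()                  # load both splits (the default)
#   print exp.train_rows.head()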
def main():
    # Create empty schemas
    sqlcmd_createschemas = '''
        CREATE SCHEMA lookup;
        CREATE SCHEMA noble;
        CREATE SCHEMA kipp_nj;
        CREATE SCHEMA common;'''
    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:  # use context-managed db connection
        with conn.cursor() as cur:
            cur.execute(sqlcmd_createschemas)
            cur.connection.commit()
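# connect_to_db is imported from util.SQL_helpers and used throughout this
# code, but it is not defined in this section. A minimal sketch of a
# compatible helper, assuming psycopg2 as the Postgres driver; the real
# implementation may differ:
from contextlib import contextmanager
import psycopg2

@contextmanager
def connect_to_db(host, user, pw, dbname):
    '''Yield a Postgres connection and ensure it is closed afterwards.'''
    conn = psycopg2.connect(host=host, user=user, password=pw, dbname=dbname)
    try:
        yield conn
    finally:
        conn.close()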
def __init__(self, rawcsv_dict, rawtoclean_fn, sqltable_name, sqltable_cols,
             cols_toindex='', overwrite=False, upload=True):
    '''
    Loads a table into our Postgres database from local data.
    Performs 4 steps:
        1. Load raw CSVs into dataframes.
        2. Map raw dataframes to 1 clean dataframe.
        3. Export clean dataframe to clean CSV.
        4. Load clean CSV to Postgres database.
    '''
    # Create dictionary of dataframes
    self.rawdf_dict = {key: pd.read_csv(val) for key, val in rawcsv_dict.items()}

    # Create clean dataframe
    self.cleandf = rawtoclean_fn(**self.rawdf_dict)

    # Write clean dataframe to CSV
    cleancsv = tempfile.NamedTemporaryFile()
    self.cleandf.to_csv(cleancsv.name, na_rep='NaN', header=False, index=False)

    # Generate the CREATE TABLE statement
    self.sqlcmd_createtable = '''
        {drop}
        CREATE TABLE {name}(
            {cols}
        );
    '''.format(drop='DROP TABLE IF EXISTS ' + sqltable_name + ';' if overwrite else '',
               name=sqltable_name,
               cols=sqltable_cols['partnerids'] + sqltable_cols['data'])

    # Generate CREATE INDEX statements
    self.sqlcmd_createindex = '\n'.join(
        ['CREATE INDEX on ' + sqltable_name + '(' + col + ');' for col in cols_toindex])

    # Upload clean CSV to SQL database
    if upload:
        with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:  # use context-managed db connection
            with conn.cursor() as cur:
                with open(cleancsv.name, 'r') as f:
                    # create the table
                    CreateSQLTable(cur=cur,
                                   sqlcmd_createtable=self.sqlcmd_createtable,
                                   sqltable_name=sqltable_name)
                    # create indices if specified
                    if self.sqlcmd_createindex:
                        cur.execute(self.sqlcmd_createindex)
                    # load the data
                    cur.copy_from(f, sqltable_name, sep=',', null='NaN',
                                  columns=list(self.cleandf.columns.values))
                    cur.connection.commit()
        print '\nUploaded table: ' + sqltable_name
    else:
        print 'As requested, DID NOT UPLOAD ' + sqltable_name
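# CreateSQLTable is called above (and in the populate-from-database loader
# below) but not defined in this section. A minimal sketch of a compatible
# helper, assuming it simply executes the CREATE TABLE statement on the
# given cursor; the original may add logging or existence checks:
def CreateSQLTable(cur, sqlcmd_createtable, sqltable_name):
    '''Execute a CREATE TABLE statement for sqltable_name on cursor cur.'''
    cur.execute(sqlcmd_createtable)
    print 'Created table: ' + sqltable_name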
def __init__(self, sqltable_name, sqltable_cols, sqlcmd_populate,
             cols_toindex='', overwrite=False, upload=True):
    '''
    Creates and populates a table in our Postgres database using data
    already in the database. Performs 2 steps:
        1. Create an empty table.
        2. Populate the table from data in the database.
    '''
    # Generate the CREATE TABLE statement
    self.sqlcmd_createtable = '''
        {drop}
        CREATE TABLE {name}(
            {cols}
        );
    '''.format(drop='DROP TABLE IF EXISTS ' + sqltable_name + ';' if overwrite else '',
               name=sqltable_name,
               cols=sqltable_cols['commonids'] + sqltable_cols['data'])

    # Generate CREATE INDEX statements
    self.sqlcmd_createindex = '\n'.join(
        ['CREATE INDEX on ' + sqltable_name + '(' + col + ');' for col in cols_toindex])

    # Parse list of data columns
    datacol_list = CreateTable_to_ColList(sqltable_cols['data'])

    # Generate the INSERT INTO statement that populates the common table
    self.sqlcmd_populate = sqlcmd_populate.format(
        commontable=sqltable_name,
        datacolnames=', '.join(datacol_list),
        partnerdatacols=', '.join(['partnerdata.' + x for x in datacol_list]))

    # Create and populate the table in the SQL database
    if upload:
        with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:  # use context-managed db connection
            with conn.cursor() as cur:
                # create the empty table
                CreateSQLTable(cur=cur,
                               sqlcmd_createtable=self.sqlcmd_createtable,
                               sqltable_name=sqltable_name)
                # create indices if specified
                if self.sqlcmd_createindex:
                    cur.execute(self.sqlcmd_createindex)
                # populate the table with data
                cur.execute(self.sqlcmd_populate)
                cur.connection.commit()
        print 'Created and populated table: ' + sqltable_name
    else:
        print 'As requested, DID NOT UPLOAD ' + sqltable_name
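# A hypothetical usage sketch (not from the original source); the enclosing
# class name, table name, column stubs, and INSERT template below are all
# illustrative:
#
#   CommonTable(
#       sqltable_name='common.enrollments',
#       sqltable_cols={'commonids': 'enrollid int,', 'data': 'status varchar(32)'},
#       sqlcmd_populate='INSERT INTO {commontable} ({datacolnames}) '
#                       'SELECT {partnerdatacols} FROM noble.enrollments partnerdata;',
#       cols_toindex=['enrollid'],
#       overwrite=True)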
import pandas as pd  # needed for read_sql / to_datetime / crosstab below

from config import PERSISTENCE_PATH
from modeling.featurepipeline.dataloader import DataLoader
from modeling.featurepipeline.experiment import Experiment
from util.SQL_helpers import connect_to_db
from util import cred  # import credentials
from modeling.features.all_features import IsFirstEnrollment

#### Set Output File ##########################################
outputFolder = 'visualization_logs/'
###############################################################

if __name__ == '__main__':
    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:
        df = pd.read_sql("select id, start_date from college_persistence.enrollments "
                         "where degree_type='Bachelors'", conn)
        df = df.set_index('id')
        df.loc[:, 'start_date'] = pd.to_datetime(df.loc[:, 'start_date'])
        ife = IsFirstEnrollment()
        ife.load_rows(conn)
        df = df.merge(ife.rows, how='left', left_index=True, right_index=True)
        # Flag spring-term starts (December through April)
        df['spring'] = df.start_date.apply(lambda x: x.month in [12, 1, 2, 3, 4])
        df.loc[df.start_date.isnull(), 'spring'] = pd.NaT
        # Print the crosstab so the script actually produces output
        print pd.crosstab(index=df.spring, columns=df.is_first_enrollment)