def CreateTable_to_ColList(sqlcols):
    '''
    @description: Given the column section of a CREATE TABLE statement, returns the list of column names.
    @param sqlcols: String containing the column stub of a CREATE TABLE statement.
    '''

    sqlcmd_createtemptable = '''
        DROP TABLE IF EXISTS public.temp_colparser;
        CREATE TABLE public.temp_colparser (
            {cols}
        );
        '''.format(cols=sqlcols)

    sqlcmd_fetchcols = '''
    SELECT column_name
    FROM information_schema.columns
      WHERE table_name = 'temp_colparser'
      AND table_schema = 'public'
      ORDER BY ordinal_position;'''

    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn: # use context-managed db connection
        with conn.cursor() as cur:
            cur.execute(sqlcmd_createtemptable)

            cur.execute(sqlcmd_fetchcols)
            queried_columns = cur.fetchall()

            # drop the scratch table so repeated calls do not collide
            cur.execute('DROP TABLE IF EXISTS public.temp_colparser;')

    return [c[0] for c in queried_columns]
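# Illustrative usage sketch only: the column stub below is hypothetical, and this
# assumes connect_to_db and cred are importable exactly as they are used above.
if __name__ == '__main__':
    demo_cols = '''
        studentid int,
        collegeid int,
        start_date date
    '''
    # Expected result (declaration order): ['studentid', 'collegeid', 'start_date']
    print(CreateTable_to_ColList(demo_cols))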
def gen_persistence_dummies_df():
    ''' Fetches the enrollments table from the SQL database and returns a dataframe of persistence dummies.
    '''

    # Load data
    with connect_to_db(cred.host, cred.user, cred.pw,cred.dbname) as connection:
        enrollments = pd.read_sql('select enrollid, studentid, collegeid, start_date, end_date, date_last_verified, status from common.enrollments;',connection)

    # Convert date vars to datetime
    datevars = ['start_date','end_date', 'date_last_verified']
    for dtcol in datevars:
        enrollments[dtcol] = pd.to_datetime(enrollments[dtcol])

    # Drop rows missing necessary vars
    mandatoryvars = ['studentid','collegeid','start_date','status']
    irows = enrollments.shape[0]
    for mcol in mandatoryvars:
        enrollments = enrollments[ enrollments[mcol].notnull() ]    
    frows = enrollments.shape[0]
    print('Dropped enrollment rows with missing ' + ', '.join(mandatoryvars))
    print('--> Row count went from %d to %d.' % (irows, frows))

    # Change studentid & collegeid from float to int
    enrollments['studentid'] = enrollments['studentid'].astype(int)
    enrollments['collegeid'] = enrollments['collegeid'].astype(int)

    # Create persistence dummies
    for n in range(1,9):
        enrollments['persist_'+str(n)+'_halfyear'] = enrollments.apply(lambda row: persist_n_halfyear(row,n), axis=1)

    # Return dataframe
    return enrollments.drop(['start_date','end_date','date_last_verified','status'],axis=1)
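# persist_n_halfyear() is used above but not defined in this snippet. Below is a
# minimal sketch of what such a helper might look like, assuming a half-year is
# roughly 183 days and that an enrollment "persisted" through period n if it was
# still active (or last verified) at least n half-years after start_date. This is
# an illustrative guess, not the original implementation.
import pandas as pd

def persist_n_halfyear(row, n):
    cutoff = row['start_date'] + pd.Timedelta(days=183 * n)
    # prefer the recorded end_date; fall back to the last verification date
    last_known = row['end_date'] if pd.notnull(row['end_date']) else row['date_last_verified']
    if pd.isnull(last_known):
        return float('nan')           # not enough information either way
    return int(last_known >= cutoff)  # 1 = persisted through period n, 0 = did not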
    def load_rows(self, split=('test', 'train')):

        # check that only legit splits are being passed
        if not all(s in ['train', 'test'] for s in split):
            raise ValueError("split must be in ['test','train']")
        if len(split) > 2:
            raise ValueError(
                "load_rows() should only take two splits ('train', 'test', or both) maximum"
            )
        if len(split) == 0:
            raise ValueError(
                "load_rows() should get at least one split ('train' or 'test' or both)!"
            )

        with connect_to_db(cred.host, cred.user, cred.pw,
                           cred.dbname) as connection:

            # first load non pandas features

            # set search path

            search_path_string = 'set search_path to %s' % (self.schema)

            connection.cursor().execute(
                search_path_string)  #specify which schema to search

            if 'train' in split:
                self.train_megaquery = self.generate_megaquery('train')
                self.train_rows = pd.read_sql(self.train_megaquery, connection)
                self.train_rows.set_index(self.target[0]['train'].index_col,
                                          inplace=True)

            if 'test' in split:
                self.test_megaquery = self.generate_megaquery('test')
                self.test_rows = pd.read_sql(self.test_megaquery, connection)
                self.test_rows.set_index(self.target[0]['test'].index_col,
                                         inplace=True)

            # then handle pandas features

            if 'train' in split:
                #get objects
                self.pandas_trainfeatures = list(
                    get_feature_object(x, 'train')
                    for x in self.pandas_features)
                # get data
                self.get_pandas_rows(connection, self.pandas_trainfeatures,
                                     'train')

            if 'test' in split:
                self.pandas_testfeatures = list(
                    get_feature_object(x, 'test')
                    for x in self.pandas_features)
                self.get_pandas_rows(connection, self.pandas_testfeatures,
                                     'test')
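    # Usage sketch (illustrative): this method assumes it belongs to a DataLoader-style
    # object whose schema, target, pandas_features, generate_megaquery(), and
    # get_pandas_rows() are configured elsewhere in the pipeline, e.g.:
    #
    #   loader.load_rows(split=['train'])   # load only the training rows
    #   loader.train_rows.head()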
def main():
    # Create empty schemas
    sqlcmd_createschemas = """
		CREATE SCHEMA lookup;
		CREATE SCHEMA noble;
		CREATE SCHEMA kipp_nj;
		CREATE SCHEMA common;"""

    with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:  # use context managed db connection
        with conn.cursor() as cur:
            cur.execute(sqlcmd_createschemas)
            cur.connection.commit()
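# Typical entry point for the schema-creation script above (a sketch; the original
# snippet does not show how main() is invoked):
if __name__ == '__main__':
    main()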
    def __init__(self,rawcsv_dict,rawtoclean_fn,sqltable_name,sqltable_cols,cols_toindex='',overwrite=False,upload=True):
        ''' Loads a table into our Postgres database from local data. Performs 4 steps:

                1. Load raw CSVs into dataframes
                2. Map raw dataframes to 1 clean dataframe.
                3. Export clean dataframe to clean CSV.
                4. Load clean CSV to Postgres database.
        '''

        # Create dictionary of dataframes
        self.rawdf_dict = {key:pd.read_csv(val) for key,val in rawcsv_dict.items()}

        # Create clean dataframe
        self.cleandf = rawtoclean_fn(**self.rawdf_dict)

        # Write clean dataframe to CSV
        cleancsv = tempfile.NamedTemporaryFile()
        self.cleandf.to_csv(cleancsv.name, na_rep='NaN', header=False, index=False)

        # Generate the CREATE TABLE statement
        self.sqlcmd_createtable = '''
            {drop}

            CREATE TABLE {name}(
                {cols}
            );
            '''.format(drop='DROP TABLE IF EXISTS ' + sqltable_name + ';' if overwrite==True else '',
                       name=sqltable_name,
                       cols=sqltable_cols['partnerids']+sqltable_cols['data'])

        # Generate CREATE INDEX statements
        self.sqlcmd_createindex = '\n'.join( ['CREATE INDEX on ' + sqltable_name + '(' + col + ');' for col in cols_toindex] )

        # Upload clean CSV to SQL database
        if upload==True:
            with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn: # use context managed db connection
                with conn.cursor() as cur:
                    with open(cleancsv.name,"r") as f:

                        # create the temporary table
                        CreateSQLTable(cur=cur,sqlcmd_createtable=self.sqlcmd_createtable,sqltable_name=sqltable_name)

                        # create indices if specified
                        if self.sqlcmd_createindex: cur.execute(self.sqlcmd_createindex)

                        # load the data
                        cur.copy_from(f, sqltable_name, sep=',', null = 'NaN', columns = list(self.cleandf.columns.values))
                        cur.connection.commit()

            print('\nUploaded table: ' + sqltable_name)
        else:
            print('As requested, DID NOT UPLOAD ' + sqltable_name)
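    # Construction sketch (illustrative): the enclosing class name is not shown in
    # this snippet, so LocalCSVTable below is a placeholder, and the paths, cleaning
    # function, and column stubs are hypothetical.
    #
    #   def clean_contacts(contacts):
    #       return contacts.rename(columns={'sid': 'studentid'})
    #
    #   LocalCSVTable(rawcsv_dict={'contacts': 'raw/contacts.csv'},
    #                 rawtoclean_fn=clean_contacts,
    #                 sqltable_name='kipp_nj.contacts',
    #                 sqltable_cols={'partnerids': 'studentid int, ',
    #                                'data': 'email varchar(128)'},
    #                 cols_toindex=['studentid'],
    #                 overwrite=True)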
    def __init__(self,sqltable_name,sqltable_cols,sqlcmd_populate,cols_toindex='',overwrite=False,upload=True):
        ''' Creates and populates a table in our Postgres database using data already in the database. Performs 2 steps:

            1. Create an empty table.
            2. Populate the table from data in the database.
        '''

        # Generate the CREATE TABLE statement
        self.sqlcmd_createtable = '''
            {drop}

            CREATE TABLE {name}(
                {cols}
            );
            '''.format(drop='DROP TABLE IF EXISTS ' + sqltable_name + ';' if overwrite==True else '',
                       name=sqltable_name,
                       cols=sqltable_cols['commonids']+sqltable_cols['data'])

        # Generate CREATE INDEX statements
        self.sqlcmd_createindex = '\n'.join( ['CREATE INDEX on ' + sqltable_name + '(' + col + ');' for col in cols_toindex] )

        # Parse list of data columns
        datacol_list = CreateTable_to_ColList(sqltable_cols['data'])

        # Generate the INSERT INTO statement that populates the common table
        self.sqlcmd_populate = sqlcmd_populate.format(commontable=sqltable_name,
                                                      datacolnames=', '.join(datacol_list),
                                                      partnerdatacols=', '.join(['partnerdata.'+x for x in datacol_list]))

        # Create and populate the table in the SQL database
        if upload==True:
            with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn: # use context managed db connection
                with conn.cursor() as cur:

                    # create the empty table
                    CreateSQLTable(cur=cur,sqlcmd_createtable=self.sqlcmd_createtable,sqltable_name=sqltable_name)

                    # create indices if specified
                    if self.sqlcmd_createindex: cur.execute(self.sqlcmd_createindex)

                    # populate the table with data
                    cur.execute(self.sqlcmd_populate)
                    cur.connection.commit()

            print('Created and populated table: ' + sqltable_name)
        else:
            print('As requested, DID NOT UPLOAD ' + sqltable_name)
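    # The sqlcmd_populate argument is expected to be a format template containing the
    # placeholders used in the .format() call above. An illustrative template (the
    # source table and id column below are hypothetical):
    #
    #   sqlcmd_populate = '''
    #       INSERT INTO {commontable} (studentid, {datacolnames})
    #       SELECT partnerdata.studentid, {partnerdatacols}
    #       FROM kipp_nj.contacts AS partnerdata;'''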
import pandas as pd

from config import PERSISTENCE_PATH
from modeling.featurepipeline.dataloader import DataLoader
from modeling.featurepipeline.experiment import Experiment
from util.SQL_helpers import connect_to_db
from util import cred # import credentials
from modeling.features.all_features import IsFirstEnrollment


#### Set Output File ##########################################
outputFolder = 'visualization_logs/'
###############################################################


if __name__ == '__main__':
	
	with connect_to_db(cred.host, cred.user, cred.pw, cred.dbname) as conn:
		df = pd.read_sql("select id, start_date from college_persistence.enrollments where degree_type='Bachelors'",conn)
		df = df.set_index('id')
		df.loc[:,'start_date'] = pd.to_datetime(df.loc[:,'start_date'])
		ife = IsFirstEnrollment()
		ife.load_rows(conn)

	df = df.merge(ife.rows,how='left',left_index=True,right_index=True)

	df['spring'] = df.start_date.apply(lambda x: x.month in [12,1,2,3,4])
	df.loc[df.start_date.isnull(),'spring'] = pd.NaT


	ct = pd.crosstab(index=df.spring, columns=df.is_first_enrollment)
	print(ct)

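	# Sketch: persist the crosstab under the output folder defined above. The filename
	# is illustrative; the original script does not show whether or how the table is
	# written out.
	ct.to_csv(outputFolder + 'spring_vs_first_enrollment.csv')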