def write(self):
        print(
            f'Writing {len(self.rows):,} rows to {IpedsCharge.__tablename__} table in database.'
        )
        session = Session()

        if len(self.rows) > 0:
            try:
                for row in self.rows:
                    _ = session.query(IpedsCharge).filter(
                        IpedsCharge.unitid == row.unitid,
                        IpedsCharge.date_key == row.date_key).delete(
                            synchronize_session=False)
                session.bulk_save_objects(self.rows)
                session.commit()
            except Exception as e:
                print(f'An error occurred:\n{str(e)}.')
                session.rollback()
                print('No changes were made to the database due to error.')
            else:
                print('Rows successfully written to database.')
        else:
            print('No rows were available to insert.')

        session.close()
    def write(self):
        session = Session()

        if len(self.rows) > 0:
            try:
                _ = session.query(self.rows[0].__class__).filter(self.rows[0].__class__.date_key==self.date_key).delete(synchronize_session=False)
                session.bulk_save_objects(self.rows)
                session.commit()
            except Exception as e:
                print(f'An error occurred:\n{str(e)}.')
                session.rollback()
                print('No changes were made to the database due to error.')
            else:
                print('Rows successfully written to database.')
        else:
            print('No rows were available to insert.')
            
        session.close()
Ejemplo n.º 3
0
import numpy as np
import pandas as pd
from pandas import DataFrame
from sqlalchemy import sql

print("Reading faculty dimension data")
df = pd.read_csv('data/ipeds_faculty_dimension.csv')

session = Session()

try:
    print('Populating dimension tables.')
    # insert state data
    record_deletes = session.query(IpedsFacultyDimension).delete(
        synchronize_session=False)
    df_inserts = session.bulk_insert_mappings(
        mapper=IpedsFacultyDimension, mappings=df.to_dict(orient='records')),
    print('\tFinished populating counties table.')

except Exception as e:
    session.rollback()
    print(
        "\tAn error occurred and no data were changed in the database.\n\nError:\n{}"
        .format(str(e)))
else:
    session.commit()
    print('\tChanges committed to database.')

session.close()

print("All Done.")
Ejemplo n.º 4
0
def main():
    print("Reading carnegie_classes data")

    labels = get_sheet('Labels')
    labels.columns = labels.columns.str.strip().str.lower()
    labels = labels.rename(
        columns={
            'variable': 'classification_id',
            'label': 'classification',
            'value': 'class_code',
            'label.1': 'carnegie_class'
        })
    labels = labels[labels.class_code.notnull()]
    labels = labels.ffill()

    vals = get_sheet('Data')

    # reshape from wide to long format
    carnegie = pd.melt(vals,
                       id_vars=['UNITID'],
                       var_name='classification_id',
                       value_vars=[
                           'BASIC2005',
                           'BASIC2010',
                           'BASIC2015',
                           'BASIC2018',
                           'IPUG2018',
                           'IPGRAD2018',
                           'ENRPROFILE2018',
                           'UGPROFILE2018',
                           'SIZESET2018',
                           'CCE2015',
                       ],
                       value_name='class_code')

    carnegie = carnegie.rename(columns={'UNITID': 'unitid'})

    carnegie = carnegie.merge(labels,
                              on=['class_code', 'classification_id'],
                              how='inner')

    session = Session()

    try:
        print('Populating dimension tables.')
        # insert state data
        record_deletes = session.query(CarnegieClass).delete(
            synchronize_session=False)
        df_inserts = session.bulk_insert_mappings(
            mapper=CarnegieClass, mappings=carnegie.to_dict(orient='records')),
        print('\tFinished populating carnegie_classes table.')

    except Exception as e:
        session.rollback()
        print(
            "\tAn error occurred and no data were changed in the database.\n\nError:\n{}"
            .format(str(e)))
        print("\tNo changed made to database.\n")
    else:
        session.commit()
        print('\tChanges committed to database.\n')

    session.close()

    print("All Done.")
Ejemplo n.º 5
0
def main():
    for year in np.arange(args.first, args.last + 1):
        try:
            spec = f'data/nsf_{year}.pickle'
            print(f'Reading data for fiscal year ending {year}...', end='', flush=True)
            with open(spec, 'rb') as f:
                herd = pickle.load(f)
        except Exception as e:
            print(f'ERROR.\nFile not downloaded properly.\n\n{str(e)}\n')
        else:
            print('DONE.')
            # herd.info()

        # set date key
        date_key = f'{year}-06-30'

        # modify data frame to apply needed fixes
        herd['date_key'] = date_key

        # convert id's and add missing values
        herd.ncses_inst_id = herd.ncses_inst_id.fillna('XXXXXXXX')

        # get unitid fixes
        with open('data/inst_id_fixes.pickle', 'rb') as f:
            fixes = pickle.load(f)

        # apply fixes to unitid column, fill missing, and convert to integer
        herd['unitid'] = herd.inst_id.map(fixes)
        herd.unitid = np.where(herd.unitid.isna(), herd.ipeds_unitid, herd.unitid)
        herd.unitid = herd.unitid.fillna(-1).astype(int)

        herd = herd[herd.column.isin(['DOD',
                                    'DOE',
                                    'HHS',
                                    'NASA',
                                    'NSF',
                                    'USDA',
                                    'Other agencies',
                                    'State and local government',
                                    'Business',
                                    'Institution funds',
                                    'Nonprofit organziations'])]


        herd['agency_key'] = item_recode(herd.column,
                                        {'DOD': 'DOD',
                                        'DOE': 'DOE',
                                        'HHS': 'HHS',
                                        'NASA': 'NAS',
                                        'NSF': 'NSF',
                                        'USDA': 'USD',
                                        'Other agencies': 'OTH',
                                        'State and local government': 'SLG',
                                        'Business': 'BUS',
                                        'Institution funds': 'INS',
                                        'All other sources': 'OTH',
                                        'Nonprofit organziations': 'NPO'},
                                        'Unknown')

        herd['funding_type'] = herd.questionnaire_no.str[:2]

        herd = herd[herd.funding_type.isin(['09', '11'])]

        herd['funding_type'] = item_recode(herd.funding_type, {'09': 'Federal', '11': 'Non-federal'}, 'Unknown')

        herd['academic_field_key'] = herd.questionnaire_no.str[2:].str.strip()
        herd = herd[herd.academic_field_key.isin(['A','B01','B02','B03','B04','B05','B06','B07','B08',
                                                'B09','C01','C02','C03','C04','D01','D02','D03','D04',
                                                'D05','E','F01','F02','F03','F04','F05','G','H01',
                                                'H02','H03','H04','H05','I','J01','J02','J03','J04',
                                                'J05','J06','J07','J08'])]

        herd['data'] = herd.data.fillna(0) * 1000

        # rename columns
        herd = herd.rename(columns = {'data': 'expenditure'})

        # de-duplicate items
        keepers = ['inst_id',
                'date_key',
                'funding_type',
                'agency_key',
                'academic_field_key',
                'ncses_inst_id',
                'unitid',
                'expenditure']

        herd = herd[keepers].groupby(['inst_id',
                                    'date_key',
                                    'funding_type',
                                    'agency_key',
                                    'academic_field_key',
                                    'ncses_inst_id',
                                    'unitid']).sum().reset_index()

        herd['expenditure'] = herd.expenditure.fillna(0)
        # herd = herd.fillna(sql.null())

        # insert data into dbo.survey_records
        session = Session()

        try:
            print(f'Attempting to insert {herd.shape[0]:,} rows for {year} into {NsfHerdDetail.__tablename__}.')
            record_deletes = session.query(NsfHerdDetail).filter(NsfHerdDetail.date_key==date_key).delete(synchronize_session=False)
            session.bulk_insert_mappings(mapper = NsfHerdDetail,
                                        mappings = herd.to_dict(orient='records'),
                                        render_nulls = True)
        except Exception as e:
            session.rollback()
            print(str(e))
            print('No data were altered due to error.\n')
        else:
            session.commit()
            print(f'\t{record_deletes:,} old records were deleted.')
            print(f'\t{herd.shape[0]:,} new records were inserted.\n')
        finally:
            session.close()
            session = None

    print('All done!')
Ejemplo n.º 6
0
def main():
    for year in np.arange(args.first, args.last + 1):
        try:
            spec = f'data/nsf_{year}.pickle'
            print(f'Reading data for fiscal year ending {year}...',
                  end='',
                  flush=True)
            with open(spec, 'rb') as f:
                herd = pickle.load(f)
        except Exception as e:
            print(f'ERROR.\nFile not downloaded properly.\n\n{str(e)}\n')
        else:
            print('DONE.')
            # herd.info()

        # set date key
        date_key = f'{year}-06-30'

        # modify data frame to apply needed fixes
        herd['date_key'] = date_key

        herd['med_sch_flag'] = herd.med_sch_flag.isin(
            ['T', 'TRUE', 'True', 'true', 't', 'Y', 'Yes', '1'])

        herd['toi_code'] = herd.toi_code == 1

        toc = {1: 'Public', 2: 'Private'}

        herd['toc_code'] = item_recode(herd['toc_code'], toc, 'Unknown')
        herd['inst_state_code'] = item_recode(herd['inst_state_code'],
                                              state_fips, 0)

        # convert id's and add missing values
        herd.ncses_inst_id = herd.ncses_inst_id.fillna('XXXXXXXX')

        # get unitid fixes
        with open('data/inst_id_fixes.pickle', 'rb') as f:
            fixes = pickle.load(f)

        # apply fixes to unitid column, fill missing, and convert to integer
        herd['unitid'] = herd.inst_id.map(fixes)
        herd.unitid = np.where(herd.unitid.isna(), herd.ipeds_unitid,
                               herd.unitid)
        herd.unitid = herd.unitid.fillna(-1).astype(int)

        # select questionnaire_no's for institutional aggregate values
        herd = herd[herd.questionnaire_no.isin([
            '01.a', '01.b', '01.c', '01.d', '01.e', '01.f', '01.g', '04',
            'NA_01', '15'
        ])]

        # data are reported in thousands of dollars - make explicit
        herd['data'] = np.where(herd.questionnaire_no == '15', herd.data,
                                herd.data.fillna(0) * 1000)

        # add labels for personnel variables
        herd.loc[(herd['questionnaire_no'] == '15') &
                 (herd['row'] == 'Principal investigators'),
                 'questionnaire_no'] = 'principal_investigators'
        herd.loc[(herd['questionnaire_no'] == '15') &
                 (herd['row'] == 'Other personnel'),
                 'questionnaire_no'] = 'other_personnel'
        herd.loc[(herd['questionnaire_no'] == '15') & (herd['row'] == 'Total'),
                 'questionnaire_no'] = 'research_personnel'

        keepers = [
            'inst_id', 'date_key', 'unitid', 'ncses_inst_id', 'inst_name_long',
            'inst_state_code', 'toc_code', 'toi_code', 'med_sch_flag',
            'questionnaire_no', 'data'
        ]

        # long to wide
        institutions = herd.pivot_table(index=[
            'inst_id', 'date_key', 'unitid', 'ncses_inst_id', 'inst_name_long',
            'inst_state_code', 'toc_code', 'toi_code', 'med_sch_flag'
        ],
                                        columns='questionnaire_no',
                                        values='data',
                                        aggfunc=np.sum,
                                        fill_value=0).reset_index()

        # rename columns
        institutions = institutions.rename(
            columns={
                'inst_name_long': 'institution_name',
                'inst_state_code': 'state_fips',
                'toc_code': 'control',
                'toi_code': 'academic_institution',
                'med_sch_flag': 'medical_school_flag',
                '01.a': 'federal_government',
                '01.b': 'state_and_local_government',
                '01.c': 'business',
                '01.d': 'nonprofit_organizations',
                '01.e': 'institutional_funds',
                '01.f': 'other_sources',
                '01.g': 'total_rd_expenses',
                '04': 'medical_school_expenses',
                'NA_01': 'arra_funds'
            })

        # replace NaN with database-compliant nulls
        institutions['research_personnel'] = np.where(
            institutions.research_personnel == 0, None,
            institutions.research_personnel)

        # insert data into dbo.survey_records
        session = Session()

        try:
            print(
                f'Attempting to insert {institutions.shape[0]:,} rows for {year} into {NsfHerdInstitution.__tablename__}.'
            )
            record_deletes = session.query(NsfHerdInstitution).filter(
                NsfHerdInstitution.date_key == date_key).delete(
                    synchronize_session=False)
            session.bulk_insert_mappings(
                mapper=NsfHerdInstitution,
                mappings=institutions.to_dict(orient='records'),
                render_nulls=True)
        except Exception as e:
            session.rollback()
            print(str(e))
            print('No data were altered due to error.\n')
        else:
            session.commit()
            print(f'\t{record_deletes:,} old records were deleted.')
            print(f'\t{institutions.shape[0]:,} new records were inserted.\n')
        finally:
            session.close()
            session = None

    print('All done!')