def impute_data(df, cohort):
    """Impute missing student-level data for one cohort.

    Parameters:
        df: a pandas DataFrame, or a path string readable by ml.read_data.
        cohort: cohort label; only used to build the output file name.

    Returns the imputed DataFrame; also writes it to the cohort CSV as a
    side effect.
    """
    if isinstance(df, str):
        df = ml.read_data(df)

    #########################
    ## IMPUTE MISSING DATA ##
    #########################
    print("Imputing missing data...")

    # Where the *_msam_nan flag is 1 the one-hot MSAM columns carry no
    # information, so reset them to NaN; they are imputed by row mean below.
    nanList = ['g6_g6msam_nan', 'g7_g7msam_nan', 'g8_g8msam_nan', 'g9_g8msam_nan']
    varList = [['g6_g6msam_Advanced', 'g6_g6msam_Basic', 'g6_g6msam_Proficient'],
               ['g7_g7msam_Advanced', 'g7_g7msam_Basic', 'g7_g7msam_Proficient'],
               ['g8_g8msam_Advanced', 'g8_g8msam_Basic', 'g8_g8msam_Proficient'],
               ['g9_g8msam_Advanced', 'g9_g8msam_Basic', 'g9_g8msam_Proficient']]
    for nacol, colList in zip(nanList, varList):
        for col in colList:
            df.loc[df[nacol] == 1, col] = np.nan

    # Impute each family of related per-grade columns with the student's
    # own across-grade row mean.
    wordList = ['absrate', 'mapr', 'msam_Advanced', 'msam_Basic', 'msam_Proficient',
                'mobility', 'nsusp', 'mpa', 'tardyr', 'psatm', 'psatv', 'retained']
    for word in wordList:
        colList = [col for col in df.columns if word in col]
        rowMean = df[colList].mean(axis=1)
        for col in colList:
            # BUG FIX: fillna(inplace=True) on the df.loc[:, col] slice can
            # operate on a copy and silently change nothing; assign the
            # filled Series back to the frame instead.
            df[col] = df[col].fillna(rowMean)

    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data_cohort' + str(cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    # Removed the leftover `import IPython; IPython.embed()` debug hook,
    # which dropped every caller into an interactive shell before returning.
    print("Done!")
    return df
def deal_with_dummies(df, cohort):
    """One-hot encode every string-typed column of the cohort frame.

    Accepts a DataFrame or a path string; school-id columns are cast to
    strings first so they are encoded as categories. The encoded frame is
    written to the cohort CSV and returned.
    """
    if isinstance(df, str):
        df = ml.read_data(df)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print("Creating dummy variables...")

    # School ids are categorical codes, not quantities: force them to
    # object dtype so the selection below picks them up.
    school_ids = [c for c in df.columns if 'school_id' in c]
    df[school_ids] = df.loc[:, school_ids].astype(str, copy=False)

    string_cols = list(df.select_dtypes(include=['object']))

    dummies = pd.get_dummies(df[string_cols], dummy_na=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop(string_cols, axis=1, inplace=True)

    ## Save clean version
    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort' + str(cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    return df
# Beispiel #3
# 0
def choose_rows(df, grade):
    """Return the students still enrolled and observed at `grade`.

    Students flagged as dropouts in any earlier grade are removed
    cumulatively; students missing the prior year's record are then
    excluded. Grades other than 9-12 fall through and return None.
    """
    #Find rows to use
    print("Choosing rows...")

    # Cumulatively remove dropouts, grade by grade.
    data9 = df
    for dropout_col in ('g6_dropout', 'g7_dropout', 'g8_dropout'):
        data9 = data9[data9[dropout_col] != 1]
    data10 = data9[data9['g9_dropout'] != 1]
    data11 = data10[data10['g10_dropout'] != 1]
    data12 = data11[data11['g11_dropout'] != 1]

    if grade == 9:
        return data9[data9['g8_missing'] != 1]
    if grade == 10:
        return data10[data10['g9_missing'] != 1]
    if grade == 11:
        return data11[data11['g10_missing'] != 1]
    if grade == 12:
        data12 = data12[data12['g11_missing'] != 1]
        ml.print_to_csv(
            df,
            '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/subset_test.csv'
        )
        return data12
# Beispiel #4
# 0
def deal_with_dummies(df, cohort):
    """Replace every string-typed column with one-hot dummy columns.

    `df` may be a DataFrame or a path string for ml.read_data. The encoded
    frame is saved to the cohort's clean-data CSV and returned.
    """
    if isinstance(df, str):
        df = ml.read_data(df)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print("Creating dummy variables...")

    # Treat school ids as categories: cast to object dtype so they are
    # included in the string-column selection below.
    id_cols = [c for c in df.columns if 'school_id' in c]
    df[id_cols] = df.loc[:, id_cols].astype(str, copy=False)

    object_cols = list(df.select_dtypes(include=['object']))

    encoded = pd.get_dummies(df[object_cols], dummy_na=True)
    df = pd.concat([df, encoded], axis=1)
    df.drop(object_cols, axis=1, inplace=True)

    ## Save clean version
    out_path = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/clean_data_cohort' + str(cohort) + '.csv'
    ml.print_to_csv(df, out_path)

    return df
# Beispiel #5
# 0
def deal_with_dummies(dataset):
    """Read `dataset`, one-hot encode its string columns, and save it.

    `dataset` is a path understood by ml.read_data; the result is written
    to data/clean_data.csv. Nothing is returned.
    """
    df = ml.read_data(dataset)

    ###################################
    ## CREATE DUMMY VARIABLE COLUMNS ##
    ###################################
    print("Creating dummy variables...")

    text_columns = list(df.select_dtypes(include=['object']))
    print(text_columns)

    # ml.get_dummys appends the encoded columns; the original string
    # columns are then dropped one by one, printing each for progress.
    df = ml.get_dummys(df, text_columns, dummy_na=True)
    for column in text_columns:
        print(column)
        df.drop(column, axis=1, inplace=True)

    ## Save clean version
    ml.print_to_csv(df, 'data/clean_data.csv')
def run_classifiers(csv_file, y):
    """Build and compare classifiers on the prepared credit data.

    Parameters:
        csv_file: path to the prepared CSV (read via ml.read_data).
        y: name of the outcome column in that CSV.

    Writes the comparison table to compare_classifiers.csv; no return value.
    """
    ## LOAD PREPARED DATA
    df = ml.read_data(csv_file)

    ################################
    # Build & Evaluate Classifiers #
    ################################
    print("Evaluating classifiers...")

    ## USE TOP FEATURES TO COMPARE CLASSIFIER PERFORMACE
    features = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
                'MonthlyIncome', 'age', 'NumberOfTimes90DaysLate',
                'NumberOfOpenCreditLinesAndLoans']

    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and
    # removed in 1.0; .values is the equivalent that works in both eras.
    X = df[features].values
    y = df[y].values

    ml.print_to_csv(ml.build_classifiers(X, y), 'compare_classifiers.csv')
def choose_rows(df, grade):
    """Select students still enrolled and observed at the requested grade.

    Dropout flags from grade 6 upward remove students cumulatively; the
    requested grade then also requires the prior year's record to be
    present. Grades outside 9-12 return None.
    """
    #Find rows to use
    print("Choosing rows...")

    surviving = df[df['g6_dropout'] != 1]
    surviving = surviving[surviving['g7_dropout'] != 1]

    # frames[g] holds the students eligible to be modeled at grade g.
    frames = {}
    frames[9] = surviving[surviving['g8_dropout'] != 1]
    frames[10] = frames[9][frames[9]['g9_dropout'] != 1]
    frames[11] = frames[10][frames[10]['g10_dropout'] != 1]
    frames[12] = frames[11][frames[11]['g11_dropout'] != 1]

    if grade in frames:
        prior_missing = 'g' + str(grade - 1) + '_missing'
        subset = frames[grade]
        subset = subset[subset[prior_missing] != 1]
        if grade == 12:
            ml.print_to_csv(df, '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/subset_test.csv')
        return subset
def clean_data(df, cohort):
    """Drop unused columns, consolidate per-grade columns, and flag missing years.

    Parameters:
        df: the merged student DataFrame (mutated in place where possible).
        cohort: 1 or 2 -- selects which column set to drop.

    Returns the cleaned DataFrame; also writes it to the cohort CSV.
    """
    print("Cleaning data...")

    ################################
    ## DROP UNNECESSARY VARIABLES ##
    ################################

    print("Dropping unnecessary variables...")

    if cohort == 1:
        print("for cohort 1...")
        variables_to_drop = ['g6_tardyr','g6_school_name', 'g7_school_name', 'g8_school_name', 'g9_school_name', 'g10_school_name', 'g11_school_name', 'g12_school_name','g6_year', 'g6_gradeexp', 'g6_grade', 'g6_wcode', 'g7_year', 'g7_gradeexp', 'g7_grade', 'g7_wcode', 'g8_year', 'g8_gradeexp', 'g8_grade', 'g8_wcode', 'g9_year', 'g9_gradeexp', 'g9_grade', 'g9_wcode', 'g10_year', 'g10_gradeexp', 'g10_grade', 'g10_wcode', 'g11_year', 'g11_gradeexp', 'g11_grade', 'g11_wcode', 'g12_year', 'g12_gradeexp', 'g12_grade', 'g12_wcode']
        df.drop(variables_to_drop, axis=1, inplace=True)

    elif cohort == 2:
        print("for cohort 2...")
        variables_to_drop = ['g6_school_name', 'g7_school_name', 'g8_school_name', 'g9_school_name', 'g10_school_name', 'g11_school_name', 'g12_school_name','g6_year', 'g6_grade', 'g6_wcode', 'g7_year', 'g7_grade', 'g7_wcode', 'g8_year', 'g8_grade', 'g8_wcode', 'g9_year', 'g9_grade', 'g9_wcode', 'g10_year', 'g10_grade', 'g10_wcode', 'g11_year', 'g11_grade', 'g11_wcode', 'g12_year', 'g12_grade', 'g12_wcode']
        df.drop(variables_to_drop, axis=1, inplace=True)

    #######################
    ## COMBINE VARIABLES ##
    #######################

    ## Create single column for birth year
    print("Correcting birthdays...")

    df['birthday'] = df['g11_byrmm']
    # NOTE(review): 'g11_byrmm' is not in this list, so it is never dropped,
    # unlike the gender block below which drops its g11 column -- confirm
    # which behavior is intended.
    birthday_cols = ['g12_byrmm', 'g10_byrmm', 'g9_byrmm', 'g8_byrmm', 'g7_byrmm', 'g6_byrmm']
    for c in birthday_cols:
        ml.replace_if_missing(df, 'birthday', c)
        df.drop(c, axis=1, inplace=True)

    # BUG FIX: the original applied [:4] / [4:] to the Series itself, which
    # slices *rows* (only the first four rows got a birth_year). .str[:4]
    # slices the characters of each value instead.
    # Assumes birthday stringifies as YYYYMM-prefixed text -- TODO confirm
    # against the raw byrmm columns (a float dtype would yield e.g. '199805.0').
    df['birth_year'] = df.loc[:,'birthday'].astype(str, copy=False).str[:4]
    df['birth_month'] = df.loc[:,'birthday'].astype(str, copy=False).str[4:]
    df.drop('birthday', axis=1, inplace=True)

    ## Create single column for gender
    print("Correcting gender...")

    df['gender'] = df['g11_gender']
    gender_cols = ['g12_gender', 'g11_gender', 'g10_gender', 'g9_gender', 'g8_gender', 'g7_gender', 'g6_gender']
    for c in gender_cols:
        ml.replace_if_missing(df, 'gender', c)
        df.drop(c, axis=1, inplace=True)

    ################
    ## CLEAN DATA ##
    ################

    print("Cleaning data...")
    # These columns are presence/absence markers: convert to booleans
    # (True where any value was recorded).
    retained_cols = ['g11_retained', 'g12_retained', 'g9_newmcps', 'g10_newmcps', 'g11_newmcps', 'g12_newmcps', 'g9_newus', 'g10_newus', 'g11_newus', 'g12_newus']

    for col in retained_cols:
        df[col] = df[col].notnull()

    ###############################
    ## CREATE MISSING DATA FLAGS ##
    ###############################

    print("Creating missing data flags...")

    ## Create flag if a given student is missing a year's worth of data
    grade_id = ['g6_pid', 'g7_pid', 'g8_pid', 'g9_pid', 'g10_pid', 'g11_pid', 'g12_pid']
    year = 6
    for g in grade_id:
        col_name = 'g' + str(year) + '_missing'
        df[col_name] = df[g].isnull()
        df.drop(g, axis=1, inplace=True)
        year += 1

    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/predummy_data_cohort' + str(cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    return df
# Beispiel #9
# 0
def clean_data(df, cohort):
    """Drop unused columns, consolidate per-grade columns, and flag missing years.

    Parameters:
        df: the merged student DataFrame (mutated in place).
        cohort: 'cohort1' or 'cohort2' -- selects which column set to drop;
            any other value drops nothing.

    Writes the result to data/predummy_data.csv; no return value.
    """
    print("Cleaning data...")

    ################################
    ## DROP UNNECESSARY VARIABLES ##
    ################################

    print("Dropping unnecessary variables...")

    if cohort == 'cohort1':
        print("for cohort 1...")
        variables_to_drop = ['g6_tardyr','g6_school_name', 'g7_school_name', 'g8_school_name', 'g9_school_name', 'g10_school_name', 'g11_school_name', 'g12_school_name','g6_year', 'g6_gradeexp', 'g6_grade', 'g6_wcode', 'g7_year', 'g7_gradeexp', 'g7_grade', 'g7_wcode', 'g8_year', 'g8_gradeexp', 'g8_grade', 'g8_wcode', 'g9_year', 'g9_gradeexp', 'g9_grade', 'g9_wcode', 'g10_year', 'g10_gradeexp', 'g10_grade', 'g10_wcode', 'g11_year', 'g11_gradeexp', 'g11_grade', 'g11_wcode', 'g12_year', 'g12_gradeexp', 'g12_grade', 'g12_wcode']
        df.drop(variables_to_drop, axis=1, inplace=True)

    elif cohort == 'cohort2':
        print("for cohort 2...")
        variables_to_drop = ['g6_tardyr','g6_school_name', 'g7_school_name', 'g8_school_name', 'g9_school_name', 'g10_school_name', 'g11_school_name', 'g12_school_name','g6_year', 'g6_grade', 'g6_wcode', 'g7_year', 'g7_grade', 'g7_wcode', 'g8_year', 'g8_grade', 'g8_wcode', 'g9_year', 'g9_grade', 'g9_wcode', 'g10_year', 'g10_grade', 'g10_wcode', 'g11_year', 'g11_grade', 'g11_wcode', 'g12_year', 'g12_grade', 'g12_wcode']
        df.drop(variables_to_drop, axis=1, inplace=True)

    #######################
    ## COMBINE VARIABLES ##
    #######################

    ## Create single column for birth year
    print("Correcting birthdays...")

    df['birthday'] = df['g11_byrmm']
    birthday_cols = ['g12_byrmm', 'g11_byrmm', 'g10_byrmm', 'g9_byrmm', 'g8_byrmm', 'g7_byrmm', 'g6_byrmm']
    for c in birthday_cols:
        ml.replace_with_other_col(df, 'birthday', c)
        df.drop(c, axis=1, inplace=True)
    #print ml.summarize(df['birthday'])

    ## Create single column for gender
    print("Correcting gender...")

    df['gender'] = df['g11_gender']
    gender_cols = ['g12_gender', 'g11_gender', 'g10_gender', 'g9_gender', 'g8_gender', 'g7_gender', 'g6_gender']
    for c in gender_cols:
        ml.replace_with_other_col(df, 'gender', c)
        df.drop(c, axis=1, inplace=True)
    #print df['gender'].value_counts()

    ################
    ## CLEAN DATA ##
    ################

    print("Cleaning data...")
    retained_cols = ['g11_retained', 'g12_retained', 'g9_newmcps', 'g10_newmcps', 'g11_newmcps', 'g12_newmcps', 'g9_newus', 'g10_newus', 'g11_newus', 'g12_newus']
    for col in retained_cols:
        # BUG FIX / MODERNIZATION: the original iterated every row and
        # wrote through df.ix, which was removed from pandas; this
        # vectorized form produces the same 0/1 ints (1 = value present).
        df[col] = df[col].notnull().astype(int)

    ###############################
    ## CREATE MISSING DATA FLAGS ##
    ###############################

    print("Creating missing data flags...")

    ## Create flag if a given student is missing a year's worth of data
    grade_id = ['g6_pid', 'g7_pid', 'g8_pid', 'g9_pid', 'g10_pid', 'g11_pid', 'g12_pid']
    year = 6
    for g in grade_id:
        col_name = 'g' + str(year) + '_missing'
        # Vectorized replacement for the per-row df.ix writes; the flag is
        # now int (0/1) rather than the float the row loop produced.
        df[col_name] = df[g].isnull().astype(int)
        df.drop(g, axis=1, inplace=True)
        year += 1

    ml.print_to_csv(df, 'data/predummy_data.csv')
# Beispiel #10
# 0
def impute_data(dataset, cohort):
    """Impute missing values across academic, behavioral, enrollment,
    dropout, and neighborhood columns, then write data/imputed_data.csv.

    dataset: path understood by ml.read_data.
    cohort: 'cohort1' or 'cohort2' -- selects which g7 MSAM dummy columns
        are imputed (the two cohorts have different g7 column layouts).

    Nothing is returned; the result is written to disk as a side effect.
    """

    df = ml.read_data(dataset)

    ##########################
    ## IMPUTE ACADEMIC DATA ##
    ##########################

    print "Impute missing academic information..."

    ## Fill missing school data -- use mean imputation for now
    school_vars = ['g6_school_id', 'g7_school_id', 'g8_school_id', 'g9_school_id', 'g10_school_id', 'g11_school_id', 'g12_school_id']
    ml.replace_with_mean(df, school_vars)

    ## Fill missing grade and test score information -- use mean imputation for now
    grades_tests = ['g6_q1mpa', 'g6_q2mpa', 'g6_q3mpa', 'g6_q4mpa', 'g6_g6mapr','g7_q1mpa', 'g7_q2mpa', 'g7_q3mpa', 'g7_q4mpa', 'g7_g7mapr', 'g8_q1mpa', 'g8_q2mpa', 'g8_q3mpa', 'g8_q4mpa', 'g8_g8mapr', 'g9_q1mpa', 'g9_q2mpa', 'g9_q3mpa', 'g9_q4mpa', 'g9_g8mapr', 'g10_q1mpa', 'g10_q2mpa', 'g10_q3mpa', 'g10_q4mpa', 'g10_psatv', 'g10_psatm', 'g11_q1mpa', 'g11_q2mpa', 'g11_q3mpa', 'g11_q4mpa', 'g11_psatv', 'g11_psatm', 'g12_q1mpa', 'g12_q2mpa', 'g12_q3mpa', 'g12_q4mpa', 'g12_psatv', 'g12_psatm']
    ml.replace_with_mean(df, grades_tests)

    ## Fill in missing id with dummy
    # NOTE(review): scalar args here, whereas the dropout call below passes
    # lists -- confirm ml.replace_with_value accepts both shapes.
    ml.replace_with_value(df, 'id', 0)

    ## Fill missing MSAM data
    # For each grade, replace the one-hot MSAM columns' nulls via the
    # *_msam_nan indicator (semantics defined in ml.replace_dummy_null_mean).
    g6_msam = ['g6_g6msam_Advanced','g6_g6msam_Basic','g6_g6msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g6_g6msam_nan', g6_msam)

    # g7 MSAM columns differ by cohort: cohort2's are numeric-coded.
    if cohort == 'cohort1':
        g7_msam = ['g7_g7msam_Advanced','g7_g7msam_Basic','g7_g7msam_Proficient']
        ml.replace_dummy_null_mean(df, 'g7_g7msam_nan', g7_msam)
    elif cohort == 'cohort2':
        g7_msam = ['g7_g7msam_ ','g7_g7msam_1','g7_g7msam_2', 'g7_g7msam_3']
        ml.replace_dummy_null_mean(df, 'g7_g7msam_nan', g7_msam)

    g8_msam = ['g8_g8msam_Advanced','g8_g8msam_Basic','g8_g8msam_Proficient']
    ml.replace_dummy_null_mean(df, 'g8_g8msam_nan', g8_msam)

    g9_msam = ['g9_g8msam_Advanced','g9_g8msam_Basic','g9_g8msam_Proficient']
    ml.replace_dummy_null_mean(df,'g9_g8msam_nan', g9_msam)


    ############################
    ## IMPUTE BEHAVIORAL DATA ##
    ############################

    print "Impute missing behavioral data..."

    ## Fill missing behavioral data -- use mean imputation for now
    behavioral_cols = ['g6_absrate', 'g6_nsusp','g7_absrate', 'g7_tardyr', 'g7_nsusp', 'g8_absrate', 'g8_tardyr', 'g8_nsusp', 'g9_absrate', 'g9_nsusp', 'g10_absrate', 'g10_nsusp', 'g11_absrate', 'g11_nsusp','g12_absrate', 'g12_nsusp']
    ml.replace_with_mean(df, behavioral_cols)

    ## Fill in missing birthday data
    #ml.replace_with_mean(df, 'birthday')

    ############################
    ## IMPUTE ENROLLMENT DATA ##
    ############################

    print "Imputing missing enrollment data..."

    ## Fill missing enrollment data
    print "Fixing mobility columns..."
    mobility_cols = ['g10_retained', 'g6_mobility', 'g7_mobility', 'g8_mobility', 'g9_mobility', 'g9_retained','g10_mobility', 'g11_mobility', 'g12_mobility', 'birthday']
    # Includes g10_retained because it's coded as 0/1 already
    ml.replace_with_mean(df, mobility_cols)


    #########################
    ## IMPUTE DROPOUT DATA ##
    #########################

    print "Impute missing droput information..."

    ## Fill missing dropout information with 0
    dropout_vars = ['g6_dropout', 'g7_dropout', 'g8_dropout', 'g9_dropout', 'g10_dropout', 'g11_dropout', 'g12_dropout', 'dropout']
    ml.replace_with_value(df, dropout_vars, [0,0,0,0,0,0,0,0])

    #variables = list(df.columns.values)
    #print variables



    ############################
    # IMPUTE NEIGHBORHOOD DATA #
    ############################

    print "Imputing missing school neighborhood data..."

    ## Fill missing school neighborhood data
    print "Fixing neighborhood columns..."
    # The triple-quoted list below is the superseded un-prefixed column set,
    # kept (as a no-op string literal) for reference.
    """
    neighborhood_cols = ['suspensionrate',  'mobilityrateentrantswithdra',  'attendancerate',   'avg_class_size',   'studentinstructionalstaffratio',   'dropoutrate',  'grade12documenteddecisionco',  'grade12documenteddecisionem',  'grade12documenteddecisionmi',  'grad12docdec_col_emp', 'graduationrate',   'studentsmeetinguniversitysyste',   'Est_Households_2012',  'Est_Population_2012',  'Med_Household_Income_2012',    'Mean_Household_Income_2012',   'Pop_Below_Poverty_2012',   'Percent_Below_Poverty_2012',   'Pop_Under18_2012', 'Under18_Below_Poverty_2012',   'Under18_Below_Poverty_Percent_2012',   'Housholds_on_Food_stamps_with_Children_Under18_2012',  'Housholds_Pop_on_Food_Stamps_2012',    'Pop_BlackAA_2012', 'Pop_White_2012',   'Bt_18_24_percent_less_than_High_School_2012',  'Bt_18_24_percent_High_School_2012',    'Bt_18_24_percent_Some_College_or_AA_2012', 'Bt_1824_percent_BA_or_Higher_2012',    'Over_25_percent_less_than_9th_grade_2012', 'Over_25_percent_9th_12th_2012',    'Over_25_percent_High_School_2012', 'Over_25__percent_Some_College_No_Deg_2012',    'Over_25_percent_AA_2012',  'Over_25_percent_Bachelors_2012',   'Over_25_percent_Graduate_or_Professionals_2012']
    """

    # Per-grade (g9-g12) school/neighborhood statistics, mean-imputed.
    neighborhood_cols = ['g9_suspensionrate', 'g10_suspensionrate', 'g11_suspensionrate', 'g12_suspensionrate', 'g9_mobilityrateentrantswithdra', 'g10_mobilityrateentrantswithdra', 'g11_mobilityrateentrantswithdra', 'g12_mobilityrateentrantswithdra', 'g9_attendancerate', 'g10_attendancerate', 'g11_attendancerate', 'g12_attendancerate','g9_avg_class_size', 'g10_avg_class_size', 'g11_avg_class_size', 'g12_avg_class_size','g9_studentinstructionalstaffratio', 'g10_studentinstructionalstaffratio', 'g11_studentinstructionalstaffratio', 'g12_studentinstructionalstaffratio','g9_dropoutrate', 'g10_dropoutrate', 'g11_dropoutrate', 'g12_dropoutrate', 'g9_grade12documenteddecisionco', 'g10_grade12documenteddecisionco', 'g11_grade12documenteddecisionco', 'g12_grade12documenteddecisionco','g9_grade12documenteddecisionem', 'g10_grade12documenteddecisionem', 'g11_grade12documenteddecisionem', 'g12_grade12documenteddecisionem','g9_grade12documenteddecisionmi', 'g10_grade12documenteddecisionmi', 'g11_grade12documenteddecisionmi', 'g12_grade12documenteddecisionmi', 'g9_grad12docdec_col_emp', 'g10_grad12docdec_col_emp', 'g11_grad12docdec_col_emp', 'g12_grad12docdec_col_emp', 'g9_graduationrate', 'g10_graduationrate', 'g11_graduationrate', 'g12_graduationrate','g9_studentsmeetinguniversitysyste', 'g10_studentsmeetinguniversitysyste', 'g11_studentsmeetinguniversitysyste', 'g12_studentsmeetinguniversitysyste', 'g9_Est_Households_2012', 'g10_Est_Households_2012', 'g11_Est_Households_2012', 'g12_Est_Households_2012','g9_Est_Population_2012', 'g10_Est_Population_2012', 'g11_Est_Population_2012', 'g12_Est_Population_2012', 'g9_Med_Household_Income_2012', 'g10_Med_Household_Income_2012', 'g11_Med_Household_Income_2012', 'g12_Med_Household_Income_2012', 'g9_Mean_Household_Income_2012', 'g10_Mean_Household_Income_2012', 'g11_Mean_Household_Income_2012', 'g12_Mean_Household_Income_2012', 'g9_Pop_Below_Poverty_2012', 'g10_Pop_Below_Poverty_2012', 'g11_Pop_Below_Poverty_2012', 
'g12_Pop_Below_Poverty_2012', 'g9_Percent_Below_Poverty_2012', 'g10_Percent_Below_Poverty_2012', 'g11_Percent_Below_Poverty_2012', 'g12_Percent_Below_Poverty_2012', 'g9_Pop_Under18_2012', 'g10_Pop_Under18_2012', 'g11_Pop_Under18_2012', 'g12_Pop_Under18_2012', 'g9_Under18_Below_Poverty_2012', 'g10_Under18_Below_Poverty_2012', 'g11_Under18_Below_Poverty_2012', 'g12_Under18_Below_Poverty_2012', 'g9_Under18_Below_Poverty_Percent_2012', 'g10_Under18_Below_Poverty_Percent_2012', 'g11_Under18_Below_Poverty_Percent_2012', 'g12_Under18_Below_Poverty_Percent_2012', 'g9_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g10_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g11_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g12_Housholds_on_Food_stamps_with_Children_Under18_2012', 'g9_Housholds_Pop_on_Food_Stamps_2012', 'g10_Housholds_Pop_on_Food_Stamps_2012', 'g11_Housholds_Pop_on_Food_Stamps_2012', 'g12_Housholds_Pop_on_Food_Stamps_2012', 'g9_Pop_BlackAA_2012', 'g10_Pop_BlackAA_2012', 'g11_Pop_BlackAA_2012', 'g12_Pop_BlackAA_2012', 'g9_Pop_White_2012', 'g10_Pop_White_2012', 'g11_Pop_White_2012', 'g12_Pop_White_2012', 'g9_Bt_18_24_percent_less_than_High_School_2012', 'g10_Bt_18_24_percent_less_than_High_School_2012', 'g11_Bt_18_24_percent_less_than_High_School_2012', 'g12_Bt_18_24_percent_less_than_High_School_2012', 'g9_Bt_18_24_percent_High_School_2012', 'g10_Bt_18_24_percent_High_School_2012', 'g11_Bt_18_24_percent_High_School_2012', 'g12_Bt_18_24_percent_High_School_2012', 'g9_Bt_18_24_percent_Some_College_or_AA_2012', 'g10_Bt_18_24_percent_Some_College_or_AA_2012', 'g11_Bt_18_24_percent_Some_College_or_AA_2012', 'g12_Bt_18_24_percent_Some_College_or_AA_2012', 'g9_Bt_1824_percent_BA_or_Higher_2012', 'g10_Bt_1824_percent_BA_or_Higher_2012', 'g11_Bt_1824_percent_BA_or_Higher_2012', 'g12_Bt_1824_percent_BA_or_Higher_2012', 'g9_Over_25_percent_less_than_9th_grade_2012', 'g10_Over_25_percent_less_than_9th_grade_2012', 
'g11_Over_25_percent_less_than_9th_grade_2012', 'g12_Over_25_percent_less_than_9th_grade_2012', 'g9_Over_25_percent_9th_12th_2012', 'g10_Over_25_percent_9th_12th_2012', 'g11_Over_25_percent_9th_12th_2012', 'g12_Over_25_percent_9th_12th_2012', 'g9_Over_25_percent_High_School_2012', 'g10_Over_25_percent_High_School_2012', 'g11_Over_25_percent_High_School_2012', 'g12_Over_25_percent_High_School_2012', 'g9_Over_25__percent_Some_College_No_Deg_2012', 'g10_Over_25__percent_Some_College_No_Deg_2012', 'g11_Over_25__percent_Some_College_No_Deg_2012', 'g12_Over_25__percent_Some_College_No_Deg_2012', 'g9_Over_25_percent_AA_2012', 'g10_Over_25_percent_AA_2012', 'g11_Over_25_percent_AA_2012', 'g12_Over_25_percent_AA_2012', 'g9_Over_25_percent_Bachelors_2012', 'g10_Over_25_percent_Bachelors_2012', 'g11_Over_25_percent_Bachelors_2012', 'g12_Over_25_percent_Bachelors_2012', 'g9_Over_25_percent_Graduate_or_Professionals_2012', 'g10_Over_25_percent_Graduate_or_Professionals_2012', 'g11_Over_25_percent_Graduate_or_Professionals_2012', 'g12_Over_25_percent_Graduate_or_Professionals_2012']
    ml.replace_with_mean(df, neighborhood_cols)


    summary = ml.summarize(df)
    print summary.T
    #ml.print_to_csv(summary.T, 'updated_summary_stats_vertical.csv')

    ml.print_to_csv(df, 'data/imputed_data.csv')
    #ml.print_to_csv(df, '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data.csv')
    print "Done!"
# Beispiel #11
# 0
def clean_data(df, cohort):
    """Drop unused columns, consolidate per-grade columns, and flag missing years.

    Parameters:
        df: the merged student DataFrame (mutated in place where possible).
        cohort: 1 or 2 -- selects which column set to drop.

    Returns the cleaned DataFrame; also writes it to the cohort CSV.
    """
    print("Cleaning data...")

    ################################
    ## DROP UNNECESSARY VARIABLES ##
    ################################

    print("Dropping unnecessary variables...")

    if cohort == 1:
        print("for cohort 1...")
        variables_to_drop = [
            'g6_tardyr', 'g6_school_name', 'g7_school_name', 'g8_school_name',
            'g9_school_name', 'g10_school_name', 'g11_school_name',
            'g12_school_name', 'g6_year', 'g6_gradeexp', 'g6_grade',
            'g6_wcode', 'g7_year', 'g7_gradeexp', 'g7_grade', 'g7_wcode',
            'g8_year', 'g8_gradeexp', 'g8_grade', 'g8_wcode', 'g9_year',
            'g9_gradeexp', 'g9_grade', 'g9_wcode', 'g10_year', 'g10_gradeexp',
            'g10_grade', 'g10_wcode', 'g11_year', 'g11_gradeexp', 'g11_grade',
            'g11_wcode', 'g12_year', 'g12_gradeexp', 'g12_grade', 'g12_wcode'
        ]
        df.drop(variables_to_drop, axis=1, inplace=True)

    elif cohort == 2:
        print("for cohort 2...")
        variables_to_drop = [
            'g6_school_name', 'g7_school_name', 'g8_school_name',
            'g9_school_name', 'g10_school_name', 'g11_school_name',
            'g12_school_name', 'g6_year', 'g6_grade', 'g6_wcode', 'g7_year',
            'g7_grade', 'g7_wcode', 'g8_year', 'g8_grade', 'g8_wcode',
            'g9_year', 'g9_grade', 'g9_wcode', 'g10_year', 'g10_grade',
            'g10_wcode', 'g11_year', 'g11_grade', 'g11_wcode', 'g12_year',
            'g12_grade', 'g12_wcode'
        ]
        df.drop(variables_to_drop, axis=1, inplace=True)

    #######################
    ## COMBINE VARIABLES ##
    #######################

    ## Create single column for birth year
    print("Correcting birthdays...")

    df['birthday'] = df['g11_byrmm']
    # NOTE(review): 'g11_byrmm' is absent from this list and is therefore
    # never dropped, unlike the gender block below -- confirm intent.
    birthday_cols = [
        'g12_byrmm', 'g10_byrmm', 'g9_byrmm', 'g8_byrmm', 'g7_byrmm',
        'g6_byrmm'
    ]
    for c in birthday_cols:
        ml.replace_if_missing(df, 'birthday', c)
        df.drop(c, axis=1, inplace=True)

    # BUG FIX: the original applied [:4] / [4:] to the Series itself,
    # slicing *rows* instead of characters, so birth_year/birth_month were
    # populated for only a handful of rows. .str slices each value.
    # Assumes birthday stringifies as YYYYMM-prefixed text -- TODO confirm.
    df['birth_year'] = df.loc[:, 'birthday'].astype(str, copy=False).str[:4]
    df['birth_month'] = df.loc[:, 'birthday'].astype(str, copy=False).str[4:]
    df.drop('birthday', axis=1, inplace=True)

    ## Create single column for gender
    print("Correcting gender...")

    df['gender'] = df['g11_gender']
    gender_cols = [
        'g12_gender', 'g11_gender', 'g10_gender', 'g9_gender', 'g8_gender',
        'g7_gender', 'g6_gender'
    ]
    for c in gender_cols:
        ml.replace_if_missing(df, 'gender', c)
        df.drop(c, axis=1, inplace=True)

    ################
    ## CLEAN DATA ##
    ################

    print("Cleaning data...")
    # Presence/absence markers: convert to booleans (True = value present).
    retained_cols = [
        'g11_retained', 'g12_retained', 'g9_newmcps', 'g10_newmcps',
        'g11_newmcps', 'g12_newmcps', 'g9_newus', 'g10_newus', 'g11_newus',
        'g12_newus'
    ]

    for col in retained_cols:
        df[col] = df[col].notnull()

    ###############################
    ## CREATE MISSING DATA FLAGS ##
    ###############################

    print("Creating missing data flags...")

    ## Create flag if a given student is missing a year's worth of data
    grade_id = [
        'g6_pid', 'g7_pid', 'g8_pid', 'g9_pid', 'g10_pid', 'g11_pid', 'g12_pid'
    ]
    year = 6
    for g in grade_id:
        col_name = 'g' + str(year) + '_missing'
        df[col_name] = df[g].isnull()
        df.drop(g, axis=1, inplace=True)
        year += 1

    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/predummy_data_cohort' + str(
        cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    return df
# Beispiel #12
# 0
def impute_data(df, cohort):
    """Impute missing student-level data for one cohort.

    Parameters:
        df: a pandas DataFrame, or a path string readable by ml.read_data.
        cohort: cohort label; only used to build the output file name.

    Returns the imputed DataFrame; also writes it to the cohort CSV.
    """
    if isinstance(df, str):
        df = ml.read_data(df)

    #########################
    ## IMPUTE MISSING DATA ##
    #########################
    print("Imputing missing data...")

    # Where the *_msam_nan flag is 1 the one-hot MSAM columns carry no
    # information: reset them to NaN so the row-mean imputation below
    # fills them.
    nanList = [
        'g6_g6msam_nan', 'g7_g7msam_nan', 'g8_g8msam_nan', 'g9_g8msam_nan'
    ]
    varList = [
        ['g6_g6msam_Advanced', 'g6_g6msam_Basic', 'g6_g6msam_Proficient'],
        ['g7_g7msam_Advanced', 'g7_g7msam_Basic', 'g7_g7msam_Proficient'],
        ['g8_g8msam_Advanced', 'g8_g8msam_Basic', 'g8_g8msam_Proficient'],
        ['g9_g8msam_Advanced', 'g9_g8msam_Basic', 'g9_g8msam_Proficient']
    ]
    for nacol, group in zip(nanList, varList):
        for col in group:
            df.loc[df[nacol] == 1, col] = np.nan

    # Impute each family of per-grade columns with the student's own
    # across-grade row mean.
    wordList = [
        'absrate', 'mapr', 'msam_Advanced', 'msam_Basic', 'msam_Proficient',
        'mobility', 'nsusp', 'mpa', 'tardyr', 'psatm', 'psatv', 'retained'
    ]
    for word in wordList:
        colList = [col for col in df.columns if word in col]
        rowMean = df[colList].mean(axis=1)
        for col in colList:
            # BUG FIX: fillna(inplace=True) on df.loc[:, col] can operate
            # on a copy and silently change nothing; assign back instead.
            df[col] = df[col].fillna(rowMean)

    return_file = '/mnt/data2/education_data/mcps/DATA_DO_NOT_UPLOAD/imputed_data_cohort' + str(
        cohort) + '.csv'
    ml.print_to_csv(df, return_file)

    # Removed the leftover `import IPython; IPython.embed()` debug hook,
    # which dropped every caller into an interactive shell before returning.
    print("Done!")
    return df
def prepare_data(dataset):
    """Load the credit data, impute missing values, engineer features,
    and write the cleaned CSV.

    Returns (file_name, y): the cleaned CSV's name and the outcome column.
    """
    #######################################################
    # Load Credit Data and Run Initial Summary Statistics #
    #######################################################
    print("Loading data...")

    ## LOAD DATA
    df = ml.read_data(dataset)
    variables = list(df.columns.values)

    ## RUN INITIAL SUMMARY STATISTICS & GRAPH DISTRIBUTIONS
    summary = ml.summarize(df)
    #print_to_csv(summary, 'summary_stats.csv')

    for column in variables:
        ml.histogram(df, column)

    ## FOR FUTURE: Drop rows where 'percentage' fields have values > 1

    ############################
    # Deal with missing values #
    ############################
    print("Handling missing values...")

    print("Correcting dependents column...")
    # DEPENDENTS: missing values are likely zeros. If someone didn't
    # provide this info, they likely wouldn't have kids.
    ml.replace_with_value(df, ['NumberOfDependents'], [0])

    print("Correcting income column...")
    # MONTHLY INCOME: no single replacement value makes sense, so impute
    # nulls with the mean of income.
    ml.replace_with_mean(df, ['MonthlyIncome'])

    #ml.print_to_csv(df, 'credit-data-updated.csv')

    #####################
    # Generate Features #
    #####################
    print("Generating features...")

    ## FIND IMPORTANT FEATURES
    test_features = np.array(['RevolvingUtilizationOfUnsecuredLines', 'age',
                              'NumberOfTime30-59DaysPastDueNotWorse',
                              'DebtRatio', 'MonthlyIncome',
                              'NumberOfOpenCreditLinesAndLoans',
                              'NumberOfTimes90DaysLate',
                              'NumberRealEstateLoansOrLines',
                              'NumberOfTime60-89DaysPastDueNotWorse',
                              'NumberOfDependents'])
    y = 'SeriousDlqin2yrs'

    ## Find initial best features
    #print ml.find_features(df, test_features, y)

    ## ENGINEER ADDITIONAL FEATURES
    print("Engineering income buckets...")
    # MONTHLY INCOME: bucket it, capping outliers at 15000 first.
    df['MonthlyIncome_adjust'] = df.MonthlyIncome.apply(
        lambda income: ml.adjust_outliers(income, 15000))
    ml.bin_variable(df, 'MonthlyIncome_adjust', 15, False)
    #print pd.value_counts(df['MonthlyIncome_adjust_bins'])

    print("Engineering age buckets...")
    # AGE: 5-year bands from 20 to 75 with open-ended tails.
    age_bins = [-1] + list(range(20, 80, 5)) + [120]
    ml.bin_variable(df, 'age', age_bins, False)
    #print pd.value_counts(df['age_bins'])

    ## RECALCULATE IMPORTANT FEATURES
    new_features = np.array(['MonthlyIncome_adjust_bins', 'age_bins'])
    all_features = np.hstack((test_features, new_features))

    ## FIND BEST FEATURES
    #print ml.find_features(df, all_features, y)

    ### FOR FUTURE: It would be cool to be able to automatically point to the top
    ### five best features or focus on the features that meet a certain threshold.
    ### Then I could return that as well for the run_classifiers function.

    ## PRINT PREPARED DATA TO CSV
    file_name = "credit-data-clean.csv"
    ml.print_to_csv(df, file_name)

    return file_name, y