def replace_missingness(df):
    """Clean ``df``, KNN-impute the remaining gaps and write the result to CSV.

    Steps, in order:

    1. Negative values and the strings ``'Missing'`` / ``'Other'`` are
       replaced with NaN.  This is done by assignment on ``df`` itself, so
       the caller's frame is modified in place.
    2. Columns whose fraction of NaN values exceeds 75% are dropped.
    3. Remaining columns that do not have a numeric dtype are dropped.
    4. The surviving columns are imputed with ``KNN(k=109).complete`` and
       written to ``imputed_background_jobTraining.csv`` without the index.

    Progress messages are printed to stdout.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        None.  The imputed data is only written to disk.
    """
    # Every print below is a single-argument call so that it emits the same
    # text on Python 2 and also parses on Python 3.

    # replace all negative values with NaN
    print("Replacing missing values with NaN")
    df[df < 0] = np.nan

    # replace Missing or Other with NaN
    df[df == 'Missing'] = np.nan
    df[df == 'Other'] = np.nan

    # drop columns with over 75% missingness
    print("Dropping columns with over 75% missingness")
    theta = .75
    # The mean of the boolean null mask is the missing fraction as a float.
    # Dividing the null count by len() would be floored to 0 by Python 2
    # integer division, so the threshold would never drop a partly-filled
    # column.
    include_col = [col for col in df.columns
                   if theta >= df[col].isnull().mean()]
    dropped_col = len(df.columns) - len(include_col)
    df = df[include_col]
    print("Dropped  %s  columns" % dropped_col)

    # find and drop columns that do not contain numeric values
    print("Dropping columns that do not contain numeric values")
    df_subset = df.select_dtypes(exclude=[np.number])
    print("Columns dropped:  %s" % (df_subset.columns,))
    df = df.drop(df_subset.columns, axis=1)

    # using knn imputation
    print("Running Knn imputation")
    df_imputed_columns = df.columns
    k = 109
    df_imputed = KNN(k=k).complete(df)
    # Re-wrap the imputer output in a DataFrame and restore the column labels
    # of the frame that was passed in.
    df_imputed = pd.DataFrame(df_imputed)
    df_imputed.columns = df_imputed_columns

    # save result to imputed_background.csv
    df_imputed.to_csv('imputed_background_jobTraining.csv',
                      sep=',',
                      index=False)
Example #2
0
# Re-code 'cf4fint' as a categorical whose categories are renamed to the
# consecutive integers 1..N, where N is the number of distinct values.
background['cf4fint'] = background['cf4fint'].astype('category')
myrange = len(background['cf4fint'].value_counts())
background['cf4fint'] = background['cf4fint'].cat.rename_categories(
    list(range(1, myrange + 1)))

background['hv5_wj9ae'] = background['hv5_wj9ae'].astype('float')

# Keep only columns with at least 400 non-NaN values.  This also removes
# columns that are entirely NaN, so no separate how='all' pass is needed.
background = background.dropna(axis=1, thresh=400)

# Missing value imputation with KNN
background_filled_knn = KNN(k=100).complete(background)
# Pass the column labels as a flat list: wrapping them in another list makes
# pandas build a one-level MultiIndex whose labels are 1-tuples.
background_filled_knn = pd.DataFrame(background_filled_knn,
                                     columns=list(background))
background_filled_knn.to_csv('background_filled_knn.csv')

# Load the outcome data, keeping only rows where both challengeID and gpa
# are present.
train = pd.read_csv('train.csv')
train = train[['challengeID', 'gpa']]
train = train.dropna()
# NOTE(review): this concat pairs rows by index label, not by challengeID.
# background_filled_knn was rebuilt without an explicit index, while train
# keeps its read_csv index -- confirm the two line up row-for-row.
training = pd.concat([background_filled_knn, train], axis=1, join='inner')

#training.to_csv('concated_train_background.csv')

# getting features from rlasso with tuning
feature_set = []
from sklearn.linear_model import RandomizedLasso
params = [0.004, 0.0000004]
# Predictors are every column except the first and the last ('gpa').
# NOTE(review): the 'challengeID' column appended from train sits second to
# last and is therefore included as a predictor -- confirm this is intended.
predictors = training.iloc[:, 1:-1]
targets = training['gpa']
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors,