def replace_missingness(df, theta=0.75, k=109,
                        output_path='imputed_background_jobTraining.csv'):
    """Clean missing-value codes in *df*, KNN-impute the rest, save as CSV.

    Steps, in order:

    1. Replace every negative value and every 'Missing' / 'Other' entry
       with NaN.
    2. Drop columns whose fraction of NaN values is greater than ``theta``.
    3. Drop columns whose dtype is not numeric.
    4. Fill the remaining NaN values with ``KNN(k=k).complete(...)``.
    5. Write the imputed table to ``output_path`` (comma-separated, no
       index column).

    NOTE: step 1 assigns into ``df`` itself, so the caller's DataFrame is
    modified in place. The column drops afterwards only rebind the local
    name and do not affect the caller's object.

    NOTE(review): ``KNN`` is not defined in this function; it is presumably
    fancyimpute's ``KNN`` imported at the top of the file -- confirm.

    Args:
        df: pandas DataFrame to clean.
        theta: maximum allowed fraction of missing values per column
            (columns with a fraction <= theta are kept).
        k: value passed as ``k`` to ``KNN``.
        output_path: destination of the imputed CSV file.

    Returns:
        The imputed pandas DataFrame that was written to ``output_path``.
    """
    # replace all negative values with NaN
    print("Replacing missing values with NaN")
    df[df < 0] = np.nan
    # replace Missing or Other with NaN
    df[df == 'Missing'] = np.nan
    df[df == 'Other'] = np.nan

    # drop columns whose missingness exceeds theta
    print("Dropping columns with over %g%% missingness" % (theta * 100))
    # isnull().mean() is a true float fraction.  The previous
    # `isnull().sum() / len(...)` divided two integers, which floor-divides
    # under Python 2 and made the fraction 0 for any column that was not
    # completely missing, so the threshold never took effect.
    include_col = [col for col in df.columns
                   if df[col].isnull().mean() <= theta]
    dropped_col = len(df.columns) - len(include_col)
    df = df[include_col]
    print("Dropped  %d  columns" % dropped_col)

    # find and drop columns that do not contain numeric values
    print("Dropping columns that do not contain numeric values")
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
    print("Columns dropped:  %s" % (non_numeric_cols,))
    df = df.drop(non_numeric_cols, axis=1)

    # using knn imputation
    print("Running Knn imputation")
    imputed_values = KNN(k=k).complete(df)
    # the imputer's result is re-wrapped with the surviving column names
    df_imputed = pd.DataFrame(imputed_values, columns=df.columns)

    # save result
    df_imputed.to_csv(output_path, sep=',', index=False)
    return df_imputed
background['cf4fint'] = background['cf4fint'].astype('category') myrange = len(background['cf4fint'].value_counts()) background['cf4fint'] = background['cf4fint'].cat.rename_categories( list(range(1, myrange + 1))) background['hv5_wj9ae'] = background['hv5_wj9ae'].astype('float') #at least 400 non-NAN values for each column background = background.dropna(axis=1, how='all') background = background.dropna(axis=1, thresh=400) #Missing value imputation with KNN background_filled_knn = KNN(k=100).complete(background) background_filled_knn = pd.DataFrame(background_filled_knn, columns=[list(background)]) background_filled_knn.to_csv('background_filled_knn.csv') train = pd.read_csv('train.csv') train = train[['challengeID', 'gpa']] train = train.dropna() training = pd.concat([background_filled_knn, train], axis=1, join='inner') #training.to_csv('concated_train_background.csv') #getting features from rlasso with tuning feature_set = [] from sklearn.linear_model import RandomizedLasso params = [0.004, 0.0000004] predictors = training.iloc[:, 1:len(list(training)) - 1] targets = training['gpa'] pred_train, pred_test, tar_train, tar_test = train_test_split(predictors,