# Shared imports assumed by the snippets in this section:
import statistics as stat
import numpy as np
import pandas as pd
import fancyimpute


def impute_missing_vals(df, num_cols, cat_cols):
    df_full = pd.DataFrame(columns=df.columns)
    for j in np.unique(df.ID):
        df_n = df[df.ID == j].reset_index(drop=True)
        if df_n.shape[0] > 2:
            # numeric columns: KNN imputation within this ID group
            # (.complete() became .fit_transform() in fancyimpute >= 0.3)
            if df_n[num_cols].isnull().sum().sum() > 0:
                df_n[num_cols] = pd.DataFrame(
                    fancyimpute.KNN(k=5).fit_transform(df_n[num_cols]),
                    columns=num_cols)
            # (alternative: fill each numeric column with its group mean)
            # categorical columns: forward-fill when the mode is ambiguous,
            # otherwise fill with the mode
            for i in cat_cols:
                if df_n[i].isnull().any():
                    modes = df_n[i].mode()  # public stand-in for stat._counts
                    if len(modes) > 1:
                        df_n[i] = df_n[i].ffill().astype(object)
                    else:
                        df_n.loc[df_n[i].isnull(), i] = modes.iloc[0]
        df_full = pd.concat([df_full, df_n], ignore_index=True)
    return df_full
def outlier_imputer(df_o, num_cols):
    # Outlier analysis: set values outside the Tukey (IQR) fences to NaN,
    # then re-impute with KNN until no outliers remain.
    while True:
        for i in num_cols:
            q1, q3 = df_o[i].quantile(0.25), df_o[i].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            df_o.loc[df_o[i] < lower, i] = np.nan
            df_o.loc[df_o[i] > upper, i] = np.nan
        missing_val = df_o.isnull().sum()
        if missing_val.sum() > 0:
            df_o[num_cols] = pd.DataFrame(
                fancyimpute.KNN(k=3).fit_transform(df_o[num_cols]),
                columns=num_cols)
            # (alternative: fill each numeric column with its mean)
        else:
            break
    df_o.loc[df_o['Reason for absence'] == 0, 'Reason for absence'] = 20
    df_o.loc[df_o['Month of absence'] == 0,
             'Month of absence'] = stat.mode(df_o['Month of absence'])
    return df_o
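# A quick, self-contained check of the Tukey (IQR) fences used above, on
# hypothetical toy data (my illustration, not part of the original pipeline):
s = pd.Series([10, 12, 11, 13, 12, 95])  # 95 is an obvious outlier
q1, q3 = s.quantile(0.25), s.quantile(0.75)
lower = q1 - 1.5 * (q3 - q1)
upper = q3 + 1.5 * (q3 - q1)
print(s[(s < lower) | (s > upper)])  # flags only the 95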
def perform_knn_imputation(dfs):
    knn_imputed_datasets = [
        fancyimpute.KNN(k=100, verbose=True).fit_transform(df) for df in dfs
    ]
    return [pd.DataFrame(data=arr) for arr in knn_imputed_datasets]
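# Hypothetical usage sketch for perform_knn_imputation; k=100 only makes
# sense when each frame has well over 100 rows, so the toy data below uses
# 300 (my illustration, not from the original source):
rng = np.random.RandomState(0)
dfs = [pd.DataFrame(rng.rand(300, 5)) for _ in range(2)]
for d in dfs:
    d.iloc[::7, 2] = np.nan  # knock out some values
imputed = perform_knn_imputation(dfs)
print(imputed[0].isnull().sum().sum())  # expect 0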
def knn_fill_conf(perc, c):
    """KNN fill that provides the confidence intervals."""
    clf = BayesianRidge()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    design = fancyimpute.KNN(k=3).fit_transform(df)
    clf.fit(design, y)
    drug_preds, std = clf.predict(drug_vals, return_std=True)
    # count how many true values fall in the low-density tail (p < 0.05)
    return sum(scipy.stats.norm(drug_preds, std).pdf(drug_true) < 0.05)
def impute_value(self, df, method="MICE"):
    """Impute missing values with MICE (default), KNN, or a simple fill.

    Note: MICE and the .complete() API are from fancyimpute < 0.4; later
    releases replace them with IterativeImputer and .fit_transform().
    """
    if method == "MICE":
        return fi.MICE(verbose=False).complete(df)
    elif method == "KNN":
        return fi.KNN(k=4, verbose=False).complete(df)
    else:
        return fi.SimpleFill().complete(df)
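# For fancyimpute >= 0.4 the same dispatch would use the current API (MICE
# was folded into IterativeImputer, .complete() became .fit_transform());
# a minimal sketch under that assumption, not the author's original code:
import fancyimpute as fi


def impute_value_modern(df, method="MICE"):
    if method == "MICE":
        return fi.IterativeImputer().fit_transform(df)
    elif method == "KNN":
        return fi.KNN(k=4, verbose=False).fit_transform(df)
    else:
        return fi.SimpleFill().fit_transform(df)


rng = np.random.RandomState(0)
X = rng.rand(10, 3)
X[2, 1] = np.nan
print(impute_value_modern(X, method="KNN"))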
def score_rent(self):
    X_train, X_test, y_train, y_test, contin_cols = self.preprocessing()

    # separate continuous and categorical columns
    con_train = X_train[contin_cols]
    cat_train = X_train[X_train.columns.difference(contin_cols)]

    # fancyimpute MICE on the continuous training data
    # (fancyimpute < 0.4 API; later releases use IterativeImputer)
    mice = fancyimpute.MICE(verbose=0)
    con_train = np.asarray(con_train)
    con_train_mice = mice.complete(con_train)

    # fancyimpute KNN on the categorical training data
    cat_train = np.asarray(cat_train)
    cat_train_fancyknn = fancyimpute.KNN().complete(cat_train)
    cat_train_fancyknn = np.round(cat_train_fancyknn).astype(int)

    # apply a Box-Cox transformation to columns without negative values
    con_train_mice_bc = np.empty(con_train_mice.shape)
    from scipy import stats
    for i in range(len(contin_cols)):
        if np.argwhere(con_train_mice[:, i] < 0).size == 0:
            # shift by a small epsilon so zeros are strictly positive
            con_train_mice_bc[:, i] = stats.boxcox(
                con_train_mice[:, i] + 1e-5)[0]
        else:
            con_train_mice_bc[:, i] = con_train_mice[:, i]

    # one-hot encode the categorical training data
    enc = OneHotEncoder()
    enc = enc.fit(cat_train_fancyknn)
    oh = enc.transform(cat_train_fancyknn).toarray()
    cat_train_fancyknn_onehot = np.round(oh).astype(int)

    # concatenate the imputed training data
    X_train_imp = np.concatenate(
        (cat_train_fancyknn_onehot, con_train_mice_bc), axis=1)

    # feature selection using Lasso
    select_lassocv = SelectFromModel(LassoCV())
    select_lassocv = select_lassocv.fit(X_train_imp, y_train)

    # grid search over the Lasso regularization strength
    # (Lasso's normalize= was removed in scikit-learn 1.2; scale explicitly there)
    param_grid = {'alpha': np.logspace(-3, 0, 14)}
    grid = GridSearchCV(Lasso(normalize=True, max_iter=int(1e6)),
                        param_grid, cv=10)

    # pipeline to prevent information leakage
    pipe_lassocv = make_pipeline(MinMaxScaler(), select_lassocv, grid)
    pipe_lassocv = pipe_lassocv.fit(X_train_imp, y_train)
    train_r2 = np.mean(
        cross_val_score(pipe_lassocv, X_train_imp, y_train, cv=5))
    return contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test
def impute_parameter_adjustment(method, param_grid, impute_radio, x_init,
                                y_init, reference_x, reference_y):
    model = joblib.load('..\\models\\vote_model_hard.joblib')
    markers = ['o', '*', '1', 's', '2']
    n_runs = 20
    for radio, marker in zip(impute_radio, markers):
        acc_1 = {i: 0 for i in param_grid}
        acc_2 = {i: 0 for i in param_grid}
        for m in range(n_runs):
            corruptor = Corruptor(x_init, radio)
            x_miss = getattr(corruptor, "mcar")()
            for n in param_grid:
                if method == 'knn':
                    x_impute = fancyimpute.KNN(k=n).fit_transform(
                        np.vstack((x_miss, reference_x)))[:x_init.shape[0]]
                elif method == 'mice':
                    # average several posterior draws (fancyimpute 0.4.x
                    # IterativeImputer, which still takes n_iter)
                    data_impute_list = []
                    for i in range(n):
                        imputer = fancyimpute.IterativeImputer(
                            n_iter=13, sample_posterior=True, random_state=i)
                        data_impute_list.append(
                            imputer.fit_transform(
                                np.vstack((x_miss, reference_x))
                            )[:x_init.shape[0]])
                    x_impute = np.mean(data_impute_list, 0)
                    print(radio, m, n)
                elif method == 'em':
                    x_impute = em(np.vstack((x_miss, reference_x)),
                                  loops=n)[:x_init.shape[0]]
                elif method == 'som':
                    x_impute = impute_SOM(x_miss, n)[:x_init.shape[0]]
                y_pred1 = model.predict(x_impute)
                y_pred2 = model.predict(x_init)
                acc_1[n] += 1 - accuracy_score(y_pred1, y_pred2)
                acc_2[n] += 1 - accuracy_score(y_pred1, y_init)
        acc_1 = {i: j / n_runs for i, j in acc_1.items()}
        acc_2 = {i: j / n_runs for i, j in acc_2.items()}
        plt.subplot(121)
        plt.plot(list(acc_1.keys()), list(acc_1.values()), marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and prediction')
        plt.subplot(122)
        plt.plot(list(acc_2.keys()), list(acc_2.values()), marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and real label')
    plt.legend(loc=0, bbox_to_anchor=(0.3, -0.05), ncol=5)
    plt.show()
def impute(data, method='knn', n=5):
    if method == 'knn':
        data_impute = fancyimpute.KNN(k=n).fit_transform(data)
    elif method == 'mice':
        # average 11 posterior draws (fancyimpute 0.4.x IterativeImputer)
        data_impute_list = []
        for i in range(11):
            imputer = fancyimpute.IterativeImputer(n_iter=13,
                                                   sample_posterior=True,
                                                   random_state=i)
            data_impute_list.append(imputer.fit_transform(data))
        data_impute = np.mean(data_impute_list, 0)
    elif method == 'em':
        data_impute = em(data)
    elif method == 'mean':
        data_impute = fancyimpute.simple_fill.SimpleFill(
            fill_method='mean').fit_transform(data)
    return data_impute
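# A small smoke test for the impute() dispatcher above (hypothetical data;
# 'em' needs the project's own em() helper and 'mice' assumes fancyimpute
# 0.4.x, so only 'knn' and 'mean' are exercised here):
rng = np.random.RandomState(42)
data = rng.normal(size=(50, 4))
data[rng.rand(50, 4) < 0.1] = np.nan
for m in ('knn', 'mean'):
    filled = impute(data.copy(), method=m)
    assert not np.isnan(filled).any(), m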
def outlier_imputer(df_o, num_cols):
    # Outlier analysis: variant that writes the KNN-imputed numeric block
    # back into columns 9:15 of the frame (assumed to hold num_cols).
    while True:
        for i in num_cols:
            q1, q3 = df_o[i].quantile(0.25), df_o[i].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            df_o.loc[df_o[i] < lower, i] = np.nan
            df_o.loc[df_o[i] > upper, i] = np.nan
        missing_val = df_o.isnull().sum()
        print(missing_val)
        if missing_val.sum() > 0:
            df_o_knn = pd.DataFrame(
                fancyimpute.KNN(k=3).fit_transform(df_o[num_cols]),
                columns=num_cols)
            df_o.iloc[:, 9:15] = df_o_knn.iloc[:, :]
        else:
            break
    return df_o
def predict_rent(self):
    (contin_cols, enc, pipe_lassocv, train_r2,
     X_test, y_test) = self.score_rent()
    con_test = X_test[contin_cols]
    cat_test = X_test[X_test.columns.difference(contin_cols)]

    # impute the continuous test data (fancyimpute < 0.4 API)
    mice = fancyimpute.MICE(verbose=0)
    con_test = np.asarray(con_test)
    con_test_mice = mice.complete(con_test)

    # impute the categorical test data
    cat_test = np.asarray(cat_test)
    cat_test_fancyknn = fancyimpute.KNN().complete(cat_test)
    cat_test_fancyknn = np.round(cat_test_fancyknn).astype(int)

    # apply a Box-Cox transformation to the continuous test data
    con_test_mice_bc = np.empty(con_test_mice.shape)
    from scipy import stats
    for i in range(len(contin_cols)):
        if np.argwhere(con_test_mice[:, i] < 0).size == 0:
            con_test_mice_bc[:, i] = stats.boxcox(
                con_test_mice[:, i] + 1e-5)[0]
        else:
            con_test_mice_bc[:, i] = con_test_mice[:, i]

    # one-hot encode the categorical test data with the encoder
    # fitted on the training data
    oh = enc.transform(cat_test_fancyknn).toarray()
    cat_test_fancyknn_onehot = np.round(oh).astype(int)

    # concatenate the imputed test data
    X_test_imp = np.concatenate(
        (cat_test_fancyknn_onehot, con_test_mice_bc), axis=1)

    # predict with the trained pipeline
    y_pred = pipe_lassocv.predict(X_test_imp)
    test_r2 = r2_score(y_test, y_pred)
    print(test_r2)
    return X_test, y_test, y_pred
# (tail of a loop over imputation methods, mirroring the KNN sweep below:
# knock out known entries, re-impute, then score the reconstruction)
            missing_matrix[i] = np.nan
        complete_matrix = method(verbose=False).fit_transform(missing_matrix)
        imputed = [complete_matrix[i] for i in indices]
        correlations[method].append(
            pd.DataFrame([imputed, originals]).T.corr().iloc[0, 1])
        deviation = np.mean([abs((o - i) / o)
                             for o, i in zip(originals, imputed)
                             if o == o and o > .01])  # o == o skips NaNs
        percent_off[method].append(deviation)

# try different K values for KNN
for k in range(4, 15):
    print('using k=%s' % k)
    key = 'KNN:K=%s' % k
    correlations[key] = []
    percent_off[key] = []
    for simulation in range(100):
        indices = get_rand_index(base_matrix, 5000)
        originals = [base_matrix[i] for i in indices]
        missing_matrix = base_matrix.copy()
        for i in indices:
            missing_matrix[i] = np.nan
        complete_matrix = fancyimpute.KNN(
            k=k, verbose=False).fit_transform(missing_matrix)
        imputed = [complete_matrix[i] for i in indices]
        correlations[key].append(
            pd.DataFrame([imputed, originals]).T.corr().iloc[0, 1])
        deviation = np.mean([abs((o - i) / o)
                             for o, i in zip(originals, imputed)
                             if o == o and o > .01])
        percent_off[key].append(deviation)
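# One way to summarize the sweep afterwards (a sketch; assumes the
# correlations and percent_off dicts populated above):
for key in sorted(correlations, key=str):
    print('%s: corr=%.4f, mean relative error=%.4f'
          % (key, np.mean(correlations[key]), np.mean(percent_off[key])))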
colnamesX = ['PTS_x', 'DRPG_x', 'TOPG_x', 'PF_x', 'RPG_x']
# colnamesX = ['APG_y', 'RPG_y', 'PPG_y', 'FG%_y', 'STPG_y', 'BLKPG_y',
#              'GP_y', 'MPG_y', '2P%_y', '3P%_y']
# colnamesX = ['PTS_x', 'PF_x', 'TOPG_x']
colnamesY = ['Class']  # class label
allCols = colnamesX + colnamesY

# read in data, shuffling as we go
bigDF = shuffle(pd.read_csv(args.path_to_dataframe))

# keep rows with at least two non-missing values, then impute the rest
focusDF = bigDF[allCols].dropna(thresh=2)
dataY = focusDF[colnamesY]  # labels
dataX = focusDF[colnamesX]
dataX = fi.KNN(k=3).fit_transform(dataX)
print("Feature vector table shape:", dataX.shape)
print("Label table shape:", dataY.shape)

# dividing X, y into train and test data
X_train, X_test, y_train, y_test = train_test_split(dataX,
                                                    dataY.values.ravel(),
                                                    test_size=0.25)
scalerX = MinMaxScaler()
scalerX.fit(X_train)  # fit the scaler on the training data only
# apply the scaler to the X training data
X_train_std = scalerX.transform(X_train)
# apply the SAME scaler to the X test data
def impute(self):
    # fancyimpute >= 0.3 renamed .complete() to .fit_transform()
    return fi.KNN(verbose=False).fit_transform(self.missing_data)
csv_reader = csv.reader(file_csv_input)
list_movie_id = list(csv_reader)
list_movie_id = [int(each[0]) for each in list_movie_id]

# build the (user x movie) rating matrix, NaN where unrated
np_data = np.zeros((number_of_user, number_of_movie))
np_data.fill(np.nan)
for each in list_data:
    user_id = int(each[0]) - 1
    movie_id = list_movie_id.index(int(each[1]))
    rating = float(each[2])
    np_data[user_id, movie_id] = rating
print(np_data)

time_start = time.time()
model = fancyimpute.KNN(K_of_knn + 1)  # first positional argument is k
np_prediction = model.fit_transform(np_data)

error = []
for each in list_test:
    predict_user_id = int(each[0]) - 1
    predict_movie_id = list_movie_id.index(int(each[1]))
    real_rating = float(each[2])
    predict_rating = np_prediction[predict_user_id][predict_movie_id]
    error.append(abs(real_rating - predict_rating))
time_end = time.time()
print('Finish, used %.3lfs' % (time_end - time_start))
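# The error list is collected but never summarized in this fragment; a
# natural follow-up (my addition, not the original script) is the MAE:
print('MAE over %d test ratings: %.4f' % (len(error), np.mean(error)))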
print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))

# save mean imputation results
print(X_c.shape, Y_c.shape, Z_c.shape)
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)

# Algo 2: KNN imputation
X_knn = []
for x, y in zip(X, Y):
    X_knn.append(fancyimpute.KNN(k=10, verbose=False).fit_transform(x))
X_c = np.concatenate(X, axis=0)
Y_c = np.concatenate(Y, axis=0)
X_knn = np.concatenate(X_knn, axis=0)
print('KNN imputation')
print(get_loss(X_c, X_knn, Y_c))

# Matrix factorization: since MF is extremely slow, we evaluate the
# imputation result every 100 iterations
X_mf = []
def preprocess(trainfile, testfile, outputdir, useless_attr, miss_threshold,
               xstrategy, ymin, ymax, ystrategy, fill_method="MICE",
               normal01=True):
    """Preprocess X and Y: matrix completion, normalization, scaling, etc.

    :param trainfile: string, path to the training set (d_train_20180102.csv)
    :param testfile: string, path to the test set (d_test_A_20180102.csv)
    :param outputdir: string, directory where preprocessed files are saved
    :param useless_attr: list, useless attributes to drop, e.g. [0, 1, 2, 3]
    :param miss_threshold: float, fraction of missing values above which an
        attribute is dropped, e.g. 0.7
    :param xstrategy: string, how to treat outliers in X {"replace", "nothing"}
    :param ymin: float, values of Y below this are treated as outliers
    :param ymax: float, values of Y above this are treated as outliers
    :param ystrategy: string, how to treat outliers in Y
        {"delete", "replace", "nothing"}
    :param fill_method: string, matrix-completion strategy
        {"KNN", "SoftI", "MF", "MICE"}
    :param normal01: bool, if True the result is scaled to [0, 1]
    :return: list, normalized trainX, trainY, testX, plus the train ids
    """
    # 0. read the training and test sets
    train_XY = convert(trainfile)
    test_X = convert(testfile)
    print("Datasets loaded; starting preprocessing")

    # 1. drop useless attribute columns
    train_id = train_XY[:, 0:1]
    test_id = test_X[:, 0:1]
    train_XY = np.delete(train_XY, useless_attr, axis=1)
    test_X = np.delete(test_X, useless_attr, axis=1)
    n_test = test_X.shape[0]
    info1 = ("1. Dropped useless attributes %s; train_XY.shape=%s, "
             "test_X.shape=%s"
             % (str(useless_attr), str(train_XY.shape), str(test_X.shape)))
    print(info1)

    # 2. drop columns with severe missingness
    miss_mask = np.isnan(train_XY)
    n = miss_mask.shape[0]
    column_del = []  # columns to delete
    for i in range(miss_mask.shape[1]):
        miss_n = miss_mask[:, i].sum()
        if miss_n / n >= miss_threshold:
            column_del.append(i)
    train_XY = np.delete(train_XY, column_del, axis=1)
    test_X = np.delete(test_X, column_del, axis=1)
    info2 = ("2. Dropped attributes missing in more than %f%% of rows: %s"
             % (miss_threshold * 100, str(column_del)))
    print(info2)

    # 3. denoise Y with manually chosen thresholds
    train_Y = train_XY[:, -1:]
    upper_mask = train_Y > ymax
    lower_mask = train_Y < ymin
    if ystrategy == "replace":
        train_Y[upper_mask] = ymax
        train_Y[lower_mask] = ymin
    elif ystrategy == "delete":
        index = np.array(np.arange(0, train_Y.shape[0], 1), ndmin=2).T
        chsn_mask = upper_mask | lower_mask
        train_XY = np.delete(train_XY, index[chsn_mask], axis=0)
        train_id = np.delete(train_id, index[chsn_mask], axis=0)
    elif ystrategy == "nothing":
        pass
    else:
        raise ValueError("'ystrategy' must be one of {nothing, replace, delete}")
    train_Y = train_XY[:, -1:]
    print("3. Denoised trainY (%s); train_XY.shape=%s"
          % (ystrategy, train_XY.shape))

    # 4. denoise X, using boxplot whiskers as thresholds
    train_X = train_XY[:, :-1]
    all_X = np.concatenate([train_X, test_X], axis=0)
    attr_n = train_XY.shape[1] - 1
    # per-attribute (min, max) thresholds from the boxplot whiskers
    attr_min_max = np.zeros((attr_n, 2), dtype=np.float64)
    if xstrategy == "nothing":
        pass
    elif xstrategy == "replace":  # replace outliers in X with the thresholds
        for i in range(attr_n):
            # shallow copy of the column: writing to crt_attr writes to all_X
            crt_attr = all_X[:, i:i + 1]
            miss = np.isnan(crt_attr)
            box_dic = plt.boxplot(crt_attr[~miss])
            crt_max = box_dic["caps"][0].get_ydata()[0]
            crt_min = box_dic["caps"][1].get_ydata()[0]
            if crt_max < crt_min:
                crt_max, crt_min = crt_min, crt_max
            attr_min_max[i, 0] = crt_min
            attr_min_max[i, 1] = crt_max
            crt_attr[miss] = 0
            upper_mask = crt_attr > crt_max
            lower_mask = crt_attr < crt_min
            upper_mask &= ~miss
            lower_mask &= ~miss
            crt_attr[upper_mask] = crt_max
            crt_attr[lower_mask] = crt_min
            crt_attr[miss] = np.nan
    else:
        raise ValueError("'xstrategy' must be one of {nothing, replace}")
    print("4. Denoised all of X (%s)." % xstrategy)

    # 5. matrix completion (fancyimpute < 0.4 API: .complete())
    if fill_method == "KNN":
        completer = fi.KNN(verbose=False)
    elif fill_method == "SoftI":
        completer = fi.SoftImpute(verbose=False)
    elif fill_method == "MF":
        completer = fi.MatrixFactorization(verbose=False)
    elif fill_method == "MICE":
        completer = fi.MICE(verbose=False)
    else:
        raise ValueError("'fill_method' must be one of {'KNN','SoftI','MF','MICE'}.")
    all_X_complete = completer.complete(all_X)
    print("5. Completed all_X (%s)." % fill_method)

    # 6. standardize, then scale to [0, 1]
    if normal01:
        X_nmler = StandardScaler()
        X_01 = MinMaxScaler()
        Y_nmler = StandardScaler()
        Y_01 = MinMaxScaler()
        X_nmler.fit(all_X_complete)
        Y_nmler.fit(train_Y)
        all_X_nml = X_nmler.transform(all_X_complete)
        train_Y_nml = Y_nmler.transform(train_Y)
        X_01.fit(all_X_nml)
        Y_01.fit(train_Y_nml)
        all_X_nml01 = X_01.transform(all_X_nml)
        train_Y_nml01 = Y_01.transform(train_Y_nml)
        final_train_X = all_X_nml01[:-n_test, :]
        final_test_X = all_X_nml01[-n_test:, :]
        final_train_Y = np.concatenate([train_Y_nml01, train_Y], axis=1)
    else:
        final_train_X = all_X_complete[:-n_test, :]
        final_test_X = all_X_complete[-n_test:, :]
        final_train_Y = train_Y
    print("6. Normalized all_X and train_Y to [0, 1] (%s)." % normal01)

    # 7. save the data as set_kind_timestamp.csv
    print("7. Saving data to %s." % outputdir)
    timestamp = "0000"  # datetime.now().strftime("%Y%m%d%H%M")
    np.savetxt(outputdir + r"\train_X_" + timestamp + ".csv",
               final_train_X, delimiter=",")
    np.savetxt(outputdir + r"\test_X_" + timestamp + ".csv",
               final_test_X, delimiter=",")
    np.savetxt(outputdir + r"\train_Y_" + timestamp + ".csv",
               final_train_Y, delimiter=",")
    np.savetxt(outputdir + r"\train_id_" + timestamp + ".csv",
               train_id.astype(np.int64), delimiter=",")
    np.savetxt(outputdir + r"\test_id_" + timestamp + ".csv",
               test_id.astype(np.int64), delimiter=",")
    # return the processed arrays (the docstring promises the normalized data)
    return final_train_X, final_train_Y, final_test_X, train_id
def load_and_process_data(self, data_path):
    df = pd.read_csv(data_path, sep=';')
    df = df[df['Month of absence'] != 0]
    self.preprocessed_data = df.copy()
    cat_cols = [
        'ID', 'Reason for absence', 'Month of absence', 'Day of the week',
        'Seasons', 'Education', 'Son', 'Pet'
    ]
    num_cols = [
        'Transportation expense', 'Distance from Residence to Work',
        'Service time', 'Age', 'Work load Average/day ', 'Hit target',
        'Disciplinary failure', 'Social drinker', 'Social smoker', 'Weight',
        'Height', 'Body mass index', 'Absenteeism time in hours'
    ]

    # collapse 'Reason for absence' into four groups
    df.loc[df['Reason for absence'].isin(range(1, 15)),
           'Reason for absence'] = 1
    df.loc[df['Reason for absence'].isin(range(15, 19)),
           'Reason for absence'] = 2
    df.loc[df['Reason for absence'].isin(range(19, 22)),
           'Reason for absence'] = 3
    df.loc[df['Reason for absence'].isin(range(22, 29)),
           'Reason for absence'] = 4
    df.Education = df.Education.map({1: 0, 2: 1, 3: 1, 4: 1})
    df.Pet = df.Pet.map({0: 0, 1: 1, 2: 1, 4: 1, 5: 1, 8: 1})
    df.Son = df.Son.map({0: 0, 1: 0, 2: 1, 3: 1, 4: 1})

    # mark values outside the Tukey fences as NaN and re-impute with KNN
    # until no outliers remain
    while True:
        for i in num_cols:
            q1, q3 = df[i].quantile(0.25), df[i].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            df.loc[df[i] < lower, i] = np.nan
            df.loc[df[i] > upper, i] = np.nan
        missing_val = df.isnull().sum()
        if missing_val.sum() > 0:
            df[num_cols] = pd.DataFrame(
                fancyimpute.KNN(k=3).fit_transform(df[num_cols]),
                columns=num_cols)
            # (alternative: fill each numeric column with its mean)
        else:
            break

    df.drop([
        'ID', 'Weight', 'Age', 'Social smoker', 'Disciplinary failure',
        'Education', 'Pet', 'Absenteeism time in hours'
    ], axis=1, inplace=True)
    # category 0 of 'Reason for absence' is the dropped baseline
    df = pd.get_dummies(df, columns=['Reason for absence'], drop_first=True)
    df = df[[
        'Reason for absence_1', 'Reason for absence_2',
        'Reason for absence_3', 'Reason for absence_4', 'Month of absence',
        'Day of the week', 'Seasons', 'Transportation expense',
        'Distance from Residence to Work', 'Service time',
        'Work load Average/day ', 'Hit target', 'Son', 'Social drinker',
        'Height', 'Body mass index'
    ]]
    num_cols = [
        'Transportation expense', 'Distance from Residence to Work',
        'Service time', 'Work load Average/day ', 'Hit target', 'Height',
        'Body mass index'
    ]
    self.data = df
    # scaler is assumed to have been fitted during training
    self.data[num_cols] = self.scaler.transform(self.data[num_cols])
    print(self.data[num_cols].head())
rows = all_matches[pd.isnull(all_matches[betting_odds_combined]).any(axis=1)]
print(len(rows))
for index, row in rows.iterrows():
    if index % 1000 == 0:
        print(index)
    # fill a bookmaker's missing odds with the mean of its other odds
    for betting_odds in betting_odds_all:
        mean = np.mean(row[betting_odds])
        missing = row[betting_odds].index[row[betting_odds].isnull().tolist()]
        all_matches.loc[index, missing] = mean

# KNN-impute the home and away feature blocks separately
home = all_matches.filter(regex='__home_')
away = all_matches.filter(regex='__away_')
home_filled = fi.KNN().fit_transform(home)
home_filled = pd.DataFrame(data=home_filled, columns=home.columns,
                           index=home.index)
away_filled = fi.KNN().fit_transform(away)
away_filled = pd.DataFrame(data=away_filled, columns=away.columns,
                           index=away.index)
all_matches_filled = all_matches.copy()
all_matches_filled[home.columns] = home_filled
all_matches_filled[away.columns] = away_filled

cols_to_normalize = [
    '__home_buildUpPlaySpeed',
    '__home_buildUpPlayDribbling',
    # (list truncated in the source)
mecanismo="MAR" robjects.r.assign('meca',mecanismo) #robjects.r('print(meca)') proporción=0.4 robjects.r.assign('propor',proporción) robjects.r(""" A=matrix(nrow=52,ncol=2, data,byrow=TRUE) """) robjects.r(""" require(mice) result <- ampute(data=A, prop = propor,mech=meca, bycases = FALSE) """) y = robjects.r('result$amp') #print('el valor de y:',y) print(y[0]) archivo=rd.lista_imputada(y) print(archivo) wt.write(archivo,'datos_amput.csv') #robjects.r('print(propor)') #robjects.r('print(x)') vecinos=3 var=fy.KNN(k=vecinos).fit_transform(archivo) var2=my.KNNImputer(n_neighbors=vecinos).fit_transform(archivo) print(len(var)) archivo_imputado=rd.reconvertir(Archivo[0],var) archivo_imputado2=rd.reconvertir(Archivo[0],var2) print(archivo_imputado) wt.write(archivo_imputado,'datos_imputados.csv') wt.write(archivo_imputado2,'datos_imputados2.csv') #print(pi[0])
accepts_x.head()

# In[5]:

accepts_y = accepts['bad_ind']

# In[ ]:

rejects.head()

# In[6]:

rejects_x = rejects[[
    "tot_derog", "age_oldest_tr", "rev_util", "fico_score", "ltv"
]]
rejects_x.info()

# In[ ]:

accepts_x.info()

import fancyimpute as fimp

# KNN-impute both frames (.as_matrix() was removed in pandas >= 1.0, use
# .to_numpy(); .complete() became .fit_transform() in fancyimpute >= 0.3)
accepts_x_filled = pd.DataFrame(fimp.KNN(3).fit_transform(accepts_x.to_numpy()))
accepts_x_filled.columns = accepts_x.columns
rejects_x_filled = pd.DataFrame(fimp.KNN(3).fit_transform(rejects_x.to_numpy()))
rejects_x_filled.columns = rejects_x.columns
from selfregulation.utils.utils import get_behav_data, get_recent_dataset, get_demographics
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.get_balanced_folds import BalancedKFold

# load data
dataset = get_recent_dataset()
items = get_behav_data(dataset=dataset, file='items.csv.gz')
subject_items = get_behav_data(dataset=dataset, file='subject_x_items.csv')
survey_ontology = load_results(dataset)['survey']
demographics = survey_ontology.DA.data
demo_factors = survey_ontology.DA.get_scores()

# set up prediction
imputer = fancyimpute.KNN()
predictors = imputer.fit_transform(subject_items)
targets = demo_factors.values

# cross-validated prediction of each demographic factor
# (note: folds are balanced on the first factor for every target)
for i, name in enumerate(demo_factors.columns):
    CV = BalancedKFold(nfolds=10)
    CV_iter = list(CV.split(predictors, targets[:, 0]))
    clf = RidgeCV(cv=5)
    score = cross_val_score(clf, survey_ontology.EFA.get_scores(),
                            targets[:, i], cv=CV_iter,
                            scoring=make_scorer(r2_score)).mean()
    print('%s Score: %.2f' % (name, score))
def fancy_impute(df, method='mice'):
    # fancyimpute < 0.4 API: MICE and .complete() were later replaced by
    # IterativeImputer and .fit_transform()
    if method == 'knn':
        df = pd.DataFrame(data=fancyimpute.KNN(3).complete(df),
                          columns=df.columns, index=df.index)
    else:
        df = pd.DataFrame(data=fancyimpute.MICE().complete(df),
                          columns=df.columns, index=df.index)
    return df
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.

    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
            - False/None: Do not impute missing data.
            - 'BiScaler'
            - 'IterativeImpute'
            - 'IterativeSVD'
            - 'KNN': Impute with nearest neighbors.
            - 'MatrixFactorization'
            - 'Mean': Impute missing with average of other sources.
            - 'Median': Impute missing with median of other sources.
            - 'NuclearNorm'
            - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.

    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs.
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to include only projection columns.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values
    return df
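# Hypothetical usage of the impute() defined above: three projection sources
# for a handful of players, one missing value, imputed with KNN. Column names
# are invented for illustration; with only three source rows, KNN has at most
# two neighbors to draw on.
toy = pd.DataFrame({
    'Player': ['A', 'B', 'C', 'D'],
    'ESPN': [12.1, 8.4, 20.3, 5.0],
    'Yahoo': [11.8, np.nan, 19.9, 5.2],
    'CBS': [12.5, 8.9, 21.0, 4.8],
})
print(impute(toy, method='KNN'))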