def test_impute_value_custom_idw(test_data):
    """fast_knn should honour a user-supplied inverse-distance-weighting fn."""
    data = test_data(SHAPE, 0, 2)
    shepards_p1 = functools.partial(
        impy.ops.inverse_distance_weighting.shepards, power=1
    )
    result = impy.fast_knn(data, k=2, idw_fn=shepards_p1)
    assert np.isclose(result[0, 2], 8.913911092686593)
def perf_knn_imput(dfs_arg, n_neighbors):
    """kNN-impute every DataFrame in *dfs_arg* and return new DataFrames.

    Each frame's values are run through impy.fast_knn with k=*n_neighbors*;
    the imputed arrays are wrapped back into (unlabelled) DataFrames.
    """
    imputed_arrays = [impy.fast_knn(frame.values, k=n_neighbors)
                      for frame in dfs_arg]
    return [pd.DataFrame(data=arr) for arr in imputed_arrays]
def pre_knn(data, year, year1):
    """Resample daily totals for a year range and kNN-impute the gaps.

    Parameters
    ----------
    data : DataFrame with a 'TIMESTAMP' column usable as a datetime index.
    year, year1 : str
        Start / end year labels; blank strings skip the corresponding slice.

    Returns
    -------
    DataFrame of daily sums whose zero (i.e. padded/missing) 'FP_TOTALENG'
    values are replaced with fast_knn-imputed values.
    """
    # Use the timestamp as the time-series index.
    # data['TIMESTAMP'] = pd.to_datetime(data['TIMESTAMP'])
    data = data.set_index('TIMESTAMP')
    if year.strip():
        if year1.strip():
            data = data[year:year1]
        else:
            data = data[year]
    # Daily resampling fills missing days with 0.
    df_period = data.resample('D').sum()
    # Take a real copy: the original code aliased df_period here, so the
    # in-place replace() below silently mutated df_period as well.
    df_period_clone = df_period.copy()
    # Treat the padded zeros as missing values.
    df_period_clone.replace(0, np.nan, inplace=True)
    isnullcon = df_period_clone.isnull().any()
    # If nothing is missing there is nothing to smooth; return as-is.
    if isnullcon['FP_TOTALENG']:
        df_after = impy.fast_knn(df_period_clone, k=13, eps=0, p=2)
        # NOTE(review): df_after[0][i] reads row 0, column i of the imputed
        # array; it looks like column 0 per row (df_after[i][0]) may have
        # been intended — preserved as-is, confirm against the data layout.
        for i in range(len(df_period_clone)):
            # .loc avoids pandas chained-assignment pitfalls.
            df_period_clone.loc[df_period_clone.index[i], 'FP_TOTALENG'] = df_after[0][i]
    return df_period_clone
def compute_err_kNN(Xtrain, ytrain, Xtest, ytest, n, p, G):
    """Time and score kNN-imputation followed by LDA classification.

    Injects NaNs per class via make_nan_list, min-max scales, imputes with
    impy.fast_knn, fits LDA on the imputed data and evaluates on Xtest.

    Returns
    -------
    (knn_err, knn_time) : test-set error rate and elapsed seconds
        (imputation + fit + predict).
    """
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)  # per-class NA data
    # make_nan_list reorders observations, so rebuild the labels to match
    # the stacked per-class blocks.
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1, G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # Scale train (with NaNs) and test with the same fitted scaler.
    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    # (The original also computed the missing-value percentage and a scaled
    # per-class list, but neither was used or returned — removed.)

    start = time.time()
    Xtr_knn = impy.fast_knn(Xtr_nan)
    print("Finished imputing")
    clf_knn = skLDA().fit(Xtr_knn, ytr)
    knn_err = np.mean(clf_knn.predict(Xtest).flatten() != ytest)
    knn_time = time.time() - start
    return knn_err, knn_time
def test_impute_value(self):
    """fast_knn with k=2 fills the hole with the idw-weighted neighbour mean."""
    grid = np.arange(25, dtype=float).reshape(5, 5)
    grid[0, 2] = np.nan
    imputed = impy.fast_knn(grid, k=2)
    # Weighted average of the two nearest neighbouring rows.
    self.assertTrue(np.isclose(imputed[0][2], 8.913911092686593))
def test_impute_value_custom_idw(self):
    """fast_knn honours a custom inverse-distance-weighting function."""
    grid = np.arange(25, dtype=float).reshape(5, 5)
    grid[0, 2] = np.nan
    shepards_p1 = functools.partial(
        impy.util.inverse_distance_weighting.shepards, power=1
    )
    imputed = impy.fast_knn(grid, k=2, idw=shepards_p1)
    assert np.isclose(imputed[0][2], 8.913911092686593)
def impute_knn(df, numeric_vars, neighbors):
    """kNN-impute the numeric columns of *df*, passing the rest through.

    Returns a new DataFrame: the non-numeric columns (index reset) joined
    with the fast_knn-imputed numeric columns.
    """
    # NOTE(review): the original bound this result to X and immediately
    # overwrote it; the call is kept in case convert_to_numeric mutates df,
    # but the dead binding is gone. Confirm whether the call is needed at all.
    convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()
    # set() difference means the pass-through column order is unspecified.
    other_vars = list(set(df.columns) - set(numeric_vars))
    X_strings = df[other_vars].reset_index()
    imputed_np = fast_knn(X, k=neighbors)
    X_imputed = pd.DataFrame.from_records(imputed_np, columns=numeric_vars)
    return X_strings.join(X_imputed)
def impute_values(df, imp_strategy, neighbors, numeric_vars):
    """Impute the numeric columns of *df* by the chosen strategy.

    imp_strategy == "knn" uses impyute.fast_knn with k=*neighbors*; any
    other value is forwarded to sklearn's SimpleImputer (e.g. "mean").
    Returns the non-numeric columns joined with the imputed numeric ones.
    """
    # NOTE(review): the original bound this result to X and immediately
    # overwrote it; the call is kept in case convert_to_numeric mutates df,
    # but the dead binding is gone. Confirm whether the call is needed at all.
    convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()
    # set() difference means the pass-through column order is unspecified.
    other_vars = list(set(df.columns) - set(numeric_vars))
    X_strings = df[other_vars].reset_index(drop=True)
    if imp_strategy == "knn":
        # sklearn's KNNImputer proved very costly here; impyute's fast_knn
        # is the cheaper alternative:
        # https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        imputed = fast_knn(X, k=neighbors)
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy=imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)
    X_imputed = pd.DataFrame.from_records(imputed, columns=numeric_vars)
    return X_strings.join(X_imputed)
def compute_err_knn(Xtrain, ytrain, n, p, G):
    """Measure how much kNN imputation distorts class means/covariances.

    Injects NaNs per class, standard-scales, imputes with impy.fast_knn,
    then compares the per-class means and pooled covariances of the imputed
    data against those of the clean (scaled) data via err().

    Returns (knn_err, knn_time, per_missing): the err() score, the elapsed
    imputation+estimation time in seconds, and the fraction of NaN cells.
    """
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)  # make NA data
    # since making function changes the order of observation
    # we need to generate new ytr from Xtr_nan
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1, G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))
    # Fit the scaler on the NaN-bearing stack, then reuse it everywhere.
    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    # NOTE(review): this mutates the caller's Xtr_nan_list in place, and the
    # transformed list is never read again in this function — confirm callers
    # rely on (or are unaffected by) that side effect.
    for g in range(G):
        Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])
    # Reference statistics from the clean scaled training data.
    mus = [np.mean(Xtrain[ytrain == g, :], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus)  # each row is a mean of a class
    S = [(sum(ytrain == g) - 1) * np.cov(Xtrain[ytrain == g, :], rowvar=False)
         for g in np.arange(G)]
    S = np.asarray(S) / len(ytrain)
    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))
    start = time.time()
    Xtr_knn = impy.fast_knn(Xtr_nan)
    # NOTE(review): indexing Xtr_knn with ytrain assumes make_nan_list keeps
    # the same per-class block sizes/order as ytrain — TODO confirm.
    mus_knn = np.asarray(
        [np.mean(Xtr_knn[ytrain == g, :], axis=0) for g in np.arange(G)])
    S_knn = np.asarray([
        (sum(ytrain == g) - 1) * np.cov(Xtr_knn[ytrain == g, :], rowvar=False)
        for g in np.arange(G)
    ])
    S_knn = S_knn / len(ytrain)
    knn_err = err(mus, S, mus_knn, S_knn)
    knn_time = time.time() - start
    return knn_err, knn_time, per_missing
def setMissingValues(data):
    """Impute a labelled array per class and persist the result as CSV.

    Wraps the 6-column array (features a-e plus 'label') in a DataFrame,
    groups by label, runs imp.fast_knn(k=3) within each group so the
    imputation only uses same-class rows, writes every imputed row to
    data/new_tiroid.csv and returns them as a list of arrays.
    """
    frame = pd.DataFrame({
        'a': data[:, 0],
        'b': data[:, 1],
        'c': data[:, 2],
        'd': data[:, 3],
        'e': data[:, 4],
        'label': data[:, 5]
    })
    new_data_grouped = list()
    for _, group in frame.groupby('label'):
        # Impute within the class so neighbours share the same label.
        for row in imp.fast_knn(np.array(group), k=3):
            new_data_grouped.append(row)
    # newline='' per the csv module docs (prevents blank rows on Windows);
    # the with-block closes the file, so no explicit close() is needed.
    with open('data/new_tiroid.csv', 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(new_data_grouped)
    return new_data_grouped
def test_impute_value_custom_idw():
    """fast_knn using custom idw"""
    shepards_p1 = functools.partial(
        impy.util.inverse_distance_weighting.shepards, power=1
    )
    result = impy.fast_knn(data_m2, k=2, idw=shepards_p1)
    assert np.isclose(result[0][2], 8.913911092686593)
def test_impute_value(self):
    """fast_knn with the default idw on a 5x5 grid with one missing cell."""
    grid = np.arange(25, dtype=float).reshape(5, 5)
    grid[0, 2] = np.nan
    result = impy.fast_knn(grid, k=2)
    assert np.isclose(result[0][2], 8.38888888888889)
def test_impute_missing_values(self):
    """Imputation must leave no NaN behind."""
    self.assertFalse(np.any(np.isnan(impy.fast_knn(self.data_m))))
def test_return_type(self):
    """fast_knn must hand back an np.ndarray."""
    result = impy.fast_knn(self.data_m)
    self.assertIsInstance(result, np.ndarray)
def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
    """Prepare masked copies of *T* and a registry of imputation methods.

    T: complete DataFrame; mask: 0/1 matrix marking the cells to hide;
    notobj / obj: non-categorical vs. categorical column names; target:
    optional label column kept aside as ``self.target_y``; kf/algo are
    stored for later cross-validation / method selection.
    """
    try:
        self.miss_info = miss_info
        self.columns = notobj
        # ordinal and numeric columns are handled together downstream
        self.ord_num_col = self.miss_info["ord_col"] + self.miss_info[
            "num_col"]
        metric = {"rmse": {}, "nrmse": {}}
        self.rawT = T
        self.target = target
        if target is not None:
            self.target_y = T[target]
        else:
            self.target_y = None
        # independent metric holders (deep-copied so cv and MSE don't share)
        self.cv = {}
        self.cv.update(deepcopy(metric))
        self.kf = kf
        self.MSE = {}
        self.MSE.update(deepcopy(metric))
        self.result = {}
        self.time_ck = {}
        # X is T with the masked cells blanked out to NaN
        X = deepcopy(T)
        mask = pd.DataFrame(mask, columns=T.columns.tolist())
        self.rawmask = mask
        X[(mask == 1).values] = np.nan
        # normalise "no categorical columns" ([] or None) to None
        if obj in [None, []]:
            obj = None
        else:
            pass
        # non-categorical views of the data / mask
        self.X = X[notobj]
        self.T = T[notobj]
        self.mask = mask[notobj]
        self.notobj = notobj
        if obj is not None:
            # ---------- numeric + categorical path ----------
            # seed masked categorical cells with their most frequent value
            cat_impute = SimpleImputer(strategy="most_frequent")
            X[obj] = cat_impute.fit_transform(X[obj])
            self.true_obj = T[obj]
            self.pd_obj = X[obj]
            # re-encode categories numerically via the pre-fitted ce encoder
            TT = deepcopy(T)
            cat_encoder = miss_info["ce_encoder"]
            for k in cat_encoder.category_mapping:
                col, map_ = k["col"], k["mapping"]
                TT[col] = TT[col].replace(
                    dict(zip(k["mapping"].index, k["mapping"].values)))
            self.full_miss_data = TT
            self.full_miss_data[(mask == 1).values] = np.nan
            # the MICE variant keeps categories as prefixed strings instead
            mice_data = deepcopy(T)
            for obj_col in obj:
                mice_data[obj_col] = "Cols_" + mice_data[obj_col]
            self.full_mice_data = mice_data
            self.full_mice_data[(mask == 1).values] = np.nan
        else:
            # ---------- numeric-only path ----------
            num_data = deepcopy(self.X)
            num_data[(self.mask == 1).values] = np.nan
            self.full_miss_data = deepcopy(num_data)
            self.full_mice_data = deepcopy(num_data)
        self.algo = algo
        # registry of imputers keyed by algorithm name; each takes a matrix
        self.method = {
            "MissForest": lambda x: MissForest(verbose=0, n_jobs=-1).fit(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "mode": lambda x: impy.mode(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
            "MultipleImputer": lambda x: MultipleImputer(n=1, return_list=True).fit_transform(pd.DataFrame(x)).values,
        }
    except Exception as e:
        # NOTE(review): swallowing every exception leaves the object
        # half-initialised; confirm callers expect this best-effort setup.
        print(e)
        pass
#数据缺失处理 import impyute as impy data_drop = dataload1.dropna()#将缺失值剔除 data_drop.hist(layout=(1,3),bins=40,figsize=(15,3)) data_mode = dataload1.fillna(dataload1.mode())#用最高频率纸填补缺失值 data_mode.hist(layout=(1,3),bins=40,figsize=(15,3)) data = dataload1[['Unnamed: 0','points','price']] nd = np.array(data) filled_mice = impy.mice(nd)#通过属性的相关关系来填补缺失值 data_mice = pd.DataFrame(filled_mice) data_mice.hist(layout=(1,3),bins=40,figsize=(15,3)) filled_knn = impy.fast_knn(nd,k=3)#通过数据对象之间的相似性来填补缺失值 data_knn = pd.DataFrame(filled_knn) data_knn.hist(layout=(1,3),bins=40,figsize=(15,3)) plt.show() data_drop = dataload2.dropna()#将缺失值剔除 data_drop.hist(layout=(1,3),bins=40,figsize=(15,3)) data_mode = dataload2.fillna(dataload2.mode())#用最高频率纸填补缺失值 data_mode.hist(layout=(1,3),bins=40,figsize=(15,3)) data = dataload2[['Unnamed: 0','points','price']] nd = np.array(data) filled_mice = impy.mice(nd)#通过属性的相关关系来填补缺失值 data_mice = pd.DataFrame(filled_mice)
def test_impute_value(test_data):
    """fast_knn using standard idw"""
    grid = test_data(SHAPE, 0, 2)
    result = impy.fast_knn(grid, k=2)
    assert np.isclose(result[0, 2], 8.38888888888889)
def run_experiment_k_paper(X_test, y_test, clf, NB, a, setting):
    """Compare missing-feature strategies while hiding k features per row.

    For each k in setting["k"], randomly marks k features missing per test
    row and scores: naive-Bayes-with-missing (NB), mean/median/max/min
    substitution through *clf*, EM/MICE/kNN imputation (impyute), and the
    "ours" classifier(s) in *a*. Scores use setting["function"] (default:
    conditional likelihood when setting["prob"], else f1_score).

    Returns a dict of per-k score lists plus the feature count.
    Interface is unchanged from the original.
    """
    import time  # hoisted: the original re-imported inside each impute branch

    # Column-wise substitution values computed once from the clean data.
    X_impute_mean = np.mean(X_test, axis=0)
    X_impute_median = np.median(X_test, axis=0)
    X_impute_max = np.max(X_test, axis=0)
    X_impute_min = np.min(X_test, axis=0)
    X_impute_flip = np.copy(1 - X_test)

    k_all = []
    missing_err_nb_all = []
    missing_err_lr_mean_all = []
    missing_err_lr_median_all = []
    missing_err_lr_max_all = []
    missing_err_lr_min_all = []
    missing_err_lr_flip_all = []
    missing_err_lr_em_impute_all = []
    missing_err_lr_mice_impute_all = []
    missing_err_lr_knn_impute_all = []

    useEM = setting["em"] if "em" in setting else False
    discreteFeatures = setting["discreteFeatures"] if "discreteFeatures" in setting else 1
    featureEncoding = setting["feature_encoding"] if "feature_encoding" in setting else None
    do_emImpute = setting["emImpute"] if "emImpute" in setting else False
    do_miceImpute = setting["miceImpute"] if "miceImpute" in setting else False
    do_knnImpute = setting["knnImpute"] if "knnImpute" in setting else False

    # With useEM, *a* is a list of classifiers scored separately.
    if useEM:
        missing_err_ours_all = {}
        for i in range(len(a)):
            missing_err_ours_all["ours_" + str(i)] = []
    else:
        missing_err_ours_all = []

    useProb = setting["prob"] if "prob" in setting else True
    function = setting["function"] if "function" in setting else None
    if function is None:
        if useProb:
            function = conditional_likelihood_k
        else:
            function = f1_score
    print("Using following function: ")
    print(function)

    repeat = setting["repeat"] if "repeat" in setting else 1
    FEATURES = setting["features"] if "features" in setting else None
    if FEATURES is None:
        NNN = X_test.shape[1]
        if featureEncoding is not None:
            NNN = len(featureEncoding)
        # FIX: integer division — `/` yields a float on Python 3 and
        # range(float) raises TypeError; `//` is identical on Python 2.
        FEATURES = np.array([i for i in range(NNN // discreteFeatures)])
    else:
        FEATURES = np.array(FEATURES)
    print("Possible features to remove: {}".format(FEATURES.shape[0]))

    K = setting["k"]
    for k in K:
        print("K = {}".format(k))
        if k > FEATURES.shape[0]:
            print("Early stop: Only had {} features possible to remove".format(FEATURES.shape[0]))
            break
        cur_nb = []
        cur_lr_mean = []
        cur_lr_median = []
        cur_lr_max = []
        cur_lr_min = []
        cur_flip = []  # NOTE(review): never appended to (flip scoring is commented out below)
        cur_em_impute = []
        cur_mice_impute = []
        cur_knn_impute = []
        if useEM:
            cur_ours = {}
            for i in range(len(a)):
                cur_ours["ours_" + str(i)] = []
        else:
            cur_ours = []

        for R in range(repeat):
            if R % 10 == 0:
                print("\t R = {}".format(R))
            # Fresh float copies so each strategy imputes independently.
            X_test_mean = np.array(X_test, dtype='float')
            X_test_median = np.array(X_test, dtype='float')
            X_test_max = np.array(X_test, dtype='float')
            X_test_min = np.array(X_test, dtype='float')
            X_test_flip = np.array(X_test, dtype='float')
            X_test_em_impute = np.array(X_test, dtype='float')
            X_test_mice_impute = np.array(X_test, dtype='float')
            X_test_knn_impute = np.array(X_test, dtype='float')
            missing = np.zeros(X_test.shape, dtype=bool)

            for i in range(X_test.shape[0]):
                miss = np.random.choice(FEATURES, k, replace=False)
                # Expand each chosen feature to its underlying columns:
                # either via an explicit encoding map, or contiguous groups
                # of `discreteFeatures` one-hot columns.
                if featureEncoding is not None and k > 0:
                    missK = []
                    for m in miss:
                        for z in featureEncoding[m]:
                            missK.append(z)
                    miss = np.copy(np.array(missK))
                elif discreteFeatures != 1 and k > 0:
                    missK = []
                    for m in miss:
                        for z in range(discreteFeatures):
                            missK.append(m * discreteFeatures + z)
                    miss = np.copy(np.array(missK))
                missing[i][miss] = True
                X_test_mean[i][miss] = X_impute_mean[miss]
                X_test_median[i][miss] = X_impute_median[miss]
                X_test_max[i][miss] = X_impute_max[miss]
                X_test_min[i][miss] = X_impute_min[miss]
                X_test_flip[i][miss] = X_impute_flip[i][miss]
                # NaN marks cells for the impyute-based strategies.
                X_test_em_impute[i][miss] = np.nan
                X_test_mice_impute[i][miss] = np.nan
                X_test_knn_impute[i][miss] = np.nan

            if do_emImpute:
                start = time.time()
                loops = 6
                print("\tStarting to em impute with loops = {}".format(loops))
                X_test_em_impute = impyute.em(X_test_em_impute, loops=loops)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_em_impute = np.zeros(X_test.shape)
            if do_miceImpute:
                start = time.time()
                print("\tStarting to mice impute")
                X_test_mice_impute = impyute.mice(X_test_mice_impute)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_mice_impute = np.zeros(X_test.shape)
            if do_knnImpute:
                start = time.time()
                print("\tStarting to knn impute")
                X_test_knn_impute = impyute.fast_knn(X_test_knn_impute)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_knn_impute = np.zeros(X_test.shape)

            # Score every strategy against the clean-data reference.
            lr_prob = clf.predict_proba(X_test)
            if useProb:
                cur_nb.append(function(lr_prob, predict_nbk_with_missing(X_test_mean, NB, missing, prob=True)))
                cur_lr_mean.append(function(lr_prob, clf.predict_proba(X_test_mean)))
                cur_lr_median.append(function(lr_prob, clf.predict_proba(X_test_median)))
                cur_lr_max.append(function(lr_prob, clf.predict_proba(X_test_max)))
                cur_lr_min.append(function(lr_prob, clf.predict_proba(X_test_min)))
                cur_em_impute.append(function(lr_prob, clf.predict_proba(X_test_em_impute)))
                cur_mice_impute.append(function(lr_prob, clf.predict_proba(X_test_mice_impute)))
                cur_knn_impute.append(function(lr_prob, clf.predict_proba(X_test_knn_impute)))
                # cur_flip.append(function(lr_prob, clf.predict_proba(X_test_flip)))
                if not useEM:
                    cur_ours.append(function(lr_prob, a.classify(X_test, missing, prob=True)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append(function(lr_prob, a[z].classify(X_test, missing, prob=True)))
            else:
                cur_nb.append(function(y_test, predict_nbk_with_missing(X_test_mean, NB, missing)))
                cur_lr_mean.append(function(y_test, clf.predict(X_test_mean)))
                cur_lr_median.append(function(y_test, clf.predict(X_test_median)))
                cur_lr_max.append(function(y_test, clf.predict(X_test_max)))
                cur_lr_min.append(function(y_test, clf.predict(X_test_min)))
                cur_em_impute.append(function(y_test, clf.predict(X_test_em_impute)))
                cur_mice_impute.append(function(y_test, clf.predict(X_test_mice_impute)))
                cur_knn_impute.append(function(y_test, clf.predict(X_test_knn_impute)))
                # cur_flip.append(function(y_test, clf.predict(X_test_flip)))
                if not useEM:
                    cur_ours.append(function(y_test, a.classify(X_test_mean, missing)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append(function(y_test, a[z].classify(X_test_mean, missing)))

        k_all.append(k)
        missing_err_nb_all.append(cur_nb)
        missing_err_lr_mean_all.append(cur_lr_mean)
        missing_err_lr_median_all.append(cur_lr_median)
        missing_err_lr_max_all.append(cur_lr_max)
        missing_err_lr_min_all.append(cur_lr_min)
        missing_err_lr_flip_all.append(cur_flip)
        missing_err_lr_em_impute_all.append(cur_em_impute)
        missing_err_lr_mice_impute_all.append(cur_mice_impute)
        missing_err_lr_knn_impute_all.append(cur_knn_impute)
        if useEM:
            for i in cur_ours:
                missing_err_ours_all[i].append(cur_ours[i])
        else:
            missing_err_ours_all.append(cur_ours)

    if not useEM:
        missing_err_ours_all = np.array(missing_err_ours_all)
    data = {
        "features_count": FEATURES.shape[0],
        "k": np.array(k_all),
        "nb": np.array(missing_err_nb_all),
        "mean": np.array(missing_err_lr_mean_all),
        "median": np.array(missing_err_lr_median_all),
        "max": np.array(missing_err_lr_max_all),
        "min": np.array(missing_err_lr_min_all),
        "ours": missing_err_ours_all,
        "flip": np.array(missing_err_lr_flip_all),
        "em_impute": np.array(missing_err_lr_em_impute_all),
        "mice_impute": np.array(missing_err_lr_mice_impute_all),
        "knn_impute": np.array(missing_err_lr_knn_impute_all),
    }
    return data
def test_impute_value():
    """fast_knn using standard idw"""
    result = impy.fast_knn(data_m2, k=2)
    assert np.isclose(result[0][2], 8.38888888888889)
def test_impute_missing_values():
    """Imputation must leave no NaN behind."""
    assert not np.any(np.isnan(impy.fast_knn(data_m1)))
def test_return_type():
    """fast_knn must hand back an np.ndarray."""
    result = impy.fast_knn(data_m1)
    assert isinstance(result, np.ndarray)
# Demo: build a small random matrix, knock out a few cells, and compare
# impyute's kNN imputation against simple mean imputation.
import numpy as np

n = 5
# 5x5 uniform values in [0, 6)
arr = np.random.uniform(high=6, size=(n, n))
# Blank out three randomly chosen cells (may overlap, so <=3 NaNs).
for _ in range(3):
    arr[np.random.randint(n), np.random.randint(n)] = np.nan
print(arr)
print(20 * '_')
# Example of what arr may look like:
#np.array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, np.nan],
#          [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805],
#          [0.79802036, np.nan, 0.51729349, 5.06533123, 3.70669172],
#          [1.30848217, 2.08386584, 2.29894541, np.nan, 3.38661392],
#          [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]])
import impyute as impy

# kNN imputation vs. column-mean imputation, side by side.
print(impy.fast_knn(arr))
print(20 * '_')
print(impy.mean(arr))
print(20 * '_')
def test_return_type(knn_test_data):
    """fast_knn output should pass the shared NA/type check."""
    result = impy.fast_knn(knn_test_data)
    return_na_check(result)