def EM_imputation(self, dataset):
    # Only for numerical values: imputes the given data using expectation
    # maximization. The E-step computes the expected complete-data log
    # likelihood; the M-step re-estimates the parameters.
    import impyute as imp
    import pandas as pd

    df = dataset
    if dataset.select_dtypes(['number']).isnull().sum().sum() > 0:
        X = imp.em(dataset.select_dtypes(['number']).values)
        Z = dataset.select_dtypes(include=['object'])
        df = pd.DataFrame.from_records(
            X, columns=dataset.select_dtypes(['number']).columns)
        df = df.join(Z)
    return df
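A minimal usage sketch for the method above. The toy frame is hypothetical, and since `self` is never used in the body, the method can be exercised standalone for illustration:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'age': [23.0, np.nan, 31.0, 27.0],
    'income': [48.0, 51.5, np.nan, 39.0],
    'city': ['Oslo', 'Lima', 'Kyoto', 'Pune'],
})
# Numeric columns are EM-imputed; the object column is joined back untouched.
print(EM_imputation(None, toy))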
def compute_err_EM(Xtrain, ytrain, Xtest, ytest, n, p, G):
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)  # make NA data
    # Since the masking function changes the order of the observations,
    # we need to regenerate ytr from Xtr_nan.
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1, G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    Xtr_nan_list2 = []
    for g in range(G):
        Xtr_nan_list2.append(scaler.transform(Xtr_nan_list[g]))

    # Impute, classify, and get the error rate for the EM approach.
    start = time.time()
    Xtr_em = impy.em(Xtr_nan, loops=10)
    clf_em = skLDA().fit(Xtr_em, ytr)
    em_err = np.mean(clf_em.predict(Xtest).flatten() != ytest)
    em_time = time.time() - start
    return em_err, em_time
def Impute_EM(data_x):
    '''Impute missing values in data_x.

    Args:
      - data_x: original data with missing values

    Returns:
      - imputed_data: imputed data
    '''
    imputed_data = impy.em(data_x)
    return imputed_data
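A hypothetical call to the wrapper above, assuming impyute is importable as `impy`. impyute expects a 2-D float ndarray and returns an ndarray of the same shape with the NaNs filled in:

import numpy as np
import impyute as impy

data_x = np.array([[1.0, 2.0, np.nan],
                   [4.0, np.nan, 6.0],
                   [7.0, 8.0, 9.0]])
print(Impute_EM(data_x))  # same shape, NaNs replaced by EM estimates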
def compute_err_EM(Xtrain, ytrain, n, p, G):
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)  # make NA data
    # Since the masking function changes the order of the observations,
    # we need to regenerate ytr from Xtr_nan.
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1, G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
        Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain == g, :], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus)  # each row is the mean of a class
    S = [(sum(ytrain == g) - 1) * np.cov(Xtrain[ytrain == g, :], rowvar=False)
         for g in np.arange(G)]
    S = np.asarray(S) / len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    start = time.time()
    Xtr_em = impy.em(Xtr_nan, loops=10)
    # Note: Xtr_em follows the re-ordered ytr labels, not ytrain.
    mus_em = np.array(
        [np.mean(Xtr_em[ytr == g, :], axis=0) for g in np.arange(G)])
    S_em = np.array([
        (sum(ytr == g) - 1) * np.cov(Xtr_em[ytr == g, :], rowvar=False)
        for g in np.arange(G)
    ])
    S_em = S_em / len(ytr)
    em_err = err(mus, S, mus_em, S_em)
    em_time = time.time() - start
    return em_err, em_time, per_missing
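Both variants of compute_err_EM depend on a make_nan_list helper that is not shown. A hypothetical sketch of what it could look like, taking n as the per-class sample size and p as the number of entries masked per row (MCAR); the real helper may define n and p differently:

import numpy as np

def make_nan_list(X, y, G, n, p):
    """Return one per-class array of n rows, each with p entries masked."""
    out = []
    for g in range(G):
        Xg = np.asarray(X[y == g][:n], dtype=float).copy()
        for row in Xg:
            row[np.random.choice(Xg.shape[1], p, replace=False)] = np.nan
        out.append(Xg)
    return out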
def run_experiment_k_paper(X_test, y_test, clf, NB, a, setting):
    X_impute_mean = np.mean(X_test, axis=0)
    X_impute_median = np.median(X_test, axis=0)
    X_impute_max = np.max(X_test, axis=0)
    X_impute_min = np.min(X_test, axis=0)
    X_impute_flip = np.copy(1 - X_test)

    k_all = []
    missing_err_nb_all = []
    missing_err_lr_mean_all = []
    missing_err_lr_median_all = []
    missing_err_lr_max_all = []
    missing_err_lr_min_all = []
    missing_err_lr_flip_all = []
    missing_err_lr_em_impute_all = []
    missing_err_lr_mice_impute_all = []
    missing_err_lr_knn_impute_all = []

    useEM = setting["em"] if "em" in setting else False
    discreteFeatures = setting["discreteFeatures"] if "discreteFeatures" in setting else 1
    featureEncoding = setting["feature_encoding"] if "feature_encoding" in setting else None
    do_emImpute = setting["emImpute"] if "emImpute" in setting else False
    do_miceImpute = setting["miceImpute"] if "miceImpute" in setting else False
    do_knnImpute = setting["knnImpute"] if "knnImpute" in setting else False

    if useEM:
        missing_err_ours_all = {}
        for i in range(len(a)):
            missing_err_ours_all["ours_" + str(i)] = []
    else:
        missing_err_ours_all = []

    useProb = setting["prob"] if "prob" in setting else True
    function = setting["function"] if "function" in setting else None
    if function is None:
        if useProb:
            function = conditional_likelihood_k
        else:
            function = f1_score
    print("Using the following function:")
    print(function)

    repeat = setting["repeat"] if "repeat" in setting else 1
    FEATURES = setting["features"] if "features" in setting else None
    if FEATURES is None:
        NNN = X_test.shape[1]
        if featureEncoding is not None:
            NNN = len(featureEncoding)
        # Integer division: each logical feature may span several columns.
        FEATURES = np.array([i for i in range(NNN // discreteFeatures)])
    else:
        FEATURES = np.array(FEATURES)
    print("Possible features to remove: {}".format(FEATURES.shape[0]))

    K = setting["k"]
    for k in K:
        print("K = {}".format(k))
        if k > FEATURES.shape[0]:
            print("Early stop: only had {} features possible to remove".format(FEATURES.shape[0]))
            break

        cur_nb = []
        cur_lr_mean = []
        cur_lr_median = []
        cur_lr_max = []
        cur_lr_min = []
        cur_flip = []
        cur_em_impute = []
        cur_mice_impute = []
        cur_knn_impute = []
        if useEM:
            cur_ours = {}
            for i in range(len(a)):
                cur_ours["ours_" + str(i)] = []
        else:
            cur_ours = []

        for R in range(repeat):
            if R % 10 == 0:
                print("\t R = {}".format(R))
            X_test_mean = np.array(X_test, dtype='float')
            X_test_median = np.array(X_test, dtype='float')
            X_test_max = np.array(X_test, dtype='float')
            X_test_min = np.array(X_test, dtype='float')
            X_test_flip = np.array(X_test, dtype='float')
            X_test_em_impute = np.array(X_test, dtype='float')
            X_test_mice_impute = np.array(X_test, dtype='float')
            X_test_knn_impute = np.array(X_test, dtype='float')

            missing = np.zeros(X_test.shape, dtype=bool)
            for i in range(X_test.shape[0]):
                miss = np.random.choice(FEATURES, k, replace=False)
                if featureEncoding is not None and k > 0:
                    missK = []
                    for m in miss:
                        for z in featureEncoding[m]:
                            missK.append(z)
                    miss = np.copy(np.array(missK))
                elif discreteFeatures != 1 and k > 0:
                    missK = []
                    for m in miss:
                        for z in range(discreteFeatures):
                            missK.append(m * discreteFeatures + z)
                    miss = np.copy(np.array(missK))
                missing[i][miss] = True
                # if k > 0:
                #     print(missing[i])
                #     print(np.sum(missing[i]))
                X_test_mean[i][miss] = X_impute_mean[miss]
                X_test_median[i][miss] = X_impute_median[miss]
                X_test_max[i][miss] = X_impute_max[miss]
                X_test_min[i][miss] = X_impute_min[miss]
                X_test_flip[i][miss] = X_impute_flip[i][miss]
                X_test_em_impute[i][miss] = np.nan
                X_test_mice_impute[i][miss] = np.nan
                X_test_knn_impute[i][miss] = np.nan

            if do_emImpute:
                import time
                start = time.time()
                loops = 6
                print("\tStarting to EM-impute with loops = {}".format(loops))
                X_test_em_impute = impyute.em(X_test_em_impute, loops=loops)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_em_impute = np.zeros(X_test.shape)

            if do_miceImpute:
                import time
                start = time.time()
                print("\tStarting to MICE-impute")
                X_test_mice_impute = impyute.mice(X_test_mice_impute)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_mice_impute = np.zeros(X_test.shape)

            if do_knnImpute:
                import time
                start = time.time()
                print("\tStarting to kNN-impute")
                X_test_knn_impute = impyute.fast_knn(X_test_knn_impute)
                end = time.time()
                print("\tDone imputing! " + str(end - start))
            else:
                X_test_knn_impute = np.zeros(X_test.shape)

            lr_prob = clf.predict_proba(X_test)
            if useProb:
                cur_nb.append(function(lr_prob, predict_nbk_with_missing(X_test_mean, NB, missing, prob=True)))
                cur_lr_mean.append(function(lr_prob, clf.predict_proba(X_test_mean)))
                cur_lr_median.append(function(lr_prob, clf.predict_proba(X_test_median)))
                cur_lr_max.append(function(lr_prob, clf.predict_proba(X_test_max)))
                cur_lr_min.append(function(lr_prob, clf.predict_proba(X_test_min)))
                cur_em_impute.append(function(lr_prob, clf.predict_proba(X_test_em_impute)))
                cur_mice_impute.append(function(lr_prob, clf.predict_proba(X_test_mice_impute)))
                cur_knn_impute.append(function(lr_prob, clf.predict_proba(X_test_knn_impute)))
                # cur_flip.append(function(lr_prob, clf.predict_proba(X_test_flip)))
                if not useEM:
                    cur_ours.append(function(lr_prob, a.classify(X_test, missing, prob=True)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append(function(lr_prob, a[z].classify(X_test, missing, prob=True)))
            else:
                cur_nb.append(function(y_test, predict_nbk_with_missing(X_test_mean, NB, missing)))
                cur_lr_mean.append(function(y_test, clf.predict(X_test_mean)))
                cur_lr_median.append(function(y_test, clf.predict(X_test_median)))
                cur_lr_max.append(function(y_test, clf.predict(X_test_max)))
                cur_lr_min.append(function(y_test, clf.predict(X_test_min)))
                cur_em_impute.append(function(y_test, clf.predict(X_test_em_impute)))
                cur_mice_impute.append(function(y_test, clf.predict(X_test_mice_impute)))
                cur_knn_impute.append(function(y_test, clf.predict(X_test_knn_impute)))
                # cur_flip.append(function(y_test, clf.predict(X_test_flip)))
                if not useEM:
                    cur_ours.append(function(y_test, a.classify(X_test_mean, missing)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append(function(y_test, a[z].classify(X_test_mean, missing)))

        k_all.append(k)
        missing_err_nb_all.append(cur_nb)
        missing_err_lr_mean_all.append(cur_lr_mean)
        missing_err_lr_median_all.append(cur_lr_median)
        missing_err_lr_max_all.append(cur_lr_max)
        missing_err_lr_min_all.append(cur_lr_min)
        missing_err_lr_flip_all.append(cur_flip)
        missing_err_lr_em_impute_all.append(cur_em_impute)
        missing_err_lr_mice_impute_all.append(cur_mice_impute)
        missing_err_lr_knn_impute_all.append(cur_knn_impute)
        if useEM:
            for i in cur_ours:
                missing_err_ours_all[i].append(cur_ours[i])
        else:
            missing_err_ours_all.append(cur_ours)

    if not useEM:
        missing_err_ours_all = np.array(missing_err_ours_all)

    data = {
        "features_count": FEATURES.shape[0],
        "k": np.array(k_all),
        "nb": np.array(missing_err_nb_all),
        "mean": np.array(missing_err_lr_mean_all),
        "median": np.array(missing_err_lr_median_all),
        "max": np.array(missing_err_lr_max_all),
        "min": np.array(missing_err_lr_min_all),
        "ours": missing_err_ours_all,
        "flip": np.array(missing_err_lr_flip_all),
        "em_impute": np.array(missing_err_lr_em_impute_all),
        "mice_impute": np.array(missing_err_lr_mice_impute_all),
        "knn_impute": np.array(missing_err_lr_knn_impute_all),
    }
    return data
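A hypothetical invocation of the experiment above. clf, NB, and `a` come from the surrounding project (a fitted probabilistic classifier, a naive Bayes model, and the authors' own classifier), so only the settings dict, whose keys are all read inside the function, is shown concretely:

setting = {
    "k": [0, 1, 2, 4],   # how many features to hide per instance
    "repeat": 10,        # random missingness patterns per k
    "prob": True,        # compare predicted probabilities, not labels
    "emImpute": True,    # run the impyute EM baseline
    "miceImpute": False,
    "knnImpute": False,
}
data = run_experiment_k_paper(X_test, y_test, clf, NB, a, setting)
print(data["k"], data["em_impute"].shape)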
def test_em_(test_data):
    data = test_data(SHAPE)
    imputed = impy.em(data)
    return_na_check(imputed)
def test_impute_missing_values(self):
    """After imputation, no NaNs should exist."""
    imputed = impy.em(self.data_m)
    self.assertFalse(np.isnan(imputed).any())
def test_return_type(self):
    """Check return type; should return an np.ndarray."""
    imputed = impy.em(self.data_m)
    self.assertTrue(isinstance(imputed, np.ndarray))
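The two unittest methods above assume a fixture along these lines (a sketch; class name and data are hypothetical):

import unittest
import numpy as np
import impyute as impy

class TestEM(unittest.TestCase):
    def setUp(self):
        # Small random matrix with a couple of entries knocked out.
        self.data_m = np.random.rand(5, 5)
        self.data_m[0, 0] = np.nan
        self.data_m[3, 2] = np.nan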
        mu = col[~np.isnan(col)].mean()
        std = col[~np.isnan(col)].std()
        # Draw a candidate value from the column's empirical normal distribution.
        col[x_cat_i] = np.random.normal(loc=mu, scale=std)
        delta = (col[x_cat_i] - previous) / previous
        if delta < 0.1:
            data[x_i, y_i] = col[x_cat_i]
            break
        data[x_i, y_i] = col[x_cat_i]
        previous = col[x_cat_i]
    return data


if __name__ == '__main__':
    data_comp = np.loadtxt("spam.txt", delimiter=",")  # read the data
    data_uncomp = rand_delete(data_comp, 0.05)  # randomly delete entries from the loaded data
    m = get_mask(data_uncomp)  # 1 marks a missing entry, 0 a complete one
    # data_comp1 = StandardScaler().fit_transform(data_comp)
    # data_comp2 = MinMaxScaler().fit_transform(data_comp)
    data_comp = Normalizer().fit_transform(data_comp)
    data_imp = em(data_uncomp, 50)  # impute with our own EM implementation
    data_impy = impy.em(data_uncomp)  # impute with impyute's EM
    m_data_imp = data_imp * m
    m_data_raw = data_comp * m
    m_data_impy = data_impy * m  # multiply by m so only the imputed entries are compared
    error_impy = np.mean(np.square(m_data_raw - m_data_impy))
    print("error_impy:", error_impy)
    error = np.mean(np.square(m_data_raw - m_data_imp))
    print("define em error:", error)
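Plausible reconstructions of the two helpers this script relies on, consistent with its comments (sketches only; the original definitions are not shown):

import numpy as np

def rand_delete(data, frac):
    """Set a random fraction `frac` of the entries to NaN (MCAR)."""
    out = np.array(data, dtype=float)
    out[np.random.rand(*out.shape) < frac] = np.nan
    return out

def get_mask(data):
    """Return a mask where 1 marks a missing entry, 0 a complete one."""
    return np.isnan(data).astype(float)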
from impyute import em
import numpy as np
import xlrd

book = xlrd.open_workbook('Matlab Codes/Gaussian Graphical Models/Counties/allLocs.xlsx')
sheet = book.sheet_by_name('sheet1')
y = [[]]
for r in range(1, sheet.nrows):
    try:
        y[0].append(float(sheet.cell_value(r, 1)))
    except:
        y[0].append(np.nan)

data = np.array(y).T
print(data)
n = em(data, loops=50)
p = n.T.tolist()[0]

import pandas as pd
sh = pd.read_excel('Matlab Codes/Gaussian Graphical Models/Counties/allLocs.xlsx',
                   sheet_name='sheet1', na_values=[np.nan])
t = {}
p = list(sh.index.unique())
for j in p:
    t[j] = sh.loc[j]
y = np.array([sh['TEMP'].to_numpy()])
print(y)
u = {}
for j in p:
    u[j] = np.array([t[j]['TEMP'].to_numpy()])
for j in p:
    y = u[j]
    if np.isnan(y.T):
def test_return_type():
    """Check return type; should return an np.ndarray."""
    imputed = impy.em(data_m)
    assert isinstance(imputed, np.ndarray)
# iterative imputation process
# while nmf_model.reconstruction_err_ ** 2 > 10:
while nmf_model.reconstruction_err_ > 2.5:
    W = nmf_model.fit_transform(imputed)
    imputed[~msk] = W.dot(nmf_model.components_)[~msk]
    print(nmf_model.reconstruction_err_)

# [Imputation mode: MICE]
imputed = impy.mice(df.values[:split_idx])

# [Imputation mode: k-NN]
imputer = KNNImputer(n_neighbors=10)  # default: 2
imputed = imputer.fit_transform(df.values[:split_idx])

# [Imputation mode: EM]
imputed = impy.em(df.values[:split_idx], loops=50)

# [Imputation mode: LOCF]
imputed = df.copy().iloc[:split_idx].ffill()
imputed = imputed.fillna(0)
imputed = imputed.values

# [Imputation mode: NOCB]
imputed = df.copy().iloc[:split_idx].bfill()
imputed = imputed.fillna(0)
imputed = imputed.values

# [No imputation: Case Deletion]
imputed = df.drop(df[df.isnull().any(axis=1)].index).copy()

# [No imputation: Zero Substitution]
def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
    try:
        self.miss_info = miss_info
        self.columns = notobj
        self.ord_num_col = self.miss_info["ord_col"] + self.miss_info["num_col"]
        metric = {"rmse": {}, "nrmse": {}}
        self.rawT = T
        self.target = target
        if target is not None:
            self.target_y = T[target]
        else:
            self.target_y = None
        self.cv = {}
        self.cv.update(deepcopy(metric))
        self.kf = kf
        self.MSE = {}
        self.MSE.update(deepcopy(metric))
        self.result = {}
        self.time_ck = {}

        X = deepcopy(T)
        mask = pd.DataFrame(mask, columns=T.columns.tolist())
        self.rawmask = mask
        X[(mask == 1).values] = np.nan
        if obj in [None, []]:
            obj = None

        ##########################################
        self.X = X[notobj]
        self.T = T[notobj]
        self.mask = mask[notobj]
        self.notobj = notobj
        ##########################################
        if obj is not None:
            ############ Numeric + Category #################
            cat_impute = SimpleImputer(strategy="most_frequent")
            X[obj] = cat_impute.fit_transform(X[obj])
            self.true_obj = T[obj]
            self.pd_obj = X[obj]
            ###################################################
            TT = deepcopy(T)
            cat_encoder = miss_info["ce_encoder"]
            for k in cat_encoder.category_mapping:
                col, map_ = k["col"], k["mapping"]
                TT[col] = TT[col].replace(
                    dict(zip(k["mapping"].index, k["mapping"].values)))
            self.full_miss_data = TT
            self.full_miss_data[(mask == 1).values] = np.nan

            mice_data = deepcopy(T)
            for obj_col in obj:
                mice_data[obj_col] = "Cols_" + mice_data[obj_col]
            self.full_mice_data = mice_data
            self.full_mice_data[(mask == 1).values] = np.nan
        else:
            ########## Numeric ###############################
            num_data = deepcopy(self.X)
            num_data[(self.mask == 1).values] = np.nan
            self.full_miss_data = deepcopy(num_data)
            self.full_mice_data = deepcopy(num_data)
            ###################################################

        self.algo = algo
        self.method = {
            "MissForest": lambda x: MissForest(verbose=0, n_jobs=-1).fit(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "mode": lambda x: impy.mode(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
            "MultipleImputer": lambda x: MultipleImputer(n=1, return_list=True)
                .fit_transform(pd.DataFrame(x)).values,
        }
    except Exception as e:
        print(e)
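How the dispatch table is consumed is not shown here; a hypothetical dispatcher could look like this (a sketch, not part of the original class):

def impute(self):
    # Run the configured algorithm on the masked numeric matrix.
    # Note: the "MissForest" entry returns a fitted model rather than an
    # imputed array, so it would need an extra transform step.
    return self.method[self.algo](self.X.values)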
def perf_em_imput(dfs_arg):
    em_data = [
        impy.em(dfs_arg[i].values, loops=50, dtype='cont')
        for i in range(len(dfs_arg))
    ]
    return [pd.DataFrame(data=em_data[i]) for i in range(len(dfs_arg))]
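Hypothetical usage of the helper above, EM-imputing a list of numeric DataFrames in one go (assumes an impyute version that accepts the dtype keyword, as the snippet does):

import numpy as np
import pandas as pd

dfs = [pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [0.5, 1.5, np.nan]}),
       pd.DataFrame({'a': [2.0, 4.0, np.nan], 'b': [np.nan, 1.0, 2.0]})]
imputed_dfs = perf_em_imput(dfs)
print(imputed_dfs[0])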
med_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
med_imputer = med_imputer.fit(miss_data_x)
imputed_data_med = med_imputer.transform(miss_data_x)

# Report the RMSE performance
rmse_med = rmse_loss(ori_data_x, imputed_data_med, data_m)
print()
print('RMSE Performance: ' + str(np.round(rmse_med, 4)))

#%%
# EM imputation
import impyute as impy

data_missing = pd.DataFrame(miss_data_x)
em_imputed = impy.em(miss_data_x)

rmse_em = rmse_loss(ori_data_x, em_imputed, data_m)
print()
print('RMSE Performance: ' + str(np.round(rmse_em, 4)))

pd.DataFrame(imputed_data_med).to_csv(
    os.path.join(os.getcwd(), '[10] data/' + data_name + '_imp_med' + '.csv'),
    index=False)
pd.DataFrame(em_imputed).to_csv(
    os.path.join(os.getcwd(), '[10] data/' + data_name + '_imp_EM' + '.csv'),
    index=False)

# RMSE
# GAIN:              0.0905
# median imputation: 0.1095
# EM imputation:     0.1453
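rmse_loss comes from the surrounding GAIN-style codebase. A minimal sketch of what it is assumed to compute, RMSE over the missing entries only (data_m taken as 1 for observed and 0 for missing cells; the original is also assumed to normalize both matrices first):

import numpy as np

def rmse_loss(ori_data, imputed_data, data_m):
    # Squared error restricted to cells that were missing (data_m == 0).
    nominator = np.sum(((1 - data_m) * ori_data - (1 - data_m) * imputed_data) ** 2)
    denominator = np.sum(1 - data_m)
    return np.sqrt(nominator / float(denominator))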
def em(data):
    return impyute.em(data.values)
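A hypothetical round trip through the wrapper above: impyute returns a plain ndarray, so the caller rebuilds the DataFrame if needed.

import numpy as np
import pandas as pd
import impyute

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [2.0, 5.0, np.nan]})
filled = pd.DataFrame(em(df), index=df.index, columns=df.columns)
print(filled)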
df = pd.read_csv(input_file, index_col=[0], parse_dates=[0], date_parser=parser)
arr = df.values

"""
Splitting indices for each data set
 - Air Quality: -1 (impute all data instances)
 - GECCO2015: 154140
"""
split_idx = 154140

# Imputation mode: EM
imputed_em = impy.em(arr[:split_idx], loops=50)  # default: 50

# [Option] aggregate train (imputed) / valid (not imputed) data
imputed_em = np.append(imputed_em, arr[split_idx:], axis=0)

# Convert to DataFrame (restores the datetime index needed for resampling)
imputed_em = pd.DataFrame(imputed_em, index=df.index, columns=df.columns)

# [Option] resampling (requires the datetime index, hence after the conversion)
imputed_em = imputed_em.resample('D').mean()

# Visualizing comparison between actual and imputed values
plt.plot(imputed_em[df.columns[0]], label='imputed')
plt.plot(df[df.columns[0]], label='actual')
plt.legend(loc='best')
plt.show()
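Assumed setup for the snippet above (hypothetical values; the real file path and timestamp format depend on the data set used):

import pandas as pd
parser = lambda s: pd.to_datetime(s, format='%Y-%m-%d %H:%M:%S')
input_file = 'gecco2015.csv'  # placeholder path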
def test_impute_missing_values():
    """After imputation, no NaNs should exist."""
    imputed = impy.em(data_m)
    assert not np.isnan(imputed).any()