def test_output_file_exists(test_data, results_path):
    """Run ``impy.util.compare`` on two imputations and check its log file.

    The dataset produced by ``test_data(SHAPE)`` is imputed with mode and
    with mean; both results are handed to ``impy.util.compare`` together
    with the labels. The first line written to ``results_path`` must parse
    to the expected per-imputation scores.
    """
    raw = test_data(SHAPE)
    targets = np.array([1, 0, 1, 1, 0])
    # Each imputer gets its own copy of the raw data.
    candidates = [
        ["mode", (impy.mode(np.copy(raw)), targets)],
        ["mean", (impy.mean(np.copy(raw)), targets)],
    ]
    impy.util.compare(candidates, log_path=results_path)

    expected = {'mode': [('SVC', 0.0)], 'mean': [('SVC', 0.0)]}
    with open(results_path, 'r') as log_file:
        first_line = next(log_file)
    assert ast.literal_eval(first_line) == expected
def test_impute(self):
    """Check mean-imputed column 5 row by row against R's mice output.

    Columns 1..9 of ``self.df`` are mean-imputed, column 5 of the result is
    rounded to whole numbers, and every row is compared with the
    ``bare_nuclei`` column of the first completed dataset that the R
    ``mice`` package produces from ``bc_data[, 2:10]``.
    """
    # Earlier SingleImputer-based attempt, kept for reference:
    # si = SingleImputer(strategy={'bare_nuclei':"pmm"})
    # df_impute = si.fit_transform(self.df.iloc[:,1:10])
    py_imputed = impy.mean(self.df.iloc[:, 1:10])
    py_imputed.iloc[:, 5] = py_imputed.iloc[:, 5].apply(
        lambda value: np.around(value, decimals=0), 1)

    # bc_data must already exist in the embedded R session.
    robjects.r('library(mice)')
    robjects.r('dataset_impute <- mice(bc_data[, 2:10], print=FALSE)')
    r_column = robjects.r('mice::complete(dataset_impute,1)$bare_nuclei')

    for row in range(py_imputed.shape[0]):
        py_value = py_imputed.iloc[:, 5][row]
        r_value = r_column[row]
        print('P ' + str(py_value) + 'R ' + str(r_value))
        self.assertEqual(py_value, r_value)
def setUp(self):
    """Prepare the mode/mean imputation results used by the tests.

    A 5x5 boolean mask with only position [0][0] set is passed to
    ``impy.dataset.test_data``. The resulting data is imputed with mode and
    with mean (each on its own copy) and every result is paired with the
    labels in ``self.imputed_mode``. The masked dataset itself is kept in a
    local variable only.
    """
    missing = np.zeros((5, 5), dtype=bool)
    missing[0][0] = True
    incomplete = impy.dataset.test_data(mask=missing)
    targets = np.array([1, 0, 1, 1, 0])

    self.imputed_mode = []
    for label, imputer in (("mode", impy.mode), ("mean", impy.mean)):
        self.imputed_mode.append(
            [label, (imputer(np.copy(incomplete)), targets)])
def impute_mean(df_soc, numerical_col):
    """Mean-impute, in place, every listed column that contains NaN.

    Each column named in ``numerical_col`` is viewed as an (n, 1) array.
    Columns without NaN are left untouched; columns with at least one NaN
    are replaced by the output of ``impy.mean``.

    Parameters
    ----------
    df_soc : pandas.DataFrame
        Frame to impute; it is modified in place.
    numerical_col : iterable
        Labels of the columns to inspect. Their values must be accepted by
        ``np.isnan``.

    Returns
    -------
    pandas.DataFrame
        The same ``df_soc`` object that was passed in.
    """
    # The previously created SimpleImputer instance was never used, so it
    # has been removed; the function no longer needs scikit-learn.
    for e in numerical_col:
        # impy.mean is fed a 2-D array, so reshape once and reuse it.
        column = df_soc[e].to_numpy().reshape(-1, 1)
        has_nan = np.isnan(column)
        print(has_nan)
        if has_nan.any():
            imputed = impy.mean(column)
            print("Imputed: ", e)
            df_soc[e] = imputed
    return df_soc
def imputeValues(filename):
    """Mean-impute the float64/int64 columns of a CSV file and save it.

    The CSV at ``filename`` is loaded, its float64 and int64 columns are
    copied into a separate frame and passed to ``impy.mean``, and the
    imputed values are written back into the loaded frame. The result is
    saved to ``../Frontend/comets_final.csv`` (without the index) and
    returned.
    """
    df = pd.read_csv(filename)

    # Only exact float64 / int64 columns take part in the imputation.
    numeric_names = [
        name for name in df.columns
        if df[name].dtype == np.float64 or df[name].dtype == np.int64
    ]
    dataset = pd.DataFrame()
    for name in numeric_names:
        dataset[name] = df[name]

    imputed = impy.mean(dataset)
    # Restore the original labels on the imputed frame.
    imputed.columns = dataset.columns

    for name in df.columns:
        if name not in imputed.columns:
            continue
        df[name] = imputed[name]

    df.to_csv("../Frontend/comets_final.csv", index=False)
    return df
import numpy as np

# Demo: build a random n x n matrix, blank out a few cells, then show the
# results of impyute's fast_knn and mean imputations.
n = 5
arr = np.random.uniform(high=6, size=(n, n))

# Three random draws; the same cell may be picked more than once.
for _ in range(3):
    row = np.random.randint(n)
    col = np.random.randint(n)
    arr[row, col] = np.nan

print(arr)
print(20 * '_')

# Example of a fixed input that can be substituted for the random one:
# np.array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, np.nan],
#           [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805],
#           [0.79802036, np.nan, 0.51729349, 5.06533123, 3.70669172],
#           [1.30848217, 2.08386584, 2.29894541, np.nan, 3.38661392],
#           [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]])

import impyute as impy

knn_filled = impy.fast_knn(arr)
print(knn_filled)
print(20 * '_')

mean_filled = impy.mean(arr)
print(mean_filled)
print(20 * '_')
#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# impyute is used to fill in the missing values
import impyute as impy

# Load the data and turn the "?" placeholders into NaN.
data_ruspini = pd.read_csv("data_ruspini_missing.csv").replace("?", np.nan)

# Float array of the raw data (still containing NaN) and its mean-imputed
# counterpart.
data_ruspini_array = np.array(data_ruspini, dtype=float)
data_baru = impy.mean(data_ruspini_array)

# The first three columns are exposed as x, y and label in both frames.
data_frame_ruspini_missing = pd.DataFrame({
    name: data_ruspini_array[:, position]
    for position, name in enumerate(('x', 'y', 'label'))
})

data_frame_ruspini_baru = pd.DataFrame({
    name: data_baru[:, position]
    for position, name in enumerate(('x', 'y', 'label'))
})

print(data_frame_ruspini_baru)
print(data_frame_ruspini_missing)

# visualization
def test_mean_impute_missing_values():
    """Mean imputation of the module-level ``data_m`` must leave no NaN."""
    nan_mask = np.isnan(impy.mean(data_m))
    assert not nan_mask.any()
def test_mean_impute_missing_values(self):
    """Mean imputation of ``self.data_m`` must leave no NaN behind."""
    nan_mask = np.isnan(impy.mean(self.data_m))
    self.assertFalse(nan_mask.any())
"""test_compare.py""" import numpy as np import impyute as impy mask = np.zeros((5, 5), dtype=bool) mask[0][0] = True data_m = impy.dataset.test_data(mask=mask) labels = np.array([1, 0, 1, 1, 0]) imputed_mode = [] imputed_mode.append(["mode", (impy.mode(np.copy(data_m)), labels)]) imputed_mode.append(["mean", (impy.mean(np.copy(data_m)), labels)]) def test_output_file_exists(): """ Small test to just check that it runs without fialing""" path = "./results.txt" impy.util.compare(imputed_mode, log_path=path)
def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
    """Build the masked working copies of ``T`` and the imputer table.

    Parameters
    ----------
    T
        Complete data. It is indexed by column labels and exposes
        ``.columns`` (presumably a pandas DataFrame; confirm with caller).
    mask
        Array-like wrapped into a DataFrame with ``T``'s column labels;
        cells equal to 1 are set to NaN in the working copies.
    algo
        Stored unchanged as ``self.algo``.
    miss_info
        Mapping that provides ``"ord_col"`` and ``"num_col"`` and, when
        ``obj`` is given, ``"ce_encoder"``.
    kf
        Stored unchanged as ``self.kf``.
    notobj
        Labels of the columns kept in ``self.X``, ``self.T`` and
        ``self.mask``.
    obj
        Labels of the categorical columns; ``None`` and ``[]`` both mean
        "no categorical columns".
    target
        Label of the target column in ``T``, or ``None``.

    Notes
    -----
    Any exception raised during setup is caught and printed, so a failure
    leaves the instance only partially initialised.
    """
    try:
        self.miss_info = miss_info
        self.columns = notobj
        self.ord_num_col = (
            self.miss_info["ord_col"] + self.miss_info["num_col"])

        # Template for the per-metric result containers.
        empty_metrics = {"rmse": {}, "nrmse": {}}
        self.rawT = T
        self.target = target
        self.target_y = T[target] if target is not None else None
        self.cv = deepcopy(empty_metrics)
        self.kf = kf
        self.MSE = deepcopy(empty_metrics)
        self.result = {}
        self.time_ck = {}

        # Working copy of the data with the masked cells blanked out.
        X = deepcopy(T)
        mask = pd.DataFrame(mask, columns=T.columns.tolist())
        self.rawmask = mask
        masked_cells = (mask == 1).values
        X[masked_cells] = np.nan

        # Treat an empty list of categorical columns like None.
        if obj in [None, []]:
            obj = None

        self.X = X[notobj]
        self.T = T[notobj]
        self.mask = mask[notobj]
        self.notobj = notobj

        if obj is None:
            # Numeric columns only.
            numeric = deepcopy(self.X)
            numeric[(self.mask == 1).values] = np.nan
            self.full_miss_data = deepcopy(numeric)
            self.full_mice_data = deepcopy(numeric)
        else:
            # Numeric + categorical columns: fill the categorical ones
            # with their most frequent value.
            mode_imputer = SimpleImputer(strategy="most_frequent")
            X[obj] = mode_imputer.fit_transform(X[obj])
            self.true_obj = T[obj]
            self.pd_obj = X[obj]

            # Copy of T with categories replaced via the encoder's
            # mappings (mapping index -> mapping values), then masked.
            encoded = deepcopy(T)
            encoder = miss_info["ce_encoder"]
            for entry in encoder.category_mapping:
                col, mapping = entry["col"], entry["mapping"]
                encoded[col] = encoded[col].replace(
                    dict(zip(mapping.index, mapping.values)))
            self.full_miss_data = encoded
            self.full_miss_data[masked_cells] = np.nan

            # Copy of T whose categorical values carry a "Cols_" prefix,
            # then masked.
            prefixed = deepcopy(T)
            for obj_col in obj:
                prefixed[obj_col] = "Cols_" + prefixed[obj_col]
            self.full_mice_data = prefixed
            self.full_mice_data[masked_cells] = np.nan

        self.algo = algo
        # Name -> callable. Lambdas keep the lookups lazy until call time.
        self.method = {
            "MissForest": lambda x: MissForest(verbose=0, n_jobs=-1).fit(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "mode": lambda x: impy.mode(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
            "MultipleImputer": lambda x: MultipleImputer(
                n=1, return_list=True
            ).fit_transform(pd.DataFrame(x)).values,
        }
    except Exception as e:
        print(e)
def test_mean(test_data):
    """Mean-impute the dataset from ``test_data(SHAPE)`` and run the NaN check."""
    return_na_check(impy.mean(test_data(SHAPE)))