def test_solver_fill_methods_with_low_rank_random_matrix():
    """Smoke test: every SimpleFill strategy keeps the missing-value MAE below 5."""
    fill_methods = ("zero", "mean", "median", "min", "random")
    for method in fill_methods:
        completed = SimpleFill(fill_method=method).fit_transform(XY_incomplete)
        _, missing_mae = reconstruction_error(
            XY,
            completed,
            missing_mask,
            name="Solver with fill_method=%s" % method)
        assert missing_mae < 5, \
            "Error too high for Solver with %s fill method!" % method
def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
    """Imputes NaN's using various filling methods like mean, zero, median, min, random.

    Args:
        fill_method: How NaN's will be exchanged. Possible values:
            'mean', 'zero', 'median', 'min', 'random'
        fill_missing: If True, transformer will fill NaN values by filling method
    """
    super().__init__()
    # Keep the flag and a pre-configured SimpleFill solver for later use.
    self.filler = SimpleFill(fill_method)
    self.fill_missing = fill_missing
def imputeMethodMedain(result, originData, missData, missRate, missPattern, dataType='continuous'):
    """Median-impute missData and append evaluation metrics to result.

    On any failure the metrics are recorded as np.inf and the string 'none'
    is returned in place of the imputed matrix.
    """
    imputationMethod = "median"
    try:
        imputedData = SimpleFill("median").fit_transform(missData)
        if dataType != 'continuous':
            # Collect the unique non-NaN observed values and hand them to
            # modifier so the imputed matrix is mapped back onto them.
            mark = [
                row[0]
                for row in pd.DataFrame(np.unique(missData)).dropna(axis=0).values
            ]
            imputedData = modifier(imputedData, mark)
        result = addResult(result, missRate, missPattern, imputationMethod,
                           evaluate.RMSE(originData, imputedData),
                           MAE(originData, imputedData),
                           masked_mape_np(originData, imputedData))
    except Exception as e:
        print(e)
        imputedData = 'none'
        result = addResult(result, missRate, missPattern, imputationMethod,
                           np.inf, np.inf, np.inf)
    return result, imputedData
def __mean(self, test_data):
    """Wrap fancyimpute mean imputation.

    Converts the given DataFrame to a numpy array (via mvp.df2np) and fills
    NaN entries with column means.
    """
    test_data = mvp.df2np(test_data, [], self.verbose)
    # fancyimpute >= 0.4 removed Solver.complete(); fit_transform() is the
    # supported equivalent (and is what the rest of this codebase uses).
    complete_data = SimpleFill(fill_method="mean").fit_transform(test_data)
    return complete_data
class FillNan(BaseTransformer):
    """Transformer that fills NaN entries with a simple per-column strategy."""

    def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
        """Imputes NaN's using various filling methods like mean, zero, median, min, random.

        Args:
            fill_method: How NaN's will be exchanged. Possible values:
                'mean', 'zero', 'median', 'min', 'random'
            fill_missing: If True, transformer will fill NaN values by filling method
        """
        super().__init__()
        self.fill_missing = fill_missing
        self.filler = SimpleFill(fill_method)

    def transform(self, X):
        """Fill missing values in X.

        Args:
            X: DataFrame with NaN's

        Returns:
            Dictionary with one key - 'X' corresponding to given DataFrame
            but without nan's
        """
        if self.fill_missing:
            # fancyimpute >= 0.4 removed Solver.complete(); fit_transform()
            # is the supported replacement.
            X = self.filler.fit_transform(X)
        return {'X': X}

    def load(self, filepath):
        """Restore a previously persisted filler and return self."""
        self.filler = joblib.load(filepath)
        return self

    def persist(self, filepath):
        """Serialize the configured filler to filepath."""
        joblib.dump(self.filler, filepath)
def fill_missing_values(df):
    """Drop overly-sparse features, then mean-impute any remaining NaNs.

    Args:
        df: input DataFrame, possibly containing NaN values.

    Returns:
        DataFrame with no missing values. Column labels are preserved;
        NOTE(review): the index is reset when rebuilding from the imputed
        array -- confirm callers do not rely on the original index.
    """
    df = drop_high_missing_features(df)
    n_missing = pd.isnull(df).sum().sum()
    if n_missing:
        # SimpleFill defaults to mean imputation. fancyimpute >= 0.4 removed
        # Solver.complete(); fit_transform() is the supported replacement.
        arr_complete = SimpleFill().fit_transform(df)
        df = pd.DataFrame(arr_complete, columns=df.columns)
    return df
def baseline_inpute(X_incomplete, method='mean', level=0):
    """Impute X_incomplete with one of several baseline algorithms.

    method selects the algorithm ('mean', 'knn', 'svd', 'mice', 'spectral');
    level (0/1/2) picks an increasingly aggressive hyper-parameter setting.
    Raises NotImplementedError for an unknown method.
    """
    if method == 'mean':
        return SimpleFill().fit_transform(X_incomplete)

    if method == 'knn':
        n_neighbors = [3, 10, 50][level]
        return KNN(k=n_neighbors, verbose=False).fit_transform(X_incomplete)

    if method == 'svd':
        n_cols = X_incomplete.shape[1]
        rank_options = [
            np.ceil((n_cols - 1) / 10),
            np.ceil((n_cols - 1) / 5),
            n_cols - 1,
        ]
        chosen_rank = int(rank_options[level])
        return IterativeSVD(rank=chosen_rank,
                            verbose=False).fit_transform(X_incomplete)

    if method == 'mice':
        n_iterations = [3, 10, 50][level]
        return IterativeImputer(
            max_iter=n_iterations).fit_transform(X_incomplete)

    if method == 'spectral':
        # default value for the sparsity level is with respect to the maximum singular value,
        # this is now done in a heuristic way
        shrinkage = [0.5, None, 3][level]
        return SoftImpute(
            shrinkage_value=shrinkage).fit_transform(X_incomplete)

    raise NotImplementedError
def impute_using_statistics(df, method='min'):
    """
    Imputes the missing values by the selected statistical property of each column

    :param df: The input dataframe that contains missing values
    :param method: The imputation method (min by default)
        "zero": fill missing entries with zeros
        "mean": fill with column means
        "median" : fill with column medians
        "min": fill with min value per column
        "random": fill with gaussian noise according to mean/std of column
    :return: the imputed dataframe
    """
    sf = SimpleFill(method)
    # fancyimpute >= 0.4 removed Solver.complete(); fit_transform() is the
    # supported replacement and returns a plain ndarray.
    imputed_matrix = sf.fit_transform(df.values)
    # Rebuild a DataFrame with the original index and column labels.
    imputed_df = pd.DataFrame(imputed_matrix, df.index, df.columns)
    return imputed_df
def _get_imputer(self):
    """Return the imputer instance that matches ``self.strategy``."""
    if self.strategy == "simple":
        return SimpleFill()
    # All remaining strategies map to a solver class constructed quietly.
    imputer_classes = {
        "knn": KNN,
        "mice": MICE,
        "matrix": MatrixFactorization,
        "soft": SoftImpute,
    }
    imputer_cls = imputer_classes[self.strategy]
    return imputer_cls(verbose=False)
def impute_missing_values(self, value_set, strategy): """ 对原始数据矩阵进行填充 :param value_set: 待处理的原始数据矩阵 :param strategy: 1:剔除缺失值 2:高频值填充 3:属性相关关系填充 4:数据对象相似性填充 :return: 进行填充过的数据矩阵,类型为list: (col1, col2, ...) """ # 以剔除缺失值的方法进行处理 if strategy == 1: new_value_set = [] for data_sample in value_set: new_data_sample = [] if None in data_sample or 'NA' in data_sample: continue else: for data in data_sample: new_data_sample.append(float(data)) new_value_set.append(new_data_sample) value_array = np.array(new_value_set) elif strategy in [2, 3, 4]: # 将value_set矩阵转化为numpy矩阵,并将其中的缺失值用np.nan替换 new_value_set = [] for data_sample in value_set: new_data_sample = [] for data in data_sample: if data and data != 'NA': new_data_sample.append(float(data)) else: new_data_sample.append(np.nan) new_value_set.append(new_data_sample) value_array = np.array(new_value_set) # 以最高频值进行填补,由于均为概率类的数值属性,所以用平均数代替 if strategy == 2: value_array = SimpleFill( fill_method="mean").complete(value_array) # 以属性相关关系进行填补,取相关性最高的三个属性做 elif strategy == 3: value_array = MICE(n_nearest_columns=3).complete(value_array) # 以数据对象相似性进行填补,取相似度最高的10个数据对象 elif strategy == 4: for batch in range(len(value_array) // 1000 + 1): value_array[batch*1000 : min(batch*1000+1000, len(value_array))] = \ KNN(k = 10).complete(value_array[batch*1000 : min(batch*1000+1000, len(value_array))]) else: raise ArgInputError("The strategy should be in (1,2,3,4)!") # 将填充过的数据矩阵按feature_col转换为n个col的list feature_col_list = [] for i in range(len(value_array[0])): feature_col_list.append(value_array[:, i].tolist()) return feature_col_list
def __init__(self, method, **kwargs):
    """Create an imputation wrapper around the selected solver.

    Args:
        method: one of "SoftImpute", "KNN", "Naive" or "II".
        **kwargs: forwarded to the underlying solver constructor
            (SoftImpute / KNN only).

    Raises:
        NotImplementedError: for "II" (marked untested) or any unknown
            method name.
    """
    self.clf = None
    self.method = method
    if method == "SoftImpute":
        self.clf = SoftImpute(**kwargs)
    elif method == "KNN":
        self.clf = KNN(**kwargs)
    elif method == "Naive":
        self.clf = SimpleFill()
    elif method == 'II':
        # BUG FIX: the original code did `raise ('NOT TESTED')`, which raises
        # a TypeError at runtime (exceptions must derive from BaseException).
        raise NotImplementedError('NOT TESTED')
        # Unreachable, kept to document the intended implementation:
        # self.clf = IterativeImputer(min_value=0)
    else:
        # Same fix: `raise ("Not Implemented method")` was a TypeError.
        raise NotImplementedError("Not Implemented method")
def load_data(p_miss, dataset="drive", mode="mcar", para=0.5, train=None,
              rand_seed=42):
    """Load a pickled dataset and corrupt it with missing values.

    Args:
        p_miss: fraction of entries to remove (MCAR mode).
        dataset: basename of the pickled files under ./data/.
        mode: "mcar" (uniform random missingness) or "mar" (missingness
            depending on the first third of the features).
        para: MAR strength parameter scaling the missingness probability.
        train: True -> second-half split, False -> first-half split,
            None -> the full dataset.
        rand_seed: numpy RNG seed for a reproducible missingness mask.

    Returns:
        (n, p, xmiss, xhat_0, mask, data_x, data_y): xmiss holds NaNs at the
        missing entries, xhat_0 is the zero-filled copy and mask is True at
        observed entries.
    """
    np.random.seed(rand_seed)
    with open("data/" + dataset + "_x", "rb") as file:
        data_x = pickle.load(file)
    with open("data/" + dataset + "_y", "rb") as file:
        data_y = pickle.load(file)
    n = data_x.shape[0]
    p = data_x.shape[1]
    perc_miss = p_miss
    xmiss = np.copy(data_x)

    if mode == "mcar":
        xmiss_flat = xmiss.flatten()
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement.
        n_missing = np.floor(n * p * perc_miss).astype(int)
        miss_pattern = np.random.choice(n * p, n_missing, replace=False)
        xmiss_flat[miss_pattern] = np.nan
        # in xmiss, the missing values are represented by nans
        xmiss = xmiss_flat.reshape([n, p])
    elif mode == "mar":
        fixed_len = int(np.floor(p / 3))
        # Missingness probability driven by the (always-observed) first third
        # of the features, squashed through a sigmoid.
        prob = para * np.mean(data_x[:, :fixed_len], 1)
        prob = sigmoid(prob, 0.5)
        for i in range(n):
            mask_tmp = np.random.choice([1, 0], size=p,
                                        p=[1 - prob[i], prob[i]])
            for j in range(fixed_len, p):
                if mask_tmp[j] == 0:
                    xmiss[i, j] = np.nan
        print("missing rate: ", np.sum(np.isnan(xmiss.flatten())) / (n * p))
    else:
        raise Exception("mode is not valid")

    # binary mask that indicates which values are missing
    mask = np.isfinite(xmiss)
    xhat_0 = np.copy(xmiss)
    xhat_0[np.isnan(xmiss)] = 0
    x_filled = SimpleFill().fit_transform(xmiss)
    print("MSE mean imputation full data: " + str(mse(x_filled, data_x, mask)))

    # Identity checks replace `== True` / `== False` / `== None`; behavior is
    # unchanged for the three documented values of `train`.
    if train is True:
        part = int(np.floor(n / 2))
        return (n - part), p, xmiss[part:, :], xhat_0[part:, :], \
            mask[part:, :], data_x[part:, :], data_y[part:, :]
    elif train is False:
        part = int(np.floor(n / 2))
        return part, p, xmiss[:part, :], xhat_0[:part, :], \
            mask[:part, :], data_x[:part, :], data_y[:part, :]
    elif train is None:
        return n, p, xmiss, xhat_0, mask, data_x, data_y
def residualize_baseline(df, baseline_vars=None):
    """Regress every column of df on the baseline variables; keep residuals.

    Args:
        df: DataFrame; missing values in the non-baseline columns are
            mean-imputed before fitting.
        baseline_vars: columns to regress out; defaults to ['Age', 'Sex'].
            (None replaces the original mutable-list default `[]`, a classic
            Python pitfall; passing an empty list still selects the default.)

    Returns:
        DataFrame of residuals with the baseline columns removed.
    """
    if not baseline_vars:
        baseline_vars = ['Age', 'Sex']
    # remove baseline vars
    baseline = df[baseline_vars]
    data = df.copy()
    data.drop(baseline_vars, axis=1, inplace=True)
    lr = LinearRegression()
    if data.isnull().sum().sum() > 0:
        # Mean-impute so LinearRegression can fit on complete data.
        imputed = SimpleFill().fit_transform(data)
        data = pd.DataFrame(imputed, index=data.index, columns=data.columns)
    for v in data:
        y = data[v]
        lr.fit(baseline, y)
        data[v] = y - lr.predict(baseline)
    return data
def determine_impute(df):
    """Iterates various imputation methods to find lower MSE"""
    candidates = [
        SimpleFill(),
        KNN(1),
        KNN(2),
        KNN(3),
        KNN(4),
        KNN(5),
        IterativeSVD(),
        MatrixFactorization(),
    ]
    mse_by_method = {}
    # Corrupt 70% of the tracked columns, then score each imputer against
    # the original frame.
    df_incomplete = create_test_df(df, 0.7, list(T40_dict.keys()))
    for idx, method in enumerate(candidates):
        print(method)
        completed = impute_df(df_incomplete, method)
        method_mse = ((df - completed)**2).sum().mean()
        label = str(idx) + method.__class__.__name__
        print(label, method_mse)
        mse_by_method[label] = method_mse
    return mse_by_method
def __init__(self, data, predict):
    """Store the dataset and target, pre-declare all derived attributes."""
    self.df = data
    self.predict = predict
    # Filled in later by the preprocessing / train-test split steps.
    self.X = None
    self.y = None
    self.X_scale = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.incomplete_data = None
    self.clean_data = None
    # Candidate imputation strategies to benchmark.
    self.methods = [
        SimpleFill(),
        KNN(1), KNN(2), KNN(3), KNN(4), KNN(5),
        IterativeSVD(),
        MatrixFactorization(),
    ]
def run_impute(self, X, state='train'):
    """Run every configured imputation method on X and store their average.

    Args:
        X: 2-D array with NaNs at missing entries.
        state: only 'train' triggers any work; other values are a no-op.

    Returns:
        0 (kept for backward compatibility with existing callers).

    Raises:
        ValueError: if an entry of self.impute_method is not recognized.
    """
    # Dispatch table replaces the original chain of independent `if`s, which
    # silently reused the previous iteration's imputer (or raised NameError)
    # when a method name was not recognized.
    imputer_factories = {
        'mean': SimpleFill,
        'KNN': KNN,
        'IterativeSVD': IterativeSVD,
        'MatrixFactorization': MatrixFactorization,
    }
    if state == 'train':
        self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
        for imp_method in self.impute_method:
            if imp_method not in imputer_factories:
                raise ValueError('Unknown imputation method: %r' % imp_method)
            imp_ope = imputer_factories[imp_method]()
            X_filled = imp_ope.fit_transform(X)
            self.train_data[imp_method] = X_filled
            self.impute_operator[imp_method] = imp_ope
            # Accumulate for the ensemble average across all methods.
            self.train_data['ave'] += X_filled
        self.train_data['ave'] /= len(self.impute_method)
    return 0
def prepareImputation(df):
    """Mean-impute every column of df except the column labeled 0.

    The imputed values are written back into df under shifted labels
    (original result column c becomes df column c + 1); df is returned.
    """
    feature_block = df.loc[:, df.columns != 0]
    imputed = pd.DataFrame(SimpleFill().fit_transform(feature_block))
    for col in imputed.columns:
        df[col + 1] = imputed[col].values
    return df
# NOTE(review): this chunk appears to belong to a larger experiment loop;
# test_data/train_data (DataFrames) and RNA_size, cancertype, missing_perc,
# sample_count, shuffle_cancer, datadir, loss_list_Mean, cancer_c, perc are
# all defined by surrounding code -- confirm.
new_dataset = pd.concat([test_data, train_data], axis=0)
train_data = train_data.values
test_data = test_data.values
print('train datasize:', train_data.shape, ' test datasize: ',
      test_data.shape)
# Hide the RNA block of the hold-out rows so it has to be imputed.
corrupted_holdout = test_data.copy()
corrupted_holdout[:, :RNA_size] = np.nan
df_combine = pd.DataFrame(
    np.concatenate([corrupted_holdout, train_data], axis=0))
print('name:', cancertype, ' missing rate:', missing_perc, 'train datasize:',
      train_data.shape, ' test datasize: ', test_data.shape)
############## Mean method
X_filled = SimpleFill(fill_method="mean").fit_transform(df_combine)
# Persist the imputed RNA block for the combined matrix.
RNA_txt = pd.DataFrame(X_filled[:, :RNA_size],
                       index=shuffle_cancer.index,
                       columns=shuffle_cancer.columns[:RNA_size])
RNA_txt.to_csv(datadir + '/filled_data/Mean_' + cancertype +
               str(missing_perc * 100) + '_' + str(sample_count) + '.csv')
# RMSE of the imputed hold-out RNA block against the true values.
nz = test_data[:, :RNA_size].size
nnm_mse = np.sqrt((np.linalg.norm(
    (X_filled[:test_data.shape[0], :RNA_size] -
     test_data[:, :RNA_size]))**2) / nz)
print("Mean method, RMSE: %f" % nnm_mse)
loss_list_Mean[cancer_c, perc, sample_count - 1] = nnm_mse
##############SVD
imputedData = modifier(imputedData, mark) score = evaluate.RMSE(originData, imputedData) ii_misc[0].append(score) ii_misc[1].append(MAE(originData, imputedData)) ii_misc[2].append(masked_mape_np(originData, imputedData)) ii_misc[3].append(TF(originData, imputedData)) logger.info( "fi IterativeImputer missing rate:{},RMSE:{}".format( i, score)) except: ii_misc[0].append(np.inf) ii_misc[1].append(np.inf) ii_misc[2].append(np.inf) ii_misc[3].append(np.inf) try: imputedData = SimpleFill("median").fit_transform(missData) imputedData = modifier(imputedData, mark) score = evaluate.RMSE(originData, imputedData) median_misc[0].append(score) median_misc[1].append(MAE(originData, imputedData)) median_misc[2].append(masked_mape_np(originData, imputedData)) median_misc[3].append(TF(originData, imputedData)) logger.info("fi median missing rate:{},RMSE:{}".format( i, score)) except: median_misc[0].append(np.inf) median_misc[1].append(np.inf) median_misc[2].append(np.inf) median_misc[3].append(np.inf) try: imputedData = impyute.imputation.cs.random(missData)
# Demo: impute a synthetic low-rank matrix with several fancyimpute solvers.
from fancyimpute import (BiScaler, KNN, NuclearNormMinimization, SoftImpute,
                         SimpleFill)

n = 200
m = 20
inner_rank = 4
# Low-rank ground truth: product of two thin Gaussian factors.
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# Baseline: replace every NaN with its column mean.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()
# simultaneously normalizes the rows and columns of your observed data,
def run(folder, name, patients, run_all, save_imputed):
    """Sweep a battery of imputation methods over a corrupted matrix.

    Loads the corrupted data file, imputes it with simple fills, IterativeSVD
    (ranks 1-24) and SoftImpute (several shrinkage values) -- plus MICE and
    KNN variants when run_all is truthy -- scores every result against the
    complete-cases ground truth, optionally saves each imputed matrix, and
    writes the score table to ./output/scores/<folder>/<name>.csv.

    The original body repeated the compute/score/save triple ~70 times; it is
    collapsed here into data-driven loops producing identical score keys,
    file names and insertion order.

    Args:
        folder: input folder for load_file; also the scores sub-folder name.
        name: corrupted-data file name (a '.csv' suffix is stripped).
        patients: unused; kept for interface compatibility with callers.
        run_all: when truthy, additionally run the MICE and KNN sweeps.
        save_imputed: when truthy, save every imputed matrix under
            ./output/sweeps/.
    """
    random_seed = 123
    np.random.seed(seed=random_seed)

    X_corrupt = load_file(folder, name)
    name = name.split('.csv')[0]
    print(name)
    end = X_corrupt.shape[0]
    print(end)
    # Ground truth: complete cases, truncated to the corrupted matrix height.
    X = np.genfromtxt('./data/completeCasesBoxCox.csv',
                      delimiter=',',
                      skip_header=1)[:end, 1:]

    scores = {}

    def _sweep(method_specs):
        """Run (score_key, file_suffix, solver) specs; return suffix->matrix."""
        completed = {}
        for score_key, file_suffix, solver in method_specs:
            X_imputed = solver.complete(X_corrupt)
            scores[score_key] = evaluate(X_imputed, X, X_corrupt)
            completed[file_suffix] = X_imputed
        return completed

    def _save_all(completed):
        """Write each imputed matrix to ./output/sweeps/<name><suffix>.csv."""
        for file_suffix, X_imputed in completed.items():
            np.savetxt('./output/sweeps/' + name + file_suffix + '.csv',
                       X_imputed, delimiter=',', newline='\n')

    shrink_labels = ['half', '1', '2', '4', '8', '16', '32', '64', '128']
    shrink_values = [0.5, 1, 2, 4, 8, 16, 32, 64, 128]
    base_specs = (
        [('simple_mean', '_simple_mean', SimpleFill(fill_method='mean')),
         ('simple_median', '_simple_median', SimpleFill(fill_method='median')),
         ('random', '_simple_random', SimpleFill(fill_method='random'))]
        + [('svd_%d' % rank, '_svd_%d' % rank, IterativeSVD(rank=rank))
           for rank in range(1, 25)]
        + [('si', '_si', SoftImpute())]
        + [('si_s_%s' % label, '_si_s_%s' % label,
            SoftImpute(shrinkage_value=value))
           for label, value in zip(shrink_labels, shrink_values)]
    )
    base_completed = _sweep(base_specs)
    if save_imputed:
        _save_all(base_completed)

    if run_all:
        lambda_labels = ['25', '10', '1', '01', '001']
        lambda_values = [0.25, 0.1, 0.01, 0.001, 0.0001]
        extra_specs = (
            [('MICE', '_MICE', MICE())]
            + [('MICE_col_lambda_reg_%s' % label,
                '_mice_col_lambda_reg_%s' % label,
                MICE(model=BayesianRidgeRegression(lambda_reg=value)))
               for label, value in zip(lambda_labels, lambda_values)]
            # Note the historical '_X' in the pmm base file name.
            + [('MICE_pmm', '_mice_pmm_X', MICE(impute_type='pmm'))]
            + [('MICE_pmm_lambda_reg_%s' % label,
                '_mice_pmm_lambda_reg_%s' % label,
                MICE(impute_type='pmm',
                     model=BayesianRidgeRegression(lambda_reg=value)))
               for label, value in zip(lambda_labels, lambda_values)]
            + [('knn_%d' % k, '_knn_%d' % k, KNN(k=k))
               for k in (1, 3, 9, 15, 30, 81, 243, 751, 2000, 6000)]
        )
        extra_completed = _sweep(extra_specs)
        if save_imputed:
            _save_all(extra_completed)

    print(scores)
    scores_df = pd.DataFrame().from_dict(scores.items())
    scores_df.columns = ['Method', 'Score']
    # NOTE(review): set_index is not assigned back, so the CSV keeps the
    # default integer index -- preserved as-is to keep the output format.
    scores_df.set_index('Method')
    scores_df.to_csv('./output/scores/' + folder + '/' + name + '.csv')
X_filled_knn = KNN(k=3).fit_transform(X_incomplete) # matrix completion using MICE X_filled_mice = IterativeImputer().fit_transform(X_incomplete) # matrix completion using Iterative SVD X_filled_svd = IterativeSVD(rank=3).fit_transform(X_incomplete) # matrix completion using Matrix Factorization X_filled_mf = MatrixFactorization(learning_rate=0.01, rank=3, l2_penalty=0, min_improvement=1e-6).fit_transform(X_incomplete) # matrix completion using Mean Fill X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete) # matrix completion using Median Fill X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete) # matrix completion using Zero Fill X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete) # matrix completion using Min Fill X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete) # matrix completion using Sampled Fill X_filled_randomfill = SimpleFill(fill_method='random').fit_transform(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding X_incomplete_normalized = BiScaler().fit_transform(X_incomplete) X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized) # print mean squared error for the imputation methods above
def impute_mean(X):
    """Return a copy of X with NaN entries replaced by column means.

    Args:
        X: 2-D array with NaNs marking missing entries.
    """
    # fancyimpute >= 0.4 removed Solver.complete(); fit_transform() is the
    # supported replacement.
    return SimpleFill("mean").fit_transform(X)
re_X = re_X.astype(int) X_filled_knn = modifier(X_filled_knn, s) X_filled_knn = X_filled_knn.astype(int) logger.info("knn MSE:{}".format(MSE(imputedData, X_filled_knn))) logger.info("knn res MSE:{}".format(MSE(imputedData, re_X))) logger.info("res change MSE:{}".format(MSE(X_filled_knn, re_X))) # X_filled_ii = IterativeImputer().fit_transform(mm_missData) # re_X = inp.revise(X_filled_ii, miss_location, # model=os.path.join(modelSavePath, '{}.pkl'.format(modelName))) # X_filled_ii = restore(min_max_scaler=min_max_scaler,s=s,data=X_filled_ii) # re_X = restore(min_max_scaler=min_max_scaler, s=s, data=re_X) # logger.info("ii MSE:{}".format(MSE(imputedData, X_filled_ii))) # logger.info("ii res MSE:{}".format(MSE(imputedData, re_X))) X_filled_sf = SimpleFill().fit_transform(missData) re_X = inp.revise(modifier(X_filled_sf, s), miss_location, model=os.path.join(modelSavePath, '{}.pkl'.format(modelName))) re_X = modifier(re_X, s) re_X = re_X.astype(int) X_filled_sf = modifier(X_filled_sf, s) X_filled_sf = X_filled_sf.astype(int) logger.info("sf MSE:{}".format(MSE(imputedData, X_filled_sf))) logger.info("sf res MSE:{}".format(MSE(imputedData, re_X))) logger.info("res change MSE:{}".format(MSE(X_filled_sf, re_X))) X_filled_me = SimpleFill("median").fit_transform(missData) re_X = inp.revise(modifier(X_filled_me, s), miss_location,
import numpy as np
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, SimpleFill

# Demo: impute a synthetic low-rank matrix with several fancyimpute solvers.
n = 200
m = 20
inner_rank = 4
# Low-rank ground truth: product of two thin Gaussian factors.
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# NOTE(review): Solver.complete() was removed in fancyimpute >= 0.4 in favor
# of fit_transform(); this script therefore needs an older fancyimpute --
# confirm the pinned version.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()
# simultaneously normalizes the rows and columns of your observed data,
def preprocess1(dataset, mf_imputer, labelencoder, delete_rows=True):
    """Encode categoricals and impute missing values in the loan dataset.

    Args:
        dataset: pandas DataFrame with loan columns (Gender, Married,
            Dependents, Self_Employed, Property_Area, Education,
            ApplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History).
        mf_imputer: sklearn-style imputer with fit_transform — presumably
            strategy='most_frequent'; TODO confirm at the call site.
        labelencoder: sklearn LabelEncoder, reused (refit) per column.
        delete_rows: if True, drop outlier rows; otherwise cap/fill values.

    Returns:
        The processed DataFrame (imputed via KNN matrix completion).
    """
    # Binary encodings; unmatched values (incl. NaN) pass through unchanged.
    dataset['Gender'] = dataset['Gender'].map(lambda x: 1 if x == 'Male' else 0 if x == 'Female' else x)
    dataset['Gender'] = mf_imputer.fit_transform(dataset[['Gender']]).ravel()
    dataset['Married'] = dataset['Married'].map(lambda x: 1 if x == 'Yes' else 0 if x == 'No' else x)
    # '3+' dependents collapsed to the single value 4.
    dataset['Dependents'] = dataset['Dependents'].map(lambda x: 4 if x == '3+' else x)
    dataset['Dependents'] = pd.to_numeric(dataset['Dependents'], errors='coerce')
    dataset['Self_Employed'] = dataset['Self_Employed'].map(lambda x: 1 if x == 'Yes' else 0 if x == 'No' else x)
    dataset['Property_Area'] = labelencoder.fit_transform(dataset['Property_Area'])
    dataset['Education'] = labelencoder.fit_transform(dataset['Education'])
    dataset['Gender'] = pd.to_numeric(dataset['Gender'], errors='coerce').astype(np.int8)
    dataset['Dependents'] = pd.to_numeric(dataset['Dependents'], errors='coerce').astype(np.int8)
    printValueCount(dataset)
    cols = dataset.columns
    from fancyimpute import (
        BiScaler, KNN, NuclearNormMinimization, SoftImpute, SimpleFill
    )
    # NOTE(review): .complete() is the pre-0.4 fancyimpute API; other parts of
    # this project use fit_transform() — confirm the pinned version.
    X_filled_knn = KNN(k=3).complete(dataset)
    X_filled_mean = SimpleFill("mean").complete(dataset)
    X_filled_softimpute = SoftImpute().complete(dataset)
    # Reconstruction errors are computed but only KNN's output is kept below.
    simplefill_mse = ((X_filled_mean - dataset) ** 2).mean()
    # print("KNN: %f" % simplefill_mse)
    knn_mse = ((X_filled_knn - dataset) ** 2).mean()
    # print("KNN: %f" % knn_mse)
    softImpute_mse = ((X_filled_softimpute - dataset) ** 2).mean()
    # print("SoftImpute MSE: %f" % softImpute_mse)
    dataset = getDataFrame(X_filled_knn, cols)
    # printValueCount(dataset)
    return dataset
    # NOTE(review): everything below this `return` is unreachable dead code —
    # it looks like an older manual-imputation path that the fancyimpute-based
    # path above superseded. Confirm the early return is intentional and
    # consider deleting the remainder.
    # df.sex = df.sex.map({'female': 1, 'male': 0})
    # dataset['Dependents'] = mf_imputer.fit_transform(dataset[['Dependents']]).ravel()
    # Impute Dependents using married people
    for row, v in dataset['Dependents'].iteritems():
        if (v is np.nan):
            # print(row, dataset.loc[row, 'Married'])
            if (dataset.loc[row, 'Married'] == 0):
                dataset.loc[row, 'Dependents'] = 0
            else:
                dataset.loc[row, 'Dependents'] = 1
    # Impute married missing values using dependents
    # NOTE(review): raise_ is presumably a raise-in-expression helper defined
    # elsewhere in this project — verify it exists.
    dataset['Dependents'] = dataset['Dependents'].map(
        lambda x: int(x) if str(x).isalnum() and not str(x).isalpha() else raise_('Number format exception'))
    # NOTE(review): `v is ''` / `v is np.nan` are identity comparisons with
    # literals (unreliable; SyntaxWarning on modern CPython). The `v != v`
    # NaN check plus `v == ""` already cover these cases.
    for row, v in dataset['Married'].iteritems():
        if (v != v or v is np.nan or v is None or v is '' or v == ""):
            if (dataset.loc[row, 'Dependents'] > 0):
                dataset.loc[row, 'Married'] = 1
            else:
                dataset.loc[row, 'Married'] = 0
    # dataset['Married'].groupby(dataset['Dependents']).value_counts()
    # Impute missing values of Self_Employed
    # plt.figure(figsize=(16, 6))
    # sns.boxplot(x=dataset['Self_Employed'], y=dataset['ApplicantIncome'])
    # plt.yscale("log")
    # plt.title('Self employed wise boxplot of income')
    # plt.xticks(rotation=90);
    # plt.figure(figsize=(16, 6))
    # sns.boxplot(x=dataset['Self_Employed'], y=dataset['CoapplicantIncome'])
    # plt.yscale("log")
    # plt.title('Self employed wise boxplot of CoapplicantIncome')
    # plt.xticks(rotation=90);
    # Mean income per Self_Employed group; index 1 = self-employed mean.
    incomemaan = dataset['ApplicantIncome'].groupby(dataset['Self_Employed']).mean()
    for row, v in dataset['Self_Employed'].iteritems():
        if (v != v or v is np.nan or v is None or v is '' or v == ""):
            print(row, v)
            # Above-self-employed-mean income => guess self-employed.
            if (dataset.loc[row, 'ApplicantIncome'] > incomemaan[1]):
                dataset.loc[row, 'Self_Employed'] = 1
            else:
                dataset.loc[row, 'Self_Employed'] = 0
    # Impute missing values of LoanAmount
    # f, ax = plt.subplots(1, 2, figsize=(14, 6))
    # ax1, ax2 = ax.flatten()
    # sns.distplot(dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean()), color='r', ax=ax1)
    # ax1.set_title('Distrbution of LoanAmount')
    # sns.boxplot(x=dataset['LoanAmount'], ax=ax2)
    # ax2.set_ylabel('')
    # ax2.set_title('Boxplot of LoanAmount')
    # Remove ouliers for ApplicantIncome remove >8000
    # sns.distplot(dataset['ApplicantIncome'])
    if (delete_rows == True):
        dataset = dataset[dataset['ApplicantIncome'] < 15000]
    # sns.distplot(dataset['ApplicantIncome'])
    # Remove ouliers for LoanAmount remove >500
    # sns.distplot(dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean()))
    if (delete_rows == True):
        dataset = dataset[dataset['LoanAmount'] < 450]
    else:
        # When keeping rows, default missing LoanAmount to the constant 100.
        dataset['LoanAmount'] = dataset['LoanAmount'].map(lambda x: 100 if x != x or x is None or x is np.nan else x)
    # sns.distplot(dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean()))
    # sns.jointplot(x=dataset['LoanAmount'], y=dataset['Property_Area'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['LoanAmount'], y=dataset['Loan_Amount_Term'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['LoanAmount'], y=dataset['Credit_History'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['LoanAmount'], y=dataset['ApplicantIncome'], color='g')  # Do not have any relation
    # sns.distplot(dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mean()))
    # sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['Property_Area'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['LoanAmount'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['Credit_History'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['ApplicantIncome'], color='g')  # Do not have any relation
    # It is not significent with any column so make it mean
    dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mean(), inplace=True)
    # Impute Credit_History
    # sns.distplot(dataset['Credit_History'].fillna(dataset['Credit_History'].mean()))
    # sns.jointplot(x=dataset['Credit_History'], y=dataset['Property_Area'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['Credit_History'], y=dataset['LoanAmount'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['Credit_History'], y=dataset['Loan_Amount_Term'], color='g')  # Do not have any relation
    # sns.jointplot(x=dataset['Credit_History'], y=dataset['ApplicantIncome'], color='g')  # Do not have any relation
    # It is not significent with any column so make it most_frequent
    dataset['Credit_History'] = mf_imputer.fit_transform(dataset[['Credit_History']]).ravel()
    printValueCount(dataset, 5)
    return dataset
# from hyperopt import hp from ray import tune from ray.tune.suggest.hyperopt import HyperOptSearch from utils.handle_missingdata import gene_missingdata # space = { # "lr": hp.loguniform("lr", 1e-10, 0.1), # "momentum": hp.uniform("momentum", 0.1, 0.9), # } #baseline插补方法 from ycimpute.imputer import mice from ycimpute.utils import evaluate from utils.base_impute import random_inpute from fancyimpute import IterativeImputer, SimpleFill imputation = { 'median': SimpleFill("median").fit_transform, 'random': random_inpute, 'mice': mice.MICE().complete, 'ii': IterativeImputer().fit_transform } class TAI(Solver): def __init__(self, theta=5, epochs=50, use_cuda=False, batch_size=64, early_stop=1e-06, normalizer='zero_score', iterations=30,
# Register the benchmark solvers on the shared results table, one family at
# a time. Entry order is significant for the rendered table, so it matches
# the registration order exactly.

# MICE with a Bayesian ridge model at three regularization strengths
# (lambda = 10^-1, 10^-2, 10^-3).
for neg_log_weight in (1, 2, 3):
    ridge = BayesianRidgeRegression(lambda_reg=10.0 ** -neg_log_weight)
    mice_solver = MICE(
        n_nearest_columns=25,
        n_imputations=20,
        n_burn_in=10,
        model=ridge,
        init_fill_method="mean",
    )
    table.add_entry(solver=mice_solver, name="MICE_%d" % neg_log_weight)

# Column-statistic baselines.
for method in ("mean", "median"):
    table.add_entry(solver=SimpleFill(fill_method=method),
                    name="SimpleFill_%s" % method)

# Row-oriented dense KNN at several neighborhood sizes.
for n_neighbors in (1, 5, 17):
    knn_solver = DenseKNN(k=n_neighbors, orientation="rows")
    table.add_entry(solver=knn_solver, name="DenseKNN_k%d" % (n_neighbors,))

# SoftImpute without rank constraints
for lam in (50, 200, 800):
    table.add_entry(solver=SoftImpute(shrinkage_value=lam),
                    name="SoftImpute_lambda%d" % (lam,))
# Benchmark table over the image dataset; no row scaling/centering so the
# raw reconstruction errors are comparable across solvers.
table = ResultsTable(images_dict=images_dict,
                     scale_rows=False,
                     center_rows=False)

# IterativeImputer entries.
# FIX(review): the original loop computed
#     regularization_weight = 10.0 ** -negative_log_regularization_weight
# but never used it — dead local removed. It was presumably meant to be
# passed to IterativeImputer; as written the three entries differ only in
# their name. Confirm the intended parameterization.
for negative_log_regularization_weight in [2, 3, 4]:
    table.add_entry(solver=IterativeImputer(
        n_nearest_columns=80,
        n_iter=50,
        n_burn_in=5,
    ),
                    name="IterativeImputer_%d" %
                    negative_log_regularization_weight)

# Column-statistic baselines.
for fill_method in ["mean", "median"]:
    table.add_entry(solver=SimpleFill(fill_method=fill_method),
                    name="SimpleFill_%s" % fill_method)

# Row-oriented KNN at several neighborhood sizes.
for k in [1, 3, 7]:
    table.add_entry(solver=KNN(k=k, orientation="rows"),
                    name="KNN_k%d" % (k, ))

# SoftImpute without rank constraints
for shrinkage_value in [25, 50, 100]:
    table.add_entry(solver=SoftImpute(shrinkage_value=shrinkage_value),
                    name="SoftImpute_lambda%d" % (shrinkage_value, ))

# Iterative truncated-SVD completion at fixed target ranks.
for rank in [10, 20, 40]:
    table.add_entry(solver=IterativeSVD(rank=rank, init_fill_method="zero"),
                    name="IterativeSVD_rank%d" % (rank, ))
import torch.utils.data
from pandas import isnull
from functools import partial
from logger import logger
from sklearn.preprocessing import StandardScaler
# Base Solver class and the autoencoder model variants.
from utils.tools import Solver
from dnn.autoencoder_test_partice import Autoencoder, ResAutoencoder, StockedAutoencoder, StockedResAutoencoder
from utils.normalizer import NORMALIZERS, RECOVER
# Baseline imputation methods.
from ycimpute.imputer import mice
from utils.base_impute import random_inpute
from fancyimpute import IterativeImputer, SimpleFill

# Method name -> callable(matrix with NaNs) -> imputed matrix.
# NOTE(review): mice.MICE() and IterativeImputer() are single instances shared
# across every call — confirm they are stateless between calls.
imputation = {
    'median': SimpleFill("median").fit_transform,
    'random': random_inpute,
    'mice': mice.MICE().complete,
    'ii': IterativeImputer().fit_transform
}
# Autoencoder architecture name -> model class.
AUTOENCODER_METHOD = {
    'Autoencoder': Autoencoder,
    'ResAutoencoder': ResAutoencoder,
    'StockedAutoencoder': StockedAutoencoder,
    'StockedResAutoencoder': StockedResAutoencoder
}
# Loss name -> instantiated torch loss module (shared instances).
LOSS = {'MSELoss': torch.nn.MSELoss(), 'CrossEntropyLoss': torch.nn.CrossEntropyLoss()}


class TAI(Solver):
    # Original hyper-parameters.
    def __init__(
            self,
            theta=5,
            epochs=50,
            use_cuda=False,
            batch_size=64,
            early_stop=1e-06,
            normalizer='zero_score',
            iterations=30,