def impute_ages(self, data):
    # drop_survived = data.drop(['Survived'], axis=1)
    column_titles = list(data)
    mice_results = fancyimpute.MICE().complete(np.array(data))
    results = pd.DataFrame(mice_results, columns=column_titles)
    # results['Survived'] = list(data['Survived'])
    return results
def Imputer(method):
    if method == 'MICE':
        imputer = fancyimpute.MICE(
            # n_nearest_columns is assumed to be defined in the enclosing scope
            n_nearest_columns=n_nearest_columns,
            min_value=0.0,
            verbose=False,
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('fancymean', 'zero', 'fancymedian', 'min', 'random'):
        imputer = fancyimpute.SimpleFill(
            min_value=0.0,
            # map 'fancymean'/'fancymedian' onto SimpleFill's 'mean'/'median'
            fill_method=method.replace('fancy', ''),
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('mean', 'median', 'most_frequent'):
        import sklearn.preprocessing
        imputer = sklearn.preprocessing.Imputer(
            strategy=method,
        )

        def impute(data):
            return imputer.fit_transform(data)
    elif method == 'drop':
        def impute(data):
            raise NotImplementedError
    return impute
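# A minimal usage sketch of the factory above (the toy matrix, the
# n_nearest_columns setting, and the pre-0.4 fancyimpute API with .complete()
# are assumptions, not part of the original):
import numpy as np
import fancyimpute

n_nearest_columns = 10          # assumed module-level setting read by Imputer('MICE')
X = np.random.randn(100, 5)
X[X < -1.5] = np.nan            # knock out some entries
impute = Imputer('MICE')        # returns a closure for the chosen method
X_filled = impute(X)            # dense array with the NaNs filled in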
def test_cross_validation_with_imputation():
    imputer = fancyimpute.MICE(
        n_imputations=2, n_burn_in=1, n_nearest_columns=25)
    train_data = (
        mhcflurry.dataset.Dataset.from_csv(
            get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
        .get_alleles(["HLA-A0201", "HLA-A0202", "HLA-A0301"]))
    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])
    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)
    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))
    models = HYPERPARAMETER_DEFAULTS.models_grid(
        activation=["tanh", "relu"],
        layer_sizes=[[4]],
        embedding_output_dim=[8],
        n_training_epochs=[3])
    print(models)
    df = train_across_models_and_folds(folds, models)
    print(df)
    assert df.test_auc.mean() > 0.6
def impute_value(self, df, method="MICE"):
    """Impute missing values using MICE (default), KNN, or SimpleFill."""
    if method == "MICE":
        return fi.MICE(verbose=False).complete(df)
    elif method == "KNN":
        return fi.KNN(k=4, verbose=False).complete(df)
    else:
        return fi.SimpleFill().complete(df)
def impute(data, **kwargs):
    # Impute missing values; kwargs are forwarded to fancyimpute.MICE
    # (e.g. init_fill_method='random').
    impute_missing = data
    impute_missing_cols = list(impute_missing)
    filled_soft = fancyimpute.MICE(**kwargs).complete(np.array(impute_missing))
    results = pd.DataFrame(filled_soft, columns=impute_missing_cols)
    assert results.isnull().sum().sum() == 0, 'Not all NAs removed'
    return results
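# A short usage sketch (the DataFrame is hypothetical; n_imputations is one of
# the fancyimpute.MICE keyword arguments the wrapper forwards unchanged):
df = pd.DataFrame({'a': [1.0, np.nan, 3.0, 4.0],
                   'b': [2.0, 4.0, np.nan, 8.0]})
filled = impute(df, n_imputations=10)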
def score_rent(self):
    X_train, X_test, y_train, y_test, contin_cols = self.preprocessing()
    # separate continuous and categorical columns
    con_train = X_train[contin_cols]
    cat_train = X_train[X_train.columns.difference(contin_cols)]
    # impute the continuous training data with fancyimpute MICE
    mice = fancyimpute.MICE(verbose=0)
    con_train = np.asarray(con_train)
    con_train_mice = mice.complete(con_train)
    # impute the categorical training data with fancyimpute KNN,
    # then round back to integer category codes
    cat_train = np.asarray(cat_train)
    cat_train_fancyknn = fancyimpute.KNN().complete(cat_train)
    cat_train_fancyknn = np.round(cat_train_fancyknn).astype(int)
    # apply a Box-Cox transformation to the non-negative continuous columns
    con_train_mice_bc = np.empty(con_train_mice.shape)
    from scipy import stats
    for i in range(len(contin_cols)):
        if np.argwhere(con_train_mice[:, i] < 0).size == 0:
            con_train_mice_bc[:, i] = stats.boxcox(con_train_mice[:, i] + 1e-5)[0]
        else:
            con_train_mice_bc[:, i] = con_train_mice[:, i]
    # one-hot encode the categorical training data
    enc = OneHotEncoder()
    enc = enc.fit(cat_train_fancyknn)
    oh = enc.transform(cat_train_fancyknn).toarray()
    cat_train_fancyknn_onehot = np.round(oh).astype(int)
    # concatenate the imputed training data
    X_train_imp = np.concatenate(
        (cat_train_fancyknn_onehot, con_train_mice_bc), axis=1)
    # feature selection using Lasso
    select_lassocv = SelectFromModel(LassoCV())
    select_lassocv = select_lassocv.fit(X_train_imp, y_train)
    # grid-search the Lasso regularization strength
    param_grid = {'alpha': np.logspace(-3, 0, 14)}
    print(param_grid)
    grid = GridSearchCV(Lasso(normalize=True, max_iter=1e6), param_grid, cv=10)
    # build a pipeline to prevent information leakage
    pipe_lassocv = make_pipeline(MinMaxScaler(), select_lassocv, grid)
    pipe_lassocv = pipe_lassocv.fit(X_train_imp, y_train)
    train_r2 = np.mean(
        cross_val_score(pipe_lassocv, X_train_imp, y_train, cv=5))
    return contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test
def impute(data):
    """Impute missing values in the Age, Deck, Embarked, and Fare features."""
    impute_missing = data.drop(['Survived', 'Train'], axis=1)
    impute_missing_cols = list(impute_missing)
    filled_soft = fancyimpute.MICE().complete(np.array(impute_missing))
    results = pd.DataFrame(filled_soft, columns=impute_missing_cols)
    results['Train'] = list(data['Train'])
    results['Survived'] = list(data['Survived'])
    assert results.isnull().sum().sum() == 0, 'Not all NAs removed'
    return results
def imputate_continuous(data_train):
    """Impute the continuous (non-object) columns of the training frame."""
    continuous = data_train.columns[data_train.dtypes != "object"]
    index_train = data_train.index
    X_train = data_train[continuous].as_matrix()
    try:
        X_train_fancy_mice = fancyimpute.MICE(verbose=0).complete(X_train)
        data_train_continuous = pd.DataFrame(X_train_fancy_mice,
                                             columns=continuous,
                                             index=index_train)
    except Exception:
        # fall back to the raw matrix if MICE fails (e.g. nothing to impute)
        data_train_continuous = pd.DataFrame(X_train,
                                             columns=continuous,
                                             index=index_train)
    return data_train_continuous
def parametric_input(df):
    '''
    Uses MICE to fill in missing continuous variables and adds indicator
    columns marking which values were missing. Returns a transformed
    dataframe.

    Parameters
    ----------
    df: pandas.DataFrame with float or int columns.
    '''
    # Fill values using MICE.
    mice_matrix = fancyimpute.MICE(n_imputations=50).complete(df.values)
    mice_df = pd.DataFrame(mice_matrix)
    mice_df.columns = df.columns
    mice_df.index = df.index
    # Add an indicator dataframe.
    ismissing_df = create_ismissing_df(df)
    return pd.concat([mice_df, ismissing_df], axis=1)
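# create_ismissing_df is not shown in this snippet; a plausible minimal
# version (an assumption, not the original helper) is:
def create_ismissing_df(df):
    # one boolean indicator column per input column, True where a value was missing
    return df.isnull().add_suffix('_ismissing')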
def test_imputation():
    imputer = fancyimpute.MICE(
        n_imputations=2, n_burn_in=1, n_nearest_columns=25)
    train_data = (
        mhcflurry.dataset.Dataset.from_csv(
            get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
        .get_alleles(["HLA-A0201", "HLA-A0202", "HLA-A0301"]))
    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])
    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)
    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))
def predict_rent(self):
    contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test = self.score_rent()
    con_test = X_test[contin_cols]
    cat_test = X_test[X_test.columns.difference(contin_cols)]
    # impute the continuous test data with fancyimpute MICE
    mice = fancyimpute.MICE(verbose=0)
    con_test = np.asarray(con_test)
    con_test_mice = mice.complete(con_test)
    # impute the categorical test data with fancyimpute KNN,
    # then round back to integer category codes
    cat_test = np.asarray(cat_test)
    cat_test_fancyknn = fancyimpute.KNN().complete(cat_test)
    cat_test_fancyknn = np.round(cat_test_fancyknn).astype(int)
    # apply a Box-Cox transformation to the non-negative continuous columns
    con_test_mice_bc = np.empty(con_test_mice.shape)
    from scipy import stats
    for i in range(len(contin_cols)):
        if np.argwhere(con_test_mice[:, i] < 0).size == 0:
            con_test_mice_bc[:, i] = stats.boxcox(con_test_mice[:, i] + 1e-5)[0]
        else:
            con_test_mice_bc[:, i] = con_test_mice[:, i]
    # one-hot encode the categorical test data with the encoder fit on train
    oh = enc.transform(cat_test_fancyknn).toarray()
    cat_test_fancyknn_onehot = np.round(oh).astype(int)
    print("Finished onehot")
    # concatenate the imputed test data
    X_test_imp = np.concatenate(
        (cat_test_fancyknn_onehot, con_test_mice_bc), axis=1)
    # predict with the pipeline trained in score_rent
    y_pred = pipe_lassocv.predict(X_test_imp)
    test_r2 = r2_score(y_test, y_pred)
    print(test_r2)
    return X_test, y_test, y_pred
import numpy as np
import fancyimpute as fi


def impute_value(df):
    mice_impute = fi.MICE().complete(df)
    return mice_impute


if __name__ == '__main__':
    # smoke test: knock one value out of a random matrix and re-complete it
    w = np.random.randn(10, 2)
    w_n = w.copy()
    w_n[3, 1] = np.nan
    w_c = fi.MICE().complete(w_n)
    print(w_c, w)
def preprocess(trainfile, testfile, outputdir, useless_attr, miss_threshold,
               xstrategy, ymin, ymax, ystrategy, fill_method="MICE",
               normal01=True):
    """Preprocess X and Y: matrix completion, standardization, scaling, etc.

    :param trainfile: string, path to the training set (d_train_20180102.csv)
    :param testfile: string, path to the test set (d_test_A_20180102.csv)
    :param outputdir: string, directory where the preprocessed files are saved
    :param useless_attr: list, indices of useless attributes to drop, e.g. [0, 1, 2, 3]
    :param miss_threshold: float, fraction of missing values above which a column is dropped, e.g. 0.7
    :param xstrategy: string, how to treat outliers in X, one of {"replace", "nothing"}
    :param ymin: float, minimum value for Y; anything below it is an outlier
    :param ymax: float, maximum value for Y; anything above it is an outlier
    :param ystrategy: string, how to treat outliers in Y, one of {"delete", "replace", "nothing"}
    :param fill_method: string, matrix-completion strategy, one of {"KNN", "SoftI", "MF", "MICE"}
    :param normal01: bool, if True rescale the results to [0, 1]; otherwise skip scaling
    :return: list, the scaled trainX, trainY, testX
    """
    # 0. Read the training and test sets
    train_XY = convert(trainfile)
    test_X = convert(testfile)
    print("Data loaded; starting preprocessing")
    # 1. Drop useless attribute columns
    train_id = train_XY[:, 0:1]
    test_id = test_X[:, 0:1]
    train_XY = np.delete(train_XY, useless_attr, axis=1)
    test_X = np.delete(test_X, useless_attr, axis=1)
    n_test = test_X.shape[0]
    info1 = ("1. Dropped useless attributes %s from train_XY and test_X; "
             "train_X.shape=%s, test_X.shape=%s"
             % (str(useless_attr), str(train_XY.shape), str(test_X.shape)))
    print(info1)
    # 2. Drop columns with too many missing values
    miss_mask = np.isnan(train_XY)
    n = miss_mask.shape[0]
    column_del = []  # columns to delete
    for i in range(miss_mask.shape[1]):
        miss_n = miss_mask[:, i].sum()
        if miss_n / n >= miss_threshold:
            column_del.append(i)
    train_XY = np.delete(train_XY, column_del, axis=1)
    test_X = np.delete(test_X, column_del, axis=1)
    info2 = ("2. Dropped attributes missing more than %f%% of their values "
             "from train_XY and test_X: %s"
             % (miss_threshold * 100, str(column_del)))
    print(info2)
    # 3. Denoise Y using manually chosen thresholds
    train_Y = train_XY[:, -1:]
    upper_mask = train_Y > ymax
    lower_mask = train_Y < ymin
    if ystrategy == "replace":
        train_Y[upper_mask] = ymax
        train_Y[lower_mask] = ymin
    elif ystrategy == "delete":
        index = np.array(np.arange(0, train_Y.shape[0], 1), ndmin=2).T
        chsn_mask = upper_mask | lower_mask
        train_XY = np.delete(train_XY, index[chsn_mask], axis=0)
        train_id = np.delete(train_id, index[chsn_mask], axis=0)
    elif ystrategy == "nothing":
        pass
    else:
        raise ValueError("'ystrategy' must be one of {nothing, replace, delete}")
    train_Y = train_XY[:, -1:]
    print("3. Denoised trainY (%s); trainXY.shape=%s" % (ystrategy, train_XY.shape))
    # 4. Denoise X using thresholds computed from a boxplot
    train_X = train_XY[:, :-1]
    all_X = np.concatenate([train_X, test_X], axis=0)
    attr_n = train_XY.shape[1] - 1
    # per-attribute (min, max) thresholds derived from the boxplot caps
    attr_min_max = np.zeros((attr_n, 2), dtype=np.float64)
    if xstrategy == "nothing":
        pass
    elif xstrategy == "replace":
        # replace outliers in X with the threshold values
        for i in range(attr_n):
            # shallow copy of the column: mutating crt_attr mutates all_X
            crt_attr = all_X[:, i:i + 1]
            miss = np.isnan(crt_attr)
            box_dic = plt.boxplot(crt_attr[~miss])
            crt_max = box_dic["caps"][0].get_ydata()[0]
            crt_min = box_dic["caps"][1].get_ydata()[0]
            if crt_max < crt_min:
                crt_max, crt_min = crt_min, crt_max
            attr_min_max[i, 0] = crt_min
            attr_min_max[i, 1] = crt_max
            crt_attr[miss] = 0
            upper_mask = crt_attr > crt_max
            lower_mask = crt_attr < crt_min
            upper_mask &= ~miss
            lower_mask &= ~miss
            crt_attr[upper_mask] = crt_max
            crt_attr[lower_mask] = crt_min
            crt_attr[miss] = np.nan
    else:
        raise ValueError("'xstrategy' must be one of {nothing, replace}")
    print("4. Denoised all of X (%s)." % xstrategy)
    # 5. Matrix completion
    if fill_method == "KNN":
        completer = fi.KNN(verbose=False)
    elif fill_method == "SoftI":
        completer = fi.SoftImpute(verbose=False)
    elif fill_method == "MF":
        completer = fi.MatrixFactorization(verbose=False)
    elif fill_method == "MICE":
        completer = fi.MICE(verbose=False)
    else:
        raise ValueError("'fill_method' must be one of {'KNN', 'SoftI', 'MF', 'MICE'}.")
    all_X_complete = completer.complete(all_X)
    print("5. Completed all_X with %s." % fill_method)
    # 6. Standardize, then scale to [0, 1]
    if normal01:
        X_nmler = StandardScaler()
        X_01 = MinMaxScaler()
        Y_nmler = StandardScaler()
        Y_01 = MinMaxScaler()
        X_nmler.fit(all_X_complete)
        Y_nmler.fit(train_Y)
        all_X_nml = X_nmler.transform(all_X_complete)
        train_Y_nml = Y_nmler.transform(train_Y)
        X_01.fit(all_X_nml)
        Y_01.fit(train_Y_nml)
        all_X_nml01 = X_01.transform(all_X_nml)
        train_Y_nml01 = Y_01.transform(train_Y_nml)
        final_train_X = all_X_nml01[:-n_test, :]
        final_test_X = all_X_nml01[-n_test:, :]
        final_train_Y = np.concatenate([train_Y_nml01, train_Y], axis=1)
    else:
        final_train_X = all_X_complete[:-n_test, :]
        final_test_X = all_X_complete[-n_test:, :]
        final_train_Y = train_Y
    print("6. Scaled all_X and train_Y to [0, 1] (%s)." % normal01)
    # 7. Save the data as set_kind_timestamp.csv
    print("7. Saving data to %s." % outputdir)
    timestamp = "0000"
    np.savetxt(outputdir + r"\train_X_" + timestamp + ".csv",
               final_train_X, delimiter=",")
    np.savetxt(outputdir + r"\test_X_" + timestamp + ".csv",
               final_test_X, delimiter=",")
    np.savetxt(outputdir + r"\train_Y_" + timestamp + ".csv",
               final_train_Y, delimiter=",")
    np.savetxt(outputdir + r"\train_id_" + timestamp + ".csv",
               train_id.astype(np.int64), delimiter=",")
    np.savetxt(outputdir + r"\test_id_" + timestamp + ".csv",
               final_test_X.astype(np.int64) if False else test_id.astype(np.int64),
               delimiter=",")
    return final_train_X, final_train_Y, final_test_X, train_id
from pandas import read_csv, DataFrame as df
import numpy as np
import fancyimpute
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import _tree
import graphviz

dataset = read_csv('realdata3.csv')
modifiedData = dataset.fillna(np.NaN)
print(modifiedData.head(5))
d = modifiedData
d1 = fancyimpute.MICE().complete(d)
newd = df(data=d1, index=d.index, columns=list(d.columns))
newd.to_csv('test2.csv')

# criterion="entropy", max_depth=7, min_samples_split=500, min_samples_leaf=500
outcome_var = 'BAD'
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth=12,
                                    min_samples_split=500, min_samples_leaf=200)
predictor_var = [
    'LOAN', 'MORTDUE', 'REASON', 'VALUE', 'DELINQ', 'DEROG', 'CLAGE',
    'Other', 'Office', 'Sales', 'ProfExe'
]
# the original snippet breaks off mid-call here; pairing the predictors with
# the outcome column is an assumption about the missing arguments
X_train, X_test, y_train, y_test = train_test_split(newd[predictor_var],
                                                    newd[outcome_var])
def fancy_impute(df, method='mice'):
    if method == 'knn':
        df = pd.DataFrame(data=fancyimpute.KNN(3).complete(df),
                          columns=df.columns, index=df.index)
    else:
        df = pd.DataFrame(data=fancyimpute.MICE().complete(df),
                          columns=df.columns, index=df.index)
    return df
# Names of the feature columns
column_list = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
               'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv('Pimadiabetes.csv', names=column_list, header=None)

# Treat zeros in the biological variables (everything except the number of
# pregnancies and the outcome) as missing values
replace_cols = [i for i in column_list if i not in ['Outcome', 'Pregnancies']]
df[replace_cols] = df[replace_cols].replace(0, np.nan)

# Median imputation for BMI, BloodPressure, and Glucose
df[['BMI', 'BloodPressure', 'Glucose']] = (
    df[['BMI', 'BloodPressure', 'Glucose']].fillna(df.median()))

# Multiple imputation (MICE) for SkinThickness and Insulin
df[['Insulin', 'SkinThickness']] = imp.MICE().complete(
    df[['Insulin', 'SkinThickness']])

# Split the dataframe into predictors and outcome
Y_data = df['Outcome']
X_data = df.drop(['Outcome'], axis=1)

# Split the data set into 80% train / 20% test
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, test_size=.2, random_state=13)

# Dictionary of classifiers
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, criterion='entropy'),
}
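# The snippet stops after building the classifier dictionary; a typical
# follow-on (an assumption, not part of the original) fits each model and
# reports its held-out accuracy:
for name, clf in dict_classifiers.items():
    clf.fit(X_train, Y_train)
    print(name, clf.score(X_test, Y_test))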
def preprocess(train_data, test_data, fill_times=1000, ignore_columns=['id']):
    """Preprocess the training and test data.

    1. Fill in missing values
    2. Normalize the data

    :param train_data: DataFrame, training data read in with pandas
    :param test_data: DataFrame, test data read in with pandas
    :return: X_train, y_train, X_test
    """
    feature_name = test_data.columns
    label_name = list(set(train_data.columns) - set(feature_name))
    train_label = train_data[label_name]
    train_feature = train_data[feature_name]
    test_feature = test_data
    test_feature = test_feature.set_index(test_feature.index + 1000)
    train_index = train_feature.index
    test_index = test_feature.index
    all_feature = pd.concat([train_feature, test_feature], axis=0)
    all_index = all_feature.index
    snp_columns = ['SNP%d' % i for i in range(1, 56)]  # SNP1 ... SNP55
    other_columns = list(set(all_feature.columns) - set(snp_columns))
    all_feature[snp_columns] = all_feature[snp_columns].fillna(0)
    snp_feature = all_feature[snp_columns]
    snp_scale = MinMaxScaler()
    snp_scale.fit([[0], [3]])
    snp_feature01 = snp_scale.transform(snp_feature)
    snp_feature_final = pd.DataFrame(snp_feature01,
                                     columns=snp_columns,
                                     index=all_index)
    ######################
    # Fill missing values
    ######################
    feature_complete = []
    t0 = time()
    for i in range(fill_times):
        mice_data = fi.MICE(verbose=False).complete(all_feature[other_columns])
        feature_complete.append(mice_data)
        remain_time = (time() - t0) / (i + 1) * (fill_times - i - 1)
        print("Fill %2d/%d, %.0f s remaining" % (i + 1, fill_times, remain_time))
    # average the repeated MICE fills
    feature_complete = np.array(feature_complete)
    feature_filled = feature_complete.mean(axis=0)
    all_feature[other_columns] = feature_filled
    other_feature = all_feature[other_columns]
    # round each group of columns back to its original precision
    int_column = ['年龄', '孕次', '产次', 'BMI分类', '收缩压', '舒张压',
                  'ALT', 'AST', 'Lpa', 'DM家族史', 'ACEID']
    float2_column = ['BUN', 'ApoA1', 'CHO', 'wbc', '孕前体重', 'HDLC', 'Cr',
                     'RBP4', 'ApoB', '分娩时', '身高', '糖筛孕周', 'TG',
                     'LDLC', 'hsCRP']
    float5_column = ['孕前BMI']
    float6_column = ['VAR00007']
    other_feature.loc[:, int_column] = other_feature[int_column].round()
    other_feature.loc[:, float2_column] = other_feature[float2_column].round(2)
    other_feature.loc[:, float5_column] = other_feature[float5_column].round(5)
    other_feature.loc[:, float6_column] = other_feature[float6_column].round(6)
    ######################
    # Normalize
    ######################
    id_feature = other_feature[ignore_columns]
    feature_need_nml = other_feature.drop(ignore_columns, axis=1)
    feature_nml = scale(feature_need_nml.values, axis=0)
    feature_nml01 = minmax_scale(feature_nml, axis=0)
    other_feature_final = pd.DataFrame(feature_nml01,
                                       columns=feature_need_nml.columns,
                                       index=all_index)
    feature_final = pd.concat(
        [id_feature, snp_feature_final, other_feature_final], axis=1)
    train_feature = pd.DataFrame(feature_final.loc[train_index],
                                 columns=feature_name)
    train_label = pd.DataFrame(train_label, columns=["label"])
    test_feature = pd.DataFrame(feature_final.loc[test_index],
                                columns=feature_name)
    return train_feature, train_label, test_feature
def impute(df):
    return pd.DataFrame(fancyimpute.MICE().complete(np.array(df)),
                        columns=list(df))
print('MF imputation')
print(get_loss(X_c, X_mf_c, Y_c))

# MICE imputation
# MICE cannot handle a singular matrix, so we run it batch-style: the data
# matrix of a single patient is singular, so we merge a batch of matrices
# and impute each batch with MICE
X_mice = []
n = len(X)
batch_size = 128
nb_batch = (n + batch_size - 1) // batch_size
for i in range(nb_batch):
    print('On batch {}'.format(i))
    x = np.concatenate(X[i * batch_size: (i + 1) * batch_size])
    y = np.concatenate(Y[i * batch_size: (i + 1) * batch_size])
    x_mice = fancyimpute.MICE(n_imputations=100, n_pmm_neighbors=20,
                              verbose=False).complete(x)
    X_mice.append(x_mice)
X_mice = np.concatenate(X_mice, axis=0)
X_c = np.concatenate(X, axis=0)
Y_c = np.concatenate(Y, axis=0)
print('MICE imputation')
print(get_loss(X_c, X_mice, Y_c))
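# get_loss is defined elsewhere in this script; one plausible shape (purely an
# assumption about this code, not the original definition) is a masked
# reconstruction error, where the third argument selects the entries to score:
def get_loss(x_true, x_imputed, mask):
    # mean squared error over the entries selected by mask
    return np.sum(mask * (x_true - x_imputed) ** 2) / np.sum(mask)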
# 3 - pip install fancyimpute
# If the install keeps failing, check the extra dependencies it pulls in;
# in my case it was ipykernel:
#   pip install ipykernel
# and then: pip install fancyimpute
import pandas as pd
import numpy as np
import fancyimpute

# split the numeric columns from the date/object columns
datoNum = data.select_dtypes(include=[np.float]).as_matrix()
fecha = data.select_dtypes(include=[np.object]).as_matrix()
datoNum = pd.DataFrame(datoNum)
fecha = pd.DataFrame(fecha)

# impute the numeric columns with MICE, then reattach the date columns
datoNumcomp = pd.DataFrame(fancyimpute.MICE().complete(datoNum))
datos_completos = pd.concat([fecha, datoNumcomp], axis=1)
datos_completos.columns = data.columns
datos_completos.index = data.index

# verify that no missing values remain
datos_completos.isnull().any().any()
def impute_mice(X):
    # fancyimpute is a pip-installable package offering kNN, nuclear norm
    # minimization, SoftImpute, MICE, matrix factorization, and low-rank
    # (iterative) SVD imputation.
    X_new = fancyimpute.MICE(n_imputations=100).complete(X)
    return X_new
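# A quick exercise of the helper above (the toy matrix and the pre-0.4
# fancyimpute API with .complete() are assumptions):
import numpy as np
import fancyimpute

X = np.arange(20, dtype=float).reshape(5, 4)
X[2, 1] = np.nan
X[4, 3] = np.nan
X_filled = impute_mice(X)            # dense ndarray with the NaNs replaced
assert not np.isnan(X_filled).any()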