def Em(name1, feature, n1, n2):
    """Run the EM solver on the selected columns of the feature CSV.

    The CSV path is hard-coded and the file is read with GBK encoding.

    :param name1: Unused; kept so existing callers keep working.
    :param feature: Column label, or list of column labels, to select.
    :param n1: Unused; kept so existing callers keep working.
    :param n2: Unused; kept so existing callers keep working.
    :return: Whatever ``EM().solve`` returns for the selected values.
    """
    data = pd.read_csv('E:\\feature.csv', encoding='gbk')
    # Select once. Indexing the selection with ``feature`` a second time was
    # redundant for a list of labels, and for a single label it turned into a
    # row-label lookup on the resulting Series instead of a column selection.
    selected_values = data[feature].values
    return EM().solve(selected_values)
def EMmake(name1, name2):
    """Run the EM solver on the standardized values loaded for the given names.

    :param name1: First argument forwarded to ``share.Csvopreation``.
    :param name2: Second argument forwarded to ``share.Csvopreation``.
    :return: The value returned by ``EM().solve`` on the standardized values.
    """
    # Only the first element of the returned triple is used here.
    flow_frame, _, _ = share.Csvopreation(name1, name2)
    raw_values = flow_frame.values

    # NaN-aware mean and standard deviation taken along axis 0.
    axis0_mean = np.nanmean(raw_values, axis=0)
    axis0_std = np.nanstd(raw_values, axis=0)
    standardized = share.Standardize(raw_values, axis0_mean, axis0_std)

    # EM algorithm
    return EM().solve(standardized)
def test_em():
    """Print the RMSE of EM imputation over the masked entries.

    Uses ``missing_data``, ``complete_data`` and ``missing_mask``, which are
    defined outside this function. Both the reference data and the imputed
    data go through ``min_max_scale`` before being compared.
    """
    imputed = EM().complete(missing_data)
    reference_scaled, _, _ = min_max_scale(complete_data)
    imputed_scaled, _, _ = min_max_scale(imputed)
    print(RMSE(reference_scaled[missing_mask], imputed_scaled[missing_mask]))
def evaluate(self, X_mis, X_full):
    """Return the RMSE of EM imputation over the entries missing from ``X_mis``.

    :param X_mis: Data in which NaN marks the missing entries.
    :param X_full: Reference data, indexed with the same missing index.
    :return: ``evaluate.RMSE`` of the reference values against the EM-filled values.
    """
    nan_mask = np.isnan(X_mis)
    missing_index = evaluate.get_missing_index(nan_mask)
    reference_values = X_full[missing_index]

    # The imputer receives a shallow copy rather than the caller's object.
    em_completed = EM().complete(copy.copy(X_mis))
    return evaluate.RMSE(reference_values, em_completed[missing_index])
def em_missing_value(self, **params) -> pd.DataFrame:
    """Fill the ``self.columns`` columns of ``self.df`` using EM imputation.

    Being based on a Gaussian distribution, EM can handle mixed data but
    performs comparatively better on continuous data. Under a variety of
    missing-data mechanisms, EM performs well relative to other methods.

    :param params: Accepted but not used by this method.
    :return: ``self.df`` with the selected columns replaced by the EM output.
    """
    raw_matrix = self.df[self.columns].values
    self.df[self.columns] = EM(max_iter=1000).complete(raw_matrix)
    return self.df
#pip install ycimpute
# EM algorithm
import seaborn as sns
import missingno as msno
import numpy as np
import pandas as pd
from ycimpute.imputer import EM

# Load the Titanic data set and keep only the float64/int64 columns.
df = sns.load_dataset("titanic").select_dtypes(include=["float64", "int64"])
print(df)
print(df.isna().sum())

# Remember the column labels so they can be restored after imputation.
var_name = list(df.columns)
print(var_name)

# The imputer is fed a NumPy array rather than the DataFrame.
arr_df = np.array(df)
print(arr_df)
print(arr_df.shape)

# Impute with EM, rebuild a DataFrame, and report the remaining null counts.
arr_dffill = EM().complete(arr_df)
df_fill = pd.DataFrame(data=arr_dffill, columns=var_name)
print(df_fill.isna().sum())
try: imputedData = knnimput.KNN(k=3).complete(missData) score = evaluate.RMSE(originData, imputedData) knn_rmse.append(score) logger.info("knn missing rate:{},RMSE:{}".format(i, score)) except: knn_rmse.append(np.nan) try: imputedData = mice.MICE().complete(missData) score = evaluate.RMSE(originData, imputedData) mice_rmse.append(score) logger.info("MICE missing rate:{},RMSE:{}".format(i, score)) except: mice_rmse.append(np.nan) try: imputedData = EM().complete(missData) score = evaluate.RMSE(originData, imputedData) em_rmse.append(score) logger.info("EM missing rate:{},RMSE:{}".format(i, score)) except: em_rmse.append(np.nan) try: imputedData = BiScaler().fit_transform(missData) imputedData = SoftImpute().fit_transform(imputedData) score = evaluate.RMSE(originData, imputedData) fi_bs_rmse.append(score) logger.info("fi BiScaler missing rate:{},RMSE:{}".format( i, score)) except: fi_bs_rmse.append(np.nan) try:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
from ycimpute.imputer import EM
from ycimpute.imputer import iterforest

# Iterative-forest imputation.
# NOTE(review): assumes `df` is a DataFrame defined earlier in the file - confirm.
var_names = list(df.columns)
n_df = np.array(df)
dff = pd.DataFrame(iterforest.IterImput().complete(n_df), columns=var_names)
dff.isna().sum()

### EM
df = sns.load_dataset('titanic').select_dtypes(include=['float64', 'int64'])
var_names = list(df.columns)
n_df = np.array(df)
dff = pd.DataFrame(EM().complete(n_df), columns=var_names)
dff.isna().sum()
# i.e. all of the missing values have now been filled.