Ejemplo n.º 1
0
    def EM_imputation(self, dataset):
        """Impute missing numeric values with expectation maximization.

        Only numeric columns are imputed (through ``impyute.em``).

        Args:
            dataset: pandas DataFrame to impute.

        Returns:
            ``dataset`` itself (same object) when no numeric value is
            missing. Otherwise a new DataFrame holding the imputed numeric
            columns followed by the object-dtype columns, on the original
            index. Columns that are neither numeric nor object dtype are not
            carried over.
        """
        numeric = dataset.select_dtypes(['number'])

        # Nothing to impute: hand the input back unchanged.
        if numeric.isnull().sum().sum() == 0:
            return dataset

        # Imported lazily so the dependency is only needed when imputation
        # actually runs.
        import impyute as imp

        imputed = imp.em(numeric.values)

        # Rebuild on the ORIGINAL index: with a fresh RangeIndex the join
        # below would align rows by label and scramble (or NaN-fill) the
        # object columns whenever the input index is not 0..n-1.
        df = pd.DataFrame(imputed, columns=numeric.columns,
                          index=dataset.index)

        return df.join(dataset.select_dtypes(include=['object']))
Ejemplo n.º 2
0
def compute_err_EM(Xtrain, ytrain, Xtest, ytest, n, p, G):
    """Time EM imputation + LDA and return the test misclassification rate.

    Missing values are injected with ``make_nan_list``, the stacked data is
    min-max scaled, imputed with ``impy.em`` (10 loops) and used to fit an
    LDA classifier that is scored on ``Xtest``/``ytest``.

    Args:
        Xtrain, ytrain: training data and labels handed to ``make_nan_list``.
        Xtest, ytest: held-out data and labels used for the error rate.
        n, p: forwarded unchanged to ``make_nan_list``.
        G: number of classes; must be at least 1.

    Returns:
        ``(em_err, em_time)``: the misclassification rate on the test set and
        the wall-clock seconds spent on imputation, fitting and prediction.
    """
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)

    # make_nan_list changes the order of the observations, so the labels are
    # rebuilt from the block sizes: every row of block g is labelled g.
    class_blocks = [Xtr_nan_list[g] for g in range(G)]
    Xtr_nan = np.vstack(class_blocks)
    ytr = np.concatenate(
        [np.repeat(g, len(block)) for g, block in enumerate(class_blocks)])

    # Scale with statistics of the incomplete training data only.
    scaler = MinMaxScaler()
    Xtr_nan = scaler.fit_transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)

    # Impute, classify and measure the error rate.
    start = time.time()
    Xtr_em = impy.em(Xtr_nan, loops=10)
    clf_em = skLDA().fit(Xtr_em, ytr)
    em_err = np.mean(clf_em.predict(Xtest).flatten() != ytest)
    em_time = time.time() - start

    return em_err, em_time
Ejemplo n.º 3
0
def Impute_EM(data_x):
    '''Fill the missing entries of data_x with impyute's EM imputer.

    Args:
      - data_x: original data with missing values

    Returns:
      - the result of impy.em(data_x), i.e. the imputed data
    '''
    return impy.em(data_x)
Ejemplo n.º 4
0
def _class_means_and_scatter(X, y, G):
    """Return per-class means and (n_g - 1) * cov matrices divided by len(y)."""
    means = np.asarray([np.mean(X[y == g, :], axis=0) for g in range(G)])
    scatter = np.asarray([
        (np.sum(y == g) - 1) * np.cov(X[y == g, :], rowvar=False)
        for g in range(G)
    ])
    return means, scatter / len(y)


def compute_err_EM(Xtrain, ytrain, n, p, G):
    """Measure how well EM imputation recovers class means and covariances.

    Missing values are injected with ``make_nan_list``, everything is
    standardised with statistics of the incomplete data, and the per-class
    estimates from the EM-imputed data are compared (via ``err``) with the
    estimates from the complete data.

    Args:
        Xtrain, ytrain: complete training data and its labels.
        n, p: forwarded unchanged to ``make_nan_list``.
        G: number of classes; must be at least 1.

    Returns:
        ``(em_err, em_time, per_missing)``: the value returned by ``err``,
        the wall-clock seconds spent on imputation and estimation, and the
        fraction of missing entries in the stacked incomplete data.
    """
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)

    # make_nan_list changes the order of the observations, so the labels that
    # match the stacked incomplete data are rebuilt from the block sizes.
    class_blocks = [Xtr_nan_list[g] for g in range(G)]
    Xtr_nan = np.vstack(class_blocks)
    ytr = np.concatenate(
        [np.repeat(g, len(block)) for g, block in enumerate(class_blocks)])

    scaler = StandardScaler()
    Xtr_nan = scaler.fit_transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)

    # Reference estimates: complete data in its original order -> ytrain.
    mus, S = _class_means_and_scatter(Xtrain, ytrain, G)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    start = time.time()
    Xtr_em = impy.em(Xtr_nan, loops=10)
    # Xtr_em follows the row order of Xtr_nan, so it must be indexed with the
    # rebuilt labels ytr; indexing with ytrain would mix rows across classes.
    mus_em, S_em = _class_means_and_scatter(Xtr_em, ytr, G)
    em_err = err(mus, S, mus_em, S_em)
    em_time = time.time() - start

    return em_err, em_time, per_missing
Ejemplo n.º 5
0
def _expand_feature_indices(miss, featureEncoding, discreteFeatures):
    """Map sampled logical feature ids to the raw column indices they cover."""
    if featureEncoding is not None:
        return np.array([z for m in miss for z in featureEncoding[m]])
    if discreteFeatures != 1:
        return np.array([m * discreteFeatures + z
                         for m in miss
                         for z in range(discreteFeatures)])
    return miss


def _timed_impute(start_message, impute, data):
    """Run ``impute(data)``, printing a start message and the elapsed time."""
    import time

    start = time.time()
    print(start_message)
    imputed = impute(data)
    end = time.time()
    print("\tDone imputing! " + str(end - start))
    return imputed


def run_experiment_k_paper(X_test, y_test, clf, NB, a, setting):
    """Score several missing-feature strategies while removing k features.

    For every ``k`` in ``setting["k"]`` and every repetition, ``k`` logical
    features are drawn at random per test row and treated as missing. The
    rows are then completed by column mean/median/max/min (and optionally by
    impyute's EM, MICE and kNN imputers), and each strategy is scored with
    ``function`` against a reference.

    Args:
        X_test: 2-D test data.
        y_test: labels; the reference when ``setting["prob"]`` is falsy.
        clf: classifier exposing ``predict`` and ``predict_proba``.
        NB: model forwarded to ``predict_nbk_with_missing``.
        a: object with ``classify(X, missing, ...)``, or a sequence of such
            objects when ``setting["em"]`` is truthy.
        setting: dict of options. ``"k"`` (iterable of ints) is required.
            Optional keys and defaults: ``"em"`` (False),
            ``"discreteFeatures"`` (1), ``"feature_encoding"`` (None),
            ``"emImpute"``/``"miceImpute"``/``"knnImpute"`` (False),
            ``"prob"`` (True), ``"function"`` (None, which selects
            ``conditional_likelihood_k`` or ``f1_score``), ``"repeat"`` (1),
            ``"features"`` (None, meaning every logical feature).

    Returns:
        dict with ``"features_count"``, ``"k"`` and one array of per-k score
        lists for each of ``"nb"``, ``"mean"``, ``"median"``, ``"max"``,
        ``"min"``, ``"flip"``, ``"em_impute"``, ``"mice_impute"`` and
        ``"knn_impute"``. ``"ours"`` is an array, or a dict keyed
        ``"ours_<i>"`` when ``setting["em"]`` is truthy.
    """
    X_impute_mean = np.mean(X_test, axis=0)
    X_impute_median = np.median(X_test, axis=0)
    X_impute_max = np.max(X_test, axis=0)
    X_impute_min = np.min(X_test, axis=0)
    X_impute_flip = np.copy(1 - X_test)

    baseline_names = ("nb", "mean", "median", "max", "min", "flip",
                      "em_impute", "mice_impute", "knn_impute")
    k_all = []
    errors_all = {name: [] for name in baseline_names}

    useEM = setting.get("em", False)
    discreteFeatures = setting.get("discreteFeatures", 1)
    featureEncoding = setting.get("feature_encoding")

    do_emImpute = setting.get("emImpute", False)
    do_miceImpute = setting.get("miceImpute", False)
    do_knnImpute = setting.get("knnImpute", False)

    if useEM:
        ours_keys = ["ours_" + str(i) for i in range(len(a))]
        missing_err_ours_all = {key: [] for key in ours_keys}
    else:
        ours_keys = []
        missing_err_ours_all = []

    useProb = setting.get("prob", True)
    function = setting.get("function")
    if function is None:
        function = conditional_likelihood_k if useProb else f1_score

    print("Using following function: ")
    print(function)

    repeat = setting.get("repeat", 1)

    FEATURES = setting.get("features")
    if FEATURES is None:
        NNN = X_test.shape[1] if featureEncoding is None else len(featureEncoding)
        # Floor division: "/" yields a float on Python 3, which range()
        # (and therefore the original code) rejected with a TypeError.
        FEATURES = np.arange(NNN // discreteFeatures)
    else:
        FEATURES = np.array(FEATURES)

    print("Possible features to remove: {}".format(FEATURES.shape[0]))

    K = setting["k"]

    for k in K:
        print("K = {}".format(k))

        if k > FEATURES.shape[0]:
            print("Early stop: Only had {} features possible to remove".format(FEATURES.shape[0]))
            break

        cur = {name: [] for name in baseline_names}
        cur_ours = {key: [] for key in ours_keys} if useEM else []

        for R in range(repeat):
            if R % 10 == 0:
                print("\t R = {}".format(R))
            X_test_mean = np.array(X_test, dtype='float')
            X_test_median = np.array(X_test, dtype='float')
            X_test_max = np.array(X_test, dtype='float')
            X_test_min = np.array(X_test, dtype='float')
            X_test_flip = np.array(X_test, dtype='float')
            X_test_em_impute = np.array(X_test, dtype='float')
            X_test_mice_impute = np.array(X_test, dtype='float')
            X_test_knn_impute = np.array(X_test, dtype='float')
            missing = np.zeros(X_test.shape, dtype=bool)

            for i in range(X_test.shape[0]):
                # One independent draw of k logical features per test row.
                miss = np.random.choice(FEATURES, k, replace=False)
                if k > 0:
                    miss = _expand_feature_indices(
                        miss, featureEncoding, discreteFeatures)

                missing[i][miss] = True
                X_test_mean[i][miss] = X_impute_mean[miss]
                X_test_median[i][miss] = X_impute_median[miss]
                X_test_max[i][miss] = X_impute_max[miss]
                X_test_min[i][miss] = X_impute_min[miss]
                # Flip data is still prepared, but flip scoring is disabled
                # below, so the "flip" result stays empty.
                X_test_flip[i][miss] = X_impute_flip[i][miss]
                X_test_em_impute[i][miss] = np.nan
                X_test_mice_impute[i][miss] = np.nan
                X_test_knn_impute[i][miss] = np.nan

            # Disabled imputers are replaced by all-zero data so that the
            # classifier calls below never receive NaNs.
            if do_emImpute:
                loops = 6
                X_test_em_impute = _timed_impute(
                    "\tStarting to em impute with loops = {}".format(loops),
                    lambda data: impyute.em(data, loops=loops),
                    X_test_em_impute)
            else:
                X_test_em_impute = np.zeros(X_test.shape)

            if do_miceImpute:
                X_test_mice_impute = _timed_impute(
                    "\tStarting to mice impute",
                    impyute.mice,
                    X_test_mice_impute)
            else:
                X_test_mice_impute = np.zeros(X_test.shape)

            if do_knnImpute:
                X_test_knn_impute = _timed_impute(
                    "\tStarting to knn impute",
                    impyute.fast_knn,
                    X_test_knn_impute)
            else:
                X_test_knn_impute = np.zeros(X_test.shape)

            lr_prob = clf.predict_proba(X_test)

            if useProb:
                # Compare predicted probabilities with those of clf on the
                # complete data.
                reference = lr_prob
                predict = clf.predict_proba
                nb_output = predict_nbk_with_missing(
                    X_test_mean, NB, missing, prob=True)
                ours_input, ours_kwargs = X_test, {"prob": True}
            else:
                # Compare hard predictions with the true labels.
                reference = y_test
                predict = clf.predict
                nb_output = predict_nbk_with_missing(X_test_mean, NB, missing)
                ours_input, ours_kwargs = X_test_mean, {}

            cur["nb"].append(function(reference, nb_output))
            for name, completed in (("mean", X_test_mean),
                                    ("median", X_test_median),
                                    ("max", X_test_max),
                                    ("min", X_test_min),
                                    ("em_impute", X_test_em_impute),
                                    ("mice_impute", X_test_mice_impute),
                                    ("knn_impute", X_test_knn_impute)):
                cur[name].append(function(reference, predict(completed)))

            if useEM:
                for key, model in zip(ours_keys, a):
                    cur_ours[key].append(function(
                        reference,
                        model.classify(ours_input, missing, **ours_kwargs)))
            else:
                cur_ours.append(function(
                    reference,
                    a.classify(ours_input, missing, **ours_kwargs)))

        k_all.append(k)
        for name in baseline_names:
            errors_all[name].append(cur[name])
        if useEM:
            for key in ours_keys:
                missing_err_ours_all[key].append(cur_ours[key])
        else:
            missing_err_ours_all.append(cur_ours)

    if not useEM:
        missing_err_ours_all = np.array(missing_err_ours_all)

    data = {
        "features_count": FEATURES.shape[0],
        "k": np.array(k_all),
        "nb": np.array(errors_all["nb"]),
        "mean": np.array(errors_all["mean"]),
        "median": np.array(errors_all["median"]),
        "max": np.array(errors_all["max"]),
        "min": np.array(errors_all["min"]),
        "ours": missing_err_ours_all,
        "flip": np.array(errors_all["flip"]),
        "em_impute": np.array(errors_all["em_impute"]),
        "mice_impute": np.array(errors_all["mice_impute"]),
        "knn_impute": np.array(errors_all["knn_impute"]),
    }

    return data
Ejemplo n.º 6
0
def test_em_(test_data):
    """EM-impute a ``test_data(SHAPE)`` fixture and run the NaN check on it."""
    # return_na_check is defined elsewhere; presumably it asserts that no
    # missing values remain.
    return_na_check(impy.em(test_data(SHAPE)))
Ejemplo n.º 7
0
 def test_impute_missing_values(self):
     """EM-imputing ``self.data_m`` must leave no NaN entries behind."""
     nan_mask = np.isnan(impy.em(self.data_m))
     self.assertFalse(nan_mask.any())
Ejemplo n.º 8
0
 def test_return_type(self):
     """EM imputation must return an ``np.ndarray``."""
     imputed = impy.em(self.data_m)
     # assertIsInstance reports the offending type on failure, whereas
     # assertTrue(isinstance(...)) only says "False is not true".
     self.assertIsInstance(imputed, np.ndarray)
Ejemplo n.º 9
0
                    mu = col[~np.isnan(col)].mean()
                    std = col[~np.isnan(col)].std()
                    #Maxmum
                    col[x_cat_i] = np.random.normal(loc=mu, scale=std)
                    dealt = (col[x_cat_i] - previous) / previous
                    if dealt < 0.1:
                        data[x_i, y_i] = col[x_cat_i]
                        break
                    data[x_i, y_i] = col[x_cat_i]
                    previous = col[x_cat_i]
    return data


if __name__ == '__main__':

    # Load the complete (ground-truth) data set.
    data_comp = np.loadtxt("spam.txt", delimiter=",")
    # Normalize BEFORE deleting values, so that the ground truth and both
    # imputations live on the same scale. Normalizing only the ground truth
    # afterwards made the reported errors compare normalized values against
    # raw-scale imputations.
    data_comp = Normalizer().fit_transform(data_comp)
    # Randomly delete entries from a copy (0.05 is the argument passed to
    # rand_delete) so the ground truth stays intact.
    data_uncomp = rand_delete(data_comp.copy(), 0.05)
    m = get_mask(data_uncomp)  # 1 marks a missing entry, 0 a complete one
    data_imp = em(data_uncomp, 50)  # impute with the hand-written EM
    # NOTE(review): if em() fills its argument in place, impyute would receive
    # already-imputed data here - confirm that em() works on a copy.
    data_impy = impy.em(data_uncomp)  # impute with impyute's EM
    # Multiply by the mask so that only the imputed positions contribute to
    # the errors below.
    m_data_imp = data_imp * m
    m_data_raw = data_comp * m
    m_data_impy = data_impy * m
    error_impy = np.mean(np.square(m_data_raw - m_data_impy))
    print("error_impy:", error_impy)
    error = np.mean(np.square(m_data_raw - m_data_imp))
    print("define em error:", error)
Ejemplo n.º 10
0
from impyute import em
import numpy as np
import xlrd
# Exploratory script: read one spreadsheet column, EM-impute it, then start
# regrouping the same sheet with pandas.
book = xlrd.open_workbook('Matlab Codes/Gaussian Graphical Models/Counties/allLocs.xlsx')
sheet = book.sheet_by_name('sheet1')
# Collect column index 1 of every row after the first as floats; cells that
# cannot be converted become NaN.
y = [[]]
for r in range(1,sheet.nrows):
	try:
		y[0].append(float(sheet.cell_value(r, 1)))
	# NOTE(review): the bare except also hides unrelated errors; catching
	# ValueError/TypeError is probably what is meant.
	except:
		y[0].append(np.nan)
import math
# y holds a single inner list, so the transpose is a one-column 2-D array.
data = np.array(y).T
print(data)
n = em(data, loops=50)
# Flatten the imputed column back into a list.
# NOTE(review): p is overwritten below before this value is ever read.
p = n.T.tolist()[0]
import pandas as pd
# NOTE(review): `index=False` does not look like a read_excel option - confirm
# against the installed pandas version.
sh = pd.read_excel('Matlab Codes/Gaussian Graphical Models/Counties/allLocs.xlsx', sheet_name='sheet1', index=False, na_values=[np.nan])
t = {}
p = list(sh.index.unique())
for j in p:
	# NOTE(review): `df` is not defined anywhere in this excerpt; `sh` is
	# presumably intended.
	t[j] = df.loc[j]
# NOTE(review): as_matrix() was removed in pandas 1.0; .to_numpy() or .values
# is the replacement - confirm the targeted pandas version.
y = np.array([sh['TEMP'].as_matrix()])
print(y)
u ={}
for j in p:
	u[j] = np.array([t[j]['TEMP'].as_matrix()])
for j in p:
	y = u[j]
	if np.isnan(y.T):
Ejemplo n.º 11
0
def test_return_type():
    """EM imputation of ``data_m`` must hand back a NumPy array."""
    result = impy.em(data_m)
    assert isinstance(result, np.ndarray)
Ejemplo n.º 12
0
# NOTE: every "[Imputation mode: ...]" section below assigns to `imputed`, so
# when the file is run top to bottom only the last assignment survives. The
# sections read like alternatives of which one is meant to stay active.
# `nmf_model`, `imputed`, `msk`, `df` and `split_idx` are defined before this
# excerpt.

#   iterative imputation process
# while nmf_model.reconstruction_err_**2 > 10:
# Refit the NMF model on the current estimate and overwrite the entries
# selected by ~msk with the reconstruction, until the reconstruction error
# is no longer above 2.5.
# NOTE(review): there is no iteration cap; if the error never drops to 2.5
# this loop does not terminate.
while nmf_model.reconstruction_err_ > 2.5:
    W = nmf_model.fit_transform(imputed)
    imputed[~msk] = W.dot(nmf_model.components_)[~msk]
    print(nmf_model.reconstruction_err_)

# [Imputation mode: MICE]
imputed = impy.mice(df.values[:split_idx])

# [Imputation mode: k-NN]
imputer = KNNImputer(n_neighbors=10)  # default: 2
imputed = imputer.fit_transform(df.values[:split_idx])

# [Imputation mode: EM]
imputed = impy.em(df.values[:split_idx], loops=50)

# [Imputation mode: LOCF]
# Forward-fill; values with no earlier observation are set to 0.
imputed = df.copy().iloc[:split_idx].ffill()
imputed = imputed.fillna(0)
imputed = imputed.values

# [Imputation mode: NOCB]
# Backward-fill; values with no later observation are set to 0.
imputed = df.copy().iloc[:split_idx].bfill()
imputed = imputed.fillna(0)
imputed = imputed.values

# [No imputation: Case Deletion]
# Drop every row that contains at least one missing value.
# NOTE(review): unlike the modes above this uses all of df (no [:split_idx])
# and leaves a DataFrame rather than an ndarray - confirm this is intended.
imputed = df.drop(df[df.isnull().any(axis=1)].index).copy()

# [No imputation: Zero Substitution]
Ejemplo n.º 13
0
 def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
     """Prepare masked copies of ``T`` and the table of imputation methods.

     Args:
         T: complete pandas DataFrame (ground truth).
         mask: array-like with one column per column of ``T``; cells equal
             to 1 are set to NaN in the working copies.
         algo: stored unchanged on ``self.algo``.
         miss_info: mapping providing ``"ord_col"`` and ``"num_col"``
             (concatenated with ``+``) and, when ``obj`` is given,
             ``"ce_encoder"`` (an object with ``category_mapping``).
         kf: stored unchanged on ``self.kf``.
         notobj: column labels used to slice ``X``, ``T`` and ``mask``.
         obj: categorical column labels; ``None`` or ``[]`` selects the
             numeric-only path.
         target: optional column label; ``T[target]`` is kept as
             ``self.target_y``.

     NOTE(review): the whole body runs inside ``try/except Exception`` that
     only prints the error, so a failure leaves the instance partially
     initialised (only the attributes assigned before the failing statement
     exist).
     """
     try:
         self.miss_info = miss_info
         self.columns = notobj
         self.ord_num_col = self.miss_info["ord_col"] + self.miss_info[
             "num_col"]
         # Template for the per-metric result containers; deep-copied so
         # self.cv and self.MSE do not share the inner dicts.
         metric = {"rmse": {}, "nrmse": {}}
         self.rawT = T
         self.target = target
         if target is not None: self.target_y = T[target]
         else: self.target_y = None
         self.cv = {}
         self.cv.update(deepcopy(metric))
         self.kf = kf
         self.MSE = {}
         self.MSE.update(deepcopy(metric))
         self.result = {}
         self.time_ck = {}
         # Working copy of the data with the masked cells blanked out.
         X = deepcopy(T)
         mask = pd.DataFrame(mask, columns=T.columns.tolist())
         self.rawmask = mask
         X[(mask == 1).values] = np.nan
         # Treat an empty list of categorical columns like "none given".
         if obj in [None, []]: obj = None
         else: pass
         ##########################################
         self.X = X[notobj]
         self.T = T[notobj]
         self.mask = mask[notobj]
         self.notobj = notobj
         ##########################################
         if obj is not None:
             ############ Numeric + Category  #################
             # Fill the masked categorical cells with each column's most
             # frequent value.
             cat_impute = SimpleImputer(strategy="most_frequent")
             X[obj] = cat_impute.fit_transform(X[obj])
             self.true_obj = T[obj]
             self.pd_obj = X[obj]
             ###################################################
             # Copy of T with the categorical values replaced through the
             # encoder's mapping (mapping index -> mapping values), then
             # masked.
             TT = deepcopy(T)
             cat_encoder = miss_info["ce_encoder"]
             for k in cat_encoder.category_mapping:
                 col, map_ = k["col"], k["mapping"]
                 TT[col] = TT[col].replace(
                     dict(zip(k["mapping"].index, k["mapping"].values)))
             # full_miss_data is the same object as TT, so the masking on the
             # next line modifies TT in place.
             self.full_miss_data = TT
             self.full_miss_data[(mask == 1).values] = np.nan
             # Second copy for MICE: categorical values get a "Cols_" prefix
             # before the mask is applied.
             mice_data = deepcopy(T)
             for obj_col in obj:
                 mice_data[obj_col] = "Cols_" + mice_data[obj_col]
             self.full_mice_data = mice_data
             self.full_mice_data[(mask == 1).values] = np.nan
         else:
             ########## Numeric  ###############################
             # self.X was sliced from the already-masked X, so this applies
             # the same mask again to the copy.
             num_data = deepcopy(self.X)
             num_data[(self.mask == 1).values] = np.nan
             self.full_miss_data = deepcopy(num_data)
             self.full_mice_data = deepcopy(num_data)
             ###################################################
         self.algo = algo
         # Imputation routines keyed by name; each entry takes the data to
         # impute as its only argument.
         # NOTE(review): the "MissForest" entry returns whatever .fit(x)
         # returns, while the other entries return imputed data - confirm
         # that callers handle this difference.
         self.method = {
             "MissForest" : lambda x : MissForest(verbose = 0, n_jobs  = -1 ).fit(x) ,
             "mean" : lambda x : impy.mean(x) ,
             "median" : lambda x : impy.median(x) ,
             "mode" : lambda x : impy.mode(x) ,
             "knn" : lambda x : impy.fast_knn(x) ,
             "MICE" : lambda x : impy.mice(x) ,
             "EM" : lambda x : impy.em(x),
             "MultipleImputer" : lambda x : MultipleImputer(n=1, return_list = True).\
             fit_transform(pd.DataFrame(x)).values,
         }
     except Exception as e:
         # Best-effort construction: the error is only printed, not raised.
         print(e)
         pass
Ejemplo n.º 14
0
def perf_em_imput(dfs_arg):
    """EM-impute every DataFrame in ``dfs_arg``.

    Each frame's ``.values`` is passed to ``impy.em`` with ``loops=50`` and
    ``dtype='cont'``; all imputations are computed first, then each result is
    wrapped in a fresh DataFrame (default index and column labels).

    Returns a list of DataFrames in the order of ``dfs_arg``.
    """
    imputed_arrays = [
        impy.em(frame.values, loops=50, dtype='cont') for frame in dfs_arg
    ]
    return [pd.DataFrame(data=values) for values in imputed_arrays]
Ejemplo n.º 15
0
# Median-imputation baseline: fit on the incomplete data and fill its NaNs
# with the per-column medians.
# (miss_data_x, ori_data_x, data_m, rmse_loss and data_name are defined
# before this excerpt.)
med_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
med_imputer = med_imputer.fit(miss_data_x)
imputed_data_med = med_imputer.transform(miss_data_x)

# Report the RMSE performance
rmse_med = rmse_loss(ori_data_x, imputed_data_med, data_m)

print()
print('RMSE Performance: ' + str(np.round(rmse_med, 4)))


#%%
# EM imputation
import impyute as impy

# NOTE(review): data_missing is not used anywhere in this excerpt; impy.em
# below receives miss_data_x directly.
data_missing = pd.DataFrame(miss_data_x)
em_imputed = impy.em(miss_data_x)

rmse_em = rmse_loss(ori_data_x, em_imputed, data_m)

# Same label as the median report above; only the printed value differs.
print()
print('RMSE Performance: ' + str(np.round(rmse_em, 4)))

# Write both imputed data sets to "<cwd>/[10] data/<data_name>_imp_*.csv".
pd.DataFrame(imputed_data_med).to_csv(os.path.join(os.getcwd(), '[10] data/' + data_name + '_imp_med' + '.csv'), index = False)
pd.DataFrame(em_imputed).to_csv(os.path.join(os.getcwd(), '[10] data/' + data_name + '_imp_EM' + '.csv'), index = False)

# Recorded RMSE values:
# RMSE
# GAIN: 0.0905
# median imputation: 0.1095
# EM imputation: 0.1453
Ejemplo n.º 16
0
def em(data):
    """Run impyute's EM imputation on ``data.values`` and return the result."""
    raw_values = data.values
    return impyute.em(raw_values)
Ejemplo n.º 17
0
# Load the time series; the first column is parsed as dates (with `parser`,
# defined elsewhere) and becomes the index.
df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)

arr = df.values

# Splitting indices for each data set
#     - Air Quality: -1 (impute all data instances)
#     - GECCO2015: 154140
# NOTE(review): with split_idx = -1 the slice arr[:split_idx] excludes the
# last row, so that row is appended un-imputed below - confirm this is the
# intended meaning of "impute all".
split_idx = 154140

# Imputation mode: EM
imputed_em = impy.em(arr[:split_idx], loops=50)  # default: 50

# [Option] aggregate train (imputed) /valid (not imputed) data
imputed_em = np.append(imputed_em, arr[split_idx:], axis=0)

# Convert to DataFrame. This has to happen BEFORE resampling: np.append
# returns a plain ndarray, which has no .resample(), and resampling needs
# the datetime index taken from df.
imputed_em = pd.DataFrame(imputed_em, index=df.index, columns=df.columns)

# [Option] resampling (daily means)
imputed_em = imputed_em.resample('D').mean()

# Visualizing comparison between actual and imputed values
plt.plot(imputed_em[df.columns[0]], label='imputed')
plt.plot(df[df.columns[0]], label='actual')
plt.legend(loc='best')
plt.show()
Ejemplo n.º 18
0
def test_impute_missing_values():
    """EM-imputing ``data_m`` must leave no NaN entries behind."""
    nan_mask = np.isnan(impy.em(data_m))
    assert not nan_mask.any()