Ejemplo n.º 1
0
    def MICE_imputation(self, dataset):
        # only for numerical values
        # Multivariate Imputation by Chained Equations only suitable
        # for Missing At Random (MAR),
        # which means that the probability that a value is missing
        # depends only on observed values and not on unobserved values

        import impyute as imp

        df = dataset

        if dataset.select_dtypes(['number']).isnull().sum().sum() > 0:

            X = imp.mice(dataset.select_dtypes(['number']).iloc[:, :].values)

            Z = dataset.select_dtypes(include=['object'])

            df = pd.DataFrame.from_records(
                X, columns=dataset.select_dtypes(['number']).columns)

            df = df.join(Z)

        else:

            pass

        return df
Ejemplo n.º 2
0
    def data_quality_verification(self):
        try:
            cols = ["bcg_wastage_rate", "totalbirths"]
            self.df[cols] = self.df[cols].replace({0: np.nan})
            self.df["bcg_wastage_rate"] = self.df["bcg_wastage_rate"].replace(
                {1: np.nan})
            # dealing with the missing data
            # imputed_training = fast_knn(org_unit_group.values, k=3)
            # print(self.df.values)
            imputed_training = mice(self.df.values)
            self.df[cols] = imputed_training

            # convert column "totalbirths" to int64 dtype
            self.df = self.df.astype({"totalbirths": int})
        except Exception as e:
            self.metadata['report_status'] = 'ERROR'
            self.metadata['report_description'] = 'Data <-> Data Quality Verification Failed. => ' \
                                                  '' + str(e)
            self.has_validation_error = True
            return
Ejemplo n.º 3
0
 def test_impute_missing_values(self):
     """ After imputation, no NaN's should exist"""
     imputed = impy.mice(self.data_m)
     self.assertFalse(np.isnan(imputed).any())
Ejemplo n.º 4
0
 def test_return_type(self):
     """ Check return type, should return an np.ndarray"""
     imputed = impy.mice(self.data_m)
     self.assertTrue(isinstance(imputed, np.ndarray))
Ejemplo n.º 5
0
def run_experiment_k_paper(X_test, y_test, clf, NB, a, setting):
    X_impute_mean   = np.mean(X_test, axis = 0)
    X_impute_median = np.median(X_test, axis = 0)
    X_impute_max    = np.max(X_test, axis = 0)
    X_impute_min    = np.min(X_test, axis = 0)
    X_impute_flip   = np.copy(1 - X_test)


    k_all = []
    missing_err_nb_all = []
    missing_err_lr_mean_all = []
    missing_err_lr_median_all = []
    missing_err_lr_max_all = []
    missing_err_lr_min_all = []
    missing_err_lr_flip_all = []
    missing_err_lr_em_impute_all = []
    missing_err_lr_mice_impute_all = []
    missing_err_lr_knn_impute_all = []

    useEM = setting["em"] if "em" in setting else False
    discreteFeatures = setting["discreteFeatures"] if "discreteFeatures" in setting else 1
    featureEncoding = setting["feature_encoding"] if "feature_encoding" in setting else None

    do_emImpute = setting["emImpute"] if "emImpute" in setting else False
    do_miceImpute = setting["miceImpute"] if "miceImpute" in setting else False
    do_knnImpute = setting["knnImpute"] if "knnImpute" in setting else False

    if useEM:
        missing_err_ours_all = {}
        for i in range(len(a)):
            missing_err_ours_all["ours_" + str(i)] = []
    else:
        missing_err_ours_all = []

    useProb = setting["prob"] if "prob" in setting else True
    function = setting["function"] if "function" in setting else None
    if function is None:
        if useProb:
            function = conditional_likelihood_k
        else:
            function = f1_score

    print("Using following function: ")
    print(function)
    
    repeat = setting["repeat"] if "repeat" in setting else 1

    FEATURES = setting["features"] if "features" in setting else None
    if FEATURES is None:
        NNN = X_test.shape[1]
        if not featureEncoding is None:
            NNN = len(featureEncoding)
        FEATURES = np.array( [i for i in range(NNN / discreteFeatures )] )
    else:
        FEATURES = np.array( FEATURES )

    print("Possible features to remove: {}".format(FEATURES.shape[0]))

    K = setting["k"]

    for k in K:
        print("K = {}".format(k))

        if k > FEATURES.shape[0]:
            print("Early stop: Only had {} features possible to remove".format(FEATURES.shape[0]))
            break

        cur_nb = []
        cur_lr_mean = []
        cur_lr_median = []
        cur_lr_max = []
        cur_lr_min = []
        cur_flip = []
        cur_em_impute = []
        cur_mice_impute = []
        cur_knn_impute = []

        if useEM:
            cur_ours = {}
            for i in range(len(a)):
                cur_ours["ours_" + str(i)] = []
        else:
            cur_ours = []
        
        

        for R in range(repeat):
            if R % 10 == 0:
                print("\t R = {}".format(R))
            X_test_mean   = np.array(X_test, dtype = 'float')
            X_test_median = np.array(X_test, dtype = 'float')
            X_test_max    = np.array(X_test, dtype = 'float')
            X_test_min    = np.array(X_test, dtype = 'float')
            X_test_flip   = np.array(X_test, dtype = 'float')
            X_test_em_impute = np.array(X_test, dtype = 'float')
            X_test_mice_impute = np.array(X_test, dtype = 'float')
            X_test_knn_impute = np.array(X_test, dtype = 'float')
            missing = np.zeros(X_test.shape, dtype=bool)

            for i in range(X_test.shape[0]):
                miss = np.random.choice(FEATURES, k, replace=False)

                if not featureEncoding is None and k > 0:
                    missK = []
                    for m in miss:
                        for z in featureEncoding[m]:
                            missK.append(z)
                    miss = np.copy(np.array(missK))

                elif discreteFeatures != 1 and k > 0:
                    missK = []
                    for m in miss:
                        for z in range(discreteFeatures):
                            missK.append(m * discreteFeatures + z)
                    miss = np.copy(np.array(missK))

                   
                missing[i][miss] = True
                # if k > 0:
                #     print(missing[i])
                #     print(np.sum(missing[i]))
                X_test_mean[i][miss]   = X_impute_mean[miss]
                X_test_median[i][miss] = X_impute_median[miss]
                X_test_max[i][miss]    = X_impute_max[miss]
                X_test_min[i][miss]    = X_impute_min[miss]
                X_test_flip[i][miss]   = X_impute_flip[i][miss]
                X_test_em_impute[i][miss] = np.nan
                X_test_mice_impute[i][miss] = np.nan
                X_test_knn_impute[i][miss] = np.nan

            if do_emImpute:
                import time
                start = time.time()
                loops = 6
                print ("\tStarting to em impute with loops = {}".format(loops))
                X_test_em_impute = impyute.em(X_test_em_impute, loops = loops)
                end = time.time()
                print ("\tDone imputing! " + str( end - start ) )
            else:
                X_test_em_impute = np.zeros(X_test.shape)

            if do_miceImpute:
                import time
                start = time.time()
                print ("\tStarting to mice impute")
                X_test_mice_impute = impyute.mice(X_test_mice_impute)
                end = time.time()
                print ("\tDone imputing! " + str( end - start ) )
            else:
                 X_test_mice_impute = np.zeros(X_test.shape)


            if do_knnImpute:
                import time
                start = time.time()
                print ("\tStarting to knn impute")
                X_test_knn_impute = impyute.fast_knn(X_test_knn_impute)
                end = time.time()
                print ("\tDone imputing! " + str( end - start ) )
            else:
                 X_test_knn_impute = np.zeros(X_test.shape)

            lr_prob = clf.predict_proba(X_test)
            
            if useProb:
                cur_nb.append         ( function(lr_prob, predict_nbk_with_missing(X_test_mean, NB, missing, prob = True)) )
                cur_lr_mean.append    ( function(lr_prob, clf.predict_proba(X_test_mean)) )
                cur_lr_median.append  ( function(lr_prob, clf.predict_proba(X_test_median)))
                cur_lr_max.append     ( function(lr_prob, clf.predict_proba(X_test_max)))
                cur_lr_min.append     ( function(lr_prob, clf.predict_proba(X_test_min)))
                cur_em_impute.append  ( function(lr_prob, clf.predict_proba(X_test_em_impute)))
                cur_mice_impute.append( function(lr_prob, clf.predict_proba(X_test_mice_impute)))
                cur_knn_impute.append ( function(lr_prob, clf.predict_proba(X_test_knn_impute)))
                # cur_flip.append       ( function(lr_prob, clf.predict_proba(X_test_flip)))
                if not useEM:
                    cur_ours.append   ( function(lr_prob, a.classify(X_test, missing, prob = True)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append (function(lr_prob, a[z].classify(X_test, missing, prob = True)))
                        

            else:
                cur_nb.append         ( function(y_test, predict_nbk_with_missing(X_test_mean, NB, missing)) )
                cur_lr_mean.append    ( function(y_test, clf.predict(X_test_mean)) )
                cur_lr_median.append  ( function(y_test, clf.predict(X_test_median)))
                cur_lr_max.append     ( function(y_test, clf.predict(X_test_max)))
                cur_lr_min.append     ( function(y_test, clf.predict(X_test_min)))
                cur_em_impute.append  ( function(y_test, clf.predict(X_test_em_impute)))
                cur_mice_impute.append( function(y_test, clf.predict(X_test_mice_impute)))
                cur_knn_impute.append( function(y_test, clf.predict(X_test_knn_impute)))
                # cur_flip.append       ( function(y_test, clf.predict(X_test_flip)))
                if not useEM:
                    cur_ours.append   ( function(y_test, a.classify(X_test_mean, missing)))
                else:
                    for z in range(len(a)):
                        cur_ours["ours_" + str(z)].append( function(y_test, a[z].classify(X_test_mean, missing)))
        
        k_all.append(k)
        missing_err_nb_all.append       (cur_nb)
        missing_err_lr_mean_all.append  (cur_lr_mean)
        missing_err_lr_median_all.append(cur_lr_median)
        missing_err_lr_max_all.append   (cur_lr_max)
        missing_err_lr_min_all.append   (cur_lr_min)
        missing_err_lr_flip_all.append  (cur_flip)
        missing_err_lr_em_impute_all.append(cur_em_impute)
        missing_err_lr_mice_impute_all.append(cur_mice_impute)
        missing_err_lr_knn_impute_all.append(cur_knn_impute)
        if useEM:
            for i in cur_ours:
                missing_err_ours_all[i].append(cur_ours[i])
        else:
            missing_err_ours_all.append  (cur_ours)

    if not useEM:
        missing_err_ours_all = np.array(missing_err_ours_all)

    data = {
        "features_count": FEATURES.shape[0],
        "k" :     np.array(k_all),
        "nb":     np.array(missing_err_nb_all),
        "mean":   np.array(missing_err_lr_mean_all),
        "median": np.array(missing_err_lr_median_all),
        "max":    np.array(missing_err_lr_max_all),
        "min":    np.array(missing_err_lr_min_all),
        "ours":   missing_err_ours_all,
        "flip":   np.array(missing_err_lr_flip_all),
        "em_impute": np.array(missing_err_lr_em_impute_all),
        "mice_impute": np.array(missing_err_lr_mice_impute_all),
        "knn_impute": np.array(missing_err_lr_knn_impute_all),
    }            

    return data
Ejemplo n.º 6
0
)
df2 = dataframe.replace(r'\s+', np.NaN)

data_= df2.drop(['PASI.END.WEEK.7',\
    'PASI.END.WEEK.8','PASI.END.WEEK.9','PASI.END.WEEK.10','PASI.END.WEEK.11'], axis = 1)
#data_= data_.dropna()
ds = data_  #.iloc[:99]

# # Drop missing data
# print(ds.shape)
# ds = ds.dropna()
# print(ds.shape)

# split data into X and y
X6 = data_.iloc[:, 0:-1].values  #W0-2
X6 = impy.mice(X6)

X6 = normalize(X6)
y = data_.iloc[:, -1]

# Reduce the number of classes to 2, classes 2 and 1 are
# merged together to form a new class (0) and class 3
# becomes class 1.

y = np.array([data_.iloc[:, -1]])
b = Binarizer(2)
b_scaled = b.fit_transform(y)[0]
y = b_scaled

# Form a dataframe with the imputed x_values and binarized y_values.
d_x = pd.DataFrame(
Ejemplo n.º 7
0
msk = (imputed + np.random.randn(*imputed.shape) - imputed) < 0.8
imputed[~msk] = 0

#   initializing NMF imputation model
nmf_model = NMF()  # n_components: num. of features
nmf_model.fit(imputed)

#   iterative imputation process
# while nmf_model.reconstruction_err_**2 > 10:
while nmf_model.reconstruction_err_ > 2.5:
    W = nmf_model.fit_transform(imputed)
    imputed[~msk] = W.dot(nmf_model.components_)[~msk]
    print(nmf_model.reconstruction_err_)

# [Imputation mode: MICE]
imputed = impy.mice(df.values[:split_idx])

# [Imputation mode: k-NN]
imputer = KNNImputer(n_neighbors=10)  # default: 2
imputed = imputer.fit_transform(df.values[:split_idx])

# [Imputation mode: EM]
imputed = impy.em(df.values[:split_idx], loops=50)

# [Imputation mode: LOCF]
imputed = df.copy().iloc[:split_idx].ffill()
imputed = imputed.fillna(0)
imputed = imputed.values

# [Imputation mode: NOCB]
imputed = df.copy().iloc[:split_idx].bfill()
Ejemplo n.º 8
0
    plt.subplot(140+i+1)
    dataload3.boxplot(column=i)
    plt.title(l[i])
plt.show()

#数据缺失处理
import impyute as impy
data_drop = dataload1.dropna()#将缺失值剔除
data_drop.hist(layout=(1,3),bins=40,figsize=(15,3))

data_mode = dataload1.fillna(dataload1.mode())#用最高频率纸填补缺失值
data_mode.hist(layout=(1,3),bins=40,figsize=(15,3))

data = dataload1[['Unnamed: 0','points','price']]
nd = np.array(data)
filled_mice = impy.mice(nd)#通过属性的相关关系来填补缺失值
data_mice = pd.DataFrame(filled_mice)
data_mice.hist(layout=(1,3),bins=40,figsize=(15,3))

filled_knn = impy.fast_knn(nd,k=3)#通过数据对象之间的相似性来填补缺失值
data_knn = pd.DataFrame(filled_knn)
data_knn.hist(layout=(1,3),bins=40,figsize=(15,3))

plt.show()

data_drop = dataload2.dropna()#将缺失值剔除
data_drop.hist(layout=(1,3),bins=40,figsize=(15,3))

data_mode = dataload2.fillna(dataload2.mode())#用最高频率纸填补缺失值
data_mode.hist(layout=(1,3),bins=40,figsize=(15,3))
Ejemplo n.º 9
0
 def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
     try:
         self.miss_info = miss_info
         self.columns = notobj
         self.ord_num_col = self.miss_info["ord_col"] + self.miss_info[
             "num_col"]
         metric = {"rmse": {}, "nrmse": {}}
         self.rawT = T
         self.target = target
         if target is not None: self.target_y = T[target]
         else: self.target_y = None
         self.cv = {}
         self.cv.update(deepcopy(metric))
         self.kf = kf
         self.MSE = {}
         self.MSE.update(deepcopy(metric))
         self.result = {}
         self.time_ck = {}
         X = deepcopy(T)
         mask = pd.DataFrame(mask, columns=T.columns.tolist())
         self.rawmask = mask
         X[(mask == 1).values] = np.nan
         if obj in [None, []]: obj = None
         else: pass
         ##########################################
         self.X = X[notobj]
         self.T = T[notobj]
         self.mask = mask[notobj]
         self.notobj = notobj
         ##########################################
         if obj is not None:
             ############ Numeric + Category  #################
             cat_impute = SimpleImputer(strategy="most_frequent")
             X[obj] = cat_impute.fit_transform(X[obj])
             self.true_obj = T[obj]
             self.pd_obj = X[obj]
             ###################################################
             TT = deepcopy(T)
             cat_encoder = miss_info["ce_encoder"]
             for k in cat_encoder.category_mapping:
                 col, map_ = k["col"], k["mapping"]
                 TT[col] = TT[col].replace(
                     dict(zip(k["mapping"].index, k["mapping"].values)))
             self.full_miss_data = TT
             self.full_miss_data[(mask == 1).values] = np.nan
             mice_data = deepcopy(T)
             for obj_col in obj:
                 mice_data[obj_col] = "Cols_" + mice_data[obj_col]
             self.full_mice_data = mice_data
             self.full_mice_data[(mask == 1).values] = np.nan
         else:
             ########## Numeric  ###############################
             num_data = deepcopy(self.X)
             num_data[(self.mask == 1).values] = np.nan
             self.full_miss_data = deepcopy(num_data)
             self.full_mice_data = deepcopy(num_data)
             ###################################################
         self.algo = algo
         self.method = {
             "MissForest" : lambda x : MissForest(verbose = 0, n_jobs  = -1 ).fit(x) ,
             "mean" : lambda x : impy.mean(x) ,
             "median" : lambda x : impy.median(x) ,
             "mode" : lambda x : impy.mode(x) ,
             "knn" : lambda x : impy.fast_knn(x) ,
             "MICE" : lambda x : impy.mice(x) ,
             "EM" : lambda x : impy.em(x),
             "MultipleImputer" : lambda x : MultipleImputer(n=1, return_list = True).\
             fit_transform(pd.DataFrame(x)).values,
         }
     except Exception as e:
         print(e)
         pass
Ejemplo n.º 10
0
def perf_mice_imput(dfs_arg):
    mice_data = [
        impy.mice(dfs_arg[i].values, dtype='cont') for i in range(len(dfs_arg))
    ]
    return [pd.DataFrame(data=mice_data[i]) for i in range(len(dfs_arg))]
Ejemplo n.º 11
0
import pandas as pd
from pandas import datetime
import impyute as impy
from matplotlib import pyplot as plt


def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


input_file = './data/AirQualityUCI_refined.csv'

df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)

# imputation mode: MICE
imputed_mice = impy.mice(df.values)
imputed_mice = pd.DataFrame(imputed_mice, index=df.index, columns=df.columns)

# Visualizing comparison between actual and imputed values
plt.plot(imputed_mice[df.columns[0]], label='imputed')
plt.plot(df[df.columns[0]], label='actual')
plt.legend(loc='best')
plt.show()

# Save the data set with imputed values
imputed_mice.to_csv('./data/AirQualityUCI_MICE.csv', index='Datetime')