Example #1
def impute(self, df):
    # Impute missing values with KNN or with MICE-style iterative imputation,
    # preserving the original column names.
    if self.knn:
        knn = KNN()
        return pd.DataFrame(knn.fit_transform(df), columns=df.columns)
    else:
        mice = IterativeImputer()
        return pd.DataFrame(mice.fit_transform(df), columns=df.columns)
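A minimal, self-contained usage sketch of the same logic (the `Preprocessor` wrapper and the sample data are illustrative assumptions, not part of the original):

import numpy as np
import pandas as pd
from fancyimpute import KNN
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

class Preprocessor:
    def __init__(self, knn=False):
        self.knn = knn  # True -> KNN imputation, False -> MICE-style iterative imputation

    def impute(self, df):
        solver = KNN() if self.knn else IterativeImputer()
        return pd.DataFrame(solver.fit_transform(df), columns=df.columns)

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
print(Preprocessor().impute(df))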
def test_iterative_imputer_with_low_rank_random_matrix():
    imputer = IterativeImputer(n_iter=50, random_state=0)
    XY_completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="IterativeImputer")
    assert missing_mae < 0.1, "Error too high with IterativeImputer method!"
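The fixtures XY, XY_incomplete, and missing_mask, along with the reconstruction_error helper, live in the surrounding test module and are not shown here. A hedged sketch of how such a low-rank fixture is typically built:

import numpy as np

def make_low_rank_fixture(n=500, d=10, rank=3, missing_frac=0.25, seed=0):
    # Low-rank matrix via a product of two thin Gaussian factors,
    # with a random fraction of entries hidden as NaN.
    rng = np.random.RandomState(seed)
    XY = rng.randn(n, rank) @ rng.randn(rank, d)
    missing_mask = rng.rand(n, d) < missing_frac  # True where entries are hidden
    XY_incomplete = XY.copy()
    XY_incomplete[missing_mask] = np.nan
    return XY, XY_incomplete, missing_mask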
def multi_imp(data, m):
    # Draw m imputations by running IterativeImputer with sample_posterior=True
    # under different random seeds (the MICE recipe used elsewhere in this file).
    XY = data
    XY_completed = []
    for i in range(m):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
        XY_completed.append(imputer.fit_transform(XY))  # keep each imputation separate
    return np.array(XY_completed)  # shape (m, n_rows, n_cols)
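To pool the m draws into a single MICE-style estimate, average across the first axis; a short sketch assuming XY_incomplete is a numeric array with NaNs:

imps = multi_imp(XY_incomplete, 5)  # shape (5, n_rows, n_cols)
XY_pooled = imps.mean(axis=0)       # Rubin-style point estimate
XY_spread = imps.std(axis=0)        # between-imputation variability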
Example #5
class vk_sensing():
    def __init__(self, method, **kwargs):
        self.clf = None
        self.method = method
        if method == "SoftImpute":
            self.clf = SoftImpute(**kwargs)
        elif method == "KNN":
            self.clf = KNN(**kwargs)
        elif method == "Naive":
            self.clf = SimpleFill()
        elif method == 'II':
            raise NotImplementedError('NOT TESTED')  # raising a bare string is a TypeError in Python 3
            self.clf = IterativeImputer(min_value=0)  # unreachable until this branch is tested
        else:
            raise NotImplementedError("Not Implemented method")

    def fit_transform(self, X_train):
        # print (X_train, np.isnan(X_train).all())
        assert (self.clf is not None)
        X_est = None
        if np.isnan(X_train).any():
            if np.isnan(X_train).all():
                X_est = np.zeros_like(X_train)
            else:
                # print (np.isnan(self.clf.fit_transform(X_train)).any())
                X_est = massage_imputed_matrix(self.clf.fit_transform(X_train))
        else:
            X_est = X_train
        assert (not np.isnan(X_est).any())
        return X_est

    def CVfit(self, X, val_ratio=0.2):
        # Hold out a random val_ratio fraction of the observed entries for
        # validation and train on the remainder.
        mask = np.invert(np.isnan(X))
        sample_mask = np.random.rand(*X.shape) < val_ratio
        X_train = X.copy()
        X_train[mask & sample_mask] = np.nan    # hide the held-out entries from training
        X_val = X.copy()
        X_val[mask & (~sample_mask)] = np.nan   # score only on the held-out entries
        cur_best_err = np.inf
        cur_best_k = None
        for k in GLOB_IMPUTE_K_SWEEP:
            clf = construct_low_rank_imputer(self.method, k)
            if np.isnan(X_train).any():
                if np.isnan(X_train).all():
                    X_est = np.zeros_like(X_train)
                else:
                    X_est = massage_imputed_matrix(clf.fit_transform(X_train))
            else:
                X_est = X_train
            err = MAE(X_est, X_val)
            # print (k, err, RMSN(X_est, X_val))
            if err < cur_best_err:
                cur_best_err = err
                cur_best_k = k
        if cur_best_k is None:
            cur_best_k = 1
        # print (cur_best_k)
        self.clf = construct_low_rank_imputer(self.method, cur_best_k)
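The globals GLOB_IMPUTE_K_SWEEP (the candidate k values), construct_low_rank_imputer, and massage_imputed_matrix are referenced above but defined elsewhere. A plausible sketch of the constructor helper, assuming the sweep value k maps onto each solver's capacity knob (illustrative, not the original implementation):

from fancyimpute import KNN, SimpleFill, SoftImpute

def construct_low_rank_imputer(method, k):
    # Hypothetical helper: k bounds the rank for SoftImpute and the
    # neighborhood size for KNN; "Naive" falls back to mean fill.
    if method == "SoftImpute":
        return SoftImpute(max_rank=k, verbose=False)
    elif method == "KNN":
        return KNN(k=k)
    return SimpleFill()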
def test_iterative_imputer_with_low_rank_random_matrix_approximate():
    imputer = IterativeImputer(n_iter=50, n_nearest_features=5, random_state=0)
    XY_completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
        missing_mask,
        name="IterativeImputer with n_nearest_features=5")
    assert missing_mae < 0.1, "Error too high with IterativeImputer " \
                              "method using n_nearest_features=5!"
Example #7
def get_predict(self, flag, in_data):
    # Mask out unmeasured entries, stack the vector onto the stored
    # measurement history, and let MICE reconstruct the missing values.
    output = in_data.copy()
    output.shape = (utils.M_NUM, 1)
    output[~flag] = np.nan
    solver = MICE()  # fancyimpute's legacy MICE solver (later renamed IterativeImputer)
    tmp = self.t_measure.copy()
    tmp = np.column_stack((tmp, output)).transpose()
    tmp = solver.fit_transform(tmp)
    output = np.array(tmp[-1, :]).reshape(utils.M_NUM, 1)
    return output
Example #8
def mice_imputer_wo_target(df):
    mice = IterativeImputer()
    return pd.DataFrame(mice.fit_transform(df),
                        columns=[
                            'city', 'city_development_index', 'gender',
                            'relevent_experience', 'enrolled_university',
                            'education_level', 'major_discipline',
                            'experience', 'company_size', 'company_type',
                            'last_new_job', 'training_hours'
                        ])
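IterativeImputer accepts only numeric input, so string columns such as 'city' or 'gender' must be numerically encoded before this function is called. A minimal sketch (assumes scikit-learn >= 1.1, where OrdinalEncoder passes NaNs through unchanged):

from sklearn.preprocessing import OrdinalEncoder

cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = OrdinalEncoder().fit_transform(df[cat_cols])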
def test_iterative_imputer_as_mice_with_low_rank_random_matrix_approximate():
    n_imputations = 5
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=5,
                                   sample_posterior=True,
                                   random_state=i)
        XY_completed.append(imputer.fit_transform(XY_incomplete))
    _, missing_mae = reconstruction_error(XY,
                                          np.mean(XY_completed, axis=0),
                                          missing_mask,
                                          name="IterativeImputer as MICE")
    assert missing_mae < 0.1, "Error too high with IterativeImputer as MICE!"
Example #10
    def __init__(self):
        path = r"C:\PycharmProjects\PTSD\Data\PTSD.xlsx"
        df = pd.read_excel(path)
        df = df[~df['PCL_Strict3'].isna()]
        df = df[df["military_exp18_t3"] > 0]
        df = df[self.features + self.ID + self.target_features]
        df_pcl3 = pd.read_excel(r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
        df_pcl3 = PCL_calculator(df_pcl3)
        df_pcl2 = pd.read_excel(r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
        df_pcl2 = PCL_calculator(df_pcl2)
        df_pcl1 = pd.read_excel(r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
        df_pcl1 = PCL_calculator(df_pcl1)

        df = df.merge(df_pcl1, on="ID", how='outer')
        df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
        df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1), on="ID", how='outer')

        df = df[~df['PCL_Strict3'].isna()]
        #df = df[~df['tred_cutoff'].isna()]
        df.drop(self.ID, inplace=True, axis=1)
        if mew:  # `mew` is a module-level flag in the original source, not defined in this snippet
            mice = IterativeImputer()
            df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)

        all_x_col = self.features + self.features_2 + self.target_features_2
        #all_x_col = self.features + self.features_2
        #y_col = ["tred_cutoff"]
        y_col = ["PCL_Strict3"]
        X = df[all_x_col]
        Y = df[y_col]
        X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X, Y, test_size=0.25, random_state=271828, stratify=Y)
        X_train, X_test, y_train, y_test = train_test_split(X_train_0, y_train_0, test_size=0.25, random_state=271828, stratify=y_train_0)
        df = pd.concat([X_train, y_train], axis=1)
        self.X_test = X_test
        self.y_test = y_test

        self.X_train_0 = X_train_0
        self.X_test_0 = X_test_0
        self.y_train_0 = y_train_0
        self.y_test_0 = y_test_0

        self.df = df
Example #11
    def iterative_imputer(self, estimator, max_iter, tol, n_nearest_feature,
                          initial_strategy, imputation_order, skip_complete,
                          min_value, max_value, verbose, random_state):
        print("Interative Imputer")
        print(n_nearest_feature)
        my_estimator = None

        if estimator == 'BayesianRidge':
            my_estimator = BayesianRidge()
        elif estimator == 'DecisionTreeRegressor':
            my_estimator = DecisionTreeRegressor()
        elif estimator == 'ExtraTreesRegressor':
            my_estimator = ExtraTreesRegressor()
        elif estimator == 'KNeighborsRegressor':
            my_estimator = KNeighborsRegressor()
        elif estimator == 'DecisionTreeClassifier':
            my_estimator = DecisionTreeClassifier()  # instantiate the class, don't pass it bare

        imp = IterativeImputer(
            estimator=my_estimator,
            missing_values=np.nan,
            # sample_posterior=sample_posterior,
            max_iter=max_iter,
            tol=tol,
            n_nearest_features=n_nearest_feature,
            initial_strategy=initial_strategy,
            imputation_order=imputation_order,
            skip_complete=skip_complete,
            min_value=min_value,
            max_value=max_value,
            verbose=verbose,
            random_state=random_state,
            # add_indicator=add_indicator
        )

        print("Iterative Imputer is created")
        self.data = imp.fit_transform(self.data)
        self.data = pd.DataFrame(self.data)
        self.data.columns = self.featuresName
        self.data = self.data.infer_objects()
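A hedged call sketch for the method above (the `cleaner` instance and all argument values are illustrative):

import numpy as np

cleaner.iterative_imputer(
    estimator='BayesianRidge', max_iter=10, tol=1e-3,
    n_nearest_feature=5, initial_strategy='mean',
    imputation_order='ascending', skip_complete=False,
    min_value=-np.inf, max_value=np.inf, verbose=0, random_state=0)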
Example #12
def replace_mice(method):
    # Note: `path` and `var` are module-level globals in the original source;
    # the `method` parameter is unused here.
    train_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    # Drop non-numeric columns; IterativeImputer needs numeric input.
    for i in train_df.select_dtypes(include=['object']).columns:
        train_df = train_df.drop([i], axis=1)
    inx = list(train_df.columns).index(var)  # column position of the target variable
    n_imputations = 10
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=n_imputations, sample_posterior=True, random_state=i)
        XY_completed.append(imputer.fit_transform(train_df.to_numpy()))
    XY_completed = np.mean(XY_completed, 0)
    XY_completed = np.round(XY_completed)
    new_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    data_null_len = len(new_df[new_df[var].isnull()])
    for i in range(data_null_len):
        xx = train_df[train_df[var].isnull()].index[i]
        new_df.loc[xx, var] = abs(XY_completed[xx][inx])
    return new_df
Example #13
def datainput(self):
    full_data = pd.read_csv(self.file, header=0)
    print('\nMissing values for each column')
    print(full_data.isnull().sum())  # number of missing values per column
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_n = full_data.select_dtypes(include=numerics)
    col_names = list(df_n.columns)
    df_c = full_data.select_dtypes(exclude=numerics)
    ipt = input('\nAre there any missing values? (y/n) : ')
    if ipt == 'y':
        ct = input('Are there any non-numeric missing values? (y/n) : ')
        if ct == 'y':
            full_data = full_data.dropna()  # dropna() is not in-place; assign the result
        else:
            impute = IterativeImputer()
            df_n = impute.fit_transform(df_n)  # impute missing numeric values
            df_n = pd.DataFrame(df_n)
            df_n.columns = col_names
            full_data = pd.concat([df_n, df_c], axis=1)
        print('\nMissing values after processing')
        print(full_data.isnull().sum())
    train, test = train_test_split(full_data, test_size=0.3, shuffle=False)  # no shuffle: keep row order
    return train, test
Example #14
    all_data[col_name] = most_common_imputed[col_name]
nom_df = pd.get_dummies(all_data[nominal_columns], prefix=nominal_columns)

for col_name in nom_df.columns:
    all_data[col_name] = nom_df[col_name]

all_data = all_data.drop(columns=nominal_columns)

print(all_data)
from fancyimpute import IterativeImputer


MICE_imputer = IterativeImputer()
ordinal_mice = all_data.copy(deep=True)

ordinal_mice.iloc[:,:] = np.round(MICE_imputer.fit_transform(ordinal_mice))

for col_name in ordinal_columns:
    all_data[col_name] = ordinal_mice[col_name]

for col_name in numeric_columns:
    all_data[col_name] = ordinal_mice[col_name]


if all_data.isnull().values.any():
    print("Oh, come on!")
    print("GOSHHH!!!!!")
    print("Breakdown loading...")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
Example #15
    def __init__(self):
        super().__init__(chapter_id="PROJ01_dual_single",
                         to_data_path="dual_single",
                         target_field="EPSI")
        file_from_ = "dual_single.csv"
        file_to_ = "dual_single.csv.gz"
        # get data ---
        csv_path = os.path.join(self.TO_DATA_PATH, file_from_)
        if not os.path.isfile(csv_path):
            self.DUAL_SINGLE_URL = "https://github.com/amosbaranes/ml_data/raw/master/dual_single.csv.gz"
            self.fetch_tgz_data(self.DUAL_SINGLE_URL, file_from_, "gz")
        self.load_csv_data("dual_single")
        self.DATA_SOURCE = self.DATA.iloc[:, 5:19]
        self.DATA_IMPUTED = self.DATA.iloc[:, 19:]

        self.SINGLE_DATA = self.DATA[self.DATA["Type"] == "Single"]
        self.DUAL_DATA = self.DATA[self.DATA["Type"] == "Dual"]
        self.SINGLE_DATA_IMPUTED = self.SINGLE_DATA.iloc[:, 19:]
        self.DUAL_DATA_IMPUTED = self.DUAL_DATA.iloc[:, 19:]

        self.SINGLE_DATA_SOURCE = self.SINGLE_DATA.iloc[:, 5:19]
        self.DUAL_DATA_SOURCE = self.DUAL_DATA.iloc[:, 5:19]
        self.SINGLE_DATA_SOURCE_ZZ = zig_zag_(self.SINGLE_DATA_SOURCE,
                                              a_rows=0.25,
                                              a_col=0.85)
        self.DUAL_DATA_SOURCE_ZZ = zig_zag_(self.DUAL_DATA_SOURCE,
                                            a_rows=0.25,
                                            a_col=0.85)

        mice_impute_s = IterativeImputer()
        self.SINGLE_DATA_SOURCE_ZZI = pd.DataFrame(
            mice_impute_s.fit_transform(self.SINGLE_DATA_SOURCE_ZZ))
        # print(self.SINGLE_DATA_SOURCE_ZZ.columns)
        # print(self.SINGLE_DATA_SOURCE_ZZI.columns)
        try:
            self.SINGLE_DATA_SOURCE_ZZI.columns = self.SINGLE_DATA_SOURCE_ZZ.columns
            file = "SINGLE_DATA_IMPUTED"
            ssr = os.path.join(self.TO_DATA_PATH, file + ".xlsx")  # "housing"
            print(ssr)
            with pd.ExcelWriter(ssr, engine='xlsxwriter') as writer:
                self.SINGLE_DATA_SOURCE_ZZI.to_excel(writer,
                                                     sheet_name="imputed")
                # the context manager saves the file on exit; no explicit save() call is needed
        except Exception as e:
            print(e)
        # print(self.SINGLE_DATA_SOURCE_ZZI)

        mice_impute_d = IterativeImputer()
        self.DUAL_DATA_SOURCE_ZZI = pd.DataFrame(
            mice_impute_d.fit_transform(self.DUAL_DATA_SOURCE_ZZ))
        try:
            self.DUAL_DATA_SOURCE_ZZI.columns = self.DUAL_DATA_SOURCE_ZZ.columns
            file = "DUAL_DATA_IMPUTED"
            ssr = os.path.join(self.TO_DATA_PATH, file + ".xlsx")  # "housing"
            print(ssr)
            with pd.ExcelWriter(ssr, engine='xlsxwriter') as writer:
                self.DUAL_DATA_SOURCE_ZZI.to_excel(writer,
                                                   sheet_name="imputed")
                # the context manager saves the file on exit; no explicit save() call is needed
        except Exception as e:
            print(e)
Example #16
import pandas as pd
import missingno as msno
import statsmodels.api as sm
from fancyimpute import IterativeImputer

df = pd.read_csv('pima.csv')
msno.bar(df)
msno.heatmap(df)

df_cleaned = df.dropna(subset=['Diastolic_BP', 'BMI', 'Glucose'])
df_noNa = df.dropna()
y_noNa = df_noNa['Class']
X_noNa = df_noNa.iloc[:, :-1]
lm_noNa = sm.OLS(y_noNa, X_noNa).fit()
R2 = pd.Series([lm_noNa.rsquared_adj, 0, 0], index=['noNa', 'MICE', 'medians'])

df_MICE = df_cleaned.copy(deep=True)
MICE_imputer = IterativeImputer()
df_MICE.iloc[:, :] = MICE_imputer.fit_transform(df_MICE)

y_MICE = df_MICE['Class']
X_MICE = df_MICE.iloc[:, :-1]
lm_MICE = sm.OLS(y_MICE, X_MICE).fit()
lm_MICE.summary()
R2['MICE'] = lm_MICE.rsquared_adj

df_medians = df_cleaned.copy(deep=True)
df_medians.loc[df_medians['Serum_Insulin'].isna(),
               'Serum_Insulin'] = df_medians['Serum_Insulin'].median()
df_medians.loc[df_medians['Skin_Fold'].isna(),
               'Skin_Fold'] = df_medians['Skin_Fold'].median()
y_medians = df_medians['Class']
X_medians = df_medians.iloc[:, :-1]
lm_medians = sm.OLS(y_medians, X_medians).fit()
R2['medians'] = lm_medians.rsquared_adj  # complete the comparison set up above
Example #17
    'q6.14_ANGER_pcl2', 'q6.15_CONC_pcl2', 'q6.16_HYPER_pcl2',
    'q6.17_STRTL_pcl2', 'intrusion_pcl2', 'avoidance_pcl2',
    'hypertention_pcl2', 'depression_pcl2', 'tred_pcl2'
]
target_features = ["PCL_Strict3", "PCL3"]

ID = ["ID"]

path = "C:\‏‏PycharmProjects\PTSD\Data\PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df['PCL_Strict3'].isna()]

df = df[features + ID + target_features]

mice = IterativeImputer()
df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)

extra_features = 0
if extra_features:
    df_pcl3 = pd.read_excel(
        r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
    df_pcl3 = PCL_calculator(df_pcl3)
    df_pcl2 = pd.read_excel(
        r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
    df_pcl2 = PCL_calculator(df_pcl2)
    df_pcl1 = pd.read_excel(
        r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
    df_pcl1 = PCL_calculator(df_pcl1)

    df = df.merge(df_pcl1, on="ID", how='outer')
    df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
Example #18
    def __init__(self):
        path = "C:\‏‏PycharmProjects\PTSD\Data\PTSD.xlsx"
        df = pd.read_excel(path)
        df = df[~df['PCL_Strict3'].isna()]
        df = df[~((df["military_exp18_t3"] == 0) &
                  (df["military_exp18_t2"] == 0))]
        df = df[self.features + self.ID + self.target_features]
        df_pcl3 = pd.read_excel(
            r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
        df_pcl3 = PCL_calculator(df_pcl3)
        df_pcl2 = pd.read_excel(
            r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
        df_pcl2 = PCL_calculator(df_pcl2)
        df_pcl1 = pd.read_excel(
            r"C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
        df_pcl1 = PCL_calculator(df_pcl1)

        df = df.merge(df_pcl1, on="ID", how='outer')
        df = df.merge(df_pcl2,
                      suffixes=('_pcl1', '_pcl2'),
                      on="ID",
                      how='outer')
        df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'],
                                   axis=1),
                      on="ID",
                      how='outer')

        df = df[~df['PCL_Strict3'].isna()]
        #df = df[~df['tred_cutoff'].isna()]
        df.drop(self.ID, inplace=True, axis=1)
        if mew:  # `mew` is a module-level flag in the original source, not defined in this snippet
            mice = IterativeImputer()
            df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)

        all_x_col = self.features + self.features_2 + self.target_features_2
        #all_x_col = self.features + self.features_2
        #y_col = ["tred_cutoff"]
        y_col = ["PCL_Strict3"]
        X = df[all_x_col]
        Y = df[y_col]
        if mew:
            X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
                X, Y, test_size=0.25, random_state=271828, stratify=Y)
            X_train, X_test, y_train, y_test = train_test_split(
                X_train_0,
                y_train_0,
                test_size=0.25,
                random_state=271828,
                stratify=y_train_0)
            df = pd.concat([X_train, y_train], axis=1)
            self.X_test = X_test
            self.y_test = y_test

            self.X_train_0 = X_train_0
            self.X_test_0 = X_test_0
            self.y_train_0 = y_train_0
            self.y_test_0 = y_test_0
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, Y, test_size=0.25, random_state=271828, stratify=Y)
            df = pd.concat([X_train, y_train], axis=1)
            self.X_test = X_test
            self.y_test = y_test

        self.df = df
Example #19
# dialysis_mod1: 0.53
# insurance_esrd: 7.80
# Mortality_Rate_Facility: 1.30
# Hospitalization_Rate_facility: 1.09
# NEAR_DIST: 21.58
# nephcare_cat2: 36.99
# rucc_metro: 0.90

# Random forest imputation for categorical variables
# (`n_iter` and `predictor` are kwargs from the older fancyimpute API;
# scikit-learn's IterativeImputer uses `max_iter` and `estimator` instead)
for var in ['dialysis_mod1', 'insurance_esrd', 'rucc_rural', 'nephcare_cat2']:
    pred = ['sex_new', 'age_cat', 'race_new', var]
    imputer = IterativeImputer(
        n_iter=1,
        random_state=7,
        predictor=RandomForestClassifier(n_estimators=10))
    imputed = pd.DataFrame(imputer.fit_transform(d[pred]), columns=pred)
    d = d.drop(var, axis=1).join(imputed[var])

# Bayesian Ridge linear imputation for continuous
for var in [
        'Hospitalization_Rate_facility', 'Mortality_Rate_Facility', 'NEAR_DIST'
]:
    completed = []
    for i in range(5):
        pred = ['sex_new', 'age_cat', 'race_new', var]
        imputer = IterativeImputer(n_iter=5,
                                   sample_posterior=True,
                                   random_state=i)
        completed.append(imputer.fit_transform(d[pred]))
    completed_mean = np.mean(completed, axis=0)
    imputed = pd.DataFrame(completed_mean, columns=pred)
    d = d.drop(var, axis=1).join(imputed[var])  # write the pooled column back, mirroring the categorical loop
Example #20
review.head(3)


# Import IterativeImputer from fancyimpute
from fancyimpute import IterativeImputer

# Copy review to review_mice_imputed
review_mice_imputed = review.copy(deep=True)

# Initialize IterativeImputer
mice_imputer = IterativeImputer()

# Impute using fit_transform on review
review_mice_imputed.iloc[:, :] = mice_imputer.fit_transform(review)

#rounding off the imputed data
review_mice_imputed.review_scores_location = round(
    review_mice_imputed.review_scores_location, 0)

#view the data
review_mice_imputed.head(3)


#replacing the null values with the imputed value in the dataset
df3_airbnb.review_scores_location = review_mice_imputed.review_scores_location.copy(
)

Example #21
    # meta_train_loss.backward()
    # opt.step()
    learner = maml.clone()

    x_support, x_query, y_support, y_query = train_test_split(x_train,
                                                              y_train,
                                                              test_size=0.25,
                                                              stratify=y_train)

    ss = StandardScaler()
    x_support = ss.fit_transform(x_support)
    x_query = ss.transform(x_query)
    x_test = ss.transform(x_test)

    mice = IterativeImputer(max_iter=1000)
    x_support = mice.fit_transform(x_support)
    x_query = mice.transform(x_query)  # reuse the imputer fitted on the support set
    x_test = mice.transform(x_test)    # rather than refitting on query/test data
    for _ in range(adapt_steps):  # adaptation_steps
        support_preds = learner(torch.from_numpy(x_support).float())
        support_loss = lossfn(
            support_preds.float(),
            torch.from_numpy(y_support.values.reshape(-1, 1))).float()
        learner.adapt(support_loss)

    query_preds = learner(torch.from_numpy(x_query).float())
    query_loss = lossfn(query_preds.float(),
                        torch.from_numpy(y_query.values.reshape(-1, 1)).float())
    meta_train_loss += query_loss

    opt.zero_grad()
    meta_train_loss.backward()
Example #22
def reconstruct(dataset, mask):
    print('Reconstructing using MICE...')

    # train_data = dataset.orig_ds['train_X']
    # mask = dataset.miss_masks[config_idx]['train_X']

    (datasetLen, dim) = np.shape(dataset)
    train_data = dataset.copy()
    incomplete_dataset = np.zeros((datasetLen, dim))

    # IterativeImputer requires corrupted entries to be marked as NaN, so use
    # the mask to replace every zero entry of the input dataset with NaN.
    for i in range(datasetLen):
        frame = train_data.loc[i, :]
        ms = mask.loc[i, :]
        ms.values[ms.values == 0] = np.nan
        incomplete_dataset[i] = frame.values * ms.values

    incomplete_dataset = pd.DataFrame(incomplete_dataset)

    n_imputations = 5
    reconstructed_dataset = []
    # Applying IterativeImputer repeatedly to the same dataset with
    # sample_posterior=True and different seeds replicates the MICE algorithm.
    for i in tqdm(range(n_imputations)):
        imputer = IterativeImputer(n_iter=10,
                                   sample_posterior=True,
                                   random_state=i)
        reconstructed_dataset.append(imputer.fit_transform(incomplete_dataset))

    reconstructed_dataset_mean = np.mean(reconstructed_dataset, axis=0)
    # Between-imputation spread: a useful uncertainty estimate, though not returned here.
    reconstructed_dataset_std = np.std(reconstructed_dataset, axis=0)

    return pd.DataFrame(reconstructed_dataset_mean)
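A hedged usage sketch for reconstruct (the toy data, corruption rate, and float mask are illustrative):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
data = pd.DataFrame(rng.rand(100, 8))
mask = pd.DataFrame((rng.rand(100, 8) > 0.2).astype(float))  # 1.0 = observed, 0.0 = corrupted
completed = reconstruct(data * mask.values, mask)             # mean of 5 MICE-style imputations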


## DEBUG TOOLS ##
# import reconstruct as rc
# import matplotlib.pyplot as plt
# if __name__ == "__main__":
#     original_dataset, incomplete_dataset, mask = rc.get_dataset(mode='MCAR', n_samples=100)
#
#     original_dataset = pd.DataFrame(original_dataset)
#     incomplete_dataset = pd.DataFrame(incomplete_dataset)
#     mask = pd.DataFrame(mask)
#
#     reconstructed_dataset = reconstruct(incomplete_dataset, mask)
#
#     inc = incomplete_dataset.loc[0,:]
#     rec = reconstructed_dataset.loc[0,:]
#     orig = original_dataset.loc[0,:]
#
#     print(np.shape(inc))
#     print(np.shape(rec))
#     print(np.shape(orig))
#
#     samples = np.vstack([inc, rec, orig])
#     fig = rc.plot(samples)
#     plt.savefig('Multiple_Impute_out1/{}.png'.format(str(0).zfill(3)), bbox_inches='tight')
#     plt.close(fig)

# from sklearn.linear_model import LinearRegression
# import os
# import sys
# projectdir = os.path.dirname(__file__)
# app_path = os.path.join(projectdir, 'scikit-mice')
# sys.path.insert(0, app_path)
# import skmice
#
# from statsmodels.imputation import mice
# import statsmodels.api as sm

# np.set_printoptions(linewidth=115, suppress=False, precision=1, floatmode='fixed')
#
# def gendat():
#     """
#     Create a data set with missing values.
#     """
#
#     np.random.seed(34243)
#
#     n = 20
#     p = 5
#
#     exog = np.random.normal(size=(n, p))
#     exog[:, 0] = exog[:, 1] - exog[:, 2] + 2*exog[:, 4]
#     exog[:, 0] += np.random.normal(size=n)
#     exog[:, 2] = 1*(exog[:, 2] > 0)
#
#     endog = exog.sum(1) + np.random.normal(size=n)
#
#     df = pd.DataFrame(exog)
#     df.columns = ["x%d" % k for k in range(1, p+1)]
#
#     df["y"] = endog
#
#     # df.x1[0:60] = np.nan
#     # df.x2[0:40] = np.nan
#     df.x1[0:5] = np.nan
#     df.x2[15:19] = np.nan
#     df.x3[10:30:2] = np.nan
#     df.x4[20:50:3] = np.nan
#     df.x5[40:45] = np.nan
#     df.y[30:100:2] = np.nan
#
#     return df
#
# def reconstruct2(dataset, mask):
#     incomplete_dataset = np.zeros(np.shape(dataset))
#
#     # IterativeImputer requires corrupted entries to be identified as NaN
#     # Using the mask to replace in the input dataset all zero entries for NaN
#     for i in range(len(dataset)):
#         frame = dataset.loc[i, :]
#         ms = mask.loc[i, :]
#         ms.values[ms.values == 0] = np.nan
#
#         incomplete_dataset[i] = frame.values*ms.values
#
#     incomplete_dataset = pd.DataFrame(incomplete_dataset)
#     incomplete_dataset.columns = map(str, incomplete_dataset.columns.values)
#
#     incomplete_dataset.columns = [item + ':' for item in incomplete_dataset.columns]
#
#     print(incomplete_dataset.columns)
#
#     # sys.exit(0)
#
#     # print(incomplete_dataset)
#
#     reconstructed_dataset = mice.MICEData(incomplete_dataset)
#     # print(np.shape(imp_data))
#     print(np.shape(reconstructed_dataset.data))
#     print(reconstructed_dataset.data)
#
#     # mi = mice.MICE("y ~ x1 + x2 + x1:x2", sm.OLS, reconstructed_dataset)
#     mi = mice.MICE("0", sm.OLS, reconstructed_dataset)
#     results = mi.fit(n_burnin=10, n_imputations=10)
#
#     print(np.shape(reconstructed_dataset.data))
#
#     sys.exit(0)
#
#     return pd.DataFrame(reconstructed_dataset)
#

# if __name__ == "__main__":
#     original_dataset, dataset, mask = rc.get_dataset(mode='MCAR', n_samples=100)
#
#     original_dataset = pd.DataFrame(original_dataset)
#     dataset = pd.DataFrame(dataset)
#     mask = pd.DataFrame(mask)
#
#     incomplete_dataset = np.zeros(np.shape(dataset))
#
#     for i in range(len(dataset)):
#         frame = dataset.loc[i, :]
#         ms = mask.loc[i, :]
#         ms.values[ms.values == 0] = np.nan
#
#         incomplete_dataset[i] = frame.values*ms.values
#
#     print(np.shape(incomplete_dataset))
#     # print(incomplete_dataset[0:1,:].reshape((2, -1)))
#
#     imputer = IterativeImputer(missing_values=np.nan, n_iter=2, sample_posterior=True, random_state=1)
#     # reconstructed_dataset = imputer.fit_transform(incomplete_dataset[0, :].reshape((2, -1)))
#     reconstructed_dataset = imputer.fit_transform(incomplete_dataset)
#     # reconstructed_dataset = imputer.complete(incomplete_dataset)
#
#
#     # print(reconstructed_dataset_mean.shape)
#     print(np.shape(reconstructed_dataset))
#     # print(reconstructed_dataset)
#
#     # fig = rc.plot([reconstructed_dataset])
#     # plt.savefig('Multiple_Impute_out1/{}.png'.format(str(0).zfill(3)), bbox_inches='tight')
#     # plt.close(fig)
#
#     # sys.exit(0)
Example #23
# Import IterativeImputer from fancyimpute
from fancyimpute import IterativeImputer

# Copy diabetes to diabetes_mice_imputed
diabetes_mice_imputed = diabetes.copy(deep=True)

# Initialize IterativeImputer
mice_imputer = IterativeImputer()

# Impute using fit_transform on diabetes
diabetes_mice_imputed.iloc[:, :] = mice_imputer.fit_transform(diabetes)
# Other fancyimpute solvers include SoftImpute, IterativeSVD, SimpleFill, and MatrixFactorization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

import sklearn.datasets as SKD

data = pd.read_csv('ai_mavan_adhd7.csv', sep=',', index_col=None)


# MICE IMPUTATION
mice_impute = IterativeImputer()
traindatafill = mice_impute.fit_transform(adhd)


# KNN way to impute

adhd_filled_knn = KNN(k=3).fit_transform(
    adhd
)  #use 3 nearest rows which have a feature to fill in each row’s missing features


# NUCLEARNOMMINIMIZATION
adhd_filled_nnm = NuclearNormMinimization().fit_transform(adhd)

# %%
# As we can see from the charts above, none of the simple imputation methods we tried works well:
# the red line depicts the imputed values, and they don't follow the pattern of the data.
# We'll need to try some more advanced techniques, such as:

# KNN imputation
# However, since the dataset is very large, the KNN approach is not feasible.

# MICE imputation
# %% markdown
# ### MICE Imputation
# %%
df_2_mice = df_2.copy(deep=True)
mice_imputer = IterativeImputer()
df_2_mice.iloc[:, :] = mice_imputer.fit_transform(df_2_mice)
# %%
df_2_mice.head()
# %%
# Now we need to focus on the latitude and longitude imputations.
# %%
# Even by splitting the data into 3 distinct datasets, it's not possible to use KNN imputation
# for latitude and longitude due to the dataset size and hardware limitations.
# Furthermore, these columns have over 50% missing data, so the best approach here is to drop them.
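# A minimal sketch of the drop described above (the column names are assumptions):
df_2 = df_2.drop(columns=['latitude', 'longitude'], errors='ignore')
# Or, generically, keep only columns that are at most 50% missing:
df_2 = df_2.loc[:, df_2.isnull().mean() <= 0.5]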
# %%
# We will evaluate which model performed best on df_2 imputations, select it, and concatenate the datasets df_1 and df_2.
# %%
# Plot graphs of imputed DataFrames and the complete case
df_2['price'].plot(kind='kde', c='red', linewidth=3)
df_2_mean['price'].plot(kind='kde')
df_2_median['price'].plot(kind='kde')
for col in label_list:  # encode data leaving nan as they are
    label_column = pd.DataFrame(features[col].values)
    temp_labels = pd.Series(
        [i for i in label_column.iloc[:, 0].unique() if type(i) == str])
    labelencoder_X.fit(temp_labels)
    features[col] = features[col].map(
        lambda x: labelencoder_X.transform([x])[0] if type(x) == str else x)

# Multiple Imputation to fill nan values
from fancyimpute import IterativeImputer
import missingno as msno
import matplotlib.pyplot as plt

msno.bar(features, figsize=(12, 6), fontsize=12, color='steelblue')
mice = IterativeImputer()
data = pd.DataFrame(data=mice.fit_transform(features),
                    columns=features.columns,
                    index=features.index)

# checking if there is no null value anymore
data.isnull().values.any()

# Drop rows where Year.Built < 0 or Year.Built > 2019
data = data[~(data['Year.Built'] < 0) & ~(data['Year.Built'] > 2019)]

# Calculate age of house
data['Age_House'] = 2019 - data['Year.Built']

# Move Target variable at the end
data = data[[c for c in data if c not in ['Sale.Price']] + ['Sale.Price']]