def _knn_impute(self):
    for col in self.target:
        k_impute = impute.KNNImputer()
        # KNNImputer expects a 2-D array, so reshape the single column
        imputed = k_impute.fit_transform(self.df[col].values.reshape(-1, 1))
        self.output_df.loc[:, col] = imputed.ravel()
    return self.output_df
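Note that imputing one column at a time gives KNNImputer only a single feature to measure neighbor distances with, so it largely degenerates to mean imputation. A minimal whole-frame sketch (the toy df is an assumption, not part of the example above) that lets the imputer use every feature when searching for neighbors:

import pandas as pd
from sklearn import impute

df = pd.DataFrame({"a": [1.0, None, 3.0, 4.0],
                   "b": [4.0, 5.0, None, 7.0]})
# Impute all columns jointly so neighbor distances can use every feature
imputed = pd.DataFrame(impute.KNNImputer(n_neighbors=2).fit_transform(df),
                       columns=df.columns)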
Example #2
0
def fixData(trainFileName,
            testFileName,
            features,
            imputer="simple",
            strategy="mean"):

    print("Fixing Data\n")  #Read files into pandas array
    training_data = pd.read_csv(trainFileName)
    testing_data = pd.read_csv(testFileName)

    featuresForDummies = ["Embarked", "Sex"]

    trainSurvived = training_data["Survived"]
    passengerID = testing_data["PassengerId"]

    features2 = list(features)  # Copy of the selected feature names

    training_data = training_data[features2]
    testing_data = testing_data[features2]

    tr_data = pd.get_dummies(
        training_data,
        columns=featuresForDummies)  # One-hot encode the categorical columns
    te_data = pd.get_dummies(testing_data, columns=featuresForDummies)

    if imputer.lower() == "simple":
        imp = impute.SimpleImputer(missing_values=np.nan,
                                   strategy=strategy)
    elif imputer.lower() == "knn":
        imp = impute.KNNImputer(missing_values=np.nan)
    elif imputer.lower() == "iterative":
        # Requires: from sklearn.experimental import enable_iterative_imputer
        imp = impute.IterativeImputer(missing_values=np.nan,
                                      initial_strategy=strategy)
    else:
        raise ValueError(
            'Unknown imputation method; valid choices are "simple", "knn" '
            'and "iterative".')

    # Fit on the training data only, then transform both sets, so that
    # test-set statistics never leak into the imputer (assumes both frames
    # share the same dummy columns)
    imp.fit(tr_data)
    dummied_train = imp.transform(tr_data)
    dummied_test = imp.transform(te_data)

    return (dummied_test, dummied_train, trainSurvived, passengerID)
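A hypothetical call for the Titanic-style files this function expects (the file names and feature list are assumptions for illustration):

X_test, X_train, y_train, ids = fixData(
    "train.csv", "test.csv",
    ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"],
    imputer="knn")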
Example #3
0
    def fit(self, X, y=None):

        X = X.copy()

        columns = X.columns.values
        indices = X.index

        # We already handled this in the preprocessing notebook: we do not
        # want null values to claim encoding codes in the middle of the
        # dataset; we want a contiguous range of integer codes, with no
        # holes, for the imputer to work with. This matters for both the KNN
        # and the Iterative imputer: the iterative imputer works with
        # continuous values (possibly fractional), so its output is rounded
        # at the end, and KNN, although it sees integer codes, imputes the
        # average of the n nearest neighbors, which can again be fractional.
        # So that rounding can never land on the code reserved for nulls, we
        # feed the ordinal encoder the null values right at the start.
        null_values = pd.DataFrame(index=pd.Index([-1]),
                                   columns=columns,
                                   data=[[np.nan] * len(columns)])
        X = pd.concat([null_values, X])

        self.ordinal_encoder = ce.ordinal.OrdinalEncoder(
            handle_missing="return_nan", handle_unknown="return_nan")
        X = self.ordinal_encoder.fit_transform(X)

        X = X[1:]  # drop the sentinel row again

        if self.imputer_type == "knn":
            self.imputer = impute.KNNImputer()
            self.imputer.fit(X)  # fit() returns the estimator, not data

        elif self.imputer_type == "iterative":

            # Requires: from sklearn.experimental import enable_iterative_imputer
            self.imputer = impute.IterativeImputer(
                max_iter=20,
                random_state=42,
                initial_strategy="most_frequent",
                min_value=X.min(),
                max_value=X.max())

            try:
                self.imputer.fit(X)
            except (ValueError, np.linalg.LinAlgError):
                print(
                    "One error was caught where the function complained about NaNs.",
                    "Oddly, it only complains the first time; retrying works...")
                self.imputer.fit(X)

        return self
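The comment block above describes prepending a sentinel all-NaN row so the ordinal encoder reserves the missing-value code up front. A standalone sketch of the same trick (the toy columns are assumptions):

import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn import impute

X = pd.DataFrame({"color": ["red", None, "blue", "red"],
                  "size": ["S", "M", "M", None]})
# Sentinel all-NaN row, fed to the encoder first (see the comment above)
sentinel = pd.DataFrame([[np.nan, np.nan]], columns=X.columns, index=[-1])
encoder = ce.ordinal.OrdinalEncoder(handle_missing="return_nan",
                                    handle_unknown="return_nan")
encoded = encoder.fit_transform(pd.concat([sentinel, X]))[1:]  # drop sentinel
filled = impute.KNNImputer(n_neighbors=1).fit_transform(encoded)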
Example #4
0
def _get_preprocessor(
    num_features: List[str], cat_features: List[str]
) -> compose.ColumnTransformer:

    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])

    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])

    preprocessor = compose.ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])
    return preprocessor
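To see where the preprocessor slots in, a hypothetical usage sketch (the feature names and model choice are assumptions; Example #6 below wires the same preprocessor into a pipeline):

from sklearn import linear_model, pipeline

model = pipeline.Pipeline([
    ("preprocess", _get_preprocessor(["age", "fare"], ["sex", "embarked"])),
    ("model", linear_model.LogisticRegression()),
])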
Example #5
0
    def data_imputation(self):
        """Impute missing values with a KNN imputer"""

        # KNN Imputer
        imp1 = impute.KNNImputer(n_neighbors=5,
                                 weights='uniform',
                                 metric='nan_euclidean')
        self.training_data_X = imp1.fit_transform(self.training_data_X)
        imp2 = impute.KNNImputer(n_neighbors=5,
                                 weights='uniform',
                                 metric='nan_euclidean')
        self.testing_data_X = imp2.fit_transform(self.testing_data_X)
        # np.savetxt("C:/Users/lihanmin/Desktop/data_processing/temp1.csv", self.training_data_X, delimiter=',')

        # Simple Imputer with 'mean' strategy
        # imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
        # self.training_data_X = imp.fit_transform(self.training_data_X)
        # np.savetxt("C:/Users/lihanmin/Desktop/data_processing/temp2.csv", self.training_data_X, delimiter=',')

        if WRITE_TO_CSV:
            # Write Into CSV
            training_set_temp = np.concatenate(
                (self.training_data_Y, self.training_data_X), axis=1)
            training_set = np.concatenate(
                (self.ar_training_set[:, 0:2], training_set_temp), axis=1)
            test_set_temp = np.concatenate(
                (self.testing_data_Y, self.testing_data_X), axis=1)
            test_set = np.concatenate(
                (self.ar_testing_set[:, 0:2], test_set_temp), axis=1)
            train_num = len(training_set)
            test_num = len(test_set)
            # print(train_num, test_num)
            # Map the numeric sex column back to text:
            # 0 -> '女' (female), otherwise '男' (male)
            for i in range(train_num):
                if training_set[i][7] == 0:
                    training_set[i][7] = '女'
                else:
                    training_set[i][7] = '男'
            for j in range(test_num):
                if test_set[j][7] == 0:
                    test_set[j][7] = '女'
                else:
                    test_set[j][7] = '男'
            # print(pd.__version__)
            # print(type(self.df_org_data.columns))

            # Column names from the original DataFrame
            col_names = list(self.df_org_data.columns)

            # Merge
            if MERGE_TRAIN_TEST:
                total_length = len(self.ar_org_data)
                total_data = []
                idx_train = 0
                idx_test = 0
                for i in range(total_length):
                    # Every third row went to the test split, so interleave
                    # the two splits to restore the original row order
                    if i % 3 == 0:
                        total_data.append(test_set[idx_test])
                        idx_test += 1
                    else:
                        total_data.append(training_set[idx_train])
                        idx_train += 1
                total_data = pd.DataFrame(total_data)
                total_data.columns = col_names
                total_file = 'C:/Users/lihanmin/Desktop/data_processing/Imputed_data.csv'
                total_data.to_csv(total_file,
                                  sep=',',
                                  encoding='gbk',
                                  index=False)
                exit()

            # Otherwise write the train and test splits separately
            test_set = pd.DataFrame(test_set)
            training_set = pd.DataFrame(training_set)

            # Set name
            test_set.columns = col_names
            training_set.columns = col_names

            # Write
            train_file = 'C:/Users/lihanmin/Desktop/data_processing/train.csv'
            test_file = 'C:/Users/lihanmin/Desktop/data_processing/test.csv'
            test_set.to_csv(test_file, sep=',', encoding='gbk', index=False)
            training_set.to_csv(train_file,
                                sep=',',
                                encoding='gbk',
                                index=False)
            exit()

        # Normalization
        self.training_data_X = preprocessing.scale(self.training_data_X)
        self.testing_data_X = preprocessing.scale(self.testing_data_X)
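Note the ordering here: values are imputed first and standardized afterwards, whereas the pipelines in Examples #4 and #6 scale before the KNNImputer. Since the imputer's nan_euclidean distance is scale-sensitive, scaling first keeps wide-range features from dominating the neighbor search.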
Example #6
0
from sklearn import impute, preprocessing, compose, pipeline, linear_model, multioutput
from typing import List

def _get_preprocessor(
    num_features: List[str], cat_features: List[str]
) -> compose.ColumnTransformer:
    """
    Returns preprocessing pipeline adapted to specified numerical
    and categorical features
    """

    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])

    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])

    preprocessor = compose.ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])
    return preprocessor

def get_lr_model(
    num_features: List[str], cat_features: List[str], C: float = 1.0
) -> pipeline.Pipeline:
    """
Example #7
0
    def knn_imputer(self, col, neighbors=5):
        imputer = impute.KNNImputer(n_neighbors=neighbors)
        self.impute(col, imputer)
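The self.impute helper is not shown in this excerpt; a minimal sketch of what it might look like (the body is an assumption):

    def impute(self, col, imputer):
        # Hypothetical helper: KNNImputer needs 2-D input, hence df[[col]]
        self.df[col] = imputer.fit_transform(self.df[[col]]).ravel()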
Example #8
0
    #test_object_columns = {key: value for key, value in test_columns.items() if value == "object"}

    #test_object_columns = list(test_object_columns.keys())

    #convert to dummy data
    test_data = pd.get_dummies(test_data,
                               columns=train_object_columns,
                               drop_first=True)

    full_data = pd.concat([train_data, test_data]).drop("SalePrice", axis=1)

    #Apply imputer to fill missing values:
    #imputer = IterativeImputer(max_iter=10, random_state=0)

    imputer = impute.KNNImputer(n_neighbors=5)

    #imputed_data = imputer.fit_transform(test_data.values)

    #test_data = pd.DataFrame(imputed_data, columns = test_data.columns)

    imputed_data = imputer.fit_transform(full_data.values)

    test_data = pd.DataFrame(
        imputed_data[train_data.shape[0]:, :],
        columns=full_data.columns)

    # The first train_data.shape[0] rows of the imputed block are the
    # training split; the remainder is the test split above
    train_data = pd.DataFrame(imputed_data[:train_data.shape[0], :],
                              columns=full_data.columns)

    train_data["SalePrice"] = save_prices  # *10e8
Example #9
0
import pandas as pd
from sklearn import impute

if __name__ == "__main__":
    imputer = impute.KNNImputer(copy=False)
    df_train = pd.read_csv("../input/train.csv")
    train_len = df_train.shape[0]
    df_test = pd.read_csv("../input/test.csv")
    test_len = df_test.shape[0]
    df = pd.concat([df_train, df_test], ignore_index=True)

    # Impute every feature column (everything except id and target) in place
    features = df.drop(['id', 'target'], axis=1).columns
    df.loc[:, features] = imputer.fit_transform(df[features])
Example #10
0
import numpy as np
from sklearn import impute

# create a random numpy array with 10 samples
# and 6 features, with values ranging from 1 to 14
X = np.random.randint(1, 15, (10, 6))

# convert the array to float
X = X.astype(float)

# randomly assign 10 elements to NaN (missing)
X.ravel()[np.random.choice(X.size, 10, replace=False)] = np.nan

# use 2 nearest neighbours to fill NaN values
knn_imputer = impute.KNNImputer(n_neighbors=2)
X_imputed = knn_imputer.fit_transform(X)
print(X)
print(X_imputed)
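By default KNNImputer measures neighbors with the nan_euclidean metric, which skips coordinates that are missing in either row, and fills each gap with the plain average of the k neighbors' values; passing weights="distance" makes closer rows count more.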
Example #11
0
    data['Margin'].update(pd.Series(Margin[0]))
    data['Density'].update(pd.Series(Density[0]))
    data['Severity'].update(pd.Series(Severity[0]))
    # create the LabelEncoder object
    le = preprocessing.LabelEncoder()
    # convert the categorical columns into numeric
    data["Severity"] = le.fit_transform(data["Severity"])
    data["Shape"] = le.fit_transform(data["Shape"])

if preprocesado == 3:
    # create the LabelEncoder object
    le = preprocessing.LabelEncoder()
    # convert the categorical columns into numeric
    data["Severity"] = le.fit_transform(data["Severity"])
    data["Shape"] = le.fit_transform(data["Shape"])
    # KNNImputer expects a 2-D (n_samples, n_features) array; wrapping each
    # column as [values] would impute single-row matrices, so impute all
    # the columns together instead
    imputer = impute.KNNImputer()
    cols = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
    data[cols] = imputer.fit_transform(data[cols])

print(data.shape)