def feature_engineered(df):
    """Pipeline that keeps only engineered features.

    Runs every feature-engineering helper, drops the raw source columns
    the engineered features were derived from, then removes IQR outliers
    on the engineered ratio features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw marketing dataset.

    Returns
    -------
    pandas.DataFrame
        Frame containing only engineered features.
    """
    # use only feature engineered stuff
    df = feature_engineering.drop_useless_columns(df)
    df = encode_days_as_costumer(df)
    df = feature_engineering.partner_binary(df)
    df = feature_engineering.responsiveness_share(df)
    df = feature_engineering.alcoholic(df)
    df = feature_engineering.income_housemember(df)
    df = feature_engineering.kids_home(df)
    df = feature_engineering.income_share(df)
    df = feature_engineering.veggie(df)
    df = feature_engineering.phd(df)
    df = feature_engineering.ave_purchase(df)
    df = feature_engineering.tutti_frutti(df)
    # Drop the raw columns now that engineered features replace them.
    # NOTE: the original also passed axis=1 alongside columns=; pandas
    # silently ignores axis when columns= is given, so it was removed.
    df = df.drop(columns=[
        "Year_Birth", "Income", "MntWines", "MntFruits",
        "MntMeatProducts", "MntFishProducts", "MntSweetProducts",
        "MntGoldProds", "NumDealsPurchases", "NumWebPurchases",
        "NumCatalogPurchases", "NumStorePurchases", "NumWebVisitsMonth",
        "Dt_Customer", "Recency", "Education", "Marital_Status",
        "Kidhome", "Teenhome", "AcceptedCmp3", "AcceptedCmp4",
        "AcceptedCmp5", "AcceptedCmp1", "AcceptedCmp2", "Complain",
    ])
    # Trim extreme values on the engineered ratio features.
    df = outlier_IQR(
        df, columns=["income_housemember", "income_share", "ave_purchase"])
    return df
def box_cox_pipeline(df):
    """Preprocess the frame and append power-transformed feature copies.

    After basic cleaning/encoding, every transformation in the dictionary
    is applied to every numeric column and stored as a new column named
    ``<feature><transformation>`` (e.g. ``Incomelog``), so the best
    transform can be selected downstream. Non-finite results such as
    ``log(0) -> -inf`` are replaced with the sentinel value -5.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw marketing dataset.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the additional transformed columns.
    """
    # delete unwanted columns
    df = feature_engineering.drop_useless_columns(df)
    df = encode_education(df)
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = impute_income_KNN(df)
    df = encode_days_as_costumer(df)

    bx_cx_trans_dict = {
        "log": np.log,
        "sqrt": np.sqrt,
        "exp": np.exp,
        "**1/4": lambda x: np.power(x, 0.25),
        "**2": lambda x: np.power(x, 2),
        "**4": lambda x: np.power(x, 4),
    }
    # numeric columns eligible for power transformation
    columns = [
        "Income", "Kidhome", "Teenhome", "MntWines", "MntFruits",
        "MntMeatProducts", "MntFishProducts", "MntSweetProducts",
        "MntGoldProds", "NumDealsPurchases", "NumWebPurchases",
        "NumCatalogPurchases", "NumStorePurchases", "NumWebVisitsMonth",
        "Recency",
    ]
    # perform power transformations and append each candidate column
    for feature in columns:
        for trans_key, trans_func in bx_cx_trans_dict.items():
            feature_trans = np.round(trans_func(df[feature]), 4)
            # Replace -inf/inf/NaN produced on edge values with a sentinel
            # (idiomatic ~mask instead of the original `== False` compare).
            feature_trans.loc[~np.isfinite(feature_trans)] = -5
            # feature and trans_key are already strings — no str() needed
            df[feature + trans_key] = feature_trans
    return df
def chop_off(df):
    """Cleaning pipeline that simply removes problem rows.

    Drops useless columns and NaN rows, removes anomalous marital
    statuses and IQR outliers in the numeric columns, then one-hot
    encodes the remaining categoricals.
    """
    # remove columns that carry no information
    df = feature_engineering.drop_useless_columns(df)
    # discard rows with missing values
    df = df.dropna()
    df = encode_days_as_costumer(df)
    # strip nonsensical marital-status labels
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    numeric_columns = [
        "Year_Birth", "Income", "MntWines", "MntFruits",
        "MntMeatProducts", "MntFishProducts", "MntSweetProducts",
        "MntGoldProds", "NumDealsPurchases", "NumWebPurchases",
        "NumCatalogPurchases", "NumStorePurchases", "NumWebVisitsMonth",
        "Recency",
    ]
    # cut away extreme values via the interquartile-range rule
    df = outlier_IQR(df, columns=numeric_columns)
    # one-hot encode both categorical columns
    for categorical in ("Marital_Status", "Education"):
        df = one_hot_encoding(df, columns=[categorical])
    # cutoff based on chi-squared test
    return df
def small_pipeline(df):
    """Minimal pipeline: impute income, drop child counts, clean and
    one-hot encode the categorical columns."""
    df = impute_income_KNN(df)
    # household composition columns are not used in this variant
    df = df.drop(["Kidhome", "Teenhome"], axis=1)
    df = feature_engineering.drop_useless_columns(df)
    df = encode_days_as_costumer(df)
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    for categorical in ("Marital_Status", "Education"):
        df = one_hot_encoding(df, columns=[categorical])
    return df
def joris_preprocessing_pipeline(df):
    """Preprocessing variant combining imputation, anomaly removal,
    one-hot encoding and several engineered share/ratio features."""
    df = impute_income_KNN(df)
    df = feature_engineering.partner_binary(df)
    df = feature_engineering.income_housemember(df)
    # drop nonsensical marital-status labels before encoding
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    for categorical in ("Marital_Status", "Education"):
        df = one_hot_encoding(df, columns=[categorical])
    df = encode_days_as_costumer(df)
    df = feature_engineering.drop_useless_columns(df)
    df = replace_income(df)
    # engineered ratio features
    df = feature_engineering.responsiveness_share(df)
    df = feature_engineering.ave_purchase(df)
    df = feature_engineering.income_share(df)
    return df
def simple_pipeline(df):
    """Basic preprocessing pipeline.

    Drops useless columns, collapses rare marital statuses, encodes the
    customer-tenure column, removes NaN rows, and one-hot encodes the
    categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw marketing dataset.

    Returns
    -------
    pandas.DataFrame
        Cleaned, encoded frame.
    """
    # delete unwanted columns
    df = feature_engineering.drop_useless_columns(df)
    # treatment weird values
    df = marital_others(df)
    df = encode_days_as_costumer(df)
    # check for nan
    df = df.dropna()
    # look at extreme values
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])
    # NOTE(fix): the original called encode_days_as_costumer a second
    # time here; the duplicate was removed — re-encoding an already
    # encoded column is at best a no-op and at worst corrupts it.
    return df
def bin_it_preprocessing_pipeline(df):
    """Preprocessing variant that additionally discretises the income
    and spending columns into five bins each."""
    df = impute_income_KNN(df)
    df = feature_engineering.partner_binary(df)
    df = feature_engineering.income_housemember(df)
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])
    df = encode_days_as_costumer(df)
    df = feature_engineering.drop_useless_columns(df)
    df = replace_income(df)
    df = feature_engineering.responsiveness_share(df)
    df = feature_engineering.ave_purchase(df)
    df = feature_engineering.income_share(df)
    # bin each monetary column into 5 buckets
    for column in ("Income", "MntWines", "MntFruits", "MntMeatProducts",
                   "MntFishProducts", "MntSweetProducts", "MntGoldProds"):
        df = Binning_Features(df, column, n_bins=5)
    return df
def morten_preprocessing_pipeline(df):
    """
    One-Version of a Preprocessing Pipeline.
    Decisions are justified in Data_CLeaning.ipynb.
    """
    df = remove_birthyear(df, 1940)
    df = missing_imputer(df, "Income", "median")
    # hard caps for the heavy-tailed spending columns
    for column, cap in (("MntSweetProducts", 210),
                        ("MntMeatProducts", 1250),
                        ("MntGoldProds", 250)):
        df = outlier_cutoff(df, column, cap)
    # clamp purchase/visit counts at their plausible maxima
    for column, bound, value in (("NumWebPurchases", 11, 11),
                                 ("NumCatalogPurchases", 11, 11),
                                 ("NumWebVisitsMonth", 9, 9)):
        df = outlier_value_imputer(df, column, bound, value)
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = encode_education(df)
    df = feature_engineering.partner_binary(df)
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = encode_days_as_costumer(df)
    df = feature_engineering.drop_useless_columns(df)
    df = feature_engineering.responsiveness_share(df)
    # equivalent to `del df["Complain"]` on the pipeline's working copy
    df = df.drop(columns=["Complain"])
    return df