def clean_train_data(data):
    """
    Initial training data processing
    """

    data = data.reset_index(drop=True)
    train_y = data.iloc[:, -1]
    train_y = train_y.reset_index(drop=True)
    train_X = data.iloc[:, :-1]

    train_X = process_features(train_X)

    encoder = LeaveOneOutEncoder(cols=[
        "Hair Color", "Wears Glasses", "University Degree", "Gender",
        "Country", "Profession"
    ])

    encoder.fit(train_X, train_y)
    data2 = pd.concat([
        encoder.transform(train_X, train_y).reset_index(drop=True),
        train_y.reset_index(drop=True)
    ],
                      axis=1)

    return (data2, encoder)
Example #2
class DFLeaveOneOutEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = LeaveOneOutEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)

        return self

    def transform(self, X):
        return self.__transform(X)

    def __transform(self, X, y=None):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat([
            new_X,
            self.model.transform(X[self.transform_cols]) if y is None else
            self.model.fit_transform(X[self.transform_cols], y)
        ],
                          axis=1)

        return new_X

    def fit_transform(self, X, y):
        # NOTE: Result of fit_transform() is different from fit() + transform()
        return self.fit(X, y).__transform(X, y)
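# A minimal sketch (my addition, not part of the original snippet) of the NOTE above:
# with category_encoders' LeaveOneOutEncoder, fit_transform(X, y) excludes each row's
# own target from its category mean, while fit(X, y) followed by transform(X) uses the
# full category mean, so the two results differ on the training data.
import pandas as pd
from category_encoders import LeaveOneOutEncoder

X_demo = pd.DataFrame({"city": ["a", "a", "b", "b"]})
y_demo = pd.Series([1.0, 0.0, 1.0, 1.0])

enc = LeaveOneOutEncoder(cols=["city"])
loo_values = enc.fit_transform(X_demo, y_demo)   # leave-one-out means, e.g. row 0 -> 0.0
plain_values = enc.transform(X_demo)             # plain category means, e.g. row 0 -> 0.5
print(loo_values["city"].tolist(), plain_values["city"].tolist())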
Example #4
class MineFeatureManager(FeatureManager):
    def __init__(self, num_config=None, categorical_config=None):
        self.num_features = [
            'power', 'kilometer', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_11', 
            'v_12', 'v_13', 'v_14', 'carAge', 'v_10_1', 'v_10_2', 'v_10_3', 'nameEncode', 
            'modelEncode', 'regionCodeEncode', 'gearbox', 'notRepairedDamage', 'seller', 
            'offerType'
        ]
        self.categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'createMon']
        self.encoded_cates = ['name', 'model', 'regionCode']
        self.cate_encoder = LeaveOneOutEncoder(cols=self.encoded_cates)
        self.general_model = None
        super().__init__(self.num_features, self.categorical_features, num_config, categorical_config)

    def _feature_engien(self, features):
        zero_na = {0: np.nan}
        features = features.replace({'power': zero_na, 'v_5': zero_na, 'v_6': zero_na})
        
        features['carAge'] = (features['creatDate'] - features['regDate']).apply(lambda x: x.days)
        features['createMon'] = features['creatDate'].dt.month
        features['notRepairedDamage'] = features['notRepairedDamage'].replace('-', np.nan).astype(float)
        
        features.loc[features['power'] > 600, 'power'] = np.nan
        features['power'] = np.log(features['power'])
        features.loc[features['v_7'] > 0.5, 'v_7'] = np.nan
        features.loc[features['v_11'] > 10, 'v_11'] = np.nan
        features.loc[features['v_13'] > 7.5, 'v_13'] = np.nan
        features.loc[features['v_14'] > 7.5, 'v_14'] = np.nan
        
        features.loc[features['v_10'] <= 0, 'v_10_1'] = features.loc[features['v_10'] <= 0, 'v_10']
        features.loc[(features['v_10'] >= 0) & (features['v_10'] < 6), 'v_10_2'] = features.loc[(features['v_10'] >= 0) & (features['v_10'] < 6), 'v_10']
        features.loc[features['v_10'] > 8, 'v_10_3'] = features.loc[features['v_10'] > 8, 'v_10']
        features.loc[~features['model'].isin(self.general_model), 'model'] = np.nan
        return features
    
    def get_model_features(self, features):
        features = features.copy()
        model_counts = features['model'].value_counts()
        self.general_model = model_counts[model_counts < 2000].index
        encoded_cate = self.cate_encoder.fit_transform(features[self.encoded_cates], features['logPrice'])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)
    
    def transform_feature(self, features):
        features = features.copy()
        encoded_cate = self.cate_encoder.transform(features[self.encoded_cates])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)
Example #6
 def LeaveOneOut_Encoding(self, sigma: float = 0.05):
     """
     Leave-one-out encoding.
     :param sigma: controls the Gaussian noise applied to the encoded training values to reduce overfitting
     :return: None; the configured encoder is stored on self.encoder
     """
     self.encoder = LeaveOneOutEncoder(cols=self.cols, sigma=sigma)
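# A short illustration (my addition, assuming category_encoders is available):
# `sigma` perturbs the encoded values with Gaussian noise during training
# (fit_transform / transform with y) to reduce overfitting; transforming unseen data
# without y stays deterministic and is unaffected by sigma.
import pandas as pd
from category_encoders import LeaveOneOutEncoder

X_fit = pd.DataFrame({"cat": ["a", "a", "b", "b"]})
y_fit = pd.Series([1.0, 3.0, 2.0, 4.0])

noisy = LeaveOneOutEncoder(cols=["cat"], sigma=0.05).fit_transform(X_fit, y_fit)
exact = LeaveOneOutEncoder(cols=["cat"]).fit_transform(X_fit, y_fit)
print(noisy["cat"].values)  # jittered leave-one-out means
print(exact["cat"].values)  # exact leave-one-out means (sigma left at its default)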
Example #7
def loo_encode(train_df, test_df, column): 
    loo = LeaveOneOutEncoder()
    new_feature = "{}_loo".format(column)
    loo.fit(train_df[column], train_df["target"])
    train_df[new_feature] = loo.transform(train_df[column])
    test_df[new_feature] = loo.transform(test_df[column])
    return new_feature
Example #8
    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame), 'X is not a dataframe! %s' % type(X)
        self.feature_names = X.columns

        if self.cat_features is not None:
            cat_encoder = LeaveOneOutEncoder(cols=self.cat_features)
            cat_encoder.fit(X, y)
            self.transformers.append(cat_encoder)

        if self.normalize:
            scaler = StandardScaler(copy=False)
            scaler.fit(X)
            self.transformers.append(scaler)

        if self.quantile_transform:
            quantile_train = X.copy()
            if self.cat_features is not None:
                quantile_train = cat_encoder.transform(quantile_train)

            if self.quantile_noise:
                r = np.random.RandomState(self.random_state)
                stds = np.std(quantile_train.values, axis=0, keepdims=True)
                noise_std = self.quantile_noise / np.maximum(stds, self.quantile_noise)
                quantile_train += noise_std * r.randn(*quantile_train.shape)

            qt = QuantileTransformer(random_state=self.random_state,
                                     n_quantiles=self.n_quantiles,
                                     output_distribution=self.output_distribution,
                                     copy=False)
            # if self.cat_features is not None:
            #     conti_fs = [f for f in self.feature_names if f not in self.cat_features]
            #     qt = ColumnTransformer(transformers=[("quantile", qt, conti_fs)],
            #                            remainder='passthrough')
            qt.fit(quantile_train)
            self.transformers.append(qt)

        if y is not None and self.y_normalize:
            self.y_mu, self.y_std = y.mean(axis=0), y.std(axis=0)
            print("Normalize y. mean = {}, std = {}".format(self.y_mu, self.y_std))
    def ADULT():
        tr_path = get_file_path(data_set, 'adult')

        if not all(os.path.exists(fname) for fname in (tr_path,)):
            #os.makedirs(path, exist_ok=True)
            train_archive_path = get_file_path(data_set, 'archive')

            gdd.download_file_from_google_drive(
                file_id=dataset_url[data_set]['adult']['file_id'],
                dest_path=train_archive_path,
                unzip=True)

        df = pd.read_csv(tr_path)

        labels = df.pop('<=50K')

        X_train, X_test = df[:26049].copy(), df[26049:].copy()
        y_train, y_test = labels[:26049].copy(), labels[26049:].copy()

        X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2)

        class_to_int = {c: i for i, c in enumerate(y_train.unique())}
        y_train_int = [class_to_int[v] for v in y_train]
        y_val_int = [class_to_int[v] for v in y_val]
        y_test_int = [class_to_int[v] for v in y_test]
        cat_features = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country'
        ]

        cat_encoder = LeaveOneOutEncoder()
        cat_encoder.fit(X_train[cat_features], y_train_int)
        X_train[cat_features] = cat_encoder.transform(X_train[cat_features])
        X_val[cat_features] = cat_encoder.transform(X_val[cat_features])
        X_test[cat_features] = cat_encoder.transform(X_test[cat_features])

        # Node is going to want to have the values as float32 at some points
        X_train = X_train.values.astype('float32')
        X_val = X_val.values.astype('float32')
        X_test = X_test.values.astype('float32')
        y_train = np.array(y_train_int).astype('float32')
        y_val = np.array(y_val_int).astype('float32')
        y_test = np.array(y_test_int).astype('float32')

        return dict(
            X_train=X_train,
            y_train=y_train,
            X_valid=X_val,
            y_valid=y_val,
            X_test=X_test,
            y_test=y_test,
        )
Example #10
 def __init__(self, kind, **kwargs):
     self.kind = kind
     if kind not in ['OHE', 'TE', 'LOOE', 'WOE', 'LE']:
         raise Exception(
             "Encoder type not supported, choose one of ('OHE','TE','LOOE','WOE', 'LE')"
         )
     else:
         if kind == 'OHE':
             self.encoder = OneHotEncoder(**kwargs)
         elif kind == 'TE':
             self.encoder = TargetEncoder(**kwargs)
         elif kind == 'LOOE':
             self.encoder = LeaveOneOutEncoder(**kwargs)
         elif kind == 'WOE':
             self.encoder = WOEEncoder(**kwargs)
         elif kind == 'LE':
             self.encoder = MultiColumnTransformer(LabelEncoder)
Example #11
    def transform(self, sigma=0.3):
        """Power transform continuous and leave-one-out target encode categorical."""
        # Get current feature names just in case
        self._get_features()

        # ------------------------------------------------------------------- #
        #                           Continuous                                #
        # ------------------------------------------------------------------- #
        # Power transformation to make feature distributions closer to Gaussian
        power = PowerTransformer(method="yeo-johnson", standardize=False)
        self.X_[self.continuous_] = power.fit_transform(
            self.X_[self.continuous_])
        self._check_data()

        # ------------------------------------------------------------------- #
        #                          Categorical                                #
        # ------------------------------------------------------------------- #
        # Nominal
        features = self._get_features()
        encoder = LeaveOneOutEncoder(return_df=True)
        encoder.fit(self.X_[self.nominal_], self.y_)
        self.X_[self.nominal_] = encoder.transform(self.X_[self.nominal_])
        self._check_data()

        # Ordinal
        encoder.fit(self.X_[self.ordinal_], self.y_)
        self.X_[self.ordinal_] = encoder.transform(self.X_[self.ordinal_])
        self._check_data()

        # ------------------------------------------------------------------- #
        #                          Standardize                                #
        # ------------------------------------------------------------------- #
        standard = StandardScaler()
        standard.fit(self.X_)
        X = standard.transform(self.X_)
        self.X_ = pd.DataFrame(data=X, columns=self.features_)
        self._check_data()

        return self
    def CLICK():
        valid_size = 100_000
        validation_seed = None
        csv_path = get_file_path(data_set, 'data_csv')
        if not os.path.exists(csv_path):
            download_data(data_set)

        data = pd.read_csv(csv_path, index_col=0)
        X, y = data.drop(columns=['target']), data['target']
        X_train, X_test = X[:-100_000].copy(), X[-100_000:].copy()
        y_train, y_test = y[:-100_000].copy(), y[-100_000:].copy()

        y_train = (y_train.values.reshape(-1) == 1).astype('int64')
        y_test = (y_test.values.reshape(-1) == 1).astype('int64')

        cat_features = [
            'url_hash', 'ad_id', 'advertiser_id', 'query_id', 'keyword_id',
            'title_id', 'description_id', 'user_id'
        ]

        X_train, X_val, y_train, y_val = train_test_split(
            X_train,
            y_train,
            test_size=valid_size,
            random_state=validation_seed)

        cat_encoder = LeaveOneOutEncoder()
        cat_encoder.fit(X_train[cat_features], y_train)
        X_train[cat_features] = cat_encoder.transform(X_train[cat_features])
        X_val[cat_features] = cat_encoder.transform(X_val[cat_features])
        X_test[cat_features] = cat_encoder.transform(X_test[cat_features])
        return dict(X_train=X_train.values.astype('float32'),
                    y_train=y_train,
                    X_valid=X_val.values.astype('float32'),
                    y_valid=y_val,
                    X_test=X_test.values.astype('float32'),
                    y_test=y_test)
Example #13
 def __init__(self, cols=None):
     self.encoder = LeaveOneOut(cols=cols)
def DataCleaner(values_df, labels_df, test_df):

    # Training Set
    df = pd.merge(values_df, labels_df, on='id')

    #Fill missing values with the column mode
    for col in df.columns[df.isna().sum() > 0]:
        mode = df[col].mode()[0]
        df[col].fillna(value=mode, inplace=True)

    #dropping
    to_drop = [
        'funder', 'num_private', 'subvillage', 'region_code', 'recorded_by',
        'source_type', 'waterpoint_type', 'scheme_name', 'payment_type',
        'quantity_group'
    ]
    df.drop(columns=to_drop, inplace=True)
    #targets to 0,1,2
    df['status_group'] = df['status_group'].map({
        'functional': 2,
        'functional needs repair': 1,
        'non functional': 0
    })

    #date column
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    df['year_recorded'] = df['date_recorded'].dt.year
    df['month_recorded'] = df['date_recorded'].dt.month
    df.drop(columns='date_recorded', inplace=True)

    #Test Set
    #TEST SET TRANSFORM
    test_df = pd.read_csv('test_set_values.csv')

    #Fill missing values with the column mode
    for col in test_df.columns[test_df.isna().sum() > 0]:
        mode = test_df[col].mode()[0]
        test_df[col].fillna(value=mode, inplace=True)

    #dropping
    to_drop = [
        'funder', 'num_private', 'subvillage', 'region_code', 'recorded_by',
        'source_type', 'waterpoint_type', 'scheme_name', 'payment_type',
        'quantity_group'
    ]
    test_df.drop(columns=to_drop, inplace=True)

    #date column
    test_df['date_recorded'] = pd.to_datetime(test_df['date_recorded'])
    test_df['year_recorded'] = test_df['date_recorded'].dt.year
    test_df['month_recorded'] = test_df['date_recorded'].dt.month
    test_df.drop(columns='date_recorded', inplace=True)

    #target encode
    target = 'status_group'
    lst_te = [
        'wpt_name', 'basin', 'region', 'district_code', 'lga', 'ward',
        'scheme_management', 'installer', 'source'
    ]

    #encoder = TargetEncoder()
    encoder = LeaveOneOutEncoder()

    te_everything = [
        'wpt_name', 'basin', 'region', 'district_code', 'lga', 'ward',
        'scheme_management', 'installer', 'source', 'extraction_type',
        'extraction_type_group', 'extraction_type_class', 'management',
        'payment', 'water_quality', 'management_group', 'quality_group',
        'quantity', 'source_class', 'waterpoint_type_group'
    ]

    for c in te_everything:
        df[str(c) + '_encoded'] = encoder.fit_transform(
            df[c].values, df[target])  # TRAINING SET
        test_df[str(c) + '_encoded'] = encoder.transform(
            test_df[c].values)  # TEST SET
        df.drop(columns=c, inplace=True)  # TRAINING SET
        test_df.drop(columns=c, inplace=True)  # TEST SET

#     #one hot encode
#     encoder_ohe = OneHotEncoder(sparse=False)

    ohe = [
        'extraction_type', 'extraction_type_group', 'extraction_type_class',
        'management', 'payment', 'water_quality', 'management_group',
        'quality_group', 'quantity', 'source_class', 'waterpoint_type_group'
    ]

    #     #ONE HOT ENCODING TRAINING SET
    #     df_new = df[ohe]
    #     encoder_ohe.fit(df_new)
    #     x = encoder_ohe.transform(df_new)
    #     df1 = pd.DataFrame(x)
    #     df = pd.concat([df, df1], axis=1)
    #     df.drop(columns=ohe, inplace=True)

    #     #ONE HOT ENCODING TEST SET
    #     df_new1 = test_df[ohe]
    #     x1 = encoder_ohe.transform(df_new1)
    #     df2 = pd.DataFrame(x1)
    #     test_df = pd.concat([test_df, df2], axis = 1)
    #     test_df.drop(columns=ohe, inplace=True)

    return df, test_df
#Replacing null values.
df['BP-LOW'].fillna(bp_mean[1], inplace=True)

#Replacing null values with the respective mean value.
df['HB'].fillna(df['HB'].mean(), inplace=True)

df['CREATININE'].fillna(df['CREATININE'].mode()[0], inplace=True)

df['UREA'].fillna(df['UREA'].mean(), inplace=True)

df_drop = df.drop(['SL.', 'PAST MEDICAL HISTORY CODE'], axis=1)

cat_col = df_drop.select_dtypes(exclude=np.number).columns

#Leave one out encoder.
le = LeaveOneOutEncoder()

df_drop[cat_col] = le.fit_transform(X=df_drop[cat_col],
                                    y=df_drop['TOTAL COST TO HOSPITAL '])
#Train test split
X = df_drop.drop('TOTAL COST TO HOSPITAL ', axis=1)
y = df_drop['TOTAL COST TO HOSPITAL ']

#Page Layout:

col1 = st.sidebar
col2, col3 = st.beta_columns((1, 1))

empty = pd.DataFrame(columns=X.columns)

#Manual Input
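# The original snippet is truncated after this comment; the lines below are only a
# hedged sketch (my assumption, not the author's code) of how manual sidebar inputs
# could populate the `empty` frame defined above, using column medians as defaults.
manual_row = {}
for feature in X.columns:
    manual_row[feature] = col1.number_input(feature, value=float(X[feature].median()))
empty = pd.concat([empty, pd.DataFrame([manual_row])], ignore_index=True)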
def fn_logistic(df_train, df_test):

    ############################# Import packages #############################
    import os
    import numpy as np
    import pandas as pd
    # import pickle
    import h2o
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    from h2o.grid.grid_search import H2OGridSearch

    #category encoders
    from category_encoders import LeaveOneOutEncoder

    #needed for fn_computeRatiosOfNumerics()
    from itertools import permutations

    #stops the output of warnings when running models on test data which have different factor levels for categorical
    #data than on the train data. I am aware this is not best practice, but makes the output more readable
    import warnings
    warnings.filterwarnings('ignore')

    ################################ Functions #############################

    def fn_MAE(actuals, predictions):
        return np.round(np.mean(np.abs(predictions - actuals)))

    def fn_RMSE(actuals, predictions):
        return np.round(np.sqrt(np.mean((predictions - actuals)**2)))

    def fn_tosplines(x, ptiles=None):
        x = x.values
        if ptiles is None:
            # hack: remove zeros to avoid issues where lots of values are zero
            x_nonzero = x[x != 0]
            ptiles = np.percentile(x_nonzero, [10, 20, 40, 60, 80, 90])
            ptiles = np.unique(ptiles)
        print(var, ptiles)
        df_ptiles = pd.DataFrame({var: x})
        for idx, ptile in enumerate(ptiles):
            df_ptiles[var + '_' + str(idx)] = np.maximum(0, x - ptiles[idx])
        return df_ptiles, ptiles

    def fn_computeRatiosOfNumerics(df, variables):
        ## Process:
        # 1. Gets passed most important numeric variables
        # 2. Computes all pairwise ratios between each of these i.e
        # - get all permutations of length 2, and divide term 1 by term 2
        # 3. Returns a dataframe containing engineered variables, with appropriately named columns

        pairs = []
        lst_series = []
        for i in range(len(variables) + 1):
            for subset in permutations(variables, i):
                if len(subset) == 2: pairs.extend([subset])
        temp_colnames = []
        for elem in pairs:
            ## create column names
            temp_colname = 'ratio_{}.{}'.format(elem[0], elem[1])
            temp_colnames.append(temp_colname)
            #compute ratio
            try:
                srs_pair_ratio = df[elem[0]] / df[elem[1]]
            except ZeroDivisionError:
                #if denominator is 0, will catch error and assign nan value to that ratio
                srs_pair_ratio = np.nan
            srs_pair_ratio.rename(temp_colname, inplace=True)
            lst_series.append(srs_pair_ratio)
        #create dataframe with appropriate column names
        df_2 = pd.DataFrame(index=df.index, columns=temp_colnames)
        #fill dataframe with series
        for idx, col in enumerate(df_2):
            df_2[col] = lst_series[idx]

        # Seems df division already catches ZeroDivisonError and assigns infinity value when denom = 0 but not numerator
        # In such case, want 0 coefficient.
        # Also want 0 coefficients when both numerator and denom are 0
        # therefore replace all inf and nan values with zeroes
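        # e.g. pd.Series([1.0, 0.0]) / pd.Series([0.0, 0.0]) evaluates to [inf, nan],
        # so the replace() on the next line maps both cases to 0 before modelling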
        df_2.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
        return df_2

    def fn_createInteractions(df, factors):
        ## takes as input a pandas dataframe, and a LIST of column names on which to create interactions
        #create an h2o frame
        h2o_df_temp = h2o.H2OFrame(df[factors],
                                   destination_frame='df_interactions_temp')

        #use H2OFrame.interaction(factors, pairwise, max_factors, min_occurence, destination_frame=None)
        h2o_df_temp = h2o_df_temp.interaction(factors,
                                              pairwise=True,
                                              max_factors=100,
                                              min_occurrence=1)

        return h2o_df_temp.as_data_frame(use_pandas=True)

    ################################ DEFINE VARIABLES #############################

    vars_all = df_train.columns.values
    var_dep = ['target']

    vars_notToUse = ['unique_id']
    vars_ind = [
        var for var in vars_all if var not in (vars_notToUse + var_dep)
    ]

    # find the categorical vars - this includes the hccv
    vars_ind_categorical = list(
        df_train.columns[df_train.dtypes == 'category'])
    # find numeric vars
    vars_ind_numeric = [
        var for var in vars_ind if var not in vars_ind_categorical
    ]

    ## GET HCCV VARS
    ## If want to use some cardinality threshold other than 30, can edit threshold below:
    th_card = 30
    srs_card = df_train[vars_ind_categorical].nunique()
    vars_ind_hccv = srs_card[srs_card > th_card].index.values.tolist(
    )  #stores names of categorical variables with cardinality higher than threshold

    # for convenience store dependent variable as y
    y = df_train[var_dep].values.ravel()

    ########################## Set index for train, val, design, test data #############################
    #### Create folds to separate train data into train, val, design, test
    rng = np.random.RandomState(2020)
    fold = rng.randint(0, 10, df_train.shape[0])
    df_train['fold'] = fold

    #get indices for each subset
    idx_train = df_train['fold'].isin(range(8))
    idx_val = df_train['fold'].isin([7, 8])
    idx_design = df_train['fold'].isin(range(9))

    #drop fold column as no longer needed (and want to maintain similar structure to df_test)
    df_train.drop(columns='fold', inplace=True)

    ############################## **Start and connect the H2O JVM** #############################
    # - Load the previous models in order to identify most important variables. To save time (and given that function can only take as input the train and test data), relevant code has been commented out but left in so that you may see my approach. I have instead hard-coded numeric and categorical variables I have found to be most important.

    # *Models are taking very long to run so have pre-loaded them below.*
    # - uncomment the below code to load the models but note that they must be in the PData directory

    # ### Connect to H2O cluster
    h2o.init(
        port=54321
    )  # line left uncommented as I make use of H2O functions throughout the script

    # ### LOAD THE MODELS

    # # GLM basic, no interactions, no mean imputation for missing level values in test
    # # model name: GLM_model_basic
    # path_glm_basic = dirPData + 'GLM_model_basic'

    # # GLM basic, no interactions, WITH mean imputation for missing level values in test
    # # model name: GLM_model_basic_meanImpute
    # path_glm_basic_meanImpute = dirPData + 'GLM_model_basic_meanImpute'

    # # GLM numerical divisions, no interactions, WITH mean imputation for missing level values in test
    # # model name: GLM_model_numeric_meanImpute
    # path_glm_numeric_meanImpute = dirPData + 'GLM_model_numeric_meanImpute'

    # # GLM numerical divisions, with interactions, WITH mean imputation for missing level values in test
    # # model name: GLM_model_numeric_interactions_meanImpute

    # glm_basic = h2o.load_model(path = path_glm_basic)
    # glm_basic_meanImpute = h2o.load_model(path = path_glm_basic_meanImpute)
    # glm_numeric_meanImpute = h2o.load_model(path = path_glm_numeric_meanImpute)

    ############################## DEAL WITH MISSINGS #############################

    #### IDENTIFY MISSINGS

    ## Check for missing numerics which have been replaced with -99 (placeholder, really it is missing)
    #get percentage of missing values for each feature
    srs_missing = pd.DataFrame(df_train.loc[:, :] == -99).sum(
        axis=0) / len(df_train)
    # print(srs_missing[srs_missing!=0])  #show which numerics have 'missing' placeholder values, and their percentage of missing values

    #get list of variables which have more than x% missing values
    #arbitrarily setting threshold to 50% but could tune this parameter if time permits
    missings_th = 0.5
    many_missings = [
        var for var in df_train.columns.values
        if srs_missing[var] >= missings_th
    ]

    ## DO NOT USE VARIABLES WITH MORE THAN x% MISSINGS
    #add vars from many_missings to vars_notToUse, remove them from list of numeric variables
    vars_notToUse.extend(many_missings)
    #turn into a set and back into a list - deals with issue of duplicates when running the code multiple times
    vars_notToUse = list(set(vars_notToUse))

    #remove variables in many_missings from var_ind_numeric
    vars_ind_numeric = [
        var for var in vars_ind_numeric if var not in vars_notToUse
    ]
    # print([var for var in vars_ind_numeric if var in vars_notToUse])  #double check they've been removed: printed list should be empty

    ### MEAN-IMPUTE MISSINGS

    # list of variables to impute
    vars_toImpute = [
        var for var in srs_missing[srs_missing > 0].index.tolist()
        if var not in many_missings
    ]

    #get subset dataframe (only cols which are in variables_toImpute)
    #get only values != -99 -> this will mean that the missings will be returned as NaN. Can then use fillna
    df_temp = df_train[vars_toImpute][
        df_train[vars_toImpute] != -99].copy()  #make a working copy

    #use fillna: computing the mean of each column and filling NaNs with this mean value.
    df_temp.fillna(df_temp.mean(), inplace=True)

    df_train[vars_toImpute] = df_temp

    ############################## SPLINE HIGH CARDINALITY NUMERICS #############################
    ## Attempt at capturing non-linear relationships in model

    ### Spline numeric variables with cardinality higher than 8
    # define variables to spline
    vars_ind_tospline = df_train[vars_ind_numeric].columns[(
        df_train[vars_ind_numeric].nunique() > 8)].tolist()
    #Find the percentiles on train data only, then apply same percentiles to both train and test data, even if test data distribution is very different.
    #update df_train, df_test
    for var in vars_ind_tospline:
        # percentiles come from the train data only and are re-used for the test data
        df_ptiles_train, ptiles = fn_tosplines(df_train[var])
        df_ptiles_test, _ = fn_tosplines(df_test[var], ptiles=ptiles)
        df_train.drop(columns=[var], inplace=True)
        df_test.drop(columns=[var], inplace=True)
        vars_ind_numeric.remove(var)
        df_train = pd.concat([df_train, df_ptiles_train], axis=1, sort=False)
        df_test = pd.concat([df_test, df_ptiles_test], axis=1, sort=False)
        vars_ind_numeric.extend(df_ptiles_train.columns.tolist())

    ############################## DEAL WITH HCCVs #############################
    # - note that any modifications made to train data must also be made to test data (engineered columns etc)

    ### HCCV ENCODING USING category_encoders

    enc = LeaveOneOutEncoder(cols=vars_ind_hccv, sigma=0.3)
    enc.fit(df_train[idx_design], y[idx_design])
    df_train = enc.transform(df_train)  #encode hccvs in train data
    # df_train[vars_ind_hccv].head()

    df_test[
        'target'] = np.nan  #add NaN target column to test dataset in order for it to have same shape as df_train
    df_test = enc.transform(df_test)  #encode hccvs in test data
    df_test.drop(columns='target',
                 inplace=True)  #drop target column from df_test

    ############################## INTERACTIONS #############################
    # - same applies here, whatever interactions are in train data must also be in test data

    ### DEFINE FIVE MOST IMPORTANT CATEGORICAL VARS

    ### NOTE: The below interactions are created based on the largest
    ### coefficients in a previously-run model. The code below identifies
    ### those coefficients by loading the model and manipulating the data.
    ### However, as assignment requires only input to be train and test
    ### datasets, the most important variables have been hardcoded in.

    ### Inspect coefficients from basic model with no interactions
    ## Plot standardised coefficients
    # glm_basic.std_coef_plot(num_of_features=10)

    ## Get list of 5 most important variables via varimp()
    # note that glm_basic.varimp() contains some onehots created by H2o on the fly when building the model, and thus some aren't actually present in the train/test frames
    # therefore can't refer to them before running a model, and we need to refer to the original variables before h2o onehots them
    # we extract these by:
    # - getting only the name of the variable and not its values i.e. var[0] for var in glm_basic.varimp()
    # - splitting on onehot delimiter '.' and keeping only first part of result. This is name of original variable

    # # Get list of FIVE most important categorical variables
    # vars_mostImp_cat=[]
    # for var in glm_basic.varimp():
    #     orig_var = var[0].split('.')[0]
    #     if orig_var in vars_ind_categorical and orig_var not in vars_mostImp_cat:  #check if numeric
    #         #add to list of important categorical vars only if not already in list
    #         vars_mostImp_cat.append(orig_var)
    #     if len(vars_mostImp_cat)>= 5:
    #         break

    vars_mostImp_cat = ['f09', 'f03', 'f07', 'f27', 'e11'
                        ]  #comment this line if uncommenting the above block

    #Get dataframe of interactions all pairwise interactions between five most important categorical variables
    df_train_interactions = fn_createInteractions(df_train, vars_mostImp_cat)
    df_test_interactions = fn_createInteractions(df_test, vars_mostImp_cat)

    #append new columns to df_train and df_test
    df_train[df_train_interactions.columns.values] = df_train_interactions
    df_test[df_test_interactions.columns.values] = df_test_interactions

    # include new numeric variables in vars_ind_numeric
    vars_ind_categorical.extend(df_train_interactions.columns.tolist())

    ############################## OTHER FEATURES #############################
    # DIVISION OF NUMERICS
    # - must also add engineered columns to test data

    ### DEFINE THREE MOST IMPORTANT NUMERICAL VARS

    ### NOTE: The below interactions are created based on the largest
    ### coefficients in a previously-run model. The code below identifies
    ### those coefficients by loading the model and manipulating the data.
    ### However, as assignment requires only input to be train and test
    ### datasets, the most important variables have been hardcoded in.

    # # plot largest standardised coefficients
    # # glm_basic.std_coef_plot(num_of_features=10)
    # # Get list of THREE most important variables
    # vars_mostImp_numeric=[]
    # for var in glm_basic.varimp():
    #     orig_var = var[0].split('.')[0]
    #     if orig_var in vars_ind_numeric and orig_var not in vars_mostImp_numeric:  #check if numeric
    #         #add to list of important numeric vars
    #         vars_mostImp_numeric.append(orig_var)
    #     if len(vars_mostImp_numeric)>= 3:
    #         break

    vars_mostImp_numeric = [
        'f11', 'f11_0', 'f11_1'
    ]  #comment this line if uncommenting the above block

    ### COMPUTE RATIO COLUMNS FOR BOTH DATASETS
    df_temp_train = fn_computeRatiosOfNumerics(df_train, vars_mostImp_numeric)
    df_temp_test = fn_computeRatiosOfNumerics(df_test, vars_mostImp_numeric)

    #append new columns to df_train and df_test
    df_train[df_temp_train.columns.values] = df_temp_train
    df_test[df_temp_test.columns.values] = df_temp_test

    # include new numeric variables in vars_ind_numeric
    vars_ind_numeric.extend(df_temp_train.columns.tolist())

    ############################## LOAD DATA INTO H2O JVM #############################

    ### START JVM
    # h2o.init(port=54321)  #commented as already connected to H2O cluster
    # h2o.connect(port=54321)

    ### Remove all data previously loaded (if any) in JVM as no longer need it
    for key in h2o.ls()['key']:
        h2o.remove(key)

    #### Create H2OFrames in H2O cluster for df_train, df_test
    h2o_df_train = h2o.H2OFrame(df_train[vars_ind_numeric +
                                         vars_ind_categorical + var_dep],
                                destination_frame='df_train')
    h2o_df_test = h2o.H2OFrame(df_test[vars_ind_numeric +
                                       vars_ind_categorical],
                               destination_frame='df_test')

    ### Change target to enum type as we are building a classification model
    # h2o_df_train[var_dep].types
    h2o_df_train[var_dep] = h2o_df_train[var_dep].asfactor()
    # h2o_df_train[var_dep].types

    ############################## DEFINE THE FEATURES TO BE USED #############################

    features = vars_ind_numeric + vars_ind_categorical

    ###USE BOOLEAN MASKS TO INDEX TRAIN,VAL,DESIGN DATA
    idx_h2o_train = h2o.H2OFrame(idx_train.astype('int').values)
    idx_h2o_val = h2o.H2OFrame(idx_val.astype('int').values)
    idx_h2o_design = h2o.H2OFrame(idx_design.astype('int').values)

    ############################# MODELLING #############################

    ### H2O GRIDSEARCH - hyper-parameter tuning
    # ## Will use random grid search rather than cartesian to save some time

    ### NOTE: The below code is commented out as it takes approximately 1h
    ### to run. After running, the best model was selected according to AUC
    ### and its corresponding hyper-parameters were recorded. These are
    ### hard-coded later on in a single GLM estimation, in order to estimate
    ### only the best model and save on computational time/resources.

    # ## GLM hyper parameters

    # lambda_opts = [16. * 2.**-i for i in np.arange(15)]
    # alpha_opts = [0, 0.5, 0.99]
    # glm_params = {
    #     'alpha': alpha_opts,
    #     'lambda': lambda_opts
    # }
    # search_criteria = {
    #     'strategy': 'RandomDiscrete',
    #     'max_runtime_secs': 3600
    # }

    # ## Train and validate a random grid of GLMs
    # ##According to H2O documentation, must use logit link as we are estimating a binomial classification model.
    # glm_grid = H2OGridSearch(
    #     model=H2OGeneralizedLinearEstimator(
    #         family='binomial',
    #         link='logit',
    #         nfolds=10,
    #         seed=2020,
    #         keep_cross_validation_models=False,
    #         keep_cross_validation_predictions=False,
    #         keep_cross_validation_fold_assignment=False,
    #         missing_values_handling='mean_imputation'
    #     )
    #     , grid_id='glm_grid'
    #     , hyper_params=glm_params
    #     , search_criteria=search_criteria
    # #     , parallelism = 0 #adaptive parallelism, decided by H2O
    # )

    # glm_grid.train(x=features,
    #                y='target',
    #                training_frame=h2o_df_train[idx_h2o_design, :],
    #                seed=2020)

    # ## Get the grid results, sorted by validation AUC
    # glm_grid_performance = glm_grid.get_grid(sort_by='auc', decreasing=True)
    # glm_grid_performance

    ############################### best model results ###########################
    # #     alpha         lambda          model_ids                 auc        # #
    # #0      [0.0]  [9.765625E-4]  glm_grid_model_38  0.8595786171889577      # #
    ##############################################################################

    ### ESTIMATE GLM via H2O, using hyper-params found through grid-search

    # We set family to binomial as we are running a classification GLM model (with only two classes).
    # According to H2O documentation, must use logit link as we are estimating a binomial classification model.
    # missing_values_handling -> MeanImputation: deals with new sample having categorical levels not seen in training. Replaces the unseen value with the most frequent level present in TRAINING SET.
    # keep_cross_validation_* -> set to false to save some memory in the H2O cluster.
    model = H2OGeneralizedLinearEstimator(
        alpha=0.00,
        family='binomial',
        link='logit',
        lambda_=9.765625E-4,
        nfolds=10,
        seed=2020,
        keep_cross_validation_models=False,
        keep_cross_validation_predictions=False,
        keep_cross_validation_fold_assignment=False,
        missing_values_handling='mean_imputation')
    print('Estimating GLM model...'
          )  #notification of progress when running function
    model.train(x=features,
                y='target',
                training_frame=h2o_df_train[idx_h2o_design, :])

    ### NOTE: This model is run using hard-coded values of alpha and lambda.
    ### These are the ones corresponding to the best model found via grid
    ### search above. Computation (wall) time: 3min 3s

    ### Save the model
    dirPData = '../PData/'
    dirPOutput = '../POutput/'
    best_glm = model
    best_glm_path = h2o.save_model(model=best_glm, path=dirPData, force=True)
    print(best_glm_path)

    ## MAKE PREDICTIONS ON TEST DATASET
    temp_preds = best_glm.predict(h2o_df_test)

    ### Export predictions to kaggle-required format
    df_test['Predicted'] = np.round(temp_preds[2].as_data_frame(), 5)
    df_preds = df_test[['unique_id', 'Predicted']].copy()
    df_test[['unique_id',
             'Predicted']].to_csv(dirPOutput + 'best_glm_250k.csv',
                                  index=False)

    #### KAGGLE AUCROC PUBLIC LEADERBOARD SCORE: 0.80162

    ### SHUT DOWN H2O CLUSTER
    # h2o.cluster().shutdown()  #not shutting down cluster as not sure if this will cause issues when returning the handle to the h2o object

    ############################### END OF FUNCTION, RETURN ###########################
    # - trained H2OGeneralizedLinearEstimator object
    # - Test data fed to object when making predictions: handle to H2OFrame object
    # - Kaggle public leaderboard score, hardcoded as 3 dp
    return [best_glm, h2o_df_test, 0.802]
Example #17
#pd.get_dummies(X[["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"]])

X = X.drop(["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"], axis=1) \
        .join(pd.get_dummies(X[["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"]]))


from sklearn.feature_extraction import FeatureHasher

#h = FeatureHasher(input_type='string', n_features=1000)
#X[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].values
#hash_X = h.fit_transform(X[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].values)
#hash_X = pd.DataFrame(hash_X.toarray())

from category_encoders import LeaveOneOutEncoder
loo_encoder = LeaveOneOutEncoder(cols=["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"])
loo_X = loo_encoder.fit_transform(X[["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"]], y)
X = X.drop(["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"], axis=1).join(loo_X)

X.ord_1.replace(to_replace = ['Novice', 'Contributor','Expert', 'Master', 'Grandmaster'],
                         value = [0, 1, 2, 3, 4], inplace = True)

X.ord_2.replace(to_replace = ['Freezing', 'Cold', 'Warm', 'Hot','Boiling Hot', 'Lava Hot'],
                         value = [0, 1, 2, 3, 4, 5], inplace = True)


from sklearn.preprocessing import LabelEncoder
for i in ["ord_3", "ord_4"]:
    le = LabelEncoder()
    X[i] = le.fit_transform(X[i])
Example #18
def doCleanupEncode(X,
                    y=None,
                    cat=None,
                    oh=None,
                    binary=None,
                    loo=None,
                    woe=None,
                    lp_cols=None,
                    NoData=True):
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, cat + oh, [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator').fit(X)
            X = ec.fit_transform(X)
            # dropping these columns did not help performance
            # for o in oh:
            #    stem = o.split("_")[1]
            #    d1 = "L_" + stem + "_-1"
            #    d2 = "L_" + stem + "_nan"
            #    print("DROPPING ", d1, " ", d2, "\n")
            #    X.drop(d1, axis=1, errors='ignore', inplace=True)
            #    X.drop(d2, axis=1, errors='ignore', inplace=True)
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
            X = enc.transform(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
            X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for w in woe:
            X[w] = X[w].fillna('NoData')

        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for l in loo:
            X[l] = X[l].fillna('NoData')

        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    # Cast all to int64
    # X = X.astype("int64")

    if lp_cols is not None:
        # drop least predictive
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
Example #19
    # Feature hashing of word features
    def hash_features(word_list, m):
        output = [0] * m
        for word in word_list:
            index = hash_fcn(word) % m
            output[index] += 1
        return output

    # Signed feature hashing
    def hash_features(word_list, m):
        output = [0] * m
        for word in word_list:
            index = hash_fcn(word) % m
            sign_bit = sign_hash(word) % 2
            if sign_bit == 0:
                output[index] -= 1
            else:
                output[index] += 1
        return output

    h = FeatureHasher(n_features=m, input_type="string")
    f = h.transform(df["feat"])

    enc = TargetEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)

    enc = LeaveOneOutEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)

    enc = WOEEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)
Example #20
class LeaveOneOutEncoder():
    """Maps each categorical value to one column using LeaveOneOut encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'leave_one_out'

    def __init__(self, cols=None):
        self.encoder = LeaveOneOut(cols=cols)

    def fit(self, X, features, y):
        """Fits encoder to data table.
        returns self
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.
        returns encoded matrix (dataframe)
        """
        X_new = self.encoder.transform(X)
        X_new.columns = self._rename_columns(self.features)
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe)
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        X_new = self.encoder.fit_transform(X, y)
        X_new.columns = self._rename_columns(self.features)
        return X_new

    def get_mapping(self, category):
        """Gets the mapping for the LeaveOneOut encoder. Only takes strings of the column name, not the index number.
        returns mapping (dict)
        """
        return self.encoder.mapping[category]

    def encode_features_list(self, X, features):
        feature_list = []
        for f in features:
            if f.get_name() in self.encoder.cols:
                f = ft.Feature([f],
                               primitive=LeaveOneOutEnc(self, f.get_name()))
            feature_list.append(f)
        return feature_list

    def _rename_columns(self, features):
        feature_names = []
        for feature in features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        return feature_names

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
# After imputing missing data, we used a quantile transformer. This transformation
# method is robust to outliers and transforms variables so that they have a normal
# distribution and a similar range.

numeric_transformer = Pipeline(steps=[(
    'imputer', SimpleImputer(strategy='median')
), ('scaler',
    QuantileTransformer(output_distribution='normal', random_state=0))])

# For categorical features we imputed missing data with the most frequent value of the column.
# After that we encoded these variables using the Bayesian LeaveOneOutEncoder. We chose this
# encoder because our categorical variables were of high cardinality.

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('leaveoneout', LeaveOneOutEncoder(return_df=False))])

# for Indicator variables we imputed missing data with 0, as they only have values
# 0 and 1 (1 for the event occurring)
indicator_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])

# we used a ColumnTransformer to transform all the data based on variable type
preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer,
    numeric_features), ('cat', categorical_transformer, categorical_features
                        ), ('ind', indicator_transformer, indicator_features)])

######################## SAMPLING #########################################################

# Our data was severely imbalanced, with a ratio of about 1:200 for the event happening.
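# A minimal sketch (my assumption; the original snippet is cut off here) of one way to
# handle the imbalance: random undersampling of the majority class inside an
# imbalanced-learn pipeline, reusing the `preprocessor` defined above. `X_train`,
# `y_train` and the final classifier are placeholders, not the project's actual model.
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression

clf = ImbPipeline(steps=[
    ('preprocess', preprocessor),                                # defined above
    ('undersample', RandomUnderSampler(sampling_strategy=0.1, random_state=0)),
    ('model', LogisticRegression(max_iter=1000)),
])
# clf.fit(X_train, y_train)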
Example #23
def clean_train_data(data):
    """
    Initial training data processing
    """

    train_y = data.iloc[:, -1]
    train_y = train_y.reset_index(drop=True)
    train_X = data.iloc[:, :-1]

    train_X = process_features(train_X)

    rep_points = train_X[["X", "Y", "Country"]].drop_duplicates()
    # coords = train_X[["X","Y"]].drop_duplicates().as_matrix(columns=['Y', 'X'])
    # dbscan.fit_predict(np.radians(coords))
    #
    # cluster_labels = dbscan.labels_
    # num_clusters = len(set(cluster_labels))
    # print('Number of clusters: {}'.format(num_clusters))
    # clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
    # centermost_points = clusters.map(get_centermost_point)
    # lats, lons = zip(*centermost_points)
    # rep_points = pd.DataFrame({'lon':lons, 'lat':lats})
    # # dist_from_clusters = pd.DataFrame(createDistColumns(train_X,rep_points))
    # # print("Extra columns:" + str(np.shape(dist_from_clusters)))
    #
    #
    # #train_X = train_X.join(new_column)
    #
    # dists = dist_from_origin(train_X)
    # #
    # # #print(train_X)
    # # # train_X = train_X[["X","Y","Z"]]
    # # # train_X = train_X.reset_index(drop=True)
    # #
    # # new_column = dbscan_predict(dbscan,train_X[["X","Y"]])
    # # new_column = pd.DataFrame(new_column, columns = ["Cluster"])
    train_X = train_X.drop("X", 1)
    train_X = train_X.drop("Y", 1)
    # # train_X = train_X.join(new_column)
    # #train_X = new_column
    # #train_X = train_X.join(dist_from_clusters)
    # train_X = train_X.join(dists)
    #print(train_X)
    #train_X = dist_from_clusters

    # testing_file = "Testing.csv"
    # dist_from_clusters.to_csv(testing_file, index=False)

    # data = data.drop('X', 1)

    # To just remove outliers - Note Terrible performance doing it this way
    # data = reject_outliers(data,"Income in EUR")

    # To winsorise incomes
    # data["Income in EUR"] = data["Income in EUR"].apply(using_mstats)

    # To winsorise ages
    # data["Age"] = data["Age"].apply(using_mstats)

    # To winsorise size of city
    # data["Size of City"] = data["Size of City"].apply(using_mstats)

    #ohe = OneHotEncoder(categories='auto', handle_unknown = 'ignore')

    # data["Age"] = ageBinner.fit_transform(data[["Age"]])
    #
    # data["Body Height [cm]"] = heightBinner.fit_transform(data[["Body Height [cm]"]])
    #
    # data["Size of City"] = cityBinner.fit_transform(data[["Size of City"]])
    #
    # feature_arr = ohe.fit_transform(data[["Hair Color","Size of City","Body Height [cm]",
    #     "Wears Glasses","University Degree","Country","Gender","Year of Record","Age"]]).toarray()
    #
    # names = ohe.get_feature_names(["Hair Color","Size of City","Body Height [cm]",
    #     "Wears Glasses","University Degree","Country","Gender","Year of Record","Age"])

    # df2 = pd.DataFrame(feature_arr, columns = names)
    #
    # data2 = pd.concat([df2.reset_index(drop=True),
    # data[["Income in EUR"]].reset_index(drop=True)], axis=1)

    # continuous version
    #ohe = OneHotEncoder(categories='auto', handle_unknown = 'ignore')

    ohe = LeaveOneOutEncoder(cols=[
        "Hair Color", "Wears Glasses", "University Degree", "Gender",
        "Country", "Profession"
    ])
    #ohe = LeaveOneOutEncoder(cols = ["Cluster"])

    # Stop blowing up my processing :(
    #data = data.drop('Profession', 1)

    ohe.fit(train_X, train_y)
    #data2 = pd.concat([ohe.transform(train_X,train_y).reset_index(drop=True), train_y.reset_index(drop=True)],axis = 1)
    data2 = pd.concat([
        ohe.transform(train_X, train_y).reset_index(drop=True),
        train_y.reset_index(drop=True)
    ],
                      axis=1)

    # feature_arr = ohe.fit_transform(data[["Hair Color",
    #     "Wears Glasses","University Degree","Country","Gender"]]).toarray()
    #
    # names = ohe.get_feature_names(["Hair Color",
    #     "Wears Glasses","University Degree","Country","Gender"])
    #
    # df2 = pd.DataFrame(feature_arr, columns = names)
    # data2 = pd.concat([df2.reset_index(drop=True),
    # data[["Year of Record","Age","Size of City","Body Height [cm]","Income in EUR"]].reset_index(drop=True)], axis=1)
    #
    # #
    # testing_file = "Testing2.csv"
    # data2.to_csv(testing_file, index=False)
    # testing_file = "Testing2.csv"
    # train_X.to_csv(testing_file, index=False)
    return (data2, ohe, rep_points)
Example #24
        y_test = (y_test.values.reshape(-1) == 1).astype('int64')

        cat_features = [
            'url_hash', 'ad_id', 'advertiser_id', 'query_id', 'keyword_id',
            'title_id', 'description_id', 'user_id'
        ]

        X_train, X_val, y_train, y_val = train_test_split(
            X_train,
            y_train,
            test_size=valid_size,
            random_state=validation_seed)

        num_features = X_train.shape[1]
        num_classes = len(set(y_train))
        cat_encoder = LeaveOneOutEncoder()
        cat_encoder.fit(X_train[cat_features], y_train)
        X_train[cat_features] = cat_encoder.transform(X_train[cat_features])
        X_val[cat_features] = cat_encoder.transform(X_val[cat_features])
        X_test[cat_features] = cat_encoder.transform(X_test[cat_features])
        data_dict = dict(X_train=X_train.values.astype('float32'),
                         y_train=y_train,
                         X_valid=X_val.values.astype('float32'),
                         y_valid=y_val,
                         X_test=X_test.values.astype('float32'),
                         y_test=y_test,
                         num_features=num_features,
                         num_classes=num_classes)
        #print(f"====== fetch_CLICK:\tX_train={X_train.shape}\tX_valid={X_valid.shape}\tX_test={X_test.shape}")
        with open(pkl_path, "wb") as fp:
            pickle.dump(data_dict, fp)
class StackingBaseline(object):
    def __init__(self, *, path):
        self.__path = path
        self.__application_train = None
        self.__application_test = None
        self.__sample_submission = None

        # data prepare
        self.__application_train_feature = None
        self.__application_train_label = None
        self.__application_test_feature = None

        self.__categorical_columns = None
        self.__numeric_columns = None

        # numeric handle
        # categorical handle
        self.__encoder = None

        # model fit
        self.__lr = None
        self.__ef = None
        self.__rf = None
        self.__gb = None
        self.__xgb = None
        self.__sclf = None

    def data_prepare(self):
        self.__application_train = pd.read_csv(
            os.path.join(self.__path, "application_train.csv"))
        self.__application_test = pd.read_csv(
            os.path.join(self.__path, "application_test.csv"))
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__path, "sample_submission.csv"))

        self.__application_train = self.__application_train.drop("SK_ID_CURR",
                                                                 axis=1)
        self.__application_test = self.__application_test.drop("SK_ID_CURR",
                                                               axis=1)

        self.__application_train_feature = self.__application_train[[
            i for i in self.__application_train.columns if i != "TARGET"
        ]]
        self.__application_train_label = self.__application_train["TARGET"]
        self.__application_test_feature = self.__application_test

        self.__categorical_columns = self.__application_train_feature.select_dtypes(
            include=["object"]).columns.tolist()
        self.__numeric_columns = [
            i for i in self.__application_train_feature.columns
            if i not in self.__categorical_columns
        ]

    def numeric_handle(self):
        self.__application_train_feature[
            self.__numeric_columns] = self.__application_train_feature[
                self.__numeric_columns].fillna(-999.0)
        self.__application_test_feature[
            self.__numeric_columns] = self.__application_test_feature[
                self.__numeric_columns].fillna(-999.0)

    def categorical_handle(self):
        self.__application_train_feature[self.__categorical_columns] = (
            self.__application_train_feature[
                self.__categorical_columns].fillna("missing"))

        self.__encoder = LeaveOneOutEncoder()
        self.__encoder.fit(
            self.__application_train_feature[self.__categorical_columns],
            self.__application_train_label)
        self.__application_train_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__application_train_feature[self.__categorical_columns])
        self.__application_test_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__application_test_feature[self.__categorical_columns])

    def model_fit(self):
        self.__ef = ExtraTreesClassifier(n_jobs=-1)
        self.__rf = RandomForestClassifier(n_jobs=-1)
        self.__lr = LogisticRegression()
        self.__gb = GradientBoostingClassifier()
        self.__xgb = XGBClassifier(n_jobs=-1, missing=-999.0)
        self.__sclf = StackingCVClassifier(
            classifiers=[self.__ef, self.__rf, self.__gb, self.__xgb],
            meta_classifier=self.__lr,
            use_probas=True,
            cv=3)
        self.__sclf.fit(self.__application_train_feature.values,
                        self.__application_train_label.values)

    def model_predict(self):
        self.__sample_submission["TARGET"] = np.clip(
            self.__sclf.predict_proba(
                self.__application_test_feature.values)[:, 1], 0, 1)
        self.__sample_submission.to_csv(
            '/Users/David/Desktop/0.Home default risk/submission/stack_baseline',
            index=False)