def TargetEncode(data, target):
    """Target-encode every categorical (object-dtype) column of ``data``.

    Fits a ``TargetEncoder`` over all object columns against
    ``data[target]``, persists the fitted encoder to
    ``targetencodemodel.sav`` via pickle, and returns ``data`` with the
    categorical columns replaced by their encoded values (the input frame
    is modified in place).

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both the feature columns and the target column.
    target : str
        Name of the target column used to fit the encoder.

    Returns
    -------
    pd.DataFrame
        The same frame with its object columns target-encoded.
    """
    # Select all categorical columns (compute the list once; the original
    # built it twice).
    data_to_encode = data.select_dtypes(include=['object'])
    cols = list(data_to_encode.columns)
    print('Data to be encoded: ')
    print(len(cols))
    print('\n'.join(cols))
    print('\n')
    print('\n')
    # Fit a single encoder over all categorical columns at once.
    model = TargetEncoder().fit(X=data[cols], y=data[target])
    # File where the target encoding model is saved.
    filename = "targetencodemodel.sav"
    # Context manager guarantees the handle is closed even if pickling
    # fails — the original leaked the file object on error.
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print("Model saved in ", filename)
    print("\n")
    print("\n")
    data[cols] = model.transform(X=data[cols])
    # Return encoded data.
    return data
Beispiel #2
0
def target_encoding(train,
                    target,
                    test=None,
                    feat_to_encode=None,
                    smooth=0.2,
                    random_state=9527):
    """Out-of-fold target encoding of ``train`` (and optionally ``test``).

    Encodes each fold of ``train`` with an encoder fitted on the other
    folds (avoiding target leakage), then refits on the full training data
    to transform ``test``. Note: ``train`` is sorted in place and the
    target column is popped from the caller's frame.

    Parameters
    ----------
    train : pd.DataFrame — features plus the target column.
    target : str — name of the target column inside ``train``.
    test : pd.DataFrame or None — optional frame to transform.
    feat_to_encode : list[str] or None — columns to encode (default: all).
    smooth : float — TargetEncoder smoothing strength.
    random_state : int — fold-shuffling seed.

    Returns
    -------
    (train_encoded, test_encoded_or_None, feature_names, target_series)
    """
    print('Target encoding...')
    train.sort_index(inplace=True)
    target = train.pop(target)
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    smoothing = smooth
    # Collect the per-fold encodings and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0.
    oof_parts = []
    for tr_idx, oof_idx in StratifiedKFold(n_splits=5,
                                           random_state=random_state,
                                           shuffle=True).split(train, target):
        ce_target_encoder = TargetEncoder(cols=feat_to_encode,
                                          smoothing=smoothing)
        ce_target_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        oof_parts.append(ce_target_encoder.transform(train.iloc[oof_idx, :]))
    # Default concat keeps each part's original index, matching the old
    # append(..., ignore_index=False) behavior.
    oof = pd.concat(oof_parts)
    # Refit on the full training data for transforming the test set.
    ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train, target)
    train = oof.sort_index()
    if test is not None:
        test = ce_target_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target
    def data_prepare(self):
        """Load the train/test feature frames, split off the label, and
        target-encode the object-dtype columns of the training features.

        Side effects: populates self.__train, self.__test,
        self.__train_label, self.__train_feature, self.__test_feature,
        self.__categorical_columns and self.__encoder; appends
        zero-variance column names to self.__remove_feature.
        """
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"],
                                                 axis=1)
        # Restrict test features to the training columns (and their order).
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # This approach cannot raise an error; an alternative would be to
        # drop features whose pairwise similarity is 1.
        # drop duplicate column
        # self.__train_feature = self.__train_feature.T.drop_duplicates().T
        # self.__test_feature = self.__test_feature[self.__train_feature.columns.tolist()]

        # encoder
        self.__categorical_columns = (self.__train_feature.select_dtypes(
            include="object").columns.tolist())
        # Fill missing categories before fitting the target encoder.
        self.__train_feature[self.__categorical_columns] = (
            self.__train_feature[self.__categorical_columns].fillna("missing"))
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        # Record constant (zero-variance) columns for later removal.
        for col in self.__train_feature.columns.tolist():
            if self.__train_feature[col].std() == 0.:
                print(col)
                self.__remove_feature.append(col)
    def data_prepare(self):
        """Load the selected features plus first-layer stacking/GP features,
        target-encode the categorical columns, and concatenate the stacking
        features onto the training matrix.

        NOTE(review): self.__train_feature_gp is loaded but never
        concatenated below — confirm whether that is intentional.
        """
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__train_feature_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__train_label = self.__train["TARGET"]
        # Drop the label and every SK_ID* identifier column.
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        # Column-wise concat of base features with the stacking features.
        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network
        ],
                                         axis=1)
    def data_prepare(self):
        """Load selected train/test feature frames, target-encode the
        categorical columns of both with a train-fitted encoder, then free
        the intermediates."""
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))

        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        # Drop the label and every SK_ID* identifier column.
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder = TargetEncoder()
        # Fit on train only; apply the same mapping to train and test.
        self.__encoder.fit(
            self.__train_feature.loc[:, self.__categorical_columns],
            self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns]))
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns]))

        # Free raw frames and helpers that are no longer needed.
        del self.__train, self.__test, self.__categorical_columns, self.__encoder
        gc.collect()
Beispiel #6
0
def prepare_df(df, columns, target):
    ''' Prepares a pd.DataFrame by turning missing scikit-learn preprocessors into "None" strings and
            performs target encoding at the input columns.

    Parameters:
    -----------
    df: pd.DataFrame
        Contains a pd.DataFrame with the generated meta-data.
    columns: list
        Contains a list with the columns that contain scikit-learn estimators and scikit-learn preprocessors.
    target: str
        Contains a string that represents the name of the column that is the target of the dataset.
    Returns:
    --------
    pd.DataFrame
        Contains adjusted pd.DataFrame.
    '''
    # Work on a deep copy so the caller's frame is untouched.
    df = deepcopy(df)
    df = df.reset_index(drop=True)
    df = df.drop_duplicates()
    y = df[target]

    # Normalise missing preprocessor entries to "None" strings.
    # (apply the helper directly — the lambda wrapper was redundant)
    for column in ['component_1', 'component_2', 'component_3']:
        df[column] = df[column].apply(nan_to_none)

    for column in columns:
        df[column] = df[column].astype('category')
        # Integer category codes alongside the raw column.
        df['{}_codes'.format(column)] = df[column].cat.codes
        # A fresh encoder per column, fitted and applied on this frame.
        enc = TargetEncoder(cols=[column])
        df['{}_encoded'.format(column)] = enc.fit_transform(df[column], y)

    return df
Beispiel #7
0
    def data_prepare(self):
        """Concatenate the before/after training feature frames, split off
        the label, and target-encode the categorical training columns.

        NOTE(review): self.__test_feature is selected but not encoded in
        this method — confirm the encoder is applied to it elsewhere.
        """
        self.__train_feature_before = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_before_df.csv"))
        self.__train_feature_after = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_after_df.csv"))
        # Row-wise concatenation of the two training partitions.
        self.__train = pd.concat(
            [self.__train_feature_before, self.__train_feature_after])
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"].copy()
        # Drop the label and every SK_ID* identifier column.
        self.__train_feature = (self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                  axis=1)).copy()
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()].copy()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()

        # Fit on the training data and encode the training columns in place.
        encoder = TargetEncoder()
        encoder.fit(self.__train_feature[self.__categorical_columns],
                    self.__train_label)
        self.__train_feature[self.__categorical_columns] = encoder.transform(
            self.__train_feature[self.__categorical_columns])
Beispiel #8
0
class DFMeanEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around ``TargetEncoder``.

    Encodes only the requested columns and returns a frame where the
    untouched columns come first, followed by the encoded ones.
    """

    def __init__(self, columns=None, **kwargs):
        # None means "encode every column seen at fit time".
        self.columns = columns
        self.model = TargetEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        """Fit the underlying encoder on the selected columns of X."""
        if self.columns is None:
            self.columns = X.columns
        # Preserve X's column order while filtering to the requested set.
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        """Return X with the fitted columns replaced by encoded values."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        untouched = X.drop(columns=self.transform_cols)
        encoded = self.model.transform(X[self.transform_cols])
        return pd.concat([untouched, encoded], axis=1)

    def fit_transform(self, X, y):
        """Fit on (X, y) and immediately transform X."""
        return self.fit(X, y).transform(X)
Beispiel #9
0
 def __init__(self):
     """Set up the imputer and target encoder for categorical columns."""
     # Fills missing categorical values with each column's mode.
     self.mode_imputer = SimpleImputer(strategy="most_frequent")
     # Columns handled by the mode imputer.
     self.cat_cols = [
         'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
     ]
     # Unseen/missing categories map to NaN rather than a learned default.
     self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                         handle_unknown='return_nan')
Beispiel #10
0
    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        """Build the bundle of category encoders.

        Args:
            sparksess: optional Spark session used by the caller.
            logdir: directory where encoder artifacts are written.
            handle_unknown: strategy/value for unseen categories, forwarded
                to every underlying encoder.
            save_encoder: whether fitted encoders should be persisted.
        """
        self.spark = sparksess
        self.logdir = logdir
        # BUG FIX: the original line was the bare expression
        # `self.save_encoder` — it never stored the flag (and would raise
        # AttributeError if no class attribute exists). Assign it properly.
        self.save_encoder = save_encoder

        # Feature lists start empty; callers populate them before fitting.
        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
    def data_prepare(self):
        """Load selected train/test features; drop high-NaN columns, impute
        numeric medians, target-encode categoricals, and min-max scale."""
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        # Drop the label and every SK_ID* identifier column.
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # drop column na: keep only columns with < 20% missing values.
        self.__train_feature = self.__train_feature[list(
            (self.__train_feature.isna().sum() /
             self.__train_feature.isna().count()
             )[(self.__train_feature.isna().sum() /
                self.__train_feature.isna().count()) < 0.2].index)]
        self.__test_feature = self.__test_feature[
            self.__train_feature.columns.tolist()]

        # These hold column labels, not positional indices.
        self.__categorical_index = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__numeric_index = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()

        # Median imputation of all numeric NaNs, fitted on train only.
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_index])
        self.__train_feature[self.__numeric_index] = self.__filler.transform(
            self.__train_feature[self.__numeric_index])
        self.__test_feature[self.__numeric_index] = self.__filler.transform(
            self.__test_feature[self.__numeric_index])

        # encoder: fit on train, apply the same mapping to train and test.
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_index],
                           self.__train_label)
        self.__train_feature[
            self.__categorical_index] = self.__encoder.transform(
                self.__train_feature[self.__categorical_index])
        self.__test_feature[
            self.__categorical_index] = self.__encoder.transform(
                self.__test_feature[self.__categorical_index])

        # scaler: pandas in, numpy out — rebuild DataFrames afterwards.
        self.__scaler = MinMaxScaler()
        self.__scaler.fit(self.__train_feature)
        self.__train_feature = pd.DataFrame(
            self.__scaler.transform(self.__train_feature),
            columns=self.__train_feature.columns)
        self.__test_feature = pd.DataFrame(self.__scaler.transform(
            self.__test_feature),
                                           columns=self.__test_feature.columns)
Beispiel #12
0
    def data_prepare(self):
        """Target-encode the training features, then project them onto the
        first two principal components (columns col_1 / col_2)."""
        # Encode every feature column against the label.
        encoder = TargetEncoder()
        encoder.fit(self.__train_feature, self.__train_label)
        self.__encoder = encoder
        self.__train_feature = encoder.transform(self.__train_feature)

        # Reduce to a fixed 2-D representation for downstream use.
        self.__pca = PCA(n_components=2, random_state=7)
        reduced = self.__pca.fit_transform(self.__train_feature)
        self.__train_feature = pd.DataFrame(reduced,
                                            columns=["col_1", "col_2"])
Beispiel #13
0
class CategoricalPreprocessing(BaseEstimator, TransformerMixin):
    """Preprocess categorical loan features: mode-impute selected columns,
    target-encode zip_code, and convert employment length and date columns
    to numeric values."""

    def __init__(self):
        # Fills missing categorical values with each column's mode.
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        # Columns handled by the mode imputer.
        self.cat_cols = [
            'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
        ]
        # Unseen/missing zip codes map to NaN instead of a learned default.
        self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                            handle_unknown='return_nan')

    def fit(self, X, y=None):
        """Fit the mode imputer on the categorical columns and the target
        encoder on zip_code against y."""
        self.mode_imputer.fit(X[self.cat_cols])
        self.target_encoder.fit(X["zip_code"], y)
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of X; the input frame is not modified."""
        Xc = X.copy()

        # encode emp_length: map the string buckets to ordinal integers.
        lookup = {
            '< 1 year': 0,
            '1 year': 1,
            '2 years': 2,
            '3 years': 3,
            '4 years': 4,
            '5 years': 5,
            '6 years': 6,
            '7 years': 7,
            '8 years': 8,
            '9 years': 9,
            '10+ years': 10
        }
        Xc["emp_length"] = Xc["emp_length"].replace(lookup)

        # issue date -> months elapsed since 2000-01-01.
        # NOTE(review): casting to 'timedelta64[M]' was deprecated and later
        # removed in newer pandas — assumes an older pinned pandas; confirm.
        Xc["issue_d"] = pd.to_datetime(Xc["issue_d"])
        tmp = Xc[
            "issue_d"].values  # keep a copy of the raw date for when we transform earliest credit line
        Xc["issue_d"] = (
            Xc["issue_d"] -
            datetime.datetime(2000, 1, 1)).astype('timedelta64[M]')

        # earliest credit line -> months between issue date and first credit line.
        Xc["earliest_cr_line"] = pd.to_datetime(Xc["earliest_cr_line"])
        Xc["earliest_cr_line"] = (
            tmp - Xc["earliest_cr_line"]).astype('timedelta64[M]')

        # imputation for home_ownership, purpose, addr_state, and initial_list_status
        Xc[self.cat_cols] = self.mode_imputer.transform(Xc[self.cat_cols])

        # encode zip code with the encoder fitted in fit().
        Xc["zip_code"] = self.target_encoder.transform(Xc["zip_code"])

        return Xc

    def fit_transform(self, X, y=None):
        """Convenience: fit on (X, y) then transform the same X."""
        return self.fit(X, y).transform(X)
Beispiel #14
0
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Target-encode the categorical columns of the merged
     Base/CreditCardBalance frame and aggregate to one mean-encoded row
     per SK_ID_CURR, with columns renamed to ``*_target_encode``."""
     df = Base.get_df(conf)
     df = df.merge(CreditCardBalance.get_df(conf), on="SK_ID_CURR", how="left")
     # Fit on the labelled (train) rows only, then transform all rows.
     train_df = df[df['TARGET'].notnull()].copy()
     categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
     df = TargetEncoder(cols=categorical_columns).fit(train_df, train_df['TARGET']).transform(df)
     # Mean of the encoded values per applicant.
     df = df.groupby(by=['SK_ID_CURR'], as_index=False).agg({col: 'mean' for col in categorical_columns})
     return df[categorical_columns + ['SK_ID_CURR']].rename(
         columns={col: f"{col}_target_encode" for col in categorical_columns}
     )
Beispiel #15
0
    def data_prepare(self):
        """Load the selected training features and target-encode the
        object-dtype columns in place."""
        train_path = os.path.join(self.__input_path,
                                  "train_select_feature_df.csv")
        self.__train = pd.read_csv(train_path)
        self.__train_label = self.__train["TARGET"]
        # Drop the label plus every SK_ID* identifier column.
        id_cols = [col for col in self.__train.columns.tolist()
                   if re.search(r"SK_ID", col)]
        self.__train_feature = self.__train.drop(["TARGET"] + id_cols, axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = (
            self.__train_feature.select_dtypes("object").columns.tolist())
        cat = self.__categorical_columns
        self.__encoder.fit(self.__train_feature[cat], self.__train_label)
        self.__train_feature[cat] = self.__encoder.transform(
            self.__train_feature[cat])
Beispiel #16
0
def categorical_encoding(df_X, y, cat_vars, id_train, method=None):
    """Target-encode ``cat_vars`` of ``df_X``, fitting only on the rows
    selected by ``id_train``; returns (values, columns).

    With ``method=None`` no encoding is performed and the raw matrix is
    returned unchanged.
    """
    # No encoding requested: hand back the raw matrix and column index.
    if method is None:
        return df_X.values, df_X.columns

    encoder = TargetEncoder(cols=cat_vars,
                            drop_invariant=False,
                            return_df=True,
                            impute_missing=False,
                            handle_unknown='error')
    # Fit on the training subset only to avoid target leakage.
    encoder.fit(df_X.iloc[id_train], pd.Series(y).iloc[id_train])
    encoded = encoder.transform(df_X)

    return encoded.values, encoded.columns
Beispiel #17
0
    def transform(self, X):
        """Target-encode self.cols of X (fitting on X itself against
        X[self.target_col]), optionally followed by an ordinal encoding of
        the same columns.

        NOTE(review): when aliases are set, self.cols is overwritten here,
        so a second call to transform behaves differently — confirm this
        mutation is intentional.
        """

        if self.aliases:
            # Copy the source columns under their alias names and encode
            # the aliased copies instead of the originals.
            X[self.aliases] = X[self.cols]
            self.cols = self.aliases

        t_enc = TargetEncoder(cols=self.cols)
        t_enc = TargetEncoder(cols=self.cols) if False else t_enc  # no-op guard removed in review
        X = t_enc.fit_transform(X, X[self.target_col])
        if not self.ordinal_transform:
            return X

        # Second pass: replace the encoded values with ordinal ranks.
        o_enc = OrdinalEncoder()
        X[self.cols] = o_enc.fit_transform(X[self.cols])
        return X
Beispiel #18
0
    def data_prepare(self):
        """Load base + residual feature sets, merge them, fill missing
        values, target-encode categoricals with a train-fitted encoder, and
        shuffle the training rows."""
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path_1, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        # EXT_SOURCE_* already exist in the base features; drop duplicates.
        self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)

        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)

        # Positional indices of the object-dtype (categorical) columns.
        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
        )

        # Target-encode: fit on train, apply the same mapping to both sets.
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
        )

        # There are NaNs in test dataset (feature number 77) but there were no NaNs in learn dataset"
        # Sentinel-fill numeric NaNs on the side opposite the column median.
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__numeric_index] = (
            self.__train_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        self.__test_feature.iloc[:, self.__numeric_index] = (
            self.__test_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )

        # A shuffle is needed before blending; strictly it is not required
        # here because StratifiedKFold shuffles again later.
        self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    """Transformer that fits a tuned random forest on target-encoded
    features and emits each sample's per-tree leaf indices as string
    columns — an "entity embedding" via tree paths."""

    def __init__(self, *, numeric_columns, categorical_columns):
        # Column-name lists supplied by the caller.
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder, self.__one_hot_encoder = [
            None for _ in range(2)
        ]
        # Best score / best params produced by the optimiser in fit().
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        """Target-encode X, tune a RandomForest via optimize_rf, and fit it.

        X and y are deep-copied, so the caller's data is untouched.
        """
        X = X.copy(deep=True)
        y = y.copy(deep=True)

        self.__target_encoder = TargetEncoder()
        # Sentinel-fill numerics; stringify and fill categoricals.
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)

        self.__max_target, self.__max_param = optimize_rf(X, y)
        # NOTE(review): min_samples_leaf is clamped into [0, 1.0]; exactly 0
        # is invalid for RandomForestClassifier — assumes the optimiser
        # never returns 0. Confirm.
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(
                min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))

        self.__clf.fit(X, y)
        gc.collect()

        return self

    def transform(self, X):
        """Apply the fitted encoder and forest; return leaf ids as strings."""
        X = X.copy(deep=True)

        # Same preprocessing as fit().
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()

        # clf.apply() yields the leaf index per tree for every sample.
        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X=X, y=y)

        return self.transform(X)
def remove_outliers(data, columns_config):
    """Fit an IsolationForest on pipeline-preprocessed features and return
    ``data`` restricted to the inlier rows (index reset)."""
    quantitative = columns_config["quantitative_columns"]
    semi_quali = columns_config["semi_quali_columns"]
    qualitative = columns_config["qualitative_columns"]
    target_features = columns_config["target_features_list"]
    indicator_features = columns_config["indicator_features_list"]

    # Preprocessing steps, applied in order.
    steps = [
        ExcludeColumnsTransformer(["Id"]),
        CreateSumTransformer(target_features),
        CreateOneHotTransformer(indicator_features),
        NewHouseTransformer(),
        BoxCoxTransformer(quantitative),
        FillnaMeanTransformer(quantitative),
        TargetEncoder(semi_quali),
        SimpleOneHotEncoder(qualitative),
        NormalizeTransformer(quantitative),
        FillnaMeanMatrixTransformer(),
    ]
    pipeline = make_pipeline(*steps)

    # Prepare Data Training
    features = pipeline.fit_transform(data, data[['SalePrice']])

    # fit the model; predict() returns 1 for inliers, -1 for outliers.
    detector = IsolationForest(max_samples=100)
    detector.fit(features)
    is_inlier = detector.predict(features)
    clean_df = data[is_inlier == 1].reset_index(inplace=False, drop=True)

    return clean_df
Beispiel #21
0
def mean_encode(columns: Union[List[str], str],
                targets: Union[List[str], str],
                smoothing: float = 1.0,
                min_samples_leaf: int = 1) -> CategoryEncoder:
    """Performs mean target encoding in parallel

    An alias to stl.category_encode(TargetEncoder(smoothing, min_samples_leaf), columns, targets).

    Args:
        columns: list of encoded columns. Treats string as a list of length 1
        targets: list of target columns. Should be provided if encoder uses target. Treats string as a list of length 1
        smoothing: smoothing effect to balance categorical average vs prior.
            Higher value means stronger regularization.
            The value must be strictly bigger than 0.
        min_samples_leaf: minimum samples to take category average into account.

    Returns:
        A feature constructor performing mean encoding for each pair (column, target) and returning the concatenation.

    Examples:
        >>> stl.mean_encode(['Sex', 'Embarked'], ['Survived', 'Age'])
        >>> stl.mean_encode(['Sex', 'Embarked'], 'Survived', smoothing=1.5, min_samples_leaf=5)
    """
    enc = TargetEncoder(smoothing=smoothing, min_samples_leaf=min_samples_leaf)
    return category_encode(enc, columns=columns, targets=targets)
        def gbm_model_crossval(learning_rate, n_estimators, subsample,
                               colsample_bytree, reg_alpha, reg_lambda):
            """Objective for Bayesian optimisation: mean ROC-AUC of an
            encoder + LightGBM pipeline under 5-fold stratified CV.

            All parameters are LightGBM hyper-parameters under tuning;
            n_estimators arrives as a float and is rounded to an int.
            """
            estimator = Pipeline([
                ("ENCODER",
                 ColumnTransformer(
                     [("ORD_ENCODER", OrdinalEncoder(categories="auto"),
                       ord_encoder_columns),
                      ("TAR_ENCODER", TargetEncoder(cols=tar_encoder_columns),
                       tar_encoder_columns)],
                     remainder="drop")),
                ("LGBMCLF",
                 LGBMClassifier(max_depth=1,
                                learning_rate=learning_rate,
                                # np.int was removed in NumPy 1.24; the
                                # builtin int (which it aliased) is the
                                # drop-in replacement.
                                n_estimators=int(np.round(n_estimators)),
                                subsample=subsample,
                                colsample_bytree=colsample_bytree,
                                reg_alpha=reg_alpha,
                                reg_lambda=reg_lambda,
                                random_state=7,
                                n_jobs=-1))
            ])

            cval = cross_val_score(estimator,
                                   self.__train_feature,
                                   self.__train_label,
                                   scoring="roc_auc",
                                   cv=StratifiedKFold(n_splits=5,
                                                      shuffle=True,
                                                      random_state=7))

            return cval.mean()
Beispiel #23
0
def plot_1_6(X, y):
    """ Evaluates 3 classifiers and plots the results in a bar chart.
    Also compares different category encoders
    """
    classifiers = [
        LogisticRegression(random_state=1),
        SVC(random_state=1),
        RandomForestClassifier(random_state=1),
    ]
    encoders = [
        OneHotEncoder(sparse=False, handle_unknown='ignore'),
        TargetEncoder(),
    ]

    # Identical folds for every classifier/encoder pair so the scores
    # are directly comparable (shuffle and random_state=1).
    folds = KFold(n_splits=3, shuffle=True, random_state=1)

    results = []
    for clf in classifiers:
        for enc in encoders:
            pipe = flexible_pipeline(categorical, clf, enc)
            scores = cross_val_score(pipe,
                                     X,
                                     y,
                                     cv=folds,
                                     n_jobs=-1,
                                     scoring='roc_auc')
            results.append(np.mean(scores))

    # Rows: classifiers; columns: encoders.
    heatmap(['OneHot', 'Target'], ['Logistic', 'SVM', 'Random Forest'],
            [results[0:2], results[2:4], results[4:6]])
Beispiel #24
0
    def data_prepare(self):
        """Load the training features, impute and encode them, drop constant
        columns, and write per-feature mutual-information scores to CSV."""
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"],
                                                 axis=1)
        self.__numeric_columns = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()

        # Median-impute the numeric columns.
        self.__imputer = Imputer(strategy="median")
        self.__imputer.fit(self.__train_feature[self.__numeric_columns])
        self.__train_feature[self.__numeric_columns] = (
            self.__imputer.transform(
                self.__train_feature[self.__numeric_columns]))

        # Fill missing categories, then target-encode them.
        self.__train_feature[self.__categorical_columns] = (
            self.__train_feature[self.__categorical_columns].fillna("missing"))
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        # Unsupervised feature filter: drop zero-variance columns.
        self.__unsupervise_selector = VarianceThreshold()
        self.__unsupervise_selector.fit(self.__train_feature)
        self.__train_feature = (pd.DataFrame(
            self.__unsupervise_selector.transform(self.__train_feature),
            columns=[
                i for i, j in zip(self.__train_feature.columns,
                                  self.__unsupervise_selector.get_support())
                if j == 1
            ]))

        # Supervised feature filter: mutual information with the label,
        # persisted to CSV for later feature selection.
        pd.concat([
            pd.Series(self.__train_feature.columns).to_frame("feature"),
            pd.Series(
                mutual_info_classif(self.__train_feature,
                                    self.__train_label)).to_frame("mi")
        ],
                  axis=1).to_csv(os.path.join(self.__output_path,
                                              "train_feature_df_fs_mi.csv"),
                                 index=False)
Beispiel #25
0
    def data_prepare(self):
        """Keep the top-200 features by mean importance, target-encode the
        categoricals, and median-impute the numeric columns of both train
        and test."""
        self.__feature_importance = pd.read_csv(
            os.path.join(self.__input_path,
                         "feature_importance_feature_data_V5.csv"))
        # Mean importance per feature, sorted descending.
        self.__feature_importance = (self.__feature_importance.groupby([
            "feature"
        ])["importance"].mean().to_frame("importance").reset_index(
            drop=False)).sort_values("importance",
                                     ascending=False).reset_index(drop=True)
        # Names of the 200 most important features.
        self.__feature_top_column = list(self.__feature_importance.iloc[0:200,
                                                                        0])

        # Read only the selected columns (plus the label for train).
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"),
            usecols=self.__feature_top_column + ["TARGET"])
        self.__test = pd.read_csv(os.path.join(self.__input_path,
                                               "test_select_feature_df.csv"),
                                  usecols=self.__feature_top_column)

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # encoder: fit on train, apply the same mapping to train and test.
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__train_feature[self.__categorical_columns])
        self.__test_feature[
            self.__categorical_columns] = self.__encoder.transform(
                self.__test_feature[self.__categorical_columns])

        # filler: median imputation fitted on the train numeric columns.
        self.__numeric_columns = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_columns])
        self.__train_feature[self.__numeric_columns] = self.__filler.transform(
            self.__train_feature[self.__numeric_columns])
        self.__test_feature[self.__numeric_columns] = self.__filler.transform(
            self.__test_feature[self.__numeric_columns])
 def fit(self, data: pd.DataFrame):
     """Fit one TargetEncoder per target column over self.cols, stored in
     self.encoders under the key 'enc_<target>'."""
     log.info("TargetEncode fit: %s", self.targets)
     for target in self.targets:
         key = "enc_{}".format(target)
         encoder = TargetEncoder(cols=self.cols,
                                 handle_missing="return_nan")
         self.encoders[key] = encoder
         log.info("Target encoding fit for target: %s", target)
         encoder.fit(data[self.cols], data[target])
Beispiel #27
0
    def getTestTrainSlipt(self):
        """Optionally target-encode the configured columns, then return
        (X_train, X_test, y_train, y_test).

        Raises:
            Exception: if neither ``testX`` nor ``testTrainSplit`` was
                supplied.
        """
        ## If both testX and testTrainSplit are not passed throw exception.
        if ((self.testX is None) and (self.testTrainSplit is None)):
            raise Exception("Please pass testX or testTrainSplit")

        if (self.targetEncodeCols):
            for col in self.targetEncodeCols:
                encoder = TargetEncoder()
                # BUG FIX: TargetEncoder requires the target to fit; the
                # original called fit_transform(self.X[col]) without y.
                self.X[col] = encoder.fit_transform(self.X[col], self.Y)
                # BUG FIX: transform the test column with the encoder
                # fitted on train — re-fitting on test both leaked and
                # lacked a target. Also `if (self.testX)` is ambiguous for
                # a DataFrame; test for None explicitly.
                if (self.testX is not None):
                    self.testX[col] = encoder.transform(self.testX[col])

        if (self.testTrainSplit):
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.Y, test_size=self.testTrainSplit, random_state=7)
            return X_train, X_test, y_train, y_test
        else:
            return self.X, self.testX, self.Y, self.testY
def clean_train_data_target_encoded(data):
    """Split *data* into features/label, target-encode the categorical
    columns and return the re-assembled frame plus the fitted encoder.

    The last column of *data* is taken to be the label.
    """
    data = data.reset_index(drop=True)
    labels = data.iloc[:, -1].reset_index(drop=True)
    features = process_features(data.iloc[:, :-1])

    # Smoothing of 300 pulls sparse categories strongly toward the global
    # mean. NOTE: "Satisfation with employer" matches the dataset's own
    # (misspelled) column name — do not "fix" it.
    encoder = TargetEncoder(
        cols=["Hair Color", "Wears Glasses", "University Degree", "Gender",
              "Country", "Profession", "Housing Situation",
              "Satisfation with employer"],
        smoothing=300)
    encoder.fit(features, labels)

    encoded = encoder.transform(features, labels).reset_index(drop=True)
    combined = pd.concat([encoded, labels.reset_index(drop=True)], axis=1)
    #combined = combined.fillna(method="ffill")

    return (combined, encoder)
Beispiel #29
0
def frontend_preproc(df, y):
    '''
    Function that produces the preprocessing of the DataFrame before applying the model on the front-end.
    :df: concat of df_input by the user and X features of the model
    :y: target
    '''
    ### Feature Engineering
    ohe_cols = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors']

    # One-hot encode the low-cardinality columns.
    ohe = OneHotEncoder(categories='auto')
    feature_arr = ohe.fit_transform(df[ohe_cols]).toarray()

    # Build "<column>_<category>" labels, one per encoded level.
    feature_cols = [
        col + '_' + str(level)
        for col, levels in zip(ohe_cols, ohe.categories_)
        for level in levels
    ]

    ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)
    df = pd.concat([df, ohe_features], axis=1)
    df = df.drop(ohe_cols, axis=1)

    # Target-encode whatever non-numeric columns remain into *_encoded copies.
    cat_cols = df.select_dtypes(exclude=["number"]).columns
    cols_encoded = [c + '_encoded' for c in cat_cols]

    t_encoder = TargetEncoder()
    # Fit skips row 0 — presumably the user-supplied input row, so that y
    # aligns with the model's X features only. TODO confirm against caller.
    t_encoder.fit(df[1:][cat_cols], y)
    df[cols_encoded] = t_encoder.transform(df[cat_cols])
    df = df.drop(cat_cols, axis=1)

    # Map every column onto a normal distribution.
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=33)
    df = pd.DataFrame(qt.fit_transform(df), columns=df.columns)

    return df
Beispiel #30
0
class ScatterPlot(object):
    """Load train.csv, target-encode its features, project them to 2-D with
    PCA and save a scatter plot coloured by the label."""

    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        self.__train = None
        self.__train_feature, self.__train_label = None, None

        self.__encoder = None
        self.__pca, self.__t_sne = None, None

    def data_read(self):
        """Read train.csv, drop the id column and split features from label."""
        self.__train = pd.read_csv(os.path.join(self.__input_path,
                                                "train.csv"))
        self.__train = self.__train.drop(["id"], axis=1)
        self.__train_feature = self.__train.drop(["target"],
                                                 axis=1).copy(deep=True)
        self.__train_label = self.__train["target"].copy(deep=True)
        # Every feature is cast to str so TargetEncoder treats it as
        # categorical.
        self.__train_feature = self.__train_feature.astype(str)

    def data_prepare(self):
        """Target-encode the features and reduce them to two PCA components."""
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature, self.__train_label)
        self.__train_feature = self.__encoder.transform(self.__train_feature)

        self.__pca = PCA(n_components=2, random_state=7)
        self.__train_feature = pd.DataFrame(
            self.__pca.fit_transform(self.__train_feature),
            columns=["col_1", "col_2"])

        # t-SNE alternative, kept for reference:
        # self.__t_sne = TSNE(verbose=True, random_state=7)
        # self.__train_feature = self.__t_sne.fit_transform(self.__train_feature)
        # self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])

    def scatter_plot(self):
        """Draw the 2-D scatter coloured by label and save it as PCA.png."""
        _, axis = plt.subplots(figsize=(16, 9))
        axis = sns.scatterplot(x="col_1",
                               y="col_2",
                               hue=self.__train_label,
                               data=self.__train_feature,
                               ax=axis)
        axis.get_figure().savefig(os.path.join(self.__output_path, "PCA.png"))