class CustomEncoder(BaseEstimator, TransformerMixin):
    """Encode object-dtype columns with WOE or dummy variables, by cardinality.

    ``fit`` inspects every column of ``X`` whose dtype is ``object``. Columns
    with more distinct values than ``thresh_func(len(X))`` are encoded with a
    ``WOEEncoder``; the remaining ones are expanded with ``pd.get_dummies``.

    Attributes set by ``fit``:
        woe: the fitted ``WOEEncoder``.
        woe_cols: names of the columns that are WOE encoded.
        dummy_cols: names of the columns that are dummy encoded.
        thresh: cardinality threshold derived from the number of training rows.
        output_cols: column layout produced for the training frame; the output
            of ``transform`` is aligned to it.
    """

    def __init__(self):
        self.woe = None
        self.woe_cols = None
        self.dummy_cols = None
        self.thresh = None
        self.output_cols = None

    @staticmethod
    def thresh_func(n):
        """Return the cardinality threshold ``n // 50 + ceil(log2(n))`` for ``n`` rows."""
        return n // 50 + np.ceil(np.log2(n))

    def _encode(self, X, y=None):
        """Apply the fitted WOE encoder, then dummy-encode ``self.dummy_cols``."""
        X = self.woe.transform(X, y)
        return pd.get_dummies(X, columns=self.dummy_cols)

    def fit(self, X, y):
        """Choose the encoding for each object column and fit the WOE encoder.

        Parameters:
            X: DataFrame with the training features.
            y: target passed to ``WOEEncoder.fit``.

        Returns:
            self
        """
        self.woe_cols = []
        self.dummy_cols = []
        self.thresh = self.thresh_func(len(X))
        cat_cols = X.dtypes[X.dtypes == "object"].index
        for col in cat_cols:
            if len(X[col].unique()) > self.thresh:
                self.woe_cols.append(col)
            else:
                self.dummy_cols.append(col)
        self.woe = WOEEncoder(drop_invariant=True, random_state=1234, cols=self.woe_cols)
        self.woe.fit(X, y)
        # Remember the training layout: get_dummies only creates columns for
        # the categories present in the frame it is given, so another frame
        # could otherwise come out with a different set of columns.
        self.output_cols = self._encode(X).columns
        return self

    def transform(self, X, y=None):
        """Encode ``X`` and align the result to the columns seen during ``fit``.

        Dummy columns for categories that are absent from ``X`` are added and
        filled with 0; dummy columns for categories that were not present
        during ``fit`` are dropped.

        Parameters:
            X: DataFrame to encode.
            y: optional target, forwarded to ``WOEEncoder.transform``.

        Returns:
            The encoded DataFrame.
        """
        encoded = self._encode(X, y)
        # getattr keeps instances created before ``output_cols`` existed usable.
        expected = getattr(self, "output_cols", None)
        if expected is not None and not encoded.columns.equals(expected):
            encoded = encoded.reindex(columns=expected, fill_value=0)
        return encoded
class DFWOEEncoder(BaseEstimator, TransformerMixin):
    """Apply a ``WOEEncoder`` to selected columns of a DataFrame.

    The selected columns are encoded by the wrapped encoder; all other columns
    are passed through unchanged. In the output the untouched columns come
    first, followed by the encoder's output.

    Parameters:
        columns: names of the columns to encode; ``None`` selects every column
            of the frame given to ``fit``.
        **kwargs: forwarded to ``WOEEncoder``.

    Attributes:
        model: the wrapped ``WOEEncoder``.
        transform_cols: columns of the fitted frame that are encoded
            (``None`` until ``fit`` has been called).
    """

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = WOEEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        """Fit the wrapped encoder on the selected columns of ``X``.

        Returns:
            self
        """
        # Resolve the selection into a local: overwriting ``self.columns``
        # would freeze the first frame's columns and make a later fit on a
        # different frame silently reuse them.
        wanted = X.columns if self.columns is None else self.columns
        self.transform_cols = [col for col in X.columns if col in wanted]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        """Return ``X`` with the selected columns replaced by their WOE encoding.

        Raises:
            NotFittedError: if ``fit`` has not been called yet.
        """
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )
        untouched = X.drop(columns=self.transform_cols)
        encoded = self.model.transform(X[self.transform_cols])
        return pd.concat([untouched, encoded], axis=1)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        return self.fit(X, y).transform(X)
def fit(self, X, y):
    """Sort the object-dtype columns of ``X`` into WOE and dummy groups, then fit.

    A column goes to ``self.woe_cols`` when it has more distinct values than
    ``self.thresh`` (computed by ``self.thresh_func`` from the row count) and to
    ``self.dummy_cols`` otherwise. A ``WOEEncoder`` restricted to the WOE
    columns is stored in ``self.woe`` and fitted on ``(X, y)``.

    Returns:
        self
    """
    self.woe_cols, self.dummy_cols = [], []
    self.thresh = self.thresh_func(len(X))

    is_object = X.dtypes == "object"
    for name in X.dtypes[is_object].index:
        n_distinct = len(X[name].unique())
        target_list = self.woe_cols if n_distinct > self.thresh else self.dummy_cols
        target_list.append(name)

    self.woe = WOEEncoder(
        drop_invariant=True,
        random_state=1234,
        cols=self.woe_cols,
    )
    self.woe.fit(X, y)
    return self
def get_signals(X_train, y_train, X_test, threshold):
    """
    Fit a balanced weight-of-evidence scorecard and predict test signals.

    The function has no awareness of what it is predicting (buy or sell
    signals); it only sees features and a target. The pipeline scales the
    features, bins them into 5 ordinal bins, converts the bins with
    ``stringify``, WOE-encodes them, imputes missing values with 0.0 and fits a
    class-balanced logistic regression.

    Parameters
    ----------
    X_train : training features (its ``columns`` are logged).
    y_train : training target; its ``.values`` are used for fitting.
    X_test : features to predict signals for.
    threshold : probability cut-off applied to the predicted probabilities.

    Returns
    -------
    tuple
        ``(y_train.values, test_signals)`` where ``test_signals`` is an int
        array that is 1 where the second column of ``predict_proba`` exceeds
        ``threshold`` and 0 elsewhere.
    """
    log(f"- Building model with features: {X_train.columns}")

    pipe = make_pipeline(
        StandardScaler(),
        KBinsDiscretizer(n_bins=5, encode='ordinal'),
        FunctionTransformer(func=stringify, check_inverse=False, validate=False),
        WOEEncoder(),
        SimpleImputer(strategy='constant', fill_value=0.0),
        LogisticRegression(class_weight='balanced', random_state=42),
    )
    pipe.fit(X_train, y_train.values)

    above_threshold = pipe.predict_proba(X_test) > threshold
    test_signals = above_threshold.astype(int)[:, 1]
    return y_train.values, test_signals.copy()
def fit_transform(self, df, colname, targetname):
    '''
    Fit a WOE encoder on one column, transform that column in a copy of df and
    save the attributes needed by transform()/inverse_transform().

    The encoder is built as ``WOEEncoder(**self._params)``. The original notes
    say the variable is encoded with minor noise to reduce the risk of
    overfitting; presumably that is controlled through ``self._params`` --
    confirm against the code that sets it.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the colname to transform.
    colname : str
        Column name in df to be transformed.
    targetname : str
        Column name of the binary target used to fit the encoder.

    Returns
    -------
    transformed_df : pd.DataFrame
        Copy of df with the column transformed.
    '''
    assert_fit_transform_args(df, colname)
    assert_binary_target(df, targetname)

    from category_encoders import WOEEncoder

    self._colname = colname
    generic_encoder = WOEEncoder(**self._params)
    encoded_column = generic_encoder.fit_transform(df[colname], df[targetname])
    # Must be stored before self.transform() is called below.
    self.__generic_encoder = generic_encoder

    encoded_df = df.copy()
    encoded_df[self._colname] = encoded_column

    # Save the inverse_transform pattern for the test set (per the original
    # comment, transform() yields the values without noise). The WOE value and
    # the original value are paired row by row, on the first occurrence of
    # each category: zipping two independent unique() arrays misaligns as soon
    # as two categories share the same WOE value.
    # Assumes transform() keeps the row order and row count of df.
    woe_vals_no_noise = self.transform(df)[self._colname].values
    original_vals = df[self._colname].values
    first_seen = ~df[self._colname].duplicated().values
    self.__pattern = dict(zip(woe_vals_no_noise[first_seen],
                              original_vals[first_seen]))
    return encoded_df
def WOE_Encoding(self, regularization: float = 1.0, sigma: float = 0.05, randomized: bool = False):
    """
    Set up weight-of-evidence (WOE) encoding.

    Builds a ``WOEEncoder`` for ``self.cols`` and stores it in ``self.encoder``.
    The encoder is only created here, not fitted.

    :param regularization: forwarded to ``WOEEncoder(regularization=...)``
    :param sigma: forwarded to ``WOEEncoder(sigma=...)``
    :param randomized: forwarded to ``WOEEncoder(randomized=...)``
    :return: None
    """
    encoder_options = {
        "cols": self.cols,
        "regularization": regularization,
        "randomized": randomized,
        "sigma": sigma,
    }
    self.encoder = WOEEncoder(**encoder_options)
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial, n_classes, random_state):
    """Build the categorical-encoding pipeline steps for an estimator.

    Columns with at most 6 distinct values are one-hot encoded. Columns with
    more distinct values get an encoder type chosen by the trial from
    "target", "binary" and "catboost" (plus "woe" when ``objective_type`` is
    "classification" and ``n_classes == 1``).

    Parameters:
        X: feature data; a DataFrame or an array that supports ``X[:, col]``.
        cat_cols: categorical columns. Column names for a DataFrame, or
            positional indices for a DataFrame or array.
        tags: estimator tags; encoding is only built when
            ``tags["handles categorical"] == False``.
        estimator_name: prefix of the trial parameter name.
        objective_type: objective of the estimator, e.g. "classification".
        trial: object providing ``suggest_categorical(name, choices)``.
        n_classes: number of classes as tracked by the caller.
        random_state: seed passed to the CatBoost encoder.

    Returns:
        The pipeline returned by ``add_to_pipe``, or ``None`` when no encoding
        step is needed (categoricals handled natively, or no categorical
        columns).
    """
    # Deliberately ``== False`` rather than ``not ...``: other falsy tag
    # values such as None are not treated as "cannot handle categoricals".
    if not (tags["handles categorical"] == False):  # noqa: E712
        return None
    # Nothing to encode; also protects the ``cat_cols[0]`` probe below.
    if len(cat_cols) == 0:
        return None

    large_threshold = 6
    # TODO: handle numpy arrays with categorical?
    # TODO: handle multiclass / Regression
    # Count the distinct values once per column, then partition on the count.
    if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
        cardinalities = [(col, X[col].nunique()) for col in X[cat_cols].columns]
    elif isinstance(X, pd.DataFrame):
        cardinalities = [(col, len(np.unique(X.iloc[:, col]))) for col in cat_cols]
    else:
        cardinalities = [(col, len(np.unique(X[:, col]))) for col in cat_cols]
    large_cardinal_cats = [col for col, n_unique in cardinalities if n_unique > large_threshold]
    small_cardinal_cats = [col for col, n_unique in cardinalities if n_unique <= large_threshold]

    enc_pipe = None
    if small_cardinal_cats:
        enc_pipe = add_to_pipe(enc_pipe, "ohe",
                               OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

    if large_cardinal_cats:
        cat_enc_types = ["target", "binary", "catboost"]
        if objective_type == "classification" and n_classes == 1:
            cat_enc_types.append("woe")
        cat_enc_type = trial.suggest_categorical(estimator_name + " cat_enc_type", cat_enc_types)

        if cat_enc_type == "binary":
            enc = BinaryEncoder(cols=large_cardinal_cats)
        elif cat_enc_type == "woe":
            enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
        elif cat_enc_type == "target":
            min_samples_leaf = 6  # TODO: calculate percentage or something else
            enc = TargetEncoder(min_samples_leaf=min_samples_leaf, cols=large_cardinal_cats)
        else:  # catboost
            # TODO: replace SEED
            # TODO: permute to the dataset beforehand
            enc = CatBoostEncoder(cols=large_cardinal_cats, random_state=random_state)
        enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)

    return enc_pipe
def __init__(self, kind, **kwargs):
    """Create the encoder selected by ``kind``.

    ``kind`` must be one of 'OHE', 'TE', 'LOOE', 'WOE' or 'LE'. For the first
    four, ``kwargs`` are forwarded to the encoder constructor; 'LE' builds
    ``MultiColumnTransformer(LabelEncoder)`` and does not use ``kwargs``.

    Raises:
        Exception: if ``kind`` is not one of the supported codes.
    """
    self.kind = kind

    if kind not in ('OHE', 'TE', 'LOOE', 'WOE', 'LE'):
        raise Exception(
            "Encoder type not supported, choose one of ('OHE','TE','LOOE','WOE', 'LE')"
        )

    # Zero-argument factories, so that only the selected encoder class is
    # looked up and instantiated.
    factories = {
        'OHE': lambda: OneHotEncoder(**kwargs),
        'TE': lambda: TargetEncoder(**kwargs),
        'LOOE': lambda: LeaveOneOutEncoder(**kwargs),
        'WOE': lambda: WOEEncoder(**kwargs),
        'LE': lambda: MultiColumnTransformer(LabelEncoder),
    }
    self.encoder = factories[kind]()
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    """Build the categorical-encoding pipeline steps for an estimator.

    Columns with at most 6 distinct values are one-hot encoded. Columns with
    more distinct values get an encoder type chosen by the trial from
    "binary", "catboost", "woe" and "target"; for multiclass classification
    (``n_classes > 2``) only "binary" is offered.

    Parameters:
        X: DataFrame with the features.
        cat_cols: names of the categorical columns in ``X``.
        tags: estimator tags; encoding is only built when
            ``tags["handles categorical"] == False``.
        objective_type: objective of the estimator, e.g. "classification".
        trial: object providing ``suggest_categorical(name, choices)``.
        n_classes: number of target classes.
        random_state: seed passed to the CatBoost encoder.

    Returns:
        The pipeline returned by ``add_to_pipe``, or ``None`` when no encoding
        step is needed (categoricals handled natively, or no categorical
        columns).
    """
    # Deliberately ``== False`` rather than ``not ...``: other falsy tag
    # values such as None are not treated as "cannot handle categoricals".
    if not (tags["handles categorical"] == False):  # noqa: E712
        return None
    if len(cat_cols) == 0:
        return None

    large_threshold = 6
    # TODO: handle numpy arrays with categorical?
    # Count the distinct values once per column, then partition on the count.
    cardinalities = [(col, X[col].nunique()) for col in X[cat_cols].columns]
    large_cardinal_cats = [col for col, n_unique in cardinalities if n_unique > large_threshold]
    small_cardinal_cats = [col for col, n_unique in cardinalities if n_unique <= large_threshold]

    enc_pipe = None
    # The lists are never None, so test for emptiness: an ``is not None``
    # check would add an encoder (and consume a trial suggestion) even when
    # there is no column to encode.
    if small_cardinal_cats:
        enc_pipe = add_to_pipe(enc_pipe, "ohe",
                               OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

    if large_cardinal_cats:
        if objective_type == "classification" and n_classes > 2:  # multiclass
            cat_enc_types = ["binary"]
        else:
            cat_enc_types = ["binary", "catboost", "woe", "target"]
        cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)

        if cat_enc_type == "binary":
            enc = BinaryEncoder(cols=large_cardinal_cats, drop_invariant=True)
        elif cat_enc_type == "woe":
            enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
        elif cat_enc_type == "target":
            min_samples_leaf = 10  # TODO: calculate percentage or something else
            enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                cols=large_cardinal_cats, drop_invariant=True)
        else:  # catboost
            # TODO: replace SEED
            # TODO: permute to the dataset beforehand
            enc = CatBoostEncoder(cols=large_cardinal_cats, drop_invariant=True,
                                  random_state=random_state)
        enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)

    return enc_pipe
def get_signals(hist, target, threshold):
    """
    Predict buy or sell signals for one price history.

    Helper used by get_possible_trades(); the function itself has no awareness
    of what it is predicting. ``target`` names the column that holds the
    target; every other column produced by ``features`` is used as a feature
    for training and prediction.

    The model is a balanced weight-of-evidence scorecard: 5 ordinal bins,
    ``stringify``, WOE encoding, constant imputation with 0.0 and a
    class-balanced logistic regression.

    The data is split 70% for training and 30% for testing. The date where
    the split happens depends on how much data ``hist`` contains, so the
    caller will not see a single split date for all tickers.

    Returns ``(y_train.values, test_signals)``, where ``test_signals`` is an
    int array that is 1 where the second column of ``predict_proba`` exceeds
    ``threshold``.
    """
    # NB: we do not include smooth in data!
    prices = hist[['Close', 'Open', 'Low', 'High']]
    data = features(prices, hist, target)

    used_cols = [name for name in data.columns.tolist() if name not in (target,)]
    _, _, X_train, X_test, y_train, _ = split_data(data, used_cols, target, 0.7)

    pipe = make_pipeline(
        KBinsDiscretizer(n_bins=5, encode='ordinal'),
        FunctionTransformer(func=stringify, check_inverse=False, validate=False),
        WOEEncoder(),
        SimpleImputer(strategy='constant', fill_value=0.0),
        LogisticRegression(class_weight='balanced', random_state=42),
    )
    pipe.fit(X_train, y_train.values)

    above_threshold = pipe.predict_proba(X_test) > threshold
    test_signals = above_threshold.astype(int)[:, 1]
    return y_train.values, test_signals
def doCleanupEncode(X, y=None, cat=None, oh=None, binary=None, loo=None, woe=None, lp_cols=None, NoData=True):
    """Clean up and encode the categorical columns of ``X``.

    Each step runs only when its column list is given, in this order:
    code-value replacement, one-hot, binary, weight-of-evidence,
    leave-one-out, dropping of the least predictive columns.

    Parameters:
        X: DataFrame to encode. Several steps assign to or drop from the frame
            in place, so the caller's frame may be modified.
        y: target used to fit the WOE and leave-one-out encoders.
        cat: columns that, together with ``oh``, get null/NaN/blank/9 values
            replaced by 0 when ``NoData`` is False.
        oh: columns to one-hot encode.
        binary: columns to binary encode.
        loo: columns to leave-one-out encode (missing values become 'NoData').
        woe: columns to WOE encode (missing values become 'NoData').
        lp_cols: least predictive columns, dropped at the end if present.
        NoData: when truthy, missing/unknown values get indicator handling in
            the one-hot and binary encoders; when False, the code values are
            replaced by 0 first and ``one_hot_encode`` is used instead.

    Returns:
        The encoded DataFrame with its index reset.
    """
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    # ``or``, not ``|``: ``|`` binds tighter than ``is not`` and evaluated
    # ``None | oh``, which raises TypeError. The lists are also combined
    # None-safely so that giving only one of ``cat``/``oh`` works.
    if NoData is False and (cat is not None or oh is not None):
        cv_cols = ([] if cat is None else list(cat)) + ([] if oh is None else list(oh))
        # translate associated columns' null, NaN, blank and 9 values to zero
        X = replaceCVs(X, cv_cols, [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            # Fit once; the earlier version fitted the encoder and then
            # refitted it inside fit_transform.
            # (Dropping the generated indicator columns was tried and did not
            # help performance.)
            ec = OneHotEncoder(cols=oh, use_cat_names=True, return_df=True,
                               handle_unknown='indicator', handle_missing='indicator')
            X = ec.fit_transform(X)
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        binary_options = dict(cols=binary, drop_invariant=True, return_df=True)
        if NoData:
            binary_options['handle_unknown'] = 'indicator'
        enc = BinaryEncoder(**binary_options).fit(X)
        X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for woe_col in woe:
            X[woe_col] = X[woe_col].fillna('NoData')
        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for loo_col in loo:
            X[loo_col] = X[loo_col].fillna('NoData')
        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    if lp_cols is not None:
        # drop least predictive
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    # NOTE(review): treated as a function-level final step; confirm it was not
    # meant to run only when lp_cols is given.
    X.reset_index(drop=True, inplace=True)

    return X
def __init__(self, columns=None, **kwargs):
    """Store the column selection and create the wrapped ``WOEEncoder``.

    Parameters:
        columns: columns to encode, stored as given (default ``None``).
        **kwargs: forwarded unchanged to ``WOEEncoder``.
    """
    # Stays None until the instance has been fitted.
    self.transform_cols = None
    self.model = WOEEncoder(**kwargs)
    self.columns = columns
def Convert_to_numeric(df, n_train=600000):
    """Convert the ordinal, binary, nominal and cyclic features of ``df`` to numbers.

    Steps:
      * ``ord_1``/``ord_2`` are mapped through a fixed rank table and
        ``ord_3``/``ord_4`` through the position of the letter in
        ``string.ascii_letters``; the columns in ``features_ord`` are then
        mean-imputed and standardized.
      * The binary, low-cardinality nominal and cyclic features are WOE
        encoded. One encoder is fitted per stratified fold of the training
        rows and the encodings are averaged over the folds into new
        ``<name>_Encode`` columns; the source columns are dropped.
      * The high-cardinality nominal and ordinal features are label encoded
        (missing values become the label "-1").

    Relies on the module-level names ``features_ord``, ``features_bin``,
    ``features_low_nom``, ``features_cyc``, ``features_hi_nom``,
    ``features_hi_ord`` and ``y_train``.

    Parameters:
        df: DataFrame whose first ``n_train`` rows are the training rows that
            ``y_train`` belongs to. Columns are assigned in place before the
            final drop, so the caller's frame is modified.
        n_train: number of leading training rows (default 600000).

    Returns:
        The converted DataFrame.
    """
    # Ordinal features
    map_ord = {
        'Novice': 0,
        'Contributor': 1,
        'Expert': 2,
        'Master': 3,
        'Grandmaster': 4,
        'Freezing': 0,
        'Cold': 1,
        'Warm': 2,
        'Hot': 3,
        'Boiling Hot': 4,
        'Lava Hot': 5
    }
    map_ord_hex = {letter: rank for rank, letter in enumerate(string.ascii_letters)}

    df['ord_1'] = df['ord_1'].replace(map_ord)
    df['ord_2'] = df['ord_2'].replace(map_ord)
    df['ord_3'] = df['ord_3'].replace(map_ord_hex)
    df['ord_4'] = df['ord_4'].replace(map_ord_hex)
    df[features_ord] = df[features_ord].fillna(df[features_ord].mean())

    StandardScaler_Encoder = preprocessing.StandardScaler()
    df[features_ord] = StandardScaler_Encoder.fit_transform(
        df[features_ord].astype(float))

    # Binary, low nominal and time features: WOE encoder averaged over folds.
    n_splits = 5
    WOE_features = features_bin + features_low_nom + features_cyc
    WOE_features_encode = [w + '_Encode' for w in WOE_features]
    for c in WOE_features_encode:
        df[c] = 0.0

    train_rows = df.iloc[:n_train]
    folds = StratifiedKFold(n_splits=n_splits, random_state=2020, shuffle=True)
    for tr_idx, _ in folds.split(train_rows, y_train):
        WOE_encoder = WOEEncoder(cols=WOE_features)
        # split() yields positional indices, so select rows with iloc; loc
        # would interpret them as index labels.
        WOE_encoder.fit(train_rows.iloc[tr_idx][WOE_features], y_train.iloc[tr_idx])
        fold_woe = WOE_encoder.transform(df[WOE_features]) / n_splits
        # Add by position. The fold result is labelled with the source column
        # names, so a label-aligned ``+=`` against the ``*_Encode`` columns
        # matches nothing and fills them with NaN.
        df[WOE_features_encode] += fold_woe[WOE_features].values

    df = df.drop(WOE_features, axis=1)

    # High nominal features: label encoder.
    Label_col = features_hi_nom + features_hi_ord
    for col in Label_col:
        Label_Encoder = preprocessing.LabelEncoder()
        df[col] = Label_Encoder.fit_transform(
            df[col].fillna("-1").astype(str).values)

    return df
# Feature hashing of word features
def hash_features(word_list, m):
    """Return a length-``m`` count vector of the words hashed into ``m`` buckets."""
    output = [0] * m
    for word in word_list:
        index = hash_fcn(word) % m
        output[index] += 1
    return output


# Signed feature hashing
# NOTE: this definition has the same name as the one above, so it is the one
# bound to ``hash_features`` once the file has been executed.
def hash_features(word_list, m):
    """Return a length-``m`` vector of signed counts of the hashed words.

    A second hash (``sign_hash``) decides per word whether its bucket is
    decremented (even hash) or incremented (odd hash).
    """
    output = [0] * m
    for word in word_list:
        index = hash_fcn(word) % m
        sign_bit = sign_hash(word) % 2
        if sign_bit == 0:
            output[index] -= 1
        else:
            output[index] += 1
    return output


h = FeatureHasher(n_features=m, input_type="string")
# ``transform`` was misspelled as ``trasnform``, which raised AttributeError.
f = h.transform(df["feat"])

# Target-based category encoders: each is fitted on the training features and
# target and returns the encoded training set.
enc = TargetEncoder(cols=['Name_of_col', 'Another_name'])
training_set = enc.fit_transform(X_train, y_train)

enc = LeaveOneOutEncoder(cols=['Name_of_col', 'Another_name'])
training_set = enc.fit_transform(X_train, y_train)

enc = WOEEncoder(cols=['Name_of_col', 'Another_name'])
training_set = enc.fit_transform(X_train, y_train)
# Example script: train a category_encoders WOEEncoder and inspect its fitted
# attributes before a custom ONNX converter is written for it.
from sklearn.preprocessing import OrdinalEncoder as SklOrdinalEncoder
from category_encoders import WOEEncoder, OrdinalEncoder
from skl2onnx import update_registered_converter, to_onnx, get_model_alias
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.common.utils import check_input_and_output_numbers
from skl2onnx.algebra.onnx_ops import OnnxCast
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator
from skl2onnx.sklapi import WOETransformer
# Not referenced by name below; presumably imported for its side effects.
import skl2onnx.sklapi.register  # noqa

data = load_iris()
X, y = data.data, data.target
# Cast the features to integers and keep only the first two columns.
X = X.astype(np.int64)[:, :2]
# Binary target: 1 where the original label is 2, else 0.
y = (y == 2).astype(np.int64)

# WOE-encode the first column only, then show the first five encoded rows.
woe = WOEEncoder(cols=[0]).fit(X, y)
print(woe.transform(X[:5]))

########################################
# Let's look into the trained parameters of the model.
# It appears that WOEEncoder uses an OrdinalEncoder
# but not the one from scikit-learn. We need to add a
# converter for this model tool.

print("encoder", type(woe.ordinal_encoder), woe.ordinal_encoder)
print("mapping", woe.mapping)
print("encoder.mapping", woe.ordinal_encoder.mapping)
print("encoder.cols", woe.ordinal_encoder.cols)

######################################
# Custom converter for OrdinalEncoder
def add_woe_encoding(self):
    """Append a default-configured ``WOEEncoder`` step named "WOEncoder" to ``self.pipeline``."""
    woe_step = ("WOEncoder", WOEEncoder())
    self.pipeline.append(woe_step)