def clean_myvolts(df):
    dropped_cols = DEFAULTS['MyVoltsDroppedCols'].split(',') + \
        DEFAULTS['MyVoltsIgnoredCols'].split(',')
    new_df = df.drop(dropped_cols, axis=1)
    # new_df.dropna(inplace=True)

    # Impute numeric columns with their column mean.
    for col in DEFAULTS['MyVoltsNumberCols'].split(','):
        mean = new_df[col].mean()
        new_df[col] = new_df[col].fillna(mean)
    new_df.fillna('unknown', inplace=True)

    # Target-encode the categorical columns against the click label.
    encode_cols = DEFAULTS['MyVoltsEncodeCols'].split(',')
    cbe = CatBoostEncoder(cols=encode_cols,
                          return_df=True,
                          drop_invariant=True,
                          handle_missing='return_nan')
    cbe.fit(X=new_df, y=new_df['set_clicked'])
    new_df = cbe.transform(new_df)

    # one_hot_encode_cols = DEFAULTS['MyVoltsOneHotEncodeCols'].split(',')
    # new_df = oh_encode(new_df, one_hot_encode_cols)
    # label_encode_cols = DEFAULTS['MyVoltsLabelEncodeCols'].split(',')
    # new_df = label_encode(new_df, label_encode_cols)
    return new_df
class DFCatBoostEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around CatBoostEncoder that encodes only
    the requested columns and passes the remaining columns through."""

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = CatBoostEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        return self.__transform(X)

    def __transform(self, X, y=None):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat([
            new_X,
            self.model.transform(X[self.transform_cols]) if y is None
            else self.model.fit_transform(X[self.transform_cols], y)
        ], axis=1)
        return new_X

    def fit_transform(self, X, y):
        # NOTE: Result of fit_transform() is different from fit() + transform()
        return self.fit(X, y).__transform(X, y)
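# A minimal usage sketch (the toy data here is illustrative). It demonstrates
# the NOTE in fit_transform(): category_encoders' CatBoostEncoder applies
# ordered, per-row target statistics in fit_transform(), while fit() followed
# by transform() applies the final static per-category mapping, so the two
# calls give different values on the same training rows.
import pandas as pd

X_demo = pd.DataFrame({'color': ['red', 'red', 'blue', 'blue']})
y_demo = pd.Series([1, 0, 1, 0])

enc = DFCatBoostEncoder(columns=['color'])
print(enc.fit_transform(X_demo, y_demo)['color'].tolist())  # ordered statistics
print(enc.transform(X_demo)['color'].tolist())              # static mapping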
def encode_cat_features(self, X, y, cat_features, train_mask, val_mask, test_mask):
    from category_encoders import CatBoostEncoder
    enc = CatBoostEncoder()
    A = X.to_numpy(copy=True)
    b = y.to_numpy(copy=True)
    # Fit on the training rows only, then apply the learned mapping to the
    # validation and test rows to avoid target leakage.
    A[np.ix_(train_mask, cat_features)] = enc.fit_transform(
        A[np.ix_(train_mask, cat_features)], b[train_mask])
    A[np.ix_(val_mask + test_mask, cat_features)] = enc.transform(
        A[np.ix_(val_mask + test_mask, cat_features)])
    A = A.astype(float)
    return pd.DataFrame(A, columns=X.columns)
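# A hedged, self-contained sketch of the same fit-on-train / transform-on-holdout
# pattern the method above implements. The list concatenation `val_mask + test_mask`
# implies the masks are plain Python lists of row indices (boolean arrays would be
# added element-wise instead); the data and split here are illustrative.
import pandas as pd
from category_encoders import CatBoostEncoder

X_demo = pd.DataFrame({'city': ['ny', 'la', 'ny', 'sf', 'la', 'sf']})
y_demo = pd.Series([1, 0, 1, 1, 0, 0])
train_idx, holdout_idx = [0, 1, 2, 3], [4, 5]

enc = CatBoostEncoder()
train_enc = enc.fit_transform(X_demo.iloc[train_idx], y_demo.iloc[train_idx])
holdout_enc = enc.transform(X_demo.iloc[holdout_idx])  # no target leakage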
def models_to_compare(self) -> Dict[ModelName, Dict]:
    lightgbm_step_categorical_features_params = (
        f"{ModelName.LIGHTGBM.value}__{CATEGORICAL_FEATURE}")
    return {
        ModelName.CATBOOST: {
            TaskName.CLASSIFICATION: Pipeline([
                (ModelName.CATBOOST.value,
                 CatBoostClassifier(cat_features=self.categorical_features_indices,
                                    verbose=0))]),
            TaskName.REGRESSION: Pipeline([
                (ModelName.CATBOOST.value,
                 CatBoostRegressor(cat_features=self.categorical_features_indices,
                                   verbose=0))])
        },
        ModelName.LIGHTGBM: {
            TaskName.CLASSIFICATION: Pipeline([
                (ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                (ModelName.LIGHTGBM.value, LGBMClassifier())]),
            TaskName.REGRESSION: Pipeline([
                (ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                (ModelName.LIGHTGBM.value, LGBMRegressor())]),
            FIT_PARAMS: {
                lightgbm_step_categorical_features_params: self.categorical_features
            }
        },
        ModelName.LIGHTGBM_WITH_CATBOOST_ENCODER: {
            TaskName.CLASSIFICATION: Pipeline([
                (ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                (ModelName.LIGHTGBM.value, LGBMClassifier())]),
            TaskName.REGRESSION: Pipeline([
                (ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                (ModelName.LIGHTGBM.value, LGBMRegressor())])
        },
        ModelName.XGBOOST_WITH_CATBOOST_ENCODER: {
            TaskName.CLASSIFICATION: Pipeline([
                (ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                (ModelName.XGBOOST.value, XGBClassifier())]),
            TaskName.REGRESSION: Pipeline([
                (ModelName.CATBOOST_ENCODER.value, CatBoostEncoder()),
                (ModelName.XGBOOST.value, XGBRegressor())])
        },
        ModelName.XGBOOST: {
            TaskName.CLASSIFICATION: Pipeline([
                (ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                (ModelName.XGBOOST.value, XGBClassifier())]),
            TaskName.REGRESSION: Pipeline([
                (ModelName.ORDINAL_ENCODER.value, OrdinalEncoder()),
                (ModelName.XGBOOST.value, XGBRegressor())])
        }
    }
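# Hedged sketch of how the FIT_PARAMS entry above is consumed: scikit-learn
# pipelines route "<step>__<param>" keyword arguments in fit() to the named
# step, which is how LightGBM receives its categorical_feature list. The step
# name and data below are illustrative stand-ins for the enum values.
import pandas as pd
from sklearn.pipeline import Pipeline
from category_encoders import OrdinalEncoder
from lightgbm import LGBMClassifier

X_demo = pd.DataFrame({'city': ['ny', 'la', 'ny', 'sf'] * 8,
                       'age': list(range(32))})
y_demo = [0, 1] * 16

pipe = Pipeline([('ordinal_encoder', OrdinalEncoder()),
                 ('lightgbm', LGBMClassifier(min_child_samples=1))])
pipe.fit(X_demo, y_demo, **{'lightgbm__categorical_feature': ['city']})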
def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
    self.cbe_ = []
    cv = check_cv(self.cv)
    cbe = CatBoostEncoder(cols=X.columns.tolist(),
                          return_df=False,
                          **self.cbe_params)
    X_transformed = np.zeros_like(X, dtype=np.float64)
    # Out-of-fold encoding: fit on each training fold and encode only the
    # held-out fold, so no row is encoded with statistics from its own target.
    # cv.split() yields positional indices, hence .iloc rather than .loc.
    for train_idx, valid_idx in cv.split(X, y):
        self.cbe_.append(clone(cbe).fit(X.iloc[train_idx], y[train_idx]))
        X_transformed[valid_idx] = self.cbe_[-1].transform(X.iloc[valid_idx])
    return pd.DataFrame(X_transformed, columns=X.columns)
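# Hedged, self-contained sketch of the same out-of-fold scheme: each fold is
# encoded by an encoder fitted only on the remaining folds, so no row is
# encoded with statistics derived from its own target. Names are illustrative.
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import KFold
from category_encoders import CatBoostEncoder

X_demo = pd.DataFrame({'color': ['r', 'g', 'r', 'b', 'g', 'b', 'r', 'g']})
y_demo = np.array([1, 0, 1, 1, 0, 0, 1, 0])

encoded = np.zeros(len(X_demo))
base = CatBoostEncoder(cols=['color'], return_df=False)
for train_idx, valid_idx in KFold(n_splits=2, shuffle=True, random_state=0).split(X_demo):
    fold_enc = clone(base).fit(X_demo.iloc[train_idx], y_demo[train_idx])
    encoded[valid_idx] = fold_enc.transform(X_demo.iloc[valid_idx]).ravel()
print(encoded)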
def get_training_and_test_dfs():
    """Read the training and test CSV files, split them per organization,
    and return the cleaned MyVolts train/test frames plus the output frame.
    """
    missing_values = DEFAULTS['MissingValues'].split(',')
    df_train = pd.read_csv(DEFAULTS['TrainingFile'], na_values=missing_values)
    df_test = pd.read_csv(DEFAULTS['TestFile'], na_values=missing_values)
    output_df = pd.DataFrame(columns=['recommendation_set_id', 'set_clicked'])

    jabref_train, myvolts_train, homepage_train = split_data(df_train)
    jabref_test, myvolts_test, homepage_test = split_data(df_test)
    output_df['recommendation_set_id'] = myvolts_test['recommendation_set_id'].copy()

    dropped_cols = DEFAULTS['MyVoltsDroppedCols'].split(',') + \
        DEFAULTS['MyVoltsIgnoredCols'].split(',')
    myvolts_train = myvolts_train.drop(dropped_cols, axis=1)
    myvolts_test = myvolts_test.drop(dropped_cols, axis=1)

    # Impute numeric columns with column means (note: each split is imputed
    # with its own mean here; reusing the training mean for the test split
    # would be the leakage-safe variant).
    for col in DEFAULTS['MyVoltsNumberCols'].split(','):
        myvolts_train[col] = myvolts_train[col].fillna(myvolts_train[col].mean())
        myvolts_test[col] = myvolts_test[col].fillna(myvolts_test[col].mean())
    myvolts_train.fillna('unknown', inplace=True)
    myvolts_test.fillna('unknown', inplace=True)

    # myvolts_train['train'] = 1
    # myvolts_test['train'] = 0

    # Fit the encoder on the training split only, then transform both splits.
    encode_cols = DEFAULTS['MyVoltsEncodeCols'].split(',')
    cbe = CatBoostEncoder(cols=encode_cols,
                          return_df=True,
                          drop_invariant=True,
                          handle_missing='return_nan')
    cbe.fit(X=myvolts_train, y=myvolts_train['set_clicked'])
    myvolts_train = cbe.transform(myvolts_train)
    myvolts_test = cbe.transform(myvolts_test)

    # combined = pd.concat([myvolts_train, myvolts_test])
    # combined = oh_encode(combined, encode_cols)
    # label_encode_cols = DEFAULTS['MyVoltsLabelEncodeCols'].split(',')
    # combined = label_encode(combined, label_encode_cols)
    # myvolts_train = combined[combined['train'] == 1]
    # myvolts_test = combined[combined['train'] == 0]
    # myvolts_train = myvolts_train.drop(['train'], axis=1)
    # myvolts_test = myvolts_test.drop(['train'], axis=1)
    return myvolts_train, myvolts_test, output_df
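# Hedged illustration of the DEFAULTS mapping that clean_myvolts() and
# get_training_and_test_dfs() both assume: a flat config (e.g. parsed from an
# .ini file) whose values are comma-separated strings that get .split(',').
# The keys match the lookups above, but every concrete value below is a
# stand-in, not the project's real configuration.
DEFAULTS = {
    'TrainingFile': 'data/train.csv',
    'TestFile': 'data/test.csv',
    'MissingValues': 'NA,null,unknown',
    'MyVoltsDroppedCols': 'user_id,session_id',
    'MyVoltsIgnoredCols': 'query_detected_language',
    'MyVoltsNumberCols': 'hour_request_received',
    'MyVoltsEncodeCols': 'country_by_ip,app_version',
}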
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial,
               n_classes, random_state):
    enc_pipe = None
    if tags["handles categorical"] == False:
        large_threshold = 6
        # TODO: handle numpy arrays with categorical?
        # TODO: handle multiclass / regression
        # Split categorical columns by cardinality, for DataFrames addressed
        # by name, DataFrames addressed by position, and plain ndarrays.
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns
                                   if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns
                                   if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X.iloc[:, col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X.iloc[:, col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X[:, col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X[:, col])) <= large_threshold]

        cat_enc_types = ["target", "binary", "catboost"]
        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe",
                                   OneHotEncoder(cols=small_cardinal_cats,
                                                 drop_invariant=True))
        if len(large_cardinal_cats) > 0:
            if objective_type == "classification" and n_classes == 1:
                cat_enc_types.append("woe")
            cat_enc_type = trial.suggest_categorical(
                estimator_name + " cat_enc_type", cat_enc_types)
            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )
            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)
            else:  # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute the dataset beforehand
            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    enc_pipe = None
    if tags["handles categorical"] == False:
        large_threshold = 6
        # TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns
                               if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns
                               if X[col].nunique() <= large_threshold]
        cat_enc_types = ["binary", "catboost", "woe", "target"]
        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe",
                                   OneHotEncoder(cols=small_cardinal_cats,
                                                 drop_invariant=True))
        if len(large_cardinal_cats) > 0:
            if objective_type == "classification" and n_classes > 2:  # multiclass
                cat_enc_types = ["binary"]
            cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)
            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    drop_invariant=True,
                                    # mapping=mapping
                                    )
            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats,
                                    drop_invariant=True)
            else:  # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute the dataset beforehand
            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
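# Hedged sketch of how an Optuna objective might drive CatEncoder: each trial
# suggests one encoder family and the resulting pipeline is scored. add_to_pipe
# is not shown in these snippets, so a minimal stand-in is assumed here, and
# the data, tags, and scoring are purely illustrative.
import optuna
import pandas as pd
from sklearn.pipeline import Pipeline

def add_to_pipe(pipe, name, step):
    # minimal stand-in: append a named step, creating the pipeline if needed
    if pipe is None:
        return Pipeline([(name, step)])
    pipe.steps.append((name, step))
    return pipe

def objective(trial):
    X = pd.DataFrame({'city': ['ny', 'la', 'sf', 'dc', 'bos', 'atx', 'chi', 'sea'] * 4,
                      'plan': ['a', 'b'] * 16})
    y = pd.Series([0, 1] * 16)
    tags = {"handles categorical": False}
    pipe = CatEncoder(X, ['city', 'plan'], tags, "classification", trial,
                      n_classes=2, random_state=0)
    X_enc = pipe.fit_transform(X, y)
    return X_enc.shape[1]  # stand-in objective; a real one would cross-validate

study = optuna.create_study()
study.optimize(objective, n_trials=3)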
def make_pipeline(df):
    x = df
    col_dtypes = get_types(x)
    encoder = ColumnTransformer([
        ('categorical', CatBoostEncoder(), col_dtypes['object']),
        # could use remainder='passthrough', but this way makes column
        # ordering more obvious
        ('numeric', FunctionTransformer(),
         col_dtypes['int64'] + col_dtypes['float64'])
    ])
    # Boolean column selector for the imputer: one entry per column coming
    # out of the encoder step.
    n_encoded_cols = (len(col_dtypes['object']) + len(col_dtypes['int64'])
                      + len(col_dtypes['float64']))
    all_columns_idx = np.full(n_encoded_cols, True, dtype=bool)
    imputer = ColumnTransformer([('knn_imputer', KNNImputer(), all_columns_idx)])
    pipeline = Pipeline(steps=[
        ('encoder', encoder),
        ('imputer', imputer),
    ])
    return pipeline, col_dtypes['object'] + col_dtypes['int64'] + col_dtypes['float64']
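# Hedged usage sketch. get_types is not shown above; it is assumed to map
# dtype names to lists of column names, so a minimal stand-in is provided.
# Note the pipeline needs y at fit time because CatBoostEncoder is supervised.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from category_encoders import CatBoostEncoder

def get_types(df):
    # stand-in: {'object': [...], 'int64': [...], 'float64': [...]}
    return {name: df.select_dtypes(include=name).columns.tolist()
            for name in ('object', 'int64', 'float64')}

df_demo = pd.DataFrame({'city': ['ny', 'la', None, 'sf'],
                        'age': [20.0, np.nan, 40.0, 50.0],
                        'visits': [1, 2, 3, 4]})
y_demo = [0, 1, 0, 1]

pipe, ordered_cols = make_pipeline(df_demo)
X_out = pipe.fit_transform(df_demo, y_demo)  # y is forwarded to the encoder
print(ordered_cols, X_out.shape)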
def reg_model(labelled_data, unlabelled_data):
    """
    Parameters: training dataframe, unknown dataframe
    Returns: results dataframe (Instance, Income)

    Forward-fills NaN in the training and test data, CatBoost-encodes
    non-numeric fields, scales values, uses an 80/20 split to help verify
    the model, and fits a LightGBM regressor.
    """
    # print("throwing away rows to speed up model")
    # speed up testing by throwing away some data
    # clean_labelled = labelled_data.sample(frac=0.2)
    clean_labelled = labelled_data.copy()
    clean_unlabelled = unlabelled_data.copy()

    print("cleaning data...")
    # coerce the mixed-type work-experience column to numeric
    clean_labelled.loc[:, "Work Experience in Current Job [years]"] = pandas.to_numeric(
        labelled_data["Work Experience in Current Job [years]"], errors="coerce")
    clean_unlabelled.loc[:, "Work Experience in Current Job [years]"] = pandas.to_numeric(
        unlabelled_data["Work Experience in Current Job [years]"], errors="coerce")
    print("mixed type issue fixed..")

    # fix additional income field: strip the " EUR" suffix and coerce to float
    clean_labelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_labelled["Yearly Income in addition to Salary (e.g. Rental Income)"],
        ), dtype=float), errors="coerce")
    clean_unlabelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_unlabelled["Yearly Income in addition to Salary (e.g. Rental Income)"],
        ), dtype=float), errors="coerce")

    # dropping useless columns
    drop_columns(clean_unlabelled)
    drop_columns(clean_labelled)

    # removing NaN values
    clean_labelled.ffill(inplace=True)
    clean_unlabelled = clean_unlabelled[all_columns].copy()
    clean_unlabelled.ffill(inplace=True)

    # input data for final predictions
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split, and separating targets
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("encoding categorical data...")
    # categorical encoding: fit on the training split only
    cat = CatBoostEncoder()
    train_data = cat.fit_transform(train_data, train_target)
    test_data = cat.transform(test_data)
    unknown_data = cat.transform(unknown_data)

    # separate additional income
    train_add_income = train_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    test_add_income = test_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    unknown_add_income = unknown_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    train_data = train_data[no_income_columns]
    test_data = test_data[no_income_columns]
    unknown_data = unknown_data[no_income_columns]
    train_target = train_target[
        "Total Yearly Income [EUR]"].values - train_add_income
    test_target = test_target["Total Yearly Income [EUR]"].values

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("fitting model...")
    # fit model
    reg = LGBMRegressor()
    # reg = TransformedTargetRegressor(
    #     regressor=mod,
    #     transformer=scaler
    # )
    reg.fit(train_data, train_target)

    print("predicting test data...")
    test_result = reg.predict(test_data, num_iteration=15000)
    # add additional income back
    test_result = test_result + test_add_income

    print("analysing test results...")
    # validate
    error = mean_absolute_error(test_target, test_result)
    score = explained_variance_score(test_target, test_result)
    print("Mean absolute error of test data: ", error)
    print("Score: ", score)

    print("predicting unknown data...")
    # predict and format
    values = reg.predict(unknown_data)
    values = values + unknown_add_income
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Total Yearly Income [EUR]": values
    })
    print("Finished.")
    return results
import pickle

import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

if __name__ == '__main__':
    df = pd.read_csv('dataset/data.csv')
    y = df['target']
    df.drop('target', axis=1, inplace=True)

    # Shuffle rows before CatBoost encoding, which is order-sensitive.
    perm = np.random.permutation(len(df))
    train = df.iloc[perm].reset_index(drop=True)
    y = y.iloc[perm].reset_index(drop=True)

    # drop columns
    train.drop(['track_name', 'track_id'], axis=1, inplace=True)

    # Categorical Encoding
    cbe = CatBoostEncoder(cols=['artist', 'album'])
    cbe = cbe.fit(train, y)
    pickle.dump(cbe, open('saved_models/catboostencoder.pkl', 'wb'))
    train = cbe.transform(train)

    # Feature Scaling
    scaler = MinMaxScaler().fit(train)
    train = scaler.transform(train)

    # Model
    rf_model = pickle.load(
        open('saved_models/random_forest_grid_model.pkl', 'rb'))
    new_model = RandomForestClassifier(
        criterion=rf_model.best_params_['criterion'],
        min_impurity_decrease=rf_model.best_params_['min_impurity_decrease'],
        min_samples_leaf=rf_model.best_params_['min_samples_leaf'],
class FeatureEncoding(TransformerMixin):
    def __init__(self, cols: List = None):
        """
        Initializer.
        :param cols: list of columns to encode
        """
        self.cols = cols
        self.encoder = None

    def Ordinal_Encoding(self):
        """
        Ordinal encoding turns a categorical variable into a single column of
        ordinal values: integers from 1 to the number of categories.
        :return:
        """
        self.encoder = OrdinalEncoder(cols=self.cols)

    def OneHot_Encoding(self, handle_missing='indicator', handle_unknown='indicator'):
        """
        One-hot encoding converts a categorical feature with n_categories
        possible values into n_categories binary features, exactly one of
        which is 1 and all others 0.
        :param handle_missing: default 'value' maps missing values to all
            zeros; 'indicator' adds an extra column for missing values
        :param handle_unknown: default 'value' maps unknown values to all
            zeros; 'indicator' adds an extra column for unknown values
        :return:
        """
        self.encoder = OneHotEncoder(cols=self.cols,
                                     handle_missing=handle_missing,
                                     handle_unknown=handle_unknown)

    def Hashing_Encoding(self, n_components: int = 8):
        """
        Hashing encoding maps any number of variables to a given number of
        output variables via a hash rule. Feature hashing can cause collisions
        between features. The size and complexity of a hashing encoder do not
        grow with the number of categories in the data.
        :param n_components: number of bits used to represent the feature
        :return:
        """
        self.encoder = HashingEncoder(cols=self.cols, n_components=n_components)

    def Helmert_Encoding(self, handle_missing='indicator', handle_unknown='indicator'):
        """
        Helmert encoding: each value of the categorical feature corresponds to
        one row of a Helmert matrix.
        :param handle_missing: default 'value' maps missing values to all
            zeros; 'indicator' adds an extra column for missing values
        :param handle_unknown: default 'value' maps unknown values to all
            zeros; 'indicator' adds an extra column for unknown values
        :return:
        """
        self.encoder = HelmertEncoder(cols=self.cols,
                                      handle_unknown=handle_unknown,
                                      handle_missing=handle_missing)

    def Deviation_Encoding(self, handle_missing='indicator', handle_unknown='indicator'):
        """
        Deviation (sum) encoding. After deviation encoding, a linear model's
        coefficients reflect the difference between the mean of the dependent
        variable for a given category value and the global mean of the
        dependent variable.
        :param handle_missing: default 'value' maps missing values to all
            zeros; 'indicator' adds an extra column for missing values
        :param handle_unknown: default 'value' maps unknown values to all
            zeros; 'indicator' adds an extra column for unknown values
        :return:
        """
        self.encoder = SumEncoder(cols=self.cols,
                                  handle_missing=handle_missing,
                                  handle_unknown=handle_unknown)

    def Target_Encoding(self, min_samples_leaf: int = 1, smoothing: float = 1.0):
        """
        Target encoding encodes a categorical variable based not only on the
        feature values themselves but also on the corresponding dependent
        variable.
        For classification: each category is replaced by a blend of the
        posterior probability of the target given that category value and the
        prior probability of the target over all training data.
        For continuous targets: each category is replaced by a blend of the
        expected target value given that category value and the expected
        target value over all training data.
        The method depends heavily on the distribution of the target, but it
        greatly reduces the number of features it generates.
        :param min_samples_leaf:
        :param smoothing:
        :return:
        """
        self.encoder = TargetEncoder(cols=self.cols,
                                     min_samples_leaf=min_samples_leaf,
                                     smoothing=smoothing)

    def MEstimate_Encoding(self, m: float = 1.0, sigma: float = 0.05,
                           randomized: bool = False):
        """
        M-estimate encoding is a simplified version of target encoding.
        :param m:
        :param sigma:
        :param randomized:
        :return:
        """
        self.encoder = MEstimateEncoder(cols=self.cols, m=m, sigma=sigma,
                                        randomized=randomized)

    def JamesStein_Encoding(self, model: str = 'independent', sigma: float = 0.05,
                            randomized: bool = False):
        """
        James-Stein encoding is another target-based encoding method that also
        tries to balance the prior probability against the observed
        conditional probability via a parameter B. Unlike target encoding and
        M-estimate encoding, the James-Stein encoder balances the two
        probabilities by a variance ratio rather than by sample size.
        :param model:
        :param sigma:
        :param randomized:
        :return:
        """
        self.encoder = JamesSteinEncoder(cols=self.cols, model=model,
                                         sigma=sigma, randomized=randomized)

    def WOE_Encoding(self, regularization: float = 1.0, sigma: float = 0.05,
                     randomized: bool = False):
        """
        Weight-of-evidence (WOE) encoding.
        :param regularization:
        :param sigma:
        :param randomized:
        :return:
        """
        self.encoder = WOEEncoder(cols=self.cols,
                                  regularization=regularization,
                                  randomized=randomized, sigma=sigma)

    def LeaveOneOut_Encoding(self, sigma: float = 0.05):
        """
        Leave-one-out encoding.
        :param sigma:
        :return:
        """
        self.encoder = LeaveOneOutEncoder(cols=self.cols, sigma=sigma)

    def CatBoost_Encoding(self, sigma: float = None, a: float = 1):
        """
        CatBoost is a tree-based gradient boosting model that performs very
        well on datasets with many categorical features. Before using the
        CatBoost encoder, the training data must be randomly permuted, because
        in CatBoost the encoding is based on a notion of "time": the order of
        the observations in the dataset.
        :param sigma:
        :param a:
        :return:
        """
        self.encoder = CatBoostEncoder(cols=self.cols, a=a, sigma=sigma)

    def fit(self, X: DataFrame, y: Series = None):
        """
        Fit the selected encoder.
        :param X:
        :param y:
        :return:
        """
        if y is None:
            self.encoder.fit(X)
        else:
            self.encoder.fit(X, y)
        return self

    def transform(self, X: DataFrame):
        """
        Transform X with the fitted encoder.
        :param X:
        :return:
        """
        res = self.encoder.transform(X)
        return res
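# A short usage sketch for the class above (the data is illustrative): pick
# one encoding strategy, then fit and transform like any sklearn transformer.
import pandas as pd

X_demo = pd.DataFrame({'city': ['ny', 'la', 'ny', 'sf'],
                       'plan': ['a', 'a', 'b', 'b']})
y_demo = pd.Series([1, 0, 1, 0])

fe = FeatureEncoding(cols=['city', 'plan'])
fe.Target_Encoding(min_samples_leaf=1, smoothing=1.0)
fe.fit(X_demo, y_demo)
print(fe.transform(X_demo))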
def encoders(self):
    ohe = OneHotEncoder()
    cbe = CatBoostEncoder()
    return ohe, cbe
def _ml_data_prep(self):
    """Prepares datasets for ML.

    This does one-hot encoding, CatBoost encoding, and a train/test split
    (if necessary).
    """
    df_post = copy.deepcopy(self.df_post)
    train_prior = copy.deepcopy(self.df_prior)

    # create test data if not provided
    if self.test_data is None:
        logger.info(
            "No test data was provided. Test data will be created with "
            "a {}-{} shuffle split from the post data set.".format(
                self.train_size * 100, (1 - self.train_size) * 100)
        )
        df_post = shuffle(df_post)
        n_split = int(len(df_post) * self.train_size)
        train_post = df_post.iloc[:n_split]
        test = df_post.iloc[n_split:]
    else:
        test = copy.deepcopy(self.test_data)
        train_post = df_post

    # determine columns for OHE & CatBoost
    OHE_columns = [col for col in self.OHE_columns
                   if col != self.target_column]
    high_cardinality_columns = [col for col in self.high_cardinality_columns
                                if col != self.target_column]
    if len(OHE_columns) > 0:
        logger.info("One hot encoded columns: %s", OHE_columns)
    if len(high_cardinality_columns) > 0:
        logger.info("Cat boost encoded columns: %s", high_cardinality_columns)

    # concat and then OHE to ensure columns match
    train_prior['source'] = "Train Prior"
    test['source'] = "Test"
    train_post['source'] = "Train Post"
    df = pd.concat([train_prior, test, train_post])
    df = pd.get_dummies(data=df, columns=OHE_columns)
    train_prior = df[df.source == 'Train Prior'].drop('source', axis=1)
    test = df[df.source == 'Test'].drop('source', axis=1)
    train_post = df[df.source == 'Train Post'].drop('source', axis=1)

    # CatBoostEncoder for high cardinality columns: one encoder per period,
    # each fitted on its own training split and applied to a copy of the test set
    test_prior = copy.deepcopy(test)
    test_post = copy.deepcopy(test)
    tf_prior = CatBoostEncoder(cols=high_cardinality_columns,
                               random_state=self.random_state)
    tf_post = CatBoostEncoder(cols=high_cardinality_columns,
                              random_state=self.random_state)
    train_prior[high_cardinality_columns] = (
        tf_prior.fit_transform(train_prior[high_cardinality_columns],
                               train_prior[self.target_column])
    )
    test_prior[high_cardinality_columns] = (
        tf_prior.transform(test_prior[high_cardinality_columns],
                           test_prior[self.target_column])
    )
    train_post[high_cardinality_columns] = (
        tf_post.fit_transform(train_post[high_cardinality_columns],
                              train_post[self.target_column])
    )
    test_post[high_cardinality_columns] = (
        tf_post.transform(test_post[high_cardinality_columns],
                          test_post[self.target_column])
    )

    X_train_prior = train_prior.drop(self.target_column, axis=1).astype(float)
    y_train_prior = train_prior[self.target_column].astype(float)
    X_test_prior = test_prior.drop(self.target_column, axis=1).astype(float)
    y_test = test[self.target_column].astype(float)
    X_train_post = train_post.drop(self.target_column, axis=1).astype(float)
    y_train_post = train_post[self.target_column].astype(float)
    X_test_post = test_post.drop(self.target_column, axis=1).astype(float)

    self.X_train_prior = X_train_prior
    self.y_train_prior = y_train_prior
    self.X_test_prior = X_test_prior
    self.y_test = y_test
    self.X_train_post = X_train_post
    self.y_train_post = y_train_post
    self.X_test_post = X_test_post