class catboost_enc(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that CatBoost-encodes selected columns.

    Unknown categories seen at transform time map to the target mean
    (``handle_unknown='value'``).
    """

    def __init__(self, columns):
        # Names of the columns to target-encode.
        self.columns = columns

    def fit(self, df, y=None):
        # A fresh encoder is created on every fit so repeated fits do not
        # accumulate state from earlier calls.
        self.encoder = CatBoostEncoder(handle_unknown='value',
                                       cols=self.columns)
        self.encoder = self.encoder.fit(df, y)
        return self

    def transform(self, df, y=None):
        # Encode a copy so the caller's frame is left untouched.
        frame = df.copy()
        return self.encoder.transform(frame)
def catboost_encoder(self, df, configger):
    """Fit a CatBoostEncoder on the train dataset and return the transform.

    :param df: the train dataset.
    :param configger: JSON string of settings; recognised keys are
        ``drop_invariant`` (drop zero-variance columns, default False),
        ``handle_missing`` / ``handle_unknown`` ('error', 'return_nan' or
        'value'; 'value' returns the target mean, both default 'value'),
        ``random_state``, ``sigma`` (std-dev of Gaussian noise added to the
        training data to reduce overfitting; test data is untouched) and
        ``a`` (additive smoothing, same as "m" in m-probability estimate,
        default 1).
    :return: the transformed result.
    """
    X, y, encode_col = self.get_Xy(df, configger)

    # Pull the optional encoder settings out of the configger, falling back
    # to the CatBoostEncoder defaults.
    options = {
        "drop_invariant": set_default_vale("drop_invariant", configger,
                                           False, is_bool=True),
        "handle_missing": set_default_vale("handle_missing", configger,
                                           "value"),
        "handle_unknown": set_default_vale("handle_unknown", configger,
                                           "value"),
        "random_state": set_default_vale("random_state", configger, None),
        "sigma": set_default_vale("sigma", configger, None),
        "a": set_default_vale("a", configger, 1),
    }

    encoder = CatBoostEncoder(verbose=1, cols=encode_col, return_df=True,
                              **options)
    return encoder.fit_transform(X, y)
def __init__(self, n_splits, cvfold, categorical_features, encoder=None, name='catboost_encoded'):
    """Store CV settings and build the default CatBoost encoder.

    :param n_splits: number of CV splits.
    :param cvfold: fold assignment used for out-of-fold encoding.
    :param categorical_features: columns to encode.
    :param encoder: optional pre-built encoder; a CatBoostEncoder returning
        numpy arrays is used when omitted.
    :param name: prefix for the generated output column names.
    """
    self.n_splits = n_splits
    self.cvfold = cvfold
    self.categorical_features = categorical_features
    # Output column names, e.g. 'catboost_encoded_<feature>'.
    self.columns = ['{}_{}'.format(name, c) for c in categorical_features]
    if encoder is not None:
        self.encoder = encoder
    else:
        self.encoder = CatBoostEncoder(cols=categorical_features,
                                       return_df=False)
def _run(self):
    """CatBoost-encode the categorical columns of the input frame.

    Fits the encoder on the labelled rows only (``isFraud != -1`` — the
    unlabelled/test rows carry -1), then transforms every row and
    re-attaches the label column as ``self.output``.
    """
    from category_encoders.cat_boost import CatBoostEncoder

    data = self.input[0]
    num_cols = self.input[1]
    cat_cols = self.input[2]

    # Labelled subset used for fitting the target encoder.
    labelled = data[data['isFraud'] != -1]
    features = labelled.drop('isFraud', axis=1)
    target = labelled['isFraud'].astype(np.uint8)
    del labelled

    encoder = CatBoostEncoder(verbose=1, cols=cat_cols)
    encoder.fit(features, target)

    # Transform the full frame (train + test rows) without the label.
    encoded: pd.DataFrame = data.drop('isFraud', axis=1)
    encoded = encoder.transform(encoded)
    self.output = encoded.join(data['isFraud'])
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name

    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: if ``encoder_name`` is not a known encoder
    """
    # BUGFIX: `encoder` was never initialised, so an unknown name made the
    # `if encoder is None` check raise NameError instead of the intended
    # NotImplementedError. Also dropped a duplicate MEstimateEncoder branch
    # and use elif so at most one branch is evaluated.
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
def get_single_encoder(encoder_name: str, cat_cols: list):
    """Return a categorical encoder instance selected by name.

    :param encoder_name: name of the desired encoder class.
    :param cat_cols: categorical columns the encoder should operate on.
    :return: the encoder instance.
    :raises NotImplementedError: if ``encoder_name`` is unknown.
    """
    # BUGFIX: `encoder` was left unbound for unknown names, so the final
    # `return encoder` raised NameError. Initialise it, use elif so only one
    # branch runs, drop the duplicate MEstimateEncoder branch, and replace
    # the commented-out assert with a real check (consistent with the other
    # get_single_encoder variant in this file).
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
X_test = pickle.load(f) with open("y_train.pkl", "rb") as f: y_train = pickle.load(f) with open("y_test.pkl", "rb") as f: y_test = pickle.load(f) with open("label_encoder.pkl", "rb") as f: encoder = pickle.load(f) cols_cat = [ "ZONA_METROPOLITANA", "CODIGO_POSTAL", "ruido", "CALIDAD_AIRE" ] cols_float = [col for col in X_train.columns if col not in cols_cat] X_train[cols_float] = X_train[cols_float].astype("float") X_test[cols_float] = X_test[cols_float].astype("float") cat_encoder = CatBoostEncoder(cols=cols_cat) X_train = cat_encoder.fit_transform(X_train, y_train) X_test = cat_encoder.transform(X_test) if "Oeste" in X_train.columns: X_train = X_train.drop("Oeste", axis=1) X_test = X_test.drop("Oeste", axis=1) labs_names = [c for c in encoder.classes_] if not args.stacking: model = models_dic[args.model]["model"] params = models_dic[args.model]["parameters"] else: model = stacking_models[args.model]["model"] params = stacking_models[args.model]["parameters"] counter = dict(Counter(y_train)) if not args.stacking:
def cat_encode(X, X_test, cols, y):
    """CatBoost-encode ``cols``: fit on the train frame, apply to both.

    :param X: training features.
    :param X_test: test features (transformed with the encoder fitted on X).
    :param cols: categorical columns to encode.
    :param y: training target used to fit the encoder.
    :return: tuple of (encoded train frame, encoded test frame).
    """
    encoder = CatBoostEncoder(cols=cols)
    encoded_train = encoder.fit_transform(X, y)
    encoded_test = encoder.transform(X_test)
    return (encoded_train, encoded_test)
def main():
    """Train and evaluate the classifier, logging everything to MLflow.

    First run preprocesses the raw csv and caches the train/test split as
    pickles; later runs reload the cached pickles. Depending on ``MODE``,
    either a BayesSearchCV over LightGBM or a single BalancedBagging
    ensemble is fitted, then metrics and confusion matrices are logged.
    """
    mlflow.start_run(run_name=NAME)
    # First run: preprocess from csv and cache the split; else reload cache.
    if "X_train.pkl" not in os.listdir():
        print("procesando los datos")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)
        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        # Stratified hold-out split (15% test).
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)
        print(X_train.shape)
    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        # NOTE(review): hard-coded "XGB1704" here while the save path above
        # uses f"label_encoder_{NAME}.pkl" — confirm these stay in sync.
        with open(f"label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
    print("######### ajustando cat encoder ############")
    # Categorical columns get CatBoost target encoding; the rest are floats.
    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")
    # Human-readable class labels for the confusion-matrix plots.
    labs_names = [c for c in encoder.classes_]
    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass:softmax",
        n_jobs=-1,
        random_state=100,
        silent=True,
    )
    if MODE != "INDIVIDUAL":
        # Bayesian-optimisation search space for LightGBM.
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }
        print(params)
        # Encode once up front; the search below runs on the encoded data.
        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }
        # NOTE(review): `pipeline` is built but never used below — the search
        # runs on the bare model with pre-encoded data; confirm intent.
        pipeline = Pipeline(steps=[("clas_encoder", CatBoostEncoder(
            cols=cols_cat)), ("model", model)])
        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

        def on_step(optim_result):
            # Callback run after each optimisation step: persist progress,
            # log the running best score, and stop early when good enough.
            score = best_model.best_score_
            results = best_model.cv_results_
            try:
                results_df = pd.DataFrame(results)
                results_df.to_csv(f"results_{NAME}.csv",
                                  header=True,
                                  index=False)
                print(
                    f"############ Llevamos {results_df.shape[0]} pruebas #################"
                )
                print(f"los resultados del cv de momento son {results_df}")
            except:
                # NOTE(review): bare except silently swallows all errors.
                print("Unable to convert cv results to pandas dataframe")
            mlflow.log_metric("best_score", score)
            with open(f"./best_{NAME}_params.pkl", "wb") as f:
                pickle.dump(best_model.best_params_, f)
            print("best score: %s" % score)
            if score >= 0.98:
                # Returning True tells BayesSearchCV to stop the search.
                print("Interrupting!")
                return True

    print("ajustando modelo")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)
        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        # Bagged histogram-GBM ensemble; class 5 is undersampled to 11%.
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
    print(
        f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
    )
    print(
        f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
    )
    # NOTE(review): label says F05 but beta=2 is used — likely copy/paste bug
    # (the mlflow metric below correctly uses beta=0.5).
    print(
        f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
    )
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")
    with open(f"best_model_{NAME}.pkl", "wb") as f:
        pickle.dump(best_model, f)
    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        # Log the winning hyper-parameters from the Bayesian search.
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
number_of_new_test = len(set(X_test[col]) - train_values) fraction_of_new_test = np.mean(X_test[col].apply(lambda v: v not in train_values)) cc_info[col] = { "num_uniq_train": X_train[col].nunique(), "num_uniq_test": X_test[col].nunique(), "number_of_new_test": number_of_new_test, "fraction_of_new_test": fraction_of_new_test } return cc_info if __name__ == "__main__": print("*****************") df = pd.DataFrame({}) df["cat_col"] = [1, 2, 3, 1, 2, 3, 1, 1, 1] df["target"] = [0, 1, 0, 1, 0, 1, 0, 1, 0] # temp = df.copy() enc = CatBoostEncoder(cols=["cat_col"]) print(enc.fit_transform(temp, temp["target"])) # temp = df.copy() enc = MultipleEncoder(cols=["cat_col"], encoders_names_tuple=("CatBoostEncoder",)) print(enc.fit_transform(temp, temp["target"])) # temp = df.copy() enc = DoubleValidationEncoderNumerical(cols=["cat_col"], encoders_names_tuple=("CatBoostEncoder",)) print(enc.fit_transform(temp, temp["target"]))
def preProcess(path='train.csv', df_=None, train=True, save=False, save_path=None, pipe_path='pipe.pkl'):
    """Load and preprocess ENEM exam data for the NU_NOTA_MT regression.

    Selects the relevant columns, imputes/scales the float features and
    CatBoost-encodes the categorical ones via a ColumnTransformer. In train
    mode the pipelines are fitted and pickled to ``pipe_path``; otherwise
    they are loaded from it.

    :param path: csv file to read when ``df_`` is not given.
    :param df_: optional pre-loaded DataFrame (takes precedence over path).
    :param train: fit the pipelines and extract the target when True.
    :param save: unused (kept for backward compatibility).
    :param save_path: if given, pickle ``[df, target]`` to this path.
    :param pipe_path: where the fitted pipelines are saved/loaded.
    :return: list ``[df, target]`` (``target`` is None when not training).
    :raises ValueError: if neither path nor DataFrame is given, or if a
        required column is missing.
    """
    # --- Load data: an explicit DataFrame wins over a csv path. ---
    # BUGFIX: the original called pd.read_csv(path) BEFORE checking
    # `path is None`, so the intended ValueError was never raised.
    if df_ is not None:
        df = df_.copy()
    elif path is not None:
        df = pd.read_csv(path, index_col=0)
    else:
        raise ValueError('Must Define path or DataFrame')

    # Rows with no language score are unusable in either mode.
    df.dropna(subset=['NU_NOTA_LC'], inplace=True)
    if train:
        df.dropna(subset=['NU_NOTA_CN', 'NU_NOTA_CH'], inplace=True)

    # Target column only exists on training data.
    if train:
        try:
            target = df['NU_NOTA_MT']
        except KeyError as err:  # BUGFIX: was a bare except
            raise ValueError('Column NU_NOTA_MT missing from data') from err
    else:
        target = None

    # Columns to select.
    cols_select = [
        'SG_UF_RESIDENCIA', 'NU_IDADE', 'TP_SEXO', 'TP_COR_RACA',
        'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ENSINO',
        'TP_DEPENDENCIA_ADM_ESC', 'CO_PROVA_CH',  #'CO_PROVA_LC',
        'CO_PROVA_MT', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'TP_LINGUA',
        'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
        'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
        'Q001', 'Q002', 'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047'
    ]
    try:
        df = df[cols_select]
    except KeyError as err:  # BUGFIX: was a bare except
        raise ValueError('Column missing from data') from err

    float_cols = [
        'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1',
        'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5',
        'NU_NOTA_REDACAO', 'NU_IDADE', 'TP_ANO_CONCLUIU'
    ]
    df[float_cols] = df[float_cols].astype('float64')

    # Pipeline for float features: median imputation then standardisation.
    pipe_float = Pipeline([
        ('inputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ])

    # Pipeline for categorical features: CatBoost target encoding.
    cat_cols = [
        'SG_UF_RESIDENCIA', 'TP_SEXO', 'CO_PROVA_CH',  #'CO_PROVA_LC',
        'CO_PROVA_MT', 'Q001', 'Q002', 'Q006', 'Q024', 'Q025', 'Q026',
        'Q027', 'Q047', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO',
        'TP_ENSINO', 'TP_DEPENDENCIA_ADM_ESC', 'TP_STATUS_REDACAO',
        'TP_LINGUA'
    ]
    df[cat_cols] = df[cat_cols].astype('object')
    pipe_cat = Pipeline([('label encoder', CatBoostEncoder())])

    # Full preprocessing pipeline over both column groups.
    pipe = ColumnTransformer(
        transformers=[('pipe_float', pipe_float, float_cols),
                      ('pipe_cat', pipe_cat, cat_cols)]
        #, remainder = 'passthrough'
    )
    pipe_target = Pipeline([('scaler', StandardScaler())])

    # Fit (train) or load (inference) the pipelines.
    if train:
        pipe.fit(df, target)
        pipe_target.fit(target.values.reshape(-1, 1))
        with open(pipe_path, 'wb') as f:
            pickle.dump([pipe, pipe_target], f)
    else:
        with open(pipe_path, 'rb') as f:
            pipe, pipe_target = pickle.load(f)

    # Transform variables.
    df = pipe.transform(df)
    if train:
        target = pipe_target.transform(target.values.reshape(-1, 1))

    # Optionally persist the processed result.
    if save_path is not None:
        with open(save_path, 'wb') as f:
            pickle.dump([df, target], f)

    return [df, target]
def fit(self, df, y=None):
    """Learn CatBoost target encodings for ``self.columns``.

    Unseen categories at transform time fall back to the target mean
    (``handle_unknown='value'``). Returns self for sklearn chaining.
    """
    # Build a fresh, unfitted encoder every time fit is called.
    unfitted = CatBoostEncoder(handle_unknown='value', cols=self.columns)
    self.encoder = unfitted
    # Replace it with the fitted instance.
    self.encoder = unfitted.fit(df, y)
    return self
temp = X[:, i] nan_indexes = pd.isnull(X[:, i]) temp[nan_indexes] = "unknown" temp[temp == "0"] = "unknown" temp[temp == "Unknown"] = "unknown" X[:, i] = temp temp = X_test[:, i] nan_indexes = pd.isnull(X_test[:, i]) temp[nan_indexes] = "unknown" temp[temp == "0"] = "unknown" temp[temp == "Unknown"] = "unknown" X_test[:, i] = temp #Encode categorical data print("Encoding data..") encoder_t = CatBoostEncoder(cols=cat_item_indexes) X = encoder_t.fit_transform(X, y) X_test = encoder_t.transform(X_test) X_test = X_test.astype(float) X_test = X_test.iloc[:, :].values X = X.astype(float) X = X.iloc[:, :].values #Scale data print("Scaling..") sc = RobustScaler() X = sc.fit_transform(X) X_test = sc.transform(X_test) #Fit model - n_estimators, max_depth & min_samples_split at current values will take a long time to run. #Reducing these values will reduce the RMSE by a small margin, but testing will be a lot faster.