class catboost_enc(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer wrapping CatBoostEncoder.

    Target-encodes the given columns; all other columns pass through
    unchanged by the underlying encoder.
    """

    def __init__(self, columns):
        # Names of the columns to target-encode.
        self.columns = columns

    def fit(self, df, y=None):
        """Fit a fresh CatBoostEncoder on ``df``/``y`` and return ``self``."""
        enc = CatBoostEncoder(handle_unknown='value', cols=self.columns)
        self.encoder = enc.fit(df, y)
        return self

    def transform(self, df, y=None):
        """Return a CatBoost-encoded copy of ``df`` (input left untouched)."""
        return self.encoder.transform(df.copy())
# ---- Exemple #2 (snippet separator from the code-collection scrape; vote count: 0) ----
    def _run(self):
        """CatBoost-encode the dataset, fitting only on labelled rows."""
        from category_encoders.cat_boost import CatBoostEncoder

        # Unpack upstream task inputs (num_cols is received but unused here).
        data = self.input[0]
        num_cols = self.input[1]
        cat_cols = self.input[2]

        # Rows flagged isFraud == -1 are unlabelled; fit on the rest only.
        labelled = data[data['isFraud'] != -1]
        features = labelled.drop('isFraud', axis=1)
        target = labelled['isFraud'].astype(np.uint8)
        del labelled

        fitted = CatBoostEncoder(verbose=1, cols=cat_cols)
        fitted.fit(features, target)

        # Transform the full dataset, then re-attach the label column.
        encoded: pd.DataFrame = data.drop('isFraud', axis=1)
        encoded = fitted.transform(encoded)
        encoded = encoded.join(data['isFraud'])
        self.output = encoded
            # NOTE(review): fragment — the enclosing def and the preceding
            # `with open("y_train.pkl", "rb") as f:` line are outside this
            # chunk of the file.
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        # Previously persisted label encoder (maps class ids -> names).
        with open("label_encoder.pkl", "rb") as f:
            encoder = pickle.load(f)

        cols_cat = [
            "ZONA_METROPOLITANA", "CODIGO_POSTAL", "ruido", "CALIDAD_AIRE"
        ]
        # Every non-categorical column is cast to float on both splits.
        cols_float = [col for col in X_train.columns if col not in cols_cat]
        X_train[cols_float] = X_train[cols_float].astype("float")
        X_test[cols_float] = X_test[cols_float].astype("float")

    # Target-encode the categoricals; fit on train only to avoid leakage.
    cat_encoder = CatBoostEncoder(cols=cols_cat)
    X_train = cat_encoder.fit_transform(X_train, y_train)
    X_test = cat_encoder.transform(X_test)
    if "Oeste" in X_train.columns:
        X_train = X_train.drop("Oeste", axis=1)
        X_test = X_test.drop("Oeste", axis=1)
    # Human-readable class names from the persisted label encoder.
    labs_names = [c for c in encoder.classes_]
    # Pick the model + search space from the CLI-selected registry.
    if not args.stacking:
        model = models_dic[args.model]["model"]
        params = models_dic[args.model]["parameters"]
    else:
        model = stacking_models[args.model]["model"]
        params = stacking_models[args.model]["parameters"]

    counter = dict(Counter(y_train))
    if not args.stacking:
        # Undersample class 5 down to 11% of its training frequency.
        samp_strategy = {5: int(0.11 * counter[5])}
        model.set_params(**{"model__sampling_strategy": samp_strategy})
def cat_encode(X, X_test, cols, y):
    """Target-encode ``cols`` with CatBoost, fitting on (X, y) only.

    Returns the encoded ``(X, X_test)`` pair; the test split is transformed
    with the encoder fitted on the training data to avoid target leakage.
    """
    encoder = CatBoostEncoder(cols=cols)
    encoded_train = encoder.fit_transform(X, y)
    encoded_test = encoder.transform(X_test)
    return (encoded_train, encoded_test)
def main():
    """End-to-end training entry point.

    Workflow:
      1. Load cached train/test pickles if present; otherwise preprocess the
         raw CSV, split it (stratified 85/15), and cache the pickles.
      2. Cast non-categorical columns to float and CatBoost-encode the
         categorical ones (fit on train only).
      3. Either run a Bayesian hyper-parameter search over an LGBMClassifier
         (MODE != "INDIVIDUAL") or fit a fixed BalancedBaggingClassifier.
      4. Log metrics/params and confusion-matrix plots to MLflow and disk.

    Relies on module-level names: NAME, MODE, N_ITER, cv, f2_scorer,
    lgb_f1_score, preprocess_data, print_confusion_matrix, and the model /
    metric imports.
    """
    mlflow.start_run(run_name=NAME)

    if "X_train.pkl" not in os.listdir():
        print("procesando los datos")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)

        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        # Stratified split so class proportions survive the 85/15 split.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        # Cache the split so subsequent runs skip preprocessing entirely.
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)

        print(X_train.shape)

    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        # NOTE(review): the save path above is keyed on NAME, but this load
        # path is hard-coded to the "XGB1704" run — confirm the cross-run
        # reuse of that label encoder is intentional.
        with open("label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
        print("######### ajustando cat encoder ############")

    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")

    # Human-readable class names for the confusion-matrix plots.
    labs_names = [c for c in encoder.classes_]

    # NOTE(review): LightGBM's multiclass objective is spelled "multiclass" /
    # "softmax"; "multiclass:softmax" looks like the XGBoost alias — verify
    # this value is accepted by the installed LightGBM version.
    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass:softmax",
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        # skopt search space: tuples are (low, high[, prior]); lists are
        # categorical choices.
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }

        print(params)

        # Encode once up front (fit on train only) so the search operates on
        # numeric features.
        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }

        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

    def on_step(optim_result):
        """skopt per-iteration callback: persist intermediate CV results and
        best params; returning True stops the search early."""
        score = best_model.best_score_
        results = best_model.cv_results_
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
            print(
                f"############ Llevamos {results_df.shape[0]} pruebas #################"
            )
            print(f"los resultados del cv de momento son {results_df}")
        except Exception:
            # Best-effort progress dump; the search itself keeps running.
            print("Unable to convert cv results to pandas dataframe")
        mlflow.log_metric("best_score", score)
        with open(f"./best_{NAME}_params.pkl", "wb") as f:
            pickle.dump(best_model.best_params_, f)

        print("best score: %s" % score)
        if score >= 0.98:
            print("Interrupting!")
            return True

    print("ajustando modelo")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)

        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            # Undersample class 5 down to 11% of its training frequency.
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
        print(
            f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
        )
        print(
            f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
        )
        # BUG FIX: this line previously recomputed F2 (beta=2) while labelling
        # it F0.5; beta=0.5 matches the label and the "f05" metric below.
        print(
            f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}"
        )
        cm = confusion_matrix(y_test, preds)
        grafico_conf_matrix = print_confusion_matrix(cm,
                                                     class_names=labs_names)
        grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")

        with open(f"best_model_{NAME}.pkl", "wb") as f:
            pickle.dump(best_model, f)

    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
# ---- Exemple #6 (snippet separator from the code-collection scrape; vote count: 0) ----
    # NOTE(review): fragment — the enclosing `for i in ...` loop header and
    # the lines that first build `temp`/`nan_indexes` for X are outside this
    # chunk. The loop collapses NaN and the placeholder strings "0"/"Unknown"
    # into the single token "unknown" for each categorical column.
    temp[nan_indexes] = "unknown"
    temp[temp == "0"] = "unknown"
    temp[temp == "Unknown"] = "unknown"
    X[:, i] = temp
    # Repeat the same normalisation for the test split's column i.
    temp = X_test[:, i]
    nan_indexes = pd.isnull(X_test[:, i])
    temp[nan_indexes] = "unknown"
    temp[temp == "0"] = "unknown"
    temp[temp == "Unknown"] = "unknown"
    X_test[:, i] = temp

#Encode categorical data
print("Encoding data..")
# Fit the target encoder on the training data only, then apply to both splits.
encoder_t = CatBoostEncoder(cols=cat_item_indexes)
X = encoder_t.fit_transform(X, y)
X_test = encoder_t.transform(X_test)
X_test = X_test.astype(float)
# .iloc[:, :].values converts the DataFrame back to a plain numpy array.
X_test = X_test.iloc[:, :].values
X = X.astype(float)
X = X.iloc[:, :].values

#Scale data
print("Scaling..")
# RobustScaler centres on the median and scales by IQR, dampening outliers.
sc = RobustScaler()
X = sc.fit_transform(X)
X_test = sc.transform(X_test)

#Fit model - n_estimators, max_depth  & min_samples_split at current values will take a long time to run.
#Reducing these values will reduce the RMSE by a small margin, but testing will be a lot faster.
print("\nBeginning Gradient Boosting Regression.")
gbrReg = GradientBoostingRegressor(n_estimators=3000,