from category_encoders import CatBoostEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class catboost_enc(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        self.encoder = CatBoostEncoder(
            handle_unknown='value', cols=self.columns)  #, use_cat_names=True)
        self.encoder = self.encoder.fit(df, y)
        return self

    def transform(self, df, y=None):
        df_ = df.copy()
        return self.encoder.transform(df_)

    def catboost_encoder(self, df, configger):
        """

        :param df: the training dataset.
        :param configger: JSON string of configuration settings; the parameters are:
            verbose: int
                integer indicating verbosity of the output. 0 for none.
            cols: list
                a list of columns to encode, if None, all string columns will be encoded.
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance.
            return_df: bool
                boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
            handle_missing: str
                options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
            handle_unknown: str
                options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
            sigma: float
                adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
                sigma gives the standard deviation (spread or "width") of the normal distribution.
            a: float
                additive smoothing (it is the same variable as "m" in m-probability estimate). By default set to 1.

        :return: the transform result
        """
        X, y, encode_col = self.get_Xy(df, configger)

        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        handle_missing = set_default_vale("handle_missing", configger, "value")
        handle_unknown = set_default_vale("handle_unknown", configger, "value")
        random_state = set_default_vale("random_state", configger, None)
        sigma = set_default_vale("sigma", configger, None)
        a = set_default_vale("a", configger, 1)

        encoder = CatBoostEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant, return_df=True,
                                  handle_unknown=handle_unknown, handle_missing=handle_missing,
                                  random_state=random_state, sigma=sigma, a=a)

        res = encoder.fit_transform(X, y)

        return res
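
A minimal usage sketch for the wrapper above (the data below is illustrative; the catboost_encoder method is not exercised because it depends on the project-specific helpers get_Xy and set_default_vale):

import pandas as pd

df = pd.DataFrame({"city": ["a", "b", "a", "c"], "price": [1.0, 2.0, 1.5, 3.0]})
y = pd.Series([0, 1, 0, 1])

enc = catboost_enc(columns=["city"])
enc.fit(df, y)                   # fits the underlying CatBoostEncoder
df_encoded = enc.transform(df)   # "city" replaced by target statistics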
Example #3
    def __init__(self, n_splits, cvfold, categorical_features, encoder=None, name='catboost_encoded'):
        self.n_splits = n_splits
        self.cvfold = cvfold
        self.categorical_features = categorical_features
        self.columns = [name + '_' + c for c in categorical_features]
        if encoder is None:
            self.encoder = CatBoostEncoder(
                cols=categorical_features,
                return_df=False,
            )
        else:
            self.encoder = encoder
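
Only the constructor is shown; a sketch of how a fold-wise fit_transform for such a class might look, assuming sklearn's KFold and out-of-fold encoding (an assumption, not the original implementation):

import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder
from sklearn.model_selection import KFold

def fit_transform_oof(X: pd.DataFrame, y: pd.Series, categorical_features, n_splits=5):
    # Out-of-fold target encoding: each row is encoded by an encoder
    # fitted on the other folds, so its own target never leaks in.
    oof = np.zeros((len(X), len(categorical_features)))
    for train_idx, valid_idx in KFold(n_splits=n_splits).split(X):
        enc = CatBoostEncoder(cols=categorical_features, return_df=False)
        enc.fit(X.iloc[train_idx][categorical_features], y.iloc[train_idx])
        oof[valid_idx] = enc.transform(X.iloc[valid_idx][categorical_features])
    return oof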
Example #4
    def _run(self):
        import numpy as np
        import pandas as pd
        from category_encoders.cat_boost import CatBoostEncoder

        data = self.input[0]
        num_cols = self.input[1]
        cat_cols = self.input[2]

        train = data[data['isFraud'] != -1]

        X = train.drop('isFraud', axis=1)
        y = train['isFraud'].astype(np.uint8)

        del train

        encoder = CatBoostEncoder(verbose=1, cols=cat_cols)
        encoder.fit(X, y)

        cat_data: pd.DataFrame = data.drop('isFraud', axis=1)
        cat_data = encoder.transform(cat_data)
        cat_data = cat_data.join(data['isFraud'])
        self.output = cat_data
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name
    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    """
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    if encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
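
A short usage sketch for the dispatcher above (the encoder classes must already be importable in the caller's scope, mostly from category_encoders; FrequencyEncoder is project-specific):

cat_cols = ["city", "device"]
encoder = get_single_encoder("CatBoostEncoder", cat_cols)
X_train_enc = encoder.fit_transform(X_train, y_train)
X_test_enc = encoder.transform(X_test)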
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open("label_encoder.pkl", "rb") as f:
            encoder = pickle.load(f)

        cols_cat = [
            "ZONA_METROPOLITANA", "CODIGO_POSTAL", "ruido", "CALIDAD_AIRE"
        ]
        cols_float = [col for col in X_train.columns if col not in cols_cat]
        X_train[cols_float] = X_train[cols_float].astype("float")
        X_test[cols_float] = X_test[cols_float].astype("float")

    cat_encoder = CatBoostEncoder(cols=cols_cat)
    X_train = cat_encoder.fit_transform(X_train, y_train)
    X_test = cat_encoder.transform(X_test)
    if "Oeste" in X_train.columns:
        X_train = X_train.drop("Oeste", axis=1)
        X_test = X_test.drop("Oeste", axis=1)
    labs_names = [c for c in encoder.classes_]
    if not args.stacking:
        model = models_dic[args.model]["model"]
        params = models_dic[args.model]["parameters"]
    else:
        model = stacking_models[args.model]["model"]
        params = stacking_models[args.model]["parameters"]

    counter = dict(Counter(y_train))
    if not args.stacking:
def cat_encode(X, X_test, cols, y):
    ce = CatBoostEncoder(cols=cols)
    X = ce.fit_transform(X, y)
    X_test = ce.transform(X_test)
    return (X, X_test)
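
A usage sketch for cat_encode with illustrative data:

import pandas as pd

X = pd.DataFrame({"shop": ["a", "b", "a", "c"], "amount": [10, 20, 15, 30]})
X_holdout = pd.DataFrame({"shop": ["b", "c"], "amount": [12, 25]})
y = pd.Series([0, 1, 0, 1])

# The encoder is fitted on the training split only, then reused for the holdout
X_enc, X_holdout_enc = cat_encode(X, X_holdout, cols=["shop"], y=y)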
def main():
    mlflow.start_run(run_name=NAME)

    if "X_train.pkl" not in os.listdir():
        print("procesando los datos")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)

        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)

        print(X_train.shape)

    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open(f"label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
        print("######### ajustando cat encoder ############")

    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")

    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass:softmax",
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }

        print(params)

        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }

        pipeline = Pipeline(steps=[("clas_encoder",
                                    CatBoostEncoder(
                                        cols=cols_cat)), ("model", model)])

        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

    def on_step(optim_result):
        score = best_model.best_score_
        results = best_model.cv_results_
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
            print(
                f"############ {results_df.shape[0]} trials so far #################"
            )
            print(f"current cv results: {results_df}")
        except Exception:
            print("Unable to convert cv results to a pandas DataFrame")
        mlflow.log_metric("best_score", score)
        with open(f"./best_{NAME}_params.pkl", "wb") as f:
            pickle.dump(best_model.best_params_, f)

        print("best score: %s" % score)
        if score >= 0.98:
            print("Interrupting!")
            return True

    print("ajustando modelo")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)

        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
        print(
            f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
        )
        print(
            f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
        )
        print(
            f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}"
        )
        cm = confusion_matrix(y_test, preds)
        grafico_conf_matrix = print_confusion_matrix(cm,
                                                     class_names=labs_names)
        grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")

        with open(f"best_model_{NAME}.pkl", "wb") as f:
            pickle.dump(best_model, f)

    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
        number_of_new_test = len(set(X_test[col]) - train_values)
        fraction_of_new_test = np.mean(X_test[col].apply(lambda v: v not in train_values))

        cc_info[col] = {
            "num_uniq_train": X_train[col].nunique(), "num_uniq_test": X_test[col].nunique(),
            "number_of_new_test": number_of_new_test, "fraction_of_new_test": fraction_of_new_test
        }
    return cc_info


if __name__ == "__main__":
    print("*****************")
    df = pd.DataFrame({})
    df["cat_col"] = [1, 2, 3, 1, 2, 3, 1, 1, 1]
    df["target"] = [0, 1, 0, 1, 0, 1, 0, 1, 0]

    # plain CatBoostEncoder
    temp = df.copy()
    enc = CatBoostEncoder(cols=["cat_col"])
    print(enc.fit_transform(temp, temp["target"]))

    # CatBoostEncoder via the MultipleEncoder wrapper
    temp = df.copy()
    enc = MultipleEncoder(cols=["cat_col"], encoders_names_tuple=("CatBoostEncoder",))
    print(enc.fit_transform(temp, temp["target"]))

    # CatBoostEncoder wrapped in DoubleValidationEncoderNumerical
    temp = df.copy()
    enc = DoubleValidationEncoderNumerical(cols=["cat_col"], encoders_names_tuple=("CatBoostEncoder",))
    print(enc.fit_transform(temp, temp["target"]))
Example #11
import pickle

import pandas as pd
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def preProcess(path='train.csv',
               df_=None,
               train=True,
               save=False,
               save_path=None,
               pipe_path='pipe.pkl'):

    # Read data: either from path or from a supplied DataFrame
    if df_ is None:
        if path is None:
            raise ValueError('Must define path or DataFrame')
        df = pd.read_csv(path, index_col=0)
    else:
        df = df_.copy()

    # NU_NOTA_LC must always be present; training rows also need NU_NOTA_CN and NU_NOTA_CH
    df.dropna(subset=['NU_NOTA_LC'], inplace=True)

    if train:
        df.dropna(subset=['NU_NOTA_CN', 'NU_NOTA_CH'], inplace=True)

    # Create target data if train
    if train:
        try:
            target = df['NU_NOTA_MT']
        except KeyError:
            raise ValueError('Column NU_NOTA_MT missing from data')
    else:
        target = None

    # Columns to select
    cols_select = [
        'SG_UF_RESIDENCIA',
        'NU_IDADE',
        'TP_SEXO',
        'TP_COR_RACA',
        'TP_NACIONALIDADE',
        'TP_ST_CONCLUSAO',
        'TP_ANO_CONCLUIU',
        'TP_ENSINO',
        'TP_DEPENDENCIA_ADM_ESC',
        'CO_PROVA_CH',  #'CO_PROVA_LC', 
        'CO_PROVA_MT',
        'NU_NOTA_CN',
        'NU_NOTA_CH',
        'NU_NOTA_LC',
        'TP_LINGUA',
        'TP_STATUS_REDACAO',
        'NU_NOTA_COMP1',
        'NU_NOTA_COMP2',
        'NU_NOTA_COMP3',
        'NU_NOTA_COMP4',
        'NU_NOTA_COMP5',
        'NU_NOTA_REDACAO',
        'Q001',
        'Q002',
        'Q006',
        'Q024',
        'Q025',
        'Q026',
        'Q027',
        'Q047'
    ]

    # Select columns
    try:
        df = df[cols_select]
    except KeyError:
        raise ValueError('Column missing from data')

    float_cols = [
        'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1',
        'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5',
        'NU_NOTA_REDACAO', 'NU_IDADE', 'TP_ANO_CONCLUIU'
    ]

    df[float_cols] = df[float_cols].astype('float64')

    # Pipeline for float features: median imputation, then scaling
    pipe_float = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ])

    # Create Pipeline Categorical Features
    cat_cols = [
        'SG_UF_RESIDENCIA',
        'TP_SEXO',
        'CO_PROVA_CH',  #'CO_PROVA_LC',
        'CO_PROVA_MT',
        'Q001',
        'Q002',
        'Q006',
        'Q024',
        'Q025',
        'Q026',
        'Q027',
        'Q047',
        'TP_COR_RACA',
        'TP_NACIONALIDADE',
        'TP_ST_CONCLUSAO',
        'TP_ENSINO',
        'TP_DEPENDENCIA_ADM_ESC',
        'TP_STATUS_REDACAO',
        'TP_LINGUA'
    ]

    df[cat_cols] = df[cat_cols].astype('object')

    pipe_cat = Pipeline([('catboost_encoder', CatBoostEncoder())])

    # Full preprocessing pipeline
    pipe = ColumnTransformer(transformers=[
        ('pipe_float', pipe_float, float_cols),
        ('pipe_cat', pipe_cat, cat_cols),
    ])  # remainder='passthrough'

    pipe_target = Pipeline([('scaler', StandardScaler())])

    # Fit pipelines on training data, or load previously fitted ones
    if train:
        pipe.fit(df, target)
        pipe_target.fit(target.values.reshape(-1, 1))

        with open(pipe_path, 'wb') as f:
            pickle.dump([pipe, pipe_target], f)
    else:
        with open(pipe_path, 'rb') as f:
            pipe, pipe_target = pickle.load(f)

    # Transform variables
    df = pipe.transform(df)

    if train:
        target = pipe_target.transform(target.values.reshape(-1, 1))

    # Save results to pickle if requested
    if save_path is not None:
        with open(save_path, 'wb') as f:
            pickle.dump([df, target], f)

    return [df, target]
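
A usage sketch, assuming a local train.csv with the ENEM columns listed above (file names are illustrative):

# Training pass: fits and pickles the pipelines, returns transformed arrays
X_arr, target_arr = preProcess(path='train.csv', train=True, save_path='train_processed.pkl')

# Inference pass: reuses the pickled pipelines; target comes back as None
X_new, _ = preProcess(path='test.csv', train=False, pipe_path='pipe.pkl')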
Example #13
# Clean categorical columns: map NaN, "0" and "Unknown" to a single "unknown" token
for i in cat_item_indexes:
    temp = X[:, i]
    nan_indexes = pd.isnull(X[:, i])
    temp[nan_indexes] = "unknown"
    temp[temp == "0"] = "unknown"
    temp[temp == "Unknown"] = "unknown"
    X[:, i] = temp
    temp = X_test[:, i]
    nan_indexes = pd.isnull(X_test[:, i])
    temp[nan_indexes] = "unknown"
    temp[temp == "0"] = "unknown"
    temp[temp == "Unknown"] = "unknown"
    X_test[:, i] = temp

#Encode categorical data
print("Encoding data..")
encoder_t = CatBoostEncoder(cols=cat_item_indexes)
X = encoder_t.fit_transform(X, y)
X_test = encoder_t.transform(X_test)
X_test = X_test.astype(float)
X_test = X_test.iloc[:, :].values
X = X.astype(float)
X = X.iloc[:, :].values

#Scale data
print("Scaling..")
sc = RobustScaler()
X = sc.fit_transform(X)
X_test = sc.transform(X_test)

#Fit model - n_estimators, max_depth & min_samples_split at current values will take a long time to run.
#Reducing these values will increase the RMSE by a small margin, but testing will be a lot faster.
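
The snippet ends before the model fit; a minimal sketch of what it might look like, assuming a scikit-learn RandomForestRegressor (the hyperparameter names in the comment match that estimator; the values below are placeholders, not the original settings):

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=500,      # placeholder values
                              max_depth=30,
                              min_samples_split=4,
                              n_jobs=-1,
                              random_state=42)
model.fit(X, y)
predictions = model.predict(X_test)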