class DFCatBoostEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-aware wrapper around CatBoostEncoder.

    Encodes only the configured columns and passes every other column of the
    input frame through untouched, so the output is again a full DataFrame.
    """

    def __init__(self, columns=None, **kwargs):
        # columns: subset of column labels to encode; None means "all columns".
        self.columns = columns
        self.model = CatBoostEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        """Fit the underlying CatBoostEncoder on the selected columns of X."""
        if self.columns is None:
            self.columns = X.columns
        # Keep only columns that actually exist in X, preserving X's order.
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        """Encode the fitted columns of X; untouched columns are kept."""
        return self.__transform(X)

    def __transform(self, X, y=None):
        # Guard: transform before fit is a usage error.
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        if y is None:
            encoded = self.model.transform(X[self.transform_cols])
        else:
            # With a target available, CatBoost encoding uses it (ordered
            # target statistics), which differs from a plain transform.
            encoded = self.model.fit_transform(X[self.transform_cols], y)

        passthrough = X.drop(columns=self.transform_cols)
        return pd.concat([passthrough, encoded], axis=1)

    def fit_transform(self, X, y):
        # NOTE: Result of fit_transform() is different from fit() + transform()
        return self.fit(X, y).__transform(X, y)
# Example #2
 def encode_cat_features(self, X, y, cat_features, train_mask, val_mask, test_mask):
     """Target-encode the categorical columns of X with CatBoostEncoder.

     The encoder is fit (with the target) on the training rows only, then
     applied to the validation and test rows, so no target information from
     val/test leaks into the encoding.

     Returns a new DataFrame (same columns as X) with all values cast to float;
     X and y themselves are not modified (copies are taken).
     """
     from category_encoders import CatBoostEncoder
     enc = CatBoostEncoder()
     # Work on numpy copies so the caller's DataFrames stay untouched.
     A = X.to_numpy(copy=True)
     b = y.to_numpy(copy=True)
     # Fit on train rows/categorical columns and write the encoded values back in place.
     A[np.ix_(train_mask, cat_features)] = enc.fit_transform(A[np.ix_(train_mask, cat_features)], b[train_mask])
     # NOTE(review): `val_mask + test_mask` assumes the masks are index *lists*
     # (list concatenation) — TODO confirm; with boolean numpy arrays `+` would
     # be elementwise addition and select the wrong rows.
     A[np.ix_(val_mask + test_mask, cat_features)] = enc.transform(A[np.ix_(val_mask + test_mask, cat_features)])
     A = A.astype(float)
     return pd.DataFrame(A, columns=X.columns)
# Example #3
def reg_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        ffill on NaN from training data,
        Replaces NaN in test data with ffill,
        cat-encodes non-numeric fields,
        scales values,
        80/20 splits data to help verify model,
        uses LightGBM
    """

    # print("throwing away rows to speed up model")
    # speed up testing by throwing away some data
    # clean_labelled = labelled_data.sample(frac=0.2)
    clean_labelled = labelled_data.copy()
    clean_unlabelled = unlabelled_data.copy()

    work_col = "Work Experience in Current Job [years]"
    add_income_col = "Yearly Income in addition to Salary (e.g. Rental Income)"

    print("cleaning data...")
    # get rid of weird value: the column mixes numbers and junk strings,
    # so coerce anything unparseable to NaN.
    clean_labelled.loc[:, work_col] = pandas.to_numeric(
        labelled_data[work_col], errors="coerce")
    clean_unlabelled.loc[:, work_col] = pandas.to_numeric(
        unlabelled_data[work_col], errors="coerce")
    print("mixed type issue fixed..")

    def _parse_eur(series):
        """Strip the ' EUR' suffix from each value and coerce to float."""
        # BUG FIX: `np.float` was deprecated in NumPy 1.20 and removed in
        # 1.24 (AttributeError); the builtin `float` is the documented
        # replacement and behaves identically here.
        return pandas.to_numeric(
            np.fromiter(
                (s.replace(" EUR", "") for s in series),
                dtype=float,
            ),
            errors="coerce",
        )

    # fix additional income field ("1234 EUR" strings -> floats)
    clean_labelled.loc[:, add_income_col] = _parse_eur(clean_labelled[add_income_col])
    clean_unlabelled.loc[:, add_income_col] = _parse_eur(clean_unlabelled[add_income_col])

    # dropping useless columns
    drop_columns(clean_unlabelled)
    drop_columns(clean_labelled)

    # removing NaN values
    clean_labelled.fillna(method="ffill", inplace=True)
    clean_unlabelled = clean_unlabelled[all_columns]
    clean_unlabelled.fillna(method="ffill", inplace=True)

    # input data for final predictions
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split, and separating targets
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("encoding categorical data...")
    # categorical encoding: fit on train only so no target leakage into
    # the held-out and unknown sets.
    cat = CatBoostEncoder()
    train_data = cat.fit_transform(train_data, train_target)
    test_data = cat.transform(test_data)
    unknown_data = cat.transform(unknown_data)

    # separate additional income: the model predicts income *excluding* it,
    # and it is added back after prediction.
    train_add_income = train_data[add_income_col].values
    test_add_income = test_data[add_income_col].values
    unknown_add_income = unknown_data[add_income_col].values

    train_data = train_data[no_income_columns]
    test_data = test_data[no_income_columns]
    unknown_data = unknown_data[no_income_columns]

    train_target = train_target[
        "Total Yearly Income [EUR]"].values - train_add_income
    test_target = test_target["Total Yearly Income [EUR]"].values

    print("scaling values...")
    # scaling values (fit on train only, reuse for test/unknown)
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("fitting model...")
    # fit model
    reg = LGBMRegressor()
    # reg = TransformedTargetRegressor(
    #     regressor=mod,
    #     transformer=scaler
    # )
    reg.fit(train_data, train_target)

    print("predicting test data...")
    # BUG FIX: LightGBM's predict keyword is `num_iteration` (singular);
    # `num_iterations=15000` is not a valid predict argument, and the model
    # only has its default number of trees anyway — predict with all of them.
    test_result = reg.predict(test_data)
    # add additional income back on top of the model's prediction
    test_result = test_result + test_add_income

    print("analysing test results...")
    # validate test
    error = mean_absolute_error(test_target, test_result)
    score = explained_variance_score(test_target, test_result)
    print("Mean absolute error of test data: ", error)
    print("Score: ", score)

    print("predicting unknown data...")
    # predict and format
    values = reg.predict(unknown_data)
    values = values + unknown_add_income

    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Total Yearly Income [EUR]": values
    })
    print("Finished.")
    return results
    def _ml_data_prep(self):
        """Prepares datasets for ML

        This does one hot encoding, cat boost encoding, and train test
        split (if necessary).
        """

        df_post = copy.deepcopy(self.df_post)
        train_prior = copy.deepcopy(self.df_prior)

        # create test data if not provided
        if self.test_data is None:

            # BUG FIX: logger.info was called with several positional args but
            # no % placeholders in the message, so logging's internal
            # `msg % args` failed and the split sizes were never rendered.
            # Use lazy %-style arguments as the logging docs recommend.
            logger.info(
                "No test data was provided. Test data will be created with "
                "a %s-%s shuffle split from the post data set.",
                self.train_size * 100,
                (1 - self.train_size) * 100,
            )

            df_post = shuffle(df_post)
            n_split = int(len(df_post) * self.train_size)

            # .copy() so the 'source' column assignments below write to
            # independent frames instead of views (SettingWithCopyWarning).
            train_post = df_post.iloc[:n_split].copy()
            test = df_post.iloc[n_split:].copy()

        else:
            test = copy.deepcopy(self.test_data)
            train_post = df_post

        # determine columns for OHE & CatBoost (never encode the target)
        OHE_columns = [col for col in self.OHE_columns if
                       col != self.target_column]
        high_cardinality_columns = [col for col in self.high_cardinality_columns
                                    if col != self.target_column]

        if len(OHE_columns) > 0:
            # BUG FIX: pass the value as a lazy %s argument (see above).
            logger.info("One hot encoded columns: %s", OHE_columns)
        if len(high_cardinality_columns) > 0:
            logger.info("Cat boost encoded columns: %s", high_cardinality_columns)

        # concat and then OHE to ensure columns match across the three frames
        train_prior['source'] = "Train Prior"
        test['source'] = "Test"
        train_post['source'] = "Train Post"

        df = pd.concat([train_prior, test, train_post])
        df = pd.get_dummies(data=df, columns=OHE_columns)

        train_prior = df[df.source == 'Train Prior'].drop('source', axis=1)
        test = df[df.source == 'Test'].drop('source', axis=1)
        train_post = df[df.source == 'Train Post'].drop('source', axis=1)

        # CatBoostEncoder for high cardinality columns
        test_prior = copy.deepcopy(test)
        test_post = copy.deepcopy(test)

        tf_prior = CatBoostEncoder(cols=high_cardinality_columns,
                                   random_state=self.random_state)
        tf_post = CatBoostEncoder(cols=high_cardinality_columns,
                                  random_state=self.random_state)

        train_prior[high_cardinality_columns] = (
            tf_prior.fit_transform(train_prior[high_cardinality_columns],
                                   train_prior[self.target_column])
        )
        # NOTE(review): passing y to transform() on *test* data lets the
        # encoder use the test targets (potential leakage) — confirm this is
        # intentional; the usual pattern is transform(X) without y.
        test_prior[high_cardinality_columns] = (
            tf_prior.transform(test_prior[high_cardinality_columns],
                               test_prior[self.target_column])
        )
        train_post[high_cardinality_columns] = (
            tf_post.fit_transform(train_post[high_cardinality_columns],
                                  train_post[self.target_column])
        )
        test_post[high_cardinality_columns] = (
            tf_post.transform(test_post[high_cardinality_columns],
                              test_post[self.target_column])
        )

        # final feature/target splits, everything as float
        X_train_prior = train_prior.drop(self.target_column, axis=1).astype(float)
        y_train_prior = train_prior[self.target_column].astype(float)
        X_test_prior = test_prior.drop(self.target_column, axis=1).astype(float)
        y_test = test[self.target_column].astype(float)

        X_train_post = train_post.drop(self.target_column, axis=1).astype(float)
        y_train_post = train_post[self.target_column].astype(float)
        X_test_post = test_post.drop(self.target_column, axis=1).astype(float)

        self.X_train_prior = X_train_prior
        self.y_train_prior = y_train_prior
        self.X_test_prior = X_test_prior
        self.y_test = y_test
        self.X_train_post = X_train_post
        self.y_train_post = y_train_post
        self.X_test_post = X_test_post