from category_encoders import BinaryEncoder


def encode_high_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode high-cardinality categorical features using binary encoding,
    dropping invariant features.
    In binary encoding, each category is mapped to an integer whose binary
    digits become the new feature columns.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), high card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train a new encoder or load a previously fitted one
    if fit:
        # drop_invariant removes encoded columns that turn out constant
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)

        # pickle_obj / unpickle_obj are project-level serialization helpers
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
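
A minimal usage sketch for the helper above; pickle_obj and unpickle_obj are
project-level serialization helpers not shown here, and the column names are
illustrative:

import pandas as pd

high_card_df = pd.DataFrame({'Dest': ['JFK', 'LAX', 'ORD'],
                             'Origin': ['SFO', 'JFK', 'LAX']})

# First pass (training): fit a fresh encoder and persist it
encoded = encode_high_cardinality_categorical_df(high_card_df, fit=True)

# Later passes (e.g. scoring): reuse the persisted encoder
encoded_again = encode_high_cardinality_categorical_df(high_card_df, fit=False)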
Example 2
from typing import Any, List

import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder


def fit_binary(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the binary encoder by fitting it through the given DataFrame.
    NaN values, and any special value passed as `na_value`, are encoded as a
    single 'unseen' category.
    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Value to treat as null in the DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_binary` method
    """
    df = input_df.copy()

    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    encoder = BinaryEncoder(cols=cols, drop_invariant=True)
    encoder = encoder.fit(df)
    # Force NaN in each fitted column's ordinal mapping to the sentinel -2,
    # so nulls (and na_value, replaced above) are encoded as 'unseen'
    for idx in range(len(encoder.base_n_encoder.ordinal_encoder.mapping)):
        encoder.base_n_encoder.ordinal_encoder.mapping[idx]["mapping"].loc[
            np.nan] = -2

    result_df = encoder.transform(df)

    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
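
The docstring above refers to a companion transform_binary that is not part of
this snippet. A minimal sketch of what it might look like, assuming only the
model dict layout returned by fit_binary:

def transform_binary(input_df: pd.DataFrame, model: dict) -> pd.DataFrame:
    """Hypothetical companion to fit_binary: apply the fitted encoder."""
    df = input_df.copy()

    # Reproduce the same null normalization used during fit
    if model["na_value"] is not None:
        for col in model["cols"]:
            df[col] = df[col].replace({model["na_value"]: np.nan})

    return model["encoder"].transform(df)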
Example 3
import pandas as pd
from category_encoders import BinaryEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError


class DFBinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = BinaryEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = self.model.transform(X[self.transform_cols])
        # binary digit columns are 0/1, so int8 keeps the frame compact
        new_X[new_X.columns] = new_X[new_X.columns].astype('int8')

        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=self.transform_cols, inplace=True)

        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        columns = [
            x for x in X.columns
            if any(x.startswith(f'{col}_') for col in self.transform_cols)
        ]
        new_X = self.model.inverse_transform(X[columns])

        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=columns, inplace=True)

        return new_X
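
A quick usage sketch for DFBinaryEncoder; the data and column names are
illustrative, and inverse_transform relies on category_encoders supporting
round-tripping for BinaryEncoder:

df = pd.DataFrame({'carrier': ['AA', 'UA', 'DL', 'AA'],
                   'dist': [100, 200, 300, 400]})

enc = DFBinaryEncoder(columns=['carrier'])
encoded = enc.fit_transform(df)            # 'carrier' replaced by carrier_* digit columns
restored = enc.inverse_transform(encoded)  # recovers the original 'carrier' column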
class df_BinaryEncoder(TransformerMixin):
    """
    Use for encoding nominal features.

    Parameters
    ----------
    handle_unknown : str, default='ignore'
        How the underlying BinaryEncoder treats categories not seen
        during fit.
    """
    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        self.enc = BinaryEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self
    
    def transform(self, X):
        return self.enc.transform(X)
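
A short usage sketch, assuming a category_encoders version that still accepts
handle_unknown='ignore' (newer releases renamed the accepted values):

df = pd.DataFrame({'Origin': ['SFO', 'JFK', 'LAX', 'SFO']})

enc = df_BinaryEncoder().fit(df)
encoded = enc.transform(df)  # 'Origin' replaced by its binary digit columns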
import featuretools as ft
# the wrapped encoder is category_encoders' BinaryEncoder (aliased to avoid
# clashing with the class name below); BinaryEnc is a transform primitive
# defined alongside this class and is not shown here
from category_encoders import BinaryEncoder as Binary


class BinaryEncoder:
    """Maps each categorical value to several columns using binary encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'binary'

    def __init__(self, cols=None):
        self.encoder = Binary(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.
        returns self.
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.
        returns encoded matrix (dataframe).
        """
        X_new = self.encoder.transform(X)
        feature_names = []
        for feature in self.features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        X_new.columns = feature_names

        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe).
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the binary encoder and underlying ordinal encoder.
        returns tuple (binary_encoder_mapping, ordinal_encoder_mapping).
        """
        def mapping_helper(method, category):
            # category may be a column name or a positional index
            if isinstance(category, str):
                for col_map in method.mapping:
                    if col_map['col'] == category:
                        return col_map['mapping']
            return method.mapping[category]['mapping']

        return mapping_helper(self.encoder.base_n_encoder, category), \
            mapping_helper(self.encoder.base_n_encoder.ordinal_encoder, category)

    def encode_features_list(self, X, features):
        feature_list = []
        index = 0
        for f in features:
            if f.get_name() in self.encoder.base_n_encoder.cols:
                f = ft.Feature([f], primitive=BinaryEnc(self, index))
                index += 1
            feature_list.append(f)
        return feature_list

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
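
A hedged usage sketch for the wrapper above. It assumes a Featuretools feature
matrix fm and its accompanying feature list features (e.g. as returned by
ft.dfs), with 'Origin' among the columns to encode; EntitySet construction
varies across Featuretools versions and is omitted:

enc = BinaryEncoder(cols=['Origin'])
fm_encoded = enc.fit_transform(fm, features)  # 'Origin' becomes binary digit features

# Inspect the fitted binary and ordinal mappings for one column
binary_map, ordinal_map = enc.get_mapping('Origin')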
def main():
    import psutil

    # import matplotlib.pyplot as plt
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    # from sklearn.preprocessing import StandardScaler
    # from sklearn.naive_bayes import GaussianNB
    from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing
    import joblib  # sklearn.externals.joblib is deprecated
    # from sklearn import metrics
    from category_encoders import BinaryEncoder
    from datetime import datetime

    from sklearn.model_selection import TimeSeriesSplit
    import os
    import numpy as np

    import sys
    sys.path.append("../")
    import serial_preprocess_data as preprocess
    import utils

    # keep at least one worker even on small machines
    cpu_count = max(1, int(psutil.cpu_count() / 4) - 2)
    print("Trying to use {} CPUs".format(cpu_count))
    data_dir = "../../data/"
    hdf_files = sorted([data_dir + fname for fname in os.listdir(data_dir)
                        if '.h5' in fname])

    columns = ['Year',
               'Cancelled',
               'Distance',
               'Diverted',
               'ArrTime',
               'Dest',
               'FlightNum',
               # 'DepDelay',  ## not using DepDelay
               'ActualElapsedTime',
               'ArrDelay',
               'DayofMonth',
               'UniqueCarrier',
               'Month',
               'DepTime',
               'Origin',
               'DayOfWeek'
               ]
    scoring = 'roc_auc'
    no_of_files = 12

    df = preprocess.readFilesToDf("h5", file_list=hdf_files[:no_of_files],
                                  cols=columns)

    print("Size of file read in is {0:.2f} GB".format(
          utils.getFileSizeInGB(hdf_files[:no_of_files])))
    print("Reading in {0} selected columns only".format(len(columns)))
    print("Columns are:", columns)
    print("Memory usage of the data frame is {0:.2f} GB".format(
          np.sum(df.memory_usage()) / 1e9))

    # preprocess data check the percentage of nans
    _ = preprocess.find_cardinality_of_categorical_variables(df)

    ix = preprocess.clean_data_minimally(df)
    # apply the cleaning mask and rebuild a contiguous index
    df = df.iloc[ix].reset_index(drop=True)
    df = df.sort_values(by=['DayofMonth', 'Month', 'Year', 'DepTime'])

    feature_cols = list(df.columns)
    feature_cols.remove('ArrDelay')
    feature_cols.remove('Cancelled')

    df['delayCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_multiple_categories)
    df['delayBinaryCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_two_categories)
    X = df[feature_cols]
    y = df['delayBinaryCat']

    # With cols unspecified, category_encoders' BinaryEncoder encodes every
    # object/categorical column it finds at fit time
    encoder = BinaryEncoder()
    encoder.fit(X)
    transformed_X = encoder.transform(X)

    print("Transformed columns are ", transformed_X.columns)

    df_gpby = df.groupby('delayCat')
    delay_percentage_breakdown = df_gpby.ArrDelay.count() / df.shape[0] * 100
    delay_percentage_breakdown.index = ['very early',
                                        'early',
                                        'on time',
                                        'late',
                                        'very late'
                                        ]
    print("Percentage breakdown of different categories " +
          "of the target variable is: \n",
          delay_percentage_breakdown)

    # the breakdown of delay is pretty balanced,
    # although a careful study would also look at correlations
    # with the other features

    tscv = TimeSeriesSplit(n_splits=3)
    # cv_ixes = [(train_ix, test_ix)
    #            for train_ix, test_ix in tscv.split(transformed_X)]

    # only put grid search steps into pipeline
    rf_pipeline_steps = [
        # impute missing feature values with median values
        ("imputer", Imputer(strategy="median")),
        ('rf', RandomForestClassifier(n_jobs=cpu_count, oob_score=True)),
    ]

    gridsearch_parameters = dict([
        ("rf__n_estimators", [800]),
        ("rf__max_features", [None]),  # not many featuers to subset from
    ])

    rf_pipeline = Pipeline(rf_pipeline_steps)

    est = GridSearchCV(rf_pipeline,
                       param_grid=gridsearch_parameters,
                       n_jobs=1,
                       scoring=scoring,
                       cv=tscv,  # forward-chaining 3-fold time-series CV
                       )
    print("Fitting the values")
    print("Columns in the training data are ", X.columns)
    est.fit(transformed_X.values, y.values)
    print("Saving the model")
    print("Best score" + scoring + "is", est.best_score_)
    print("Best parameters are ", est.best_params_)

    datetime_stamp = datetime.now().strftime(
        "%D_%X").replace("/", "_").replace(":", "_")
    joblib.dump(est.best_estimator_,
                "./RF_CV_pipeline_" + datetime_stamp + ".pkl")