def encode_high_cardinality_categorical_df(dataframe, fit=False):
    """Binary-encode high-cardinality categorical features.

    Binary Encoding converts each category to its binary representation and
    uses the binary digits as new feature columns; invariant columns are
    dropped.

    --- Arguments
    dataframe: pd.DataFrame
        Dataframe with pre-processed data (i.e. renamed features),
        high-cardinality categorical features only
    fit: boolean
        When True, fit a fresh encoder on `dataframe` and persist it;
        otherwise load the previously persisted encoder

    Returns
    dataframe: pd.DataFrame
        Dataframe with encoded data
    """
    # Inference path: reuse the encoder persisted during training.
    if not fit:
        encoder = unpickle_obj('high_card_categorical_encoder')
        return encoder.transform(dataframe)

    # Training path: fit on every column present, then persist for later use.
    encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
    encoder.fit(dataframe)
    pickle_obj(encoder, 'high_card_categorical_encoder')
    return encoder.transform(dataframe)
def fit_binary(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """Create a binary encoder by fitting it on the given DataFrame.

    NaN values, and the special value given as `na_value`, will be encoded
    as an unseen value.

    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_binary` method
    """
    working_df = input_df.copy()

    # Normalize the caller's sentinel null into a real NaN so both are
    # treated identically by the encoder.
    if na_value is not None:
        for column in cols:
            working_df[column] = working_df[column].replace({na_value: np.nan})

    encoder = BinaryEncoder(cols=cols, drop_invariant=True)
    encoder = encoder.fit(working_df)

    # Force the underlying ordinal mapping to send NaN to -2 so that nulls
    # are encoded as an unseen value rather than their own category.
    for ordinal_map in encoder.base_n_encoder.ordinal_encoder.mapping:
        ordinal_map["mapping"].loc[np.nan] = -2

    result_df = encoder.transform(working_df)
    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
class DFBinaryEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-preserving wrapper around a binary categorical encoder.

    Encodes the selected columns with binary encoding, casts the resulting
    digit columns to int8, and splices them back into the original frame in
    place of the source columns.

    Parameters
    ----------
    columns : list-like or None
        Columns to encode; None means "all columns seen at fit time".
    **kwargs
        Forwarded verbatim to the underlying ``BinaryEncoder``.
    """

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = BinaryEncoder(**kwargs)
        # Set by fit(); doubles as the "is fitted" flag.
        self.transform_cols = None

    def fit(self, X, y=None):
        """Fit the underlying encoder on the selected columns of X."""
        self.columns = X.columns if self.columns is None else self.columns
        # Preserve X's column order while restricting to requested columns.
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])
        return self

    def transform(self, X):
        """Return X with encoded columns replacing the source columns."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = self.model.transform(X[self.transform_cols])
        # Binary digits fit comfortably in int8; saves memory on wide frames.
        new_X[new_X.columns] = new_X[new_X.columns].astype('int8')
        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=self.transform_cols, inplace=True)
        return new_X

    def fit_transform(self, X, y=None):
        """Fit then transform X in one step."""
        # BUGFIX: forward y to fit() instead of silently dropping it, so
        # target-aware encoders passed via kwargs still work.
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X):
        """Recover the original categorical columns from encoded digits."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        # Encoded columns are named "<source>_<digit>"; collect them all.
        # Use a boolean generator (not a list of names) so an empty source
        # column name cannot make any() falsy, and no throwaway list is built.
        columns = [
            x for x in X.columns
            if any(x.startswith(f'{src}_') for src in self.transform_cols)
        ]
        new_X = self.model.inverse_transform(X[columns])
        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=columns, inplace=True)
        return new_X
class df_BinaryEncoder(TransformerMixin):
    """Encode nominal features using binary encoding.

    Parameters
    ----------
    handle_unknown: str, default='ignore'
        Forwarded to the underlying ``BinaryEncoder``.
    ----------
    """

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit a fresh binary encoder on X; y is accepted but unused."""
        encoder = BinaryEncoder(handle_unknown=self.handle_unknown)
        encoder.fit(X)
        self.enc = encoder
        return self

    def transform(self, X):
        """Return the binary-encoded version of X."""
        return self.enc.transform(X)
class BinaryEncoder():
    """Maps each categorical value to several columns using binary encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'binary'

    def __init__(self, cols=None):
        self.encoder = Binary(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.

        returns self.
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.

        returns encoded matrix (dataframe).
        """
        encoded = self.encoder.transform(X)
        # Flatten every feature's generated names into one column list.
        encoded.columns = [
            fname
            for feature in self.features
            for fname in feature.get_feature_names()
        ]
        return encoded

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.

        returns encoded matrix (dataframe).
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the binary encoder and underlying ordinal encoder.

        returns tuple (binary_encoder_mapping, ordinal_encoder_mapping).
        """
        def lookup(stage, key):
            # String keys are matched by column name; anything else (or an
            # unmatched string, mirroring the original fall-through) is used
            # as a positional index into the mapping list.
            if isinstance(key, str):
                for entry in stage.mapping:
                    if entry['col'] == key:
                        return entry['mapping']
            return stage.mapping[key]['mapping']

        base = self.encoder.base_n_encoder
        return lookup(base, category), lookup(base.ordinal_encoder, category)

    def encode_features_list(self, X, features):
        """Wrap each encoded input feature in a BinaryEnc primitive."""
        wrapped = []
        position = 0  # index of this feature among the encoded columns
        for feature in features:
            if feature.get_name() in self.encoder.base_n_encoder.cols:
                feature = ft.Feature([feature],
                                     primitive=BinaryEnc(self, position))
                position += 1
            wrapped.append(feature)
        return wrapped

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
def main():
    """Train a RandomForest flight-delay classifier on airline HDF5 data.

    Pipeline: read selected columns from ../../data/*.h5 files, clean and
    sort the rows chronologically, derive delay-category targets, binary-
    encode the categorical features, grid-search a RandomForest with
    time-series cross-validation, and persist the best estimator to disk.
    """
    import psutil
    # import matplotlib.pyplot as plt
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    # from sklearn.preprocessing import StandardScaler
    # from sklearn.naive_bayes import GaussianNB
    # NOTE(review): Imputer and sklearn.externals.joblib were removed in
    # modern scikit-learn (use sklearn.impute.SimpleImputer and the joblib
    # package) — this script pins an old sklearn; confirm before upgrading.
    from sklearn.preprocessing import Imputer
    from sklearn.externals import joblib
    # from sklearn import metrics
    from category_encoders import BinaryEncoder
    from datetime import datetime
    from sklearn.model_selection import TimeSeriesSplit
    import os
    import numpy as np
    import sys
    sys.path.append("../")
    import serial_preprocess_data as preprocess
    import utils

    # Use a quarter of the machine's CPUs minus two for the forest workers.
    cpu_count = int(psutil.cpu_count() / 4) - 2
    print("Trying to use {} number of cpu".format(cpu_count))
    data_dir = "../../data/"
    hdf_files = sorted([data_dir + file
                        for file in os.listdir(data_dir) if '.h5' in file])
    columns = ['Year', 'Cancelled', 'Distance', 'Diverted', 'ArrTime',
               'Dest', 'FlightNum',
               # 'DepDelay',  ## not using DepDelay
               'ActualElapsedTime', 'ArrDelay', 'DayofMonth',
               'UniqueCarrier', 'Month', 'DepTime', 'Origin', 'DayOfWeek'
               ]
    scoring = 'roc_auc'
    no_of_files = 12
    df = preprocess.readFilesToDf("h5", file_list=hdf_files[:no_of_files],
                                  cols=columns)
    print("Size of file read in is {0:.2f} GB".format(
        utils.getFileSizeInGB(hdf_files[:no_of_files])))
    print("Reading in {0} selected columns only".format(len(columns)))
    print("Columns are:", columns)
    print("Memory usage of the data frame is {0:.2f} GB".format(
        np.sum(df.memory_usage()) / 1e9))

    # Preprocess data: check the percentage of NaNs, then keep only the
    # row indices that survive minimal cleaning.
    _ = preprocess.find_cardinality_of_categorical_variables(df)
    ix = preprocess.clean_data_minimally(df)  # apply cleaning of the data
    # NOTE(review): reindex() with no arguments is a no-op copy here —
    # this probably intended reset_index(); confirm before changing.
    df = df.iloc[ix].reindex()
    # Chronological ordering is required for TimeSeriesSplit below.
    df = df.sort_values(by=['DayofMonth', 'Month', 'Year', 'DepTime'])
    feature_cols = list(df.columns)
    feature_cols.remove('ArrDelay')
    feature_cols.remove('Cancelled')
    # Derive a multi-class and a binary delay target from ArrDelay.
    df['delayCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_multiple_categories)
    df['delayBinaryCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_two_categories)
    X = df[feature_cols]
    y = df['delayBinaryCat']
    # Binary-encode all categorical feature columns.
    encoder = BinaryEncoder()
    encoder.fit(X)
    transformed_X = encoder.transform(X)
    print("Transformed columns are ", transformed_X.columns)
    df_gpby = df.groupby('delayCat')
    delay_percentage_breakdown = df_gpby.ArrDelay.count() / df.shape[0] * 100
    delay_percentage_breakdown.index = ['very early', 'early', 'on time',
                                        'late', 'very late'
                                        ]
    print("Percentage breakdown of different categories " +
          "of the target variable is: \n", delay_percentage_breakdown)
    # The breakdown of delay is pretty balanced, although a careful study
    # would also look at the correlation with the other features.
    tscv = TimeSeriesSplit()
    # cv_ixes = [(train_ix, test_ix)
    #            for train_ix, test_ix in tscv.split(transformed_X)]

    # Only put grid-search steps into the pipeline.
    rf_pipeline_steps = [
        # impute missing feature values with median values
        ("imputer", Imputer(strategy="median")),
        ('rf', RandomForestClassifier(n_jobs=cpu_count, oob_score=True)),
    ]
    gridsearch_parameters = dict([
        ("rf__n_estimators", [800]),
        ("rf__max_features", [None]),  # not many featuers to subset from
    ])
    rf_pipeline = Pipeline(rf_pipeline_steps)
    est = GridSearchCV(rf_pipeline,
                       param_grid=gridsearch_parameters,
                       n_jobs=1,
                       scoring=scoring,
                       cv=tscv.split(X),  # this does 3 fold cross-validation
                       )
    print("Fitting the values")
    print("Columns in the training data are ", X.columns)
    est.fit(transformed_X.values, y.values)
    print("Saving the model")
    # NOTE(review): message is missing spaces around the metric name —
    # prints e.g. "Best scoreroc_aucis"; left untouched here.
    print("Best score" + scoring + "is", est.best_score_)
    print("Best parameters are ", est.best_params_)
    # Timestamp the artifact so successive runs don't overwrite each other.
    datetime_stamp = datetime.now().strftime(
        "%D_%X").replace("/", "_").replace(":", "_")
    joblib.dump(est.best_estimator_,
                "./RF_CV_pipeline_" + datetime_stamp + ".pkl")