def encode_high_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode high cardinality categorical features using Binary Encoding and
    dropping invariant features. In Binary Encoding, features are converted
    to a binary representation and the binary digits are used as new features.

    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features),
            high cardinality categorical features only
        fit: boolean
            Indicates if we should train or load an encoder

    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder
    if fit:
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # Transform the data with the fitted encoder
    return encoder.transform(dataframe)
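# A minimal usage sketch for the function above, assuming the project's
# pickle_obj/unpickle_obj helpers are available and a toy frame of
# high-cardinality categoricals:
import pandas as pd

train = pd.DataFrame({'city': ['rome', 'oslo', 'lima', 'kiev'],
                      'team': ['red', 'blue', 'red', 'green']})
encoded_train = encode_high_cardinality_categorical_df(train, fit=True)

# Later, at inference time, the pickled encoder is loaded and reused:
new_rows = pd.DataFrame({'city': ['oslo'], 'team': ['red']})
encoded_new = encode_high_cardinality_categorical_df(new_rows, fit=False)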
def do_cat_bin(X, X_test, cols):
    # Fit the binary encoder on the training rows only, then apply it to both splits
    be = BinaryEncoder(cols=cols).fit(X[cols])
    X_tr = be.transform(X[cols])
    X_te = be.transform(X_test[cols])
    new_cols = list(X_tr.columns)
    print(f'do_cat_bin: Done. Added {len(new_cols)} new columns.')
    return X_tr, X_te, new_cols
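# A self-contained demo of do_cat_bin on toy frames (names are illustrative).
# The encoder is fit on X only, so a category that appears solely in X_test
# ('yellow' below) is handled as an unknown value by the encoder:
import pandas as pd

X = pd.DataFrame({'color': ['red', 'blue', 'green', 'red'], 'size': [1, 2, 3, 4]})
X_test = pd.DataFrame({'color': ['blue', 'yellow'], 'size': [5, 6]})
X_tr, X_te, new_cols = do_cat_bin(X, X_test, cols=['color'])
print(new_cols)  # the binary digit columns, e.g. color_0, color_1, ...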
def fit_binary(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the binary encoder by fitting it on the given DataFrame.
    NaN values, and the special value specified under `na_value`, will be
    encoded as an unseen value.

    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for the DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model: encoder model to be passed to the `transform_binary` method
    """
    df = input_df.copy()
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})
    encoder = BinaryEncoder(cols=cols, drop_invariant=True)
    encoder = encoder.fit(df)
    # Force NaN to the "unseen" ordinal code (-2) in the underlying ordinal mapping
    for idx in range(len(encoder.base_n_encoder.ordinal_encoder.mapping)):
        encoder.base_n_encoder.ordinal_encoder.mapping[idx]["mapping"].loc[np.nan] = -2
    result_df = encoder.transform(df)
    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
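# The docstring above references a companion transform_binary, which is not
# shown in this section; the following is a hedged sketch of what it could
# look like, mirroring fit_binary's NA handling (an assumption, not the
# original implementation):
def transform_binary(input_df: pd.DataFrame, model: dict) -> pd.DataFrame:
    df = input_df.copy()
    na_value = model["na_value"]
    if na_value is not None:
        for col in model["cols"]:
            # Map the sentinel null value to NaN, as during fitting
            df[col] = df[col].replace({na_value: np.nan})
    return model["encoder"].transform(df)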
def to_categorical(
        training_data: pd.DataFrame,
        test_data: pd.DataFrame) -> dict:
    # Columns with object dtype are treated as categorical
    categorical_columns_list = list(
        training_data.columns[training_data.dtypes == object])
    # 'impute' is the legacy category_encoders option name ('inpute' was a typo)
    ce_be = BinaryEncoder(cols=categorical_columns_list, handle_unknown="impute")
    training_data_ce_binary = ce_be.fit_transform(training_data)
    test_data_ce_binary = ce_be.transform(test_data)
    return dict(train_data_categorical=training_data_ce_binary,
                test_data_categorical=test_data_ce_binary)
def categoryEncode(df, cols=None, mode="binary"):
    if mode == "ordinal":
        encoder = OrdinalEncoder(cols=cols,
                                 handle_missing="return_nan",
                                 handle_unknown="return_nan")
    elif mode == "binary":
        encoder = BinaryEncoder(cols=cols)
    else:
        # Guard against an undefined `encoder` for unsupported modes
        raise ValueError(f"Unsupported mode: {mode!r}")
    df_new = encoder.fit_transform(df)
    return df_new
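# A quick usage sketch for categoryEncode on a toy frame (assumes the
# category_encoders OrdinalEncoder/BinaryEncoder imports used above):
import pandas as pd

toy = pd.DataFrame({'temp': ['Cold', 'Warm', 'Hot', 'Warm']})
print(categoryEncode(toy, cols=['temp'], mode='binary'))   # temp_0, temp_1, ...
print(categoryEncode(toy, cols=['temp'], mode='ordinal'))  # single integer column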
class DFBinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = BinaryEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])
        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        new_X = self.model.transform(X[self.transform_cols])
        # Binary digits fit comfortably in int8
        new_X[new_X.columns] = new_X[new_X.columns].astype('int8')
        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=self.transform_cols, inplace=True)
        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        # Encoded columns are named '<original>_<digit>'
        columns = [
            x for x in X.columns
            if any(y for y in self.transform_cols if x.startswith(f'{y}_'))
        ]
        new_X = self.model.inverse_transform(X[columns])
        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=columns, inplace=True)
        return new_X
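# A short usage sketch for DFBinaryEncoder: encode one column, then invert.
# Toy data; column names are illustrative.
import pandas as pd

df = pd.DataFrame({'fruit': ['apple', 'pear', 'plum'], 'qty': [3, 1, 2]})
enc = DFBinaryEncoder(columns=['fruit'])
encoded = enc.fit_transform(df)            # 'fruit' replaced by fruit_0, fruit_1, ...
restored = enc.inverse_transform(encoded)  # recovers the original 'fruit' column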
class df_BinaryEncoder(TransformerMixin):
    """
    Use for encoding nominal features.

    Parameters
    ----------
    handle_unknown: str, default='ignore'
    """

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        self.enc = BinaryEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self

    def transform(self, X):
        return self.enc.transform(X)
def read_feature(self, one_hot=False, create_not_existing_features=True):
    """
    Reads a feature from disk and returns it.
    If one_hot is False, the feature is returned as it was saved.
    If one_hot is True, the categorical columns are one-hot encoded
    by means of self.columns_to_onehot.
    """
    path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
        self.cluster, self.mode, self.name)
    if not os.path.exists(path):
        if create_not_existing_features:
            choice = 'y'
            print('Missing feature: creating')
        else:
            choice = yesno_choice(
                'feature \'{}\' does not exist. want to create?'.format(self.name))
        if choice == 'y':
            self.save_feature()
        else:
            return

    index_col = 0 if self.save_index else None
    df = pd.read_csv(path, index_col=index_col)
    #df = df.drop('Unnamed: 0', axis=1)
    print('{} feature read'.format(self.name))

    # then proceed with one hot
    if one_hot:
        for t in self.columns_to_onehot:
            col = df[t[0]]
            one_hot_prefix = t[2] if len(t) == 3 else t[0]
            if t[1] == 'single':
                oh = pd.get_dummies(col, prefix=one_hot_prefix)
            elif t[1] == 'binary':
                ce = BinaryEncoder(cols=[t[0]])
                oh = ce.fit_transform(col)
            else:
                # multi-label column: values are '|'-separated strings
                mid = col.apply(lambda x: x.split('|') if isinstance(x, str) else x)
                mid.fillna(value='', inplace=True)
                mlb = MultiLabelBinarizer()
                oh = mlb.fit_transform(mid)
                oh = pd.DataFrame(oh, columns=mlb.classes_)
                oh = oh.astype(np.uint8)
                oh = oh.add_prefix(one_hot_prefix)

            df = df.drop([t[0]], axis=1)
            df = pd.concat([df, oh], axis=1)

        print('{} onehot completed'.format(self.name))

    df = self.post_loading(df)
    return df
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial,
               n_classes, random_state):
    if not tags["handles categorical"]:
        large_threshold = 6
        # TODO: handle numpy arrays with categorical?
        # TODO: handle multiclass / Regression
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns
                                   if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns
                                   if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X.iloc[:, col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X.iloc[:, col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X[:, col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X[:, col])) <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["target", "binary", "catboost"]
        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe",
                                   OneHotEncoder(cols=small_cardinal_cats,
                                                 drop_invariant=True))
        if len(large_cardinal_cats) > 0:
            if objective_type == "classification" and n_classes == 1:
                cat_enc_types.append("woe")
            cat_enc_type = trial.suggest_categorical(
                estimator_name + " cat_enc_type", cat_enc_types)
            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )
            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)
            else:  # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute the dataset beforehand
            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
        return enc_pipe
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    if not tags["handles categorical"]:
        large_threshold = 6
        # TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns
                               if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns
                               if X[col].nunique() <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["binary", "catboost", "woe", "target"]
        # A list comprehension never returns None, so test for emptiness
        # instead of `is not None`
        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe",
                                   OneHotEncoder(cols=small_cardinal_cats,
                                                 drop_invariant=True))
        if len(large_cardinal_cats) > 0:
            if objective_type == "classification" and n_classes > 2:  # multiclass
                cat_enc_types = ["binary"]
            cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)
            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats, drop_invariant=True,
                                    # mapping=mapping
                                    )
            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats, drop_invariant=True)
            else:  # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats, drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute the dataset beforehand
            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
        return enc_pipe
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

ord1 = OrdinalEncoder()
# fit_transform expects a 2D input, hence the double brackets
# (the original redundant ord1.fit([df['ord_2']]) call is dropped)
df["ord_2"] = ord1.fit_transform(df[["ord_2"]])
df.head(10)

dnew = df.copy()

# Ordinal encoding through mapping
temp_dict = {'Cold': 1, 'Warm': 2, 'Hot': 3}
dnew['Ord_2_encod'] = dnew.ord_2.map(temp_dict)
dnew = dnew.drop(['ord_2'], axis=1)

# Binary encoding (note: ord_2 was already ordinal-encoded above; run each
# encoding section independently on the raw data)
from category_encoders import BinaryEncoder

encoder = BinaryEncoder(cols=['ord_2'])
newdata = encoder.fit_transform(df['ord_2'])
df = pd.concat([df, newdata], axis=1)
df = df.drop(['ord_2'], axis=1)
df.head(10)

# Hash encoding
from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=3, input_type='string')
hashed_Feature = h.fit_transform(df['nom_0'])
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis=1)
df.head(10)

df.insert(6, "Target", [0, 1, 1, 0, 0, 1, 0, 0, 0, 1], True)
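# Why binary encoding is compact: k distinct categories need on the order of
# log2(k) output columns, versus k columns for one-hot. A quick illustration
# on toy data (not from the script above):
import pandas as pd
from category_encoders import BinaryEncoder

cities = pd.DataFrame({'city': [f'city_{i}' for i in range(100)]})
encoded = BinaryEncoder(cols=['city']).fit_transform(cities)
print(encoded.shape[1])  # on the order of log2(100) ≈ 7 columns, not 100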
def predict():
    '''
    For rendering results on HTML GUI
    '''
    features = [x for x in request.form.values()]
    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features, columns=[
        'Name', 'Genre', 'Comments', 'Likes', 'Popularity', 'Followers'
    ])

    df = pd.read_csv('data.csv')
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)

    # Drop rows with inconsistent counts (views should bound likes/comments/popularity)
    df.drop(index=df[df['Views'] < df['Likes']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, axis=1, inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index, axis=1, inplace=True)

    # Remove extreme outliers using the IQR rule
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    df = df.drop(
        columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp', 'index'])
    y = df['Views']
    df = df.drop(columns=['Views'])

    # Binary-encode the categorical columns; reuse the fitted encoder on the request
    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)
    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    rg2.fit(X_train, y_train)

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    rg3.fit(X_train, y_train)

    # Stack the boosted regressors with a random-forest meta-regressor
    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)

    y_pred = rg6.predict(f)
    y_pred = y_pred.astype(int)
    return render_template(
        'index.html',
        prediction_text='Number of Views is {}'.format(y_pred))
def main():
    import os
    import sys
    from datetime import datetime

    import numpy as np
    import psutil
    from category_encoders import BinaryEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.externals import joblib  # sklearn < 0.23
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Imputer  # sklearn < 0.22

    sys.path.append("../")
    import serial_preprocess_data as preprocess
    import utils

    cpu_count = int(psutil.cpu_count() / 4) - 2
    print("Trying to use {} number of cpu".format(cpu_count))

    data_dir = "../../data/"
    hdf_files = sorted([data_dir + file for file in os.listdir(data_dir)
                        if '.h5' in file])
    columns = ['Year', 'Cancelled', 'Distance', 'Diverted', 'ArrTime',
               'Dest', 'FlightNum',
               # 'DepDelay',  ## not using DepDelay
               'ActualElapsedTime', 'ArrDelay', 'DayofMonth', 'UniqueCarrier',
               'Month', 'DepTime', 'Origin', 'DayOfWeek']
    scoring = 'roc_auc'
    no_of_files = 12

    df = preprocess.readFilesToDf("h5", file_list=hdf_files[:no_of_files],
                                  cols=columns)
    print("Size of file read in is {0:.2f} GB".format(
        utils.getFileSizeInGB(hdf_files[:no_of_files])))
    print("Reading in {0} selected columns only".format(len(columns)))
    print("Columns are:", columns)
    print("Memory usage of the data frame is {0:.2f} GB".format(
        np.sum(df.memory_usage()) / 1e9))

    # Preprocess data: check the percentage of NaNs
    _ = preprocess.find_cardinality_of_categorical_variables(df)
    ix = preprocess.clean_data_minimally(df)

    # Apply cleaning of the data
    df = df.iloc[ix].reindex()
    df = df.sort_values(by=['DayofMonth', 'Month', 'Year', 'DepTime'])

    feature_cols = list(df.columns)
    feature_cols.remove('ArrDelay')
    feature_cols.remove('Cancelled')
    df['delayCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_multiple_categories)
    df['delayBinaryCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_two_categories)

    X = df[feature_cols]
    y = df['delayBinaryCat']

    encoder = BinaryEncoder()
    encoder.fit(X)
    transformed_X = encoder.transform(X)
    print("Transformed columns are ", transformed_X.columns)

    df_gpby = df.groupby('delayCat')
    delay_percentage_breakdown = df_gpby.ArrDelay.count() / df.shape[0] * 100
    delay_percentage_breakdown.index = ['very early', 'early', 'on time',
                                        'late', 'very late']
    print("Percentage breakdown of different categories " +
          "of the target variable is: \n", delay_percentage_breakdown)
    # The breakdown of delay is pretty balanced, although a careful study
    # would also look at the correlation with other features.

    tscv = TimeSeriesSplit()

    # Only put grid-search steps into the pipeline
    rf_pipeline_steps = [
        # Impute missing feature values with median values
        ("imputer", Imputer(strategy="median")),
        ('rf', RandomForestClassifier(n_jobs=cpu_count, oob_score=True)),
    ]
    gridsearch_parameters = dict([
        ("rf__n_estimators", [800]),
        ("rf__max_features", [None]),  # not many features to subset from
    ])
    rf_pipeline = Pipeline(rf_pipeline_steps)
    est = GridSearchCV(rf_pipeline,
                       param_grid=gridsearch_parameters,
                       n_jobs=1,
                       scoring=scoring,
                       cv=tscv.split(X))  # 3-fold CV with the old TimeSeriesSplit default
    print("Fitting the values")
    print("Columns in the training data are ", X.columns)
    est.fit(transformed_X.values, y.values)

    print("Saving the model")
    print("Best score " + scoring + " is", est.best_score_)
    print("Best parameters are ", est.best_params_)
    datetime_stamp = datetime.now().strftime(
        "%D_%X").replace("/", "_").replace(":", "_")
    joblib.dump(est.best_estimator_,
                "./RF_CV_pipeline_" + datetime_stamp + ".pkl")
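# For context on the cv argument above: TimeSeriesSplit yields ordered,
# non-shuffled folds, which is why the frame is sorted chronologically first.
# A tiny self-contained illustration:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

for train_ix, test_ix in TimeSeriesSplit(n_splits=3).split(np.arange(10)):
    print(train_ix, test_ix)  # each test fold starts right after its training fold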
class BinaryEncoder():
    """Maps each categorical value to several columns using binary encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'binary'

    def __init__(self, cols=None):
        self.encoder = Binary(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.

        returns self.
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.

        returns encoded matrix (dataframe).
        """
        X_new = self.encoder.transform(X)
        feature_names = []
        for feature in self.features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        X_new.columns = feature_names
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.

        returns encoded matrix (dataframe).
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the binary encoder and underlying ordinal encoder.

        returns tuple (binary_encoder_mapping, ordinal_encoder_mapping).
        """
        def mapping_helper(method, category):
            if isinstance(category, str):
                for m in method.mapping:
                    if m['col'] == category:
                        return m['mapping']
            return method.mapping[category]['mapping']

        return mapping_helper(self.encoder.base_n_encoder, category), \
            mapping_helper(self.encoder.base_n_encoder.ordinal_encoder, category)

    def encode_features_list(self, X, features):
        feature_list = []
        index = 0
        for f in features:
            if f.get_name() in self.encoder.base_n_encoder.cols:
                f = ft.Feature([f], primitive=BinaryEnc(self, index))
                index += 1
            feature_list.append(f)
        return feature_list

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
def nominal(name: str):
    # Select the single column, then binary-encode it
    return (name, Pipeline([(name + '.select', DataFrameSelector([name])),
                            (name + '.scale', BinaryEncoder())]))
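# A hedged usage sketch for nominal(): it yields a (name, Pipeline) pair
# suitable for sklearn's FeatureUnion. DataFrameSelector is the project's own
# column-selection transformer, assumed to behave as its name suggests.
from sklearn.pipeline import FeatureUnion

union = FeatureUnion([nominal('color'), nominal('country')])
# encoded = union.fit_transform(df)  # each column is binary-encoded separately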
import pickle
from argparse import ArgumentParser
from typing import Callable

import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
#import dask.dataframe as dd

DataFrame = pd.core.frame.DataFrame
Series = pd.core.series.Series
Array = np.ndarray
Imputer = Callable[[DataFrame], DataFrame]
nan = np.nan

df = pd.read_csv('data.csv').drop(['name'], axis=1)
X_ = df.drop('status_group', axis=1)
y = df.status_group

# Binary-encode, standardize, then project onto the first 70 principal components
be = BinaryEncoder()
FEATS = 70
pca = PCA(n_components=FEATS)
vals = pca.fit_transform(StandardScaler().fit_transform(be.fit_transform(X_)))
X = pd.DataFrame(vals,
                 columns=[f"pc{k+1}" for k in range(FEATS)],
                 index=y.index).assign(y=y)


def mcar_goblin(dat: DataFrame, ratio: float) -> DataFrame:
    ''' Simulate MCAR with bernoulli '''

    def ident_or_nan(x: float) -> float:
        ''' if heads, replace value with nan. if tails, identity '''
from category_encoders import MEstimateEncoder

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=["Zipcode"], m=5.0)

# Fit the encoder on the encoding split.
encoder.fit(X_encode, y_encode)

# Encode the Zipcode column to create the final training data
X_train = encoder.transform(X_pretrain)

## Preprocessing
mode_binary = Pipeline([
    ('encoder', SimpleImputer(strategy='most_frequent')),
    ('binary', BinaryEncoder())])

transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown='ignore'), [
        'hotel', 'meal', 'market_segment', 'distribution_channel',
        'reserved_room_type', 'deposit_type', 'customer_type']),
    ('mode binary', mode_binary, ['country']),
    ('impute mode', SimpleImputer(strategy='most_frequent'), ['children'])],
    remainder='passthrough')
# https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data

X = df.drop('is_canceled', axis=1)
y = df['is_canceled']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.2,
                                                    random_state=1515)
# Feature scaling after split
classifier = LogisticRegression(multi_class="multinomial")

mapper = DataFrameMapper([
    (cat_cols, [OrdinalEncoder(), OneHotEncoder()]),
    (cont_cols, None)
])
build_audit(mapper, classifier, "OrdinalEncoderAudit")

mapper = DataFrameMapper([
    (cat_cols, BaseNEncoder(base=2, drop_invariant=True)),
    (cont_cols, None)
])
build_audit(mapper, classifier, "Base2EncoderAudit")

mapper = DataFrameMapper([
    (cat_cols, BaseNEncoder(base=3, drop_invariant=True)),
    (cont_cols, None)
])
build_audit(mapper, classifier, "Base3EncoderAudit")

classifier = RandomForestClassifier(n_estimators=31, random_state=13)

mapper = DataFrameMapper([
    (cat_cols, BinaryEncoder()),
    (cont_cols, None)
])
build_audit(mapper, classifier, "BinaryEncoderAudit", compact=False)
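# Side note on the audits above: category_encoders' BinaryEncoder is
# equivalent to BaseNEncoder with base=2, which a quick check on toy data
# (illustrative column name) should confirm:
import pandas as pd
from category_encoders import BaseNEncoder, BinaryEncoder

toy = pd.DataFrame({'c': ['a', 'b', 'c', 'a']})
same = BinaryEncoder(cols=['c']).fit_transform(toy).equals(
    BaseNEncoder(base=2, cols=['c']).fit_transform(toy))
print(same)  # expected: True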
build_audit(mapper, classifier, "OrdinalEncoderAudit")

mapper = DataFrameMapper([(cat_cols, BaseNEncoder(base=2, drop_invariant=True)),
                          (cont_cols, None)])
build_audit(mapper, classifier, "Base2EncoderAudit")

mapper = DataFrameMapper([
    (cat_cols, [BaseNEncoder(base=3, drop_invariant=True), OneHotEncoder()]),
    (cont_cols, None)
])
build_audit(mapper, classifier, "Base3EncoderAudit")

classifier = XGBClassifier(objective="binary:logistic", n_estimators=31,
                           max_depth=7, random_state=13)

mapper = DataFrameMapper([(cat_cols, BaseNEncoder(base=4, drop_invariant=True)),
                          (cont_cols, None)])
build_audit(mapper, classifier, "Base4EncoderAudit", compact=False)

classifier = RandomForestClassifier(n_estimators=31, random_state=13)

mapper = DataFrameMapper([(cat_cols, BinaryEncoder()), (cont_cols, None)])
build_audit(mapper, classifier, "BinaryEncoderAudit", compact=False)
def binaryEncoding(df, column):
    from category_encoders import BinaryEncoder
    encoder = BinaryEncoder(cols=[column])
    df = encoder.fit_transform(df)
    return df
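# A minimal usage sketch for the helper above on a toy DataFrame:
import pandas as pd

pets = pd.DataFrame({'animal': ['cat', 'dog', 'fox'], 'age': [2, 5, 1]})
pets = binaryEncoding(pets, 'animal')  # 'animal' becomes animal_0, animal_1, ...
print(pets.columns.tolist())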
def doCleanupEncode(X, y=None, cat=None, oh=None, binary=None, loo=None,
                    woe=None, lp_cols=None, NoData=True):
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        # `|` binds tighter than `is not`, so the original
        # `cat is not None | oh is not None` raised a TypeError; use `or`
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, (cat or []) + (oh or []), [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh, use_cat_names=True, return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator').fit(X)
            X = ec.fit_transform(X)
            # dropping the indicator columns did not help performance
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
            X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary, drop_invariant=True, return_df=True,
                                handle_unknown='indicator').fit(X)
            X = enc.transform(X)
        else:
            enc = BinaryEncoder(cols=binary, drop_invariant=True,
                                return_df=True).fit(X)
            X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for w in woe:
            X[w] = X[w].fillna('NoData')
        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for l in loo:
            X[l] = X[l].fillna('NoData')
        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    if lp_cols is not None:
        # drop least predictive columns
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
def train_pipeline(X, y):
    """ Builds and trains a machine learning pipeline """
    numerical_col = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session',
        'Pageviews', 'Hits', 'Created to arrival'
    ]
    categorical_col = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium',
        'Device', 'Created month'
    ]
    binary_col = [
        'Flights booked', 'User agent', 'User repeat', 'User referral'
    ]
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    # Numerical pipeline
    numerical_pipeline = make_pipeline(ColumnSelector(cols=numerical_col),
                                       SimpleImputer(strategy="median"),
                                       StandardScaler())

    # Categorical pipeline
    categorical_pipeline = make_pipeline(
        ColumnSelector(cols=categorical_col),
        SimpleImputer(strategy="constant", fill_value='None'),
        OneHotEncoder())

    # Binary pipeline
    binary_pipeline = make_pipeline(ColumnSelector(cols=binary_col),
                                    SimpleImputer(strategy="most_frequent"),
                                    BinaryEncoder())

    # Text pipelines
    text_pipeline_1 = make_pipeline(
        ColumnSelector(cols=['Click path']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(),
        HashingVectorizer(n_features=2**11),
        DenseTransformer())
    text_pipeline_2 = make_pipeline(
        ColumnSelector(cols=['GA keyword']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(),
        TfidfVectorizer(),
        DenseTransformer())

    # Pipeline union
    processing_pipeline = make_union(numerical_pipeline, categorical_pipeline,
                                     binary_pipeline, text_pipeline_1,
                                     text_pipeline_2)

    estimator = BalancedRandomForestClassifier(
        bootstrap=False, class_weight=None, criterion='gini', max_depth=60,
        max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0,
        n_estimators=472, n_jobs=1, oob_score=False, random_state=None,
        replacement=False, sampling_strategy='auto', verbose=0,
        warm_start=False)

    predictive_pipeline = make_pipeline(processing_pipeline, estimator)
    predictive_pipeline.fit(X, y)
    return predictive_pipeline
# The opening read_csv was truncated in the source; reconstructed here as an
# assumption, mirroring the test split below.
train = pd.read_csv(os.path.join(config["input_path"], "train.csv"),
                    na_values=-1, nrows=500)
test = pd.read_csv(os.path.join(config["input_path"], "test.csv"),
                   na_values=-1, nrows=500)

train_feature, train_label = train.iloc[:, 2:].copy(), train.iloc[:, 1].copy()
test_feature = test.iloc[:, 1:].copy()
del train, test

# Drop the ps_calc_* block and align the test columns with train
train_feature = train_feature[[col for col in train_feature.columns
                               if not col.startswith("ps_calc_")]]
test_feature = test_feature[train_feature.columns]

# Split numeric vs. categorical columns by column-name suffix
ncs = [col for col in train_feature.columns
       if not col.endswith(("_bin", "_cat"))]
ccs = [col for col in train_feature.columns
       if col.endswith(("_bin", "_cat"))]

eet = EntityEmbeddingTree(numeric_columns=ncs, categorical_columns=ccs)
eet.fit(X=train_feature, y=train_label)

encoder = BinaryEncoder()
print(encoder.fit_transform(eet.transform(X=train_feature)).shape)
print(encoder.transform(eet.transform(X=test_feature)).shape)