def do_cat_bin(X, X_test, cols):
    # Fit the binary encoder on the training columns only, then apply the
    # same fitted encoder to both train and test so the encodings match.
    be = BinaryEncoder(cols=cols).fit(X[cols])
    X_tr = be.transform(X[cols])
    X_te = be.transform(X_test[cols])
    new_cols = list(X_tr.columns)
    print(f'do_cat_bin: Done. Added {len(new_cols)} new columns.')
    return X_tr, X_te, new_cols
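# A minimal usage sketch for `do_cat_bin` (hedged: the frames and the 'city'
# column below are hypothetical, not from the original project).
import pandas as pd
from category_encoders import BinaryEncoder

X = pd.DataFrame({'city': ['nyc', 'sf', 'la', 'sf'], 'price': [1, 2, 3, 4]})
X_test = pd.DataFrame({'city': ['la', 'nyc'], 'price': [5, 6]})

X_tr, X_te, new_cols = do_cat_bin(X, X_test, cols=['city'])
# Three distinct categories need two binary digits, so `new_cols` holds two
# digit columns (e.g. 'city_0', 'city_1'); 'price' is untouched because only
# `cols` is passed through the encoder.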
def fit_binary(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Fits a binary encoder on the given DataFrame.

    NaN values, and any special null value passed as `na_value`, are mapped
    to the sentinel ordinal -2 so that they are always encoded as unseen
    values.

    Args:
        input_df: DataFrame used to fit the encoder
        cols: list of categorical columns to be encoded
        na_value: value in the DataFrame to treat as null

    Returns:
        result_df: encoded copy of `input_df`
        model: encoder model to be passed to the `transform_binary` method
    """
    df = input_df.copy()
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})
    encoder = BinaryEncoder(cols=cols, drop_invariant=True)
    encoder = encoder.fit(df)
    # Force NaN to the fixed ordinal -2 in the underlying ordinal encoder so
    # missing values always map to the same binary pattern.
    for idx in range(len(encoder.base_n_encoder.ordinal_encoder.mapping)):
        encoder.base_n_encoder.ordinal_encoder.mapping[idx]["mapping"].loc[
            np.nan] = -2
    result_df = encoder.transform(df)
    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
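# The docstring above references a `transform_binary` counterpart that is not
# shown in this section. A minimal sketch of what it might look like, assuming
# only the `model` dict returned by `fit_binary`:
def transform_binary(input_df: pd.DataFrame, model: dict) -> pd.DataFrame:
    """Applies a binary encoder fitted by `fit_binary` to new data."""
    df = input_df.copy()
    if model["na_value"] is not None:
        # Mirror the null handling used at fit time
        for col in model["cols"]:
            df[col] = df[col].replace({model["na_value"]: np.nan})
    return model["encoder"].transform(df)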
def encode_high_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode high-cardinality categorical features using binary encoding,
    dropping invariant features.

    In binary encoding, each category is first mapped to an integer, the
    integer is written in base 2, and each binary digit becomes a new
    feature. A worked example follows this function.

    --- Arguments
    dataframe: pd.DataFrame
        Dataframe with pre-processed data (i.e. renamed features),
        high-cardinality categorical features only
    fit: boolean
        Indicates whether to train a new encoder or load a saved one

    Returns
    dataframe: pd.DataFrame
        Dataframe with encoded data
    """
    # Train a new encoder and persist it, or load a previously saved one
    if fit:
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # Transform data
    return encoder.transform(dataframe)
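# The worked example promised in the docstring (hedged: the 'color' column is
# hypothetical; exact column order can vary across category_encoders versions).
import pandas as pd
from category_encoders import BinaryEncoder

df = pd.DataFrame({'color': ['red', 'green', 'blue', 'cyan', 'pink']})
print(BinaryEncoder(cols=['color']).fit_transform(df))
# Five categories receive ordinals 1..5; the largest ordinal (5 = 0b101)
# needs three binary digits, so the single 'color' column becomes three
# columns color_0, color_1, color_2 (e.g. red -> 1 -> 0, 0, 1).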
def to_categorical(
        training_data: pd.DataFrame,
        test_data: pd.DataFrame
) -> dict:
    # Encode every object-dtype column. Note: handle_unknown="impute" (the
    # original had the typo "inpute") is the older category_encoders spelling;
    # newer releases use handle_unknown="value".
    categorical_columns_list = list(
        training_data.columns[training_data.dtypes == object])
    ce_be = BinaryEncoder(cols=categorical_columns_list, handle_unknown="impute")
    training_data_ce_binary = ce_be.fit_transform(training_data)
    test_data_ce_binary = ce_be.transform(test_data)
    return dict(train_data_categorical=training_data_ce_binary,
                test_data_categorical=test_data_ce_binary)
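# A hedged sketch of what happens to categories unseen during fit (the
# 'genre' frames are hypothetical; the exact fallback behavior and option
# names differ across category_encoders releases).
import pandas as pd
from category_encoders import BinaryEncoder

train = pd.DataFrame({'genre': ['rock', 'jazz', 'pop']})
test = pd.DataFrame({'genre': ['rock', 'metal']})  # 'metal' was never seen

enc = BinaryEncoder(cols=['genre']).fit(train)
print(enc.transform(test))  # 'metal' falls back to the encoder's reserved
                            # unknown-category pattern instead of raising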
class DFBinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = BinaryEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        # Default to encoding every column when none are specified
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])
        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )

        # Encode, downcast the binary digit columns, and replace the originals
        new_X = self.model.transform(X[self.transform_cols])
        new_X[new_X.columns] = new_X[new_X.columns].astype('int8')
        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=self.transform_cols, inplace=True)
        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )

        # Binary digit columns are named '<original>_<i>'
        columns = [
            x for x in X.columns
            if any(x.startswith(f'{c}_') for c in self.transform_cols)
        ]
        new_X = self.model.inverse_transform(X[columns])
        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=columns, inplace=True)
        return new_X
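# A minimal usage sketch for `DFBinaryEncoder` (hedged: the frame below is
# hypothetical; inverse_transform support depends on the encoder version).
import pandas as pd

df = pd.DataFrame({'fruit': ['apple', 'pear', 'plum'],
                   'weight': [150, 120, 60]})
enc = DFBinaryEncoder(columns=['fruit'])
out = enc.fit_transform(df)
# 'fruit' is replaced by int8 digit columns fruit_0, fruit_1; 'weight' stays
restored = enc.inverse_transform(out)  # recovers the original 'fruit' column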
class df_BinaryEncoder(TransformerMixin):
    """
    Use for encoding nominal features.

    Parameters
    ----------
    handle_unknown : str, default='ignore'
        How the underlying BinaryEncoder treats categories unseen during fit.
    """

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        self.enc = BinaryEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self

    def transform(self, X):
        return self.enc.transform(X)
def predict():
    '''
    For rendering results on the HTML GUI
    '''
    features = [x for x in request.form.values()]
    features = np.array(features).reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=['Name', 'Genre', 'Comments', 'Likes',
                                     'Popularity', 'Followers'])

    df = pd.read_csv('data.csv')
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)

    # Drop rows with impossible counts (a song cannot have fewer views than
    # likes, comments, or popularity points)
    df.drop(index=df[df['Views'] < df['Likes']].index, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index, inplace=True)

    # Remove extreme outliers (beyond 3 * IQR)
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    df = df.drop(
        columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp', 'index'])
    y = df['Views']
    X = df.drop(columns=['Views'])

    # Fit the binary encoder on the training data, then apply the same
    # encoding to the submitted form features
    be = BinaryEncoder()
    X = be.fit_transform(X)
    f = be.transform(features)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=0)
    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)
    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    rg2.fit(X_train, y_train)
    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    rg3.fit(X_train, y_train)

    # Stack the two boosting models with the random forest as meta-regressor
    rg6 = StackingRegressor(regressors=[rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)

    y_pred = rg6.predict(f).astype(int)
    return render_template(
        'index.html',
        prediction_text='Number of Views is {}'.format(y_pred[0]))
class BinaryEncoder():
    """Maps each categorical value to several columns using binary encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'binary'

    def __init__(self, cols=None):
        self.encoder = Binary(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.

        returns self
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.

        returns encoded matrix (dataframe)
        """
        X_new = self.encoder.transform(X)
        feature_names = []
        for feature in self.features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        X_new.columns = feature_names
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.

        returns encoded matrix (dataframe)
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the binary encoder and underlying ordinal encoder.

        returns tuple (binary_encoder_mapping, ordinal_encoder_mapping)
        """
        def mapping_helper(method, category):
            # `category` may be a column name or a positional index
            if isinstance(category, str):
                for m in method.mapping:
                    if m['col'] == category:
                        return m['mapping']
            return method.mapping[category]['mapping']

        return mapping_helper(self.encoder.base_n_encoder, category), \
            mapping_helper(self.encoder.base_n_encoder.ordinal_encoder, category)

    def encode_features_list(self, X, features):
        # Wrap each encoded column's feature in a BinaryEnc primitive so the
        # derived binary digit features can be tracked downstream
        feature_list = []
        index = 0
        for f in features:
            if f.get_name() in self.encoder.base_n_encoder.cols:
                f = ft.Feature([f], primitive=BinaryEnc(self, index))
                index += 1
            feature_list.append(f)
        return feature_list

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
def doCleanupEncode(X, y=None, cat=None, oh=None, binary=None, loo=None,
                    woe=None, lp_cols=None, NoData=True):
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        # note: the original used `|`, which binds tighter than `is not` and
        # raises a TypeError on None; `or` is the intended logic
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, cat + oh, [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator')
            X = ec.fit_transform(X)
            # dropping the indicator columns did not help performance
        else:
            # one-hot encode, then drop the 0 column if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
        X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for w in woe:
            X[w] = X[w].fillna('NoData')
        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for l in loo:
            X[l] = X[l].fillna('NoData')
        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    if lp_cols is not None:
        # drop least predictive columns
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
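# A hedged usage sketch for `doCleanupEncode`; every column name below is
# hypothetical, and `enrich` is the project's own helper module.
X_clean = doCleanupEncode(
    X,                    # feature DataFrame
    y=y,                  # target, required by the WOE and leave-one-out encoders
    oh=['L_state'],       # low-cardinality columns -> one-hot
    binary=['L_zip'],     # high-cardinality columns -> binary
    woe=['L_channel'],    # weight-of-evidence encoded
    loo=['L_agent'],      # leave-one-out encoded
    lp_cols=['L_junk'],   # least-predictive columns to drop
    NoData=True)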
def main():
    import psutil
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    # Imputer and sklearn.externals.joblib were removed in newer scikit-learn
    # releases; SimpleImputer and the standalone joblib package replace them
    from sklearn.impute import SimpleImputer
    import joblib
    from category_encoders import BinaryEncoder
    from datetime import datetime
    from sklearn.model_selection import TimeSeriesSplit
    import os
    import numpy as np
    import sys
    sys.path.append("../")
    import serial_preprocess_data as preprocess
    import utils

    cpu_count = int(psutil.cpu_count() / 4) - 2
    print("Trying to use {} CPUs".format(cpu_count))
    data_dir = "../../data/"
    hdf_files = sorted([data_dir + file
                        for file in os.listdir(data_dir) if '.h5' in file])
    columns = ['Year',
               'Cancelled',
               'Distance',
               'Diverted',
               'ArrTime',
               'Dest',
               'FlightNum',
               # 'DepDelay',  # not using DepDelay
               'ActualElapsedTime',
               'ArrDelay',
               'DayofMonth',
               'UniqueCarrier',
               'Month',
               'DepTime',
               'Origin',
               'DayOfWeek'
               ]
    scoring = 'roc_auc'
    no_of_files = 12
    df = preprocess.readFilesToDf("h5",
                                  file_list=hdf_files[:no_of_files],
                                  cols=columns)
    print("Size of the files read in is {0:.2f} GB".format(
        utils.getFileSizeInGB(hdf_files[:no_of_files])))
    print("Reading in {0} selected columns only".format(len(columns)))
    print("Columns are:", columns)
    print("Memory usage of the data frame is {0:.2f} GB".format(
        np.sum(df.memory_usage()) / 1e9))

    # Preprocess the data and check the percentage of NaNs
    _ = preprocess.find_cardinality_of_categorical_variables(df)
    ix = preprocess.clean_data_minimally(df)

    # Apply the cleaning of the data
    df = df.iloc[ix].reindex()
    df = df.sort_values(by=['DayofMonth', 'Month', 'Year', 'DepTime'])

    feature_cols = list(df.columns)
    feature_cols.remove('ArrDelay')
    feature_cols.remove('Cancelled')

    df['delayCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_multiple_categories)
    df['delayBinaryCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_two_categories)

    X = df[feature_cols]
    y = df['delayBinaryCat']

    # Binary-encode the categorical features before fitting the model
    encoder = BinaryEncoder()
    encoder.fit(X)
    transformed_X = encoder.transform(X)
    print("Transformed columns are ", transformed_X.columns)

    df_gpby = df.groupby('delayCat')
    delay_percentage_breakdown = df_gpby.ArrDelay.count() / df.shape[0] * 100
    delay_percentage_breakdown.index = ['very early',
                                        'early',
                                        'on time',
                                        'late',
                                        'very late'
                                        ]
    print("Percentage breakdown of the different categories " +
          "of the target variable is: \n", delay_percentage_breakdown)
    # the breakdown of the delay categories is pretty balanced,
    # although a careful study would also look at the correlation with
    # other features
    tscv = TimeSeriesSplit()

    # Only put grid search steps into the pipeline
    rf_pipeline_steps = [
        # impute missing feature values with median values
        ("imputer", SimpleImputer(strategy="median")),
        ('rf', RandomForestClassifier(n_jobs=cpu_count, oob_score=True)),
    ]
    gridsearch_parameters = dict([
        ("rf__n_estimators", [800]),
        ("rf__max_features", [None]),  # not many features to subset from
    ])

    rf_pipeline = Pipeline(rf_pipeline_steps)
    est = GridSearchCV(rf_pipeline,
                       param_grid=gridsearch_parameters,
                       n_jobs=1,
                       scoring=scoring,
                       cv=tscv.split(X),  # time-series cross-validation splits
                       )
    print("Fitting the values")
    print("Columns in the training data are ", X.columns)
    est.fit(transformed_X.values, y.values)

    print("Saving the model")
    print("Best score " + scoring + " is ", est.best_score_)
    print("Best parameters are ", est.best_params_)
    datetime_stamp = datetime.now().strftime(
        "%D_%X").replace("/", "_").replace(":", "_")
    joblib.dump(est.best_estimator_,
                "./RF_CV_pipeline_" + datetime_stamp + ".pkl")
train = pd.read_csv(os.path.join(config["input_path"], "train.csv"),
                    na_values=-1,
                    nrows=500)
test = pd.read_csv(os.path.join(config["input_path"], "test.csv"),
                   na_values=-1,
                   nrows=500)
train_feature, train_label = train.iloc[:, 2:].copy(), train.iloc[:, 1].copy()
test_feature = test.iloc[:, 1:].copy()
del train, test

# Drop the ps_calc_* feature block
train_feature = train_feature[[
    col for col in train_feature.columns if not col.startswith("ps_calc_")
]]
test_feature = test_feature[train_feature.columns]

# Split columns into numeric and categorical by the dataset's suffix convention
ncs = [
    col for col in train_feature.columns if not col.endswith(("_bin", "_cat"))
]
ccs = [
    col for col in train_feature.columns if col.endswith(("_bin", "_cat"))
]

# Fit the project-specific EntityEmbeddingTree, then binary-encode its
# (categorical) output for both splits
eet = EntityEmbeddingTree(numeric_columns=ncs, categorical_columns=ccs)
eet.fit(X=train_feature, y=train_label)
encoder = BinaryEncoder()
print(encoder.fit_transform(eet.transform(X=train_feature)).shape)
print(encoder.transform(eet.transform(X=test_feature)).shape)