def fit(self):
    """
    perform model fitting with out-of-fold (OOF) cross validation

    Steps, in order:
      1. drop the group column from the feature lists
      2. optionally target-encode every categorical feature (OOF for train,
         full-train means for test)
      3. for non-GBDT models: impute NaN/inf and optionally scale
      4. train one model per CV fold, collecting OOF and test predictions
      5. assemble a per-fold feature-importance data frame

    Side effects: ``self.train_df``, ``self.test_df``, ``self.features`` and
    ``self.categoricals`` are modified in place.

    NOTE(review): another definition of ``fit`` appears later in this file;
    if both live in the same scope the later one wins -- confirm which one
    is intended to stay.

    :RETURNS:
    y_pred     : test predictions averaged over the folds
    loss_score : ``self.calc_metric`` evaluated on all OOF predictions
    model      : model trained on the last fold (None if there was no fold)
    oof_pred   : OOF predictions (one column per class for multiclass)
    y_vals     : target values aligned with ``oof_pred``
    fi_df      : feature importance per fold plus the column "importance_mean"

    :RAISES:
    ValueError : ``self.scaler`` is neither None, "MinMax" nor "Standard"
    """
    n_train = self.train_df.shape[0]
    n_test = self.test_df.shape[0]
    is_multiclass = self.task == "multiclass"

    # initialize
    y_vals = np.zeros((n_train, ))
    if is_multiclass:
        n_class = len(np.unique(self.train_df[self.target].values))
        oof_pred = np.zeros((n_train, n_class))
        y_pred = np.zeros((n_test, n_class))
    else:
        oof_pred = np.zeros((n_train, ))
        y_pred = np.zeros((n_test, ))

    # group does not kick in when group k fold is used
    if self.group is not None:
        if self.group in self.features:
            self.features.remove(self.group)
        if self.group in self.categoricals:
            self.categoricals.remove(self.group)
    fi = np.zeros((self.n_splits, len(self.features)))

    # target encoding
    numerical_features = [f for f in self.features if f not in self.categoricals]
    if self.target_encoding:
        # smoothing weight = 1 / (1 + exp(-(count - shift) / scale)):
        # categories with few samples are pulled towards the overall mean
        smooth_shift = 0
        smooth_scale = 1
        overall_mean = self.train_df[self.target].mean()
        for c in self.categoricals:
            data_tmp = pd.DataFrame({c: self.train_df[c].values,
                                     'target': self.train_df[self.target].values})
            encoded = np.full(n_train, np.nan)
            for train_idx, val_idx in self.get_cv():
                # statistics come from the training part of the fold only,
                # so a row never sees its own target
                grouped = data_tmp.iloc[train_idx].groupby(c)['target']
                target_mean = grouped.mean()
                target_count = grouped.count()

                # smoothing
                weight = 1 / (1 + np.exp(-(target_count - smooth_shift) / smooth_scale))
                target_mean = weight * target_mean + (1 - weight) * overall_mean

                # allocate (categories unseen in the training part stay NaN)
                encoded[val_idx] = (self.train_df[c].iloc[val_idx]
                                    .map(target_mean).astype(float).values)
            self.train_df[c] = encoded

            # replace categorical variable in test; whole-column assignment
            # (instead of .loc[:, c]) so the column becomes float dtype
            target_mean = data_tmp.groupby(c)['target'].mean()
            self.test_df[c] = self.test_df[c].map(target_mean).astype(float).values

        # no categoricals any more
        numerical_features = self.features.copy()
        self.categoricals = []

    # fill nan
    if self.model not in ['lgb', 'catb', 'xgb']:
        # fill NaN (numerical features -> median, categorical features -> mode)
        if numerical_features:
            for df in (self.train_df, self.test_df):
                cleaned = df[numerical_features].replace([np.inf, -np.inf], np.nan)
                df[numerical_features] = cleaned.fillna(cleaned.median())
        # guard: mode().iloc[0] raises IndexError on an empty column list,
        # which is always the case after target encoding
        if self.categoricals:
            for df in (self.train_df, self.test_df):
                df[self.categoricals] = df[self.categoricals].fillna(
                    df[self.categoricals].mode().iloc[0])

        # scaling, if necessary
        if self.scaler is not None and numerical_features:
            # validate before touching the data so a bad option cannot leave
            # the frames half transformed
            if self.scaler == "MinMax":
                scaler = MinMaxScaler()
            elif self.scaler == "Standard":
                scaler = StandardScaler()
            else:
                raise ValueError(
                    'scaler must be None, "MinMax" or "Standard", got {!r}'.format(self.scaler))

            # to normal
            pt = QuantileTransformer(n_quantiles=100, random_state=self.seed,
                                     output_distribution="normal")
            self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features])

            # standardize
            self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features])

    # test inputs: NN takes one array per categorical plus the numerical block
    if self.model == "nn":
        x_test = ([np.absolute(self.test_df[i]) for i in self.categoricals]
                  + [self.test_df[numerical_features]])
    else:
        x_test = self.test_df[self.features]

    # fitting with out of fold
    model = None
    for fold, (train_idx, val_idx) in enumerate(self.get_cv()):
        # train test split
        x_train = self.train_df[self.features].iloc[train_idx]
        x_val = self.train_df[self.features].iloc[val_idx]
        y_train = self.train_df[self.target].iloc[train_idx]
        y_val = self.train_df[self.target].iloc[val_idx]

        if self.model == "nn":
            x_train = [np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]]
            x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]]

        # model fitting
        train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
        model, importance = self.train_model(train_set, val_set)
        fi[fold, :] = importance
        y_vals[val_idx] = y_val

        # predictions and check cv score
        oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task)
        y_pred += ypred.reshape(y_pred.shape) / self.n_splits
        if is_multiclass:
            oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape)
            fold_score = self.calc_metric(y_vals[val_idx], np.argmax(oof_pred[val_idx, :], axis=1))
        else:
            oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape)
            fold_score = self.calc_metric(y_vals[val_idx], oof_pred[val_idx])
        print('Partial score of fold {} is: {}'.format(fold, fold_score))

    # feature importance data frame (one row per feature and fold)
    fi_df = pd.concat(
        [pd.DataFrame({"features": self.features, "importance": fi[n, :], "fold": n})
         for n in range(self.n_splits)],
        ignore_index=True)
    gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index()
    fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean'))

    # outputs
    if is_multiclass:
        loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1))
    else:
        loss_score = self.calc_metric(y_vals, oof_pred)

    if self.verbose:
        print('Our oof loss score is: ', loss_score)
    return y_pred, loss_score, model, oof_pred, y_vals, fi_df


def plot_feature_importance(self, rank_range=(1, 50)):
    """
    function for plotting feature importance

    Draws a horizontal bar plot of ``self.fi_df`` restricted to the features
    ranked ``rank_range[0]`` .. ``rank_range[1]`` (1-based, inclusive) by
    "importance_mean", and returns the plotted rows.

    NOTE(review): the previous docstring said nothing is returned when the
    model is NN; this method always returns the sliced frame -- confirm.

    :INPUTS:
    rank_range : two-element sequence (first rank, last rank)

    :EXAMPLE:
    # fit LGB regression model
    model = RunModel(train_df, test_df, target, features, categoricals=categoricals,
                     model="lgb", task="regression", n_splits=4, cv_method="KFold",
                     group=None, seed=1220, scaler=None)

    # plot
    fi_df = model.plot_feature_importance(rank_range=[1, 100])
    """
    first_rank, last_rank = rank_range[0], rank_range[1]

    # every feature owns n_splits consecutive rows once sorted by its mean
    # importance, so ranks are converted to row offsets by that factor
    start = self.n_splits * (first_rank - 1)
    stop = self.n_splits * last_rank
    ranked = self.fi_df.sort_values(by="importance_mean", ascending=False).reset_index()
    sorted_df = ranked.iloc[start:stop]

    # plot feature importance
    _, ax = plt.subplots(1, 1, figsize=(10, 20))
    sns.barplot(data=sorted_df, x="importance", y="features", orient='h', ax=ax)
    ax.set_xlabel("feature importance")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    return sorted_df
def fit(self):
    """
    perform model fitting with out-of-fold (OOF) cross validation

    Steps, in order:
      1. drop the group column from the feature lists
      2. optionally target-encode every categorical feature (OOF for train,
         full-train means for test)
      3. for non-GBDT models: impute NaN/inf and optionally scale
      4. train one model per CV fold, collecting OOF and test predictions
      5. assemble a per-fold feature-importance data frame

    Side effects: ``self.train_df``, ``self.test_df``, ``self.features`` and
    ``self.categoricals`` are modified in place.

    NOTE(review): an earlier definition of ``fit`` also appears in this
    file; if both live in the same scope this later one wins -- confirm
    which one is intended to stay.

    :RETURNS:
    y_pred     : test predictions averaged over the folds
    loss_score : ``self.calc_metric`` evaluated on all OOF predictions
    model      : model trained on the last fold (None if there was no fold)
    oof_pred   : OOF predictions (one column per class for multiclass)
    y_vals     : target values aligned with ``oof_pred``
    fi_df      : feature importance per fold plus the column "importance_mean"

    :RAISES:
    ValueError : ``self.scaler`` is neither None, "MinMax" nor "Standard"
    """
    n_train = self.train_df.shape[0]
    n_test = self.test_df.shape[0]
    is_multiclass = self.task == "multiclass"

    # initialize
    y_vals = np.zeros((n_train, ))
    if is_multiclass:
        n_class = len(np.unique(self.train_df[self.target].values))
        oof_pred = np.zeros((n_train, n_class))
        y_pred = np.zeros((n_test, n_class))
    else:
        oof_pred = np.zeros((n_train, ))
        y_pred = np.zeros((n_test, ))

    # group does not kick in when group k fold is used
    if self.group is not None:
        if self.group in self.features:
            self.features.remove(self.group)
        if self.group in self.categoricals:
            self.categoricals.remove(self.group)
    fi = np.zeros((self.n_splits, len(self.features)))

    # target encoding
    numerical_features = [f for f in self.features if f not in self.categoricals]
    if self.target_encoding:
        # smoothing weight = 1 / (1 + exp(-(count - shift) / scale)):
        # categories with few samples are pulled towards the overall mean
        smooth_shift = 0
        smooth_scale = 1
        overall_mean = self.train_df[self.target].mean()
        for c in self.categoricals:
            data_tmp = pd.DataFrame({c: self.train_df[c].values,
                                     'target': self.train_df[self.target].values})
            encoded = np.full(n_train, np.nan)
            for train_idx, val_idx in self.get_cv():
                # statistics come from the training part of the fold only,
                # so a row never sees its own target
                grouped = data_tmp.iloc[train_idx].groupby(c)['target']
                target_mean = grouped.mean()
                target_count = grouped.count()

                # smoothing
                weight = 1 / (1 + np.exp(-(target_count - smooth_shift) / smooth_scale))
                target_mean = weight * target_mean + (1 - weight) * overall_mean

                # allocate (categories unseen in the training part stay NaN)
                encoded[val_idx] = (self.train_df[c].iloc[val_idx]
                                    .map(target_mean).astype(float).values)
            self.train_df[c] = encoded

            # replace categorical variable in test; whole-column assignment
            # (instead of .loc[:, c]) so the column becomes float dtype
            target_mean = data_tmp.groupby(c)['target'].mean()
            self.test_df[c] = self.test_df[c].map(target_mean).astype(float).values

        # no categoricals any more
        numerical_features = self.features.copy()
        self.categoricals = []

    # fill nan
    if self.model not in ['lgb', 'catb', 'xgb']:
        # fill NaN (numerical features -> median, categorical features -> mode)
        if numerical_features:
            for df in (self.train_df, self.test_df):
                cleaned = df[numerical_features].replace([np.inf, -np.inf], np.nan)
                df[numerical_features] = cleaned.fillna(cleaned.median())
        # guard: mode().iloc[0] raises IndexError on an empty column list,
        # which is always the case after target encoding
        if self.categoricals:
            for df in (self.train_df, self.test_df):
                df[self.categoricals] = df[self.categoricals].fillna(
                    df[self.categoricals].mode().iloc[0])

        # scaling, if necessary
        if self.scaler is not None and numerical_features:
            # validate before touching the data so a bad option cannot leave
            # the frames half transformed
            if self.scaler == "MinMax":
                scaler = MinMaxScaler()
            elif self.scaler == "Standard":
                scaler = StandardScaler()
            else:
                raise ValueError(
                    'scaler must be None, "MinMax" or "Standard", got {!r}'.format(self.scaler))

            # to normal
            pt = QuantileTransformer(n_quantiles=100, random_state=self.seed,
                                     output_distribution="normal")
            self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features])

            # standardize
            self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features])

    # test inputs: NN takes one array per categorical plus the numerical block
    if self.model == "nn":
        x_test = ([np.absolute(self.test_df[i]) for i in self.categoricals]
                  + [self.test_df[numerical_features]])
    else:
        x_test = self.test_df[self.features]

    # fitting with out of fold
    model = None
    for fold, (train_idx, val_idx) in enumerate(self.get_cv()):
        # train test split
        x_train = self.train_df[self.features].iloc[train_idx]
        x_val = self.train_df[self.features].iloc[val_idx]
        y_train = self.train_df[self.target].iloc[train_idx]
        y_val = self.train_df[self.target].iloc[val_idx]

        if self.model == "nn":
            x_train = [np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]]
            x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]]

        # model fitting
        train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
        model, importance = self.train_model(train_set, val_set)
        fi[fold, :] = importance
        y_vals[val_idx] = y_val

        # predictions and check cv score
        oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task)
        y_pred += ypred.reshape(y_pred.shape) / self.n_splits
        if is_multiclass:
            oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape)
            fold_score = self.calc_metric(y_vals[val_idx], np.argmax(oof_pred[val_idx, :], axis=1))
        else:
            oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape)
            fold_score = self.calc_metric(y_vals[val_idx], oof_pred[val_idx])
        print('Partial score of fold {} is: {}'.format(fold, fold_score))

    # feature importance data frame (one row per feature and fold)
    fi_df = pd.concat(
        [pd.DataFrame({"features": self.features, "importance": fi[n, :], "fold": n})
         for n in range(self.n_splits)],
        ignore_index=True)
    gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index()
    fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean'))

    # outputs
    if is_multiclass:
        loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1))
    else:
        loss_score = self.calc_metric(y_vals, oof_pred)

    if self.verbose:
        print('Our oof loss score is: ', loss_score)
    return y_pred, loss_score, model, oof_pred, y_vals, fi_df