def _plot_logodd(self):
    """Fit a logistic GAM and plot the partial dependence of every feature.

    Reads ``self.predictors_cont``, ``self.predictors_qual``, ``self.labels``,
    ``self.d_cont`` and ``self.d_qual``. Rows with a NaN in any continuous
    predictor are dropped before fitting. One subplot is drawn per feature
    (continuous features first, then qualitative ones), showing the partial
    dependence together with its 95% interval as red dashed lines.

    The figure is shown without blocking. Terms for which pyGAM (or the
    plotting call) raises ``ValueError`` are skipped, leaving their subplot
    empty.
    """
    # Handle missing values for the GAM: keep only rows whose continuous
    # predictors are all observed.
    complete_rows = ~np.isnan(self.predictors_cont).any(axis=1)

    # Fit the GAM on every complete observation.
    term_dtypes = (['numerical'] * self.d_cont
                   + ['categorical'] * self.d_qual)
    design = pd.concat(
        [
            pd.DataFrame(
                self.predictors_cont[complete_rows, :]).astype('float'),
            pd.DataFrame(
                self.predictors_qual[complete_rows, :]).astype('category'),
        ],
        axis=1)
    gam = LogisticGAM(dtype=term_dtypes).fit(
        design, self.labels[complete_rows])

    # Whatever the values of predictors_cont_number and
    # predictors_qual_number, everything is plotted for now.
    # squeeze=False keeps ``axs`` two-dimensional so that a single-feature
    # model can still be iterated over; figsize is given to the figure
    # directly rather than through the global rcParams.
    _, axs = plt.subplots(1, self.d_cont + self.d_qual,
                          figsize=(28, 8), squeeze=False)
    for i, ax in enumerate(axs[0]):
        try:
            XX = gam.generate_X_grid(term=i)
            # One call returns both the estimate and its interval.
            pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
            ax.plot(XX[:, i], pdep)
            ax.plot(XX[:, i], confi, c='r', ls='--')
        except ValueError:  # pragma: no cover
            continue
    plt.show(block=False)
# Candidate numbers of splines explored by the first grid search.
n_splines = [5, 10, 15, 20, 25]

# Stretch the previously drawn ``lams`` by 6, shift by -3 (i.e. towards the
# range -3, 3) and exponentiate to obtain the candidate penalties.
lams = np.exp(lams * 6 - 3)

# Candidate shape constraints explored by the second grid search.
cons = [
    'convex',
    'concave',
    'monotonic_inc',
    'monotonic_dec',
    'circular',
    'none',
]

# First search over penalties and spline counts (weighted), then search over
# constraints starting from the model returned by the first search.
random = LogisticGAM(aa).gridsearch(
    trainX, trainy, weights=w, lam=lams, n_splines=n_splines)
random = random.gridsearch(trainX, trainy, constraints=cons)

# Report the selected hyper-parameters and the hold-out accuracy.
print(random.lam, random.n_splines, random.constraints, sep="\n")
print(random.accuracy(testX, testy))

from sklearn.metrics import confusion_matrix

preds = random.predict(testX)
print(confusion_matrix(testy, preds))

# One figure per non-intercept term: partial dependence plus its 95% interval.
for i, term in enumerate(random.terms):
    if not term.isintercept:
        XX = random.generate_X_grid(term=i)
        pdep, confi = random.partial_dependence(term=i, X=XX, width=0.95)
        plt.figure()
        plt.plot(XX[:, term.feature], pdep)
        plt.plot(XX[:, term.feature], confi, c='r', ls='--')
        plt.title(names1[i])
        plt.show()
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    """Fit a pyGAM model on ``X``/``y`` and register it on this object.

    A ``LogisticGAM`` is used when ``self.num_classes >= 2`` (labels are
    encoded with a ``LabelEncoder`` fitted on ``self.labels``), otherwise a
    ``LinearGAM``. Categorical/object columns are one-hot encoded with a
    ``"Missing"`` category for absent values; numeric columns are imputed
    with their training median. A partial-dependence plot is written to the
    temporary folder for every model input column.

    Parameters
    ----------
    X : frame exposing ``.names`` and ``.to_pandas()``
        Training features.
    y : array-like
        Training target.
    sample_weight, eval_set, sample_weight_eval_set, **kwargs
        Accepted for interface compatibility; not used by this method.

    Raises
    ------
    IgnoreError
        When the underlying fit fails with ``np.linalg.LinAlgError``,
        ``pygam.utils.OptimizationError`` or the LAPACK ``DLASCL``
        ``ValueError``. Any other ``ValueError`` is re-raised unchanged.

    Side effects
    ------------
    Sets ``is_classifier``, ``X_categorical``, ``X_numeric``,
    ``train_levels``, ``train_mode``, ``median_train``, ``mean_target`` and,
    when categorical columns exist, ``enc`` and ``encoded_categories``;
    finally calls ``self.set_model_properties``.
    """
    orig_cols = list(X.names)

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import OneHotEncoder
    from collections import Counter
    import pygam
    from pygam import LinearGAM, LogisticGAM
    import matplotlib.pyplot as plt

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up model
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        clf = LogisticGAM(terms="auto", lam=self.params["lam"],
                          max_iter=self.params["max_iter"])
        self.is_classifier = True
    else:
        clf = LinearGAM(terms="auto", lam=self.params["lam"],
                        max_iter=self.params["max_iter"])
        self.is_classifier = False

    X = self.basic_impute(X)

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols

    # Change all float32 values to float64
    float32_cols = [
        col for col, dtype in zip(orig_cols, X.dtypes)
        if str(dtype) == 'float32'
    ]
    if float32_cols:
        X = X.astype({col: np.float64 for col in float32_cols})
    X_datatypes = [str(item) for item in X.dtypes]

    # List the categorical and numerical features
    self.X_categorical = [
        col for col, dtype in zip(orig_cols, X_datatypes)
        if dtype in ('category', 'object')
    ]
    self.X_numeric = [
        item for item in orig_cols if item not in self.X_categorical
    ]

    # Find the levels and mode for each categorical feature
    # for use in the test set. Both dictionaries are (re)created here so the
    # per-column assignments below never hit a missing attribute.
    self.train_levels = {}
    self.train_mode = {}
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One hot encode the categorical features
    # And replace missing values with a Missing category
    if self.X_categorical:
        X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
            "Missing").copy()
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        # ``get_feature_names`` was removed from recent scikit-learn
        # releases; prefer its replacement and fall back for old versions.
        if hasattr(self.enc, 'get_feature_names_out'):
            encoded_names = self.enc.get_feature_names_out(
                self.X_categorical)
        else:
            encoded_names = self.enc.get_feature_names(
                input_features=self.X_categorical)
        self.encoded_categories = list(encoded_names)

        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        # Share X's index so the column-wise concat aligns row for row.
        X = pd.concat([
            X[self.X_numeric],
            pd.DataFrame(X_enc, columns=self.encoded_categories,
                         index=X.index)
        ], axis=1)

    # Replace missing numeric values with the training median
    self.median_train = {}
    for colname in self.X_numeric:
        self.median_train[colname] = X[colname].quantile(0.5)
        X.loc[:, colname] = X[colname].fillna(
            self.median_train[colname]).copy()

    try:
        clf.fit(X, y)
    except np.linalg.LinAlgError as e:
        raise IgnoreError("np.linalg.LinAlgError") from e
    except pygam.utils.OptimizationError as e:
        raise IgnoreError("pygam.utils.OptimizationError") from e
    except ValueError as e:
        if 'On entry to DLASCL parameter number' in str(e):
            raise IgnoreError('On entry to DLASCL parameter number') from e
        raise

    p_values = np.array(clf.statistics_['p_values'])

    # Plot the partial dependence plots for each feature
    for ii in range(X.shape[1]):
        XX = clf.generate_X_grid(term=ii)
        # One call returns both the estimate and its 95% interval.
        pdep, confi = clf.partial_dependence(term=ii, X=XX, width=.95)
        plt.figure()
        plt.plot(XX[:, ii], pdep)
        plt.plot(XX[:, ii], confi, c='r', ls='--')
        plt.title("Partial Dependence " + str(ii),
                  fontdict={'fontsize': 10})
        # Save before showing: on interactive backends the figure may be
        # gone once show() returns, which would write a blank image.
        plt.savefig(os.path.join(
            tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
            bbox_inches="tight")
        plt.show()
        # Release the figure so one is not kept alive per feature.
        plt.close()

    # The last p-value is excluded from the importances (it is assumed to
    # belong to the intercept term -- TODO confirm against pyGAM).
    feature_p_values = p_values[0:(len(p_values) - 1)]
    if max(feature_p_values) > 0:
        importances = -np.log(feature_p_values + 10**(-16))
        importances = list(importances / max(importances))
    else:
        importances = [1] * (len(p_values) - 1)

    self.mean_target = np.array(sum(y) / len(y))

    self.set_model_properties(model=clf,
                              features=list(X.columns),
                              importances=importances,
                              iterations=self.params['n_estimators'])
# Fit a logistic GAM with default settings on X / y and print its summary.
gam = LogisticGAM().fit(X, y)
gam.summary()

# In-sample diagnostics, evaluated for their interactive echo; the trailing
# numbers are the values noted down alongside each call.
roc_auc_score(y, gam.predict_proba(X))  # 0.994173140954495
gam.accuracy(X, y)  # 0.9560632688927944

#-----------------------------------------------------
# Explore and interpret individual features
plt.ion()  # interactive mode on
plt.rcParams['figure.figsize'] = (28, 8)  # set before the figure is created
# One subplot per column of X, laid out on a single row.
fig, axs = plt.subplots(1, X.shape[1])
for i, ax in enumerate(axs):
    # With meshgrid=True the grid is indexed as XX[0] for the x-axis values.
    XX = gam.generate_X_grid(term=i, meshgrid=True)
    pdep, confi = gam.partial_dependence(term=i, X=XX, meshgrid=True, width=.95)
    ax.plot(XX[0], pdep)
    # Columns 0 and 1 of ``confi`` are drawn as grey dashed lines
    # (presumably the lower and upper bound of the 95% interval -- confirm).
    ax.plot(XX[0], confi[:, 0], c='grey', ls='--')
    ax.plot(XX[0], confi[:, 1], c='grey', ls='--')
    ax.set_title(selected_features[i])
plt.show()

#-----------------------------------------------------
# Tuning Smoothness and Penalties
# Six spline counts (presumably one per term -- confirm against the code
# that consumes this list).
n_splines = [25, 6, 25, 25, 6, 4]