def get_gam_model(self, features: List[Field], model_type=TYPE_LINEAR):
    # Factor terms for categorical features, spline terms for numeric ones.
    model_spec = f(0) if features[0].is_factor() else s(0, n_splines=self.num_splines)
    for i in range(1, len(features)):
        model_spec += f(i) if features[i].is_factor() else s(i, n_splines=self.num_splines)
    if model_type == TYPE_LINEAR:
        return LinearGAM(model_spec)
    if model_type == TYPE_LOGISTIC:
        return LogisticGAM(model_spec)
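
# A minimal, self-contained sketch of the same term-building pattern, with a
# plain list of booleans standing in for the Field objects (which are defined
# elsewhere in this project); all names below are illustrative assumptions.
from pygam import LinearGAM, s, f

is_factor = [False, False, True]  # hypothetical: two numeric features, one categorical
num_splines = 10

model_spec = f(0) if is_factor[0] else s(0, n_splines=num_splines)
for i in range(1, len(is_factor)):
    model_spec += f(i) if is_factor[i] else s(i, n_splines=num_splines)

gam = LinearGAM(model_spec)  # or LogisticGAM(model_spec) for classification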
def GAM_linear(X, y):
    from pygam import LinearGAM, s, f
    import pandas as pd

    X = X.to_numpy()
    y = y.to_numpy()
    # Spline terms for the first two features, factor term for the third.
    gam = LinearGAM(s(0) + s(1) + f(2))
    gam.gridsearch(X, y)
    y_pred = pd.DataFrame(gam.predict(X))
    y_pred['actual'] = y
    y_pred['residual'] = y_pred.actual - y_pred[0]
    # Note: summary() prints the fit report and returns None.
    return gam, gam.summary(), y_pred
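
# Hedged usage sketch for GAM_linear: the pygam wage dataset happens to match
# the assumed layout (two numeric columns, one categorical third column).
import pandas as pd
from pygam.datasets import wage

X, y = wage(return_X_y=True)
gam, summary, preds = GAM_linear(pd.DataFrame(X), pd.Series(y))
print(preds[['actual', 'residual']].head())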
def _fit(self, X, y, mylam=None, **kwargs):
    if isinstance(X, pd.DataFrame):
        X = X.values
    if not self.fit_binary_feat_as_factor_term:
        self.model = self.model_cls(max_iter=self.max_iter,
                                    n_splines=self.n_splines, **self.kwargs)
    else:
        # Binary features become factor terms; everything else gets a spline.
        formulas = []
        for idx, feat_name in enumerate(self.feature_names):
            num_unique_x = len(self.X_values_counts[feat_name])
            if num_unique_x < 2:
                continue
            if num_unique_x == 2:
                formulas.append(f(idx))
            else:
                formulas.append(s(idx))
        the_formula = formulas[0]
        for term in formulas[1:]:
            the_formula += term
        self.model = self.model_cls(the_formula, max_iter=self.max_iter,
                                    n_splines=self.n_splines, **self.kwargs)

    if not self.search:
        # Just fit the model with the default lam.
        return self.model.fit(X, y, **kwargs)

    if mylam is None:
        mylam = self.search_lam

    # Grid search over lam; on numerical failure, drop the first value and retry.
    try:
        print('search range from %f to %f' % (mylam[0], mylam[-1]))
        self.model.gridsearch(X, y, lam=mylam, **kwargs)
    except (np.linalg.LinAlgError, pygam.utils.OptimizationError) as e:
        print('Got the following error:', str(e), '\nRetrying the grid search')
        if hasattr(self.model, 'coef_'):
            del self.model.coef_
        self._fit(X, y, mylam=mylam[1:], **kwargs)

    if not hasattr(self.model, 'statistics_'):
        # The training never finished successfully.
        raise Exception('Training failed.')
    return self
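
# A sketch of one way to build the `search_lam` grid that _fit consumes: an
# ascending log-spaced range, so each retry (mylam[1:]) drops the smallest,
# most failure-prone value first. The range itself is an assumption here, not
# the project's actual default.
import numpy as np

search_lam = np.logspace(-3, 3, 15)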
def fit(self):
    # Spline terms for numerical features, factor terms for the rest.
    S = s(0) if self.feature_names[0] in self.numerical_features else f(0)
    for i in range(1, len(self.feature_names)):
        if self.feature_names[i] in self.numerical_features:
            S += s(i)
        else:
            S += f(i)
    if self.mode == 'regression':
        gam = LinearGAM(S)
        gam.gridsearch(self.X_train, self.y_train)
        self._is_fitted = True
        self.explainer = gam
    elif self.mode == 'classification':
        gam = LogisticGAM(S)
        gam.gridsearch(np.array(self.X_train), self.y_train)
        self._is_fitted = True
        self.explainer = gam
    else:
        raise ValueError('mode should be regression or classification')
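
# Hedged sketch of the classification branch on a toy binary problem, with the
# term list written out directly; the data below is synthetic and illustrative.
import numpy as np
from pygam import LogisticGAM, s, f

rng = np.random.default_rng(0)
X_train = np.column_stack([rng.normal(size=200), rng.integers(0, 3, size=200)])
y_train = (X_train[:, 0] + 0.5 * X_train[:, 1] > 0).astype(int)

clf = LogisticGAM(s(0) + f(1))
clf.gridsearch(X_train, y_train)
print(clf.accuracy(X_train, y_train))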
def lingam(term='spline'):
    """
    Load an unfitted LinearGAM model with a single term of the given type.

    INPUT: term: 'linear', 'spline' or 'factor'
    RETURN: model
    """
    if term == 'linear':
        # GAM with a linear term
        regmod = LinearGAM(l(0))
    elif term == 'spline':
        # GAM with a spline term
        regmod = LinearGAM(s(0))
    elif term == 'factor':
        # GAM with a factor term
        regmod = LinearGAM(f(0))
    else:
        raise ValueError('Given GAM term unknown')
    utils.display_get_params('LinearGAM Model Description', regmod.get_params())
    return regmod
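
# For reference, the three single-term models lingam() can return, constructed
# directly (runnable without the project's utils module):
from pygam import LinearGAM, l, s, f

linear_mod = LinearGAM(l(0))  # strictly linear effect of feature 0
spline_mod = LinearGAM(s(0))  # smooth spline effect
factor_mod = LinearGAM(f(0))  # one coefficient per category level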
gam2.summary()

import pandas as pd
pd.DataFrame(X).corr()

######################################################
# regression
from pygam import LinearGAM, s, f
from pygam.datasets import wage
import matplotlib.pyplot as plt

X, y = wage(return_X_y=True)

## model
gam = LinearGAM(s(0) + s(1) + f(2))
gam.gridsearch(X, y)
gam.summary()

## plotting
fig, axs = plt.subplots(1, 3)
titles = ['year', 'age', 'education']
for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    # 95% confidence interval
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r', ls='--')
    ax.set_title(titles[i])
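
# Note: when width is given, partial_dependence returns a (pdep, intervals)
# tuple, which is why the red dashed line above indexes [1]. The same call in
# its more explicit unpacked form:
XX0 = gam.generate_X_grid(term=0)
pdep, confi = gam.partial_dependence(term=0, X=XX0, width=.95)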
from pygam.datasets import wage
from pygam import LinearGAM, s, f
import numpy as np
import matplotlib.pyplot as plt

X, y = wage()

gam = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)
gam.summary()

# Grid search over a shared log-spaced lam grid for all three terms.
lam = np.logspace(-3, 5, 5)
lams = [lam] * 3
gam.gridsearch(X, y, lam=lams)
gam.summary()

# Random search instead of a grid: 100 random lam triples.
lams = np.random.rand(100, 3)  # random points on [0, 1], with shape (100, 3)
lams = lams * 6 - 3            # shift values to [-3, 3]
lams = np.exp(lams)            # transform values to [e^-3, e^3]
random_gam = LinearGAM(s(0) + s(1) + f(2)).gridsearch(X, y, lam=lams)
random_gam.summary()

# Compare generalized cross-validation scores (lower is better).
print(gam.statistics_['GCV'] < random_gam.statistics_['GCV'])

for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    XX = gam.generate_X_grid(term=i)
    plt.figure()
    plt.plot(XX[:, term.feature], gam.partial_dependence(term=i, X=XX))
    plt.plot(XX[:, term.feature],
             gam.partial_dependence(term=i, X=XX, width=.95)[1],
             c='r', ls='--')
    plt.title(repr(term))
    plt.show()
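
# LinearGAM also provides prediction intervals for observations (distinct from
# the confidence intervals on the partial dependence curves); a quick check on
# the model fitted above:
intervals = gam.prediction_intervals(X, width=0.95)  # shape (n_samples, 2)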
# https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html
import pygam
from pygam.datasets import wage

X, y = wage()

from pygam import LinearGAM, s, f

# Fit a spline term to the first 2 features, and a factor term to the 3rd feature.
gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam.summary()
# https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html
from pygam.datasets import wage

X, y = wage()
X.shape, y.shape

from pygam import LinearGAM, s, f

gam = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam.summary()

# By default, s() uses 20 basis functions.
gam2 = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)
gam2.summary()

# By default, every term has a lam penalty of 0.6. Run a grid search for lam,
# optimizing GCV (the generalized cross-validation score).
import numpy as np
lam = np.logspace(-3, 5, 5)
lams = [lam] * 3
lams

gam3 = LinearGAM(s(0) + s(1) + f(2))
gam3.gridsearch(X, y, lam=lams)
gam3.summary()
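
# After gridsearch, the selected smoothing parameters are stored on the model,
# one lam per term; a quick way to inspect the winning configuration:
print(gam3.lam)                 # chosen lam values
print(gam3.statistics_['GCV'])  # generalized cross-validation score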
def gam_wave_1(df, df_train: pd.DataFrame):
    # Map galaxy names to integer codes (and back) so they can feed the model.
    galaxy_to_int = {g: i for i, g in enumerate(df_train.galaxy.unique())}
    int_to_galaxy = {v: k for k, v in galaxy_to_int.items()}

    tmp_df_to_predict = df.loc[:, predictors_wave_1].dropna()
    tmp_df_to_change_train = df_train.loc[:, predictors_wave_1].dropna()
    df_train_return = df_train.copy()
    df_train.loc[:, ['galaxy']] = [galaxy_to_int[i] for i in df_train.galaxy]

    for TARGET in wave_1_gam:
        tmp_df_to_train_on = df_train.loc[:, predictors_wave_1 + [TARGET]]
        tmp_df_to_train_on = tmp_df_to_train_on.dropna()
        X = tmp_df_to_train_on.loc[:, tmp_df_to_train_on.columns != TARGET]
        y = tmp_df_to_train_on.loc[:, TARGET]

        # 500 random lam vectors (one entry per term), log-uniform on [e^-3, e^3].
        lams = np.exp(np.random.random(size=(500, 9)) * 6 - 3)
        gam = LinearGAM(
            s(0, dtype='categorical', by=1) + f(0) + s(1) + s(2) + s(3) +
            s(4) + s(5) + s(6) + s(7)).gridsearch(np.array(X), np.array(y),
                                                  lam=lams)

        tmp_df_to_predict.loc[:, ['galaxy']] = [
            galaxy_to_int[i] for i in tmp_df_to_predict.galaxy
        ]
        tmp_df_to_change_train.loc[:, ['galaxy']] = [
            galaxy_to_int[i] for i in tmp_df_to_change_train.galaxy
        ]
        tmp_df_to_predict[TARGET] = gam.predict(tmp_df_to_predict)
        tmp_df_to_change_train[TARGET] = gam.predict(tmp_df_to_change_train)
        tmp_df_to_predict.loc[:, ['galaxy']] = [
            int_to_galaxy[i] for i in tmp_df_to_predict.galaxy
        ]
        tmp_df_to_change_train.loc[:, ['galaxy']] = [
            int_to_galaxy[i] for i in tmp_df_to_change_train.galaxy
        ]

        # Fill missing target values with the GAM predictions.
        df = df.set_index(['galaxy', 'galactic year']).fillna(
            tmp_df_to_predict.set_index(['galaxy', 'galactic year'])).reset_index()
        df_train_return = df_train_return.set_index(
            ['galaxy', 'galactic year']).fillna(
                tmp_df_to_change_train.set_index(
                    ['galaxy', 'galactic year'])).reset_index()

        tmp_df_to_predict = tmp_df_to_predict.drop(columns=[TARGET])
        tmp_df_to_change_train = tmp_df_to_change_train.drop(columns=[TARGET])

    return df, df_train_return
# Available link functions include 'inverse', 'log' and 'inverse-squared'.
from pygam import LinearGAM, s, f
from pygam.datasets import wage
import matplotlib.pyplot as plt

X, y = wage(return_X_y=True)
# X[:, 0] is the year (is 0 the year 2000?)
# X[:, 1] is the person's age
# X[:, 2] is the education level: 0 = basic, 1 = upper secondary,
#         2 = university, 3 = postgraduate
# y is income ($)

## model
gam1 = LinearGAM(s(0) + s(1) + f(2), fit_intercept=False)
gam1.gridsearch(X, y)

## plotting
plt.rcParams['figure.figsize'] = [10, 7.5]
fig, axs = plt.subplots(1, 3)
titles = ['year', 'age', 'education']
for i, ax in enumerate(axs):
    XX = gam1.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam1.partial_dependence(term=i, X=XX))
    # 95% confidence interval
    ax.plot(XX[:, i], gam1.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r', ls='--')
    ax.set_title(titles[i])
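
# The link names in the comment at the top refer to pygam's generic GAM class;
# for instance, a Gamma GAM with a log link on the same wage data. This is a
# hedged illustration of the option, not part of the original analysis:
from pygam import GAM

gamma_gam = GAM(s(0) + s(1) + f(2), distribution='gamma', link='log')
gamma_gam.gridsearch(X, y)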
df = pd.DataFrame(dataset, columns=column_name)
# df1 = pd.DataFrame(dataset1, columns=column_name)
db.connect.commit()

train_value = df[df['date'] < '2020-09-01']  # rows before 2020-09-01
x_train1 = train_value.iloc[:, 1:].astype('float64')
y_train1 = train_value['value'].astype('float64').to_numpy()
x_train2 = train_value['rain'].astype('float64')

from pygam import LinearGAM, s, f
from pygam.datasets import wage

# NOTE: this overwrites the database-derived arrays with the wage demo data.
x_train2, y_train1 = wage()
gam = LinearGAM(s(0) + s(1) + f(2)).fit(x_train2, y_train1)
gam.summary()

import matplotlib.pyplot as plt

for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
import pygam as pg
import pandas as pd
import matplotlib.pyplot as plt

filtered = pd.read_csv('full.csv')
filtered = filtered.query('score != 0.5').query('time_awake < 24')

# Hour has to be a cyclical smoother (cyclic p-spline basis 'cp').
gam = pg.LogisticGAM(
    pg.s(0, basis='cp') + pg.f(1) + pg.s(2) + pg.s(3) + pg.s(4) + pg.s(5)
).fit(
    X=filtered[['hour', 'day', 'within_day', 'last_3_days',
                'rating_diff', 'time_awake']],
    y=filtered['win'])
gam.summary()


def draw_terms(gam, plt=plt):
    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue
        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
        plt.figure()
        plt.plot(XX[:, term.feature], pdep)
        plt.plot(XX[:, term.feature], confi, c='r', ls='--')
        plt.title(repr(term))
        plt.show()
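
# Usage: one partial-dependence plot per non-intercept term of the model above.
draw_terms(gam)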
def marital_d(row):
    # Encode the Wage 'maritl' categories as integers (0-4).
    if row['maritl'] == '1. Never Married':
        val = 0
    elif row['maritl'] == '2. Married':
        val = 1
    elif row['maritl'] == '3. Widowed':
        val = 2
    elif row['maritl'] == '4. Divorced':
        val = 3
    else:
        val = 4
    return val


Wage['mar_d'] = Wage.apply(marital_d, axis=1)
Wage['job_d'] = np.where(Wage['jobclass'] == '1. Industrial', 0, 1)
x3 = pd.concat([Wage['age'], Wage['mar_d'], Wage['job_d']], axis=1)

gam = LinearGAM(s(0, n_splines=4) + f(1) + f(2)).fit(x3, Wage['wage'])

for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(repr(term))
    plt.show()

# -----------------------------------------------------------------------------